--- /dev/null
+obj-$(CONFIG_MLX4_VNIC) += mlx4_vnic.o
+
+mlx4_vnic-y := vnic_data_main.o vnic_data_ib.o vnic_data_netdev.o vnic_data_neigh.o \
+ vnic_data_fs.o vnic_data_tx.o vnic_data_ethtool.o vnic_data_rx.o \
+ vnic_fip_main.o vnic_fip_ib.o vnic_fip_discover.o vnic_fip_pkt.o \
+ vnic_fip_login.o vnic_fip_vhub.o vnic_mcast.o vnic_port.o \
+ vnic_param.o vnic_qp.o vnic_main.o fip_parser.o \
+ vnic_data_mac.o
+
--- /dev/null
+digraph {
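+	// FIP gateway (GW) states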
+ FIP_GW_HOST_ADMIN;
+ FIP_GW_MCAST_RCVD;
+ FIP_GW_CONNECTED;
+}
--- /dev/null
+digraph {
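+	// Call relationships around vnic login creation and FIP discovery.
+	// Blue edges mark calls deferred through the workqueue named in the label.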
+
+ vnic_login_create_1 -> register_netdev; //
+ __vnic_login_create -> vnic_login_create_1; //
+ vnic_new_intf_store -> __vnic_login_create; //
+ vnic_port_data_init -> __vnic_login_create; //
+ vnic_ib_dev_add_one -> vnic_port_data_init; //
+ fip_vnic_login_create -> vnic_login_create_1; //
+ fip_vnic_test_login -> fip_vnic_login_create [label="login_wq", color=blue]; //
+ fip_vnic_destroy -> fip_vnic_test_login; //
+ fip_purge_vnics -> fip_vnic_destroy; //
+ fip_purge_vnics -> fip_purge_vnics [label="fip_wq", color=blue]; //
+ fip_vnic_close -> fip_purge_vnics [label="fip_wq", color=blue];
+ fip_vnic_hadmin_init -> fip_vnic_test_login; //
+ fip_gw_update_hadmin_gw -> fip_vnic_hadmin_init; //
+ fip_discover_hadmin_update -> fip_gw_update_hadmin_gw; //
+ fip_hadmin_sysfs_update -> fip_discover_hadmin_update [label="fip_wq", color=blue]; //
+ fip_vnic_fsm -> fip_vnic_test_login; //
+ fip_gw_create_vnics -> fip_vnic_fsm; //
+
+
+ fip_gw_update_hadmin_gw -> fip_vnic_fsm;
+ fip_vnic_login_ack_recv -> fip_vnic_fsm; //
+ fip_discover_rx_packet_bh -> fip_vnic_login_ack_recv;
+ fip_vnic_tbl_done -> fip_vnic_fsm; //
+ vhub_handle_tbl -> fip_vnic_tbl_done; //
+ fip_vnic_recv_bh -> vhub_handle_tbl; //
+ fip_vnic_recv -> fip_vnic_recv_bh [label="fip_wq", color=blue]; //
+ fip_vnic_comp -> fip_vnic_recv;
+
+ fip_discover_rx_advertise_bh -> fip_discover_gw_fsm;
+
+ fip_hadmin_vnic_refresh -> fip_vnic_fsm; //
+ fip_gw_create_vnics -> fip_hadmin_vnic_refresh //
+ fip_gw_modified -> fip_gw_create_vnics; //
+ fip_discover_rx_advertise_bh -> fip_gw_modified; //
+ fip_discover_rx_packet_bh -> fip_discover_rx_advertise_bh; //
+ fip_discover_process_rx_bh -> fip_discover_rx_packet_bh; //
+ fip_discover_process_rx -> fip_discover_process_rx_bh [label="fip_wq", color=blue]; //
+ fip_discover_comp -> fip_discover_process_rx;
+
+
+
+ fip_discover_rx_advertise_bh -> fip_gw_create_vnics;
+ fip_discover_gw_fsm -> fip_gw_create_vnics;
+
+ vnic_login_pre_create_1 -> vnic_alloc_netdev; //
+ __vnic_login_create -> vnic_login_pre_create_1;
+ fip_vnic_hadmin_init -> vnic_login_pre_create_1;
+ fip_vnic_login_init -> vnic_login_pre_create_1;
+ fip_vnic_fsm -> fip_vnic_login_init;
+
+
+}
--- /dev/null
+digraph {
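+	// vnic flush levels, matching enum fip_flush in vnic.h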
+	start [shape=point];
+	start -> FIP_NO_FLUSH [label="fip_vnic_alloc"];
+ FIP_PARTIAL_FLUSH;
+ FIP_FULL_FLUSH;
+}
--- /dev/null
+digraph {
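+	// fip vnic login states, matching enum fip_vnic_state in vnic.h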
+ FIP_VNIC_CLOSED;
+	fip_vnic_alloc [shape=box];
+ fip_vnic_alloc -> FIP_VNIC_HADMIN_IDLE [label="hadmin"];
+	fip_vnic_alloc -> FIP_VNIC_LOGIN [label="not hadmin"];
+ FIP_VNIC_WAIT_4_ACK;
+ FIP_VNIC_RINGS_INIT;
+ FIP_VNIC_MCAST_INIT;
+ FIP_VNIC_MCAST_INIT_DONE;
+ FIP_VNIC_VHUB_INIT;
+ FIP_VNIC_VHUB_INIT_DONE;
+ FIP_VNIC_VHUB_DONE;
+ FIP_VNIC_VHUB_WRITE;
+ FIP_VNIC_CONNECTED;
+}
--- /dev/null
+/*
+ * Copyright (c) 2010 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_pkt.h"
+
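+/*
+ * Per-subcode descriptor rules: req_mask holds the descriptor types that
+ * must be present in a packet carrying that subcode, opt_mask the ones
+ * that may also appear (see check_fip_mask()).
+ */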
+static const struct subcode_rules {
+ u64 req_mask;
+ u64 opt_mask;
+} subcodes_array[FIP_MAX_SUBCODES] = {
+ [FIP_HOST_SOL_SUB_OPCODE] = {
+ .req_mask = FIP_MASK(VENDOR_ID) |
+ FIP_MASK(ADDRESS),
+ .opt_mask = FIP_MASK(EXT_DESC),
+ },
+ [FIP_GW_ADV_SUB_OPCODE] = {
+ .req_mask = FIP_MASK(VENDOR_ID) |
+ FIP_MASK(ADDRESS) |
+ FIP_MASK(GW_INFORMATION) |
+ FIP_MASK(GW_IDENTIFIER) |
+ FIP_MASK(KA_PARAMS),
+ .opt_mask = FIP_MASK(EXT_DESC),
+ },
+ [FIP_HOST_LOGIN_SUB_OPCODE] = {
+ .req_mask = FIP_MASK(VENDOR_ID) |
+ FIP_MASK(ADDRESS) |
+ FIP_MASK(LOGIN) |
+ FIP_MASK(PARTITION),
+ .opt_mask = FIP_MASK(EXT_DESC),
+ },
+ [FIP_GW_LOGIN_SUB_OPCODE] = {
+ .req_mask = FIP_MASK(VENDOR_ID) |
+ FIP_MASK(ADDRESS) |
+ FIP_MASK(LOGIN) |
+ FIP_MASK(PARTITION),
+ .opt_mask = FIP_MASK(EXT_DESC),
+ },
+ [FIP_HOST_LOGOUT_SUB_OPCODE] = {
+ .req_mask = FIP_MASK(VENDOR_ID) |
+ FIP_MASK(VNIC_IDENTITY),
+ },
+ [FIP_GW_UPDATE_SUB_OPCODE] = {
+ .req_mask = FIP_MASK(VENDOR_ID) |
+ FIP_MASK(VHUB_UPDATE),
+ .opt_mask = FIP_MASK(EXT_DESC),
+ },
+ [FIP_GW_TABLE_SUB_OPCODE] = {
+ .req_mask = FIP_MASK(VENDOR_ID) |
+ FIP_MASK(VHUB_TABLE),
+ },
+ [FIP_HOST_ALIVE_SUB_OPCODE] = {
+ .req_mask = FIP_MASK(VENDOR_ID) |
+ FIP_MASK(VNIC_IDENTITY),
+ },
+};
+
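+/*
+ * Map a TLV type to its descriptor index while caching a pointer to the
+ * descriptor inside struct fip_content. Returns -1 for unknown types.
+ */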
+static int type2idx(struct fip_content *fc, struct fip_fip_type *ft)
+{
+ void *p = ft;
+
+ switch (ft->type) {
+ case FIP_TYPE(VENDOR_ID):
+ fc->fvend = p;
+ return FIP_TYPE_IDX(VENDOR_ID);
+ case FIP_TYPE(ADDRESS):
+ fc->fa.fa[fc->fa.num++] = p;
+ return FIP_TYPE_IDX(ADDRESS);
+ case FIP_TYPE(GW_INFORMATION):
+ fc->fgwi = p;
+ return FIP_TYPE_IDX(GW_INFORMATION);
+ case FIP_TYPE(LOGIN):
+ fc->fl = p;
+ return FIP_TYPE_IDX(LOGIN);
+ case FIP_TYPE(VHUB_UPDATE):
+ fc->fvu = p;
+ return FIP_TYPE_IDX(VHUB_UPDATE);
+ case FIP_TYPE(VHUB_TABLE):
+ fc->fvt = p;
+ return FIP_TYPE_IDX(VHUB_TABLE);
+ case FIP_TYPE(VNIC_IDENTITY):
+ fc->fvi = p;
+ return FIP_TYPE_IDX(VNIC_IDENTITY);
+ case FIP_TYPE(PARTITION):
+ fc->fp = p;
+ return FIP_TYPE_IDX(PARTITION);
+ case FIP_TYPE(GW_IDENTIFIER):
+ fc->fgid = p;
+ return FIP_TYPE_IDX(GW_IDENTIFIER);
+ case FIP_TYPE(KA_PARAMS):
+ fc->fka = p;
+ return FIP_TYPE_IDX(KA_PARAMS);
+ case FIP_TYPE(EXT_DESC):
+ fc->fed.fed[fc->fed.num++] = p;
+ return FIP_TYPE_IDX(EXT_DESC);
+ default:
+ return -1;
+ }
+}
+
+#ifdef CONFIG_MLX4_VNIC_DEBUG
+static const char *fip_type_str(int type)
+{
+ switch (type) {
+ FIP_CASE_STR(VENDOR_ID);
+ FIP_CASE_STR(ADDRESS);
+ FIP_CASE_STR(GW_INFORMATION);
+ FIP_CASE_STR(LOGIN);
+ FIP_CASE_STR(VHUB_UPDATE);
+ FIP_CASE_STR(VHUB_TABLE);
+ FIP_CASE_STR(VNIC_IDENTITY);
+ FIP_CASE_STR(PARTITION);
+ FIP_CASE_STR(GW_IDENTIFIER);
+ FIP_CASE_STR(KA_PARAMS);
+ FIP_CASE_STR(EXT_DESC);
+ default:
+ return "Unknown";
+ }
+}
+
+static const char *fip_subcode_str(int subcode)
+{
+ switch (subcode) {
+ FIP_SUBCODE_CASE_STR(FIP_HOST_SOL_SUB_OPCODE);
+ FIP_SUBCODE_CASE_STR(FIP_GW_ADV_SUB_OPCODE);
+ FIP_SUBCODE_CASE_STR(FIP_HOST_LOGIN_SUB_OPCODE);
+ FIP_SUBCODE_CASE_STR(FIP_GW_LOGIN_SUB_OPCODE);
+ FIP_SUBCODE_CASE_STR(FIP_HOST_LOGOUT_SUB_OPCODE);
+ FIP_SUBCODE_CASE_STR(FIP_GW_UPDATE_SUB_OPCODE);
+ FIP_SUBCODE_CASE_STR(FIP_GW_TABLE_SUB_OPCODE);
+ FIP_SUBCODE_CASE_STR(FIP_HOST_ALIVE_SUB_OPCODE);
+ default:
+ return "Unknown";
+ }
+}
+#endif
+
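+/* returns 0 if the 8-byte vendor id carries the "mellanox" signature, non-zero otherwise */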
+static int verify_mlx_sig(void *p)
+{
+ static const char *mlx4_str = "mellanox";
+ __be64 mlx_str_64 = *(__be64 *)mlx4_str;
+ __be64 *sig = p;
+
+ return *sig != mlx_str_64;
+}
+
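+/*
+ * Parse a single TLV: validate its length against the remaining message,
+ * record it in the fip_content (via type2idx) and, for VHUB TABLE/UPDATE
+ * descriptors, locate the trailing list of context table entries.
+ * Returns 0 on success, -1 on a malformed or unknown TLV.
+ */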
+static int next_type(struct vnic_port *port, void *tlv, int len,
+ struct fip_content *fc, int *sz, int *idx)
+{
+ struct fip_fip_type *ft;
+
+ if (sizeof *ft > len) {
+ vnic_dbg_parse(port->name, "message too short\n");
+ return -1;
+ }
+	ft = tlv;
+ vnic_dbg_parse(port->name, "TLV: type %s(%d)\n", fip_type_str(ft->type),
+ ft->type);
+
+ if (!ft->length || (ft->length << 2 > len)) {
+ vnic_dbg_parse(port->name, "TLV does not fit in message: %s(%d) "
+ "tlv->len %d, remaining %d\n", fip_type_str(ft->type),
+ ft->type, ft->length << 2, len);
+ return -1;
+ }
+
+ *sz = (ft->length << 2);
+
+ *idx = type2idx(fc, ft);
+ if (*idx < 0) {
+		vnic_dbg_parse(port->name, "unknown type %d\n", ft->type);
+ return -1;
+ }
+
+ if (ft->type == FIP_TYPE(VENDOR_ID) && verify_mlx_sig(fc->fvend->vendor_id)) {
+ vnic_dbg_parse(port->name, "mellanox signature check failed\n");
+ return -1;
+ }
+
+ if (ft->type == FIP_TYPE(VHUB_TABLE) || ft->type == FIP_TYPE(VHUB_UPDATE)) {
+ int cte_list_sz;
+ struct context_table_entry *cte_start;
+
+ if (ft->type == FIP_TYPE(VHUB_TABLE)) {
+ unsigned hdr = be16_to_cpu(fc->fvt->hdr) >> 14;
+
+ if (hdr > FIP_TABLE_HDR_ONLY) {
+ vnic_dbg_parse(port->name, "invalid table header %d\n", hdr);
+ return -1;
+ }
+ cte_list_sz = *sz - sizeof(struct fip_vhub_table_tlv);
+			/* TODO: the next two lines are commented out because the size of the
+			   table TLV is miscomputed in BXM versions 1.3.6-5, which causes
+			   tables to be discarded. The size should be checked with the lines
+			   intact once that is fixed. */
+ /*if (hdr == FIP_TABLE_HDR_LAST)
+ cte_list_sz -= 4;
+ */
+
+ cte_start = (struct context_table_entry *)(fc->fvt + 1);
+ } else {
+ cte_list_sz = *sz - sizeof(struct fip_vhub_update_tlv);
+ cte_start = (struct context_table_entry *)(fc->fvu + 1);
+ }
+
+
+ fc->cte.num = cte_list_sz / sizeof(struct context_table_entry);
+ fc->cte.cte = cte_start;
+ }
+
+
+ return 0;
+}
+
+static inline int check_eoib_ver(struct vnic_port *port,
+ struct fip_eoib_ver *eoib_ver, int sz, int *len)
+{
+ if (unlikely(sz < sizeof *eoib_ver)) {
+ vnic_dbg_parse(port->name, "message too short\n");
+ *len = sz;
+ return -ENOMEM;
+ }
+ *len = sizeof *eoib_ver;
+ if (unlikely(eoib_ver->version >> 4)) {
+ vnic_dbg_parse(port->name, "eoib version check failed: %d\n", eoib_ver->version >> 4);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void dump_raw(struct vnic_port *port, void *buf, int len)
+{
+ int i;
+
+ for (i = 0; i < len / 4; ++i)
+ vnic_dbg_parse(port->name, "0x%08x\n", be32_to_cpu(((__be32 *)(buf))[i]));
+}
+
+static inline int check_fip_hdr(struct vnic_port *port,
+ struct fip_header_simple *fh, int sz, int *len)
+{
+ if (unlikely(sizeof *fh > sz)) {
+ vnic_dbg_parse(port->name, "message too short\n");
+ return -1;
+ }
+
+ if (unlikely(fh->opcode != cpu_to_be16(EOIB_FIP_OPCODE))) {
+ vnic_dbg_parse(port->name, "not fip opcode\n");
+ return -1;
+ }
+
+ if (unlikely((be16_to_cpu(fh->list_length) << 2) > (sz - sizeof *fh))) {
+ vnic_dbg_parse(port->name, "message too short: header length = %u, "
+			       "left length = %zu\n",
+ be16_to_cpu(fh->list_length) << 2, sz - sizeof *fh);
+ return -1;
+ }
+
+ *len = sizeof *fh;
+
+ return 0;
+}
+
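+/*
+ * Check the descriptors seen in the packet against the rules for its
+ * subcode: every required descriptor must be present, and nothing outside
+ * the required/optional sets is allowed.
+ */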
+static int check_fip_mask(struct vnic_port *port, struct fip_content *fc)
+{
+ u64 req_mask = subcodes_array[fc->fh->subcode].req_mask;
+ u64 opt_mask = subcodes_array[fc->fh->subcode].opt_mask;
+
+ if (((fc->mask & req_mask) != req_mask) ||
+ ((fc->mask & ~opt_mask) & ~req_mask)) {
+ vnic_dbg_parse(port->name, "%s: mask check failed: mask 0x%llx,"
+ "req_mask 0x%llx, opt_mask 0x%llx\n",
+ fip_subcode_str(fc->fh->subcode), fc->mask, req_mask, opt_mask);
+ return -1;
+ }
+
+ return 0;
+}
+
+static void dump_cte(struct vnic_port *port, struct context_table_entry *cte)
+{
+ vnic_dbg_parse(port->name, "CTE: V(%d) RSS(%d) type(%d) MAC(%pM) QPN(0x%06x) SL(%d) LID(0x%04x)\n",
+ (0x1 & (cte->v_rss_type >> 7)),
+ (0x1 & (cte->v_rss_type >> 6)),
+ (cte->v_rss_type & 0xf),
+ cte->mac, be32_to_cpu(cte->qpn) & 0xffffff,
+ (cte->sl & 0xf), be16_to_cpu(cte->lid));
+}
+
+static void dump_vnic_identity(struct vnic_port *port,
+ struct fip_vnic_identity_tlv *fvi)
+{
+#define VHUB_ID be32_to_cpu(fvi->flags_vhub_id)
+
+	vnic_dbg_parse(port->name, "%s: U(%d) R(%d) VP(%d) VHUBID(0x%x) TUSN(0x%x) VNIC_ID(0x%x) "
+ "MAC(%pM) GUID("GUID_FORMAT") VNIC NAME (%s)\n",
+ fip_type_str(fvi->ft.type), (VHUB_ID >> 31), (0x01 & (VHUB_ID >> 30)),
+ (0x01 & (VHUB_ID >> 24)), VHUB_ID & 0xffffff, be32_to_cpu(fvi->tusn),
+ be16_to_cpu(fvi->vnic_id), fvi->mac, GUID_ARG(fvi->port_guid), fvi->vnic_name);
+}
+
+static void dump_vnic_partition(struct vnic_port *port, struct fip_partition_tlv *fp)
+{
+ vnic_dbg_parse(port->name, "%s: PKEY(0x%x)\n", fip_type_str(fp->ft.type),
+ be16_to_cpu(fp->pkey));
+}
+
+
+static void dump_gw_identifier(struct vnic_port *port, struct fip_gw_identifier_tlv *fgid)
+{
+ vnic_dbg_parse(port->name, "%s: SYS GUID("GUID_FORMAT") SYS NAME(%s) GW PORT NAME(%s)\n",
+ fip_type_str(fgid->ft.type), GUID_ARG(fgid->sys_guid), fgid->sys_name, fgid->sys_name);
+}
+
+static void dump_ka_params(struct vnic_port *port, struct fip_ka_params_tlv *fka)
+{
+ vnic_dbg_parse(port->name, "%s: GW_ADV_PERIOD(%d) GW_KA_PERIOD(%d) VNIC_KA_PERIOD(%d)\n",
+ fip_type_str(fka->ft.type), be32_to_cpu(fka->adv_period),
+ be32_to_cpu(fka->ka_period), be32_to_cpu(fka->vnic_ka_period));
+}
+
+static void dump_vhub_table(struct vnic_port *port, struct fip_content *fc)
+{
+ int i;
+
+ vnic_dbg_parse(port->name, "%s: VP(%d) vhub id(0x%x) TUSN(0x%x) HDR(%d) table size (%d)\n",
+ fip_type_str(fc->fvt->ft.type), be32_to_cpu(fc->fvt->vp_vhub_id) >> 24 & 1,
+ be32_to_cpu(fc->fvt->vp_vhub_id) & 0xffffff, be32_to_cpu(fc->fvt->tusn),
+ be16_to_cpu(fc->fvt->hdr) >> 14, be16_to_cpu(fc->fvt->table_size));
+ for (i = 0; i < fc->cte.num; ++i)
+ dump_cte(port, &fc->cte.cte[i]);
+}
+
+static void dump_fip_login(struct vnic_port *port, struct fip_login_tlv *p)
+{
+ vnic_dbg_parse(port->name, "%s: mtu(%d) vnic_id(0x%x) v_m_vp_h(0x%x) vlan(0x%x) mac(%pM)"
+ "mgid_prefix("MGID_PREFIX_FMT") vfields(0x%0x) syndrom(%d) QPN(0x%x)"
+ " vnic_name(%s)\n", fip_type_str(p->ft.type), be16_to_cpu(p->mtu),
+ be16_to_cpu(p->vnic_id), be16_to_cpu(p->flags_vlan) >> 12,
+ be16_to_cpu(p->flags_vlan) & 0xfff, p->mac, MGID_PRE_ARG(p->eth_gid_prefix),
+ be16_to_cpu(p->vfields), be32_to_cpu(p->syndrom_ctrl_qpn) >> 24,
+ be32_to_cpu(p->syndrom_ctrl_qpn) & 0xffffff, p->vnic_name);
+}
+
+static void dump_fip_address(struct vnic_port *port, struct fip_address_tlv *fa)
+{
+ vnic_dbg_parse(port->name, "%s: GW_TYPE(%d) QPN(0x%x) SL(%d), GW_PORT_ID(0x%x),"
+ " LID(0x%x) GUID(" GUID_FORMAT ")\n", fip_type_str(fa->ft.type),
+ be32_to_cpu(fa->gwtype_qpn) >> 24, be32_to_cpu(fa->gwtype_qpn) & 0xffffff,
+ be16_to_cpu(fa->sl_gwportid) >> 12, be16_to_cpu(fa->sl_gwportid) & 0xfff,
+ be16_to_cpu(fa->lid), GUID_ARG(fa->guid));
+}
+
+static void dump_vhub_update(struct vnic_port *port, struct fip_content *fc)
+{
+#define VHUB_ID_1 be32_to_cpu(fc->fvu->state_vhub_id)
+ int i;
+
+ vnic_dbg_parse((port->name), "%s: eport_state(%s) vp(%d) vhub_id(0x%x) tusn(0x%x)\n",
+ fip_type_str(fc->fvu->ft.type), eport_state_str(VHUB_ID_1 >> 28 & 3),
+ VHUB_ID_1 >> 24 & 1, VHUB_ID_1 & 0xffffff, be32_to_cpu(fc->fvu->tusn));
+ for (i = 0; i < fc->cte.num; ++i)
+ dump_cte(port, &fc->cte.cte[i]);
+}
+
+static void dump_gateway_information(struct vnic_port *port,
+ struct fip_gw_information_tlv *fgwi)
+{
+ vnic_dbg_parse(port->name, "%s: accept host administered(%s) nmac_mgid(%d) "
+ "nrss_mgid(%d) ntss_qpn(%d), n_rss(%d), num_net_vnics(%d)\n",
+ fip_type_str(fgwi->ft.type), (fgwi->h_nmac_mgid >> 7) ? "Yes" : "No",
+ fgwi->h_nmac_mgid & 0x3f, fgwi->n_rss_mgid_tss_qpn >> 4,
+ fgwi->n_rss_mgid_tss_qpn & 0xf, be16_to_cpu(fgwi->n_rss_qpn_vnics) >> 12,
+ be16_to_cpu(fgwi->n_rss_qpn_vnics) & 0xfff);
+}
+
+static void dump_fip_packet(struct vnic_port *port, struct fip_content *fc)
+{
+ int i;
+
+ for (i = 0; i < fc->fa.num; ++i)
+ dump_fip_address(port, fc->fa.fa[i]);
+
+ if (fc->fgwi)
+ dump_gateway_information(port, fc->fgwi);
+
+ if (fc->fvu)
+ dump_vhub_update(port, fc);
+
+ if (fc->fl)
+ dump_fip_login(port, fc->fl);
+
+ if (fc->fvt)
+ dump_vhub_table(port, fc);
+
+ if (fc->fvi)
+ dump_vnic_identity(port, fc->fvi);
+
+ if (fc->fp)
+ dump_vnic_partition(port, fc->fp);
+
+ if (fc->fgid)
+ dump_gw_identifier(port, fc->fgid);
+
+ if (fc->fka)
+ dump_ka_params(port, fc->fka);
+}
+
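+/*
+ * Parse a FIP packet into fip_content: check the EoIB version and FIP
+ * header, walk the TLV list recording each descriptor's offset and a
+ * presence mask, then validate the mask against the subcode rules.
+ * On failure the raw packet is dumped for debugging.
+ */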
+int fip_packet_parse(struct vnic_port *port, void *packet, int pkt_size, struct fip_content *fc)
+{
+ void *ptr = packet;
+ int len;
+ int err;
+ int idx;
+ u16 offset = 0;
+ int size = pkt_size;
+
+ vnic_dbg_parse(port->name, "size = %d\n", size);
+ err = check_eoib_ver(port, ptr, size, &len);
+ if (err) {
+ if (err != -EINVAL)
+ goto out_err;
+ else
+ vnic_dbg_parse(port->name, "version check failed\n");
+ }
+
+ fc->eoib_ver = ptr;
+ size -= len;
+ ptr += len;
+ offset += len;
+ fc->fh = ptr;
+
+ err = check_fip_hdr(port, ptr, size, &len);
+ if (err)
+ goto out_err;
+
+ ptr += len;
+ offset += len;
+
+ fc->fa.num = 0;
+ fc->num = 0;
+ fc->mask = 0;
+
+	/* work around a BXM bug that does not report the correct descriptor length */
+ if (fc->fh->subcode != FIP_GW_ADV_SUB_OPCODE)
+ size = be16_to_cpu(fc->fh->list_length) << 2;
+ else
+ size -= len;
+
+ vnic_dbg_parse(port->name, "subcode = %s, size %d\n",
+ fip_subcode_str(fc->fh->subcode), size);
+ while (size > 0) {
+ err = next_type(port, ptr, size, fc, &len, &idx);
+ if (err)
+ break;
+
+ fc->offsets[fc->num] = offset;
+ fc->mask |= ((u64)1 << idx);
+ ptr += len;
+ size -= len;
+ offset += len;
+ fc->num++;
+ }
+
+ if (err)
+ goto out_err;
+
+ err = check_fip_mask(port, fc);
+ if (err) {
+ vnic_dbg_parse(port->name, "check mask: failed\n");
+ goto out_err;
+ }
+
+ dump_fip_packet(port, fc);
+
+ return 0;
+
+out_err:
+ dump_raw(port, packet, pkt_size);
+ return err;
+}
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef VNIC_H
+#define VNIC_H
+
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/if_vlan.h>
+#include <linux/if_arp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/ktime.h>
+#include <linux/hrtimer.h>
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#include <linux/inet_lro.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/rwsem.h>
+#include <linux/vmalloc.h>
+#include <linux/version.h>
+#include <net/dst.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_sa.h>
+
+/* for mlx4_ib dev attr, used also in vnic_qp.c */
+#include "../../../../infiniband/hw/mlx4/mlx4_ib.h"
+#include "../../../../infiniband/hw/mlx4/user.h"
+
+#include "vnic_utils.h"
+
+/* driver info definition */
+#define DRV_NAME "mlx4_vnic"
+#define DRV_VER "1.4.0"
+#define DRV_LIC "Dual BSD/GPL"
+#define DRV_DESC "Mellanox BridgeX Virtual NIC Driver"
+#define DRV_AUTH "Ali Ayoub & Gabi Liron"
+
+/* backports */
+
+/* for kernel >= 3.17 */
+#define alloc_netdev_mqs(a, b, c, d, e) alloc_netdev_mqs(a, b, NET_NAME_UNKNOWN, c, d, e)
+
+#ifdef alloc_netdev_mq
+#undef alloc_netdev_mq
+#define alloc_netdev_mq(sizeof_priv, name, setup, count) \
+ alloc_netdev_mqs(sizeof_priv, name, setup, count, count)
+#endif
+
+#ifndef SET_ETHTOOL_OPS
+#define SET_ETHTOOL_OPS(netdev,ops) \
+ ( (netdev)->ethtool_ops = (ops) )
+#endif
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35))
+#define _BP_NO_MC_LIST
+
+/* Not sure this should be here; at least it is OK for 2.6.39 */
+#define _BP_NO_ATT_OWNER
+#endif
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0))
+#define _BP_NO_GRO
+#endif
+
+#ifndef NETIF_F_HW_VLAN_FILTER
+#define NETIF_F_HW_VLAN_FILTER NETIF_F_HW_VLAN_CTAG_FILTER
+#endif
+
+/* externs */
+extern u32 vnic_msglvl;
+extern u32 vnic_max_tx_outs;
+extern u32 vnic_lro_num;
+extern u32 vnic_mcast_create;
+extern u32 vnic_net_admin;
+extern u32 vnic_child_max;
+extern u32 vnic_napi_weight;
+extern u32 vnic_linear_small_pkt;
+extern u32 vnic_tx_rings_num;
+extern u32 vnic_rx_rings_num;
+extern u32 vnic_tx_rings_len;
+extern u32 vnic_rx_rings_len;
+extern u32 vnic_mgid_data_type;
+extern u32 vnic_encap_headroom;
+extern u32 vnic_tx_polling;
+extern u32 vnic_rx_linear;
+extern u32 vnic_change_mac;
+extern u32 vnic_learn_mac_enabled;
+extern u32 vnic_synd_backlog;
+extern u32 vnic_eport_state_enforce;
+extern u32 vnic_src_mac_enforce;
+extern u32 vnic_inline_tshold;
+
+#define MAX_NUM_PKEYS_DISCOVERY (24)
+#define ILLEGAL_PKEY_INDEX (0xFFFF)
+extern u32 vnic_discovery_pkeys[MAX_NUM_PKEYS_DISCOVERY];
+extern u32 vnic_discovery_pkeys_count;
+extern u32 vnic_sa_query;
+
+
+extern u32 no_bxm;
+
+extern struct workqueue_struct *port_wq;
+extern struct workqueue_struct *fip_wq;
+extern struct workqueue_struct *mcast_wq;
+extern struct workqueue_struct *login_wq;
+
+extern struct ib_sa_client vnic_sa_client;
+
+/* definitions */
+#define EOIB_SERVICE_ID ((0x10ULL << 56) | (0x0002C9E01B0000ULL))
+#define EOIB_CTRL_SERVICE_ID (EOIB_SERVICE_ID | 0x00FFULL)
+#define VNIC_SKB_QUEUE_LEN 32
+#define VNIC_CNT_MAX 32
+#define VNIC_DESC_LEN (64 + 4)
+#define VNIC_NAME_LEN 16 /* by spec, use IFNAMSIZ for OS */
+#define VNIC_SYSFS_FLEN (VNIC_NAME_LEN * 2) /* SYSFS file name len, allow pre/suffix (32)*/
+#define VNIC_SYSFS_LLEN 64
+#define VNIC_VENDOR_LEN 8
+#define GID_LEN 16
+#define GUID_LEN 8
+#define IPV4_LEN 4
+#define IPV6_LEN 16
+#define VNIC_SYSTEM_NAME_LEN 32
+#define VNIC_GW_PORT_NAME_LEN 8
+#define GID_PREFIX_LEN 5
+#define VNIC_MAX_DENTRIES 16
+#define VNIC_ID_LEN 16
+#define VNIC_CHILD_MAX 128
+#define VNIC_MAX_RETRIES 0 /* zero = unlimited */
+#define VNIC_WATCHDOG_TIMEOUT (25 * HZ) /* 25 sec */
+#define VNIC_NAPI_SCHED_TIMEOUT (5)
+#define FIP_MAX_VNICS_PER_GW (1 << 9)
+#define NOT_AVAILABLE_NUM (-1)
+#define NOT_AVAILABLE_STRING "N/A"
+#define is_valid_str(str) (strcmp(str, NOT_AVAILABLE_STRING))
+#define is_valid_num(num) (num != NOT_AVAILABLE_NUM)
+#define is_valid_guid(arr) (!!(*((u64 *)(arr))))
+#define is_valid_ipv4(arr) (!!(*((u32 *)(arr))))
+#define is_mcast_promisc(login) (!(login->n_mac_mcgid))
+#define is_ucast_promisc(login) (!!(login->dev->flags & IFF_PROMISC))
+#define ARRAY_LEN(_x) (sizeof(_x)/sizeof(_x[0]))
+
+/* TODO: cleanup VNIC_GID_RAW_ARG and friends */
+#define VNIC_GID_RAW_ARG(gid) ((u8 *)(gid))[0], \
+ ((u8 *)(gid))[1], \
+ ((u8 *)(gid))[2], \
+ ((u8 *)(gid))[3], \
+ ((u8 *)(gid))[4], \
+ ((u8 *)(gid))[5], \
+ ((u8 *)(gid))[6], \
+ ((u8 *)(gid))[7], \
+ ((u8 *)(gid))[8], \
+ ((u8 *)(gid))[9], \
+ ((u8 *)(gid))[10],\
+ ((u8 *)(gid))[11],\
+ ((u8 *)(gid))[12],\
+ ((u8 *)(gid))[13],\
+ ((u8 *)(gid))[14],\
+ ((u8 *)(gid))[15]
+#define VNIC_GUID_RAW_ARG(gid) ((u8 *)(gid))[0], \
+ ((u8 *)(gid))[1], \
+ ((u8 *)(gid))[2], \
+ ((u8 *)(gid))[3], \
+ ((u8 *)(gid))[4], \
+ ((u8 *)(gid))[5], \
+ ((u8 *)(gid))[6], \
+ ((u8 *)(gid))[7]
+
+#define VNIC_GID_ARG(gid) VNIC_GID_RAW_ARG((gid).raw)
+#define VNIC_GID_FMT "%.2x:%.2x:%.2x:%.2x:" \
+ "%.2x:%.2x:%.2x:%.2x:" \
+ "%.2x:%.2x:%.2x:%.2x:" \
+ "%.2x:%.2x:%.2x:%.2x"
+#define VNIC_GUID_FMT "%.2x:%.2x:%.2x:%.2x:" \
+ "%.2x:%.2x:%.2x:%.2x"
+
+#define MAC_6_PRINT_FMT "%.2x:%.2x:%.2x:%.2x:" \
+ "%.2x:%.2x"
+#define MAC_6_PRINT_ARG(mac) (mac)[0], (mac)[1], (mac)[2], \
+ (mac)[3], (mac)[4], (mac)[5]
+
+#define IP_4_PRINT_FMT "%d.%d.%d.%d"
+#define IP_4_PRINT_ARG(ip) (ip)[0], (ip)[1], (ip)[2], (ip)[3]
+
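+/* a vhub id packs the 12-bit port id above the 12-bit vlan */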
+#define CREATE_VHUB_ID(be_vlan, port_id) \
+ ((be16_to_cpu(be_vlan) & 0xFFF) | (((port_id) & 0xFFF) << 12))
+#define CREATE_VHUB_ID_BE(vlan, port_id) \
+ cpu_to_be32(CREATE_VHUB_ID(vlan, port_id))
+#define ROUNDUP_LOG2(x) ilog2(roundup_pow_of_two(x))
+
+#define VNIC_RX_COAL_TARGET 0x20000
+#define VNIC_RX_COAL_TIME 0x10
+#define VNIC_TX_COAL_PKTS 64
+#define VNIC_TX_COAL_TIME 0x80
+#define VNIC_RX_RATE_LOW 400000
+#define VNIC_RX_COAL_TIME_LOW 0
+#define VNIC_RX_RATE_HIGH 450000
+#define VNIC_RX_COAL_TIME_HIGH 128
+#define VNIC_RX_SIZE_THRESH 1024
+#define VNIC_RX_RATE_THRESH (1000000 / VNIC_RX_COAL_TIME_HIGH)
+#define VNIC_SAMPLE_INTERVAL 0
+#define VNIC_AVG_PKT_SMALL 256
+#define VNIC_AUTO_CONF 0xffff
+#define VNIC_MCAST_MAX_RETRY 60
+#define VNIC_MCAST_ULIMIT_RETRY 0
+#define VNIC_MCAST_BACKOF_FAC 2
+#define MLX4_DEV_CAP_FLAG_UD_SWP (1 << 28)
+#define VNIC_ETHTOOL_LINE_MAX 32
+#define VNIC_ENCAP_LEN 4
+#define VNIC_MAX_TX_SIZE 2048
+#define VNIC_MAX_RX_SIZE 4096
+#define ETH_LLC_SNAP_SIZE 8
+
+#define VNIC_SM_HEADSTART 250 /* msecs to actually start handling SM events */
+#define VNIC_MCAST_BACKOFF_MSEC 1000
+#define VNIC_MCAST_BACKOFF_MAX_MSEC 16000
+
+#define SYSFS_VLAN_ID_NO_VLAN (-1)
+
+#define VNIC_MAX_PAYLOAD_SIZE 4096
+#define VNIC_BUF_SIZE(_port) (min(_port->max_mtu_enum + \
+ IB_GRH_BYTES, VNIC_MAX_PAYLOAD_SIZE))
+
+#define VNIC_TX_QUEUE_LEN 1024 /* default, tuneable */
+#define VNIC_TX_QUEUE_LEN_MIN 64
+#define VNIC_TX_QUEUE_LEN_MAX (8 * 1024)
+
+#define VNIC_RX_QUEUE_LEN 2048 /* default, tuneable */
+#define VNIC_RX_QUEUE_LEN_MIN 64
+#define VNIC_RX_QUEUE_LEN_MAX (8 * 1024)
+
+
+#define VNIC_MODER_DELAY (HZ / 4)
+#define VNIC_STATS_DELAY VNIC_MODER_DELAY
+
+#define VNIC_AH_SL_DEFAULT 0x0
+
+#define VNIC_DATA_QKEY 0x80020003
+#define VNIC_FIP_QKEY 0x80020002
+#define VNIC_VLAN_OFFSET(login) (login->vlan_used ? VLAN_HLEN : 0)
+#define VNIC_VLAN_ENABLED(login) (login->vlan_used ? 1 : 0)
+#define VNIC_MAX_TX_CQE 32 /* default, tuneable */
+#define VNIC_MAX_RX_CQE 64 /* default, tuneable */
+#define VNIC_MAX_NUM_CPUS 32
+#define VNIC_MAX_INLINE_TSHOLD 512
+
+#define VNIC_EOIB_HDR_VER 0x0
+#define VNIC_EOIB_HDR_SIG 0x3
+#define VNIC_EOIB_HDR_UDP_CHK_OK 0x2
+#define VNIC_EOIB_HDR_TCP_CHK_OK 0x1
+#define VNIC_EOIB_HDR_IP_CHK_OK 0x1
+
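+/* encap_data layout: [7:6] signature, [5:4] version, [3:2] TCP/UDP csum, [1:0] IP csum */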
+#define VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr) (eoib_hdr->encap_data & 0x3)
+#define VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr) ((eoib_hdr->encap_data >> 2) & 0x3)
+#define VNIC_EOIB_HDR_GET_VER(eoib_hdr) ((eoib_hdr->encap_data >> 4) & 0x3)
+#define VNIC_EOIB_HDR_GET_SIG(eoib_hdr) ((eoib_hdr->encap_data >> 6) & 0x3)
+
+#define VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \
+ (eoib_hdr->encap_data & 0xFC) | VNIC_EOIB_HDR_IP_CHK_OK)
+#define VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \
+ (eoib_hdr->encap_data & 0xF3) | (VNIC_EOIB_HDR_TCP_CHK_OK << 2))
+#define VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \
+ (eoib_hdr->encap_data & 0xF3) | (VNIC_EOIB_HDR_UDP_CHK_OK << 2))
+
+#define VNIC_IP_CSUM_OK(eoib_hdr) ((VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_IP_CHK_OK)
+#define VNIC_TCP_CSUM_OK(eoib_hdr) ((VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_TCP_CHK_OK)
+#define VNIC_UDP_CSUM_OK(eoib_hdr) ((VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_UDP_CHK_OK)
+#define VNIC_CSUM_OK(eoib_hdr) (VNIC_IP_CSUM_OK(eoib_hdr) && \
+ (VNIC_TCP_CSUM_OK(eoib_hdr) || \
+ VNIC_UDP_CSUM_OK(eoib_hdr)))
+#define VNIC_EOIB_ZLEN_MAX (ETH_ZLEN + VNIC_ENCAP_LEN + VLAN_HLEN)
+
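+/* a packet hash and a scratch eoibhdr are stashed at the tail of skb->cb */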
+#define VNIC_SKB_GET_HASH(_skb, _max) ((*(u32 *)(_skb->cb + sizeof _skb->cb - 4)) % _max)
+#define VNIC_SKB_SET_HASH(_skb, _hash) ((*(u32 *)(_skb->cb + sizeof _skb->cb - 4)) = _hash)
+#define VNIC_SKB_GET_ENCAP_CB(_skb) ((struct eoibhdr *)(_skb->cb + sizeof _skb->cb - 12))
+#define VNIC_SKB_GET_ENCAP(_skb) (vnic_encap_headroom ? (struct eoibhdr *)(_skb->data) : VNIC_SKB_GET_ENCAP_CB(_skb))
+#define VNIC_SKB_GET_ENCAP_OFFSET (vnic_encap_headroom ? VNIC_ENCAP_LEN :0)
+
+#define VNIC_NEIGH_GET_DQPN(_skb, _neighe) ((_neighe->rss) ? (_neighe->qpn + \
+ VNIC_SKB_GET_HASH(_skb, _neighe->login->qps_num)) : (_neighe->qpn))
+
+#define vnic_netdev_priv(netdev) (((struct vnic_login_info *)netdev_priv(netdev))->login)
+#ifndef _BP_NETDEV_NO_TMQ /* >= 2.6.27 */
+#define VNIC_TXQ_GET_HASH(_skb, _max) (skb_get_queue_mapping(_skb))
+#define VNIC_TXQ_ALLOC_NETDEV(sz, nm, sp, qm) alloc_netdev_mq(sz, nm, sp, qm)
+#define VNIC_TXQ_SET_ACTIVE(login, num) (login->dev->real_num_tx_queues = \
+ login->real_tx_rings_num = \
+ login->ndo_tx_rings_num = num)
+#define VNIC_TXQ_GET_ACTIVE(login) (login->real_tx_rings_num)
+#define VNIC_TXQ_GET(tx_res) netdev_get_tx_queue(tx_res->login->dev, tx_res->index)
+#define VNIC_TXQ_STOP(tx_res) netif_tx_stop_queue(VNIC_TXQ_GET(tx_res))
+#define VNIC_TXQ_STOP_ALL(login) netif_tx_stop_all_queues(login->dev)
+#define VNIC_TXQ_START(tx_res) netif_tx_start_queue(VNIC_TXQ_GET(tx_res))
+#define VNIC_TXQ_START_ALL(login) netif_tx_start_all_queues(login->dev)
+#define VNIC_TXQ_STOPPED(tx_res) netif_tx_queue_stopped(VNIC_TXQ_GET(tx_res))
+#define VNIC_TXQ_WAKE(tx_res) netif_tx_wake_queue(VNIC_TXQ_GET(tx_res))
+#else
+#define VNIC_TXQ_GET_HASH(skb, _max) VNIC_SKB_GET_HASH(skb, _max)
+#define VNIC_TXQ_ALLOC_NETDEV(sz, nm, sp, qm) alloc_netdev(sz, nm, sp)
+#define VNIC_TXQ_SET_ACTIVE(login, num) do { login->real_tx_rings_num = num; \
+ login->ndo_tx_rings_num = 1; \
+ } while (0)
+#define VNIC_TXQ_GET_ACTIVE(login) (login->real_tx_rings_num)
+#define VNIC_TXQ_STOP(tx_res) netif_stop_queue(tx_res->login->dev)
+#define VNIC_TXQ_STOP_ALL(login) netif_stop_queue(login->dev)
+#define VNIC_TXQ_START(tx_res) netif_start_queue(tx_res->login->dev)
+#define VNIC_TXQ_START_ALL(login) netif_start_queue(login->dev)
+#define VNIC_TXQ_STOPPED(tx_res) netif_queue_stopped(tx_res->login->dev)
+#define VNIC_TXQ_WAKE(tx_res) netif_wake_queue(tx_res->login->dev)
+#endif
+
+#define VNIC_ALLOC_ORDER 2
+#define VNIC_ALLOC_SIZE (PAGE_SIZE << VNIC_ALLOC_ORDER)
+#define VNIC_MAX_LRO_AGGR 64
+#define VNIC_MAX_RX_FRAGS 4
+#define VNIC_MAX_TX_FRAGS (MAX_SKB_FRAGS + 2)
+#define VNIC_MGID_PREFIX_LEN 5
+
+/* TODO: when VNIC_MAX_TX_OUTS is set to 16, the last CQE was observed to
+ * overwrite the first one
+ */
+#define VNIC_MAX_TX_OUTS 8 /* default, tuneable */
+#define VNIC_MAX_LRO_DESCS 32 /* default, tuneable */
+#define VNIC_EOIB_HDR_SIZE (IB_GRH_BYTES + VNIC_ENCAP_LEN)
+#define SMALL_PACKET_SIZE (256 - NET_IP_ALIGN)
+#define HEADER_COPY_SIZE (128 - NET_IP_ALIGN)
+#define MAX_HEADER_SIZE 64
+
+#define LAG_MAP_TABLE_SIZE 32
+#define MAX_LAG_MEMBERS 16
+
+#define VNIC_FW_STR_MAX VNIC_ETHTOOL_LINE_MAX
+#define VNIC_FW_STR(u64_fw_ver, str) \
+do { \
+ snprintf(str, VNIC_FW_STR_MAX, "%d.%d.%d", \
+ (int)(u64_fw_ver >> 32), \
+ (int)(u64_fw_ver >> 16) & 0xffff, \
+ (int)(u64_fw_ver & 0xffff)); \
+} while (0)
+#define VNIC_STR_STRIP(str) \
+do { \
+ int i; \
+ for (i = 0; i < strlen(str); ++i) \
+ str[i] = str[i] == '\n' ? ' ' : str[i]; \
+} while (0)
+
+/* well known addresses */
+static const u8 ETH_BCAST_MAC[] = {
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+static const u8 ETH_ZERO_MAC[] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+/* this is used in no_bxm mode only */
+static const u8 NO_BXM_MGID_PREFIX[] = {
+ 0xff, 0x13, 0xe0, 0x1b, 0x00
+};
+
+#define IS_ZERO_MAC(mac) (!memcmp((mac), ETH_ZERO_MAC, ETH_ALEN))
+#define IS_BCAST_MAC(mac) (!memcmp((mac), ETH_BCAST_MAC, ETH_ALEN))
+#define IS_MCAST_MAC(mac) (((unsigned char *)(mac))[0] & 0x01)
+#define IS_UCAST_MAC(mac) (!(IS_MCAST_MAC(mac)))
+#define IS_NEIGH_QUERY_RUNNING(neigh) \
+ (neigh->query_id >= 0 && !IS_ERR(neigh->pquery) && neigh->pquery)
+
+struct mcast_root {
+ struct rb_root mcast_tree;
+ spinlock_t mcast_rb_lock;
+ struct list_head reattach_list;
+};
+
+/* structs */
+struct vnic_port_stats {
+ unsigned long gro_held;
+ unsigned long gro_merged;
+ unsigned long gro_normal;
+ unsigned long gro_drop;
+ unsigned long lro_aggregated;
+ unsigned long lro_flushed;
+ unsigned long lro_no_desc;
+ unsigned long tso_packets;
+ unsigned long queue_stopped;
+ unsigned long wake_queue;
+ unsigned long tx_timeout;
+ unsigned long rx_chksum_good;
+ unsigned long rx_chksum_none;
+ unsigned long tx_chksum_offload;
+ unsigned long sig_ver_err;
+ unsigned long vlan_err;
+ unsigned long shared_packets;
+ unsigned long runt_packets;
+ unsigned long realloc_packets;
+ unsigned long gw_tx_packets;
+ unsigned long gw_tx_bytes;
+};
+
+#define VNIC_STATS_DO_ADD(var, val) ((var) += (unsigned long)(val))
+#define VNIC_STATS_DO_INC(var) (++(var))
+#ifdef VNIC_EXTRA_STATS /* for performance */
+#define VNIC_STATS_ADD(var, val) ((var) += (unsigned long)(val))
+#define VNIC_STATS_INC(var) (++(var))
+#else
+#define VNIC_STATS_ADD(var, val) do { } while (0)
+#define VNIC_STATS_INC(var) do { } while (0)
+#endif
+
+enum {
+ MCAST_ATTACHED,
+ MCAST_JOINED,
+ MCAST_JOIN_STARTED,
+ MCAST_JOIN_RUNNING,
+ MCAST_ATTACH_RUNNING,
+};
+
+struct vnic_port_mcast {
+ struct rb_node rb_node;
+ struct list_head list;
+ union ib_gid gid;
+ struct vnic_port *port;
+ struct completion leave_complete;
+ struct completion join_event_complete;
+ struct ib_sa_multicast *sa_mcast;
+ struct ib_sa_mcmember_rec rec;
+
+ atomic_t ref_cnt;
+ struct delayed_work join_task;
+ struct work_struct leave_task;
+ unsigned long join_task_cnt;
+ long int state;
+ spinlock_t lock;
+ u8 join_state;
+ /* IN */
+ unsigned long backoff;
+ unsigned long backoff_init;
+ unsigned long backoff_factor;
+ unsigned long retry;
+ u16 pkey;
+ u32 qkey;
+ u8 create;
+};
+
+struct vnic_mcast {
+ struct vnic_port_mcast *port_mcaste;
+ u32 qkey;
+ u16 pkey;
+ struct ib_qp *qp;
+ struct vnic_port *port;
+ struct ib_ah *ah;
+ struct completion attach_complete;
+ struct delayed_work attach_task;
+ struct delayed_work detach_task;
+ unsigned long attach_task_cnt;
+ struct rb_node rb_node;
+ struct list_head list; /* used when delete all */
+ /* IN */
+ u8 mac[ETH_ALEN];
+ union ib_gid gid;
+ union ib_gid port_gid;
+ unsigned long backoff;
+ unsigned long backoff_init;
+ unsigned backoff_factor;
+ unsigned long retry;
+ unsigned long state;
+ u8 blocking;
+ void *attach_cb_ctx;
+ void *detach_cb_ctx;
+ void (*attach_cb) (struct vnic_mcast *mcaste, void *ctx);
+ void (*detach_cb) (struct vnic_mcast *mcaste, void *ctx);
+ u8 create;
+ u8 join_state;
+ void *priv_data;
+ spinlock_t lock;
+ int attach_bit_nr;
+ unsigned long *req_attach;
+ unsigned long *cur_attached;
+ int sender_only;
+};
+
+struct vnic_mac {
+ struct rb_node rb_node; /* list or RB tree */
+ struct list_head list;
+ u16 vnic_id; /* needed for vnic child removal */
+ u8 mac[ETH_ALEN]; /* key */
+ unsigned long created;
+	unsigned long last_tx; /* use jiffies_to_timeval */
+};
+
+struct lag_properties {
+ u16 hash_mask;
+ u8 weights_policy;
+	u8 ca; /* congestion aware */
+ u8 ca_thresh;
+};
+
+struct vnic_neigh {
+ struct neighbour *neighbour;
+ struct ib_ah *ah;
+ struct vnic_login *login;
+ struct rb_node rb_node;
+ struct ib_sa_query *pquery;
+ struct completion query_comp;
+ int query_id;
+ struct sk_buff_head pkt_queue;
+ struct delayed_work destroy_task;
+ u8 valid;
+ u32 qpn;
+ u16 lid;
+ u8 sl; /* only for debug */
+ u8 mac[ETH_ALEN];
+ u8 rss;
+ u16 info;
+};
+
+enum lag_gw_state {
+ GW_MEMBER_INFO_CREATED = 1 << 0,
+ GW_MEMBER_INFO_EPORT_UP = 1 << 1,
+ GW_MEMBER_INFO_MCAST = 1 << 2,
+ GW_MEMBER_INFO_MAPPED = 1 << 3,
+};
+
+struct vnic_gw_info {
+ enum lag_gw_state info;
+ int member_id;
+ u16 gw_id;
+ struct vnic_neigh neigh;
+};
+
+struct vnic_sysfs_attr {
+ void *ctx;
+ struct kobject *kobj;
+ unsigned long data;
+ char name[VNIC_SYSFS_FLEN];
+ struct module_attribute dentry;
+ struct device *dev;
+};
+
+enum gw_ext_lag_hash_policy {
+ GW_LAG_HASH_DMAC = 1 << 0,
+ GW_LAG_HASH_SMAC = 1 << 1,
+ GW_LAG_HASH_TPID = 1 << 2, /* ethertype */
+ GW_LAG_HASH_VID = 1 << 3,
+ GW_LAG_HASH_SIP = 1 << 4,
+ GW_LAG_HASH_DIP = 1 << 5,
+ GW_LAG_HASH_IP_NEXT = 1 << 6,
+ GW_LAG_HASH_SPORT = 1 << 7,
+ GW_LAG_HASH_DPORT = 1 << 8,
+ GW_LAG_LAYER_2_3 = 0x1f0
+};
+
+struct vnic_tx_buf {
+ struct sk_buff *skb;
+ u64 mapping[VNIC_MAX_TX_FRAGS];
+ u8 ip_off;
+ u8 ip6_off;
+ u8 tcp_off;
+ u8 udp_off;
+ void *phead;
+ int hlen;
+};
+
+enum {
+#if 1
+ FRAG_SZ0 = 536 - NET_IP_ALIGN, /* so 1500 mtu fits in first 2 frags */
+ FRAG_SZ1 = 1024,
+ FRAG_SZ2 = 2048,
+ FRAG_SZ3 = 4096 - FRAG_SZ2 - FRAG_SZ1 - FRAG_SZ0
+#else
+ FRAG_SZ0 = 512 - NET_IP_ALIGN,
+ FRAG_SZ1 = 1024,
+ FRAG_SZ2 = 2048,
+ FRAG_SZ3 = 4096 << VNIC_ALLOC_ORDER
+#endif
+};
+
+struct vnic_frag_info {
+ u16 frag_size;
+ u16 frag_prefix_size;
+ u16 frag_stride;
+ u16 frag_align;
+ u16 last_offset;
+};
+
+struct vnic_rx_alloc {
+ struct page *page;
+ u16 offset;
+};
+
+struct vnic_frag_data {
+ struct skb_frag_struct frags[VNIC_MAX_RX_FRAGS];
+ u64 dma_addr[VNIC_MAX_RX_FRAGS];
+ struct sk_buff *skb; /* used only for linear buffers mode */
+};
+
+struct vnic_rx_ring {
+ struct vnic_port *port;
+ int index;
+ struct vnic_rx_alloc page_alloc[VNIC_MAX_RX_FRAGS];
+
+ u32 size; /* number of RX descs */
+ spinlock_t lock;
+ struct vnic_frag_data *rx_info;
+
+ struct vnic_frag_info frag_info[VNIC_MAX_RX_FRAGS];
+ u32 rx_skb_size;
+ u16 log_rx_info;
+ u16 num_frags;
+
+ struct ib_recv_wr wr;
+ struct ib_sge sge[VNIC_MAX_RX_FRAGS];
+
+ struct ib_srq *srq;
+ struct net_device_stats stats;
+};
+
+/* vnic states
+   these values can be used only in struct fip_vnic_data.login_state */
+enum {
+ VNIC_STATE_LOGIN_OFF = 0,
+ VNIC_STATE_LOGIN_PRECREATE_1,
+ VNIC_STATE_LOGIN_PRECREATE_2,
+ VNIC_STATE_LOGIN_CREATE_1,
+ VNIC_STATE_LOGIN_CREATE_2,
+ VNIC_STATE_LOGIN_BCAST_ATTACH = 31
+};
+
+/* netdevice open state, depends on calls to open/stop
+   these values can be used only in struct vnic_login.netdev_state */
+enum {
+ VNIC_STATE_NETDEV_OFF = 0,
+ VNIC_STATE_NETDEV_OPEN_REQ,
+ VNIC_STATE_NETDEV_OPEN,
+ VNIC_STATE_NETDEV_CARRIER_ON,
+ VNIC_STATE_NETDEV_NO_TX_ENABLE = 31
+};
+
+struct vnic_rx_res {
+ struct vnic_login *login;
+ struct ib_cq *cq;
+ struct net_lro_mgr lro;
+ struct net_lro_desc lro_desc[VNIC_MAX_LRO_DESCS];
+ struct ib_wc recv_wc[VNIC_MAX_RX_CQE];
+ int index;
+ int stopped;
+#ifndef _BP_NAPI_POLL
+ struct napi_struct napi;
+#else
+ struct net_device *poll_dev;
+#endif
+};
+
+struct vnic_tx_res {
+ struct vnic_tx_buf *tx_ring;
+ struct ib_sge tx_sge[VNIC_MAX_TX_FRAGS];
+ struct ib_wc send_wc[VNIC_MAX_TX_CQE];
+ struct ib_send_wr tx_wr;
+ struct vnic_login *login;
+ struct ib_cq *cq;
+ unsigned tx_head;
+ unsigned tx_tail;
+ unsigned tx_outstanding;
+ unsigned tx_stopped_cnt;
+ struct net_device_stats stats;
+ struct ib_ah_attr mcast_av;
+ u8 lso_hdr[VNIC_MAX_PAYLOAD_SIZE];
+ int index;
+ int stopped;
+ spinlock_t lock;
+};
+
+#ifdef VNIC_PROFILLNG
+#define VNIC_PROFILLNG_SKB_MAX 100
+struct vnic_prof_skb_entry {
+ struct sk_buff skb;
+ struct timespec tstamp;
+ unsigned long jiffies;
+ int cnt;
+ u8 nr_frags;
+};
+#endif
+
+struct vnic_qp_res {
+ struct vnic_login *login;
+ struct ib_qp *qp;
+ struct completion last_wqe_complete;
+ int tx_index;
+ int rx_index;
+};
+
+/*
+ * Wrapper struct for vnic_login, used as netdev private data.
+ * Some kernels (such as 2.6.18-194.26.1) don't allow a private
+ * data struct longer than 64KB (NETDEV_PRIV_LEN_MAX), so we
+ * allocate the private data separately to work around this limit.
+ */
+struct vnic_login_info {
+ struct vnic_login *login;
+};
+
+struct vnic_login {
+ spinlock_t lock;
+ spinlock_t stats_lock;
+ struct net_device *dev;
+ struct ethtool_drvinfo drvinfo;
+ struct vnic_port *port;
+ char desc[VNIC_DESC_LEN];
+ struct fip_vnic_data *fip_vnic; /* for ethtool/sysfs*/
+ int queue_stopped;
+ unsigned long netdev_state;
+ char name[VNIC_NAME_LEN];
+ char vnic_name[VNIC_NAME_LEN];
+ char vendor_id[VNIC_VENDOR_LEN];
+ struct vnic_neigh *gw_neigh;
+ struct vnic_gw_info lag_gw_neigh[MAX_LAG_MEMBERS];
+ struct lag_properties lag_prop;
+ int is_lag;
+ int lag_gw_map[LAG_MAP_TABLE_SIZE];
+ int lag_member_count;
+ int lag_member_active_count;
+ union ib_gid gw_mgid;
+ int promisc;
+ union ib_gid gid;
+ __be16 vid;
+ u8 vlan_used;
+ u32 qkey;
+ u16 pkey;
+ u16 pkey_index;
+ u64 gw_guid;
+ u8 mgid_prefix[VNIC_MGID_PREFIX_LEN];
+ u8 n_mac_mcgid;
+ u8 sl;
+ u16 gw_port_id;
+ u16 vnic_id;
+ unsigned int max_mtu;
+ int zlen;
+ int cnt;
+ unsigned qps_num;
+ u32 qp_base_num;
+ u8 dev_addr[ETH_ALEN];
+ u8 all_vlan_gw;
+
+ /* statistics */
+ struct net_device_stats stats;
+ struct vnic_port_stats port_stats;
+
+ /* tasks */
+ struct work_struct mcast_restart;
+ struct delayed_work stats_task;
+ struct delayed_work mcast_task;
+ struct delayed_work restart_task;
+ struct mutex moder_lock;
+ struct mutex state_lock;
+
+ /* data structures */
+ struct workqueue_struct *neigh_wq;
+ struct rb_root neigh_tree;
+ struct rb_root mac_tree;
+ atomic_t vnic_child_cnt;
+ rwlock_t mac_rwlock;
+ struct mcast_root mcast_tree;
+ struct vnic_sysfs_attr dentries[VNIC_MAX_DENTRIES];
+ struct list_head list;
+
+ /* QP resources */
+ struct vnic_qp_res qp_res[VNIC_MAX_NUM_CPUS];
+
+	/* RX resources */
+ struct vnic_rx_res rx_res[VNIC_MAX_NUM_CPUS];
+ struct ib_recv_wr rx_wr;
+ u32 lro_num;
+ unsigned lro_mng_num;
+ int rx_csum;
+ unsigned napi_num;
+ unsigned rx_rings_num;
+
+ /* TX resources */
+ struct vnic_tx_res tx_res[VNIC_MAX_NUM_CPUS];
+ unsigned tx_rings_num;
+ unsigned real_tx_rings_num;
+ unsigned ndo_tx_rings_num;
+ u8 *pad_va;
+ u64 pad_dma;
+
+ /* for profiling */
+#ifdef VNIC_PROFILLNG
+ struct vnic_prof_skb_entry prof_arr[VNIC_PROFILLNG_SKB_MAX];
+ int prof_arr_it;
+#endif
+	/* interrupt coalescence */
+ u16 rx_usecs;
+ u16 rx_frames;
+ u32 pkt_rate_low;
+ u16 rx_usecs_low;
+ u32 pkt_rate_high;
+ u16 rx_usecs_high;
+ u16 sample_interval;
+ u16 adaptive_rx_coal;
+ unsigned long last_moder_packets;
+ unsigned long last_moder_tx_packets;
+ unsigned long last_moder_bytes;
+ unsigned long last_moder_jiffies;
+ unsigned long last_moder_time;
+ u16 tx_usecs;
+ u16 tx_frames;
+ u8 shared_vnic;
+ u8 shared_mac[ETH_ALEN];
+};
+
+struct eoibhdr {
+ __u8 encap_data;
+ __u8 seg_off;
+ __be16 seg_id;
+};
+
+struct vnic_ib_dev {
+ char name[VNIC_DESC_LEN];
+ struct mutex mlock;
+ struct list_head list;
+ struct list_head port_list;
+ struct ib_device *ca;
+ struct mlx4_ib_dev *mdev;
+ struct ib_device_attr attr;
+ char fw_ver_str[VNIC_FW_STR_MAX];
+};
+
+struct fip_ring_entry {
+ void *mem;
+ u64 bus_addr;
+ int length;
+ int entry_posted;
+};
+
+struct fip_ring {
+ int size;
+ struct fip_ring_entry *ring;
+ unsigned long head;
+ unsigned long tail;
+ spinlock_t ring_lock;
+ spinlock_t head_tail_lock;
+};
+
+enum fip_discover_state {
+ FIP_DISCOVER_OFF,
+ FIP_DISCOVER_INIT,
+ FIP_DISCOVER_SOLICIT,
+ FIP_DISCOVER_CLEAR
+};
+
+#define MAX_INPUT_LEN 64
+#define MAX_INPUT_ARG 12
+struct fip_hadmin_cmd {
+ u8 c_name [MAX_INPUT_LEN];
+ u8 c_mac [MAX_INPUT_LEN];
+ u8 c_vnic_id [MAX_INPUT_LEN];
+ u8 c_vid [MAX_INPUT_LEN];
+ u8 c_bxname [MAX_INPUT_LEN];
+ u8 c_bxguid [MAX_INPUT_LEN];
+ u8 c_eport [MAX_INPUT_LEN];
+ u8 c_ipv4 [MAX_INPUT_LEN];
+ u8 c_ipv6 [MAX_INPUT_LEN];
+ u8 c_emac [MAX_INPUT_LEN];
+ u8 c_pkey [MAX_INPUT_LEN];
+ u8 c_parent [MAX_INPUT_LEN];
+};
+
+struct fip_hadmin_cache {
+ struct fip_hadmin_cmd cmd;
+ u8 system_guid[GUID_LEN];
+ u8 system_name[VNIC_SYSTEM_NAME_LEN];
+ u8 eport_name[VNIC_GW_PORT_NAME_LEN];
+ u8 mac[ETH_ALEN];
+ u16 vnic_id;
+ u16 gw_port_id;
+ u16 vlan;
+ u8 vlan_used;
+ u8 all_vlan_gw;
+ u8 interface_name[VNIC_NAME_LEN];
+ u8 parent_name[VNIC_NAME_LEN];
+ int parent_used;
+ int remove;
+ struct list_head next;
+ u32 qp_base_num;
+ u8 shared_vnic_ip[IPV4_LEN];
+ u8 shared_vnic_mac[ETH_ALEN];
+};
+
+struct pkt_rcv_list {
+ struct list_head list;
+ spinlock_t lock;
+};
+
+struct fip_discover {
+ char name[VNIC_NAME_LEN];
+ struct vnic_port *port;
+ struct list_head discover_list;
+ spinlock_t lock;
+ struct list_head gw_list;
+	struct rw_semaphore l_rwsem; /* gw list rw semaphore */
+ int hadmin_update;
+ struct list_head hadmin_cache;
+ enum fip_discover_state state;
+ int flush;
+ struct completion flush_complete;
+ struct ib_cq *cq;
+ struct ib_qp *qp;
+ struct fip_ring rx_ring;
+ struct fip_ring tx_ring;
+ struct mcast_root mcast_tree;
+ struct delayed_work fsm_task;
+ struct delayed_work cleanup_task;
+ struct delayed_work hadmin_update_task;
+ struct work_struct pkt_rcv_task_bh;
+ struct pkt_rcv_list rcv_list;
+
+ int mcast_dest_mask;
+ unsigned long discover_mcast_attached_jiffies;
+ unsigned long discover_mcast_detached_jiffies;
+ unsigned long discover_mcast_state;
+ u16 pkey;
+ u16 pkey_index;
+ unsigned long req_attach;
+ unsigned long cur_attached;
+ unsigned new_prot_gws;
+ unsigned old_prot_gws;
+};
+
+struct fip_root {
+ struct list_head discover_list;
+};
+
+struct port_fs_dentry {
+ struct module_attribute fs_entry;
+ struct vnic_port *port;
+};
+
+struct vnic_port {
+ char name[VNIC_DESC_LEN];
+ u8 num;
+ int rx_rings_num;
+ int tx_rings_num;
+ struct vnic_ib_dev *dev;
+ struct mcast_root mcast_tree;
+ struct list_head list;
+ struct list_head login_list;
+ struct delayed_work event_task;
+ struct delayed_work event_task_light;
+ struct delayed_work discover_restart_task;
+ struct ib_event_handler event_handler;
+ struct ib_port_attr attr;
+ union ib_gid gid;
+ int rate;
+ u8 rate_enum;
+ atomic_t vnic_child_ids;
+
+ /* IB resources per port */
+ struct vnic_rx_ring *rx_ring[VNIC_MAX_NUM_CPUS];
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+
+ /* for FIP */
+ struct mutex mlock;
+ struct mutex start_stop_lock;
+ u16 pkey_index;
+ u16 pkey;
+ int max_mtu_enum;
+ struct fip_root fip;
+ struct vnic_sysfs_attr dentries[VNIC_MAX_DENTRIES];
+};
+
+enum fip_vnic_state {
+ FIP_VNIC_CLOSED = 0,
+ FIP_VNIC_HADMIN_IDLE = 1<<0,
+ FIP_VNIC_LOGIN = 1<<1,
+ FIP_VNIC_WAIT_4_ACK = 1<<2,
+ FIP_VNIC_RINGS_INIT = 1<<3, /* temporary, create rings */
+ FIP_VNIC_MCAST_INIT = 1<<4, /* temporary, start mcast attach */
+ FIP_VNIC_MCAST_INIT_DONE= 1<<5, /* wait for mcast cb */
+ FIP_VNIC_VHUB_INIT = 1<<6,
+ FIP_VNIC_VHUB_INIT_DONE = 1<<7, /* wait for vhub table */
+ FIP_VNIC_VHUB_DONE = 1<<8,
+ FIP_VNIC_VHUB_WRITE = 1<<9,
+ FIP_VNIC_CONNECTED = 1<<10
+};
+
+enum vhub_table_state {
+ VHUB_TBL_INIT,
+ VHUB_TBL_UP2DATE,
+ VHUB_TBL_UPDATED
+};
+
+struct vhub_elist {
+ u32 tusn;
+ int count;
+ int total_count;
+ struct list_head vnic_list; /* chain vnics */
+};
+
+struct vnic_table_entry {
+ u32 qpn;
+ u16 lid;
+ u8 mac[ETH_ALEN];
+ u8 sl;
+
+ struct list_head list;
+ u8 rss;
+ u8 valid;
+};
+
+struct vhub_table {
+ enum vhub_table_state state;
+ u32 checksum;
+ u32 tusn;
+ struct vhub_elist main_list;
+ struct vhub_elist update_list;
+};
+
+struct fip_shared_vnic_data {
+ u8 ip[IPV4_LEN];
+ u8 emac[ETH_ALEN];
+ u8 enabled;
+ u8 arp_proxy;
+};
+
+struct lag_member {
+ u32 qpn;
+ u8 sl;
+ u16 gw_port_id;
+ u16 lid;
+ u8 guid[GUID_LEN];
+ u8 eport_state;
+ u8 weight;
+ u8 link_utilization;
+};
+
+struct lag_members {
+ int num;
+ long used_bitmask;
+ struct lag_properties prop;
+ struct lag_member memb[MAX_LAG_MEMBERS];
+};
+
+struct fip_login_data {
+ u32 qpn;
+ u32 ctl_qpn;
+ u16 port_id; /* must always be uptodate */
+ u16 lid; /* must always be uptodate */
+ u16 vlan;
+ u16 pkey;
+ u16 pkey_index;
+ u16 vnic_id; /* must always be uptodate */
+ u32 vhub_id;
+ u16 mtu;
+
+ u8 sl; /* service level -- 4 bits */
+ u8 guid[GUID_LEN];
+ u8 mac[ETH_ALEN];
+ u8 mgid_prefix[VNIC_MGID_PREFIX_LEN];
+ u8 vnic_name[VNIC_NAME_LEN];
+ u8 vendor_id[VNIC_VENDOR_LEN];
+ u8 n_mac_mcgid;
+ u8 n_rss_mgid;
+ u8 syndrome; /* must always be uptodate */
+
+ u8 vp; /* 1 bit: do we use vlan */
+ u8 all_vlan_gw; /* 1 bit.
+ is promisc vlan supported on this vnic */
+ struct lag_members lagm;
+};
+
+enum fip_flush {
+ FIP_NO_FLUSH,
+	FIP_PARTIAL_FLUSH, /* use this for events caused by vnic/gw logic */
+ FIP_FULL_FLUSH /* use this for events caused by unload, host admin destroy */
+};
+
+struct fip_vnic_send_info {
+ u32 gw_qpn;
+ u32 qkey;
+ u16 gw_lid;
+ u8 gw_sl;
+};
+
+/*
+ * This struct holds informational data about the GW that can change without
+ * implications for the GW or vnic logic (it is only reported to the user).
+ */
+struct fip_gw_volatile_info {
+ u8 system_guid[GUID_LEN];
+ u8 system_name[VNIC_SYSTEM_NAME_LEN+1];
+ u8 gw_port_name[VNIC_GW_PORT_NAME_LEN+1];
+};
+
+struct fip_vnic_data {
+ char name[VNIC_NAME_LEN];
+ enum fip_vnic_state state;
+ enum fip_flush flush;
+ spinlock_t lock;
+ spinlock_t ka_lock;
+ struct vnic_sysfs_attr dentry;
+ unsigned long login_state;
+
+ /* data structures maintenance */
+ struct fip_gw_data *gw;
+ struct vnic_port *port;
+ struct list_head gw_vnics;
+ struct vhub_table vhub_table;
+
+ /* execution maintenance */
+ unsigned long update_jiffs;
+ unsigned long keep_alive_jiffs;
+ unsigned long detached_ka_jiffs;
+ unsigned long vnic_mcaste_state;
+ struct delayed_work vnic_task;
+ struct hrtimer keepalive_timer;
+ struct list_head timer;
+ struct delayed_work vnic_gw_alive_task;
+ struct work_struct vnic_pkt_rcv_task_bh;
+ struct work_struct vnic_login_destroy_task;
+ struct work_struct vnic_login_create_task;
+ struct pkt_rcv_list vnic_rcv_list;
+ struct fip_vnic_send_info gw_address;
+
+ /* vnic driver API */
+ struct vnic_login *login;
+ unsigned long login_status;
+ int qps_num;
+ u32 qp_base_num;
+ int parent_used;
+ u8 parent_name[VNIC_NAME_LEN];
+
+ /* rx + tx data structures */
+ struct ib_cq *cq;
+ struct ib_qp *qp;
+ struct fip_ring rx_ring;
+ struct fip_ring tx_ring;
+ struct ib_ah *ah;
+
+ /* data domain */
+ union ib_gid mgid;
+
+ /* vHub context update mcast groups */
+ struct mcast_root mcast_tree;
+ struct fip_login_data login_data;
+ struct fip_shared_vnic_data shared_vnic;
+ u16 mlid;
+ /* u16 pkey_index; not used for now */
+
+ u16 vnic_id; /* unique id for GW */
+ u16 vlan;
+ u8 vlan_used;
+ u8 all_vlan_gw;
+ u16 pkey;
+ u16 pkey_index;
+ u8 hadmined; /* todo, use the state for this */
+ u8 interface_name[VNIC_NAME_LEN];
+ u8 mac_cache[ETH_ALEN];
+ atomic_t eport_state;
+ unsigned long last_send_jiffs;
+ int retry_count;
+ int synd_backlog;
+ struct fip_hadmin_cmd cmd;
+ struct fip_gw_volatile_info gw_info;
+ struct lag_members lm;
+ unsigned long req_attach;
+ unsigned long cur_attached;
+ union ib_gid ka_mcast_gid;
+};
+
+enum vhub_mgid_type {
+ VHUB_MGID_DATA = 0,
+ VHUB_MGID_UPDATE = 2,
+ VHUB_MGID_TABLE = 3,
+ VHUB_MGID_KA = 5,
+};
+
+enum fip_all_mgids {
+ FIP_MCAST_DISCOVER,
+ FIP_MCAST_SOLICIT,
+ FIP_MCAST_VHUB_DATA,
+ FIP_MCAST_VHUB_UPDATE,
+ FIP_MCAST_TABLE,
+ FIP_MCAST_VHUB_KA,
+};
+
+union vhub_mgid {
+ struct mgid {
+ u8 mgid_prefix[VNIC_MGID_PREFIX_LEN];
+ u8 type;
+ u8 dmac[ETH_ALEN];
+ u8 rss_hash;
+ u8 vhub_id[3];
+ } mgid;
+ union ib_gid ib_gid;
+};
+
+void vnic_carrier_update(struct vnic_login *login);
+int vnic_param_check(void);
+
+/* mac table funcs */
+void vnic_learn_mac(struct net_device *dev, u8 *mac, int remove);
+void vnic_child_flush(struct vnic_login *login, int all);
+int vnic_child_update(struct vnic_login *login, u8 *mac, int remove);
+int vnic_mace_update(struct vnic_login *login, u8 *mac, u16 vnic_id, int remove);
+int vnic_parent_update(struct vnic_port *port, char *name, u16 vnic_id,
+ u8 *mac, u32 *qp_base_num_ptr, char *parent_name,
+ int remove);
+
+/* mcast funcs */
+int vnic_mcast_init(void);
+void vnic_mcast_cleanup(void);
+
+/*
+ * A helper function to prevent code duplication. Receives a multicast mac
+ * and a gw_id and attaches it (join + attach). The function also receives
+ * a default_mcaste (used for the MGID over default MLID hack) and a user list.
+ * Returns 0 on success and non-zero on failure.
+ *
+ * in: mmac - to be used in creation MGID address
+ * in: default_mcaste - mcaste entry of the default MGID. Can be NULL
+ * in: private_data - A user pointer that can be used to identify owner
+ * in: gw_id - to be used in creation MGID address
+ */
+int _vnic_mcast_attach_mgid(struct vnic_login *login,
+ char *mmac,
+ struct vnic_mcast *default_mcaste,
+ void *private_data,
+ u16 gw_id);
+
+struct vnic_mcast *vnic_mcast_alloc(struct vnic_port *port,
+ unsigned long *req_attach,
+ unsigned long *cur_attach);
+/*
+ * A helper function to prevent code duplication. Fills vnic_mcast struct with
+ * common values.
+ *
+ * in: mcaste - mcaste to fill
+ * in: gw_id - to be used in creation MGID address
+ * in: mac - to be used in creation MGID address
+ * in: rss_hash - to be used in creation MGID address (usually 0)
+ * in: create - value of create field in mcaste
+ */
+void __vnic_mcaste_fill(struct vnic_login *login, struct vnic_mcast *mcaste,
+ u16 gw_id, const u8 *mac, u8 rss_hash, int create);
+
+void vnic_mcast_dealloc(struct vnic_mcast *mcaste);
+
+int vnic_mcast_attach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste);
+int vnic_mcast_detach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste);
+
+/*
+ * This function grabs the mcast_tree->mcast_rb_lock
+*/
+int vnic_mcast_add(struct mcast_root *mcast_tree,
+ struct vnic_mcast *mcaste);
+int vnic_mcast_del_all(struct mcast_root *mcast_tree);
+int vnic_mcast_del_user(struct mcast_root *mcast_tree, void *owner);
+
+void vnic_tree_mcast_detach(struct mcast_root *mcast_tree);
+void vnic_tree_mcast_attach(struct mcast_root *mcast_tree);
+
+/*void vnic_port_mcast_del_all(struct mcast_root *port); */
+static inline void vnic_mcast_root_init(struct mcast_root *mcast_tree)
+{
+ spin_lock_init(&mcast_tree->mcast_rb_lock);
+ INIT_LIST_HEAD(&mcast_tree->reattach_list);
+}
+
+/* port funcs */
+int vnic_ports_init(void);
+void vnic_ports_cleanup(void);
+
+/*
+ * The caller must hold mcast_tree->mcast_rb_lock before calling
+*/
+void vnic_mcast_del(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste);
+struct vnic_mcast *vnic_mcast_search(struct mcast_root *mcast_tree,
+ union ib_gid *gid);
+void port_fip_discover_restart(struct work_struct *work);
+int vnic_port_fip_init(struct vnic_port *port);
+void vnic_port_fip_cleanup(struct vnic_port *port, int lock);
+
+/* others */
+void fip_refresh_mcasts(struct fip_discover *discover);
+void vnic_login_refresh_mcasts(struct vnic_port *port);
+
+/* There are 2 different create flows, for host admin and net admin.
+ * In net admin we always create the vnic after connecting to the GW, but we do
+ * not yet know the vnic details (mac, vlan etc.). We do know the ring parameters
+ * and need to create the RX/TX rings before login.
+ * To accomplish this we call vnic_login_pre_create_1 and vnic_login_pre_create_2,
+ * and after the login ACK we call vnic_login_register_netdev and vnic_login_complete_ack.
+ * In host admin we know the vnic info but not the GW info when we create the
+ * vnic, so we call vnic_login_pre_create_1 and vnic_login_register_netdev; after
+ * getting the login ACK we call vnic_login_pre_create_2 and vnic_login_complete_ack.
+ */
+int vnic_login_register_netdev(struct fip_vnic_data *vnic,
+ const char *mac,
+ const char *name);
+int vnic_login_complete_ack(struct fip_vnic_data *vnic,
+ struct fip_login_data *login_data,
+ struct fip_shared_vnic_data *shared_vnic);
+int vnic_login_pre_create_1(struct vnic_port *port,
+ struct fip_vnic_data *vnic);
+int vnic_login_pre_create_2(struct fip_vnic_data *vnic, int qps_num, int is_lag);
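+
+/*
+ * Illustrative call-order sketch for the two flows described above. This is
+ * not a real call site: error handling and the surrounding FIP logic are
+ * omitted, and the port/vnic/login_data/shared_vnic arguments are assumed to
+ * be already set up by the caller.
+ *
+ *   Net admin:
+ *     vnic_login_pre_create_1(port, vnic);
+ *     vnic_login_pre_create_2(vnic, qps_num, is_lag);
+ *     ... login ACK arrives ...
+ *     vnic_login_register_netdev(vnic, mac, name);
+ *     vnic_login_complete_ack(vnic, login_data, shared_vnic);
+ *
+ *   Host admin:
+ *     vnic_login_pre_create_1(port, vnic);
+ *     vnic_login_register_netdev(vnic, mac, name);
+ *     ... login ACK arrives ...
+ *     vnic_login_pre_create_2(vnic, qps_num, is_lag);
+ *     vnic_login_complete_ack(vnic, login_data, shared_vnic);
+ */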
+
+/*
+ * When destroying a login, call this to stop the login wq tasks. Do not
+ * call from login_wq context.
+*/
+void vnic_login_destroy_stop_wq(struct fip_vnic_data *vnic, enum fip_flush flush);
+/*
+ * Destroy the login data struct. Assumes all login wq tasks are stopped.
+ * Can be called from any context; might block for a few seconds.
+*/
+void vnic_login_destroy_wq_stopped(struct fip_vnic_data *vnic, enum fip_flush flush);
+
+/*
+ * Destroy a login data structure.
+ * This function cannot be called from login_wq context. If you need to run
+ * from login_wq, use the split functions vnic_login_destroy_stop_wq/wq_stopped
+ * instead.
+ */
+static inline
+void vnic_login_destroy(struct fip_vnic_data *vnic, enum fip_flush flush)
+{
+ vnic_login_destroy_stop_wq(vnic, flush);
+ vnic_login_destroy_wq_stopped(vnic, flush);
+}
+
+/* add / remove members eports from LAG GW */
+void vnic_member_prop(struct vnic_login *login, struct lag_properties *prop);
+int vnic_member_add(struct vnic_login *login, int member_id,
+ struct lag_member *emember);
+int vnic_member_remove(struct vnic_login *login, int member_id);
+int vnic_member_modify(struct vnic_login *login, int member_id,
+ struct lag_member *emember);
+void vnic_member_remove_all(struct vnic_login *login);
+
+int vnic_vhube_add(struct fip_vnic_data *vnic, struct vnic_table_entry *vhube);
+void vnic_vhube_flush(struct fip_vnic_data *vnic);
+void vnic_vhube_del(struct fip_vnic_data *vnic, u8 *mac);
+int vnic_neighe_path_query(struct vnic_neigh *neighe);
+
+void vhub_mgid_create(const char *mgid_prefix,
+ const char *mmac, /* mcast mac for bcast 0xFF.. */
+ u64 n_mac, /* bits to take from mmac */
+ u32 vhub_id,
+ enum vhub_mgid_type type,
+ u8 rss_hash,
+ union vhub_mgid *mgid);
+/*
+ * Read the state of the GW eport. Can be called from any context.
+*/
+int fip_vnic_get_eport_state(struct fip_vnic_data *vnic);
+/*
+ * get GW info funcs.
+*/
+int fip_vnic_get_eport_name(struct fip_vnic_data *vnic, char *buff);
+int fip_vnic_get_bx_name(struct fip_vnic_data *vnic, char *buff);
+int fip_vnic_get_bx_guid(struct fip_vnic_data *vnic, char *buff);
+u8 fip_vnic_get_bx_sl(struct fip_vnic_data *vnic);
+int fip_vnic_get_gw_type(struct fip_vnic_data *vnic);
+int fip_vnic_get_lag_eports(struct fip_vnic_data *vnic, char *buf);
+int fip_vnic_get_all_vlan_mode(struct fip_vnic_data *vnic, char *buff);
+
+
+/*
+ * Return a short-format string of GW info. Can be called from any context.
+*/
+int fip_vnic_get_short_gw_info(struct fip_vnic_data *vnic, char *buff);
+
+void vnic_data_cleanup(void);
+
+/*
+ * This function is called from the sysfs update callback.
+ * It parses the request, adds it to a list, and then queues a work request
+ * to process the list from the fip_wq context.
+*/
+int fip_hadmin_sysfs_update(struct vnic_port *port,
+ const char *buffer, int count, int remove);
+int fip_gw_sysfs_show(struct vnic_port *port, char *buffer);
+int vnic_login_cmd_set(char *buf, struct fip_hadmin_cmd *cmd);
+void vnic_login_cmd_init(struct fip_hadmin_cmd *cmd);
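+/*
+ * Example request string for fip_hadmin_sysfs_update() (all values below are
+ * illustrative placeholders; every field must be present, and unused fields
+ * are passed as the not-available string, assumed here to be "N/A"):
+ *
+ *   name=eth47 mac=00:11:22:33:44:55 vnic_id=3 vid=7 bxname=N/A
+ *   bxguid=00:02:c9:03:00:00:00:01 eport=A10 ipv4=N/A ipv6=N/A
+ *   emac=N/A pkey=ffff parent=N/A
+ *
+ * vnic_login_cmd_set(buf, NULL) prints the same syntax line for reference.
+ */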
+
+int fip_hadmin_vnic_refresh(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address);
+void fip_vnic_set_gw_param(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address);
+void fip_vnic_create_gw_param(struct fip_vnic_send_info *gw_address, u32 gw_qpn,
+ u32 qkey, u16 gw_lid, u8 gw_sl);
+
+int fip_vnic_hadmin_init(struct vnic_port *port, struct fip_vnic_data *vnic);
+
+int port_fs_init(struct vnic_port *port);
+void port_fs_exit(struct vnic_port *port);
+
+int vnic_port_query(struct vnic_port *port);
+
+#endif /* VNIC_H */
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _VNIC_DATA_H
+#define _VNIC_DATA_H
+
+#include "vnic.h"
+
+enum {
+ VNIC_SEND_INLINE_FLAG_POS = 63,
+};
+
+#define VNIC_SEND_INLINE_FLAG ((u64)1 << VNIC_SEND_INLINE_FLAG_POS)
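+/*
+ * Illustrative sketch (assumption, mirroring the TX completion path in
+ * vnic_data_ib.c): the inline-send indication rides in the top bit of the
+ * work request id and is stripped before indexing the TX ring:
+ *
+ *   u64 wr_id = index | (is_inline ? VNIC_SEND_INLINE_FLAG : 0);
+ *   ...
+ *   int is_inline = !!(wc->wr_id & VNIC_SEND_INLINE_FLAG);
+ *   u64 index = wc->wr_id & ~VNIC_SEND_INLINE_FLAG;
+ */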
+
+/* main funcs */
+int vnic_port_data_init(struct vnic_port *port);
+void vnic_port_data_cleanup(struct vnic_port *port);
+
+/* ib funcs */
+struct sk_buff *vnic_alloc_rx_skb(struct vnic_rx_ring *ring, int buf_ind,
+ gfp_t gfp_flag);
+int vnic_post_recv(struct vnic_rx_ring *ring, u64 wr_id);
+int vnic_post_recvs(struct vnic_rx_ring *ring);
+int vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata, int nqps,
+ int align, struct ib_qp *list[]);
+int vnic_ib_destroy_qp(struct ib_qp *qp);
+int vnic_ib_post_send(struct ib_qp *ibqp,
+ struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr,
+ u8 ip_off, u8 ip6_off,
+ u8 tcp_off, u8 udp_off);
+struct vnic_rx_ring *vnic_create_rx_ring(struct vnic_port *port, int index);
+void vnic_destroy_rx_ring(struct vnic_rx_ring *ring);
+int vnic_init_qp(struct vnic_login *login, int qp_index);
+int vnic_create_qp(struct vnic_login *login, int qp_index);
+int vnic_create_qp_range(struct vnic_login *login);
+void vnic_destroy_qp(struct vnic_login *login, int qp_index);
+int vnic_create_tx_res(struct vnic_login *login, int tx_res_index);
+int vnic_create_rx_res(struct vnic_login *login, int rx_res_index);
+void vnic_destroy_tx_res(struct vnic_login *login, int tx_res_index);
+void vnic_destroy_rx_res(struct vnic_login *login, int rx_res_index);
+
+int vnic_ib_up(struct net_device *dev);
+int vnic_ib_down(struct net_device *dev);
+int vnic_ib_open(struct net_device *dev);
+int vnic_ib_stop(struct net_device *dev);
+
+int vnic_ib_set_moder(struct vnic_login *login,
+ u16 rx_usecs, u16 rx_frames, u16 tx_usecs, u16 tx_frames);
+int vnic_port_ib_init(struct vnic_port *port);
+void vnic_port_ib_cleanup(struct vnic_port *port);
+void vnic_ib_dispatch_event(struct ib_event *event);
+#ifndef _BP_NAPI_POLL
+int vnic_poll_cq_rx(struct napi_struct *napi, int budget);
+#else
+int vnic_poll_cq_rx(struct net_device *poll_dev, int *budget);
+#endif
+void vnic_send(struct vnic_login *login, struct sk_buff *skb,
+ struct ib_ah *ah, u32 dqpn, int tx_res_index);
+void vnic_ib_free_ring(struct vnic_rx_ring *ring);
+int vnic_ib_init_ring(struct vnic_rx_ring *ring);
+
+/* netdev funcs */
+struct net_device *vnic_alloc_netdev(struct vnic_port *port);
+void vnic_free_netdev(struct vnic_login *login);
+int vnic_restart(struct net_device *dev);
+void __bcast_attach_cb(struct vnic_mcast *mcaste, void *login_ptr);
+void __bcast_detach_cb(struct vnic_mcast *mcaste, void *login_ptr);
+
+/* rx funcs */
+int vnic_rx(struct vnic_login *login, struct sk_buff *skb, struct ib_wc *wc);
+int vnic_unmap_and_replace_rx(struct vnic_rx_ring *ring, struct ib_device *dev,
+ struct skb_frag_struct *skb_frags_rx,
+ u64 wr_id, int length);
+int vnic_rx_skb(struct vnic_login *login, struct vnic_rx_ring *ring,
+ struct ib_wc *wc, int ip_summed, char *eth_hdr_va);
+
+/* tx funcs */
+int vnic_tx(struct sk_buff *skb, struct net_device *dev);
+
+/* sysfs funcs */
+int vnic_create_dentry(struct vnic_login *login);
+void vnic_delete_dentry(struct vnic_login *login);
+
+/* ethtool funcs */
+void vnic_set_ethtool_ops(struct net_device *dev);
+
+/* neigh funcs */
+void vnic_neigh_del_all(struct vnic_login *login);
+struct vnic_neigh *vnic_neighe_search(struct vnic_login *login, u8 *mac);
+void vnic_neighe_dealloc_task(struct work_struct *work);
+void vnic_neighe_dealloc(struct vnic_neigh *neighe);
+struct vnic_neigh *vnic_neighe_alloc(struct vnic_login *login,
+ const u8 *mac, u16 dlid, u32 dqpn, u8 rss);
+void vnic_neighe_del(struct vnic_login *login, struct vnic_neigh *neighe);
+int vnic_neighe_add(struct vnic_login *login, struct vnic_neigh *neighe);
+struct ib_ah *vnic_ah_alloc(struct vnic_login *login, u16 dlid);
+void vnic_neigh_invalidate(struct vnic_login *login);
+
+
+
+struct vnic_login *__vnic_login_create(struct vnic_port *port, int index);
+u32 vnic_hash(struct net_device *dev, struct sk_buff *skb);
+#endif /* _VNIC_DATA_H */
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+static struct ethtool_ops vnic_ethtool_ops;
+
+static const char vnic_strings[][ETH_GSTRING_LEN] = {
+ /* public statistics */
+ "rx_packets", "tx_packets", "rx_bytes",
+ "tx_bytes", "rx_errors", "tx_errors",
+ "rx_dropped", "tx_dropped", "multicast",
+ "collisions", "rx_length_errors", "rx_over_errors",
+ "rx_crc_errors", "rx_frame_errors", "rx_fifo_errors",
+ "rx_missed_errors", "tx_aborted_errors", "tx_carrier_errors",
+ "tx_fifo_errors", "tx_heartbeat_errors", "tx_window_errors",
+#define VNIC_PUB_STATS_LEN 21
+
+ /* private statistics */
+ "gro_held", "gro_merged", "gro_normal", "gro_drop",
+ "lro_aggregated", "lro_flushed", "lro_no_desc",
+ "tso_packets", "queue_stopped", "wake_queue",
+ "tx_timeout", "rx_chksum_good", "rx_chksum_none",
+ "tx_chksum_offload", "sig_ver_err", "vlan_err",
+ "shared_packets", "runt_packets", "realloc_packets",
+ "gw_tx_packets", "gw_tx_bytes",
+#define VNIC_PORT_STATS_LEN 21
+
+ /* packet statistics rx_prio_X (TODO) */
+#define VNIC_PKT_STATS_LEN 0
+};
+
+#define VNIC_STATS_LEN (sizeof(vnic_strings) / ETH_GSTRING_LEN)
+
+static void vnic_get_drvinfo(struct net_device *dev,
+ struct ethtool_drvinfo *drvinfo)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+
+ *drvinfo = login->drvinfo;
+}
+
+static u32 vnic_get_msglevel(struct net_device *dev)
+{
+ return vnic_msglvl;
+}
+
+static void vnic_set_msglevel(struct net_device *dev, u32 mlevel)
+{
+ vnic_msglvl = mlevel;
+}
+
+static int vnic_get_coalesce(struct net_device *dev,
+ struct ethtool_coalesce *coal)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+
+ vnic_dbg_ethtool(login->name, "get coalescing params for mtu:%d "
+ "rx_frames:%d rx_usecs:%d, "
+ "tx_frames:%d tx_usecs:%d, "
+ "adaptive_rx_coal:%d, "
+ "adaptive_tx_coal:%d\n",
+ login->dev->mtu,
+ login->rx_frames, login->rx_usecs,
+ login->tx_frames, login->tx_usecs,
+ login->adaptive_rx_coal, 0);
+
+ coal->tx_coalesce_usecs = login->tx_usecs;
+ coal->tx_max_coalesced_frames = login->tx_frames;
+ coal->rx_coalesce_usecs = login->rx_usecs;
+ coal->rx_max_coalesced_frames = login->rx_frames;
+
+ coal->pkt_rate_low = login->pkt_rate_low;
+ coal->rx_coalesce_usecs_low = login->rx_usecs_low;
+ coal->pkt_rate_high = login->pkt_rate_high;
+ coal->rx_coalesce_usecs_high = login->rx_usecs_high;
+ coal->rate_sample_interval = login->sample_interval;
+ coal->use_adaptive_rx_coalesce = login->adaptive_rx_coal;
+
+ return 0;
+}
+
+static int vnic_set_coalesce(struct net_device *dev,
+ struct ethtool_coalesce *coal)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+
+ login->rx_frames = (coal->rx_max_coalesced_frames ==
+ VNIC_AUTO_CONF) ?
+ VNIC_RX_COAL_TARGET /
+ login->dev->mtu + 1 : coal->rx_max_coalesced_frames;
+ login->rx_usecs = (coal->rx_coalesce_usecs ==
+ VNIC_AUTO_CONF) ?
+ VNIC_RX_COAL_TIME : coal->rx_coalesce_usecs;
+ login->tx_frames = coal->tx_max_coalesced_frames;
+ login->tx_usecs = coal->tx_coalesce_usecs;
+
+ /* Set adaptive coalescing params */
+ login->pkt_rate_low = coal->pkt_rate_low;
+ login->rx_usecs_low = coal->rx_coalesce_usecs_low;
+ login->pkt_rate_high = coal->pkt_rate_high;
+ login->rx_usecs_high = coal->rx_coalesce_usecs_high;
+ login->sample_interval = coal->rate_sample_interval;
+ login->adaptive_rx_coal = coal->use_adaptive_rx_coalesce;
+ login->last_moder_time = VNIC_AUTO_CONF;
+
+ if (login->adaptive_rx_coal)
+ return 0;
+
+ vnic_ib_set_moder(login,
+ login->rx_usecs, login->rx_frames,
+ login->tx_usecs, login->tx_frames);
+
+ return 0;
+}
+
+static int vnic_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+ cmd->autoneg = AUTONEG_DISABLE;
+ cmd->supported = SUPPORTED_10000baseT_Full;
+ cmd->advertising = ADVERTISED_10000baseT_Full;
+ if (netif_carrier_ok(dev)) {
+ cmd->speed = SPEED_10000;
+ cmd->duplex = DUPLEX_FULL;
+ } else {
+ cmd->speed = -1;
+ cmd->duplex = -1;
+ }
+ return 0;
+}
+
+static int vnic_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+ if ((cmd->autoneg == AUTONEG_ENABLE) ||
+ (cmd->speed != SPEED_10000) || (cmd->duplex != DUPLEX_FULL))
+ return -EINVAL;
+
+ /* Nothing to change */
+ return 0;
+}
+
+static void vnic_get_strings(struct net_device *dev,
+ uint32_t stringset, uint8_t *data)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+ int index = 0, stats_off = 0, i;
+
+ if (stringset != ETH_SS_STATS)
+ return;
+
+ /* Add main counters */
+ for (i = 0; i < VNIC_PUB_STATS_LEN; i++)
+ strcpy(data + (index++) * ETH_GSTRING_LEN,
+ vnic_strings[i + stats_off]);
+ stats_off += VNIC_PUB_STATS_LEN;
+
+ for (i = 0; i < VNIC_PORT_STATS_LEN; i++)
+ strcpy(data + (index++) * ETH_GSTRING_LEN,
+ vnic_strings[i + stats_off]);
+ stats_off += VNIC_PORT_STATS_LEN;
+
+ for (i = 0; i < VNIC_PKT_STATS_LEN; i++)
+ strcpy(data + (index++) * ETH_GSTRING_LEN,
+ vnic_strings[i + stats_off]);
+ stats_off += VNIC_PKT_STATS_LEN;
+
+ for (i = 0; i < login->tx_rings_num; i++) {
+ sprintf(data + (index++) * ETH_GSTRING_LEN,
+ "tx%d_packets", i);
+ sprintf(data + (index++) * ETH_GSTRING_LEN,
+ "tx%d_bytes", i);
+ }
+ for (i = 0; i < login->rx_rings_num; i++) {
+ sprintf(data + (index++) * ETH_GSTRING_LEN,
+ "rx%d_packets", i);
+ sprintf(data + (index++) * ETH_GSTRING_LEN,
+ "rx%d_bytes", i);
+ }
+}
+
+static void vnic_get_ethtool_stats(struct net_device *dev,
+ struct ethtool_stats *stats, uint64_t *data)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+ int index = 0, i;
+
+ spin_lock_bh(&login->stats_lock);
+
+ for (i = 0; i < VNIC_PUB_STATS_LEN; i++)
+ data[index++] = ((unsigned long *) &login->stats)[i];
+ for (i = 0; i < VNIC_PORT_STATS_LEN; i++)
+ data[index++] = ((unsigned long *) &login->port_stats)[i];
+ for (i = 0; i < VNIC_PKT_STATS_LEN; i++)
+ data[index++] = 0;
+ for (i = 0; i < login->tx_rings_num; i++) {
+ data[index++] = login->tx_res[i].stats.tx_packets;
+ data[index++] = login->tx_res[i].stats.tx_bytes;
+ }
+ for (i = 0; i < login->rx_rings_num; i++) {
+ data[index++] = login->port->rx_ring[i]->stats.rx_packets;
+ data[index++] = login->port->rx_ring[i]->stats.rx_bytes;
+ }
+ spin_unlock_bh(&login->stats_lock);
+}
+
+#ifndef _BP_ETHTOOL_NO_SSETC
+static int vnic_get_sset_count(struct net_device *dev, int sset)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+
+ switch (sset) {
+ case ETH_SS_STATS:
+ return VNIC_STATS_LEN + /* static stats + stats per ring */
+ (login->tx_rings_num + login->rx_rings_num) * 2;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+#else
+static int vnic_get_stats_count(struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+
+ return VNIC_STATS_LEN +
+ (login->tx_rings_num + login->rx_rings_num) * 2;
+}
+#endif
+
+static void vnic_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
+{
+ wol->supported = wol->wolopts = 0;
+
+ return;
+}
+
+void vnic_get_ringparam(struct net_device *dev, struct ethtool_ringparam *param)
+{
+ memset(param, 0, sizeof *param);
+ param->rx_max_pending = VNIC_MAX_RX_SIZE;
+ param->tx_max_pending = VNIC_MAX_TX_SIZE;
+ param->rx_pending = vnic_rx_rings_len;
+ param->tx_pending = vnic_tx_rings_len;
+}
+
+void vnic_set_ethtool_ops(struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+ struct mlx4_ib_dev *mlx4_ibdev = login->port->dev->mdev;
+
+ ASSERT(login);
+ ASSERT(login->port->dev->ca);
+ ASSERT(login->port->dev->ca->dma_device);
+
+ SET_ETHTOOL_OPS(dev, &vnic_ethtool_ops);
+ strncpy(login->drvinfo.driver, DRV_NAME, VNIC_ETHTOOL_LINE_MAX);
+ strncpy(login->drvinfo.version, DRV_VER, VNIC_ETHTOOL_LINE_MAX);
+ login->drvinfo.n_stats = 0;
+ login->drvinfo.regdump_len = 0;
+ login->drvinfo.eedump_len = 0;
+
+ sprintf(login->drvinfo.bus_info, "%s [%s:%d]",
+ pci_name(to_pci_dev(login->port->dev->ca->dma_device)),
+ login->port->dev->ca->name, login->port->num);
+ sprintf(login->drvinfo.fw_version, "%s [%.*s]",
+ login->port->dev->fw_ver_str, MLX4_BOARD_ID_LEN,
+ mlx4_ibdev->dev->board_id);
+ vnic_dbg_ethtool(login->name, "bus %s, port %d, fw_ver %s\n",
+ login->drvinfo.bus_info, login->port->num,
+ login->drvinfo.fw_version);
+
+ return;
+}
+
+static struct ethtool_ops vnic_ethtool_ops = {
+ .get_link = ethtool_op_get_link,
+ .get_drvinfo = vnic_get_drvinfo,
+ .get_msglevel = vnic_get_msglevel,
+ .set_msglevel = vnic_set_msglevel,
+ .get_coalesce = vnic_get_coalesce,
+ .set_coalesce = vnic_set_coalesce,
+ .get_strings = vnic_get_strings,
+ .get_ethtool_stats = vnic_get_ethtool_stats,
+#ifndef _BP_ETHTOOL_NO_SSETC
+ .get_sset_count = vnic_get_sset_count,
+#else
+ .get_stats_count = vnic_get_stats_count,
+#endif
+ .get_settings = vnic_get_settings,
+ .set_settings = vnic_set_settings,
+ .get_wol = vnic_get_wol,
+ .get_ringparam = vnic_get_ringparam,
+ .set_ringparam = NULL,
+};
+
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/version.h>
+
+#include "vnic.h"
+#include "vnic_data.h"
+#include "vnic_fip_discover.h"
+
+#define ALL_VLAN_GW_VID "all"
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0))
+#define __MODULE_KOBJ_TYPE struct module_kobject
+#else
+#define __MODULE_KOBJ_TYPE struct module
+#endif
+
+char *login_dentry_name(char *buf, struct vnic_login *login, char *str)
+{
+ snprintf(buf, VNIC_SYSFS_FLEN, "%s%d-%s", "vnic",
+ login->cnt, str);
+ return buf;
+}
+
+char *port_dentry_name(char *buf, struct vnic_port *port, char *str)
+{
+ snprintf(buf, VNIC_SYSFS_FLEN, "%s_%s_%d",
+ str, port->dev->name, port->num);
+ return buf;
+}
+
+char *vnic_dentry_name(char *buf, struct fip_vnic_data *vnic, char *str)
+{
+ snprintf(buf, VNIC_SYSFS_FLEN, "%s-%s-%s", "vnic",
+ vnic->interface_name, str);
+ return buf;
+}
+
+#ifndef _BP_NO_ATT_OWNER
+#define DENTRY_OWNER(_vdentry) \
+ (_vdentry)->dentry.attr.owner = THIS_MODULE; \
+ (_vdentry)->kobj = &vdentry->dentry.attr.owner->mkobj.kobj;
+#else
+#define DENTRY_OWNER(_vdentry) \
+ (_vdentry)->kobj = &(THIS_MODULE)->mkobj.kobj;
+#endif
+
+#define DENTRY_REMOVE(_dentry) \
+do { \
+ vnic_dbg_sysfs((_dentry)->name, "deleted\n"); \
+ sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \
+ (_dentry)->ctx = NULL; \
+} while (0);
+
+#define DENTRY_CREATE(_ctx, _dentry, _name, _show, _store) \
+do { \
+ struct vnic_sysfs_attr *vdentry = _dentry; \
+ vdentry->ctx = _ctx; \
+ vdentry->dentry.show = _show; \
+ vdentry->dentry.store = _store; \
+ vdentry->dentry.attr.name = vdentry->name; \
+ vdentry->dentry.attr.mode = 0; \
+ DENTRY_OWNER(vdentry); \
+ snprintf(vdentry->name, VNIC_SYSFS_FLEN, "%s", _name); \
+ if (vdentry->dentry.store) \
+ vdentry->dentry.attr.mode |= S_IWUSR; \
+ if (vdentry->dentry.show) \
+ vdentry->dentry.attr.mode |= S_IRUGO; \
+ vnic_dbg_sysfs(_ctx->name, "creating %s\n", \
+ vdentry->name); \
+ if (strlen(_name) > VNIC_SYSFS_FLEN) { \
+ vnic_err(_ctx->name, "name too long %d > %d\n", \
+ (int)strlen(_name), VNIC_SYSFS_FLEN); \
+ vdentry->ctx = NULL; \
+ break; \
+ } \
+ if (sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr)) { \
+ vnic_err(_ctx->name, "failed to create %s\n", \
+ vdentry->dentry.attr.name); \
+ vdentry->ctx = NULL; \
+ break; \
+ } \
+ vnic_dbg_sysfs(_ctx->name, "created %s\n", vdentry->name); \
+} while (0);
+
+/* helper functions */
+static const char *port_phys_state_str(enum ib_port_state pstate)
+{
+ switch (pstate) {
+ case 0:
+ return "no_state_change";
+ case 1:
+ return "sleep";
+ case 2:
+ return "polling";
+ case 3:
+ return "disabled";
+ case 4:
+ return "port_configuration_training";
+ case 5:
+ return "up";
+ case 6:
+ return "error_recovery";
+ case 7:
+ return "phy_test";
+ default:
+ return "invalid_state";
+ }
+}
+static const char *port_state_str(enum ib_port_state pstate)
+{
+ switch (pstate) {
+ case IB_PORT_DOWN:
+ return "down";
+ case IB_PORT_INIT:
+ return "initializing";
+ case IB_PORT_ARMED:
+ return "armed";
+ case IB_PORT_ACTIVE:
+ return "active";
+ case IB_PORT_NOP:
+ return "nop";
+ case IB_PORT_ACTIVE_DEFER:
+ return "defer";
+ default:
+ return "invalid_state";
+ }
+}
+
+/* store/show functions */
+static ssize_t vnic_neigh_show(struct module_attribute *attr,
+ __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+ char *p = buf;
+ struct vnic_sysfs_attr *vnic_dentry =
+ container_of(attr, struct vnic_sysfs_attr, dentry);
+ struct vnic_login *login = vnic_dentry->ctx;
+ struct vnic_neigh *neighe;
+ struct vnic_mcast *mcaste;
+ struct rb_node *n;
+ unsigned long flags;
+
+ /* check if GW entry is ready */
+ if (!login->gw_neigh)
+ goto out;
+ ASSERT(login->gw_neigh);
+
+ /* print GW entry */
+ neighe = login->gw_neigh;
+ p += _sprintf(p, buf, "G:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] "
+ "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d] SL[%d] VALID[%d]\n",
+ MAC_6_PRINT_ARG(neighe->mac),
+ be16_to_cpu(login->vid), login->vlan_used, neighe->qpn,
+ neighe->lid, neighe->rss, neighe->sl, neighe->valid);
+
+ /* print neigh tree entries */
+ n = rb_first(&login->neigh_tree);
+ while (n) {
+ neighe = rb_entry(n, struct vnic_neigh, rb_node);
+ p += _sprintf(p, buf, "U:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] "
+ "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d] SL[%d] VALID[%d]\n",
+ MAC_6_PRINT_ARG(neighe->mac),
+ be16_to_cpu(login->vid), login->vlan_used,
+ neighe->qpn, neighe->lid, neighe->rss, neighe->sl, neighe->valid);
+ n = rb_next(n);
+ }
+
+ /* print mcast tree entries */
+ spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags);
+ n = rb_first(&login->mcast_tree.mcast_tree);
+ while (n) {
+ u16 lid = 0xFFFF;
+ mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+ n = rb_next(n);
+ if (test_bit(MCAST_ATTACHED, &mcaste->state))
+ lid = mcaste->port_mcaste->rec.mlid;
+ p += _sprintf(p, buf, "M:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] "
+ "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d] SL[%d]\n",
+ MAC_6_PRINT_ARG(mcaste->mac),
+ 0, login->vlan_used, IB_MULTICAST_QPN, lid, 0, mcaste->port_mcaste->sa_mcast->rec.sl);
+ }
+ spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+
+out:
+ return (ssize_t)(p - buf);
+}
+
+/* store/show functions */
+static ssize_t vnic_member_show(struct module_attribute *attr,
+ __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+ char *p = buf;
+ struct vnic_sysfs_attr *vnic_dentry =
+ container_of(attr, struct vnic_sysfs_attr, dentry);
+ struct vnic_login *login = vnic_dentry->ctx;
+ int i;
+
+ if (!login->is_lag)
+ goto out;
+
+ netif_tx_lock_bh(login->dev);
+ p += _sprintf(p, buf, "GW member count=%d active count=%d hash bitmask=0x%X\n",
+ login->lag_member_count, login->lag_member_active_count, login->lag_prop.hash_mask);
+
+ p += _sprintf(p, buf, "GW hash mapping table:\n");
+
+ for (i=0; i<LAG_MAP_TABLE_SIZE; i+=8) {
+ p += _sprintf(p, buf, "%3d %3d %3d %3d %3d %3d %3d %3d\n",
+ login->lag_gw_map[i], login->lag_gw_map[i+1], login->lag_gw_map[i+2], login->lag_gw_map[i+3],
+ login->lag_gw_map[i+4], login->lag_gw_map[i+5], login->lag_gw_map[i+6], login->lag_gw_map[i+7]);
+ }
+
+ p += _sprintf(p, buf, "\nGW member state info: (0x1-created, 0x2-eport up, 0x4-mcast join complete, 0x8-member in use)\n");
+
+ for (i=0; i<MAX_LAG_MEMBERS; i++) {
+ p += _sprintf(p, buf, "%.2d GW id=%.3d State=0x%.3x LID=%.3d QPN=0x%.6x SL[%d] VALID[%d]\n", i,
+ login->lag_gw_neigh[i].gw_id,
+ login->lag_gw_neigh[i].info,
+ login->lag_gw_neigh[i].neigh.lid,
+ login->lag_gw_neigh[i].neigh.qpn,
+ login->lag_gw_neigh[i].neigh.sl,
+ login->lag_gw_neigh[i].neigh.valid);
+ }
+ netif_tx_unlock_bh(login->dev);
+
+out:
+ return (ssize_t)(p - buf);
+}
+
+static ssize_t vnic_login_show(struct module_attribute *attr,
+ __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+ char *p = buf, tmp_line[VNIC_SYSFS_LLEN];
+ struct vnic_sysfs_attr *vnic_dentry =
+ container_of(attr, struct vnic_sysfs_attr, dentry);
+ struct vnic_login *login = vnic_dentry->ctx;
+ struct fip_vnic_data *vnic_fip = login->fip_vnic;
+ int rc, eport_connected = test_bit(VNIC_STATE_LOGIN_CREATE_2, &vnic_fip->login_state);
+ u16 pkey_used = 0;
+ int lag_gw;
+ int ret;
+
+ ASSERT(login->dev);
+ ASSERT(login->port->dev->ca);
+
+ /* NETDEV attributes */
+ p += _sprintf(p, buf, "NETDEV_NAME %s\n", login->dev->name);
+ p += _sprintf(p, buf, "NETDEV_LINK %s\n",
+ netif_carrier_ok(login->dev) ? "up" : "down");
+ p += _sprintf(p, buf, "NETDEV_OPEN %s\n",
+ (login->dev->flags & IFF_UP) ? "yes" : "no");
+ p += _sprintf(p, buf, "NETDEV_QSTOP %s\n",
+ netif_queue_stopped(login->dev) ? "yes" : "no");
+ p += _sprintf(p, buf, "NETDEV_MTU %d/%d\n",
+ (int)login->dev->mtu,
+ (int)login->max_mtu);
+
+ /* IOA attributes */
+ p += _sprintf(p, buf, "IOA_PORT %s:%d\n",
+ login->port->dev->ca->name,
+ login->port->num);
+ p += _sprintf(p, buf, "IOA_NAME %s\n",
+ login->desc);
+ p += _sprintf(p, buf, "IOA_LID 0x%04x\n", login->port->attr.lid);
+ p += _sprintf(p, buf, "IOA_GUID "VNIC_GUID_FMT"\n",
+ VNIC_GUID_RAW_ARG(login->port->gid.raw + 8));
+ p += _sprintf(p, buf, "IOA_LOG_LINK %s\n",
+ port_phys_state_str(login->port->attr.phys_state));
+ p += _sprintf(p, buf, "IOA_PHY_LINK %s\n",
+ port_state_str(login->port->attr.state));
+ p += _sprintf(p, buf, "IOA_MTU %d\n", login->port->max_mtu_enum);
+
+
+ /* EPORT and BX attributes */
+ if (no_bxm) {
+ p += _sprintf(p, buf, "EPORT_STATE %s\n", "bridgeless");
+ } else if (vnic_fip) {
+ p += _sprintf(p, buf, "EPORT_STATE %s\n",
+ !eport_connected ? "disconnected" :
+ (fip_vnic_get_eport_state(vnic_fip) ?
+ "up" : "down"));
+ p += _sprintf(p, buf, "EPORT_NAME %s\n",
+ fip_vnic_get_eport_name(vnic_fip, tmp_line) ?
+ NOT_AVAILABLE_STRING : tmp_line);
+ p += _sprintf(p, buf, "EPORT_QPN 0x%06x\n",
+ login->gw_neigh ? login->gw_neigh->qpn : 0);
+ p += _sprintf(p, buf, "EPORT_LID 0x%04x\n",
+ login->gw_neigh ? login->gw_neigh->lid : 0);
+ p += _sprintf(p, buf, "EPORT_ID %u\n", login->gw_port_id);
+
+ p += _sprintf(p, buf, "BX_NAME %s\n",
+ fip_vnic_get_bx_name(vnic_fip, tmp_line) ?
+ NOT_AVAILABLE_STRING : tmp_line);
+ fip_vnic_get_bx_guid(vnic_fip, tmp_line);
+ if (*((u64 *)tmp_line) == 0)
+ p += _sprintf(p, buf, "BX_GUID %s\n", NOT_AVAILABLE_STRING);
+ else
+ p += _sprintf(p, buf, "BX_GUID "VNIC_GUID_FMT"\n",
+ VNIC_GUID_RAW_ARG(tmp_line));
+
+ lag_gw = fip_vnic_get_gw_type(vnic_fip);
+ if (lag_gw) {
+ p += _sprintf(p, buf, "GW_TYPE LAG\n");
+ ret = fip_vnic_get_lag_eports(vnic_fip, p);
+ p += (ret > 0) ? ret : 0;
+ } else
+ p += _sprintf(p, buf, "GW_TYPE LEGACY\n");
+
+ rc = fip_vnic_get_all_vlan_mode(vnic_fip, tmp_line);
+ p += _sprintf(p, buf, "ALL_VLAN %s\n",
+ rc < 0 ? NOT_AVAILABLE_STRING : tmp_line);
+
+ } else {
+ p += _sprintf(p, buf, "EPORT_STATE %s\n", "error");
+ }
+
+ /* misc attributes*/
+ p += _sprintf(p, buf, "SW_RSS %s\n",
+ !eport_connected ? NOT_AVAILABLE_STRING :
+ ((login->qps_num > 1) ? "yes" : "no"));
+ p += _sprintf(p, buf, "SW_RSS_SIZE %u\n", login->qps_num);
+ p += _sprintf(p, buf, "RX_RINGS_NUM %d\n", login->rx_rings_num);
+ p += _sprintf(p, buf, "RX_RINGS_LIN %s\n",
+ login->port->rx_ring[0]->log_rx_info ? "no" : "yes");
+ p += _sprintf(p, buf, "TX_RINGS_NUM %d\n", login->tx_rings_num);
+ p += _sprintf(p, buf, "TX_RINGS_ACT %d\n",
+ VNIC_TXQ_GET_ACTIVE(login));
+ p += _sprintf(p, buf, "NDO_TSS %s\n",
+ (login->ndo_tx_rings_num > 1) ? "yes" : "no");
+ p += _sprintf(p, buf, "NDO_TSS_SIZE %u\n", login->ndo_tx_rings_num);
+ p += _sprintf(p, buf, "MCAST_PROMISC %s\n",
+ !eport_connected ? NOT_AVAILABLE_STRING :
+ (is_mcast_promisc(login) ? "yes" : "no"));
+ p += _sprintf(p, buf, "UCAST_PROMISC %s\n",
+ (is_ucast_promisc(login) ? "yes" : "no"));
+ p += _sprintf(p, buf, "MCAST_MASK %d\n", login->n_mac_mcgid);
+ p += _sprintf(p, buf, "CHILD_VNICS %d/%d\n",
+ atomic_read(&login->vnic_child_cnt),
+ vnic_child_max);
+ p += _sprintf(p, buf, "PKEY 0x%04x\n", login->pkey);
+ p += _sprintf(p, buf, "PKEY_INDEX 0x%04x\n", login->pkey_index);
+ rc = ib_query_pkey(login->port->dev->ca, login->port->num,
+ login->pkey_index, &pkey_used);
+ p += _sprintf(p, buf, "PKEY_MEMBER %s\n",
+ (rc || !eport_connected) ? NOT_AVAILABLE_STRING :
+ ((pkey_used & 0x8000) ? "full" : "partial"));
+ p += _sprintf(p, buf, "SL_DATA %u\n", login->sl);
+ p += _sprintf(p, buf, "SL_CONTROL %u\n",
+ vnic_fip ? fip_vnic_get_bx_sl(vnic_fip) : 0);
+#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)
+ p += _sprintf(p, buf, "GRO %s\n",
+ login->dev->features & NETIF_F_GRO ? "yes" : "no");
+#elif defined(NETIF_F_LRO)
+ p += _sprintf(p, buf, "LRO %s\n",
+ login->dev->features & NETIF_F_LRO ? "yes" : "no");
+ p += _sprintf(p, buf, "LRO_NUM %d\n", login->lro_num);
+#endif
+ p += _sprintf(p, buf, "NAPI %s\n",
+ login->napi_num ? "yes" : "no");
+ p += _sprintf(p, buf, "NAPI_WEIGHT %u\n",
+ login->napi_num ? vnic_napi_weight : 0);
+ p += _sprintf(p, buf, "QPN 0x%x\n",
+ login->qp_base_num);
+ p += _sprintf(p, buf, "MAC "MAC_6_PRINT_FMT"\n",
+ MAC_6_PRINT_ARG(login->dev_addr));
+ p += _sprintf(p, buf, "VNIC_ID %d\n",
+ vnic_fip ? vnic_fip->vnic_id : 0);
+ p += _sprintf(p, buf, "ADMIN_MODE %s\n",
+ !vnic_fip ? NOT_AVAILABLE_STRING :
+ (vnic_fip->hadmined ? "host" : "network"));
+
+ if (vnic_fip && vnic_fip->vlan_used)
+ p += _sprintf(p, buf, "VLAN 0x%03x\n", vnic_fip->vlan);
+ else
+ p += _sprintf(p, buf, "VLAN %s\n", NOT_AVAILABLE_STRING);
+
+ if (vnic_fip && vnic_fip->shared_vnic.enabled) {
+ p += _sprintf(p, buf, "SHARED_MAC "MAC_6_PRINT_FMT"\n",
+ MAC_6_PRINT_ARG(vnic_fip->shared_vnic.emac));
+ p += _sprintf(p, buf, "SHARED_IP "IP_4_PRINT_FMT"\n",
+ IP_4_PRINT_ARG(vnic_fip->shared_vnic.ip));
+ } else {
+ p += _sprintf(p, buf, "SHARED_MAC %s\n", NOT_AVAILABLE_STRING);
+ p += _sprintf(p, buf, "SHARED_IP %s\n", NOT_AVAILABLE_STRING);
+ }
+
+ return (ssize_t)(p - buf);
+}
+
+static ssize_t vnic_qps_show(struct module_attribute *attr,
+ __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+ char *p = buf;
+ struct vnic_sysfs_attr *vnic_dentry =
+ container_of(attr, struct vnic_sysfs_attr, dentry);
+ struct vnic_login *login = vnic_dentry->ctx;
+ struct ib_qp *qp;
+ struct ib_qp_attr query_attr;
+ struct ib_qp_init_attr query_init_attr;
+ int i, mask = -1;
+
+ for (i = 0; i < login->qps_num; ++i) {
+ qp = login->qp_res[i].qp;
+ if (ib_query_qp(qp, &query_attr, mask, &query_init_attr))
+ continue;
+ p += _sprintf(p, buf, "QP_INDEX %d\n", i);
+ p += _sprintf(p, buf, "QP_NUM 0x%06x\n", qp->qp_num);
+ p += _sprintf(p, buf, "QP_QKEY 0x%08x\n", query_attr.qkey);
+ p += _sprintf(p, buf, "QP_STATE 0x%02x\n", query_attr.qp_state);
+ p += _sprintf(p, buf, "QP_RX_RING %d\n", i % login->rx_rings_num);
+ p += _sprintf(p, buf, "QP_PTR %p\n", qp);
+ p += _sprintf(p, buf, "QP_RX_SRQ_PTR %p\n", qp->srq);
+ p += _sprintf(p, buf, "QP_RX_CQ_PTR %p\n", qp->recv_cq);
+ p += _sprintf(p, buf, "QP_TX_CQ_PTR %p\n", qp->send_cq);
+ p += _sprintf(p, buf, "\n");
+ }
+
+ return (ssize_t)(p - buf);
+}
+static char* vnic_state_2str(enum fip_vnic_state state)
+{
+ switch(state) {
+ case FIP_VNIC_CLOSED: return "CLOSED";
+ case FIP_VNIC_CONNECTED: return "CONNECTED";
+ case FIP_VNIC_HADMIN_IDLE: return "HADMIN_IDLE";
+ case FIP_VNIC_LOGIN: return "LOGIN";
+ case FIP_VNIC_MCAST_INIT: return "MCAST_INIT";
+ case FIP_VNIC_MCAST_INIT_DONE: return "MCAST_INIT_DONE";
+ case FIP_VNIC_RINGS_INIT: return "RINGS_INIT";
+ case FIP_VNIC_VHUB_DONE: return "VHUB_DONE";
+ case FIP_VNIC_VHUB_INIT: return "VHUB_INIT";
+ case FIP_VNIC_VHUB_INIT_DONE: return "VHUB_INIT_DONE";
+ case FIP_VNIC_VHUB_WRITE: return "VHUB_WRITE";
+ case FIP_VNIC_WAIT_4_ACK: return "WAIT_4_ACK";
+ }
+ return "UNKNOWN";
+
+
+}
+
+int port_vnics_sysfs_show(struct vnic_port *port, char *buf)
+{
+ struct fip_gw_data *gw;
+ char *p = buf;
+ struct fip_discover *discover;
+ struct fip_vnic_data *vnic;
+
+ mutex_lock(&port->start_stop_lock);
+ list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+
+ down_read(&discover->l_rwsem);
+
+ list_for_each_entry(gw, &discover->gw_list, list) {
+ list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+ p += _sprintf(p, buf, "%-15s\t%-10s\t%10s:%d %-10s\t%.7d\t%-10s\t%s\n",
+ gw->info.vol_info.system_name,
+ gw->info.vol_info.gw_port_name,
+ gw->discover->port->dev->ca->name,
+ gw->discover->port->num,
+ vnic->name,
+ vnic->vnic_id,
+ vnic->hadmined?"HOSTADMIN":"NETADMIN",
+ vnic_state_2str(vnic->state));
+ }
+ }
+
+ up_read(&discover->l_rwsem);
+ }
+
+ mutex_unlock(&port->start_stop_lock);
+ return (p - buf);
+}
+
+
+#ifdef VNIC_PROFILLNG
+static ssize_t vnic_dentry_prof_skb_show(struct module_attribute *attr,
+ __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+ char *p = buf;
+ struct vnic_sysfs_attr *vnic_dentry =
+ container_of(attr, struct vnic_sysfs_attr, dentry);
+ struct vnic_login *login = vnic_dentry->ctx;
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < VNIC_PROFILLNG_SKB_MAX; ++i) {
+ if (!login->prof_arr[i].cnt)
+ continue;
+ skb = &login->prof_arr[i].skb;
+ p += _sprintf(p, buf, "==============\n");
+ p += _sprintf(p, buf, "SKB[%d] CNT %d\n", i, login->prof_arr[i].cnt);
+ p += _sprintf(p, buf, "len %d\n", skb->len);
+ p += _sprintf(p, buf, "data_len %d\n", skb->data_len);
+ p += _sprintf(p, buf, "head_len %d\n", skb_headlen(skb));
+ p += _sprintf(p, buf, "gso %d\n", skb_is_gso(skb));
+ p += _sprintf(p, buf, "nr_frags %d\n", login->prof_arr[i].nr_frags);
+ p += _sprintf(p, buf, "jiffies %lu\n", login->prof_arr[i].jiffies);
+ p += _sprintf(p, buf, "msecs %u\n",
+ jiffies_to_msecs(login->prof_arr[i].jiffies));
+ p += _sprintf(p, buf, "msecs_diff %u\n",
+ jiffies_to_msecs(login->prof_arr[i].jiffies) -
+ jiffies_to_msecs(login->prof_arr[i ? i -1 : 0].jiffies));
+ }
+
+ return (ssize_t)(p - buf);
+}
+
+#endif
+
+static int get_guid(u8 *guid, char *s)
+{
+ if (sscanf(s, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+ guid + 0, guid + 1, guid + 2, guid + 3, guid + 4,
+ guid + 5, guid + 6, guid + 7) != 8)
+ return -1;
+
+ return 0;
+}
+
+static int get_mac(u8 *mac, char *s)
+{
+ if (sscanf(s, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+ mac + 0, mac + 1, mac + 2, mac + 3, mac + 4,
+ mac + 5) != 6)
+ return -1;
+
+ return 0;
+}
+
+static int get_ipv4(short unsigned int *ip, char *s)
+{
+ if (sscanf(s, "%hu.%hu.%hu.%hu", ip + 0, ip + 1, ip + 2, ip + 3) != 4)
+ return -1;
+
+ return 0;
+}
+
+static int get_parent(struct vnic_port *port, char *parent)
+{
+ struct net_device *parent_netdev;
+
+ /* check parent syntax */
+ if (!dev_valid_name(parent))
+ return -EINVAL;
+
+ parent_netdev = dev_get_by_name(&init_net, parent);
+ if (parent_netdev)
+ dev_put(parent_netdev);
+
+ return parent_netdev ? 0 : -ENODATA;
+}
+
+static struct fip_hadmin_cache *get_hadmin_entry(void)
+{
+ struct fip_hadmin_cache *hadmin_entry;
+
+ hadmin_entry = kzalloc(sizeof *hadmin_entry, GFP_ATOMIC);
+ if (!hadmin_entry)
+ return NULL;
+
+ hadmin_entry->vnic_id = NOT_AVAILABLE_NUM;
+ hadmin_entry->gw_port_id = NOT_AVAILABLE_NUM;
+
+ return hadmin_entry;
+}
+
+void vnic_login_cmd_init(struct fip_hadmin_cmd *cmd)
+{
+ char *buf = (char *)cmd;
+ u8 i;
+
+ for (i = 0; i < MAX_INPUT_ARG; ++i)
+ sprintf(buf + (i * MAX_INPUT_LEN), NOT_AVAILABLE_STRING);
+}
+
+int vnic_login_cmd_set(char *buf, struct fip_hadmin_cmd *cmd)
+{
+ int count;
+
+ if (cmd) {
+ count = sprintf(buf, "name=%s mac=%s vnic_id=%s vid=%s "
+ "bxname=%s bxguid=%s eport=%s ipv4=%s ipv6=%s "
+ "emac=%s pkey=%s parent=%s\n",
+ cmd->c_name, cmd->c_mac, cmd->c_vnic_id,
+ cmd->c_vid, cmd->c_bxname, cmd->c_bxguid,
+ cmd->c_eport, cmd->c_ipv4, cmd->c_ipv6,
+ cmd->c_emac, cmd->c_pkey, cmd->c_parent);
+ vnic_dbg_sysfs((char *)(cmd->c_name), "cmd: %s", buf);
+ } else /* print the cmd syntax */
+ count = sprintf(buf, "name=%%s mac=%%s vnic_id=%%s vid=%%s "
+ "bxname=%%s bxguid=%%s eport=%%s ipv4=%%s "
+ "ipv6=%%s emac=%%s pkey=%%s parent=%%s\n");
+
+ return count;
+}
+
+/* create/destroy child vNic; syntax example:
+ * +00:11:22:33:44:55
+ */
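+/* Illustrative usage (the sysfs path is an assumption based on the module
+ * kobject and the "vnic%d-child" dentry name created in vnic_create_dentry):
+ *   echo "+00:11:22:33:44:55" > /sys/module/mlx4_vnic/vnic0-child
+ *   echo "-00:11:22:33:44:55" > /sys/module/mlx4_vnic/vnic0-child
+ */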
+static ssize_t vnic_child_write(struct module_attribute *attr,
+ __MODULE_KOBJ_TYPE *mod,
+ const char *buf, size_t count)
+{
+ struct vnic_sysfs_attr *vnic_dentry =
+ container_of(attr, struct vnic_sysfs_attr, dentry);
+ struct vnic_login *login = vnic_dentry->ctx;
+ char action = buf[0];
+ char *buf_mac = (char *)buf + 1;
+ int remove = -1;
+ u8 mac[ETH_ALEN];
+
+ if (action == '-')
+ remove = 1;
+ if (action == '+')
+ remove = 0;
+
+ if (remove < 0 || get_mac(mac, buf_mac) || !is_valid_ether_addr(mac))
+ return -EINVAL;
+
+ vnic_learn_mac(login->dev, mac, remove);
+ return count;
+}
+
+int fip_hadmin_sysfs_update(struct vnic_port *port,
+ const char *buf, int count, int remove)
+{
+ struct fip_discover *discover;
+ struct fip_hadmin_cache *hadmin_entry, *hadmin_it;
+ struct fip_hadmin_cmd *cmd;
+ char *name = NULL;
+ int rc, num;
+ u16 pkey;
+
+ hadmin_entry = get_hadmin_entry();
+ if (!hadmin_entry) {
+ rc = -ENOMEM;
+ vnic_dbg_sysfs(port->name, "get_hadmin_entry failed\n");
+ goto err;
+ }
+
+ cmd = &hadmin_entry->cmd;
+ rc = sscanf(buf, "name=%s mac=%s vnic_id=%s vid=%s bxname=%s bxguid=%s "
+ "eport=%s ipv4=%s ipv6=%s emac=%s pkey=%s parent=%s",
+ cmd->c_name, cmd->c_mac, cmd->c_vnic_id, cmd->c_vid,
+ cmd->c_bxname, cmd->c_bxguid, cmd->c_eport, cmd->c_ipv4,
+ cmd->c_ipv6, cmd->c_emac, cmd->c_pkey, cmd->c_parent);
+ if (rc != MAX_INPUT_ARG) {
+ vnic_dbg_sysfs(port->name, "sscanf failed, rc %d\n", rc);
+ rc = -EINVAL;
+ goto err;
+ } else
+ name = (char *)(cmd->c_name);
+
+ /* get parent name */
+ if (!dev_valid_name(cmd->c_parent))
+ hadmin_entry->parent_used = 0;
+ else if (remove || !get_parent(port, cmd->c_parent)) {
+ vnic_dbg_sysfs(name, "parent set %s\n", cmd->c_parent);
+ strncpy(hadmin_entry->parent_name, cmd->c_parent,
+ sizeof(hadmin_entry->parent_name));
+ hadmin_entry->parent_used = 1;
+ } else {
+ vnic_warn(name, "invalid parent name %s\n", cmd->c_parent);
+ rc = -EINVAL;
+ goto err;
+ }
+
+ /* get vNic ID dec (must) */
+ if (sscanf(cmd->c_vnic_id, "%d", &num) != 1) {
+ /* abort on failure */
+ vnic_warn(name, "invalid vNic ID %s\n", cmd->c_vnic_id);
+ rc = -EINVAL;
+ goto err;
+ }
+ hadmin_entry->vnic_id = (u16)num;
+
+ /* get vNic MAC (must) */
+ if (get_mac(hadmin_entry->mac, cmd->c_mac)) {
+ vnic_warn(name, "invalid vNic MAC %s\n", cmd->c_vnic_id);
+ rc = -EINVAL;
+ goto err;
+ }
+
+ /* get interface name (must) */
+ if ((!dev_valid_name(cmd->c_name) && !hadmin_entry->parent_used) ||
+ ((strlen(cmd->c_name) > VNIC_NAME_LEN) && hadmin_entry->parent_used)) {
+ vnic_warn(name, "invalid vNic name %s\n", cmd->c_name);
+ rc = -EINVAL;
+ goto err;
+ }
+
+ strncpy(hadmin_entry->interface_name, cmd->c_name,
+ sizeof(hadmin_entry->interface_name));
+
+ /* get BX GUID, if fails, get BX NAME */
+ if (get_guid(hadmin_entry->system_guid, cmd->c_bxguid)) {
+ strncpy(hadmin_entry->system_name, cmd->c_bxname,
+ sizeof(hadmin_entry->system_name));
+ vnic_dbg_sysfs(name, "use BX NAME %s\n", cmd->c_bxname);
+ }
+
+ /* get shared emac/ip */
+ if (!get_ipv4((short unsigned int *)hadmin_entry->shared_vnic_ip,
+ cmd->c_ipv4)) {
+ /* TODO, add IPv6 support for shared vNic */
+ get_mac(hadmin_entry->shared_vnic_mac, cmd->c_emac);
+ vnic_dbg_sysfs(name, "use shared ip/mac\n");
+ }
+
+#ifndef VLAN_GROUP_ARRAY_LEN
+#define VLAN_GROUP_ARRAY_LEN VLAN_N_VID
+#endif
+
+ /* get VLAN field (dec) */
+ if ((sscanf(cmd->c_vid, "%d", &num) == 1) &&
+ num < VLAN_GROUP_ARRAY_LEN && num >= 0) {
+ /* set other fields on success, skip on failure */
+ vnic_dbg_sysfs(name, "vlan set 0x%x\n", hadmin_entry->vlan);
+ hadmin_entry->vlan_used = 1;
+ hadmin_entry->vlan = (u16)num;
+ } else if (!strcmp(cmd->c_vid, ALL_VLAN_GW_VID)) {
+ /* Don't set 'vlan_used'. The code counts on it being unset for
+ * host admin vnics in all_vlan mode when VLANs are used */
+ hadmin_entry->vlan = 0;
+ hadmin_entry->all_vlan_gw = 1;
+ }
+
+ /* get eport name */
+ if (!strlen(cmd->c_eport)) {
+ vnic_warn(name, "invalid eport name %s\n", cmd->c_eport);
+ rc = -EINVAL;
+ goto err;
+ }
+ strncpy(hadmin_entry->eport_name, cmd->c_eport,
+ sizeof(hadmin_entry->eport_name));
+
+ /* set remove/add flag */
+ vnic_dbg_sysfs(name, "%s hadmin vNic\n", remove ? "remove" : "add");
+ hadmin_entry->remove = remove;
+
+ /* set pkey (hex) */
+ if ((sscanf(cmd->c_pkey, "%x", &num) != 1) || !num)
+ pkey = 0xffff; /* default */
+ else
+ pkey = (u16)num | 0x8000;
+ vnic_dbg_sysfs(name, "pkey 0x%x\n", pkey);
+
+ /* cannot sleep in this function for the child vnics flow
+ * (avoid a scheduling-while-atomic oops)
+ * TODO: check whether holding start_stop_lock is needed here
+ */
+ //mutex_lock(&port->start_stop_lock);
+
+ list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+ if (discover->pkey == pkey) {
+ spin_lock_irq(&discover->lock);
+
+ if (discover->flush != FIP_NO_FLUSH) {
+ rc = -EBUSY;
+ spin_unlock_irq(&discover->lock);
+ goto skip;
+ }
+
+ /* check that this mac/vlan is not in the cache list
+ * (saves redundant queue_delayed_work call during
+ * vnic_learn_mac bursts)
+ */
+ list_for_each_entry_reverse(hadmin_it, &discover->hadmin_cache, next) {
+ if (!memcmp(hadmin_entry->mac, hadmin_it->mac, ETH_ALEN) &&
+ hadmin_entry->vlan == hadmin_it->vlan &&
+ hadmin_entry->remove == hadmin_it->remove) {
+ rc = -EEXIST;
+ spin_unlock_irq(&discover->lock);
+ goto skip;
+ }
+ }
+ list_add_tail(&hadmin_entry->next, &discover->hadmin_cache);
+ /* calls fip_discover_hadmin_update() */
+ queue_delayed_work(fip_wq, &discover->hadmin_update_task, HZ/10);
+ spin_unlock_irq(&discover->lock);
+ goto updated_discover;
+ }
+ }
+
+ //mutex_unlock(&port->start_stop_lock);
+ vnic_dbg_sysfs(name, "Requested PKEY=0x%x is not configured\n", pkey);
+ goto skip;
+
+err:
+ vnic_dbg_sysfs(name, "Invalid host admin request format string. Request rejected\n");
+skip:
+ kfree(hadmin_entry);
+ return rc;
+
+updated_discover:
+ //mutex_unlock(&port->start_stop_lock);
+ return count;
+}
+
+static ssize_t vnic_login_cmd(struct module_attribute *attr,
+ __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+ char *p = buf;
+ struct vnic_sysfs_attr *vnic_dentry =
+ container_of(attr, struct vnic_sysfs_attr, dentry);
+ struct fip_vnic_data *vnic_fip = vnic_dentry->ctx;
+ struct fip_hadmin_cmd *cmd;
+
+ if (!vnic_fip || !vnic_fip->hadmined)
+ goto out;
+
+ cmd = &vnic_fip->cmd;
+ p += _sprintf(p, buf, "name=%s mac=%s vnic_id=%s vid=%s bxname=%s bxguid=%s "
+ "eport=%s ipv4=%s ipv6=%s emac=%s pkey=%s parent=%s ",
+ cmd->c_name, cmd->c_mac, cmd->c_vnic_id, cmd->c_vid,
+ cmd->c_bxname, cmd->c_bxguid, cmd->c_eport, cmd->c_ipv4,
+ cmd->c_ipv6, cmd->c_emac, cmd->c_pkey, cmd->c_parent);
+ p += _sprintf(p, buf, "ib_port=%s", vnic_fip->port->name);
+ p += _sprintf(p, buf, "\n");
+
+out:
+ return (ssize_t)(p - buf);
+}
+
+int vnic_create_hadmin_dentry(struct fip_vnic_data *vnic)
+{
+ char name[VNIC_SYSFS_FLEN];
+
+ DENTRY_CREATE(vnic, &vnic->dentry,
+ vnic_dentry_name(name, vnic, "cmd"),
+ vnic_login_cmd, NULL);
+ return 0;
+}
+
+void vnic_delete_hadmin_dentry(struct fip_vnic_data *vnic)
+{
+ if (vnic->dentry.ctx)
+ DENTRY_REMOVE(&vnic->dentry);
+}
+
+int vnic_create_dentry(struct vnic_login *login)
+{
+ int i = 0;
+ char name[VNIC_SYSFS_FLEN];
+
+ DENTRY_CREATE(login, &login->dentries[i++],
+ login_dentry_name(name, login, "info"),
+ vnic_login_show, NULL);
+ DENTRY_CREATE(login, &login->dentries[i++],
+ login_dentry_name(name, login, "child"),
+ NULL, vnic_child_write);
+ DENTRY_CREATE(login, &login->dentries[i++],
+ login_dentry_name(name, login, "neigh"),
+ vnic_neigh_show, NULL);
+ DENTRY_CREATE(login, &login->dentries[i++],
+ login_dentry_name(name, login, "qps"),
+ vnic_qps_show, NULL);
+ DENTRY_CREATE(login, &login->dentries[i++],
+ login_dentry_name(name, login, "member"),
+ vnic_member_show, NULL);
+
+#ifdef VNIC_PROFILLNG
+ DENTRY_CREATE(login, &login->dentries[i++],
+ login_dentry_name(name, login, "prof_skb"),
+ vnic_dentry_prof_skb_show, NULL);
+#endif
+ return 0;
+}
+
+void vnic_delete_dentry(struct vnic_login *login)
+{
+ int i;
+
+ for (i = 0; i < VNIC_MAX_DENTRIES; ++i) {
+ if (login->dentries[i].ctx)
+ DENTRY_REMOVE(&login->dentries[i]);
+ }
+}
+
+static ssize_t port_gw_fs_show(struct module_attribute *attr,
+ __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+ struct vnic_sysfs_attr *vnic_dentry =
+ container_of(attr, struct vnic_sysfs_attr, dentry);
+ struct vnic_port *port = vnic_dentry->ctx;
+
+ return fip_gw_sysfs_show(port, buf);
+}
+
+
+static ssize_t port_vnics_fs_show(struct module_attribute *attr,
+ __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+ struct vnic_sysfs_attr *vnic_dentry =
+ container_of(attr, struct vnic_sysfs_attr, dentry);
+ struct vnic_port *port = vnic_dentry->ctx;
+ return port_vnics_sysfs_show(port, buf);
+}
+
+static ssize_t port_hadmin_syntax(struct module_attribute *attr,
+ __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+ /* print cmd syntax only (for usage) */
+ return vnic_login_cmd_set(buf, NULL);
+}
+
+static ssize_t port_hadmin_add_write(struct module_attribute *attr,
+ __MODULE_KOBJ_TYPE *mod,
+ const char *buf, size_t count)
+{
+ struct vnic_sysfs_attr *vnic_dentry =
+ container_of(attr, struct vnic_sysfs_attr, dentry);
+ struct vnic_port *port = vnic_dentry->ctx;
+
+ return fip_hadmin_sysfs_update(port, buf, count, 0);
+}
+
+static ssize_t port_hadmin_del_write(struct module_attribute *attr,
+ __MODULE_KOBJ_TYPE *mod,
+ const char *buf, size_t count)
+{
+ struct vnic_sysfs_attr *vnic_dentry =
+ container_of(attr, struct vnic_sysfs_attr, dentry);
+ struct vnic_port *port = vnic_dentry->ctx;
+
+ return fip_hadmin_sysfs_update(port, buf, count, 1);
+}
+
+int port_fs_init(struct vnic_port *port)
+{
+ int i = 0;
+ char name[VNIC_SYSFS_FLEN];
+
+ DENTRY_CREATE(port, &port->dentries[i++],
+ port_dentry_name(name, port, "host_add"),
+ port_hadmin_syntax, port_hadmin_add_write);
+
+ DENTRY_CREATE(port, &port->dentries[i++],
+ port_dentry_name(name, port, "host_del"),
+ port_hadmin_syntax, port_hadmin_del_write);
+
+ DENTRY_CREATE(port, &port->dentries[i++],
+ port_dentry_name(name, port, "gws"),
+ port_gw_fs_show, NULL);
+
+ DENTRY_CREATE(port, &port->dentries[i++],
+ port_dentry_name(name, port, "vnics"),
+ port_vnics_fs_show, NULL);
+ return 0;
+}
+
+void port_fs_exit(struct vnic_port *port)
+{
+ int i;
+
+ for (i = 0; i < VNIC_MAX_DENTRIES; ++i) {
+ if (port->dentries[i].ctx)
+ DENTRY_REMOVE(&port->dentries[i]);
+ }
+}
+
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+#include <rdma/ib_cache.h>
+#include <net/ip6_checksum.h>
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+int vnic_post_recv(struct vnic_rx_ring *ring, u64 wr_id)
+{
+ struct ib_recv_wr *bad_wr;
+ int i, rc;
+
+ ring->wr.wr_id = wr_id;
+
+ for (i = 0; i < ring->num_frags; i++)
+ ring->sge[i].addr = ring->rx_info[wr_id].dma_addr[i];
+
+ rc = ib_post_srq_recv(ring->srq, &ring->wr, &bad_wr);
+ if (unlikely(rc)) {
+ /* we do not take a lock here. In the worst case we will have
+ * a stale value of need_refill, which is harmless
+ */
+
+ /*ring->rx_info[wr_id].info = VNIC_FRAG_NOT_POSTED;
+ ring->need_refill = 1;
+ */
+ vnic_dbg_data(ring->port->name, "receive failed for buf %llu (%d)\n",
+ wr_id, rc);
+ }
+
+ return rc;
+}
+
+static void vnic_dealloc_tx_skb(struct vnic_login *login, unsigned cq_index,
+ u64 wr_id)
+{
+ struct vnic_tx_res *tx_res = &login->tx_res[cq_index];
+ int is_inline = !!(wr_id & VNIC_SEND_INLINE_FLAG);
+ struct sk_buff *skb;
+ u64 *mapping;
+ int i, off = 0;
+
+ wr_id &= ~VNIC_SEND_INLINE_FLAG;
+ skb = tx_res->tx_ring[wr_id].skb;
+ ASSERT(skb);
+ mapping = tx_res->tx_ring[wr_id].mapping;
+
+ if (!is_inline) {
+ if (!vnic_encap_headroom && !skb_is_gso(skb)) {
+ ib_dma_unmap_single(login->port->dev->ca, mapping[off],
+ VNIC_ENCAP_LEN, DMA_TO_DEVICE);
+ off++;
+ }
+ if (skb_headlen(skb)) {
+ ib_dma_unmap_single(login->port->dev->ca, mapping[off],
+ skb_headlen(skb), DMA_TO_DEVICE);
+ off++;
+ }
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ ib_dma_unmap_page(login->port->dev->ca,
+ mapping[i + off], frag->size,
+ DMA_TO_DEVICE);
+ }
+ }
+
+ /* dealloc skb */
+ dev_kfree_skb_any(skb);
+ tx_res->tx_ring[wr_id].skb = NULL;
+}
+
+static void vnic_ib_handle_tx_wc(struct vnic_login *login,
+ int tx_res_index, struct ib_wc *wc)
+{
+ struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+ u64 wr_id = wc->wr_id & ~VNIC_SEND_INLINE_FLAG;
+
+ vnic_dbg_data(login->name, "send completion: wr_id %llu, status: %d "
+ "[head %d - tail %d]\n", wr_id, wc->status,
+ tx_res->tx_head, tx_res->tx_tail);
+
+ ASSERT(wr_id < vnic_tx_rings_len);
+ vnic_dealloc_tx_skb(login, tx_res_index, wc->wr_id);
+
+ ++tx_res->tx_tail;
+ --tx_res->tx_outstanding;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)) {
+ vnic_warn(login->name, "failed send event "
+ "(status %d, wr_id %llu, vend_err 0x%x)\n",
+ wc->status, wr_id, wc->vendor_err);
+ vnic_warn(login->name, "TX CQE error, queueing rings restart\n");
+ if (!login->queue_stopped)
+ queue_delayed_work(login_wq, &login->restart_task, HZ / 100);
+ }
+}
+
+int vnic_post_recvs(struct vnic_rx_ring *ring)
+{
+ int i, rc;
+
+ for (i = 0; i < ring->size; i++) {
+ rc = vnic_post_recv(ring, i);
+ if (rc) {
+ vnic_err(ring->port->name, "Failed post receive %d\n", rc);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+static int vnic_vlan_is_valid(struct vnic_login *login,
+ struct vlan_ethhdr *veth)
+{
+ ASSERT(veth->h_vlan_proto == htons(ETH_P_8021Q));
+ if ((be16_to_cpu(veth->h_vlan_TCI) & 0xfff) !=
+ be16_to_cpu(login->vid)) {
+ vnic_dbg_data(login->name, "invalid vlan, ingress vid "
+ "0x%x, login: vid 0x%x vlan_used %d\n",
+ be16_to_cpu(veth->h_vlan_TCI),
+ be16_to_cpu(login->vid),
+ login->vlan_used);
+ return 0;
+ }
+
+ return 1;
+}
+
+/* If a vlan tag should exist in the eth_hdr, validate it.
+ * is_vlan_proto is set if a vlan protocol is present in the eth header.
+ * Return values: 0 on success, 1 on error.
+ * For an all-vlans gateway (promisc vlan):
+ *  0 - there is no vlan, or there is a vlan and it is valid
+ *  1 - a vlan is present and it is not valid
+ * For all other vlans:
+ *  0 - there shouldn't be a vlan, or a vlan is present and valid
+ *  1 - a vlan should be present and it is not, or it is not valid */
+static int validate_vnic_vlan(struct vnic_login *login,
+ struct vlan_ethhdr *veth,
+ int *is_vlan_proto)
+{
+ int is_vlan = !!(veth->h_vlan_proto == htons(ETH_P_8021Q));
+
+ *is_vlan_proto = is_vlan;
+
+ if (login->all_vlan_gw)
+ return 0;
+
+ if (VNIC_VLAN_ENABLED(login) && login->vid && !is_vlan) {
+ vnic_dbg_data(login->name, "missing vlan tag\n");
+ VNIC_STATS_INC(login->port_stats.vlan_err);
+ return 1;
+ }
+
+ if (is_vlan && unlikely(!vnic_vlan_is_valid(login, veth))) {
+ vnic_dbg_data(login->name, "invalid vlan tag\n");
+ VNIC_STATS_INC(login->port_stats.vlan_err);
+ return 1;
+ }
+
+ return 0;
+}
+
+static void vnic_ib_handle_rx_wc_linear(struct vnic_login *login,
+ struct ib_wc *wc, int rx_ring_index)
+{
+ struct vnic_rx_ring *ring = login->port->rx_ring[rx_ring_index];
+ struct eoibhdr *eoib_hdr;
+ struct sk_buff *skb;
+ struct vlan_ethhdr *veth;
+ int rc, wr_id = wc->wr_id, checksum_ok, ip_summed,
+ buf_size = VNIC_BUF_SIZE(ring->port);
+ int is_vlan_proto;
+ u64 mapping;
+ u16 eth_type;
+ u8 *va, *eth_hdr;
+
+ spin_lock_bh(&ring->lock);
+ ASSERT(wr_id < ring->size);
+
+ skb = ring->rx_info[wr_id].skb;
+ mapping = ring->rx_info[wr_id].dma_addr[0];
+
+ /* termination with error */
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ if(wc->status != IB_WC_REM_ABORT_ERR &&
+ wc->status != IB_WC_LOC_LEN_ERR) {
+ vnic_dbg_data(login->name, "RX CQE error "
+ "(status %d, vend_err 0x%x), "
+ "queueing rings restart\n",
+ wc->status, wc->vendor_err);
+ if (!login->queue_stopped)
+ queue_delayed_work(login_wq,
+ &login->restart_task,
+ HZ / 10);
+ }
+ goto repost;
+ }
+
+ ASSERT(skb);
+ ASSERT(mapping);
+
+ /* If we can't allocate a new RX buffer, dump
+ * this packet and reuse the old buffer.
+ */
+ if (unlikely(!vnic_alloc_rx_skb(ring, wr_id, GFP_ATOMIC))) {
+ VNIC_STATS_DO_INC(login->stats.rx_dropped);
+ goto repost;
+ }
+
+ ib_dma_unmap_single(login->port->dev->ca, mapping,
+ buf_size, DMA_FROM_DEVICE);
+ skb_put(skb, wc->byte_len);
+ skb_pull(skb, IB_GRH_BYTES);
+
+ /* check EoIB header signature and version */
+ va = skb->data;
+ eoib_hdr = (struct eoibhdr *)va;
+ if (unlikely(VNIC_EOIB_HDR_GET_SIG(eoib_hdr) != VNIC_EOIB_HDR_SIG ||
+ VNIC_EOIB_HDR_GET_VER(eoib_hdr) != VNIC_EOIB_HDR_VER)) {
+ vnic_dbg_data(login->name, "bad sig (0x%x) or ver (0x%x)\n",
+ VNIC_EOIB_HDR_GET_SIG(eoib_hdr),
+ VNIC_EOIB_HDR_GET_VER(eoib_hdr));
+ VNIC_STATS_INC(login->port_stats.sig_ver_err);
+ goto repost;
+ }
+
+ /* check EoIB CSUM */
+ checksum_ok = login->rx_csum && VNIC_CSUM_OK(eoib_hdr);
+ ip_summed = checksum_ok ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ if (likely((checksum_ok)))
+ VNIC_STATS_INC(login->port_stats.rx_chksum_good);
+ else
+ VNIC_STATS_INC(login->port_stats.rx_chksum_none);
+
+ /* Ethernet header */
+ skb_pull(skb, VNIC_ENCAP_LEN);
+ va += VNIC_ENCAP_LEN;
+ veth = (struct vlan_ethhdr *)(va);
+
+ eth_hdr = va;
+ eth_type = be16_to_cpu(((struct ethhdr *)(va))->h_proto);
+
+ /* validate VLAN tag, strip it if valid */
+ if (validate_vnic_vlan(login, veth, &is_vlan_proto))
+ goto repost;
+
+ /* for all_vlan_gw - don't strip the VLAN tag, deliver the packet as is */
+ if (!login->all_vlan_gw && is_vlan_proto) {
+ eth_type = be16_to_cpu(veth->h_vlan_encapsulated_proto);
+ eth_hdr += VLAN_HLEN;
+ skb_pull(skb, VLAN_HLEN);
+ memmove(eth_hdr, va, ETH_ALEN * 2);
+ }
+
+ /* update skb fields, keep this before LRO/GRO funcs */
+ skb->dev = login->dev;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ skb->ip_summed = ip_summed;
+
+#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)
+ if ((login->dev->features & NETIF_F_GRO) && checksum_ok) {
+ struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index];
+ int ret;
+
+ ret = napi_gro_receive(&rx_res->napi, skb);
+ if (ret == GRO_HELD)
+ VNIC_STATS_INC(login->port_stats.gro_held);
+ else if (ret == GRO_NORMAL)
+ VNIC_STATS_INC(login->port_stats.gro_normal);
+ else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE)
+ VNIC_STATS_INC(login->port_stats.gro_merged);
+ else
+ VNIC_STATS_INC(login->port_stats.gro_drop);
+
+ goto rx_repost;
+ }
+#elif defined(NETIF_F_LRO)
+ if (login->dev->features & NETIF_F_LRO && checksum_ok) {
+ struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index];
+
+ /* processed for LRO */
+ lro_receive_skb(&rx_res->lro, skb, NULL);
+ VNIC_STATS_INC(login->port_stats.lro_aggregated);
+
+ goto rx_repost;
+ }
+#endif
+
+ rc = vnic_rx(login, skb, wc);
+ if (unlikely(rc)) {
+ vnic_dbg_data(login->name, "vnic_rx failed, rc %d\n", rc);
+ goto repost;
+ }
+
+rx_repost:
+ VNIC_STATS_INC(ring->stats.rx_packets);
+ VNIC_STATS_ADD(ring->stats.rx_bytes, wc->byte_len);
+
+ VNIC_STATS_DO_INC(login->stats.rx_packets);
+ VNIC_STATS_DO_ADD(login->stats.rx_bytes, wc->byte_len);
+
+ if (unlikely(vnic_post_recv(ring, wr_id)))
+ vnic_dbg_data(login->name, "failed to post RX WQE id %d\n",
+ (int)wr_id);
+ spin_unlock_bh(&ring->lock);
+
+ return;
+
+repost:
+ login->dev->last_rx = jiffies;
+ if (unlikely(vnic_post_recv(ring, wr_id)))
+ vnic_dbg_data(login->name, "failed to post RX WQE id %d\n",
+ (int)wr_id);
+
+ VNIC_STATS_INC(ring->stats.rx_dropped);
+ VNIC_STATS_DO_INC(login->stats.rx_dropped);
+ spin_unlock_bh(&ring->lock);
+
+ return;
+}
+
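+/* Fragmented RX completion handler (default path): the packet resides in the
+ * page fragments attached to the RX descriptor. Only the headers are synced
+ * for the CPU and validated; the fragments are then handed to GRO/LRO as
+ * frags, or passed to vnic_rx_skb().
+ */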
+static void vnic_ib_handle_rx_wc(struct vnic_login *login,
+ struct ib_wc *wc, int rx_ring_index)
+{
+ struct vnic_rx_ring *ring = login->port->rx_ring[rx_ring_index];
+ struct ib_device *ib_device = login->port->dev->ca;
+ struct vnic_frag_data *frags_entry;
+ struct skb_frag_struct frags[VNIC_MAX_RX_FRAGS] = {};
+ struct eoibhdr *eoib_hdr;
+ struct vlan_ethhdr *veth;
+ struct iphdr *ip_hdr;
+ u64 wr_id = wc->wr_id;
+ u16 eth_type;
+ u8 *va, *eth_hdr, ip_type;
+ int rc, checksum_ok, ip_offset = ETH_HLEN,
+ packet_length = wc->byte_len - VNIC_EOIB_HDR_SIZE,
+ page_offset = VNIC_EOIB_HDR_SIZE, ip_summed;
+ int is_vlan_proto;
+
+ spin_lock_bh(&ring->lock);
+ ASSERT(wr_id < ring->size);
+
+ /* termination with error */
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ if (wc->status != IB_WC_REM_ABORT_ERR &&
+ wc->status != IB_WC_LOC_LEN_ERR) {
+ vnic_dbg_data(login->name, "RX CQE error "
+ "(status %d, vend_err 0x%x), "
+ "queueing rings restart\n",
+ wc->status, wc->vendor_err);
+ if (!login->queue_stopped)
+ queue_delayed_work(login_wq, &login->restart_task, HZ / 10);
+ goto out;
+ }
+ goto drop_repost;
+ }
+
+ frags_entry = &ring->rx_info[wr_id];
+
+ /* ensure cache coherency for packet headers and get the header va */
+ ib_dma_sync_single_for_cpu(ib_device,
+ ring->rx_info[wr_id].dma_addr[0] + IB_GRH_BYTES,
+ MAX_HEADER_SIZE, DMA_FROM_DEVICE);
+
+ va = page_address(ring->rx_info[wr_id].frags[0].page.p) +
+ ring->rx_info[wr_id].frags[0].page_offset + IB_GRH_BYTES;
+
+ /* check EoIB header signature and version */
+ eoib_hdr = (struct eoibhdr *)va;
+ if (unlikely(VNIC_EOIB_HDR_GET_SIG(eoib_hdr) != VNIC_EOIB_HDR_SIG ||
+ VNIC_EOIB_HDR_GET_VER(eoib_hdr) != VNIC_EOIB_HDR_VER)) {
+ vnic_dbg_data(login->name, "bad sig (0x%x) or ver (0x%x)\n",
+ VNIC_EOIB_HDR_GET_SIG(eoib_hdr),
+ VNIC_EOIB_HDR_GET_VER(eoib_hdr));
+ VNIC_STATS_INC(login->port_stats.sig_ver_err);
+ goto unmap_repost;
+ }
+
+ /* check EoIB CSUM */
+ checksum_ok = login->rx_csum && VNIC_CSUM_OK(eoib_hdr);
+ ip_summed = checksum_ok ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+ if (likely((checksum_ok)))
+ VNIC_STATS_INC(login->port_stats.rx_chksum_good);
+ else
+ VNIC_STATS_INC(login->port_stats.rx_chksum_none);
+
+ /* Ethernet header */
+ va += VNIC_ENCAP_LEN;
+ veth = (struct vlan_ethhdr *)(va);
+
+ eth_hdr = va;
+ eth_type = be16_to_cpu(((struct ethhdr *)(va))->h_proto);
+
+ /* validate VLAN tag, strip it if valid
+ * - if VID is set and !0, then VLAN tag must exist
+ * note: VID zero can accept untagged packets
+ * - if ingress VID exists: validate it, and update the packet
+ * note: rx user prio is ignored
+ * - else: it's a valid untagged packet
+ */
+ if (validate_vnic_vlan(login, veth, &is_vlan_proto))
+ goto unmap_repost;
+
+ /* for all_vlan_gw - don't strip the VLAN tag, deliver the packet as is */
+ if (!login->all_vlan_gw && is_vlan_proto) {
+ ip_offset += VLAN_HLEN;
+ page_offset += VLAN_HLEN;
+ packet_length -= VLAN_HLEN;
+ eth_hdr += VLAN_HLEN;
+ eth_type = be16_to_cpu(veth->h_vlan_encapsulated_proto);
+ memmove(eth_hdr, va, ETH_ALEN * 2);
+ }
+
+ /* IP header */
+ va += ip_offset;
+ ip_hdr = (struct iphdr *)va;
+ ip_type = ip_hdr->protocol;
+
+ ib_dma_sync_single_for_device(ib_device,
+ frags_entry->dma_addr[0] + IB_GRH_BYTES,
+ MAX_HEADER_SIZE, DMA_FROM_DEVICE);
+
+#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)
+ if ((login->dev->features & NETIF_F_GRO) && checksum_ok) {
+ struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index];
+ struct sk_buff *gro_skb;
+ struct skb_frag_struct *gro_frags;
+ int nr_frags, ret;
+
+ gro_skb = napi_get_frags(&rx_res->napi);
+ if (!gro_skb)
+ goto drop_repost;
+
+ gro_frags = skb_shinfo(gro_skb)->frags;
+ nr_frags = vnic_unmap_and_replace_rx(ring, ib_device,
+ gro_frags, wr_id,
+ wc->byte_len);
+ if (unlikely(!nr_frags))
+ goto drop_repost;
+
+ /* disregard GRH and eoib headers */
+ gro_frags[0].page_offset += page_offset;
+ gro_frags[0].size -= page_offset;
+
+ skb_shinfo(gro_skb)->nr_frags = nr_frags;
+ gro_skb->len = packet_length;
+ gro_skb->data_len = packet_length;
+ gro_skb->truesize += packet_length;
+ gro_skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ /* processed for GRO */
+ skb_record_rx_queue(gro_skb, rx_res->index);
+ ret = napi_gro_frags(&rx_res->napi);
+ if (ret == GRO_HELD)
+ VNIC_STATS_INC(login->port_stats.gro_held);
+ else if (ret == GRO_NORMAL)
+ VNIC_STATS_INC(login->port_stats.gro_normal);
+ else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE)
+ VNIC_STATS_INC(login->port_stats.gro_merged);
+ else
+ VNIC_STATS_INC(login->port_stats.gro_drop);
+
+ goto rx_repost;
+ }
+#elif defined(NETIF_F_LRO)
+ if (login->dev->features & NETIF_F_LRO && checksum_ok &&
+ eth_type == ETH_P_IP && ip_type == IPPROTO_TCP) {
+ struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index];
+ int nr_frags;
+
+ /* unmap the needed fragment and reallocate them.
+ * Fragments that were not used will be reused as is.*/
+ nr_frags = vnic_unmap_and_replace_rx(ring, ib_device, frags,
+ wr_id, wc->byte_len);
+ if (unlikely(!nr_frags))
+ goto drop_repost;
+
+ /* disregard GRH and eoib headers */
+ frags[0].page_offset += page_offset;
+ frags[0].size -= page_offset;
+
+ /* processed for LRO */
+#if defined(CONFIG_COMPAT_LRO_ENABLED)
+ lro_receive_frags(&rx_res->lro, frags, packet_length,
+ packet_length, NULL, 0);
+#endif
+ VNIC_STATS_INC(login->port_stats.lro_aggregated);
+
+ goto rx_repost;
+ }
+#endif
+
+ rc = vnic_rx_skb(login, ring, wc, ip_summed, eth_hdr);
+ if (unlikely(rc)) {
+ vnic_dbg_data(login->name, "vnic_rx_skb failed, rc %d\n", rc);
+ goto drop_repost;
+ }
+
+rx_repost:
+ /* must hold lock when touching login->stats so the stats
+ * task won't read invalid values
+ */
+ spin_lock(&login->stats_lock);
+ VNIC_STATS_INC(ring->stats.rx_packets);
+ VNIC_STATS_ADD(ring->stats.rx_bytes, packet_length);
+
+ VNIC_STATS_DO_INC(login->stats.rx_packets);
+ VNIC_STATS_DO_ADD(login->stats.rx_bytes, packet_length);
+ spin_unlock(&login->stats_lock);
+
+ login->dev->last_rx = jiffies;
+ if (vnic_post_recv(ring, wr_id))
+ vnic_dbg_data(login->name, "vnic_post_recv failed, "
+ "wr_id %llu\n", wr_id);
+ spin_unlock_bh(&ring->lock);
+
+ return;
+
+unmap_repost:
+ /* ignore rc of vnic_unmap_and_replace_rx() */
+ vnic_unmap_and_replace_rx(ring, ib_device, frags,
+ wr_id, wc->byte_len);
+drop_repost:
+ VNIC_STATS_INC(ring->stats.rx_dropped);
+
+ spin_lock(&login->stats_lock);
+ VNIC_STATS_DO_INC(login->stats.rx_dropped);
+ spin_unlock(&login->stats_lock);
+
+ if (vnic_post_recv(ring, wr_id))
+ vnic_dbg_data(login->name, "vnic_post_recv failed, "
+ "wr_id %llu\n", wr_id);
+out:
+ spin_unlock_bh(&ring->lock);
+ return;
+}
+
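+/* Poll the TX CQ in batches of VNIC_MAX_TX_CQE until it is empty, handling
+ * each completion with vnic_ib_handle_tx_wc().
+ */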
+static inline void vnic_drain_tx_cq(struct vnic_login *login,
+ int tx_res_index)
+{
+ struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+ int n, i;
+
+ do {
+ n = ib_poll_cq(tx_res->cq, VNIC_MAX_TX_CQE, tx_res->send_wc);
+ for (i = 0; i < n; ++i)
+ vnic_ib_handle_tx_wc(login, tx_res_index,
+ tx_res->send_wc + i);
+ } while (n == VNIC_MAX_TX_CQE);
+}
+
+static void vnic_drain_arm_tx_cq(struct vnic_login *login, int tx_res_index)
+{
+ struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+
+ ASSERT(login);
+ ASSERT(login->dev);
+
+ /* drain the CQ, then possibly arm it */
+ vnic_drain_tx_cq(login, tx_res_index);
+
+ /* in tx interrupt mode, arm TX CQ after every interrupt */
+ if (!vnic_tx_polling && ib_req_notify_cq(tx_res->cq, IB_CQ_NEXT_COMP))
+ vnic_dbg(login->name, "ib_req_notify_cq failed\n");
+ else if (unlikely(VNIC_TXQ_STOPPED(tx_res) &&
+ test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state))) {
+ if ((tx_res->tx_outstanding <= vnic_tx_rings_len >> 1)) {
+ if (!test_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state)) {
+ VNIC_STATS_DO_INC(login->port_stats.wake_queue);
+ VNIC_TXQ_WAKE(tx_res);
+ }
+ /* make sure that after arming the cq, there is no access to
+ * login fields to avoid conflict with cq event handler.
+ * i.e., ib_req_notify_cq() must come at the end of this func
+ */
+ } else if (ib_req_notify_cq(tx_res->cq, IB_CQ_NEXT_COMP)) {
+ vnic_dbg(login->name, "ib_req_notify_cq failed\n");
+ /* TODO: have to reset the device here */
+ }
+ }
+}
+
+static inline void vnic_comp_handler_tx(struct ib_cq *cq, void *ctx)
+{
+ struct vnic_tx_res *tx_res = ctx;
+
+ if (!vnic_tx_polling) {
+ spin_lock(&tx_res->lock);
+ vnic_drain_arm_tx_cq(tx_res->login, tx_res->index);
+ spin_unlock(&tx_res->lock);
+ } else
+ vnic_drain_arm_tx_cq(tx_res->login, tx_res->index);
+
+}
+
+static int vnic_drain_rx_cq(struct vnic_login *login, int max_poll,
+ int rx_res_index)
+{
+ struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index];
+ int polled, i;
+
+ ASSERT(max_poll <= vnic_napi_weight);
+ polled = ib_poll_cq(rx_res->cq, max_poll, rx_res->recv_wc);
+
+ for (i = 0; vnic_rx_linear && i < polled; ++i)
+ vnic_ib_handle_rx_wc_linear(login, &rx_res->recv_wc[i],
+ rx_res_index);
+
+ for (i = 0; !vnic_rx_linear && i < polled; ++i)
+ vnic_ib_handle_rx_wc(login, &rx_res->recv_wc[i],
+ rx_res_index);
+
+#ifdef NETIF_F_LRO
+ /* Done CQ handling: flush all LRO sessions unconditionally */
+ if (login->dev->features & NETIF_F_LRO) {
+ VNIC_STATS_INC(login->port_stats.lro_flushed);
+ lro_flush_all(&rx_res->lro);
+ }
+#endif
+
+ return polled;
+}
+
+/* RX CQ polling - called by NAPI */
+#ifndef _BP_NAPI_POLL
+int vnic_poll_cq_rx(struct napi_struct *napi, int budget)
+{
+ struct vnic_rx_res *rx_res = container_of(napi, struct vnic_rx_res, napi);
+ struct vnic_login *login = rx_res->login;
+ struct ib_cq *cq_rx = rx_res->cq;
+ int rx_res_index = rx_res->index, polled;
+
+ /* shouldn't happen, since when stopped=1 NAPI is disabled */
+ if (unlikely(rx_res->stopped)) {
+#ifndef _BP_NAPI_NETIFRX
+ napi_complete(napi);
+#else
+ netif_rx_complete(login->dev, napi);
+#endif
+ return 0;
+ }
+
+ polled = vnic_drain_rx_cq(login, min(budget, VNIC_MAX_RX_CQE), rx_res_index);
+ vnic_dbg_data(login->name, "after vnic_drain_rx_cq budget %d,"
+ " done %d, index %d\n", budget, polled, rx_res_index);
+
+ /* If we used up all the quota - we're probably not done yet... */
+ ASSERT(polled <= budget);
+ if (polled < budget) {
+ /* ATTENTION: ARM CQ must come after napi_complete() */
+#ifndef _BP_NAPI_NETIFRX
+ napi_complete(napi);
+#else
+ netif_rx_complete(login->dev, napi);
+#endif
+ /* Eventually calls vnic_comp_handler_rx() */
+ if (ib_req_notify_cq(cq_rx, IB_CQ_NEXT_COMP))
+ vnic_err(login->name, "ib_req_notify_cq failed\n");
+ }
+
+ return polled;
+}
+#else
+int vnic_poll_cq_rx(struct net_device *poll_dev, int *budget)
+{
+ struct vnic_rx_res *rx_res = poll_dev->priv;
+ struct vnic_login *login = rx_res->login;
+ struct ib_cq *cq_rx = rx_res->cq;
+ int rx_res_index = rx_res->index, polled, max_poll = min(*budget, poll_dev->quota);
+
+ /* shouldn't happen, since when stopped=1 NAPI is disabled */
+ if (unlikely(rx_res->stopped)) {
+ netif_rx_complete(poll_dev);
+ return 0;
+ }
+
+ while (max_poll >= 0) {
+ polled = vnic_drain_rx_cq(login, min(max_poll, VNIC_MAX_RX_CQE), rx_res_index);
+ if (polled <= 0)
+ break;
+ else {
+ poll_dev->quota -= polled;
+ *budget -= polled;
+ }
+ max_poll -= polled;
+ }
+
+ if (!max_poll)
+ return 1;
+
+ netif_rx_complete(poll_dev);
+ ib_req_notify_cq(cq_rx, IB_CQ_NEXT_COMP);
+
+ return 0;
+}
+#endif
+
+static void vnic_comp_handler_rx(struct ib_cq *cq, void *rx_res_ptr)
+{
+ struct vnic_rx_res *rx_res = rx_res_ptr;
+ struct vnic_login *login = rx_res->login;
+
+ ASSERT(rx_res->cq == cq);
+ ASSERT(login->dev);
+
+ /* if this happens, the CQ will be re-armed later in vnic_open */
+ if (unlikely(rx_res->stopped))
+ return;
+
+#ifndef _BP_NAPI_POLL
+ /* calls vnic_poll_cq_rx() */
+#ifndef _BP_NAPI_NETIFRX
+ napi_schedule(&rx_res->napi);
+#else
+ netif_rx_schedule(login->dev, &rx_res->napi);
+#endif
+#else
+ netif_rx_schedule(rx_res->poll_dev);
+#endif /* _BP_NAPI_POLL*/
+
+}
+
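+/* Quiesce a data QP: move it to ERR and wait for the last-WQE event so the
+ * SRQ drains, flush the remaining TX/RX completions, then cycle the QP
+ * through RESET back to INIT so it is ready for reuse.
+ */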
+static void vnic_stop_qp(struct vnic_login *login, int qp_index)
+{
+ struct ib_qp_attr qp_attr = { .qp_state = IB_QPS_ERR };
+ struct vnic_qp_res *qp_res = &login->qp_res[qp_index];
+ struct vnic_rx_res *rx_res = &login->rx_res[qp_res->rx_index];
+ struct vnic_tx_res *tx_res = &login->tx_res[qp_res->tx_index];
+ struct vnic_rx_ring *ring = login->port->rx_ring[rx_res->index];
+ unsigned long flags;
+ int polled, attr_mask, rc, i;
+
+ /* move QP to ERR, wait for last WQE async event to drain the SRQ */
+ rc = ib_modify_qp(qp_res->qp, &qp_attr, IB_QP_STATE);
+ if (rc) {
+ /* calls vnic_qp_event_handler() */
+ vnic_warn(login->name, "failed to modify QP 0x%x to ERR state"
+ " (err = %d)\n", qp_res->qp->qp_num, rc);
+ /* continue anyway, but don't wait for completion */
+ } else {
+ wait_for_completion(&qp_res->last_wqe_complete);
+ }
+
+ /* === at this point, no NAPI/RX comps === */
+
+ /* drain TX CQ before moving to RESET, must hold tx_res->lock to
+ * protect from vnic_comp_handler_tx() after this call, all CQEs
+ * are polled (either by this direct call, or by CQ handlers)
+ */
+ spin_lock_irqsave(&tx_res->lock, flags);
+ vnic_drain_tx_cq(login, tx_res->index);
+ spin_unlock_irqrestore(&tx_res->lock, flags);
+
+ /* drain RX CQ before moving to RESET drop and re-post all comps */
+ spin_lock_bh(&ring->lock);
+ do {
+ polled = ib_poll_cq(rx_res->cq, VNIC_MAX_RX_CQE, rx_res->recv_wc);
+ for (i = 0; i < polled; ++i)
+ if (vnic_post_recv(ring, rx_res->recv_wc[i].wr_id))
+ vnic_dbg_data(login->name, "vnic_post_recv failed, "
+ "wr_id %llu\n", rx_res->recv_wc[i].wr_id);
+ } while (polled == VNIC_MAX_RX_CQE);
+ spin_unlock_bh(&ring->lock);
+
+ /* move QP to RESET */
+ qp_attr.qp_state = IB_QPS_RESET;
+ rc = ib_modify_qp(qp_res->qp, &qp_attr, IB_QP_STATE);
+ if (rc)
+ vnic_warn(login->name, "failed to modify QP 0x%x to RESET"
+ " state (err = %d)\n", qp_res->qp->qp_num, rc);
+
+ /* move QP to INIT to avoid multicast qp cache misses */
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.qkey = login->qkey;
+ qp_attr.port_num = login->port->num;
+ qp_attr.pkey_index = login->pkey_index;
+ attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE;
+
+ rc = ib_modify_qp(qp_res->qp, &qp_attr, attr_mask);
+ if (rc)
+ vnic_warn(login->name, "failed to modify QP 0x%x to INIT state"
+ " (err = %d)\n", qp_res->qp->qp_num, rc);
+}
+
+int vnic_ib_stop(struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+ struct vnic_tx_res *tx_res;
+ unsigned long begin = jiffies;
+ int wr_id, i;
+
+ /* flush tx and rx comps */
+ for (i = 0; i < login->qps_num; ++i)
+ vnic_stop_qp(login, i);
+
+ /* check any pending tx comps */
+ for (i = 0; i < login->tx_rings_num; i++) {
+ tx_res = &login->tx_res[i];
+ /* if tx_outstanding is non-zero, give it a chance to complete */
+ if (!tx_res->tx_outstanding)
+ continue;
+ msleep(10);
+
+ /* else, drain the tx cq. This indicates that something is
+ * wrong, thus we won't protect vnic_comp_handler_tx() here
+ */
+ while (tx_res->tx_outstanding &&
+ time_before(jiffies, begin + 5 * HZ)) {
+ vnic_drain_tx_cq(login, i);
+ msleep(1);
+ }
+
+ /* if they're still not complete, force skb deallocation */
+ if (!tx_res->tx_outstanding)
+ continue;
+ vnic_warn(login->name, "timing out: %d sends not completed\n",
+ tx_res->tx_outstanding);
+ while (tx_res->tx_outstanding) {
+ wr_id = tx_res->tx_tail & (vnic_tx_rings_len - 1);
+ vnic_dealloc_tx_skb(login, i, wr_id);
+ ++tx_res->tx_tail;
+ --tx_res->tx_outstanding;
+ }
+ }
+
+ return 0;
+}
+
+int vnic_ib_open(struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+ int i;
+
+ /* move QP to RTS and attach to bcast group */
+ for (i = 0; i < login->qps_num; ++i) {
+ if (vnic_init_qp(login, i)) {
+ vnic_err(login->name, "vnic_init_qp failed\n");
+ goto stop_qps;
+ }
+ }
+
+ return 0;
+
+stop_qps:
+ for (--i ; i >= 0; --i)
+ vnic_stop_qp(login, i);
+
+ return -EINVAL;
+}
+
+void vnic_destroy_qp(struct vnic_login *login, int qp_index)
+{
+ struct ib_qp *qp = login->qp_res[qp_index].qp;
+
+ if (!qp)
+ return;
+ if (ib_destroy_qp(qp))
+ vnic_warn(login->name, "ib_destroy_qp failed\n");
+ return;
+}
+
+void vnic_qp_to_reset(struct vnic_login *login, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int rc;
+
+ qp_attr.qp_state = IB_QPS_RESET;
+ rc = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+ if (rc)
+ vnic_err(login->name, "ib_modify_qp 0x%06x to RESET err %d\n",
+ qp->qp_num, rc);
+}
+
+int vnic_qp_to_init(struct vnic_login *login, struct ib_qp *qp, u32 qkey)
+{
+ struct ib_qp_attr qp_attr;
+ int attr_mask, rc;
+
+ /* move QP to INIT */
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.qkey = qkey;
+ qp_attr.port_num = login->port->num;
+ /* pkey will be overwritten later by login->pkey_index */
+ qp_attr.pkey_index = login->port->pkey_index;
+ attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE;
+
+ rc = ib_modify_qp(qp, &qp_attr, attr_mask);
+ if (rc) {
+ vnic_err(login->name, "ib_modify_qp 0x%06x to INIT err %d\n",
+ qp->qp_num, rc);
+ goto out_qp_reset;
+ }
+
+ return 0;
+
+out_qp_reset:
+ vnic_qp_to_reset(login, qp);
+ return rc;
+}
+
+int vnic_init_qp(struct vnic_login *login, int qp_index)
+{
+ struct ib_qp_attr qp_attr;
+ int attr_mask, rc, rc1;
+ struct ib_qp *qp = login->qp_res[qp_index].qp;
+
+ init_completion(&login->qp_res[qp_index].last_wqe_complete);
+ /* move QP to INIT */
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.qkey = login->qkey;
+ qp_attr.port_num = login->port->num;
+ qp_attr.pkey_index = login->pkey_index;
+ attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE;
+
+ rc = ib_modify_qp(qp, &qp_attr, attr_mask);
+ if (rc) {
+ vnic_err(login->name, "ib_modify_qp to INIT err %d\n", rc);
+ goto out_qp_reset;
+ }
+
+ /* move QP to RTR */
+ qp_attr.qp_state = IB_QPS_RTR;
+ attr_mask &= ~IB_QP_PORT;
+ rc = ib_modify_qp(qp, &qp_attr, attr_mask);
+ if (rc) {
+ vnic_err(login->name, "ib_modify_qp to RTR err %d\n", rc);
+ goto out_qp_reset;
+ }
+
+ /* move QP to RTS */
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 0;
+ attr_mask |= IB_QP_SQ_PSN;
+ attr_mask &= ~IB_QP_PKEY_INDEX;
+ rc = ib_modify_qp(qp, &qp_attr, attr_mask);
+ if (rc) {
+ vnic_err(login->name, "ib_modify_qp to RTS err, rc %d\n", rc);
+ goto out_qp_reset;
+ }
+
+ /* What a Good QP! */
+ vnic_dbg_data(login->name, "qpn 0x%06x moved to RTS\n",
+ qp->qp_num);
+
+ return 0;
+
+out_qp_reset:
+ qp_attr.qp_state = IB_QPS_RESET;
+ rc1 = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+ if (rc1)
+ vnic_err(login->name, "ib_modify_qp to RESET err %d\n", rc1);
+
+ return rc;
+}
+
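+/* QP async event handler; completes last_wqe_complete on the
+ * LAST_WQE_REACHED event, which vnic_stop_qp() waits for.
+ */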
+static void vnic_qp_event_handler(struct ib_event *event, void *ctx)
+{
+ struct vnic_qp_res *qp_res = ctx;
+ struct vnic_login *login = qp_res->login;
+
+ ASSERT(login);
+ vnic_dbg_data(login->name, "[%s] qpn %d got event %d\n",
+ event->device->name, event->element.qp->qp_num,
+ event->event);
+ if (event->event == IB_EVENT_QP_LAST_WQE_REACHED)
+ complete(&qp_res->last_wqe_complete);
+}
+
+void vnic_destroy_rx_res(struct vnic_login *login, int rx_res_index)
+{
+ struct ib_cq *cq = login->rx_res[rx_res_index].cq;
+ int rc = 0;
+
+ if (cq)
+ rc = ib_destroy_cq(cq);
+ if (rc)
+ vnic_warn(login->name, "ib_destroy_cq() index %d failed\n",
+ rx_res_index);
+}
+
+void vnic_destroy_tx_res(struct vnic_login *login, int tx_res_index)
+{
+ struct ib_cq *cq = login->tx_res[tx_res_index].cq;
+ struct vnic_tx_buf *tx_ring = login->tx_res[tx_res_index].tx_ring;
+ int rc = 0;
+
+ if (tx_ring)
+ vfree(tx_ring);
+ if (cq)
+ rc = ib_destroy_cq(cq);
+ if (rc)
+ vnic_warn(login->name, "ib_destroy_cq() index %d failed\n",
+ tx_res_index);
+}
+
+#if 0
+static inline int get_comp_vector(int index, struct vnic_port *port)
+{
+ int vector;
+ int num_cpus = roundup_pow_of_two(num_online_cpus());
+ int port_for_eq;
+
+ port_for_eq = (((index / port->dev->mdev->eq_per_port) %
+ port->dev->mdev->dev->caps.num_ports) + 1);
+ vector = (index % port->dev->mdev->eq_per_port) +
+ (port_for_eq * num_cpus);
+
+ return vector;
+}
+#endif
+
+int vnic_create_rx_res(struct vnic_login *login, int rx_res_index)
+{
+ struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index];
+ int comp_vector = rx_res_index % login->port->dev->ca->num_comp_vectors;
+ struct ib_cq *cq =
+ ib_create_cq(login->port->dev->ca,
+ vnic_comp_handler_rx,
+ NULL, &login->rx_res[rx_res_index],
+ vnic_rx_rings_len, comp_vector);
+ if (IS_ERR(cq)) {
+ vnic_err(login->name, "ib_create_cq failed, index %d, "
+ "comp_vector %d, rc %d\n",
+ rx_res_index, comp_vector, (int)PTR_ERR(cq));
+ return -EINVAL;
+ }
+
+ rx_res->cq = cq;
+ rx_res->index = rx_res_index;
+ rx_res->login = login;
+
+ return 0;
+}
+
+int vnic_create_tx_res(struct vnic_login *login, int tx_res_index)
+{
+ struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+ struct ib_cq *cq;
+ struct vnic_tx_buf *tx_ring;
+ int i, comp_vector;
+
+ tx_ring = vmalloc(vnic_tx_rings_len * sizeof *tx_res->tx_ring);
+ if (!tx_ring) {
+ vnic_err(login->name, "vmalloc failed to allocate %u * %lu\n",
+ vnic_tx_rings_len,
+ (long unsigned int) (sizeof *tx_res->tx_ring));
+ return -ENOMEM;
+ }
+ memset(tx_ring, 0, vnic_tx_rings_len * sizeof *tx_res->tx_ring);
+
+ /* set WQE drafts (the TX CQ itself is created below) */
+ tx_res->tx_wr.sg_list = tx_res->tx_sge;
+ tx_res->tx_wr.send_flags = IB_SEND_SIGNALED;
+ tx_res->tx_wr.wr.ud.remote_qkey = login->qkey;
+
+ for (i = 0; i < VNIC_MAX_TX_FRAGS; ++i)
+ tx_res->tx_sge[i].lkey = login->port->mr->lkey;
+
+ /* set mcast av draft */
+ memset(&tx_res->mcast_av, 0, sizeof(struct ib_ah_attr));
+ tx_res->mcast_av.port_num = login->port->num;
+ tx_res->mcast_av.ah_flags = IB_AH_GRH;
+
+ /* create tx cq */
+ comp_vector = tx_res_index % login->port->dev->ca->num_comp_vectors;
+ cq = ib_create_cq(login->port->dev->ca,
+ vnic_comp_handler_tx,
+ NULL, &login->tx_res[tx_res_index],
+ vnic_tx_rings_len, comp_vector);
+ if (IS_ERR(cq)) {
+ vnic_err(login->name, "ib_create_cq failed, index %d, "
+ "comp_vector %d, rc %d\n",
+ tx_res_index, comp_vector, (int)PTR_ERR(cq));
+ vfree(tx_ring);
+ return -EINVAL;
+ }
+
+ tx_res->tx_ring = tx_ring;
+ tx_res->cq = cq;
+ tx_res->index = tx_res_index;
+ tx_res->login = login;
+
+ return 0;
+}
+
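+/* Create all data UD QPs in a single range. Each QP uses the SRQ of its RX
+ * ring and a TX/RX CQ pair chosen round-robin over the rings, and is moved
+ * to INIT; on failure every QP created so far is reset and destroyed.
+ */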
+int vnic_create_qp_range(struct vnic_login *login)
+{
+ int qp_index, create_flags = 0, rc;
+ struct ib_qp_init_attr *attr;
+ struct ib_qp *qps[VNIC_MAX_NUM_CPUS];
+ struct vnic_qp_res *qp_res;
+
+ attr = kzalloc(VNIC_MAX_NUM_CPUS * sizeof *attr, GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ create_flags |= login->port->dev->attr.device_cap_flags &
+ IB_DEVICE_BLOCK_MULTICAST_LOOPBACK ?
+ IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK : 0;
+
+ /* TODO: rename IB_QP_CREATE_IPOIB_UD_LSO */
+ create_flags |= login->port->dev->attr.device_cap_flags &
+ IB_DEVICE_UD_TSO ?
+ IB_QP_CREATE_IPOIB_UD_LSO : 0;
+
+ for (qp_index = 0; qp_index < login->qps_num; ++qp_index) {
+ qp_res = &login->qp_res[qp_index];
+ qp_res->tx_index = qp_index % login->tx_rings_num;
+ qp_res->rx_index = qp_index % login->rx_rings_num;
+ memset(&attr[qp_index], 0, sizeof(struct ib_qp_init_attr));
+ attr[qp_index].cap.max_send_wr = vnic_tx_rings_len;
+ attr[qp_index].cap.max_send_sge = VNIC_MAX_TX_FRAGS;
+ attr[qp_index].cap.max_recv_wr = 0; /* we use SRQ */
+ attr[qp_index].cap.max_recv_sge = 0;
+ attr[qp_index].sq_sig_type = IB_SIGNAL_ALL_WR;
+ attr[qp_index].qp_type = IB_QPT_UD;
+ attr[qp_index].send_cq = login->tx_res[qp_res->tx_index].cq;
+ attr[qp_index].recv_cq = login->rx_res[qp_res->rx_index].cq;
+ attr[qp_index].srq = login->port->rx_ring[qp_res->rx_index]->srq;
+ attr[qp_index].event_handler = vnic_qp_event_handler;
+ attr[qp_index].qp_context = &login->qp_res[qp_index];
+ attr[qp_index].create_flags = create_flags;
+ attr[qp_index].cap.max_inline_data = vnic_inline_tshold;
+ }
+
+ rc = vnic_ib_create_qp_range(login->port->pd, attr, NULL,
+ login->qps_num, login->qps_num, qps);
+ if (rc) {
+ vnic_err(login->name, "vnic_ib_create_qp_range failed, rc %d\n", rc);
+ goto err;
+ }
+
+ for (qp_index = 0; qp_index < login->qps_num; ++qp_index) {
+ qp_res = &login->qp_res[qp_index];
+ qp_res->qp = qps[qp_index];
+ qp_res->login = login;
+ }
+
+ for (qp_index = 0; qp_index < login->qps_num; ++qp_index) {
+ rc = vnic_qp_to_init(login, qps[qp_index], login->qkey);
+ if (rc) {
+ vnic_err(login->name, "vnic_qp_to_init failed, rc %d\n", rc);
+ goto destroy_qps;
+ }
+ }
+
+ kfree(attr);
+ return 0;
+
+destroy_qps:
+ for (qp_index--; qp_index>=0; qp_index--)
+ vnic_qp_to_reset(login, qps[qp_index]);
+
+ for (qp_index = 0; qp_index < login->qps_num; ++qp_index)
+ vnic_destroy_qp(login, qp_index);
+
+err:
+ kfree(attr);
+ return rc;
+}
+
+static inline int use_inline(struct sk_buff *skb)
+{
+ return skb->len <= vnic_inline_tshold && !skb_shinfo(skb)->nr_frags;
+}
+
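+/* Build the SG list (optional OOB encap header, inline or mapped linear
+ * part, page frags and runt padding), fill the checksum/LSO offload fields
+ * and post the send WR on the QP mapped to this TX ring.
+ */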
+int vnic_post_send(struct vnic_login *login, int tx_res_index,
+ u64 wr_id, struct ib_ah *ah, u32 dqpn)
+{
+ struct ib_send_wr *bad_wr;
+ struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+ struct vnic_qp_res *qp_res = &login->qp_res[tx_res_index % login->qps_num];
+ struct vnic_tx_buf *tx_req = &tx_res->tx_ring[wr_id];
+ skb_frag_t *frags = skb_shinfo(tx_req->skb)->frags;
+ int nr_frags = skb_shinfo(tx_req->skb)->nr_frags, i, off = 0;
+
+ ASSERT(qp_res);
+ ASSERT(tx_res);
+ ASSERT(qp_res->tx_index == tx_res->index);
+ ASSERT(qp_res->qp->send_cq == tx_res->cq);
+
+ if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) {
+ tx_res->tx_sge[off].addr = tx_req->mapping[off];
+ tx_res->tx_sge[off].length = VNIC_ENCAP_LEN;
+ off++;
+ }
+
+ if (likely(skb_headlen(tx_req->skb))) {
+ if (vnic_encap_headroom && use_inline(tx_req->skb)) {
+ tx_res->tx_wr.send_flags |= IB_SEND_INLINE;
+ wr_id |= VNIC_SEND_INLINE_FLAG;
+ tx_res->tx_sge[off].addr = (unsigned long)tx_req->skb->data;
+ } else {
+ tx_res->tx_wr.send_flags &= ~IB_SEND_INLINE;
+ tx_res->tx_sge[off].addr = tx_req->mapping[off];
+ }
+ tx_res->tx_sge[off].length = skb_headlen(tx_req->skb);
+ off++;
+ }
+
+ for (i = 0; i < nr_frags; ++i) {
+ tx_res->tx_sge[i + off].addr = tx_req->mapping[i + off];
+ tx_res->tx_sge[i + off].length = frags[i].size;
+ }
+
+ /* handle runt packets using additional SG */
+ if (unlikely(tx_req->skb->len < login->zlen)) {
+ /* Note: always extend runt packets (for both
+ * internal & external) for virtualization, some emulators
+ * drop runt packets, so we need to avoid runt packets even
+ * if the traffic is not passing the bridge
+ */
+ vnic_dbg_data(login->name, "runt packet, skb %p len %d => %d\n",
+ tx_req->skb, tx_req->skb->len, login->zlen);
+ /* If there are frags, then the packet is longer than 60B */
+ if (use_inline(tx_req->skb))
+ tx_res->tx_sge[i + off].addr = (u64)(unsigned long)login->pad_va;
+ else
+ tx_res->tx_sge[i + off].addr = login->pad_dma;
+
+ tx_res->tx_sge[i + off].length = login->zlen - tx_req->skb->len;
+ ++nr_frags;
+ VNIC_STATS_INC(login->port_stats.runt_packets);
+ }
+
+ tx_res->tx_wr.num_sge = nr_frags + off;
+ tx_res->tx_wr.wr_id = wr_id;
+ tx_res->tx_wr.wr.ud.remote_qpn = dqpn;
+ tx_res->tx_wr.wr.ud.ah = ah;
+
+ /* check if we need to calc csum */
+ if (tx_req->skb->ip_summed == CHECKSUM_PARTIAL) {
+ u16 csum_pseudo;
+
+ /* calc pseudo header csum without the length
+ * and put in the transport's header checksum field.
+ * The HW will calculate the rest of it (SWP)
+ */
+ if (tx_req->ip_off)
+ csum_pseudo = ~csum_tcpudp_magic(ip_hdr(tx_req->skb)->saddr,
+ ip_hdr(tx_req->skb)->daddr,
+ 0, /* length */
+ ip_hdr(tx_req->skb)->protocol,
+ 0);
+ else
+ csum_pseudo = ~csum_ipv6_magic(&ipv6_hdr(tx_req->skb)->saddr,
+ &ipv6_hdr(tx_req->skb)->daddr,
+ 0, /* length */
+ ipv6_hdr(tx_req->skb)->nexthdr,
+ 0);
+
+ /* place the calculated csum in the checksum field in
+ * tcp/udp header
+ */
+ if (tx_req->tcp_off)
+ tcp_hdr(tx_req->skb)->check = csum_pseudo;
+ else
+ udp_hdr(tx_req->skb)->check = csum_pseudo;
+
+ /* set CSUM flag in ib_send_wr */
+ tx_res->tx_wr.send_flags |= IB_SEND_IP_CSUM;
+ } else {
+ /* csum already calculated in SW */
+ tx_res->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+ }
+
+ /* prepare TSO header */
+ if (skb_is_gso(tx_req->skb)) {
+ tx_res->tx_wr.wr.ud.mss = skb_shinfo(tx_req->skb)->gso_size + tx_req->hlen;
+ tx_res->tx_wr.wr.ud.header = tx_req->phead;
+ tx_res->tx_wr.wr.ud.hlen = tx_req->hlen;
+ tx_res->tx_wr.opcode = IB_WR_LSO;
+ } else {
+ tx_res->tx_wr.opcode = IB_WR_SEND;
+ }
+
+ vnic_dbg_data(login->name,
+ "skb %p wr_id %llu sqpn 0x%06x dqpn 0x%06x num_sge "
+ "%d phead %p was sent\n", tx_req->skb, wr_id, qp_res->qp->qp_num,
+ dqpn, tx_res->tx_wr.num_sge, tx_req->phead);
+
+ /* if EoIB encap is OOB, prepend it to the LSO header */
+ if (!vnic_encap_headroom && skb_is_gso(tx_req->skb)) {
+ memcpy(tx_res->lso_hdr, VNIC_SKB_GET_ENCAP(tx_req->skb),
+ VNIC_ENCAP_LEN);
+ memcpy((u8 *)(tx_res->lso_hdr) + VNIC_ENCAP_LEN,
+ tx_res->tx_wr.wr.ud.header,
+ tx_res->tx_wr.wr.ud.hlen);
+ tx_res->tx_wr.wr.ud.header = tx_res->lso_hdr;
+ tx_res->tx_wr.wr.ud.mss += VNIC_ENCAP_LEN;
+ tx_res->tx_wr.wr.ud.hlen += VNIC_ENCAP_LEN;
+ }
+
+ return vnic_ib_post_send(qp_res->qp, &tx_res->tx_wr, &bad_wr,
+ tx_req->ip_off,
+ tx_req->ip6_off,
+ tx_req->tcp_off,
+ tx_req->udp_off);
+}
+
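+/* DMA-map a TX skb: the optional out-of-band EoIB encap header, the linear
+ * part and every page fragment. Small packets sent inline (with encap
+ * headroom) need no mapping. On partial failure all mappings are undone.
+ */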
+static int vnic_dma_map_tx(struct ib_device *ca, struct vnic_tx_buf *tx_req)
+{
+ struct sk_buff *skb = tx_req->skb;
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ u64 *mapping = tx_req->mapping;
+ int i = 0, off = 0, headlen = skb_headlen(skb);
+
+ if (vnic_encap_headroom && use_inline(skb))
+ return 0;
+
+ if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) {
+ mapping[off] = ib_dma_map_single(ca, VNIC_SKB_GET_ENCAP(skb),
+ VNIC_ENCAP_LEN, DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(ca, mapping[off])))
+ return -EIO;
+ off++;
+ }
+
+ if (likely(headlen)) {
+ mapping[off] = ib_dma_map_single(ca, skb->data,
+ headlen, DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(ca, mapping[off])))
+ goto partial_error;
+ off++;
+ }
+
+ for (i = 0; i < shinfo->nr_frags; ++i) {
+ skb_frag_t *frag = &shinfo->frags[i];
+ mapping[i + off] = ib_dma_map_page(ca, frag->page.p,
+ frag->page_offset,
+ frag->size, DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
+ goto partial_error;
+ }
+
+ return 0;
+
+partial_error:
+ for (--i; i >= 0; i--) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ ib_dma_unmap_page(ca, mapping[i + off], frag->size,
+ DMA_TO_DEVICE);
+ }
+
+ if (headlen)
+ ib_dma_unmap_single(ca, mapping[--off], skb_headlen(skb),
+ DMA_TO_DEVICE);
+
+ if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb))
+ ib_dma_unmap_single(ca, mapping[--off], VNIC_ENCAP_LEN,
+ DMA_TO_DEVICE);
+
+ return -EIO;
+}
+
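+/* Transmit one skb on the given TX ring: derive the checksum/TSO offload
+ * offsets from the EoIB encap header, DMA-map the skb, stop the queue when
+ * the ring becomes full, and post the send; in polling mode the TX CQ is
+ * drained inline every vnic_max_tx_outs packets.
+ */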
+void vnic_send(struct vnic_login *login, struct sk_buff *skb,
+ struct ib_ah *ah, u32 dqpn, int tx_res_index)
+{
+ struct eoibhdr *_eoib_hdr = VNIC_SKB_GET_ENCAP(skb);
+ struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+ struct vnic_tx_buf *tx_req;
+ unsigned long flags = 0;
+ u64 wr_id;
+ int tx_pkt_num = 1;
+ u8 ip_off;
+
+ if (!vnic_tx_polling)
+ spin_lock_irqsave(&tx_res->lock, flags);
+
+ ASSERT(tx_res_index < login->tx_rings_num);
+ wr_id = tx_res->tx_head & (vnic_tx_rings_len - 1);
+ tx_req = &tx_res->tx_ring[wr_id];
+ tx_req->skb = skb;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ tx_req->ip_off = tx_req->ip6_off = tx_req->tcp_off = tx_req->udp_off = 0;
+ if (VNIC_IP_CSUM_OK(_eoib_hdr)) {
+ ip_off = vnic_encap_headroom ?
+ ((skb_network_header(skb) - skb->data) >> 1) :
+ /* skb_network_header doesn't count the encap since it's OOB */
+ ((skb_network_header(skb) - skb->data + VNIC_ENCAP_LEN) >> 1);
+ switch (ntohs(skb->protocol)) {
+ case ETH_P_IP:
+ tx_req->ip_off = ip_off;
+ break;
+ case ETH_P_IPV6:
+ tx_req->ip6_off = ip_off;
+ }
+ }
+ if (VNIC_TCP_CSUM_OK(_eoib_hdr))
+ tx_req->tcp_off =
+ (skb_transport_header(skb) - skb_network_header(skb)) >> 2;
+ else if (VNIC_UDP_CSUM_OK(_eoib_hdr))
+ tx_req->udp_off =
+ (skb_transport_header(skb) - skb_network_header(skb)) >> 2;
+ ASSERT(!tx_req->udp_off || !tx_req->tcp_off);
+ vnic_dbg_data(login->name, "ip_off = %d, tcp_off = %d, udp_off = %d\n",
+ tx_req->ip_off, tx_req->tcp_off, tx_req->udp_off);
+ VNIC_STATS_INC(login->port_stats.tx_chksum_offload);
+ }
+
+ /* TSO skb */
+ if (skb_is_gso(skb)) {
+ tx_req->hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ tx_req->phead = skb->data;
+ ASSERT(skb_pull(skb, tx_req->hlen));
+ VNIC_STATS_INC(login->port_stats.tso_packets);
+ tx_pkt_num = skb_shinfo(tx_req->skb)->gso_segs;
+ }
+
+ /* map tx skb */
+ if (unlikely(vnic_dma_map_tx(login->port->dev->ca, tx_req)))
+ goto err;
+
+ /* send.. unmap.. free skb.. drain tx cq.. [pray] */
+ if (unlikely(++tx_res->tx_outstanding == vnic_tx_rings_len)) {
+ if (++tx_res->tx_stopped_cnt % 100 == 0)
+ vnic_dbg(login->name, "tx queue %d stopped cnt %d, outs %d\n",
+ tx_res->index,
+ tx_res->tx_stopped_cnt,
+ tx_res->tx_outstanding);
+ ASSERT(!VNIC_TXQ_STOPPED(tx_res));
+ VNIC_TXQ_STOP(tx_res);
+ /* vnic_drain_arm_tx_cq() will arm the cq OR resume the ring */
+ VNIC_STATS_DO_INC(login->port_stats.queue_stopped);
+ }
+
+ ASSERT(tx_res->tx_outstanding <= vnic_tx_rings_len);
+
+ if (unlikely(vnic_post_send(login, tx_res_index, wr_id, ah, dqpn))) {
+ vnic_warn(login->name, "vnic_post_send failed\n");
+ VNIC_STATS_DO_INC(tx_res->stats.tx_errors);
+ VNIC_STATS_DO_INC(tx_res->stats.tx_dropped);
+ --tx_res->tx_outstanding;
+ vnic_dealloc_tx_skb(login, tx_res->index, wr_id);
+ /* no need to netif_wake_queue() here, because
+ * vnic_comp_handler_tx() will eventually be called
+ * for armed cq, and it will wake-up the queue when it's ready
+ */
+ } else {
+ VNIC_STATS_DO_ADD(tx_res->stats.tx_packets, tx_pkt_num);
+ VNIC_STATS_DO_ADD(tx_res->stats.tx_bytes, skb->len);
+ login->dev->trans_start = jiffies;
+ ++tx_res->tx_head;
+
+ if (vnic_tx_polling) {
+ if (likely(!skb_shared(skb)))
+ skb_orphan(skb);
+ else
+ VNIC_STATS_DO_INC(login->port_stats.shared_packets);
+ }
+ }
+
+ /* poll every vnic_max_tx_outs packets */
+ if (vnic_tx_polling) {
+ if (tx_res->tx_outstanding > vnic_max_tx_outs ||
+ VNIC_TXQ_STOPPED(tx_res))
+ vnic_drain_arm_tx_cq(login, tx_res_index);
+ } else
+ spin_unlock_irqrestore(&tx_res->lock, flags);
+
+ return;
+
+err:
+ VNIC_STATS_DO_INC(tx_res->stats.tx_dropped);
+ VNIC_STATS_DO_INC(tx_res->stats.tx_errors);
+ dev_kfree_skb_any(skb);
+
+ if (!vnic_tx_polling)
+ spin_unlock_irqrestore(&tx_res->lock, flags);
+
+ return;
+}
+
+void vnic_ib_free_ring(struct vnic_rx_ring *ring)
+{
+ ASSERT(ring->srq);
+ ib_destroy_srq(ring->srq);
+}
+
+int vnic_ib_init_ring(struct vnic_rx_ring *ring)
+{
+ struct ib_srq_init_attr srq_attr;
+ struct vnic_port *port = ring->port;
+ int rc = 0, headroom = 10;
+
+ /* alloc SRQ */
+ memset(&srq_attr, 0, sizeof(struct ib_srq_init_attr));
+ srq_attr.attr.max_sge = VNIC_MAX_RX_FRAGS;
+ srq_attr.attr.max_wr = vnic_rx_rings_len + headroom;
+ srq_attr.attr.srq_limit = vnic_rx_rings_len + headroom;
+ ring->srq = ib_create_srq(port->pd, &srq_attr);
+ if (IS_ERR(ring->srq)) {
+ vnic_err(ring->port->name, "ib_create_srq failed, index %d, rc %d\n",
+ ring->index, (int)PTR_ERR(ring->srq));
+ rc = (int)PTR_ERR(ring->srq);
+ }
+
+ return rc;
+}
+
+int vnic_port_ib_init(struct vnic_port *port)
+{
+ int i;
+
+ /* alloc PD */
+ port->pd = ib_alloc_pd(port->dev->ca);
+ if (IS_ERR(port->pd)) {
+ vnic_err(port->name, "failed to allocate PD\n");
+ goto err;
+ }
+ vnic_dbg_data(port->name, "port->pd %p\n", port->pd);
+
+ /* alloc MR */
+ port->mr = ib_get_dma_mr(port->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(port->mr)) {
+ vnic_err(port->name, "failed to allocate MR\n");
+ goto free_pd;
+ }
+ vnic_dbg_data(port->name, "port->mr %p\n", port->mr);
+
+ /* alloc RX RING */
+ for (i = 0; i < port->rx_rings_num; ++i) {
+ port->rx_ring[i] = vnic_create_rx_ring(port, i);
+ if (IS_ERR(port->rx_ring[i])) {
+ vnic_err(port->name, "failed to allocate rx_ring %d\n", i);
+ port->rx_ring[i] = NULL;
+ goto free_rx_ring;
+ }
+ }
+ vnic_dbg_data(port->name, "allocated %d RX rings\n", port->rx_rings_num);
+
+ return 0;
+
+free_rx_ring:
+ for (i = 0; i < port->rx_rings_num; ++i)
+ vnic_destroy_rx_ring(port->rx_ring[i]);
+/* free_mr: */
+ ib_dereg_mr(port->mr);
+free_pd:
+ ib_dealloc_pd(port->pd);
+err:
+ return -EINVAL;
+
+}
+
+void vnic_port_ib_cleanup(struct vnic_port *port)
+{
+ int i;
+
+ for (i = 0; i < port->rx_rings_num; ++i)
+ vnic_destroy_rx_ring(port->rx_ring[i]);
+
+ ib_dereg_mr(port->mr);
+ ib_dealloc_pd(port->pd);
+
+ return;
+}
+
+void vnic_ib_dispatch_event(struct ib_event *event)
+{
+ return;
+}
+
+int vnic_ib_set_moder(struct vnic_login *login, u16 rx_usecs, u16 rx_frames,
+ u16 tx_usecs, u16 tx_frames)
+{
+ int rc, i;
+
+ vnic_dbg_moder(login->name, "set coalescing params for mtu:%d to "
+ "rx_frames:%d rx_usecs:%d, "
+ "tx_frames:%d tx_usecs:%d, "
+ "adaptive_rx_coal:%d, "
+ "adaptive_tx_coal:%d, "
+ "sample_interval:%d, "
+ "port.state: %d\n",
+ login->dev->mtu,
+ rx_frames, rx_usecs,
+ tx_frames, tx_usecs,
+ login->adaptive_rx_coal, 0,
+ login->sample_interval, login->port->attr.state);
+
+ for (i = 0; i < login->tx_rings_num; ++i) {
+ rc = ib_modify_cq(login->tx_res[i].cq, tx_frames, tx_usecs);
+ if (rc && rc != -ENOSYS) {
+ vnic_warn(login->name, "failed modifying tx_res,"
+ " rc %d, tx ring index %d\n", rc, i);
+ return rc;
+ }
+ }
+
+ for (i = 0; i < login->rx_rings_num; ++i) {
+ rc = ib_modify_cq(login->rx_res[i].cq, rx_frames, rx_usecs);
+ if (rc && rc != -ENOSYS) {
+ vnic_warn(login->name, "failed modifying rx_res,"
+ " rc %d, rx ring index %d\n", rc, i);
+ return rc;
+ }
+ }
+
+ return 0;
+}
+
+int vnic_ib_down(struct net_device *dev)
+{
+ return 0;
+}
+
+int vnic_ib_up(struct net_device *dev)
+{
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+#include "vnic_fip_discover.h"
+
+static void vnic_mace_dealloc(struct vnic_mac *mace)
+{
+ ASSERT(mace);
+ kfree(mace);
+}
+
+static struct vnic_mac *vnic_mace_alloc(const u8 *mac, u16 vnic_id)
+{
+ struct vnic_mac *mace;
+
+ mace = kzalloc(sizeof *mace, GFP_ATOMIC);
+ if (!mace)
+ return ERR_PTR(-ENOMEM);
+
+ /* set mac entry fields */
+ memcpy(mace->mac, mac, ETH_ALEN);
+ mace->created = jiffies;
+ mace->last_tx = jiffies;
+ mace->vnic_id = vnic_id;
+
+ return mace;
+}
+
+static void vnic_mace_del(struct vnic_login *login, struct vnic_mac *mace)
+{
+ ASSERT(mace);
+ rb_erase(&mace->rb_node, &login->mac_tree);
+}
+
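+/* Insert a MAC entry into the login's rb-tree, keyed by the 6-byte MAC.
+ * Returns 0 on success or -EEXIST if the MAC is already present.
+ */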
+static int vnic_mace_add(struct vnic_login *login, struct vnic_mac *mace)
+{
+ struct rb_node **n = &login->mac_tree.rb_node, *pn = NULL;
+ struct vnic_mac *mace_t;
+ int rc;
+
+ while (*n) {
+ pn = *n;
+ mace_t = rb_entry(pn, struct vnic_mac, rb_node);
+ rc = memcmp(mace->mac, mace_t->mac, ETH_ALEN);
+ if (rc < 0)
+ n = &pn->rb_left;
+ else if (rc > 0)
+ n = &pn->rb_right;
+ else {
+ rc = -EEXIST;
+ goto out;
+ }
+ }
+
+ rb_link_node(&mace->rb_node, pn, n);
+ rb_insert_color(&mace->rb_node, &login->mac_tree);
+ rc = 0;
+
+out:
+ return rc;
+}
+
+/* vnic_mace_search --
+ * Return entry pointer if found, or ERR_PTR(-ENODATA) if not found.
+ */
+static struct vnic_mac *vnic_mace_search(struct vnic_login *login, u8 *mac)
+{
+ struct rb_node *n = login->mac_tree.rb_node;
+ struct vnic_mac *mace_t;
+ int rc;
+
+ ASSERT(login);
+ ASSERT(mac);
+
+ while (n) {
+ mace_t = rb_entry(n, struct vnic_mac, rb_node);
+ ASSERT(mace_t);
+ rc = memcmp(mac, mace_t->mac, ETH_ALEN);
+ if (rc < 0)
+ n = n->rb_left;
+ else if (rc > 0)
+ n = n->rb_right;
+ else
+ goto out;
+ }
+
+ mace_t = ERR_PTR(-ENODATA);
+
+out:
+ return mace_t;
+}
+
+/* vnic_mace_update --
+ * Remove: -ENODATA if not found, if removed, update ref_cnt, return 0
+ * Add: -ENOMEM if no mem, -EEXIST if already exists,
+ * if added, update ref_cnt, return 0
+ * NOTE: ref counters must be updated here, as this function is
+ * shared among multiple entry points
+ */
+int vnic_mace_update(struct vnic_login *login, u8 *mac, u16 vnic_id, int remove)
+{
+ struct vnic_mac *mace;
+ int rc;
+
+ mace = vnic_mace_search(login, mac);
+ if (remove) {
+ if (IS_ERR(mace))
+ return -ENODATA;
+ vnic_mace_del(login, mace);
+ vnic_mace_dealloc(mace);
+ /* update ref cnt */
+ ASSERT(atomic_read(&login->vnic_child_cnt));
+ atomic_dec(&login->vnic_child_cnt);
+ } else {
+ if (PTR_ERR(mace) != -ENODATA)
+ return -EEXIST;
+
+ /* test ref cnt */
+ if (atomic_read(&login->vnic_child_cnt) + 1 > vnic_child_max) {
+ vnic_warn(login->name, "too many child vNics, max %d\n",
+ vnic_child_max);
+ return -EUSERS; /* too many users */
+ }
+
+ mace = vnic_mace_alloc(mac, vnic_id);
+ if (IS_ERR(mace))
+ return PTR_ERR(mace);
+
+ rc = vnic_mace_add(login, mace);
+ if (rc) {
+ vnic_mace_dealloc(mace);
+ return rc;
+ }
+ /* update ref cnt */
+ atomic_inc(&login->vnic_child_cnt);
+ vnic_dbg_mac(login->name,
+ "updated mac "MAC_6_PRINT_FMT" remove %d\n",
+ MAC_6_PRINT_ARG(mac), remove);
+ }
+
+ return 0;
+}
+
+/* This function can be called from the fast data-path, so the login instance
+ * must be protected here; likely/unlikely below match the hard_start_xmit
+ * fast data flow.
+ * + caller must hold login->mac_rwlock (read_lock is enough because we only
+ *   queue the job here)
+ * + it queues a job to create (or remove) a child vNic
+ */
+int vnic_child_update(struct vnic_login *login, u8 *mac, int remove)
+{
+ struct vnic_mac *mace;
+ char *cmd_str;
+ struct fip_hadmin_cmd *cmd_hadmin;
+ int count, rc = -EINVAL;
+ u16 vnic_id = 0;
+
+ vnic_dbg_func(login->name);
+
+ mace = vnic_mace_search(login, mac);
+
+ /* if asked to add, and data already exists, abort */
+ if (likely(!remove && !IS_ERR(mace))) {
+ mace->last_tx = jiffies;
+ return -EEXIST;
+ }
+
+ if (!remove) {
+ /* test if there are too many child vNics; the same check exists in
+ * vnic_mace_update(), but we have it here as well to let
+ * vnic_set_mac() return a friendly rc
+ */
+ if (atomic_read(&login->vnic_child_cnt) + 1 > vnic_child_max) {
+ vnic_warn(login->name, "too many child vNics, "
+ "max %d\n", vnic_child_max);
+ return -EUSERS; /* too many users */
+ }
+
+ /* mace was not found here (otherwise -EEXIST was returned above) */
+ ASSERT(mace);
+ /* generate new vnic_id only when new child is being added */
+ vnic_id = atomic_inc_return(&login->port->vnic_child_ids);
+ /* set bit 14 so we avoid conflict with normal host/net admin */
+ vnic_id %= (1 << (VNIC_ID_LEN - 2));
+ vnic_id |= (1 << (VNIC_ID_LEN - 2));
+
+ /* TODO: update hadmin user-script and manual to make hadmin
+ * vnic_id interval >= 16K (1<<14 == 16384) so bit 14 is clear
+ * for parent host admin.
+ * to avoid atomic counter wrap around, move to bitmap array
+ */
+ } else {
+ /* if asked to remove, and data not found, abort */
+ if (IS_ERR(mace))
+ return -ENODATA;
+
+ ASSERT(mace);
+ vnic_id = mace->vnic_id;
+ }
+
+ /* allocate cmd structs, too big to be local vars
+ * use GFP_ATOMIC because this func can be called from data path
+ */
+ cmd_str = kmalloc(sizeof *cmd_str * PAGE_SIZE, GFP_ATOMIC);
+ if (!cmd_str)
+ return -ENOMEM;
+
+ cmd_hadmin = kmalloc(sizeof *cmd_hadmin, GFP_ATOMIC);
+ if (!cmd_hadmin) {
+ kfree(cmd_str);
+ return -ENOMEM;
+ }
+
+ /* inherit command from parent, change:
+ * name, parent, mac, vnic_id and source
+ * Note: cannot use parent login->fip_vnic->cmd here
+ * in order to support net-admin-vnics
+ */
+ vnic_login_cmd_init(cmd_hadmin);
+
+ /* child vNic name scheme:
+ * eth<parent-cnt>.c<child-vnic-id>
+ * Note: avoid sysfs files conflict (that's why parent unique cnt must
+ * be included in the name here)
+ */
+ snprintf(cmd_hadmin->c_name, MAX_INPUT_LEN, "%s%u.c%u",
+ "eth", login->cnt, vnic_id);
+ snprintf(cmd_hadmin->c_mac, MAX_INPUT_LEN, MAC_6_PRINT_FMT,
+ MAC_6_PRINT_ARG(mac));
+ snprintf(cmd_hadmin->c_vnic_id, MAX_INPUT_LEN, "%u",
+ vnic_id);
+ snprintf(cmd_hadmin->c_eport, MAX_INPUT_LEN, "%s",
+ login->fip_vnic->gw_info.gw_port_name);
+ snprintf(cmd_hadmin->c_parent, MAX_INPUT_LEN, "%s",
+ login->dev->name);
+ snprintf(cmd_hadmin->c_bxname, MAX_INPUT_LEN, "%s",
+ login->fip_vnic->gw_info.system_name);
+ snprintf(cmd_hadmin->c_bxguid, MAX_INPUT_LEN, VNIC_GUID_FMT,
+ VNIC_GUID_RAW_ARG(login->fip_vnic->gw_info.system_guid));
+
+ /* all hadmin vNics must use same BX format (guid vs. name) */
+ if (login->fip_vnic->hadmined) {
+ snprintf(cmd_hadmin->c_bxname, MAX_INPUT_LEN, "%s",
+ login->fip_vnic->cmd.c_bxname);
+ snprintf(cmd_hadmin->c_bxguid, MAX_INPUT_LEN, "%s",
+ login->fip_vnic->cmd.c_bxguid);
+ }
+
+ /* VLAN is optional, set it only when used by parent */
+ if (login->vlan_used)
+ snprintf(cmd_hadmin->c_vid, MAX_INPUT_LEN, "%d",
+ login->fip_vnic->vlan);
+
+ /* ready to set the command */
+ count = vnic_login_cmd_set(cmd_str, cmd_hadmin);
+ if (!count)
+ goto out;
+
+ /* queue the job (similar to the sysfs write function); it will
+ * eventually call fip_discover_hadmin_update_parent() ->
+ * vnic_mace_update()
+ */
+ count = fip_hadmin_sysfs_update(login->port, cmd_str, count, remove);
+ if (count <= 0 && count != -EEXIST)
+ goto out;
+
+ /* at this point, job queued, return success */
+ rc = 0;
+
+out:
+ kfree(cmd_str);
+ kfree(cmd_hadmin);
+ return rc;
+}
+
+void vnic_child_flush(struct vnic_login *login, int all)
+{
+ struct rb_node *n;
+ struct vnic_mac *mace, *mace_t;
+ LIST_HEAD(local_list);
+
+ vnic_dbg_func(login->name);
+
+ n = rb_first(&login->mac_tree);
+ while (n) {
+ mace = rb_entry(n, struct vnic_mac, rb_node);
+ list_add_tail(&mace->list, &local_list);
+ n = rb_next(n);
+ }
+
+ list_for_each_entry_safe(mace, mace_t, &local_list, list) {
+ list_del(&mace->list);
+ /* if not-flush-all, and mac is dev_addr mac, skip this entry */
+ if (!all && !memcmp(login->dev->dev_addr, mace->mac, ETH_ALEN))
+ continue;
+ vnic_child_update(login, mace->mac, 1);
+ vnic_mace_del(login, mace);
+ vnic_mace_dealloc(mace);
+ }
+}
+
+/* Find the parent vNic, add/remove the child vNic in its mac_tree and
+ * sync the child's qp_base_num with the parent.
+ * For child removal it's OK not to find the parent or the child mac entry.
+ */
+int vnic_parent_update(struct vnic_port *port, char *name, u16 vnic_id,
+ u8 *mac, u32 *qp_base_num_ptr, char *parent_name,
+ int remove)
+{
+ struct vnic_login *login;
+ int rc = -ENODATA;
+
+ vnic_dbg_func(name);
+
+ mutex_lock(&port->mlock);
+ list_for_each_entry(login, &port->login_list, list) {
+ vnic_dbg_mac(name, "checking parent %s for child %s (expect %s)\n",
+ login->dev->name, name, parent_name);
+ /* check if parent vnic has valid QPN and not being destroyed */
+ if (!strcmp(login->dev->name, parent_name) &&
+ test_bit(VNIC_STATE_LOGIN_PRECREATE_2, &login->fip_vnic->login_state) &&
+ !login->fip_vnic->flush) {
+ /* sync qp_base_num with parent */
+ if (qp_base_num_ptr)
+ *qp_base_num_ptr = login->qp_base_num;
+
+ /* update mac_tree and mace vnic_id */
+ write_lock_bh(&login->mac_rwlock);
+ rc = vnic_mace_update(login, mac, vnic_id, remove);
+ write_unlock_bh(&login->mac_rwlock);
+
+ break;
+ }
+ }
+
+ mutex_unlock(&port->mlock);
+
+ /* for vNic removal, ignore rc */
+ return remove ? 0 : rc;
+}
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+void vnic_login_refresh_mcasts(struct vnic_port *port)
+{
+ struct vnic_login *login;
+
+ vnic_dbg_mark();
+ mutex_lock(&port->mlock);
+ list_for_each_entry(login, &port->login_list, list)
+ vnic_tree_mcast_detach(&login->mcast_tree);
+ list_for_each_entry(login, &port->login_list, list)
+ {
+ if (vnic_sa_query) {
+ /* take the tx lock to make sure no delete function is called at the time */
+ netif_tx_lock_bh(login->dev);
+ vnic_neigh_invalidate(login);
+ netif_tx_unlock_bh(login->dev);
+ }
+
+ vnic_tree_mcast_attach(&login->mcast_tree);
+ }
+ mutex_unlock(&port->mlock);
+}
+
+int vnic_login_pre_create_1(struct vnic_port *port,
+ struct fip_vnic_data *vnic)
+{
+ struct vnic_login *login;
+ struct net_device *dev;
+
+ /* set login to zero first (for parent_used case) */
+ vnic->login = NULL;
+
+ /* if parent_used, skip */
+ if (vnic->parent_used) {
+ vnic_dbg_mac(vnic->name, "function skipped\n");
+ return 0;
+ } else {
+ vnic_dbg_func(vnic->name);
+ }
+
+ /* create netdev per login, vlan configuration is done from outside */
+ dev = vnic_alloc_netdev(port);
+ if (IS_ERR(dev)) {
+ vnic_err(port->name, "vnic_alloc_netdev failed\n");
+ goto err;
+ }
+
+ login = vnic_netdev_priv(dev);
+ login->fip_vnic = vnic;
+ vnic->login = login;
+ login->vlan_used = vnic->vlan_used;
+ login->dev->hard_header_len += (vnic->vlan_used && vnic->hadmined) ? VLAN_HLEN : 0;
+ vnic_dbg_fip(vnic->name, "creating vnic, hadmin=%d vlan_used=%d hard_header_len += %d\n",
+ vnic->hadmined, vnic->vlan_used, (vnic->vlan_used && vnic->hadmined) ? VLAN_HLEN : 0);
+ set_bit(VNIC_STATE_LOGIN_PRECREATE_1, &vnic->login_state);
+
+ return 0;
+
+err:
+ return -ENODEV;
+}
+
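+/* Second pre-create stage: allocate and map the runt-padding buffer, create
+ * the TX/RX resources and the QP range, publish the base QP number to the
+ * FIP vNic and kick off the periodic stats task.
+ */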
+int vnic_login_pre_create_2(struct fip_vnic_data *vnic, int qps_num, int is_lag)
+{
+ struct vnic_login *login = vnic->login;
+ int i, j;
+
+ if (vnic->parent_used) {
+ vnic_dbg_mac(vnic->name, "function skipped\n");
+ return 0;
+ } else {
+ ASSERT(login);
+ vnic_dbg_func(login->name);
+ }
+
+ login->qps_num = qps_num;
+ login->qkey = VNIC_DATA_QKEY;
+ login->is_lag = is_lag;
+ VNIC_TXQ_SET_ACTIVE(login, min(login->tx_rings_num, login->qps_num));
+
+ /* prepare padding for runt packets */
+ login->pad_va = kzalloc(VNIC_EOIB_ZLEN_MAX, GFP_KERNEL);
+ if (!login->pad_va)
+ return -ENOMEM;
+
+ login->pad_dma = ib_dma_map_single(login->port->dev->ca, login->pad_va,
+ VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(login->port->dev->ca, login->pad_dma))
+ goto err;
+
+ /* create TX resources */
+ for (i = 0; i < login->tx_rings_num; ++i) {
+ if (vnic_create_tx_res(login, i)) {
+ vnic_err(login->name, "vnic_create_tx_res failed,"
+ " index %d\n", i);
+ goto free_tx_res;
+ }
+ }
+
+ /* create RX resources */
+ for (j = 0; j < login->rx_rings_num; ++j) {
+ if (vnic_create_rx_res(login, j)) {
+ vnic_err(login->name, "vnic_create_rx_res failed,"
+ " index %d\n", j);
+ goto free_rx_res;
+ }
+ }
+
+ /* create QPs */
+ if (vnic_create_qp_range(login)) {
+ vnic_err(login->name, "vnic_create_qp_range failed\n");
+ goto free_rx_res;
+ }
+
+ /* first QP is the base QP */
+ login->qp_base_num = login->qp_res[0].qp->qp_num;
+ vnic->qp_base_num = login->qp_base_num;
+
+ /* update state */
+ set_bit(VNIC_STATE_LOGIN_PRECREATE_2, &vnic->login_state);
+
+ login->queue_stopped = 0;
+
+ /* calls vnic_do_get_stats() */
+ queue_delayed_work(login_wq, &login->stats_task, VNIC_STATS_DELAY);
+
+ return 0;
+
+free_rx_res:
+ for (--j; j >= 0; --j)
+ vnic_destroy_rx_res(login, j);
+
+ i = login->tx_rings_num;
+free_tx_res:
+ for (--i; i >= 0; --i)
+ vnic_destroy_tx_res(login, i);
+/*free_pad:*/
+ ib_dma_unmap_single(login->port->dev->ca, login->pad_dma,
+ VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE);
+err:
+ kfree(login->pad_va);
+ return -ENODEV;
+}
+
+int vnic_login_register_netdev(struct fip_vnic_data *vnic,
+ const char *mac,
+ const char *name)
+{
+ struct vnic_login *login = vnic->login;
+
+ if (vnic->parent_used) {
+ vnic_dbg_mac(vnic->name, "function skipped\n");
+ vnic_info("%s created (parent %s mac "MAC_6_PRINT_FMT")\n",
+ name, vnic->parent_name,
+ MAC_6_PRINT_ARG(vnic->mac_cache));
+ return 0;
+ } else {
+ ASSERT(login);
+ vnic_dbg_func(login->name);
+ }
+
+ /* set netdev name and mac */
+ if (name)
+ strncpy(login->dev->name, name, IFNAMSIZ);
+ if (mac) {
+ memcpy(login->dev->dev_addr, mac, ETH_ALEN);
+ /* save original mac */
+ memcpy(login->dev_addr, mac, ETH_ALEN);
+ }
+
+ /* set device features according to all_vlan mode */
+ login->dev->features |= NETIF_F_HIGHDMA;
+
+ /* VLAN features depend on the all_vlan gateway mode: without it the
+ * device is VLAN-challenged and HW VLAN filtering is off; with it HW
+ * VLAN filtering is exposed
+ */
+ if (!vnic->all_vlan_gw) {
+ login->dev->features |= NETIF_F_VLAN_CHALLENGED;
+ login->dev->features &= ~NETIF_F_HW_VLAN_FILTER;
+ } else
+ login->dev->features |= NETIF_F_HW_VLAN_FILTER;
+
+ /* register netdev */
+ if (register_netdev(login->dev)) {
+ vnic_err(login->name, "register_netdev failed name=%s mac="
+ MAC_6_PRINT_FMT" login->dev=%p\n",
+ name ? name : "net_admin",
+ MAC_6_PRINT_ARG(login->dev->dev_addr), login->dev);
+ goto err;
+ }
+
+ /* encode the port number in dev_id:
+ * This allows us to associate the net device
+ * with the underlying device's port.
+ */
+ login->dev->dev_id = login->port->num - 1;
+
+ if (vnic_create_dentry(login)) {
+ vnic_err(login->name, "vnic_create_dentry failed\n");
+ goto err;
+ }
+
+ /* print info only after register_netdev so dev->name is valid */
+ sprintf(login->name, "%s", login->dev->name);
+ vnic_info("%s created (%s port %d)\n",
+ login->dev->name,
+ login->port->dev->ca->name, login->port->num);
+
+ /* disable tx queues and carrier; they are started only after
+ * login create stage 2 completes and the bcast mcast is attached
+ */
+ netif_tx_disable(login->dev);
+ netif_carrier_off(login->dev);
+
+ mutex_lock(&login->port->mlock);
+ vnic_dbg_mac(login->name, "added to login_list\n");
+ list_add_tail(&login->list, &login->port->login_list);
+ mutex_unlock(&login->port->mlock);
+
+ set_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state);
+
+ return 0;
+
+err:
+ return -EINVAL;
+}
+
+int vnic_login_complete_ack(struct fip_vnic_data *vnic,
+ struct fip_login_data *login_data,
+ struct fip_shared_vnic_data *shared_vnic)
+{
+ struct vnic_mcast *mcaste, *mcaste_bcast, *mcast_shared = NULL;
+ struct vnic_login *login = vnic->login;
+ int rc;
+ int first_time_vlan = 0;
+
+ if (vnic->parent_used) {
+ vnic_dbg_mac(vnic->name, "function skipped\n");
+ return 0;
+ } else {
+ ASSERT(login);
+ vnic_dbg_func(login->name);
+ }
+
+ /*
+ * TODO, check if you need them all, check overlap with gw_neigh
+ * check how pkey is passed from FIP
+ */
+ login->pkey = login_data->pkey;
+ login->pkey_index = login_data->pkey_index;
+ login->n_mac_mcgid = login_data->n_mac_mcgid;
+ login->gw_port_id = login_data->port_id;
+
+ /* the GW provides the data SL in the login packet */
+ login->sl = login_data->sl;
+
+ login->vnic_id = login_data->vnic_id;
+
+ memcpy(login->mgid_prefix, login_data->mgid_prefix, VNIC_MGID_PREFIX_LEN);
+ memcpy(login->vnic_name, login_data->vnic_name, sizeof(login_data->vnic_name));
+ memcpy(login->vendor_id, login_data->vendor_id, sizeof(login_data->vendor_id));
+
+ VNIC_STR_STRIP(login->vnic_name);
+ VNIC_STR_STRIP(login->vendor_id);
+
+ /* set VLAN state and minimum frame length (ZLEN varies with VLAN support) */
+ login->zlen = ETH_ZLEN + (vnic_encap_headroom ? VNIC_ENCAP_LEN : 0);
+ first_time_vlan = !login->vlan_used; /* always false for hadmin vnics with vlans */
+ login->vlan_used = login_data->vp;
+ login->all_vlan_gw = login_data->all_vlan_gw;
+ if ((VNIC_VLAN_ENABLED(login))) {
+ login->vid = cpu_to_be16(login_data->vlan);
+ if (first_time_vlan) {
+ vnic_dbg_fip(login->dev->name,"Updating hard_header_len %d+%d=%d\n",
+ login->dev->hard_header_len, VLAN_HLEN,
+ login->dev->hard_header_len + VLAN_HLEN);
+ login->dev->hard_header_len += VLAN_HLEN;
+ }
+ login->zlen = ETH_ZLEN + VLAN_HLEN + (vnic_encap_headroom? VNIC_ENCAP_LEN: 0);
+ }
+
+ /* create gw_neigh (no RSS when sending to the GW);
+ * use the zero MAC to describe the GW L2 address
+ */
+ login->gw_neigh =
+ vnic_neighe_alloc(login, NULL, login_data->lid,
+ login_data->qpn, 0);
+ if (IS_ERR(login->gw_neigh)) {
+ vnic_err(login->name, "failed to alloc gw neigh\n");
+ goto err;
+ }
+
+ /* alloc mcast entries here to simplify the error flow */
+ mcaste = vnic_mcast_alloc(login->port, NULL, NULL);
+ if (IS_ERR(mcaste))
+ goto err_free_gw_ah;
+ mcaste_bcast = vnic_mcast_alloc(login->port, NULL, NULL);
+ if (IS_ERR(mcaste_bcast)) {
+ vnic_mcast_dealloc(mcaste);
+ goto err_free_gw_ah;
+ }
+ /* used by shared vnic mcast group */
+ if (shared_vnic && shared_vnic->enabled) {
+ mcast_shared = vnic_mcast_alloc(login->port, NULL, NULL);
+ if (IS_ERR(mcast_shared)) {
+ vnic_mcast_dealloc(mcaste);
+ vnic_mcast_dealloc(mcaste_bcast);
+ goto err_free_gw_ah;
+ }
+ }
+
+ /* attach to default mgid */
+ __vnic_mcaste_fill(login, mcaste, login->gw_port_id, ETH_ZERO_MAC, 0, vnic_mcast_create);
+ mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+ mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+ mcaste->attach_cb = __bcast_attach_cb;
+ mcaste->detach_cb = __bcast_detach_cb;
+ mcaste->attach_cb_ctx = login;
+ mcaste->detach_cb_ctx = login;
+ rc = vnic_mcast_add(&login->mcast_tree, mcaste);
+ ASSERT(!rc);
+ rc = vnic_mcast_attach(&login->mcast_tree, mcaste);
+ ASSERT(!rc);
+
+ /* attach to bcast mgid (use default mlid) */
+ if (login->n_mac_mcgid || vnic_mgid_data_type) {
+ __vnic_mcaste_fill(login, mcaste_bcast, login->gw_port_id, ETH_BCAST_MAC, 0, 0);
+ mcaste_bcast->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+ mcaste_bcast->retry = VNIC_MCAST_ULIMIT_RETRY;
+ /* the port gid is overwritten by the default gid as part of the
+ * mgid-over-same-mlid hack */
+ memcpy(&mcaste_bcast->port_gid, &mcaste->port_gid, GID_LEN);
+ rc = vnic_mcast_add(&login->mcast_tree, mcaste_bcast);
+ ASSERT(!rc);
+ rc = vnic_mcast_attach(&login->mcast_tree, mcaste_bcast);
+ ASSERT(!rc);
+ } else {
+ vnic_mcast_dealloc(mcaste_bcast);
+ }
+
+ login->shared_vnic = 0;
+ /* attach to the shared-vnic mcast group (use default mlid) */
+ if (shared_vnic && shared_vnic->enabled) {
+ u8 rss_hash = shared_vnic->ip[0] ^ shared_vnic->ip[1] ^
+ shared_vnic->ip[2] ^ shared_vnic->ip[3];
+
+ login->shared_vnic = 1;
+ __vnic_mcaste_fill(login, mcast_shared, login->gw_port_id, shared_vnic->emac, 0, 0);
+ mcast_shared->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+ mcast_shared->retry = VNIC_MCAST_ULIMIT_RETRY;
+ memcpy(&mcast_shared->port_gid, &mcaste->port_gid, GID_LEN);
+ mcast_shared->gid.raw[12]= rss_hash;
+
+ vnic_dbg_mcast(login->name, "vnic %s attaching shared vnic 1 "
+ "MGID "VNIC_GID_FMT"\n", login->name,
+ VNIC_GID_RAW_ARG(mcast_shared->gid.raw));
+ mcaste = mcast_shared;
+ memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN);
+ rc = vnic_mcast_add(&login->mcast_tree, mcaste);
+ ASSERT(!rc);
+ rc = vnic_mcast_attach(&login->mcast_tree, mcaste);
+ ASSERT(!rc);
+ }
+
+ /* set state */
+ set_bit(VNIC_STATE_LOGIN_CREATE_2, &vnic->login_state);
+
+ /* call vnic_open() if open was called when we were not ready to handle it */
+ if (test_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state))
+#ifndef _BP_NO_NDO_OPS
+ login->dev->netdev_ops->ndo_open(login->dev);
+#else
+ login->dev->open(login->dev);
+#endif
+
+ return 0;
+
+err_free_gw_ah:
+ vnic_neighe_dealloc(login->gw_neigh);
+err:
+ return -EINVAL;
+}
+
+/*
+ * Called when destroying a login, to stop its login_wq tasks.
+ * Must not be called from login_wq context.
+ */
+void vnic_login_destroy_stop_wq(struct fip_vnic_data *vnic, enum fip_flush flush)
+{
+ struct vnic_login *login = vnic->login;
+
+ if (vnic->parent_used) {
+ vnic_dbg_mac(vnic->name, "function skipped\n");
+ return;
+ } else {
+ ASSERT(login);
+ vnic_dbg_func(login->name);
+ }
+
+ if (test_bit(VNIC_STATE_LOGIN_PRECREATE_1, &vnic->login_state)) {
+ /* cancel vnic_auto_moder() */
+ vnic_dbg_mark();
+ mutex_lock(&login->moder_lock);
+ login->queue_stopped = 1;
+ mutex_unlock(&login->moder_lock);
+#ifndef _BP_WORK_SYNC
+ cancel_delayed_work_sync(&login->stats_task);
+ if (cancel_delayed_work_sync(&login->mcast_task))
+ dev_put(login->dev);
+ cancel_delayed_work_sync(&login->restart_task);
+#else
+ cancel_delayed_work(&login->stats_task);
+ if (cancel_delayed_work(&login->mcast_task))
+ dev_put(login->dev);
+ cancel_delayed_work(&login->restart_task);
+ flush_workqueue(login_wq);
+#endif
+ }
+}
+
+/*
+ * Destroys the login data struct. Assumes all login_wq tasks are stopped.
+ * Can be called from any context and may block for a few seconds.
+ */
+void vnic_login_destroy_wq_stopped(struct fip_vnic_data *vnic, enum fip_flush flush)
+{
+ struct vnic_login *login = vnic->login;
+ unsigned long flags;
+ int i;
+
+ if (vnic->parent_used) {
+ vnic_dbg_mac(vnic->name, "function skipped\n");
+ vnic_info("%s destroyed (parent %s mac "MAC_6_PRINT_FMT")\n",
+ vnic->interface_name, vnic->parent_name,
+ MAC_6_PRINT_ARG(vnic->mac_cache));
+ /* Note: vNics can be logged out by BXM (bypass sysfs calls)
+ * so we need to cleanup the parent here as well
+ * if we reach this function from sysfs calls,
+ * then vnic_parent_update will have no effect here (ok)
+ */
+ vnic_parent_update(vnic->port, vnic->name, vnic->vnic_id,
+ vnic->mac_cache, NULL, vnic->parent_name, 1);
+ return;
+ } else {
+ ASSERT(login);
+ vnic_dbg_func(login->name);
+ }
+
+ /* the cleanup procedure depends on our state, the vnic type
+ * (host/network admined), and the cleanup level required. Network admined
+ * vnics have a single create state and only one cleanup level (full);
+ * host admined vnics have two create states (init, regular) and two
+ * cleanup levels. The flow depends on the reason for the cleanup. */
+ vnic_dbg_data(login->name, "vnic_login_destroy flush=%d\n", flush);
+
+ /* change state to prevent the completion handler from re-opening the TX
+ * queue once we close it. Before calling the stop() function, make sure
+ * that all on-going hard_start_xmit() calls are done.
+ */
+
+ if (test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) {
+ set_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state);
+ netif_tx_disable(login->dev);
+ vnic_dbg_mark();
+ }
+
+ if (test_and_clear_bit(VNIC_STATE_LOGIN_CREATE_2, &vnic->login_state)) {
+ if (test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state)) {
+ /* calls vnic_stop() */
+#ifndef _BP_NO_NDO_OPS
+ login->dev->netdev_ops->ndo_stop(login->dev);
+#else
+ login->dev->stop(login->dev);
+#endif
+ set_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state);
+ vnic_dbg_mark();
+ }
+ vnic_mcast_del_all(&login->mcast_tree);
+ vnic_member_remove_all(login);
+ vnic_neighe_dealloc(login->gw_neigh);
+ vnic_dbg_mark();
+ }
+ if (test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state))
+ clear_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state);
+
+ if (flush == FIP_FULL_FLUSH &&
+ test_and_clear_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) {
+ mutex_lock(&login->port->mlock);
+ vnic_dbg_mac(login->name, "delete from login_list\n");
+ list_del(&login->list);
+ mutex_unlock(&login->port->mlock);
+
+ /* print info if register_netdev was called before so
+ * dev->name is valid
+ */
+ vnic_info("%s destroyed (%s port %d)\n", login->dev->name,
+ login->port->dev->ca->name, login->port->num);
+
+ /* use irq save so caller function supports any context */
+ write_lock_irqsave(&login->mac_rwlock, flags);
+ vnic_child_flush(login, 1);
+ write_unlock_irqrestore(&login->mac_rwlock, flags);
+
+ vnic_delete_dentry(login);
+ unregister_netdev(login->dev);
+ vnic_dbg_mark();
+ }
+
+ vnic_dbg_mark();
+ /* login_ctx was in pre created state [always true] */
+ spin_lock_bh(&login->stats_lock);
+ if (test_and_clear_bit(VNIC_STATE_LOGIN_PRECREATE_2, &vnic->login_state)) {
+ spin_unlock_bh(&login->stats_lock);
+ vnic_dbg_mark();
+ /* take port->mlock in case a refresh event is running vnic_refresh_mcasts() */
+ mutex_lock(&login->port->mlock);
+ /* tx queues are already stopped here */
+ vnic_neigh_del_all(login);
+ vnic_mcast_del_all(&login->mcast_tree);
+ for (i = 0; i < login->qps_num; ++i)
+ vnic_destroy_qp(login, i);
+ mutex_unlock(&login->port->mlock);
+
+ for (i = 0; i < login->rx_rings_num; ++i)
+ vnic_destroy_rx_res(login, i);
+ for (i = 0; i < login->tx_rings_num; ++i)
+ vnic_destroy_tx_res(login, i);
+ ib_dma_unmap_single(login->port->dev->ca, login->pad_dma,
+ VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE);
+ kfree(login->pad_va);
+ } else
+ spin_unlock_bh(&login->stats_lock);
+
+ if (flush == FIP_FULL_FLUSH &&
+ test_and_clear_bit(VNIC_STATE_LOGIN_PRECREATE_1, &vnic->login_state)) {
+ vnic_free_netdev(login);
+ }
+}
+
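+/*
+ * Add a vHub table entry (remote vnic) as a unicast neigh entry under the
+ * TX lock so the datapath never sees a half-initialized entry.
+ */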
+int vnic_vhube_add(struct fip_vnic_data *vnic, struct vnic_table_entry *vhube)
+{
+ struct vnic_neigh *neighe;
+ struct vnic_login *login = vnic->login;
+ int rc;
+
+ if (vnic->parent_used) {
+ vnic_dbg_mac(vnic->name, "function skipped\n");
+ return 0;
+ } else {
+ ASSERT(login);
+ vnic_dbg_func(login->name);
+ }
+
+ vnic_dbg_data(login->name, "adding vhube lid 0x%02x qpn 0x%x, mac "
+ MAC_6_PRINT_FMT"\n", vhube->lid, vhube->qpn,
+ MAC_6_PRINT_ARG(vhube->mac));
+
+ neighe = vnic_neighe_alloc(login, vhube->mac, vhube->lid,
+ vhube->qpn, vhube->rss);
+ if (IS_ERR(neighe))
+ return (int)PTR_ERR(neighe);
+
+ vnic_dbg_mark();
+ /* when adding new neighe, make sure that TX queues are not running. */
+ netif_tx_lock_bh(login->dev);
+ rc = vnic_neighe_add(login, neighe);
+ netif_tx_unlock_bh(login->dev);
+ if (rc) {
+ vnic_neighe_dealloc(neighe);
+ return rc;
+ }
+
+ return 0;
+}
+
+void vnic_vhube_flush(struct fip_vnic_data *vnic)
+{
+ struct vnic_login *login = vnic->login;
+
+ if (vnic->parent_used) {
+ vnic_dbg_mac(vnic->name, "function skipped\n");
+ return;
+ } else {
+ ASSERT(login);
+ vnic_dbg_func(login->name);
+ }
+
+ /* when flushing neigh entries, make sure that TX queues are not running. */
+ vnic_dbg_mark();
+ netif_tx_lock_bh(login->dev);
+ vnic_neigh_del_all(login);
+ netif_tx_unlock_bh(login->dev);
+
+ return;
+}
+
+void vnic_vhube_del(struct fip_vnic_data *vnic, u8* mac)
+{
+ struct vnic_neigh *neighe;
+ struct vnic_login *login = vnic->login;
+
+ if (vnic->parent_used) {
+ vnic_dbg_mac(vnic->name, "function skipped\n");
+ return;
+ } else {
+ ASSERT(login);
+ vnic_dbg_func(login->name);
+ }
+
+ vnic_dbg_mark();
+ /* when deleting a neigh entry, make sure that TX queues are not running. */
+ netif_tx_lock_bh(login->dev);
+ neighe = vnic_neighe_search(login, mac);
+ if (IS_ERR(neighe)) {
+ vnic_warn(login->name, "couldn't find "MAC_6_PRINT_FMT"\n",
+ MAC_6_PRINT_ARG(mac));
+ } else {
+ vnic_neighe_del(login, neighe);
+ vnic_neighe_dealloc(neighe);
+ }
+ netif_tx_unlock_bh(login->dev);
+ return;
+}
+
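+/*
+ * no-BXM (standalone) path: __vnic_login_create() synthesizes the login
+ * parameters locally instead of receiving them from a gateway, using the
+ * file-scope login_data/vnic structures below.
+ */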
+struct fip_login_data login_data;
+struct fip_vnic_data vnic;
+struct vnic_login *__vnic_login_create(struct vnic_port *port, int index)
+{
+ struct vnic_login *login;
+ int rc, no_bxm_n_rss = 0x4;
+ int qps_num = (port->rx_rings_num > 1) ? (1 << no_bxm_n_rss) : 1;
+
+ /* pre create vnic */
+ rc = vnic_login_pre_create_1(port, &vnic);
+ if (rc) {
+ vnic_err(port->name, "vnic_login_pre_create_1 failed"
+ " for %s port %d index %d\n",
+ port->dev->ca->name, port->num, index);
+ goto err;
+ }
+
+ login = vnic.login;
+
+ rc = vnic_login_pre_create_2(&vnic, qps_num, 0);
+ if (rc) {
+ vnic_err(port->name, "vnic_login_pre_create_2 failed"
+ " for %s port %d index %d\n",
+ port->dev->ca->name, port->num, index);
+ goto create_fail;
+ }
+
+ /* create vnic */
+ memset(&login_data, 0, sizeof(struct fip_login_data));
+ sprintf(login_data.vendor_id, "%s", NOT_AVAILABLE_STRING);
+ sprintf(login_data.vnic_name, "%s", NOT_AVAILABLE_STRING);
+ memcpy(login_data.mgid_prefix, NO_BXM_MGID_PREFIX, VNIC_MGID_PREFIX_LEN);
+ login_data.qpn = 0xa00000;
+ login_data.lid = 1;
+ login_data.pkey = 0xffff;
+ login_data.mtu = 1500;
+
+ /* random_ether_addr(mac); */
+ memcpy(login_data.mac, port->gid.raw + 10, ETH_ALEN);
+ login_data.mac[0] += index * 0x10;
+ /* mcast bit must be zero */
+ login_data.mac[0] &= 0xfe;
+ vnic_dbg_mark();
+ if (vnic_login_register_netdev(&vnic, login_data.mac, NULL)) {
+ vnic_err(login->name, "vnic_login_register_netdev failed\n");
+ goto create_fail;
+ }
+ if (vnic_login_complete_ack(&vnic, &login_data, NULL)) {
+ vnic_err(login->name, "vnic_login_complete_ack failed\n");
+ goto create_fail;
+ }
+
+ return login;
+
+create_fail:
+ vnic_login_destroy(login->fip_vnic, FIP_FULL_FLUSH);
+err:
+ return ERR_PTR(-ENODEV);
+}
+
+int vnic_port_data_init(struct vnic_port *port)
+{
+ int i, no_bxm_vnic_per_port = 1;
+
+ vnic_dbg_mark();
+ mutex_lock(&port->start_stop_lock);
+ for (i = 0; i < no_bxm_vnic_per_port; ++i) {
+ __vnic_login_create(port, i);
+ }
+ mutex_unlock(&port->start_stop_lock);
+
+ /* TODO - JPM: handle __vnic_login_create() failure */
+ return 0;
+}
+
+void vnic_port_data_cleanup(struct vnic_port *port)
+{
+ struct vnic_login *login, *login_t;
+
+ vnic_dbg_mark();
+ /* vnic_login_destroy() acquires the port->mlock, cannot hold it here */
+ list_for_each_entry_safe(login, login_t,
+ &port->login_list, list) {
+ vnic_dbg_data(login->name, "login %s\n", login->name);
+ vnic_login_destroy(login->fip_vnic, FIP_FULL_FLUSH);
+ }
+}
+
+/* ALI TODO: check if need to replace login ptr with vnic */
+void debug_dump_members(struct vnic_login *login, struct vnic_gw_info *member)
+{
+ int i;
+
+ vnic_warn(login->name, "Error members_debug_dump "
+ "member id=%d gw id = %d active_count=%d\n",
+ member->member_id, member->gw_id,
+ login->lag_member_active_count);
+
+ /* dump the state of all LAG member entries */
+ for (i=0; i<MAX_LAG_MEMBERS; i++) {
+ vnic_warn(login->name, "%d member %d used %x gw_id %d\n",
+ i, login->lag_gw_neigh[i].member_id,
+ login->lag_gw_neigh[i].info,
+ login->lag_gw_neigh[i].gw_id);
+ }
+}
+
+static void vnic_build_map_histogram(struct vnic_login *login, int member_id, int *hist)
+{
+ int i;
+
+ memset(hist, 0, sizeof(int) * MAX_LAG_MEMBERS);
+
+ /* go over map and count how many entries are mapped to each member*/
+ for (i=0; i<LAG_MAP_TABLE_SIZE; i++) {
+ ASSERT(login->lag_gw_map[i] >= 0 && login->lag_gw_map[i] < MAX_LAG_MEMBERS);
+ hist[login->lag_gw_map[i]]++;
+ }
+}
+
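+/*
+ * Remap all hash-table entries that pointed at the removed member to the
+ * remaining connected members, preferring the least-loaded ones; the usage
+ * threshold is relaxed gradually until every orphaned entry finds a home.
+ */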
+static void _vnic_remove_member_from_map(struct vnic_login *login, int member_id)
+{
+ int user_count[MAX_LAG_MEMBERS] = {0};
+ int i, j;
+ int continue_flag;
+ int thresh;
+
+ login->lag_member_active_count--;
+ if (login->lag_member_active_count > 0) {
+ /* go over map and count how many entries are mapped to each member*/
+ vnic_build_map_histogram(login, member_id, user_count);
+
+ thresh = 2; /* it might be possible to find a better lower bound */
+
+ for (i=0; i<LAG_MAP_TABLE_SIZE; i++) {
+ /* entries that use the removed member must be remapped */
+ if (login->lag_gw_map[i] != member_id)
+ continue;
+
+ continue_flag = 1;
+ while (continue_flag) {
+ for (j = 0; j < MAX_LAG_MEMBERS; j++) {
+ if (j == member_id)
+ continue;
+
+ /* only remap to members that are connected and still below the usage threshold */
+ if (login->lag_gw_neigh[j].info & GW_MEMBER_INFO_MAPPED &&
+ user_count[j] < thresh) {
+ login->lag_gw_map[i] = j;
+ user_count[j]++;
+ continue_flag = 0;
+ break;
+ }
+ }
+ if (j == MAX_LAG_MEMBERS)
+ thresh++;
+ }
+ }
+ }
+}
+
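+/*
+ * Add a new member to the transmit hash map: the first active member takes
+ * over the whole table; otherwise entries are stolen from members that are
+ * above their expected share until the new member reaches its own expected
+ * share (LAG_MAP_TABLE_SIZE / active members).
+ */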
+static void _vnic_add_member_to_map(struct vnic_login *login, int member_id)
+{
+ int i;
+ int expected;
+ int user_count[MAX_LAG_MEMBERS] = {0};
+ int continue_flag;
+ int thresh;
+
+ /* this is the first active port use it for all maps */
+ if (!login->lag_member_active_count) {
+ for (i=0; i<LAG_MAP_TABLE_SIZE; i++)
+ login->lag_gw_map[i] = member_id;
+ login->lag_member_active_count++;
+ } else {
+ /* go over the map and count how many entries are mapped to each member;
+ * the count is used to reassign entries from the most heavily used members */
+ vnic_build_map_histogram(login, member_id, user_count);
+
+ /* when adding a new member, the caller must ensure TX queues are not running */
+ login->lag_member_active_count++;
+ expected = LAG_MAP_TABLE_SIZE / login->lag_member_active_count;
+ thresh = LAG_MAP_TABLE_SIZE % login->lag_member_active_count;
+ continue_flag = 1;
+ while (continue_flag) {
+ for (i = 0; i < LAG_MAP_TABLE_SIZE; i++) {
+ if (user_count[login->lag_gw_map[i]] > expected + thresh) {
+ user_count[login->lag_gw_map[i]]--;
+ login->lag_gw_map[i] = member_id;
+ user_count[login->lag_gw_map[i]]++;
+ if (user_count[member_id] >= expected) {
+ continue_flag = 0;
+ break;
+ }
+ }
+ }
+ thresh--;
+ }
+ }
+}
+
+void __bcast_member_attach_cb(struct vnic_mcast *mcaste, void *gw_ptr)
+{
+ struct vnic_gw_info *member = gw_ptr;
+
+ /* When SA is local, mcast join works even when port is down */
+ if (member->neigh.login->port->attr.state != IB_PORT_ACTIVE)
+ return;
+
+ vnic_dbg_lag(member->neigh.login->name, "__bcast_member_attach_cb for member id %d and "
+ "gw_id=%d\n", member->member_id, member->gw_id);
+
+ netif_tx_lock_bh(member->neigh.login->dev);
+ member->info |= GW_MEMBER_INFO_MCAST;
+
+ if (member->info & GW_MEMBER_INFO_EPORT_UP &&
+ !(member->info & GW_MEMBER_INFO_MAPPED)) {
+ _vnic_add_member_to_map(member->neigh.login, member->member_id);
+ member->info |= GW_MEMBER_INFO_MAPPED;
+ }
+ netif_tx_unlock_bh(member->neigh.login->dev);
+}
+
+void __bcast_member_detach_cb(struct vnic_mcast *mcaste, void *gw_ptr)
+{
+ struct vnic_gw_info *member = gw_ptr;
+
+ vnic_dbg_lag(member->neigh.login->name, "__bcast_member_detach_cb for member id %d and "
+ "gw_id=%d\n", member->member_id, member->gw_id);
+
+ netif_tx_lock_bh(member->neigh.login->dev);
+ if (member->info & GW_MEMBER_INFO_MAPPED)
+ _vnic_remove_member_from_map(member->neigh.login, member->member_id);
+
+ member->info &= ~(GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_MCAST);
+ netif_tx_unlock_bh(member->neigh.login->dev);
+}
+
+/*
+ * Create MGIDs and join the default MCAST addresses. The mcaste entries are
+ * added to the list contained within the member struct. If the vnic uses
+ * additional MGIDs when a member is added, those are joined too using the
+ * member's GW_ID.
+ */
+static int _vnic_add_member_mgid(struct vnic_login *login, struct vnic_gw_info *member)
+{
+ struct vnic_mcast *mcaste, *mcaste_bcast;
+ int rc;
+#ifndef _BP_NO_MC_LIST
+ struct dev_mc_list *mclist;
+#else
+ struct netdev_hw_addr *ha;
+#endif
+
+ mcaste = vnic_mcast_alloc(login->port, NULL, NULL);
+ if (IS_ERR(mcaste))
+ return (-ENOMEM);
+
+ /* attach to default mgid */
+ __vnic_mcaste_fill(login, mcaste, member->gw_id, ETH_ZERO_MAC, 0, vnic_mcast_create);
+ mcaste->attach_cb = __bcast_member_attach_cb;
+ mcaste->detach_cb = __bcast_member_detach_cb;
+ mcaste->attach_cb_ctx = member;
+ mcaste->detach_cb_ctx = member;
+ mcaste->priv_data = member;
+ rc = vnic_mcast_add(&login->mcast_tree, mcaste);
+ if (rc) {
+ debug_dump_members(login, member);
+ ASSERT(!rc);
+ }
+
+ rc = vnic_mcast_attach(&login->mcast_tree, mcaste);
+ if (rc) {
+ debug_dump_members(login, member);
+ ASSERT(!rc);
+ }
+
+ if (login->n_mac_mcgid) {
+ mcaste_bcast = vnic_mcast_alloc(login->port, NULL, NULL);
+ if (IS_ERR(mcaste_bcast))
+ goto free_mcasts;
+
+ __vnic_mcaste_fill(login, mcaste_bcast, member->gw_id, ETH_BCAST_MAC, 0, 0);
+ /* the port gid is overwritten by the default gid as part of the
+ * mgid-over-same-mlid hack */
+ memcpy(&mcaste_bcast->port_gid, &mcaste->port_gid, GID_LEN);
+ mcaste_bcast->priv_data = member;
+ rc = vnic_mcast_add(&login->mcast_tree, mcaste_bcast);
+ ASSERT(!rc);
+ rc = vnic_mcast_attach(&login->mcast_tree, mcaste_bcast);
+ ASSERT(!rc);
+ }
+
+
+ /* hold the tx lock so set_multicast_list() won't change mc_list */
+ netif_tx_lock_bh(login->dev);
+#ifndef _BP_NO_MC_LIST
+ for (mclist = login->dev->mc_list; mclist; mclist = mclist->next) {
+ u8* mmac = mclist->dmi_addr;
+#else
+ netdev_for_each_mc_addr(ha, login->dev) {
+ u8* mmac = ha->addr;
+#endif
+ /* do not add the default MGIDS because they are always used */
+ if (IS_ZERO_MAC(mmac))
+ continue;
+ if (IS_BCAST_MAC(mmac))
+ continue;
+
+ vnic_dbg_lag(login->name, "_vnic_add_member_mgid for "
+ MAC_6_PRINT_FMT" and member gw_id=%d\n",
+ MAC_6_PRINT_ARG(mcaste->mac), member->gw_id);
+
+ if (_vnic_mcast_attach_mgid(login, mmac, mcaste, member,
+ member->gw_id))
+ goto attach_failed;
+ }
+ netif_tx_unlock_bh(login->dev);
+
+ return 0;
+
+attach_failed:
+ netif_tx_unlock_bh(login->dev);
+free_mcasts:
+ vnic_mcast_del_user(&login->mcast_tree, member);
+ return -ENOMEM;
+}
+
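+/*
+ * Register a new LAG eport member: initialize its neigh entry, join the
+ * member's MGIDs, and mark it created (and eport-up if the GW reports the
+ * link up); the member only enters the TX hash map once its mcast attach
+ * completes.
+ */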
+int vnic_member_add(struct vnic_login *login, int member_id, struct lag_member *member)
+{
+ struct vnic_gw_info *member_e;
+ int ret;
+
+ if (member_id >= MAX_LAG_MEMBERS || member_id < 0)
+ return -1;
+
+ member_e = &login->lag_gw_neigh[member_id];
+
+ vnic_dbg_lag(login->name, "vnic_member_add id:%d gw_id:%d lid:%x qpn:%x sl:%d\n",
+ member_id, member->gw_port_id, member->lid, member->qpn, member->sl);
+
+ /* fail if the member id is already in use */
+ if (member_e->info & GW_MEMBER_INFO_CREATED)
+ return -1;
+
+ /* create new entry */
+ member_e->member_id = member_id;
+ member_e->neigh.lid = member->lid;
+ member_e->neigh.qpn = member->qpn;
+ member_e->gw_id = member->gw_port_id;
+ member_e->neigh.login = login;
+ INIT_DELAYED_WORK(&member_e->neigh.destroy_task, vnic_neighe_dealloc_task);
+ skb_queue_head_init(&member_e->neigh.pkt_queue);
+ init_completion(&member_e->neigh.query_comp);
+ complete(&member_e->neigh.query_comp); /* mark as complete since no query is running */
+ member_e->neigh.valid = 0;
+ member_e->neigh.pquery = ERR_PTR(-ENODATA);
+ member_e->neigh.query_id = -1;
+ member_e->neigh.ah = ERR_PTR(-ENODATA); /* ah query will be done via datapath */
+ if (!vnic_sa_query) {
+ member_e->neigh.ah = vnic_ah_alloc(login, member->lid);
+ if (IS_ERR(member_e->neigh.ah))
+ return -ENOMEM;
+ }
+ /* need to add multicast code */
+ ret = _vnic_add_member_mgid(login, member_e);
+ if (ret)
+ goto free_ah;
+
+ netif_tx_lock_bh(login->dev);
+ member_e->info = GW_MEMBER_INFO_CREATED;
+ if (member->eport_state)
+ member_e->info |= GW_MEMBER_INFO_EPORT_UP;
+ login->lag_member_count++;
+ netif_tx_unlock_bh(login->dev);
+
+ return 0;
+free_ah:
+ if (!IS_ERR(member_e->neigh.ah))
+ ib_destroy_ah(member_e->neigh.ah);
+ return ret;
+}
+
+void vnic_member_remove_all(struct vnic_login *login)
+{
+ int i;
+
+ if (!login->is_lag)
+ return;
+
+ for (i=0; i<MAX_LAG_MEMBERS; i++)
+ vnic_member_remove(login, i);
+}
+
+int vnic_member_remove(struct vnic_login *login, int member_id)
+{
+ struct vnic_gw_info *member_e;
+
+ vnic_dbg_lag(login->name, "vnic_member_remove for id %d\n", member_id);
+
+ if (member_id >= MAX_LAG_MEMBERS || member_id < 0)
+ return -1;
+
+ member_e = &login->lag_gw_neigh[member_id];
+
+ vnic_dbg_lag(login->name,"vnic_member_remove id:%d gw_id:%d lid:%x qpn:%x sl:%d\n",
+ member_id, member_e->gw_id, member_e->neigh.lid, member_e->neigh.qpn, member_e->neigh.sl);
+
+ /* member id is not in use */
+ if (!(member_e->info & GW_MEMBER_INFO_CREATED))
+ return -1;
+
+ if (member_e->neigh.query_id >=0 && member_e->neigh.pquery && !IS_ERR(member_e->neigh.pquery))
+ ib_sa_cancel_query(member_e->neigh.query_id, member_e->neigh.pquery);
+
+ netif_tx_lock_bh(login->dev);
+ if (member_e->info & GW_MEMBER_INFO_MAPPED)
+ _vnic_remove_member_from_map(login, member_e->member_id);
+ member_e->info &= ~(GW_MEMBER_INFO_MAPPED);
+ member_e->neigh.valid = 0;
+ netif_tx_unlock_bh(login->dev);
+
+ /* wait for completion after the entry was removed from login data path */
+ wait_for_completion(&member_e->neigh.query_comp);
+
+ /* modification of map will be done through mcast CB if needed */
+ vnic_mcast_del_user(&login->mcast_tree, member_e);
+
+ if(member_e->neigh.ah && !IS_ERR(member_e->neigh.ah))
+ ib_destroy_ah(member_e->neigh.ah);
+ member_e->neigh.ah = ERR_PTR(-ENODATA);
+ member_e->info = 0;
+ login->lag_member_count--;
+
+ return 0;
+}
+
+void vnic_member_prop(struct vnic_login *login, struct lag_properties *prop)
+{
+ if (login->lag_prop.hash_mask != prop->hash_mask) {
+ netif_tx_lock_bh(login->dev);
+ memcpy(&login->lag_prop, prop,
+ sizeof(login->lag_prop));
+ netif_tx_unlock_bh(login->dev);
+ }
+}
+
+/*
+ * Modify a specific LAG eport member's parameters. The parameters might not
+ * be "interesting" and might not affect data traffic; they might require
+ * creating a new ah, or even result in a modification of the transmit hash
+ * mapping function.
+ */
+int vnic_member_modify(struct vnic_login *login, int member_id, struct lag_member *member)
+{
+ struct vnic_gw_info *member_e;
+
+ if (member_id >= MAX_LAG_MEMBERS || member_id < 0)
+ return -1;
+
+ member_e = &login->lag_gw_neigh[member_id];
+
+ vnic_dbg_lag(login->name,"vnic_member_modify id:%d gw_id:%d lid:%x qpn:%x sl:%d\n",
+ member_id, member_e->gw_id, member_e->neigh.lid, member_e->neigh.qpn, member_e->neigh.sl);
+
+ /* fail if the member id is not in use */
+ if (!(member_e->info & GW_MEMBER_INFO_CREATED))
+ return -1;
+
+ /* change in LID requires new ah */
+ /* TODO Test this */
+ if (member_e->neigh.lid != member->lid) {
+ /* take tx lock to make sure ah is not being used */
+ if (vnic_sa_query) {
+ /* cancel any outstanding SA query */
+ if (member_e->neigh.query_id >=0 && member_e->neigh.pquery && !IS_ERR(member_e->neigh.pquery))
+ ib_sa_cancel_query(member_e->neigh.query_id, member_e->neigh.pquery);
+ netif_tx_lock_bh(login->dev);
+ member_e->neigh.lid = member->lid;
+ member_e->neigh.valid = 0;
+ if (member_e->neigh.ah && !IS_ERR(member_e->neigh.ah)) {
+ /* lid is not the same: destroy AH */
+ ib_destroy_ah(member_e->neigh.ah);
+ member_e->neigh.ah = ERR_PTR(-ENODATA);
+ }
+ netif_tx_unlock_bh(login->dev);
+ } else {
+ struct ib_ah *ah, *ah1;
+ ah = member_e->neigh.ah;
+ ah1 = vnic_ah_alloc(login, member->lid);
+ if (IS_ERR(ah1))
+ return -ENOMEM;
+ netif_tx_lock_bh(login->dev);
+ member_e->neigh.lid = member->lid;
+ member_e->neigh.ah = ah1;
+ netif_tx_unlock_bh(login->dev);
+ ib_destroy_ah(ah);
+ }
+ }
+
+ if (member_e->neigh.qpn != member->qpn)
+ member_e->neigh.qpn = member->qpn;
+
+ netif_tx_lock_bh(login->dev);
+ /* link changed from up to down */
+ if (member_e->info & GW_MEMBER_INFO_MAPPED && !member->eport_state) {
+ _vnic_remove_member_from_map(login, member_id);
+ member_e->info &= ~(GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_EPORT_UP);
+ }
+
+ /* link changed from down to up and mcast are connected */
+ if (!(member_e->info & GW_MEMBER_INFO_MAPPED) &&
+ member->eport_state) {
+ if (member_e->info & GW_MEMBER_INFO_MCAST) {
+ _vnic_add_member_to_map(login, member_id);
+ member_e->info |= (GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_EPORT_UP);
+ } else
+ member_e->info |= GW_MEMBER_INFO_EPORT_UP;
+ }
+ netif_tx_unlock_bh(login->dev);
+
+ return 0;
+}
+
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
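+/*
+ * Deferred neigh teardown: cancel any outstanding SA path query, wait for
+ * its completion and only then destroy the AH and free the entry.
+ */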
+void vnic_neighe_dealloc_task(struct work_struct *work)
+{
+ struct vnic_neigh *neighe =
+ container_of(work, struct vnic_neigh, destroy_task.work);
+ if (IS_NEIGH_QUERY_RUNNING(neighe))
+ ib_sa_cancel_query(neighe->query_id, neighe->pquery);
+ wait_for_completion(&neighe->query_comp);
+ if (neighe->ah && !IS_ERR(neighe->ah))
+ ib_destroy_ah(neighe->ah);
+ kfree(neighe);
+}
+
+void vnic_neighe_dealloc(struct vnic_neigh *neighe)
+{
+ ASSERT(neighe);
+ /* calls vnic_neighe_dealloc_task */
+ queue_delayed_work(neighe->login->neigh_wq, &neighe->destroy_task, 0);
+}
+
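+/*
+ * Allocate an address handle towards dlid on the login port, using the
+ * data SL negotiated at login time.
+ */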
+struct ib_ah *vnic_ah_alloc(struct vnic_login *login, u16 dlid)
+{
+ struct ib_ah_attr av;
+ struct ib_ah *ah;
+
+ memset(&av, 0, sizeof(av));
+ av.dlid = dlid;
+ av.port_num = login->port->num;
+ av.sl = login->sl; /* a path query is needed here to resolve the data SL */
+ ah = ib_create_ah(login->port->pd, &av);
+ if (IS_ERR(ah))
+ return ERR_PTR(-ENOMEM);
+ return ah;
+}
+
+struct vnic_neigh *vnic_neighe_alloc(struct vnic_login *login,
+ const u8 *mac,
+ u16 dlid, u32 dqpn, u8 rss)
+{
+ struct vnic_neigh *neighe;
+ neighe = kzalloc(sizeof *neighe, GFP_ATOMIC);
+ if (!neighe)
+ return ERR_PTR(-ENOMEM);
+ INIT_DELAYED_WORK(&neighe->destroy_task, vnic_neighe_dealloc_task);
+ skb_queue_head_init(&neighe->pkt_queue);
+ if (mac)
+ memcpy(neighe->mac, mac, ETH_ALEN);
+ neighe->rss = rss;
+ neighe->ah = ERR_PTR(-ENODATA);
+ if (!vnic_sa_query) {
+ neighe->ah = vnic_ah_alloc(login, dlid);
+ if (IS_ERR(neighe->ah)) {
+ kfree(neighe);
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+ init_completion(&neighe->query_comp);
+ complete(&neighe->query_comp); /* mark as complete since no query is running */
+ neighe->pquery = ERR_PTR(-ENODATA);
+ neighe->query_id = -1;
+ neighe->qpn = dqpn;
+ neighe->lid = dlid;
+ neighe->login = login;
+
+ return neighe;
+}
+
+void vnic_neighe_del(struct vnic_login *login, struct vnic_neigh *neighe)
+{
+ ASSERT(neighe);
+ rb_erase(&neighe->rb_node, &login->neigh_tree);
+}
+
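+/*
+ * Insert a neigh entry into the login's rb-tree, keyed by MAC address;
+ * returns -EEXIST if an entry with the same MAC is already present.
+ */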
+int vnic_neighe_add(struct vnic_login *login, struct vnic_neigh *neighe)
+{
+ struct rb_node **n = &login->neigh_tree.rb_node, *pn = NULL;
+ struct vnic_neigh *neighe_t;
+ int rc;
+
+ while (*n) {
+ pn = *n;
+ neighe_t = rb_entry(pn, struct vnic_neigh, rb_node);
+ rc = memcmp(neighe->mac, neighe_t->mac, ETH_ALEN);
+ if (rc < 0)
+ n = &pn->rb_left;
+ else if (rc > 0)
+ n = &pn->rb_right;
+ else {
+ rc = -EEXIST;
+ goto out;
+ }
+ }
+
+ rb_link_node(&neighe->rb_node, pn, n);
+ rb_insert_color(&neighe->rb_node, &login->neigh_tree);
+ rc = 0;
+
+out:
+ return rc;
+}
+
+struct vnic_neigh *vnic_neighe_search(struct vnic_login *login, u8 *mac)
+{
+ struct rb_node *n = login->neigh_tree.rb_node;
+ struct vnic_neigh *neighe_t;
+ int rc;
+
+ while (n) {
+ neighe_t = rb_entry(n, struct vnic_neigh, rb_node);
+ rc = memcmp(mac, neighe_t->mac, ETH_ALEN);
+ if (rc < 0)
+ n = n->rb_left;
+ else if (rc > 0)
+ n = n->rb_right;
+ else {
+ vnic_dbg_data(login->name,
+ "found: mac "MAC_6_PRINT_FMT" vid %d "
+ "qpn 0x%06x lid 0x%02x\n",
+ MAC_6_PRINT_ARG(neighe_t->mac),
+ be16_to_cpu(login->vid), neighe_t->qpn,
+ neighe_t->lid);
+ goto out;
+ }
+ }
+ neighe_t = ERR_PTR(-ENODATA);
+
+out:
+ return neighe_t;
+}
+
+void vnic_neigh_del_all(struct vnic_login *login)
+{
+ struct rb_node *n;
+ struct vnic_neigh *neighe;
+
+ ASSERT(login);
+ n = rb_first(&login->neigh_tree);
+ while (n) {
+ neighe = rb_entry(n, struct vnic_neigh, rb_node);
+ vnic_neighe_del(login, neighe);
+ n = rb_first(&login->neigh_tree);
+ vnic_neighe_dealloc(neighe);
+ }
+}
+
+void vnic_neigh_invalidate(struct vnic_login *login)
+{
+ struct vnic_neigh *neighe;
+ struct rb_node *n;
+ int i;
+
+ if (login->gw_neigh && !IS_ERR(login->gw_neigh))
+ login->gw_neigh->valid = 0;
+
+ n = rb_first(&login->neigh_tree);
+ while (n) {
+ neighe = rb_entry(n, struct vnic_neigh, rb_node);
+ neighe->valid = 0;
+ n = rb_next(n);
+ }
+
+ if (login->is_lag)
+ for (i=0; i<MAX_LAG_MEMBERS; i++)
+ login->lag_gw_neigh[i].neigh.valid = 0;
+}
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+extern struct net_device_stats *mlx4_vnic_stats_func_container(struct net_device *n);
+
+static int mlx4_vnic_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
+ unsigned short vid)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+
+ vnic_dbg_data(login->name, "add VLAN:%d was called\n", vid);
+ return 0;
+}
+
+static int mlx4_vnic_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
+ unsigned short vid)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+
+ vnic_dbg_data(login->name, "Kill VID:%d was called\n", vid);
+ return 0;
+}
+
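+/*
+ * Update the netdev carrier state: the link is reported up only when the
+ * bcast group is attached and, if eport state enforcement is enabled, the
+ * GW eport is up as well.
+ */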
+void vnic_carrier_update(struct vnic_login *login)
+{
+ int attached, eport_up, eport_enforce, carrier_ok;
+
+ ASSERT(login);
+ attached = test_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state);
+ eport_up = fip_vnic_get_eport_state(login->fip_vnic);
+ eport_enforce = vnic_eport_state_enforce;
+ carrier_ok = netif_carrier_ok(login->dev);
+
+ /* bring carrier up */
+ if (!carrier_ok && attached && (!eport_enforce || eport_up)) {
+ set_bit(VNIC_STATE_NETDEV_CARRIER_ON, &login->netdev_state);
+ netif_carrier_on(login->dev);
+ vnic_info("%s link is up\n", login->dev->name);
+ return;
+ }
+
+ /* bring carrier down */
+ if (carrier_ok && (!attached || (!eport_up && eport_enforce))) {
+ clear_bit(VNIC_STATE_NETDEV_CARRIER_ON, &login->netdev_state);
+ netif_carrier_off(login->dev);
+ vnic_info("%s link is down\n", login->dev->name);
+ return;
+ }
+
+}
+
+void __bcast_attach_cb(struct vnic_mcast *mcaste, void *login_ptr)
+{
+ struct vnic_login *login = login_ptr;
+
+ /* When SA is local, mcast join works even when port is down */
+ if (login->port->attr.state != IB_PORT_ACTIVE)
+ return;
+ set_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state);
+ vnic_carrier_update(login);
+}
+
+void __bcast_detach_cb(struct vnic_mcast *mcaste, void *login_ptr)
+{
+ struct vnic_login *login = login_ptr;
+
+ clear_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state);
+ vnic_carrier_update(login);
+}
+
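+/*
+ * MAC address changes are implemented via child vNics: setting a new MAC
+ * creates (or reuses) a child vNic for that address rather than modifying
+ * the parent login.
+ */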
+/* this function cannot sleep, so avoid taking any mutex in the calls below */
+static int vnic_set_mac(struct net_device *dev, void *_mac)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+ struct sockaddr *saddr = _mac;
+ u8 *mac = (u8 *)(saddr->sa_data);
+ int rc = 0;
+
+ vnic_dbg_func(login->name);
+
+ vnic_dbg_mac(login->name, "mac "MAC_6_PRINT_FMT" => "MAC_6_PRINT_FMT"\n",
+ MAC_6_PRINT_ARG((u8 *)(dev->dev_addr)),
+ MAC_6_PRINT_ARG(mac));
+
+ /* must support child vNics for mac modification */
+ if (!vnic_child_max)
+ return -ENOSYS;
+
+ /* skip if invalid address */
+ if (unlikely(!is_valid_ether_addr(mac)))
+ return -EINVAL;
+
+ /* skip if same mac was already set */
+ if (!(memcmp((u8 *)(dev->dev_addr), mac, ETH_ALEN)))
+ return 0;
+
+ /* already in bh, calls vnic_child_update that queues a job,
+ * so read_lock is enough
+ */
+ read_lock(&login->mac_rwlock);
+
+ /* if mac same as original, delete child, set mac and return */
+ if (!(memcmp(mac, login->dev_addr, ETH_ALEN)))
+ goto out;
+
+ /* else, this is a new child vNic,
+ * add new child vNic
+ * NOTE: pay attention that the GC should not destroy a child vNic that
+ * is being used as mac-change even if it was created by different
+ * source.
+ */
+ rc = vnic_child_update(login, mac, 0);
+ if (rc && rc != -EEXIST)
+ goto err;
+
+out:
+ memcpy(dev->dev_addr, mac, ETH_ALEN);
+ vnic_child_update(login, (u8 *)(dev->dev_addr), 1);
+ vnic_dbg_mac(login->name, "mac changed successfully to "
+ MAC_6_PRINT_FMT"\n", MAC_6_PRINT_ARG(mac));
+
+err:
+ read_unlock(&login->mac_rwlock);
+ return rc;
+}
+
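+/*
+ * ndo_set_rx_mode handler: track unicast promiscuous mode transitions
+ * (flushing child vNics when promisc is cleared) and defer multicast list
+ * processing to the mcast task on login_wq.
+ */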
+static void vnic_set_multicast_list(struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+
+ vnic_dbg_func(login->name);
+
+ /* test promisc flag changes */
+ if (is_ucast_promisc(login) && !login->promisc) {
+ /* promisc is being set */
+ if (!vnic_child_max) {
+ /* must support child vNics for promisc mode */
+ vnic_info("%s promisc mode cannot be set "
+ "(vnic_child_max %u)\n",
+ dev->name, vnic_child_max);
+ } else if (vnic_src_mac_enforce) {
+ /* cannot support promisc if source mac is enforced
+ * because sender should be able to use any smac
+ */
+ vnic_info("%s promisc mode cannot be set "
+ "(vnic_src_mac_enforce %u)\n",
+ dev->name, vnic_src_mac_enforce);
+ } else {
+ login->promisc = 1;
+ vnic_dbg_mac(dev->name,
+ "entered promiscuous mode: confirmed\n");
+ }
+ } else if (!is_ucast_promisc(login) && login->promisc) {
+ /* promisc is being cleared */
+ login->promisc = 0;
+ write_lock(&login->mac_rwlock);
+ vnic_child_flush(login, 0);
+ write_unlock(&login->mac_rwlock);
+ vnic_dbg_mac(dev->name,
+ "left promiscuous mode: confirmed\n");
+ }
+
+ /* test mcast changes */
+ if (!no_bxm && !login->queue_stopped) {
+ dev_hold(dev);
+ if (!queue_delayed_work(login_wq, &login->mcast_task, HZ / 100))
+ dev_put(dev);
+ }
+}
+
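+/*
+ * Adaptive interrupt moderation: sample the packet rate and average packet
+ * size over the last interval and pick an RX coalescing time between
+ * rx_usecs_low and rx_usecs_high; heavily unbalanced tx/rx rates are treated
+ * as BW-bound traffic and get maximum moderation.
+ */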
+static void vnic_auto_moder(struct vnic_login *login)
+{
+ unsigned long period =
+ (unsigned long)(jiffies - login->last_moder_jiffies);
+ unsigned long packets;
+ unsigned long rate;
+ unsigned long avg_pkt_size;
+ unsigned long rx_packets;
+ unsigned long rx_bytes;
+ unsigned long tx_packets;
+ unsigned long tx_pkt_diff;
+ unsigned long rx_pkt_diff;
+ int moder_time;
+
+ period = (unsigned long)(jiffies - login->last_moder_jiffies);
+#if 0
+ vnic_dbg_moder_v(login->name, "adaptive_rx_coal %d, period %d, "
+ "sample_interval %d, state %d\n",
+ login->adaptive_rx_coal, period,
+ login->sample_interval, login->port->attr.state);
+#endif
+
+ if (!login->adaptive_rx_coal || period < login->sample_interval * HZ)
+ return;
+
+ /* TODO: when NAPI is disabled, the RX completion will be called from
+ * IRQ context (and not BH context) and thus spin_lock_bh should be
+ * replaced with spin_lock_irq
+ */
+ spin_lock_bh(&login->stats_lock);
+ rx_packets = login->stats.rx_packets;
+ rx_bytes = login->stats.rx_bytes;
+ tx_packets = login->stats.tx_packets;
+ spin_unlock_bh(&login->stats_lock);
+
+ if (!login->last_moder_jiffies || !period)
+ goto out_set;
+
+ tx_pkt_diff = ((unsigned long)(tx_packets -
+ login->last_moder_tx_packets));
+ rx_pkt_diff = ((unsigned long)(rx_packets - login->last_moder_packets));
+ packets = max(tx_pkt_diff, rx_pkt_diff);
+ rate = packets * HZ / period;
+ avg_pkt_size = packets ? ((unsigned long)(rx_bytes -
+ login->last_moder_bytes)) /
+ packets : 0;
+
+ if (rate > VNIC_RX_RATE_THRESH && avg_pkt_size > VNIC_AVG_PKT_SMALL) {
+ /* If tx and rx packet rates are not balanced, assume that
+ * traffic is mainly BW bound and apply maximum moderation.
+ * Otherwise, moderate according to packet rate */
+ if (2 * tx_pkt_diff > 3 * rx_pkt_diff ||
+ 2 * rx_pkt_diff > 3 * tx_pkt_diff) {
+ moder_time = login->rx_usecs_high;
+ } else {
+ if (rate < login->pkt_rate_low)
+ moder_time = login->rx_usecs_low;
+ else if (rate > login->pkt_rate_high)
+ moder_time = login->rx_usecs_high;
+ else
+ moder_time = (rate - login->pkt_rate_low) *
+ (login->rx_usecs_high - login->rx_usecs_low) /
+ (login->pkt_rate_high - login->pkt_rate_low) +
+ login->rx_usecs_low;
+ }
+ } else {
+ moder_time = login->rx_usecs_low;
+ }
+
+ if (moder_time != login->last_moder_time) {
+ vnic_dbg_moder(login->name, "tx rate:%lu rx_rate:%lu\n",
+ tx_pkt_diff * HZ / period,
+ rx_pkt_diff * HZ / period);
+ vnic_dbg_moder(login->name,
+ "Rx moder_time changed from:%lu to %d period:%lu"
+ " [jiff] packets:%lu avg_pkt_size:%lu rate:%lu"
+ " [p/s])\n", login->last_moder_time, moder_time,
+ period, packets, avg_pkt_size, rate);
+ login->last_moder_time = moder_time;
+ vnic_ib_set_moder(login,
+ login->last_moder_time, login->rx_frames,
+ login->tx_usecs, login->tx_frames);
+ }
+
+out_set:
+ login->last_moder_packets = rx_packets;
+ login->last_moder_tx_packets = tx_packets;
+ login->last_moder_bytes = rx_bytes;
+ login->last_moder_jiffies = jiffies;
+}
+
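+/*
+ * Aggregate the per-TX-ring counters into login->stats while preserving the
+ * RX counters, which are accounted directly in login->stats.
+ */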
+void vnic_dump_stats(struct vnic_login *login)
+{
+ unsigned long *stats, *login_stats = (unsigned long *)(&login->stats);
+ int i, j, len = sizeof(struct net_device_stats) / sizeof(unsigned long);
+ struct net_device_stats stats_tmp;
+
+ spin_lock_bh(&login->stats_lock);
+ /* tx stats are distributed between tx_res entries */
+ stats_tmp = login->stats;
+ memset(&login->stats, 0, sizeof(struct net_device_stats));
+ for (i = 0; i < login->tx_rings_num; ++i) {
+ stats = (unsigned long *)(&login->tx_res[i].stats);
+ for (j = 0; j < len; ++j)
+ login_stats[j] += stats[j];
+ }
+
+ /* rx stats are in login->stats */
+ login->stats.rx_bytes = stats_tmp.rx_bytes;
+ login->stats.rx_packets = stats_tmp.rx_packets;
+ login->stats.rx_errors = stats_tmp.rx_errors;
+ login->stats.rx_dropped = stats_tmp.rx_dropped;
+ spin_unlock_bh(&login->stats_lock);
+}
+
+static void vnic_do_get_stats(struct work_struct *work)
+{
+ struct vnic_login *login =
+ container_of(work, struct vnic_login, stats_task.work);
+
+ mutex_lock(&login->moder_lock);
+ vnic_dump_stats(login);
+
+ if (login->queue_stopped)
+ goto out;
+
+ if (!(test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state)))
+ goto resched;
+
+ if (login->port->attr.state == IB_PORT_ACTIVE)
+ vnic_auto_moder(login);
+
+resched:
+ /* calls vnic_do_get_stats() */
+ if (!login->queue_stopped)
+ queue_delayed_work(login_wq, &login->stats_task, VNIC_STATS_DELAY);
+out:
+ mutex_unlock(&login->moder_lock);
+}
+
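+/*
+ * Re-sync the attached MGIDs with the netdev mc_list: detach every
+ * non-default mcast entry, then re-attach one MGID per mc_list address
+ * (and per LAG member when LAG is enabled). Retries later if an mcast
+ * event is still being processed or a re-attach fails.
+ */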
+static void vnic_mcast_reattach(struct work_struct *work)
+{
+ struct vnic_mcast *mcaste, *mcaste_t;
+ struct rb_node *n;
+ unsigned long flags;
+ union vhub_mgid mgid;
+ LIST_HEAD(local_list);
+ int i;
+ struct vnic_gw_info *lag_member;
+ struct vnic_login *login;
+ struct net_device *dev;
+#ifndef _BP_NO_MC_LIST
+ struct dev_mc_list *mclist;
+#else
+ struct netdev_hw_addr *ha;
+#endif
+
+ login = container_of(work, struct vnic_login, mcast_task.work);
+ dev = login->dev;
+
+ vnic_dbg_mcast(login->name, "set_multicast_list was notified\n");
+ if (login->queue_stopped) {
+ dev_put(dev);
+ return;
+ }
+
+ /* detach all mcast (except default and bcast mcasts) */
+ spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags);
+ if (!list_empty(&login->mcast_tree.reattach_list)) {
+ /* an event is being processed */
+ spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+ goto retry;
+ }
+
+ for (n = rb_first(&login->mcast_tree.mcast_tree); n; n = rb_next(n)) {
+ mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+ if (IS_ZERO_MAC(mcaste->mac))
+ continue;
+ if (IS_BCAST_MAC(mcaste->mac))
+ continue;
+ list_add_tail(&mcaste->list, &local_list);
+ }
+
+ list_for_each_entry(mcaste, &local_list, list) {
+ vnic_mcast_del(&login->mcast_tree, mcaste);
+ mcaste->attach_task_cnt = 0;
+ }
+
+ spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+
+ vnic_dbg_mcast(login->name, "local_list is %s empty n_mac_mcgid %u\n",
+ (list_empty(&local_list) ? "" : "not"),
+ login->n_mac_mcgid);
+
+ list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) {
+ list_del(&mcaste->list);
+ vnic_mcast_detach(&login->mcast_tree, mcaste);
+ vnic_mcast_dealloc(mcaste);
+ }
+
+ /* attach all mcasts in mc_list */
+ vhub_mgid_create(login->mgid_prefix, ETH_ZERO_MAC, login->n_mac_mcgid,
+ CREATE_VHUB_ID(login->vid, login->gw_port_id),
+ VHUB_MGID_DATA, 0, &mgid);
+
+ spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags);
+ mcaste_t = vnic_mcast_search(&login->mcast_tree, &mgid.ib_gid);
+ if (IS_ERR(mcaste_t) || !test_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state)) {
+ vnic_dbg_data(login->name, "default mgid not ready\n");
+ spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+ dev_put(dev);
+ return;
+ }
+ spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+
+ /* hold the tx lock so set_multicast_list() won't change mc_list */
+ netif_tx_lock_bh(dev);
+#ifndef _BP_NO_MC_LIST
+ for (mclist = login->dev->mc_list; mclist; mclist = mclist->next) {
+ u8* mmac = mclist->dmi_addr;
+#else
+ netdev_for_each_mc_addr(ha, login->dev) {
+ u8* mmac = ha->addr;
+#endif
+ /* do not add the default MGIDS because they are always used */
+ if (IS_ZERO_MAC(mmac))
+ continue;
+ if (IS_BCAST_MAC(mmac))
+ continue;
+
+ /* attach to the legacy GW / LAG gw id MGID */
+ if (_vnic_mcast_attach_mgid(login, mmac, mcaste_t, login,
+ login->gw_port_id))
+ goto attach_failed;
+
+ if (!login->is_lag)
+ continue;
+
+ for (i=0; i<MAX_LAG_MEMBERS; i++) {
+ lag_member = &login->lag_gw_neigh[i];
+ /* member id is already in use */
+ if (lag_member->info & GW_MEMBER_INFO_CREATED)
+ /* attach to the legacy GW / LAG gw id MGID */
+ if (_vnic_mcast_attach_mgid(login, mmac,
+ mcaste_t,
+ lag_member,
+ lag_member->gw_id))
+ goto attach_failed;
+ }
+ }
+ netif_tx_unlock_bh(dev);
+ dev_put(dev);
+ return;
+
+attach_failed:
+ netif_tx_unlock_bh(dev);
+ vnic_mcast_del_all(&login->mcast_tree);
+
+retry:
+ if (!login->queue_stopped) {
+ if (!queue_delayed_work(login_wq, &login->mcast_task, HZ / 100))
+ dev_put(dev);
+ } else
+ dev_put(dev);
+}
+
+static int vnic_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+
+ if (new_mtu > login->max_mtu) {
+ vnic_warn(login->name, "failed: new_mtu %d > %d\n", new_mtu,
+ login->max_mtu);
+ return -EINVAL;
+ }
+
+ vnic_dbg_data(login->name, "mtu %d -> %d\n", dev->mtu, new_mtu);
+ dev->mtu = new_mtu;
+
+ return 0;
+}
+
+static void vnic_set_default_moder(struct vnic_login *login)
+{
+
+ login->rx_frames = VNIC_RX_COAL_TARGET / login->dev->mtu + 1;
+ login->rx_usecs = VNIC_RX_COAL_TIME;
+ login->tx_frames = VNIC_TX_COAL_PKTS;
+ login->tx_usecs = VNIC_TX_COAL_TIME;
+ login->pkt_rate_low = VNIC_RX_RATE_LOW;
+ login->rx_usecs_low = VNIC_RX_COAL_TIME_LOW;
+ login->pkt_rate_high = VNIC_RX_RATE_HIGH;
+ login->rx_usecs_high = VNIC_RX_COAL_TIME_HIGH;
+ login->sample_interval = VNIC_SAMPLE_INTERVAL;
+ login->adaptive_rx_coal = 1;
+ login->last_moder_time = VNIC_AUTO_CONF;
+ login->last_moder_jiffies = 0;
+ login->last_moder_packets = 0;
+ login->last_moder_tx_packets = 0;
+ login->last_moder_bytes = 0;
+
+ vnic_dbg_data(login->name, "default coalescing params for mtu:%d to "
+ "rx_frames:%d rx_usecs:%d "
+ "tx_frames:%d tx_usecs:%d\n",
+ login->dev->mtu,
+ login->rx_frames, login->rx_usecs,
+ login->tx_frames, login->tx_usecs);
+}
+
+#ifndef _BP_NAPI_POLL
+int vnic_napi_alloc(struct vnic_login *login, int rx_res_index)
+{
+
+ struct napi_struct *napi = &login->rx_res[rx_res_index].napi;
+
+ netif_napi_add(login->dev, napi, vnic_poll_cq_rx, vnic_napi_weight);
+
+ return 0;
+}
+
+void vnic_napi_enable(struct vnic_login *login, int rx_res_index)
+{
+
+ struct napi_struct *napi = &login->rx_res[rx_res_index].napi;
+ napi_enable(napi);
+}
+
+static void vnic_napi_disable(struct vnic_login *login, int rx_res_index)
+{
+ struct napi_struct *napi = &login->rx_res[rx_res_index].napi;
+
+ if (!napi->poll)
+ return;
+
+ napi_disable(napi);
+}
+
+static void vnic_napi_dealloc(struct vnic_login *login, int rx_res_index)
+{
+#ifndef _BP_NAPI_NO_DEL
+ struct napi_struct *napi = &login->rx_res[rx_res_index].napi;
+
+ netif_napi_del(napi);
+#else
+ return;
+#endif
+}
+
+#else
+int vnic_napi_alloc(struct vnic_login *login, int rx_res_index)
+{
+ struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index];
+ char name[IFNAMSIZ];
+
+ snprintf(name, IFNAMSIZ, "%s-N%d", login->name, rx_res_index);
+ rx_res->poll_dev =
+ alloc_netdev(0, name, ether_setup);
+ if (!rx_res->poll_dev)
+ return -ENOMEM;
+
+ rx_res->poll_dev->priv = rx_res;
+ rx_res->poll_dev->weight = vnic_napi_weight;
+ rx_res->poll_dev->poll = vnic_poll_cq_rx;
+
+ return 0;
+}
+
+void vnic_napi_enable(struct vnic_login *login, int rx_res_index)
+{
+ struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index];
+
+ ASSERT(rx_res->poll_dev);
+ set_bit(__LINK_STATE_START, &rx_res->poll_dev->state);
+}
+
+static void vnic_napi_disable(struct vnic_login *login, int rx_res_index)
+{
+ struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index];
+ struct net_device *poll_dev = rx_res->poll_dev;
+
+ if (!poll_dev)
+ return;
+
+ while (test_bit(__LINK_STATE_RX_SCHED, &poll_dev->state))
+ msleep(VNIC_NAPI_SCHED_TIMEOUT);
+}
+
+static void vnic_napi_dealloc(struct vnic_login *login, int rx_res_index)
+{
+ struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index];
+ struct net_device *poll_dev = rx_res->poll_dev;
+
+ if (!poll_dev)
+ return;
+
+ free_netdev(poll_dev);
+ rx_res->poll_dev = NULL;
+}
+#endif
+
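+/*
+ * Bring the interface up: arm the RX/TX completion handlers, enable NAPI,
+ * move the QPs to RTS and post receive buffers, then start the TX queues.
+ * If the login is not fully created yet, only record the open request.
+ */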
+static int _vnic_open(struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+ int i;
+
+ /* Todo add locks here */
+ if (!(test_bit(VNIC_STATE_LOGIN_CREATE_2, &login->fip_vnic->login_state))) {
+ set_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state);
+ return 0;
+ }
+
+ if (test_and_set_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state))
+ return 0;
+
+ clear_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state);
+
+ /* ARM RX handlers */
+ for (i = 0; i < login->rx_rings_num; ++i) {
+ login->rx_res[i].stopped = 0;
+ if (ib_req_notify_cq(login->rx_res[i].cq, IB_CQ_NEXT_COMP)) {
+ vnic_err(login->name, "ib_req_notify_cq failed\n");
+ goto err;
+ }
+ }
+
+ /* ARM TX handlers */
+ for (i = 0; i < login->tx_rings_num; ++i) {
+ login->tx_res[i].stopped = 0;
+ spin_lock_init(&login->tx_res[i].lock);
+ if (!vnic_tx_polling &&
+ ib_req_notify_cq(login->tx_res[i].cq, IB_CQ_NEXT_COMP)) {
+ vnic_err(login->name, "ib_req_notify_cq failed\n");
+ goto err;
+ }
+ }
+
+ /* enable napi*/
+ for (i = 0; i < login->napi_num; ++i)
+ vnic_napi_enable(login, i);
+
+ /* move QP to RTS, post recv skb */
+ if (vnic_ib_open(dev))
+ goto err_napi;
+
+ /* dummy call */
+ if (vnic_ib_up(dev))
+ goto err_ib_stop;
+
+ /* configure */
+ vnic_set_default_moder(login);
+ if (vnic_ib_set_moder(login, login->last_moder_time, login->rx_frames,
+ login->tx_usecs, login->tx_frames))
+ vnic_warn(login->name, "vnic_ib_set_moder failed!\n");
+
+ /* start interface TX queue */
+ VNIC_TXQ_START_ALL(login);
+
+ /* report and return */
+ vnic_info("%s is opened\n", dev->name);
+
+ return 0;
+
+err_ib_stop:
+ vnic_ib_stop(dev);
+err_napi:
+ /* disable napi*/
+ for (i = 0; i < login->napi_num; ++i)
+ vnic_napi_disable(login, i);
+err:
+ clear_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state);
+ return -EINVAL;
+}
+
+static int vnic_open(struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+ int ret;
+
+ vnic_dbg_func(login->name);
+
+ mutex_lock(&login->state_lock);
+ ret = _vnic_open(dev);
+ mutex_unlock(&login->state_lock);
+ return ret;
+}
+
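+/*
+ * Bring the interface down: stop the TX queues, mark the rings stopped,
+ * disable NAPI and tear down the IB datapath, while masking spurious TX
+ * watchdog timeouts during the transition.
+ */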
+static int _vnic_stop(struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+ int i, _watchdog_timeo = dev->watchdog_timeo;
+
+ /* check if already stopped */
+ if (!(test_and_clear_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state)))
+ return 0;
+
+ /* Set trans_start to jiffies and watchdog_timeo to max
+ * to avoid spurious transmit timeouts in the interval between
+ * tx queue stopped and carrier down.
+ */
+ dev->trans_start = jiffies;
+ dev->watchdog_timeo = 0x7fffffff;
+
+ VNIC_TXQ_STOP_ALL(login);
+
+ /* disable rx handlers */
+ for (i = 0; i < login->rx_rings_num; ++i)
+ login->rx_res[i].stopped = 1;
+
+ /* disable tx handlers */
+ for (i = 0; i < login->tx_rings_num; ++i)
+ login->tx_res[i].stopped = 1;
+
+ /* disable napi managers */
+ for (i = 0; i < login->napi_num; ++i)
+ vnic_napi_disable(login, i);
+
+ vnic_ib_down(dev);
+ vnic_ib_stop(dev);
+
+ /* restore watchdog_timeo */
+ dev->watchdog_timeo = _watchdog_timeo;
+
+ vnic_info("%s is stopped\n", dev->name);
+
+ return 0;
+}
+
+static int vnic_stop(struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+ int ret;
+
+ vnic_dbg_func(login->name);
+
+ mutex_lock(&login->state_lock);
+ ret = _vnic_stop(dev);
+ mutex_unlock(&login->state_lock);
+
+ return ret;
+}
+
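+/*
+ * Restart the datapath (stop + open) under state_lock, e.g. after a TX
+ * timeout; no-op if the device is not open or the queue is being stopped.
+ */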
+int vnic_restart(struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+ int rc = 0;
+
+ if (login->queue_stopped || !test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state))
+ return rc;
+
+ set_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state);
+ netif_tx_disable(login->dev);
+
+ mutex_lock(&login->state_lock);
+ _vnic_stop(login->dev);
+
+ clear_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state);
+ set_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state);
+
+ rc = _vnic_open(login->dev);
+ mutex_unlock(&login->state_lock);
+
+ return rc;
+}
+
+static void vnic_restart_task(struct work_struct *work)
+{
+ struct vnic_login *login =
+ container_of(work, struct vnic_login, restart_task.work);
+
+ vnic_restart(login->dev);
+}
+
+struct net_device_stats *vnic_get_stats(struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+
+ if (dev->reg_state != NETREG_REGISTERED)
+ return &dev->stats;
+
+ spin_lock_bh(&login->stats_lock);
+ if (test_bit(VNIC_STATE_LOGIN_PRECREATE_2, &login->fip_vnic->login_state))
+ memcpy(&dev->stats, &login->stats, sizeof(login->stats));
+ spin_unlock_bh(&login->stats_lock);
+
+ return &dev->stats;
+}
+
+static void vnic_tx_timeout(struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+
+ vnic_warn(login->name, "TX timeout called on port: %d, "
+ "latency: %d msec, stopped: %d, carrier_ok: %d,"
+ "queue_stopped: %d, watchdog_timeo: %d msec\n",
+ login->port->num,
+ jiffies_to_msecs(jiffies - dev->trans_start),
+ netif_queue_stopped(dev), netif_carrier_ok(dev),
+ login->queue_stopped,
+ jiffies_to_msecs(dev->watchdog_timeo));
+
+ if (netif_carrier_ok(dev)) {
+ VNIC_STATS_DO_INC(login->port_stats.tx_timeout);
+ if (!login->queue_stopped) {
+ vnic_warn(login->name, "TX timeout, queueing rings restart\n");
+ queue_delayed_work(login_wq, &login->restart_task, HZ / 100);
+ }
+ }
+}
+
+#ifndef _BP_NETDEV_NO_TMQ
+u16 vnic_select_queue(struct net_device *dev, struct sk_buff *skb,
+ void *accel_priv, select_queue_fallback_t fallback)
+{
+ /* Notes:
+ * - In kernel 2.6.32 the skb->mac_header 0x1a is not set when
+ * select_queue() is called
+ * - In OVM Server 3.0, DomU tx skb network and transport
+ * headers are not set
+ */
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, ETH_HLEN);
+ skb_set_transport_header(skb,
+ ETH_HLEN +
+ (skb->protocol == htons(ETH_P_IPV6) ?
+ sizeof(struct ipv6hdr) : ip_hdrlen(skb)));
+
+ return vnic_hash(dev, skb) % dev->real_num_tx_queues;
+}
+
+#endif
+
+#ifndef _BP_NO_NDO_OPS
+static struct net_device_ops vnic_netdev_ops = {
+ .ndo_open = vnic_open,
+ .ndo_stop = vnic_stop,
+ .ndo_start_xmit = vnic_tx,
+ .ndo_get_stats = vnic_get_stats,
+ .ndo_set_rx_mode = vnic_set_multicast_list,
+ .ndo_change_mtu = vnic_change_mtu,
+ .ndo_tx_timeout = vnic_tx_timeout,
+ .ndo_set_mac_address = vnic_set_mac,
+ .ndo_vlan_rx_add_vid = mlx4_vnic_vlan_rx_add_vid,
+ .ndo_vlan_rx_kill_vid = mlx4_vnic_vlan_rx_kill_vid,
+#ifndef _BP_NETDEV_NO_TMQ
+ .ndo_select_queue = vnic_select_queue,
+#endif
+};
+#endif
+
+static void vnic_setup(struct net_device *dev)
+{
+ ether_setup(dev);
+
+ dev->hard_header_len += VNIC_SKB_GET_ENCAP_OFFSET;
+ dev->watchdog_timeo = VNIC_WATCHDOG_TIMEOUT;
+
+#ifndef _BP_NO_NDO_OPS
+ if (!vnic_change_mac)
+ vnic_netdev_ops.ndo_set_mac_address = NULL;
+
+ dev->netdev_ops = &vnic_netdev_ops;
+#else
+ dev->open = vnic_open;
+ dev->stop = vnic_stop;
+ dev->hard_start_xmit = vnic_tx;
+ dev->get_stats = mlx4_vnic_stats_func_container;
+ dev->set_multicast_list = vnic_set_multicast_list;
+ dev->change_mtu = vnic_change_mtu;
+ dev->tx_timeout = vnic_tx_timeout;
+ dev->set_mac_address = vnic_set_mac;
+ dev->vlan_rx_add_vid = mlx4_vnic_vlan_rx_add_vid;
+ dev->vlan_rx_kill_vid = mlx4_vnic_vlan_rx_kill_vid;
+
+ if (!vnic_change_mac)
+ dev->set_mac_address = NULL;
+
+#ifndef _BP_NETDEV_NO_TMQ
+ dev->select_queue = vnic_select_queue;
+#endif
+#endif // _BP_NO_NDO_OPS
+}
+
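+
+/*
+ * Header-parsing callbacks handed to the inet_lro manager: the frag
+ * variant is used for the non-linear (paged) RX path and the skb
+ * variant for vnic_rx_linear mode; both report IPv4/TCP headers only.
+ */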
+static int vnic_get_frag_header(struct skb_frag_struct *frags, void **mac_hdr,
+ void **ip_hdr, void **tcpudp_hdr,
+ u64 *hdr_flags, void *priv)
+{
+ struct iphdr *iph;
+ *mac_hdr = page_address(frags->page.p) + frags->page_offset;
+ *ip_hdr = iph = (struct iphdr *)(*mac_hdr + ETH_HLEN);
+ *tcpudp_hdr = (struct tcphdr *)((u8 *)iph + (iph->ihl << 2));
+ *hdr_flags = LRO_IPV4 | LRO_TCP;
+
+ return 0;
+}
+
+static int vnic_get_skb_header(struct sk_buff *skb, void **iphdr,
+ void **tcphdr, u64 *hdr_flags, void *priv)
+{
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+
+ if (unlikely(skb->protocol != htons(ETH_P_IP)))
+ return -1;
+
+ if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY))
+ return -1;
+
+ iph = (struct iphdr *)(skb->data + ETH_HLEN);
+ if (iph->protocol != IPPROTO_TCP)
+ return -1;
+
+ tcph = (struct tcphdr *)((u8 *)iph + (iph->ihl << 2));
+
+ if (ntohs(iph->tot_len) < (iph->ihl * 4 + tcph->doff * 4))
+ return -1;
+
+ *hdr_flags = LRO_IPV4 | LRO_TCP;
+ *iphdr = iph;
+ *tcphdr = tcph;
+
+ return 0;
+}
+
+static int vnic_lro_enable(struct vnic_login *login, int rx_res_index)
+{
+ struct net_lro_mgr *lro = &login->rx_res[rx_res_index].lro;
+
+ lro->dev = login->dev;
+ lro->features = login->napi_num ? LRO_F_NAPI : 0;
+ lro->frag_align_pad = NET_IP_ALIGN;
+ lro->ip_summed = CHECKSUM_UNNECESSARY;
+ lro->ip_summed_aggr = CHECKSUM_UNNECESSARY;
+ lro->max_desc = login->lro_num;
+ lro->max_aggr = VNIC_MAX_LRO_AGGR;
+ lro->lro_arr = login->rx_res[rx_res_index].lro_desc;
+
+ if (lro->max_aggr > MAX_SKB_FRAGS)
+ lro->max_aggr = MAX_SKB_FRAGS;
+
+ if (!vnic_rx_linear)
+ lro->get_frag_header = vnic_get_frag_header;
+ else
+ lro->get_skb_header = vnic_get_skb_header;
+
+ return 0;
+}
+
+static void vnic_lro_disable(struct vnic_login *login, int rx_res_index)
+{
+ /* nop */
+ return;
+}
+
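+
+/*
+ * Allocate the net_device (multi TX queue when the kernel supports it)
+ * together with the vmalloc'ed vnic_login private context, a per-login
+ * neigh workqueue, per-ring NAPI contexts and, when GRO is unavailable,
+ * the optional LRO managers. Returns ERR_PTR(-ENODEV) on failure.
+ */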
+struct net_device *vnic_alloc_netdev(struct vnic_port *port)
+{
+ struct vnic_login_info *info;
+ struct vnic_login *login;
+ struct net_device *dev;
+ static int vnic_cnt = 0;
+ int i;
+
+ dev = VNIC_TXQ_ALLOC_NETDEV(sizeof *info, "eth%d", vnic_setup, port->tx_rings_num);
+ if (!dev) {
+ vnic_err(port->name, "VNIC_TXQ_ALLOC_NETDEV failed "
+ "(size %Zu, tx_rings_num %d)\n",
+ sizeof *info, port->tx_rings_num);
+ goto err;
+ }
+
+ /* this is a *very* large beast... */
+ login = vmalloc(sizeof *login);
+ if (!login) {
+ vnic_err(port->name, "failed to allocate login struct (%Zu)\n",
+ sizeof *login);
+ goto free_netdev;
+ }
+
+ /* init fields */
+ memset(login, 0, sizeof *login);
+ info = netdev_priv(dev);
+ info->login = login;
+ login->dev = dev;
+ login->port = port;
+ login->max_mtu = VNIC_BUF_SIZE(login->port) - IB_GRH_BYTES -
+ VNIC_ENCAP_LEN - ETH_HLEN - VLAN_HLEN;
+ login->cnt = ++vnic_cnt;
+ /* name will be overwritten later */
+ sprintf(login->name, "%s-%d", "vnic", login->cnt);
+ sprintf(login->desc, "%s-P%d",
+ login->port->dev->ca->node_desc, port->num);
+
+ login->neigh_wq = create_singlethread_workqueue(login->name);
+ if (!login->neigh_wq) {
+ vnic_err(NULL, "create_singlethread_workqueue failed for %s\n",
+ login->name);
+ goto free_login;
+ }
+
+ login->rx_csum = 1;
+ login->rx_rings_num = port->rx_rings_num;
+ login->tx_rings_num = port->tx_rings_num;
+#ifdef _BP_NETDEV_NO_TMQ
+ /* if the kernel doesn't support Multiple TX queues,
+ * then use only one TX queue */
+ login->tx_rings_num = 1;
+#endif
+ vnic_dbg_mark();
+ spin_lock_init(&login->lock);
+ spin_lock_init(&login->stats_lock);
+ rwlock_init(&login->mac_rwlock);
+ atomic_set(&login->vnic_child_cnt, 0);
+ vnic_mcast_root_init(&login->mcast_tree);
+ mutex_init(&login->moder_lock);
+ mutex_init(&login->state_lock);
+ SET_NETDEV_DEV(login->dev, login->port->dev->ca->dma_device);
+ INIT_DELAYED_WORK(&login->stats_task, vnic_do_get_stats);
+ INIT_DELAYED_WORK(&login->mcast_task, vnic_mcast_reattach);
+ INIT_DELAYED_WORK(&login->restart_task, vnic_restart_task);
+
+ vnic_set_ethtool_ops(dev);
+ /* init ethtool */
+ dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+ dev->hw_features |= NETIF_F_RXCSUM | NETIF_F_RXHASH;
+ dev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6;
+ dev->features |= dev->hw_features;
+
+ /* init NAPI (must be before LRO init) */
+ login->napi_num = login->rx_rings_num;
+ for (i = 0; i < login->napi_num; ++i) {
+ if (vnic_napi_alloc(login, i)) {
+ vnic_err(login->name, "NAPI alloc %d failed\n", i);
+ goto free_napi;
+ }
+ }
+
+#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)
+ login->dev->features |= NETIF_F_GRO;
+#elif defined(NETIF_F_LRO)
+ login->lro_num = vnic_lro_num;
+ login->lro_mng_num = vnic_lro_num ? login->rx_rings_num : 0;
+ login->dev->features |= vnic_lro_num ? NETIF_F_LRO : 0;
+#endif
+ for (i = 0; i < login->lro_mng_num; ++i) {
+ if (vnic_lro_enable(login, i)) {
+ vnic_err(login->name, "vnic_lro_enable %d failed\n", i);
+ goto free_lro;
+ }
+ }
+
+ return dev;
+
+free_lro:
+ for (--i; i >= 0; --i)
+ vnic_lro_disable(login, i);
+
+ i = login->napi_num;
+free_napi:
+ for (--i; i >= 0; --i)
+ vnic_napi_dealloc(login, i);
+free_login:
+ vfree(login);
+free_netdev:
+ free_netdev(dev);
+err:
+ return ERR_PTR(-ENODEV);
+}
+
+void vnic_free_netdev(struct vnic_login *login)
+{
+ int i;
+
+ vnic_dbg_func(login->name);
+
+ for (i = 0; i < login->lro_mng_num; ++i)
+ vnic_lro_disable(login, i);
+ for (i = 0; i < login->napi_num; ++i)
+ vnic_napi_dealloc(login, i);
+ flush_workqueue(login->neigh_wq);
+ destroy_workqueue(login->neigh_wq);
+ free_netdev(login->dev);
+ vfree(login);
+}
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+static inline void free_single_frag(struct vnic_rx_ring *ring, int e, int i)
+{
+ ib_dma_unmap_single(ring->port->dev->ca,
+ ring->rx_info[e].dma_addr[i],
+ ring->frag_info[i].frag_size,
+ PCI_DMA_FROMDEVICE);
+ ring->rx_info[e].dma_addr[i] = 0;
+ put_page(ring->rx_info[e].frags[i].page.p);
+}
+
+#ifndef _BP_NETDEV_NO_TMQ
+/* This function is used only in no_bxm mode; it's not implemented in
+ * netdevice.h, so we provide it here, based on netif_tx_lock().
+ */
+static inline int vnic_netif_tx_trylock(struct net_device *dev)
+{
+ int i, cpu;
+
+ spin_lock(&dev->tx_global_lock);
+ cpu = smp_processor_id();
+ for (i = 0; i < dev->num_tx_queues; ++i) {
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+ if (__netif_tx_trylock(txq)) {
+ set_bit(__QUEUE_STATE_FROZEN, &txq->state);
+ __netif_tx_unlock(txq);
+ } else {
+ goto unlock;
+ }
+ }
+
+ return 1;
+
+unlock:
+ /* based on netif_tx_unlock() */
+ for (--i; i >= 0; --i) {
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+ clear_bit(__QUEUE_STATE_FROZEN, &txq->state);
+ if (!test_bit(QUEUE_STATE_ANY_XOFF, &txq->state))
+ __netif_schedule(txq->qdisc);
+ }
+ spin_unlock(&dev->tx_global_lock);
+
+ return 0;
+}
+#else
+#define vnic_netif_tx_trylock(dev) netif_tx_trylock(dev)
+#endif
+
+int vnic_rx(struct vnic_login *login, struct sk_buff *skb, struct ib_wc *wc)
+{
+ ASSERT(skb);
+ vnic_dbg_skb("RX", skb, (unsigned long)-1, (unsigned long)0);
+
+ if (no_bxm) {
+ /* In no_bxm mode, we update the neigh table based on ARP replies;
+ * the QPN & LID are retrieved from the IB completion.
+ * ATTENTION: in RSS mode, make sure that ARPs are
+ * sent on the base QPN
+ */
+ struct vnic_neigh *neighe;
+ struct ethhdr *eth_hdr = (struct ethhdr *)skb->data;
+ struct arphdr *arp_hdr = (struct arphdr *)(skb->data + ETH_HLEN);
+ u16 eth_proto = ntohs(eth_hdr->h_proto);
+ u16 arp_proto = ntohs(arp_hdr->ar_op);
+
+ if (eth_proto != ETH_P_ARP)
+ goto out;
+ if (arp_proto == ARPOP_REQUEST)
+ vnic_dbg_data(login->name, "ARP REQUEST\n");
+ else
+ vnic_dbg_data(login->name, "ARP REPLY\n");
+
+ /* don't stop the TX queue, only try to lock it; this way we avoid
+ * blocking IRQs in the TX flow (performance-wise).
+ * Other vnic_neighe_* functions are not called in parallel
+ * to this flow (in no_bxm mode)
+ */
+ if (!vnic_netif_tx_trylock(login->dev))
+ goto out;
+
+ neighe = vnic_neighe_search(login, eth_hdr->h_source);
+ if (!IS_ERR(neighe)) {
+ /* if IB address didn't change, do nothing */
+ if (neighe->qpn == wc->src_qp &&
+ neighe->lid == wc->slid)
+ goto unlock;
+ /* else, del old neigh entry, and add a new one */
+ vnic_neighe_del(login, neighe);
+ vnic_neighe_dealloc(neighe);
+ }
+
+ /* RSS: assume that your neighbours are like you */
+ neighe = vnic_neighe_alloc(login, eth_hdr->h_source,
+ wc->slid, wc->src_qp,
+ login->rx_rings_num > 1 ? 1 : 0);
+ if (IS_ERR(neighe))
+ goto unlock;
+ if (vnic_neighe_add(login, neighe))
+ vnic_neighe_dealloc(neighe);
+unlock:
+ netif_tx_unlock(login->dev);
+ }
+out:
+
+ /* shared_vnic may receive PACKET_OTHERHOST;
+ * we 'fix' the pkt_type here so the kernel
+ * won't drop it
+ */
+ if (skb->pkt_type == PACKET_OTHERHOST && login->shared_vnic)
+ skb->pkt_type = PACKET_HOST;
+
+ netif_receive_skb(skb);
+
+ return 0;
+
+}
+
+struct sk_buff *vnic_alloc_rx_skb(struct vnic_rx_ring *ring, int buf_ind,
+ gfp_t gfp_flag)
+{
+ struct ib_device *ca = ring->port->dev->ca;
+ struct sk_buff *skb;
+ u64 mapping;
+ int buf_size = VNIC_BUF_SIZE(ring->port);
+
+ skb = alloc_skb(buf_size, gfp_flag);
+ if (!skb) {
+ vnic_dbg_data(ring->port->name,
+ "alloc_skb for size %d failed\n", buf_size);
+ goto err_alloc;
+ }
+
+ mapping = ib_dma_map_single(ca, skb->data, buf_size, DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(ca, mapping))) {
+ vnic_dbg_data(ring->port->name,
+ "ib_dma_map_single len %d failed\n", buf_size);
+ goto err_map;
+ }
+
+ ring->rx_info[buf_ind].skb = skb;
+ ring->rx_info[buf_ind].dma_addr[0] = mapping;
+
+ return skb;
+
+err_map:
+ dev_kfree_skb_any(skb);
+err_alloc:
+ return NULL;
+}
+
+static int frag_sizes[] = {
+ FRAG_SZ0,
+ FRAG_SZ1,
+ FRAG_SZ2,
+ FRAG_SZ3
+};
+
+/* Calculate the last offset position that accommodates a full fragment
+ * (assuming fragment size = stride - align)
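+ *
+ * Illustrative example (the numbers are hypothetical, not the driver's
+ * actual constants): with VNIC_ALLOC_SIZE = 16384, stride = 2048 and
+ * align = 2, res = 16384 % 2048 = 0 and the returned last offset is
+ * 16384 - 2048 - 0 + 2 = 14338; past this offset a new page is opened.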
+ */
+static int vnic_last_alloc_offset(struct vnic_rx_ring *ring, u16 stride, u16 align)
+{
+ u16 res = VNIC_ALLOC_SIZE % stride;
+ u16 offset = VNIC_ALLOC_SIZE - stride - res + align;
+
+ vnic_dbg_data(ring->port->name, "calculated last offset for stride:%d align:%d "
+ "res:%d offset:%d\n", stride, align, res, offset);
+ return offset;
+}
+
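+
+/*
+ * Set up one high-order page allocator (VNIC_ALLOC_ORDER) per RX
+ * fragment slot; vnic_alloc_frag() carves fragments out of these pages
+ * and opens a new page once last_offset is passed. Not used in
+ * vnic_rx_linear mode.
+ */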
+static int vnic_init_allocator(struct vnic_rx_ring *ring)
+{
+ struct vnic_rx_alloc *page_alloc;
+ int i;
+
+ if (vnic_rx_linear)
+ return 0;
+
+ for (i = 0; i < ring->num_frags; i++) {
+ page_alloc = &ring->page_alloc[i];
+ page_alloc->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, VNIC_ALLOC_ORDER);
+ if (!page_alloc->page)
+ goto out;
+
+ page_alloc->offset = ring->frag_info[i].frag_align;
+ vnic_dbg_data(ring->port->name, "Initialized allocator:%d with page:%p\n",
+ i, page_alloc->page);
+ }
+ return 0;
+
+out:
+ while (i--) {
+ page_alloc = &ring->page_alloc[i];
+ if (page_alloc->page) {
+ put_page(page_alloc->page);
+ page_alloc->page = NULL;
+ }
+ }
+ return -ENOMEM;
+}
+
+static void vnic_destroy_allocator(struct vnic_rx_ring *ring)
+{
+ struct vnic_rx_alloc *page_alloc;
+ int i;
+
+ if (vnic_rx_linear)
+ return;
+
+ for (i = 0; i < ring->num_frags; i++) {
+ page_alloc = &ring->page_alloc[i];
+ vnic_dbg_data(ring->port->name, "Freeing allocator:%d count:%d\n",
+ i, page_count(page_alloc->page));
+ if (page_alloc->page) {
+ put_page(page_alloc->page);
+ page_alloc->page = NULL;
+ }
+ }
+}
+
+/*
+ * allocate a single fragment on a single ring entry and map it
+ * to HW address.
+ */
+static int vnic_alloc_frag(struct vnic_rx_ring *ring,
+ struct vnic_frag_data *frags_data, int i)
+{
+ struct vnic_frag_info *frag_info = &ring->frag_info[i];
+ struct vnic_rx_alloc *page_alloc = &ring->page_alloc[i];
+ struct skb_frag_struct *skb_frags = &frags_data->frags[i];
+ struct skb_frag_struct skbf = *skb_frags;
+ struct page *page;
+ struct ib_device *ib_device = ring->port->dev->ca;
+ u64 dma;
+ int decision;
+
+ if (vnic_rx_linear)
+ return 0;
+
+ if (page_alloc->offset >= frag_info->last_offset) {
+ decision = 0;
+ /* Allocate new page */
+ page = alloc_pages(GFP_ATOMIC | __GFP_COMP, VNIC_ALLOC_ORDER);
+ if (!page) {
+ /*frags_data->dma_addr[i] = NULL;
+ ring->rx_info[wr_id].info = VNIC_FRAG_ALLOC_FAIL;
+ ring->need_refill = 1; */
+ return -ENOMEM;
+ }
+ skbf.page.p = page_alloc->page;
+ skbf.page_offset = page_alloc->offset;
+ } else {
+ decision = 1;
+ page = page_alloc->page;
+ get_page(page);
+ skbf.page.p = page;
+ skbf.page_offset = page_alloc->offset;
+ }
+
+ skbf.size = frag_info->frag_size;
+ dma = ib_dma_map_single(ib_device, page_address(skbf.page.p) +
+ skbf.page_offset, frag_info->frag_size,
+ PCI_DMA_FROMDEVICE);
+ if (unlikely(ib_dma_mapping_error(ib_device, dma))) {
+ vnic_dbg_data(ring->port->name,
+ "ib_dma_map_single len %d failed\n",
+ frag_info->frag_size);
+ put_page(page);
+ return -ENOMEM;
+ }
+
+ if (!decision) {
+ page_alloc->page = page;
+ page_alloc->offset = frag_info->frag_align;
+ } else
+ page_alloc->offset += frag_info->frag_stride;
+
+ *skb_frags = skbf;
+ frags_data->dma_addr[i] = dma;
+
+ return 0;
+}
+
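+
+/*
+ * Build the RX scatter-list layout for a ring: split the effective
+ * buffer size (VNIC_BUF_SIZE) across fragments of at most FRAG_SZ0..3
+ * bytes. The first fragment is padded by NET_IP_ALIGN and every
+ * fragment stride is rounded up to a cache line; last_offset marks the
+ * point in the allocator page where a full fragment no longer fits.
+ * In vnic_rx_linear mode a single linear buffer is used instead.
+ */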
+void vnic_calc_rx_buf(struct vnic_rx_ring *ring)
+{
+ int eff_mtu = VNIC_BUF_SIZE(ring->port), buf_size = 0, i = 0;
+
+ if (vnic_rx_linear) {
+ ring->num_frags = 1;
+ return;
+ }
+
+ while (buf_size < eff_mtu) {
+ ring->frag_info[i].frag_size =
+ (eff_mtu > buf_size + frag_sizes[i]) ?
+ frag_sizes[i] : eff_mtu - buf_size;
+ ring->frag_info[i].frag_prefix_size = buf_size;
+ if (!i) {
+ ring->frag_info[i].frag_align = NET_IP_ALIGN;
+ ring->frag_info[i].frag_stride =
+ ALIGN(frag_sizes[i] + NET_IP_ALIGN, SMP_CACHE_BYTES);
+ } else {
+ ring->frag_info[i].frag_align = 0;
+ ring->frag_info[i].frag_stride =
+ ALIGN(frag_sizes[i], SMP_CACHE_BYTES);
+ }
+ ring->frag_info[i].last_offset =
+ vnic_last_alloc_offset(ring,
+ ring->frag_info[i].frag_stride,
+ ring->frag_info[i].frag_align);
+ buf_size += ring->frag_info[i].frag_size;
+ i++;
+ }
+
+ ring->num_frags = i;
+ ring->rx_skb_size = eff_mtu;
+ ring->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct skb_frag_struct));
+
+ vnic_dbg(ring->port->name, "Rx buffer scatter-list (ring %d effective-mtu:%d "
+ "num_frags:%d):\n", ring->index ,eff_mtu, ring->num_frags);
+ for (i = 0; i < ring->num_frags; i++) {
+ vnic_dbg(ring->port->name, "frag:%d - size:%d prefix:%d align:%d "
+ "stride:%d last_offset:%d\n", i,
+ ring->frag_info[i].frag_size,
+ ring->frag_info[i].frag_prefix_size,
+ ring->frag_info[i].frag_align,
+ ring->frag_info[i].frag_stride,
+ ring->frag_info[i].last_offset);
+ }
+}
+
+static void vnic_empty_rx_entry(struct vnic_rx_ring *ring, int i)
+{
+ int frag_num, buf_size = VNIC_BUF_SIZE(ring->port);
+ struct ib_device *ca = ring->port->dev->ca;
+ struct sk_buff *skb;
+ u64 mapping;
+
+ if (vnic_rx_linear) {
+ for (frag_num = 0; frag_num < ring->num_frags; frag_num++) {
+ mapping = ring->rx_info[i].dma_addr[0];
+ skb = ring->rx_info[i].skb;
+ if (mapping)
+ ib_dma_unmap_single(ca, mapping, buf_size, DMA_FROM_DEVICE);
+ if (skb)
+ dev_kfree_skb_any(skb);
+ }
+
+ return;
+ }
+
+ /* non linear buffers */
+ for (frag_num = 0; frag_num < ring->num_frags; frag_num++)
+ free_single_frag(ring, i, frag_num);
+}
+
+static int vnic_fill_rx_buffer(struct vnic_rx_ring *ring)
+{
+ struct vnic_frag_data *frags_data = &ring->rx_info[0];
+ struct sk_buff *skb;
+ struct ib_device *ca = ring->port->dev->ca;
+ int buf_ind, frag_num, buf_size = VNIC_BUF_SIZE(ring->port);
+ u64 mapping;
+
+ if (vnic_rx_linear) {
+ for (buf_ind = 0; buf_ind < ring->size; buf_ind++) {
+ skb = vnic_alloc_rx_skb(ring, buf_ind, GFP_KERNEL);
+ if (!skb)
+ goto err_linear;
+ }
+
+ return 0;
+ }
+
+ /* non linear buffers */
+ for (buf_ind = 0; buf_ind < ring->size; buf_ind++, frags_data++) {
+ for (frag_num = 0; frag_num < ring->num_frags; frag_num++) {
+ if (vnic_alloc_frag(ring, frags_data, frag_num))
+ goto err_frags;
+ }
+ }
+
+ return 0;
+
+err_linear:
+ for (buf_ind = 0; buf_ind < ring->size; buf_ind++) {
+ mapping = ring->rx_info[buf_ind].dma_addr[0];
+ skb = ring->rx_info[buf_ind].skb;
+ if (mapping)
+ ib_dma_unmap_single(ca, mapping, buf_size, DMA_FROM_DEVICE);
+ if (skb)
+ dev_kfree_skb_any(skb);
+ }
+
+ return -ENOMEM;
+
+err_frags:
+ for (--frag_num; frag_num >= 0; frag_num--)
+ free_single_frag(ring, buf_ind, frag_num);
+
+ for (--buf_ind; buf_ind >= 0; buf_ind--)
+ vnic_empty_rx_entry(ring, buf_ind);
+
+ return -ENOMEM;
+}
+
+/*
+ * free entire ring full of fragments.
+ */
+static void vnic_empty_rx_buffer(struct vnic_rx_ring *ring)
+{
+ int buf_ind;
+
+ for (buf_ind = 0; buf_ind < ring->size; buf_ind++)
+ vnic_empty_rx_entry(ring, buf_ind);
+
+ ring->size = 0;
+}
+
+void vnic_destroy_rx_ring(struct vnic_rx_ring *ring)
+{
+ if (!ring)
+ return;
+ vnic_empty_rx_buffer(ring);
+ vnic_destroy_allocator(ring);
+ vfree(ring->rx_info);
+ vnic_ib_free_ring(ring);
+ kfree(ring);
+}
+
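+
+/*
+ * Detach the fragments holding a received packet (up to 'length' bytes)
+ * into skb_frags_rx, allocate replacement fragments for the HW
+ * descriptor and only then DMA-unmap the old ones. Returns the number
+ * of fragments used, or 0 if a replacement allocation failed, in which
+ * case the packet is dropped and the remaining fragments are reused.
+ */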
+int vnic_unmap_and_replace_rx(struct vnic_rx_ring *ring, struct ib_device *dev,
+ struct skb_frag_struct *skb_frags_rx,
+ u64 wr_id, int length)
+{
+ struct vnic_frag_info *frag_info;
+ struct vnic_frag_data *rx_info = &ring->rx_info[wr_id];
+
+ int nr;
+ dma_addr_t dma;
+
+ /* Collect used fragments while replacing them in the HW descriptors */
+ for (nr = 0; nr < ring->num_frags; nr++) {
+ frag_info = &ring->frag_info[nr];
+ if (length <= frag_info->frag_prefix_size)
+ break;
+
+ /* Save page reference in skb */
+ skb_frags_rx[nr].page = rx_info->frags[nr].page;
+ skb_frags_rx[nr].size = rx_info->frags[nr].size;
+ skb_frags_rx[nr].page_offset = rx_info->frags[nr].page_offset;
+ dma = rx_info->dma_addr[nr];
+
+ /* Allocate a replacement page */
+ if (vnic_alloc_frag(ring, rx_info, nr))
+ goto fail;
+
+ /* Unmap buffer */
+ ib_dma_unmap_single(dev, dma, skb_frags_rx[nr].size,
+ PCI_DMA_FROMDEVICE);
+ }
+
+ /* Adjust size of last fragment to match actual length */
+ if (nr > 0)
+ skb_frags_rx[nr - 1].size = length -
+ ring->frag_info[nr - 1].frag_prefix_size;
+ return nr;
+
+fail:
+ /* Drop all accumulated fragments (which have already been replaced in
+ * the descriptor) of this packet; remaining fragments are reused... */
+ while (nr > 0) {
+ nr--;
+ put_page(skb_frags_rx[nr].page.p);
+ }
+
+ return 0;
+}
+
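+
+/*
+ * Build an skb for a completed RX WQE. Small packets (when
+ * vnic_linear_small_pkt is set) are copied into a freshly allocated
+ * linear skb; larger packets get their headers copied and the ring
+ * fragments attached as skb frags via vnic_unmap_and_replace_rx().
+ * The EoIB header and VLAN offset are stripped before the skb is
+ * handed to vnic_rx().
+ */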
+int vnic_rx_skb(struct vnic_login *login, struct vnic_rx_ring *ring,
+ struct ib_wc *wc, int ip_summed, char *eth_hdr_va)
+{
+ u64 wr_id = (unsigned int)wc->wr_id;
+ struct sk_buff *skb;
+ int used_frags;
+ char *va = eth_hdr_va;
+ int length = wc->byte_len - VNIC_EOIB_HDR_SIZE - VNIC_VLAN_OFFSET(login),
+ linear_length = (length <= SMALL_PACKET_SIZE) ?
+ length : SMALL_PACKET_SIZE, hdr_len = min(length, HEADER_COPY_SIZE),
+ offest = NET_IP_ALIGN + 16;
+ struct ib_device *ib_dev = login->port->dev->ca;
+
+ /* alloc a small linear SKB */
+ skb = alloc_skb(linear_length + offest, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return -ENOMEM;
+
+ skb_record_rx_queue(skb, ring->index);
+ skb_reserve(skb, offest);
+
+ if (vnic_linear_small_pkt && length <= SMALL_PACKET_SIZE) {
+ u64 dma;
+
+ /* We are copying all relevant data to the skb - temporarily
+ * synch buffers for the copy
+ */
+ dma = ring->rx_info[wr_id].dma_addr[0] + VNIC_EOIB_HDR_SIZE +
+ VNIC_VLAN_OFFSET(login);
+ ib_dma_sync_single_for_cpu(ib_dev, dma, length,
+ DMA_FROM_DEVICE);
+ skb_copy_to_linear_data(skb, va, length);
+ ib_dma_sync_single_for_device(ib_dev, dma, length,
+ DMA_FROM_DEVICE);
+ skb->tail += length;
+ } else {
+ /* unmap the needed fragments and reallocate them. Fragments that
+ * were not used will not be reused as is. */
+ used_frags = vnic_unmap_and_replace_rx(ring, ib_dev,
+ skb_shinfo(skb)->frags,
+ wr_id, wc->byte_len);
+ if (!used_frags)
+ goto free_and_repost;
+
+ skb_shinfo(skb)->nr_frags = used_frags;
+
+ /* Copy headers into the skb linear buffer */
+ memcpy(skb->data, va, hdr_len);
+ skb->tail += hdr_len;
+ /* Skip headers in first fragment */
+ skb_shinfo(skb)->frags[0].page_offset +=
+ (VNIC_EOIB_HDR_SIZE + VNIC_VLAN_OFFSET(login) +
+ hdr_len);
+
+ /* Adjust size of first fragment */
+ skb_shinfo(skb)->frags[0].size -=
+ (VNIC_EOIB_HDR_SIZE + VNIC_VLAN_OFFSET(login) +
+ hdr_len);
+ skb->data_len = length - hdr_len;
+ }
+
+ /* update skb fields */
+ skb->len = length;
+ skb->truesize = length + sizeof(struct sk_buff);
+ skb->ip_summed = ip_summed;
+ skb->dev = login->dev;
+ skb->protocol = eth_type_trans(skb, skb->dev);
+
+ return vnic_rx(login, skb, wc);
+
+free_and_repost:
+ dev_kfree_skb(skb);
+ return -ENODEV;
+
+}
+
+static void vnic_set_rx_sge(struct vnic_rx_ring *ring)
+{
+ int i;
+
+ ring->wr.num_sge = ring->num_frags;
+ ring->wr.next = NULL;
+ ring->wr.sg_list = ring->sge;
+ for (i = 0; i < ring->num_frags; ++i) {
+ ring->sge[i].lkey = ring->port->mr->lkey;
+ ring->sge[i].length = ring->frag_info[i].frag_size;
+ }
+}
+
+struct vnic_rx_ring *vnic_create_rx_ring(struct vnic_port *port, int index)
+{
+ int rc, rx_info, size = vnic_rx_rings_len;
+ struct vnic_rx_ring *ring;
+
+ ring = kzalloc(sizeof *ring, GFP_KERNEL);
+ if (!ring)
+ return ERR_PTR(-ENOMEM);
+
+ /* init attributes */
+ ring->port = port;
+ ring->size = size;
+ ring->index = index;
+ spin_lock_init(&ring->lock);
+
+ /* init rx ring IB resources */
+ if (vnic_ib_init_ring(ring)) {
+ vnic_err(port->name, "vnic_ib_init_ring failed\n");
+ goto free_ring;
+ }
+
+ rx_info = size * roundup_pow_of_two(sizeof(struct vnic_frag_data));
+ ring->rx_info = vmalloc(rx_info);
+ if (!ring->rx_info) {
+ vnic_err(port->name, "Failed allocating rx_info ring"
+ " (%d bytes)\n", rx_info);
+ goto free_ib;
+ }
+ memset(ring->rx_info, 0, rx_info);
+
+ /* determine the sizes of the fragments as result of mtu */
+ vnic_calc_rx_buf(ring);
+
+ rc = vnic_init_allocator(ring);
+ if (rc) {
+ vnic_err(port->name, "Failed initializing ring"
+ " allocator %d\n", rc);
+ goto free_rxinfo;
+ }
+
+ rc = vnic_fill_rx_buffer(ring);
+ if (rc) {
+ vnic_err(port->name, "vnic_fill_rx_buffer failed %d\n", rc);
+ goto free_allocator;
+ }
+
+ /* set rx WQEs drafts */
+ vnic_set_rx_sge(ring);
+
+ /* Initialize all descriptors and post to the SRQ */
+ rc = vnic_post_recvs(ring);
+ if (rc) {
+ vnic_err(port->name, "vnic_post_recvs failed %d\n", rc);
+ goto free_rx_buffer;
+ }
+
+ return ring;
+
+free_rx_buffer:
+ /* TODO: we are freeing posted packets; need to move the SRQ
+ * to the error state and free them first
+ */
+ vnic_empty_rx_buffer(ring);
+free_allocator:
+ vnic_destroy_allocator(ring);
+free_rxinfo:
+ vfree(ring->rx_info);
+free_ib:
+ vnic_ib_free_ring(ring);
+free_ring:
+ kfree(ring);
+
+ return ERR_PTR(-EINVAL);
+}
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+static int vnic_ucast_send(struct vnic_login *login, struct sk_buff *skb,
+ struct neighbour *neighbour, int tx_res_index);
+/* Push the VLAN & EoIB headers and calculate the RSS hash value.
+ * We do the RSS hash in the same place because the encapsulation code
+ * already inspects the IP|TCP|UDP fields for the EoIB header, so we
+ * make use of that and do RSS too.
+ */
+static struct eoibhdr eoib_h_draft = {
+ .encap_data = ((VNIC_EOIB_HDR_VER << 4) | (VNIC_EOIB_HDR_SIG << 6)),
+ .seg_off = 0,
+ .seg_id = 0
+};
+
+void vnic_learn_mac(struct net_device *dev, u8 *mac, int remove)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+
+ vnic_dbg_func(login->name);
+
+ /* skip invalid address */
+ if (unlikely(!is_valid_ether_addr(mac)))
+ return;
+
+ /* skip parent vNic address (original dev_addr) */
+ if (!(memcmp(login->dev_addr, mac, ETH_ALEN)))
+ return;
+
+ vnic_dbg_mac(login->name, "learn mac "MAC_6_PRINT_FMT"\n",
+ MAC_6_PRINT_ARG(mac));
+
+ /* update child vNic list, ignore returned code */
+ read_lock_bh(&login->mac_rwlock);
+ vnic_child_update(login, mac, remove);
+ read_unlock_bh(&login->mac_rwlock);
+}
+
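+
+/*
+ * Compute a simple XOR-based TSS/RSS hash for an outgoing skb.
+ * Multicast frames always hash to 0 (only the first QP is attached to
+ * the MGIDs); for non-fragmented IPv4/IPv6 TCP or UDP packets the hash
+ * XORs the IP addresses and L4 ports. The result is stored in the skb
+ * (VNIC_SKB_SET_HASH) and later used to pick the TX ring.
+ */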
+u32 vnic_hash(struct net_device *dev, struct sk_buff *skb)
+{
+ struct tcphdr *tr_h = tcp_hdr(skb);
+ struct iphdr *ip_h = ip_hdr(skb);
+ struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h;
+ u32 hash = 0, addrlen, i;
+
+ /* All mcast traffic is sent and received on 1st queue
+ * because only the 1st QP is attached to the MGIDs
+ * TODO: consider distributing tx/rx mcast traffic as well
+ */
+ if (is_multicast_ether_addr(skb_mac_header(skb)))
+ goto out;
+
+ switch (ntohs(skb->protocol)) {
+ case ETH_P_IP:
+ /* In IPv4, access TCP/UDP header only when IP packet is not
+ * fragmented: flags == DF == 0x02.
+ */
+ if (ntohs(ip_h->frag_off) >> 13 == 0x2 &&
+ (ip_h->protocol == IPPROTO_TCP ||
+ ip_h->protocol == IPPROTO_UDP)) {
+ hash ^= (u32)ntohl(ip_h->saddr);
+ hash ^= (u32)ntohl(ip_h->daddr);
+ hash ^= (u32)ntohs(tr_h->source);
+ hash ^= (u32)ntohs(tr_h->dest);
+ }
+ break;
+ case ETH_P_IPV6:
+ /* In IPv6, access TCP/UDP header only when IP packet is not
+ * fragmented: main header nexthdr field points to TCP/UDP
+ */
+ if (ip_h6->nexthdr == IPPROTO_TCP ||
+ ip_h6->nexthdr == IPPROTO_UDP) {
+ addrlen = ARRAY_LEN(ip_h6->saddr.in6_u.u6_addr32);
+ for (i = 0; i < addrlen; ++i) {
+ hash ^= (u32)ntohl(ip_h6->saddr.in6_u.u6_addr32[i]);
+ hash ^= (u32)ntohl(ip_h6->daddr.in6_u.u6_addr32[i]);
+ }
+ tr_h = (struct tcphdr *)((void *)ip_h6 + sizeof *ip_h6);
+ hash ^= (u32)ntohs(tr_h->source);
+ hash ^= (u32)ntohs(tr_h->dest);
+ }
+ }
+out:
+ VNIC_SKB_SET_HASH(skb, hash);
+ return hash;
+}
+
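+
+/*
+ * Compute the LAG hash used to pick a gateway eport member. hash_mask
+ * (the GW_LAG_HASH_* bits advertised by the gateway) selects which
+ * fields participate: DMAC, SMAC, ethertype, VLAN and, when
+ * GW_LAG_LAYER_2_3 is set, the IP addresses and L4 ports. The 32-bit
+ * XOR is then folded into a single byte.
+ */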
+u8 vnic_lag_hash(struct sk_buff *skb, u16 hash_mask, u16 vid)
+{
+ struct tcphdr *tr_h = tcp_hdr(skb);
+ struct iphdr *ip_h = ip_hdr(skb);
+ struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h;
+ u32 hash = 0, addrlen, i;
+ struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb);
+ u32 hash_dmac, hash_smac, hash_prot, hash_vid;
+ u32 hash_sip = 0, hash_dip = 0, hash_sp = 0, hash_dp = 0;
+ u8 res_hash;
+ u8 *tmp;
+
+ hash_dmac = *(u32 *)(&eth->h_dest[ETH_ALEN - sizeof hash_smac]);
+ hash_smac = *(u32 *)(&eth->h_source[ETH_ALEN - sizeof hash_smac]);
+ hash_prot = (u32)ntohs(skb->protocol);
+ hash_vid = (u32)vid;
+
+ if (hash_mask & GW_LAG_LAYER_2_3) {
+ switch (hash_prot) {
+ case ETH_P_IP:
+ /* In IPv4, access TCP/UDP header only when IP packet is not
+ * fragmented: flags == DF == 0x02.
+ */
+ if (ntohs(ip_h->frag_off) >> 13 == 0x2 &&
+ (ip_h->protocol == IPPROTO_TCP ||
+ ip_h->protocol == IPPROTO_UDP)) {
+ hash_sip = (u32)(ip_h->saddr);
+ hash_dip = (u32)(ip_h->daddr);
+ hash_sp = (u32)(tr_h->source);
+ hash_dp = (u32)(tr_h->dest);
+ }
+ break;
+ case ETH_P_IPV6:
+ /* In IPv6, access TCP/UDP header only when IP packet is not
+ * fragmented: main header nexthdr field points to TCP/UDP
+ */
+ if (ip_h6->nexthdr == IPPROTO_TCP ||
+ ip_h6->nexthdr == IPPROTO_UDP) {
+ addrlen = ARRAY_LEN(ip_h6->saddr.in6_u.u6_addr32);
+ for (i = 0; i < addrlen; ++i) {
+ hash_sip ^= (u32)(ip_h6->saddr.in6_u.u6_addr32[i]);
+ hash_dip ^= (u32)(ip_h6->daddr.in6_u.u6_addr32[i]);
+ }
+ tr_h = (struct tcphdr *)((void *)ip_h6 + sizeof *ip_h6);
+ hash_sp = (u32)(tr_h->source);
+ hash_dp = (u32)(tr_h->dest);
+ }
+ }
+ }
+
+ hash ^= (hash_mask & GW_LAG_HASH_DMAC) ? hash_dmac : 0;
+ hash ^= (hash_mask & GW_LAG_HASH_SMAC) ? hash_smac : 0;
+ hash ^= (hash_mask & GW_LAG_HASH_TPID) ? hash_prot : 0;
+ hash ^= (hash_mask & GW_LAG_HASH_VID) ? hash_vid : 0;
+ hash ^= (hash_mask & GW_LAG_HASH_SIP) ? hash_sip : 0;
+ hash ^= (hash_mask & GW_LAG_HASH_DIP) ? hash_dip : 0;
+ hash ^= (hash_mask & GW_LAG_HASH_SPORT) ? hash_sp : 0;
+ hash ^= (hash_mask & GW_LAG_HASH_DPORT) ? hash_dp : 0;
+
+ tmp = (u8 *)&hash;
+ res_hash = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3];
+
+ return res_hash;
+}
+
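+
+/*
+ * Prepare an outgoing frame: push a VLAN tag when VLAN is enabled and a
+ * VID is set, push the EoIB header when vnic_encap_headroom is set,
+ * reset the MAC header pointer, optionally enforce the source MAC, and
+ * mark the IP/TCP/UDP checksum-ok bits in the EoIB header.
+ */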
+static inline int vnic_header_encap(struct sk_buff *skb)
+{
+ struct vnic_login *login = vnic_netdev_priv(skb->dev);
+ struct eoibhdr *eoib_h;
+ struct iphdr *ip_h = ip_hdr(skb);
+ struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h;
+
+ /* push VLAN header
+ * TODO: when VID is zero, push header only when prio exists, i.e.:
+ * if (VNIC_VLAN_ENABLED(login) && (login->vid || login->user_prio))
+ */
+ if (VNIC_VLAN_ENABLED(login) && login->vid) {
+ struct vlan_ethhdr *veth =
+ (struct vlan_ethhdr *)skb_push(skb, VLAN_HLEN);
+ ASSERT(veth);
+ vnic_dbg_data_v(login->name, "push vlan tag with ID %u\n",
+ be16_to_cpu(login->vid));
+ memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN);
+ veth->h_vlan_proto = htons(ETH_P_8021Q);
+ veth->h_vlan_TCI = login->vid;
+ }
+
+ /* push EoIB header */
+ if (vnic_encap_headroom)
+ skb_push(skb, VNIC_ENCAP_LEN);
+
+ /* reset MAC header here, it can be changed for the following reasons:
+ * - vnic_encap_headroom is set, thus EoIB header is pushed
+ * - VLAN is enabled, thus VLAN header is pushed
+ * - some kernels (e.g., 2.6.18-194.el5) call dev_hard_start_xmit()
+ * without setting the mac header pointer
+ */
+ skb_set_mac_header(skb, VNIC_SKB_GET_ENCAP_OFFSET);
+
+ /* enforce source MAC */
+ if (vnic_src_mac_enforce)
+ memcpy(skb_mac_header(skb) + ETH_ALEN,
+ login->dev->dev_addr, ETH_ALEN);
+
+ /* set EoIB header VER/SIG, others set to zero */
+ eoib_h = VNIC_SKB_GET_ENCAP(skb);
+ *eoib_h = eoib_h_draft;
+
+ /* set EoIB header IP_CHK */
+ switch (ntohs(skb->protocol)) {
+ case ETH_P_IP:
+ VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_h);
+ if (ip_h->protocol == IPPROTO_TCP)
+ VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_h);
+ else if (ip_h->protocol == IPPROTO_UDP)
+ VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_h);
+ break;
+ case ETH_P_IPV6:
+ VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_h);
+ if (ip_h6->nexthdr == IPPROTO_TCP)
+ VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_h);
+ else if (ip_h6->nexthdr == IPPROTO_UDP)
+ VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_h);
+ }
+
+#ifdef _BP_NETDEV_NO_TMQ
+ /* if TSS is enabled, use the hash value calculated by
+ * vnic_select_queue() otherwise call vnic_hash()
+ */
+ vnic_hash(skb->dev, skb);
+#endif
+
+ return 0;
+}
+
+static void vnic_neigh_path_query_complete(int status,
+ struct ib_sa_path_rec *pathrec,
+ void *context)
+{
+ struct vnic_neigh *neigh = context;
+ struct ib_ah *old_ah, *new_ah;
+ struct net_device *dev = neigh->login->dev;
+ struct sk_buff_head skqueue;
+ struct vnic_login *login = neigh->login;
+
+ if (status) {
+ vnic_dbg_data(neigh->login->name, "neigh %d "MAC_6_PRINT_FMT" path query complete FAILED\n",
+ neigh->lid, MAC_6_PRINT_ARG(neigh->mac));
+ goto drop_pkts;
+ } else {
+ struct ib_ah_attr av;
+ struct sk_buff *skb;
+ vnic_dbg_data(login->name, "neigh %d "MAC_6_PRINT_FMT" path query complete sucess SL=%d\n",
+ neigh->lid, MAC_6_PRINT_ARG(neigh->mac), pathrec->sl);
+ if(ib_init_ah_from_path(login->port->dev->ca, login->port->num, pathrec, &av)){
+ vnic_warn(login->name, "ib_init_ah_from_path %d "MAC_6_PRINT_FMT" failed!\n",
+ neigh->lid, MAC_6_PRINT_ARG(neigh->mac));
+ goto drop_pkts;
+ }
+
+ old_ah = neigh->ah;
+ new_ah = ib_create_ah(login->port->pd, &av);
+ if (IS_ERR(new_ah) || !new_ah) {
+ vnic_warn(login->name, "ib_create_ah %d "MAC_6_PRINT_FMT" failed!\n",
+ neigh->lid, MAC_6_PRINT_ARG(neigh->mac));
+
+ goto drop_pkts;
+ }
+
+ neigh->sl = pathrec->sl;
+ skb_queue_head_init(&skqueue);
+ netif_tx_lock_bh(login->dev);
+ neigh->ah = new_ah;
+ neigh->valid = 1;
+ neigh->query_id = -1;
+ while ((skb = __skb_dequeue(&neigh->pkt_queue)))
+ __skb_queue_tail(&skqueue, skb);
+ netif_tx_unlock_bh(login->dev);
+
+ /* retransmit all pending packets */
+ while ((skb = __skb_dequeue(&skqueue))) {
+ /* reset skb headers */
+ /* TODO ALL VLAN ?? */
+ if (VNIC_VLAN_ENABLED(login) && login->vid)
+ skb_pull(skb, VLAN_HLEN);
+ if (vnic_encap_headroom)
+ skb_pull(skb, VNIC_ENCAP_LEN);
+
+ skb->dev = dev;
+ dev_queue_xmit(skb);
+ }
+
+ if (old_ah && !IS_ERR(old_ah))
+ ib_destroy_ah(old_ah);
+ }
+ complete(&neigh->query_comp);
+ return;
+
+drop_pkts:
+ netif_tx_lock_bh(dev);
+ neigh->query_id = -1; /* this will cause a retry */
+ while (!skb_queue_empty(&neigh->pkt_queue)) {
+ struct sk_buff *skb = skb_dequeue(&neigh->pkt_queue);
+ int tx_res_index;
+ struct vnic_tx_res *tx_res;
+ skb->dev = dev;
+ tx_res_index = VNIC_TXQ_GET_HASH(skb, login->real_tx_rings_num);
+ ASSERT(tx_res_index <= login->tx_rings_num);
+ tx_res = &login->tx_res[tx_res_index];
+ VNIC_STATS_DO_INC(tx_res->stats.tx_dropped);
+ dev_kfree_skb_any(skb);
+ }
+ netif_tx_unlock_bh(dev);
+ complete(&neigh->query_comp);
+}
+
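+
+/*
+ * Issue an asynchronous SA path record query (SLID/DLID/PKEY and the
+ * EoIB service id) for a neighbour. The completion handler above
+ * creates a new address handle with the returned SL and retransmits any
+ * packets queued while the query was in flight.
+ */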
+int vnic_neighe_path_query(struct vnic_neigh *neighe)
+{
+ ib_sa_comp_mask comp_mask;
+ struct ib_sa_path_rec p_rec;
+ u16 slid = neighe->login->port->attr.lid;
+ vnic_dbg_data(neighe->login->vnic_name,"neighe SL Query slid %d dlid %d dmac:"MAC_6_PRINT_FMT"\n",
+ slid, neighe->lid, MAC_6_PRINT_ARG(neighe->mac));
+
+ comp_mask = IB_SA_PATH_REC_SERVICE_ID |
+ IB_SA_PATH_REC_DLID |
+ IB_SA_PATH_REC_SLID |
+ IB_SA_PATH_REC_PKEY;
+
+ if (IS_NEIGH_QUERY_RUNNING(neighe))
+ ib_sa_cancel_query(neighe->query_id, neighe->pquery);
+
+ init_completion(&neighe->query_comp);
+ neighe->query_id = -1;
+ neighe->pquery = NULL;
+
+ p_rec.dlid = cpu_to_be16(neighe->lid);
+ p_rec.slid = cpu_to_be16(slid);
+ p_rec.service_id = cpu_to_be64(EOIB_SERVICE_ID);
+ p_rec.pkey = cpu_to_be16(neighe->login->pkey);
+
+ neighe->query_id = ib_sa_path_rec_get(&vnic_sa_client,
+ neighe->login->port->dev->ca,
+ neighe->login->port->num,
+ &p_rec,
+ comp_mask,
+ 1000/*TOUT*/,
+ GFP_ATOMIC,
+ vnic_neigh_path_query_complete,
+ neighe,
+ &neighe->pquery);
+ if (neighe->query_id < 0) {
+ vnic_dbg_data(neighe->login->vnic_name, "FAILED neigh SL Query slid %d dlid %d dmac:"MAC_6_PRINT_FMT"\n",
+ slid, neighe->lid, MAC_6_PRINT_ARG(neighe->mac));
+ complete(&neighe->query_comp);
+ }
+ return neighe->query_id;
+}
+
+static int vnic_ucast_send(struct vnic_login *login, struct sk_buff *skb,
+ struct neighbour *neighbour, int tx_res_index)
+{
+ struct vnic_neigh *neighe;
+ int hash;
+
+ neighe = vnic_neighe_search(login, skb_mac_header(skb));
+ if (IS_ERR(neighe)) {
+ vnic_dbg_data(login->name, "no dst_neigh and no vnic_neigh - "
+ "gw unicast packet\n");
+
+ /* for egress unicast traffic of a shared vnic,
+ * replace src mac by shared mac
+ */
+ if (login->shared_vnic)
+ memcpy(skb_mac_header(skb) + ETH_ALEN,
+ login->shared_mac, ETH_ALEN);
+
+ if (!login->is_lag)
+ neighe = login->gw_neigh;
+ else {
+ if (unlikely(!login->lag_member_active_count))
+ return -ENOENT;
+
+ /* use hash value precomputed and mapping to find LAG GW to send to */
+ hash = vnic_lag_hash(skb, login->lag_prop.hash_mask, login->vid);
+ hash = hash % LAG_MAP_TABLE_SIZE;
+ neighe = &login->lag_gw_neigh[login->lag_gw_map[hash]].neigh;
+ }
+
+ /* update GW statistics */
+ VNIC_STATS_ADD(login->port_stats.gw_tx_bytes, skb->len);
+ VNIC_STATS_INC(login->port_stats.gw_tx_packets);
+ } else {
+ vnic_dbg_data(login->name,
+ "no dst_neigh but vnic_neigh exists - "
+ "local unicast packet\n");
+ }
+
+ /* TODO: in VNIC_NEIGH_GET_DQPN use neigh qps_num instead of login */
+ vnic_dbg_data(login->name, "vnic_send to (base qpn 0x%06x) dqpn 0x%06x"
+ " dlid 0x%08x %s\n", neighe->qpn,
+ VNIC_NEIGH_GET_DQPN(skb, neighe), neighe->lid,
+ neighe == login->gw_neigh ? "[GW]" : "");
+
+ if (unlikely(vnic_sa_query && !neighe->valid)) {
+ /* query neigh ah*/
+ vnic_dbg_data(login->name, "AH is not %s, running path query: LID=%d mac="MAC_6_PRINT_FMT"\n",
+ !IS_ERR(neighe->ah) && neighe->ah ? "valid":"found",
+ neighe->lid, MAC_6_PRINT_ARG(neighe->mac));
+
+ if (!IS_NEIGH_QUERY_RUNNING(neighe))
+ vnic_neighe_path_query(neighe);
+
+ if (IS_ERR(neighe->ah) || !neighe->ah) {
+ /* AH is not ready yet, queue the pkt */
+ if (skb_queue_len(&neighe->pkt_queue) > VNIC_SKB_QUEUE_LEN || !IS_NEIGH_QUERY_RUNNING(neighe))
+ return 1; /* drop in case the queue is full or no query is currently running */
+ __skb_queue_tail(&neighe->pkt_queue, skb);
+ return 0;
+ }
+ /* if ah is initialized send anyway */
+ }
+ vnic_send(login, skb, neighe->ah, VNIC_NEIGH_GET_DQPN(skb, neighe), tx_res_index);
+ return 0;
+}
+
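+
+/*
+ * Send a multicast frame: look up the vHub multicast group (for the
+ * MLID, rate and SL), build the data MGID from the MGID prefix,
+ * destination MAC and VLAN/gateway id -- the gateway id is hashed over
+ * the active LAG members when applicable -- and post the frame to
+ * IB_MULTICAST_QPN with a temporary address handle.
+ */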
+void vnic_mcast_send(struct vnic_login *login, struct sk_buff *skb, int tx_res_index)
+{
+ struct vnic_mcast *mcaste;
+ union vhub_mgid mgid;
+ struct ethhdr *eth;
+ struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+ struct ib_ah_attr *av = &tx_res->mcast_av;
+ struct ib_ah *ah;
+ u16 gw_id;
+ int hash;
+
+ eth = (struct ethhdr *)skb_mac_header(skb);
+
+ /* for LAG GW, perform hashing on mcast address */
+ if (login->is_lag && login->lag_member_active_count) {
+ hash = vnic_lag_hash(skb, login->lag_prop.hash_mask, login->vid);
+ hash = hash % LAG_MAP_TABLE_SIZE;
+ gw_id = login->lag_gw_neigh[login->lag_gw_map[hash]].gw_id;
+ }
+ else
+ gw_id = login->gw_port_id;
+
+ /* retrieve the mlid */
+ vhub_mgid_create(login->mgid_prefix, ETH_ZERO_MAC, login->n_mac_mcgid,
+ CREATE_VHUB_ID(login->vid, gw_id),
+ VHUB_MGID_DATA, 0, &mgid);
+
+ spin_lock(&login->mcast_tree.mcast_rb_lock);
+ mcaste = vnic_mcast_search(&login->mcast_tree, &mgid.ib_gid);
+ if (unlikely(IS_ERR(mcaste) || !mcaste->ah)) {
+ vnic_dbg_data(login->name, "couldn't find mcaste for "
+ MAC_6_PRINT_FMT"\n",
+ MAC_6_PRINT_ARG(eth->h_dest));
+ spin_unlock(&login->mcast_tree.mcast_rb_lock);
+ goto drop;
+ }
+
+ spin_lock(&mcaste->lock);
+ vhub_mgid_create(login->mgid_prefix, eth->h_dest, login->n_mac_mcgid,
+ CREATE_VHUB_ID(login->vid, gw_id),
+ vnic_mgid_data_type, 0, &mgid);
+ vnic_dbg_mcast_v(login->name, "sending to ETH "MAC_6_PRINT_FMT"-> "
+ "GID "VNIC_GID_FMT" (mask %d bit)\n",
+ MAC_6_PRINT_ARG(eth->h_dest),
+ VNIC_GID_ARG(mgid.ib_gid),
+ login->n_mac_mcgid);
+
+ av->dlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid);
+ av->static_rate = mcaste->port_mcaste->rec.rate;
+ av->sl = mcaste->port_mcaste->rec.sl;
+ memcpy(&av->grh.dgid, mgid.ib_gid.raw, GID_LEN);
+
+ ah = ib_create_ah(login->port->pd, av);
+ spin_unlock(&mcaste->lock);
+ spin_unlock(&login->mcast_tree.mcast_rb_lock);
+
+ if (!ah || IS_ERR(ah))
+ goto drop;
+
+ vnic_send(login, skb, ah, IB_MULTICAST_QPN, tx_res_index);
+ ib_destroy_ah(ah);
+ /* used as a counter for multicast TX packets (not RX) */
+ VNIC_STATS_DO_INC(tx_res->stats.multicast);
+
+ return;
+
+drop:
+ VNIC_STATS_DO_INC(tx_res->stats.tx_dropped);
+ dev_kfree_skb_any(skb);
+}
+
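+
+/*
+ * Main transmit entry point: validate the skb (length, carrier), make
+ * sure there is enough headroom for the VLAN/EoIB encapsulation
+ * (reallocating the skb if needed), run vnic_header_encap(), optionally
+ * learn the source MAC in promiscuous mode, pick a TX ring by the TSS
+ * hash and hand the frame to vnic_mcast_send() or vnic_ucast_send().
+ */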
+int vnic_tx(struct sk_buff *skb, struct net_device *dev)
+{
+ struct vnic_login *login = vnic_netdev_priv(dev);
+ int tx_res_index = 0, headroom = dev->hard_header_len - ETH_HLEN;
+ struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+
+ ASSERT(dev);
+ ASSERT(skb);
+#ifdef VNIC_PROFILLNG
+ login->prof_arr[login->prof_arr_it].cnt++;
+ /* copy only fields for reporting, data buffer is invalid */
+ login->prof_arr[login->prof_arr_it].skb = *skb;
+ login->prof_arr[login->prof_arr_it].skb.data = NULL;
+ login->prof_arr[login->prof_arr_it].tstamp = current_kernel_time();
+ login->prof_arr[login->prof_arr_it].jiffies = jiffies;
+ login->prof_arr[login->prof_arr_it].nr_frags = skb_shinfo(skb)->nr_frags;
+ login->prof_arr_it = (login->prof_arr_it + 1) % VNIC_PROFILLNG_SKB_MAX;
+
+#endif
+
+ /* drop zero length skbs */
+ if (unlikely(!skb->len))
+ goto drop;
+
+ /* sometimes, vnic_tx is called before carrier is up FM #100882 */
+ if (unlikely(!test_bit(VNIC_STATE_NETDEV_CARRIER_ON, &login->netdev_state)))
+ goto drop;
+
+ /* check headroom and reallocate the skb if needed:
+ * - if VLAN is used: need VLAN_HLEN (4) bytes
+ * - if vnic_encap_headroom is set: need VNIC_ENCAP_LEN (4) bytes
+ * When vnic_encap_headroom is clear, we do not encap the EoIB header
+ * into the headroom, but rather use an additional SG entry to hold it
+ */
+
+ if (unlikely(skb_headroom(skb) < headroom)) {
+ struct sk_buff *skb_new;
+
+ skb_new = skb_realloc_headroom(skb, headroom);
+ if (!skb_new)
+ goto drop;
+
+ dev_kfree_skb(skb);
+ skb = skb_new;
+ VNIC_STATS_INC(login->port_stats.realloc_packets);
+ }
+ /* don't use dev->header_ops; use the vnic_header_encap() inline
+ * function instead, because when a raw socket is used or in BR_CTL
+ * mode, header_ops are not called as expected, and we'd end up sending
+ * the packet without the EoIB header
+ */
+ if (unlikely(vnic_header_encap(skb)))
+ goto drop;
+
+ /* in promiscuous mode, learn the source mac */
+ if (is_ucast_promisc(login) && vnic_learn_mac_enabled)
+ vnic_learn_mac(dev, skb_mac_header(skb) + ETH_ALEN, 0);
+
+ /* get TX resource for this SKB, keep it after vnic_header_encap()
+ * so if we don't have kernel multiple queue support we use the
+ * RSS hash result for TSS
+ */
+ tx_res_index = VNIC_TXQ_GET_HASH(skb, login->real_tx_rings_num);
+ ASSERT(tx_res_index <= login->tx_rings_num);
+ tx_res = &login->tx_res[tx_res_index];
+
+
+ /* send ucast/mcast packet */
+ vnic_dbg_skb("TX", skb, (unsigned long)(vnic_encap_headroom ? 0 : -1),
+ (unsigned long)(vnic_encap_headroom ? VNIC_ENCAP_LEN : 0));
+#if 0 /* neighbour caching disabled */
+ if (likely(skb->dst && skb->dst->neighbour)) {
+ if (is_multicast_ether_addr(skb_mac_header(skb))) {
+ vnic_dbg_data(login->name,
+ "dst_neigh exists but no vnic_neigh - "
+ "multicast packet\n");
+ vnic_mcast_send(login, skb, tx_res_index);
+ } else {
+ vnic_dbg_data(login->name,
+ "dst_neigh exists but no vnic_neigh - "
+ "unicast packet\n");
+ vnic_ucast_send(login, skb, skb->dst->neighbour, tx_res_index);
+ }
+ } else
+#endif
+ {
+ if (is_multicast_ether_addr(skb_mac_header(skb))) {
+ vnic_dbg_data(login->name,
+ "no dst_neigh - multicast packet\n");
+ vnic_mcast_send(login, skb, tx_res_index);
+ } else {
+ vnic_dbg_data(login->name,
+ "no dst_neigh - unicast packet\n");
+ if (unlikely(vnic_ucast_send(login, skb, NULL, tx_res_index)))
+ goto drop;
+ }
+ }
+
+ return NETDEV_TX_OK;
+
+drop:
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _VNIC_FIP_H
+#define _VNIC_FIP_H
+
+#include "vnic.h"
+
+
+#define FIP_TYPE(FIPT) FIP_TYPE_##FIPT
+#define FIP_TYPE_IDX(FIPT) FIP_TYPE_IDX_##FIPT
+
+#define FIP_CASE(FIPT) case FIP_TYPE(FIPT): return FIP_TYPE_IDX(FIPT)
+
+#define FIP_CASE_STR(FIPT) case FIP_TYPE(FIPT): return # FIPT
+#define FIP_SUBCODE_CASE_STR(SUBCODE) case (SUBCODE): return # SUBCODE
+
+#define FIP_MASK(FIPT) (((u64)1) << FIP_TYPE_IDX(FIPT))
+
+#define ADV_EXT_TYPE(FIPT) ADV_EXT_TYPE_##FIPT
+#define ADV_EXT_IDX(FIPT) ADV_EXT_IDX_##FIPT
+
+#define GUID_FORMAT "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x"
+#define MGID_PREFIX_FMT "%02x:%02x:%02x:%02x:%02x"
+#define GUID_ARG(g) (g)[0], (g)[1], (g)[2], (g)[3], (g)[4], (g)[5], (g)[6], (g)[7]
+#define MGID_PRE_ARG(g) (g)[0], (g)[1], (g)[2], (g)[3], (g)[4]
+
+enum {
+ FIP_TYPE(VENDOR_ID) = 13,
+ FIP_TYPE(ADDRESS) = 240,
+ FIP_TYPE(GW_INFORMATION)= 241,
+ FIP_TYPE(LOGIN) = 242,
+ FIP_TYPE(VHUB_UPDATE) = 243,
+ FIP_TYPE(VHUB_TABLE) = 244,
+ FIP_TYPE(VNIC_IDENTITY) = 245,
+ FIP_TYPE(PARTITION) = 246,
+ FIP_TYPE(GW_IDENTIFIER) = 248,
+ FIP_TYPE(KA_PARAMS) = 249,
+ FIP_TYPE(EXT_DESC) = 254,
+};
+
+enum {
+ FIP_TYPE_IDX(VENDOR_ID),
+ FIP_TYPE_IDX(ADDRESS),
+ FIP_TYPE_IDX(GW_INFORMATION),
+ FIP_TYPE_IDX(LOGIN),
+ FIP_TYPE_IDX(VHUB_UPDATE),
+ FIP_TYPE_IDX(VHUB_TABLE),
+ FIP_TYPE_IDX(VNIC_IDENTITY),
+ FIP_TYPE_IDX(PARTITION),
+ FIP_TYPE_IDX(GW_IDENTIFIER),
+ FIP_TYPE_IDX(KA_PARAMS),
+ FIP_TYPE_IDX(EXT_DESC),
+};
+
+enum {
+ ADV_EXT_TYPE(CAP) = 40,
+ ADV_EXT_TYPE(BOOT) = 18,
+ ADV_EXT_TYPE(LAG) = 41,
+ ADV_EXT_TYPE(MEMBER) = 42,
+ ADV_EXT_TYPE(PC_ID) = 43, /* Power Cycle ID */
+ ADV_EXT_TYPE(CTRL_IPORT) = 240,
+};
+
+enum {
+ ADV_EXT_IDX(CAP),
+ ADV_EXT_IDX(BOOT),
+ ADV_EXT_IDX(LAG),
+ ADV_EXT_IDX(PC_ID),
+ ADV_EXT_IDX(CTRL_IPORT),
+};
+
+
+enum {
+ EPORT_STATE_DOWN = 0,
+ EPORT_STATE_UP = 1,
+};
+
+enum fip_packet_type {
+ FIP_DISCOVER_UCAST = 0,
+ FIP_DISCOVER_MCAST = 1
+};
+
+enum {
+ FIP_TABLE_HDR_MIDDLE = 0,
+ FIP_TABLE_HDR_FIRST = 1,
+ FIP_TABLE_HDR_LAST = 2,
+ FIP_TABLE_HDR_ONLY = 3
+};
+
+enum {
+ FIP_EXT_LAG_W_POLICY_HOST = 1,
+ FIP_EXT_LAG_W_POLICY_UCAST = 1 << 2
+};
+
+/* string "mellanox" */
+#define FIP_VENDOR_MELLANOX { 0x6d, 0x65, 0x6c, 0x6c, 0x61, 0x6e, 0x6f, 0x78 }
+
+
+#define FIP_TEST_PKT_LENGTH(port, length, type) \
+ if ((length) != sizeof(type) + IB_GRH_BYTES) { \
+ vnic_dbg_fip(port->name, "Dump packet:" \
+ "at %d unexpected size. length %d expected %d\n", \
+ __LINE__, (int)length, \
+ (int)(sizeof(type) + IB_GRH_BYTES)); \
+ return -EINVAL; \
+ }
+
+/*
+ * Copy string b into string a and NULL-terminate it.
+ * sizeof(a) must be >= sizeof(b) + 1.
+ */
+#define TERMINATED_MEMCPY(a,b) \
+ do { \
+ ASSERT(sizeof(a)>=sizeof(b)+1); \
+ memcpy((a), (b), sizeof(b)); \
+ (a)[sizeof(b)] = '\0'; \
+ } while (0);
+
+
+enum {
+ FIP_MAX_ADDR_TLVS = 6,
+ FIP_MAX_TLVS = 32,
+ FIP_MAX_EXT_DESC = 32,
+};
+
+struct fip_fip_type {
+ u8 type;
+ u8 length;
+ u16 reserved;
+};
+
+struct fip_header_simple {
+ __be16 opcode;
+ u8 reserved;
+ u8 subcode;
+ __be16 list_length;
+ __be16 flags;
+};
+
+struct fip_vendor_id_tlv {
+ struct fip_fip_type ft;
+ u8 vendor_id[8];
+};
+
+struct fip_address_tlv {
+ struct fip_fip_type ft;
+ u8 vendor_id[8];
+ __be32 gwtype_qpn;
+ __be16 sl_gwportid;
+ __be16 lid;
+ u8 guid[8];
+};
+
+struct fip_gw_information_tlv {
+ struct fip_fip_type ft;
+ u8 vendor_id[8];
+ u8 h_nmac_mgid;
+ u8 n_rss_mgid_tss_qpn;
+ __be16 n_rss_qpn_vnics;
+};
+
+struct fip_login_tlv {
+ struct fip_fip_type ft;
+ u8 vendor_id[8];
+ __be16 mtu;
+ __be16 vnic_id;
+ __be16 flags_vlan;
+ u8 mac[6];
+ u8 eth_gid_prefix[5];
+ u8 antispoofing;
+ __be16 vfields;
+ __be32 syndrom_ctrl_qpn;
+ u8 vnic_name[16];
+};
+
+struct context_table_entry {
+ u8 v_rss_type;
+ u8 reserved;
+ u8 mac[ETH_ALEN];
+ __be32 qpn;
+ u8 reserved1;
+ u8 sl;
+ __be16 lid;
+};
+
+struct fip_vhub_update_tlv {
+ struct fip_fip_type ft;
+ u8 vendor_id[8];
+ __be32 state_vhub_id;
+ __be32 tusn;
+};
+
+struct fip_vhub_table_tlv {
+ struct fip_fip_type ft;
+ u8 vendor_id[8];
+ __be32 vp_vhub_id;
+ __be32 tusn;
+ __be16 hdr;
+ __be16 table_size;
+};
+
+struct fip_vnic_identity_tlv {
+ struct fip_fip_type ft;
+ u8 vendor_id[8];
+ __be32 flags_vhub_id;
+ __be32 tusn;
+ __be16 vnic_id;
+ u8 mac[6];
+ u8 port_guid[8];
+ u8 vnic_name[16];
+};
+
+struct fip_partition_tlv {
+ struct fip_fip_type ft;
+ u8 vendor_id[8];
+ __be16 reserved;
+ __be16 pkey;
+};
+
+struct fip_gw_identifier_tlv {
+ struct fip_fip_type ft;
+ u8 vendor_id[8];
+ u8 sys_guid[8];
+ u8 sys_name[32];
+ u8 gw_port_name[8];
+};
+
+struct fip_ka_params_tlv {
+ struct fip_fip_type ft;
+ u8 vendor_id[8];
+ __be32 adv_period;
+ __be32 ka_period;
+ __be32 vnic_ka_period;
+};
+
+struct fip_ext_desc_tlv {
+ struct fip_fip_type ft;
+ u8 vendor_id[8];
+};
+
+struct fip_extended_type {
+ u8 ext_type;
+ u8 len;
+ u8 reserved;
+ u8 mandatory;
+};
+
+struct fip_ext_type_cap {
+ struct fip_extended_type et;
+ u32 reserved[4];
+};
+
+struct fip_ext_type_boot {
+ struct fip_extended_type et;
+ u8 boot_prio;
+ u8 reserved;
+ __be16 discovery_timeout;
+};
+
+struct fip_ext_type_lag_props {
+ struct fip_extended_type et;
+ u8 gw_type;
+ u8 reserved;
+ __be16 lag_hash;
+ u8 weight_policy_flags;
+ u8 ca_threshold;
+ __be16 link_down_pol_thresh;
+ u32 reserved2[2];
+};
+
+struct fip_ext_type_power_cycle_id {
+ struct fip_extended_type et;
+ __be64 power_cycle_id;
+ u32 reserved;
+} __attribute__((packed));
+
+struct fip_ext_type_hostname {
+ struct fip_extended_type et;
+ u8 hostname[32];
+};
+
+struct fip_ext_type_ctrl_iport {
+ struct fip_extended_type et;
+ u8 vendor_id[8];
+ __be32 gwtype_qpn;
+ __be16 sl_gwportid;
+ __be16 lid;
+ u8 guid[8];
+};
+
+struct fip_ext_type_lag_member {
+ __be32 qpn;
+ __be16 sl_gw_portid;
+ __be16 lid;
+ u8 guid[8];
+ u8 eport_state;
+ u8 reserved1;
+ u8 weight;
+ u8 link_utilization;
+ u32 reserved2;
+};
+
+struct fip_ext_type_lag_members {
+ struct fip_extended_type et;
+ struct fip_ext_type_lag_member lagm[0];
+};
+
+struct fip_ext_group {
+ struct fip_ext_desc_tlv *fed[FIP_MAX_EXT_DESC];
+ int num;
+};
+
+struct fip_address_group {
+ struct fip_address_tlv *fa[FIP_MAX_ADDR_TLVS];
+ int num;
+};
+
+struct fip_context_group {
+ struct context_table_entry *cte;
+ int num;
+};
+
+struct fip_content {
+ struct fip_eoib_ver *eoib_ver;
+ struct fip_header_simple *fh;
+ struct fip_vendor_id_tlv *fvend;
+ struct fip_address_group fa;
+ struct fip_gw_information_tlv *fgwi;
+ struct fip_login_tlv *fl;
+ struct fip_vhub_update_tlv *fvu;
+ struct fip_vhub_table_tlv *fvt;
+ struct fip_vnic_identity_tlv *fvi;
+ struct fip_partition_tlv *fp;
+ struct fip_gw_identifier_tlv *fgid;
+ struct fip_ka_params_tlv *fka;
+ struct fip_ext_group fed;
+ struct fip_context_group cte;
+ u64 mask;
+ u16 offsets[FIP_MAX_TLVS];
+ int num;
+};
+
+/**************************************************************************/
+/* packet format structs */
+/**************************************************************************/
+#define VENDOR_ID_LENGTH 8
+
+struct fip_eoib_ver {
+ u8 version;
+ u8 reserved[3];
+};
+
+struct fip_fip_header {
+ __be16 opcode;
+ u8 reserved;
+ u8 subcode;
+ __be16 list_length;
+ __be16 flags;
+ struct fip_fip_type type;
+ u8 vendor_id[VNIC_VENDOR_LEN];
+};
+
+struct fip_discover_base {
+ struct fip_fip_type type;
+ u8 vendor_id[VNIC_VENDOR_LEN];
+ u32 qpn;
+ u16 sl_port_id;
+ u16 lid;
+ u8 guid[GUID_LEN];
+};
+
+struct eoib_adv_gw_info { /* Gabi */
+ struct fip_fip_type type;
+ u8 vendor_id[VNIC_VENDOR_LEN];
+ u8 system_guid[GUID_LEN];
+ u8 system_name[VNIC_SYSTEM_NAME_LEN];
+ u8 gw_port_name[VNIC_GW_PORT_NAME_LEN];
+};
+
+/* keep alive information */
+struct eoib_adv_ka_info { /* Gabi */
+ struct fip_fip_type type;
+ u8 vendor_id[VNIC_VENDOR_LEN];
+ u32 gw_adv_period;
+ u32 gw_period;
+ u32 vnic_ka_period;
+};
+
+struct eoib_advertise {
+ struct fip_eoib_ver version;
+ struct fip_fip_header fip;
+ struct fip_discover_base base;
+ struct fip_fip_type type_1;
+ u8 vendor_id[VNIC_VENDOR_LEN];
+ u8 flags;
+ u8 reserved;
+ u16 num_net_vnics;
+ struct eoib_adv_gw_info gw_info; /* Gabi */
+ struct eoib_adv_ka_info ka_info; /* Gabi */
+};
+
+struct syndrom_dword {
+ u8 syndrom;
+ u8 reserved[3];
+};
+
+union syn_qp_ctrl {
+ struct syndrom_dword syn;
+ u32 ctl_qpn;
+};
+
+struct eoib_login {
+ struct fip_eoib_ver eoib_ver;
+ struct fip_header_simple fh;
+ struct fip_vendor_id_tlv fvend;
+ struct fip_address_tlv fa;
+ struct fip_login_tlv fl;
+};
+
+struct fip_solicit_legacy {
+ struct fip_eoib_ver version;
+ struct fip_header_simple fh;
+ struct fip_vendor_id_tlv fvend;
+ struct fip_address_tlv addr;
+};
+
+struct fip_solicit_new {
+ struct fip_eoib_ver version;
+ struct fip_header_simple fh;
+ struct fip_vendor_id_tlv fvend;
+ struct fip_address_tlv addr;
+ struct fip_ext_desc_tlv ext;
+ struct fip_ext_type_cap ext_cap;
+ struct fip_ext_type_hostname ext_hostname;
+};
+
+union fip_vhub_id {
+ struct {
+ u8 flags;
+ u8 reserved[3];
+ } flags;
+ u32 vhub_id;
+};
+
+struct eoib_context_table {
+ struct fip_eoib_ver version;
+ struct fip_fip_header fip;
+ struct fip_fip_type type_1;
+ u8 vendor_id[VNIC_VENDOR_LEN];
+ union fip_vhub_id vhub_id;
+ u32 tusn;
+ u8 flags;
+ u8 reserved;
+ u16 table_size;
+ /* here come the context entries */
+};
+
+/* This is the number of DWORDS to subtract from type_1->length
+ * to get the size of the context entries divided by 4. It is the size,
+ * in dwords, from the start of the vendor_id field until the first
+ * context entry, plus 1 for the checksum.
+ */
+#define FIP_TABLE_SUB_LENGTH 6
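+
+/*
+ * A minimal sketch (under #if 0, not compiled) of the arithmetic described
+ * above: subtracting FIP_TABLE_SUB_LENGTH from the TLV length (in dwords)
+ * leaves the space occupied by the context entries. The helper name, and the
+ * assumption that type_1->length is expressed in dwords, are illustrative only.
+ */
+#if 0
+static inline int fip_tbl_num_entries(const struct fip_fip_type *type_1)
+{
+ int entry_dwords = type_1->length - FIP_TABLE_SUB_LENGTH;
+
+ return (entry_dwords * 4) / sizeof(struct context_table_entry);
+}
+#endif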
+
+/*
+ * eoib_host_update will be used for vHub context requests,
+ * keep alives and logouts
+ */
+struct eoib_host_update {
+ struct fip_eoib_ver version;
+ struct fip_fip_header fip;
+ struct fip_fip_type type_1;
+ u8 vendor_id[VNIC_VENDOR_LEN];
+ union fip_vhub_id vhub_id;
+ u32 tusn;
+ u16 vnic_id;
+ u8 mac[ETH_ALEN];
+ u8 port_guid[GUID_LEN];
+ u8 vnic_name[VNIC_NAME_LEN];
+};
+
+enum fip_packet_fields {
+ EOIB_FIP_OPCODE = 0xFFF9,
+ FIP_FIP_HDR_LENGTH = 3,
+ FIP_FIP_HDR_TYPE = 13,
+
+ /* keep all subcodes here */
+ FIP_HOST_SOL_SUB_OPCODE = 0x1,
+ FIP_GW_ADV_SUB_OPCODE = 0x2,
+ FIP_HOST_LOGIN_SUB_OPCODE = 0x3,
+ FIP_GW_LOGIN_SUB_OPCODE = 0x4,
+ FIP_HOST_LOGOUT_SUB_OPCODE = 0x5,
+ FIP_GW_UPDATE_SUB_OPCODE = 0x6,
+ FIP_GW_TABLE_SUB_OPCODE = 0x7,
+ FIP_HOST_ALIVE_SUB_OPCODE = 0x8,
+ FIP_MAX_SUBCODES,
+ /* end subcodes section */
+
+ FIP_FIP_FCF_FLAG = 0x1,
+ FIP_FIP_SOLICITED_FLAG = 0x2,
+ FIP_FIP_ADVRTS_FLAG = 0x4,
+ FIP_FIP_FP_FLAG = 0x80,
+ FIP_FIP_SP_FLAG = 0x40,
+
+ FIP_BASIC_LENGTH = 7,
+ FIP_BASIC_TYPE = 240,
+
+ FIP_ADVERTISE_LENGTH_1 = 4,
+ FIP_ADVERTISE_TYPE_1 = 241,
+ FIP_ADVERTISE_HOST_VLANS = 0x80,
+ FIP_ADVERTISE_NUM_VNICS_MASK = 0x0FFF,
+ FIP_ADVERTISE_N_RSS_SHIFT = 12,
+ FIP_ADVERTISE_HOST_EN_MASK = 0x80,
+ FIP_ADVERTISE_ALL_VLAN_GW_MASK = 0x60,
+ FIP_ADVERTISE_GW_PORT_ID_MASK = 0x0FFF,
+ FIP_ADVERTISE_SL_SHIFT = 12,
+
+ FIP_ADVERTISE_GW_LENGTH = 15,
+ FIP_ADVERTISE_GW_TYPE = 248,
+
+ FIP_ADVERTISE_KA_LENGTH = 6,
+ FIP_ADVERTISE_KA_TYPE = 249,
+
+ FIP_LOGIN_LENGTH_1 = 13,
+ FIP_LOGIN_TYPE_1 = 242,
+ FIP_LOGIN_LENGTH_2 = 4,
+ FIP_LOGIN_TYPE_2 = 246,
+
+ FIP_LOGIN_V_FLAG = 0x8000,
+ FIP_LOGIN_M_FLAG = 0x4000,
+ FIP_LOGIN_VP_FLAG = 0x2000,
+ FIP_LOGIN_H_FLAG = 0x1000,
+ FIP_LOGIN_VLAN_MASK = 0x0FFF,
+ FIP_LOGIN_DMAC_MGID_MASK = 0x3F,
+ FIP_LOGIN_RSS_MGID_MASK = 0x0F,
+ FIP_LOGIN_RSS_MASK = 0x10,
+ FIP_LOGIN_RSS_SHIFT = 4,
+ FIP_LOGIN_CTRL_QPN_MASK = 0xFFFFFF,
+ FIP_LOGIN_VNIC_ID_BITS = 16,
+ FIP_LOGIN_ALL_VLAN_GW_FLAG = 0x0040,
+
+ FIP_LOGOUT_LENGTH_1 = 13,
+ FIP_LOGOUT_TYPE_1 = 245,
+
+ FIP_HOST_UPDATE_LENGTH = 13,
+ FIP_HOST_UPDATE_TYPE = 245,
+ FIP_HOST_VP_FLAG = 0x01,
+ FIP_HOST_U_FLAG = 0x80,
+ FIP_HOST_R_FLAG = 0x40,
+
+ FIP_CONTEXT_UP_LENGTH = 9,
+ FIP_CONTEXT_UP_TYPE = 243,
+ FIP_CONTEXT_UP_EPORT_MASK = 0x30,
+ FIP_CONTEXT_UP_EPORT_SHIFT = 4,
+ FIP_CONTEXT_V_FLAG = 0x80,
+ FIP_CONTEXT_RSS_FLAG = 0x40,
+ FIP_CONTEXT_TYPE_MASK = 0x0F,
+
+ FIP_CONTEXT_TBL_TYPE = 244,
+ FIP_CONTEXT_TBL_SEQ_MASK = 0xC0,
+ FIP_CONTEXT_TBL_SEQ_FIRST = 0x40,
+ FIP_CONTEXT_TBL_SEQ_LAST = 0x80,
+
+ FKA_ADV_PERIOD = 8000, /* in mSecs */
+ FKA_ADV_MISSES = 3
+};
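+
+/*
+ * A minimal sketch (under #if 0, not compiled) of how the advertise masks
+ * above combine: the 16-bit field carrying num_net_vnics also carries n_rss
+ * in its upper bits. The helper name is an assumption and the value is shown
+ * in host byte order; real code would convert from wire order first.
+ */
+#if 0
+static inline void fip_adv_unpack_vnics(u16 num_net_vnics,
+ u16 *num_vnics, u8 *n_rss)
+{
+ *num_vnics = num_net_vnics & FIP_ADVERTISE_NUM_VNICS_MASK;
+ *n_rss = num_net_vnics >> FIP_ADVERTISE_N_RSS_SHIFT;
+}
+#endif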
+
+enum fip_login_syndroms {
+ FIP_SYNDROM_SUCCESS = 0,
+ FIP_SYNDROM_HADMIN_REJECT = 1,
+ FIP_SYNDROM_GW_RESRC = 2,
+ FIP_SYNDROM_NO_NADMIN = 3,
+ FIP_SYNDROM_UNRECOGNISED_HOST = 4,
+ FIP_SYNDROM_UNSUPPORTED_PARAM = 5,
+ FIP_SYNDROM_GW_IS_LAG_MEMBER = 6,
+ FIP_SYNDROM_DUPLICATE_ADDRESS = 7,
+};
+
+/*
+ * Send a multicast or unicast solicit packet. The multicast packet is sent
+ * to the discover mcast group. Unicast packets are sent to the dqpn + dlid
+ * supplied. The dlid, dqpn, sl are ignored for multicast packets.
+ * The function returns 0 on success and an error code on failure.
+*/
+int fip_solicit_send(struct fip_discover *discover,
+ enum fip_packet_type multicast, u32 dqpn,
+ u16 dlid, u8 sl, int new_prot);
+
+/*
+ * Send a unicast login packet. This function supports both host and
+ * network admined logins. The function returns 0 on success and
+ * an error code on failure.
+*/
+int fip_login_send(struct fip_vnic_data *vnic);
+
+int fip_logout_send(struct fip_vnic_data *vnic);
+
+/*
+ * This function creates and sends a few types of packets (all ucast):
+ * vHub context request - new=1, logout=0
+ * vHub context update / vnic keep alive - new=0, logout=0
+ * vnic logout - new=0, logout=1
+*/
+int fip_update_send(struct fip_vnic_data *vnic, int request_new, int logout);
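+
+/*
+ * A minimal sketch (under #if 0, not compiled) of the three request_new /
+ * logout combinations listed above, assuming a valid vnic pointer; return
+ * values are ignored here for brevity.
+ */
+#if 0
+static inline void fip_update_send_examples(struct fip_vnic_data *vnic)
+{
+ fip_update_send(vnic, 1, 0); /* vHub context request */
+ fip_update_send(vnic, 0, 0); /* context update / vnic keep alive */
+ fip_update_send(vnic, 0, 1); /* vnic logout */
+}
+#endif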
+
+/*
+ * Check if a received packet is a FIP packet, and if so return its subtype.
+ * The FIP type is also returned in fip_type and can be either EOIB_FIP_OPCODE
+ * or FCOIB_FIP_OPCODE. If the packet is not a FIP packet -EINVAL is returned.
+*/
+int fip_pkt_parse(char *buffer, int length, int *fip_type);
+
+/*
+ * The caller already knows this is a FIP packet; return its subtype.
+*/
+int fip_pkt_get_subtype_bh(char *buffer);
+
+/*
+ * Parse a packet that is suspected of being an advertise packet. The function
+ * returns 0 for a valid advertise packet and an error code otherwise. The
+ * packet's "interesting" details are returned in data.
+*/
+int fip_advertise_parse_bh(struct fip_discover *discover, struct fip_content *fc,
+ struct fip_gw_data *data);
+
+/*
+ * Parse a packet that is suspected of being a login ack packet. The function
+ * returns 0 for a valid login ack packet and an error code otherwise. The
+ * packet's "interesting" details are returned in data.
+*/
+int fip_login_parse(struct fip_discover *discover, struct fip_content *fc,
+ struct fip_login_data *data);
+
+static inline int _map_generic_pkt(struct vnic_port *port,
+ struct fip_ring_entry *tx_ring_entry,
+ void *mem, int pkt_size)
+{
+ /* alloc packet to be sent */
+ tx_ring_entry->mem = mem;
+
+ /* map packet to bus */
+ tx_ring_entry->bus_addr =
+ ib_dma_map_single(port->dev->ca,
+ tx_ring_entry->mem, pkt_size, DMA_TO_DEVICE);
+
+ if (unlikely(ib_dma_mapping_error(port->dev->ca,
+ tx_ring_entry->bus_addr))) {
+ vnic_warn(port->name,
+ "send_generic_pkt failed to map to pci\n");
+ return -ENOMEM;
+ }
+ tx_ring_entry->length = pkt_size;
+
+ return 0;
+}
+
+static inline int alloc_map_fip_buffer(struct ib_device *ca,
+ struct fip_ring_entry *me,
+ int size, gfp_t mask)
+{
+ me->mem = kmalloc(size, mask);
+ if (!me->mem) {
+ vnic_warn(ca->name, "failed to alloc memory (%d)\n", size);
+ return -ENOMEM;
+ }
+
+ me->bus_addr = ib_dma_map_single(ca, me->mem, size, DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(ca, me->bus_addr))) {
+ kfree(me->mem);
+ vnic_warn(ca->name, "ib_dma_mapping_error failed\n");
+ return -ENOMEM;
+ }
+ me->length = size;
+ me->entry_posted = 0;
+
+ return 0;
+}
+
+#define DELAYED_WORK_CLEANUP_JIFFS 2
+#define FIP_MAX_PKT_PRINT_LENGTH 120
+#define FIP_OP_RECV (1ul << 31)
+
+static const char fip_discover_mgid[GID_LEN] = {
+ 0xFF, 0x12, 0xE0, 0x1B,
+ 0x00, 0x06, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00};
+static const char fip_solicit_mgid[GID_LEN] = {
+ 0xFF, 0x12, 0xE0, 0x1B,
+ 0x00, 0x07, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00};
+
+
+/* TODO - remove this: for initial debug only */
+void fip_dbg_dump_raw_pkt(int level, void *buff,
+ int length, int is_tx, char *name);
+enum {
+ FIP_ETH_HEADER_LEN = 14,
+ FIP_ENCAP_LEN = 4,
+ FIP_PROTOCOL_RX_SIZE = 16, /* must be power of 2 */
+ FIP_PROTOCOL_TX_SIZE = 64, /* must be power of 2 */
+ FIP_LOGIN_RX_SIZE = 64, /* must be power of 2 */
+ FIP_LOGIN_TX_SIZE = 64, /* must be power of 2 */
+
+ /* timeout in seconds between LOGIN and ACK */
+ FIP_LOGIN_TIMEOUT = 8,
+ FIP_RESOLICIT_TIME = 8,
+
+ IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + FIP_ENCAP_LEN,
+};
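+
+/*
+ * A minimal sketch (under #if 0, not compiled) of why the ring sizes above
+ * must be a power of 2: free-running head/tail counters are turned into ring
+ * indexes with a simple mask, as done throughout this driver. The helper name
+ * is an assumption; struct fip_ring is assumed to expose the size field used
+ * elsewhere in this file.
+ */
+#if 0
+static inline int fip_ring_index(struct fip_ring *ring, unsigned long pos)
+{
+ return pos & (ring->size - 1); /* valid only for power-of-2 sizes */
+}
+#endif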
+
+struct fip_rcv_pkt {
+ struct list_head list;
+ struct fip_content *fc;
+ int length;
+ void *mem;
+};
+
+/*
+ * Alloc the discover CQ, QP. Configure the QP to RTS.
+ * alloc the RX + TX rings and queue work for discover
+ * finite state machine code. If complete is set, it clears
+ * possible previous GW / VNIC data structs on init.
+ */
+int fip_discover_init(struct vnic_port *port, struct fip_discover *discover,
+ u16 pkey, int complete);
+
+/*
+ * free the discover TX and RX rings, QP and CQ. If complete
+ * is set, it clears possible previous GW / VNIC data structs
+ * by using a "complete" flush otherwise vnic data is preserved.
+*/
+int fip_discover_cleanup(struct vnic_port *port, struct fip_discover *discover, int complete);
+
+/*
+ * send a single multicast packet.
+ * return 0 on success, other on failure.
+*/
+int fip_mcast_send(struct vnic_port *port, struct ib_qp *qp,
+ unsigned int wr_id, u64 mapping,
+ int size, u16 pkey_index, struct vnic_mcast *mcast);
+/*
+ * send a single unicast packet.
+ * return 0 on success, other on failure.
+*/
+int fip_ucast_send(struct vnic_port *port, struct ib_ah *ah,
+ struct ib_qp *qp,
+ unsigned int wr_id, u64 mapping,
+ int size, u16 pkey_index, u32 dest_qpn, u16 dlid,
+ u32 qkey, u8 sl);
+/*
+ * Configure a newly allocated QP and move it
+ * from reset->init->RTR->RTS
+ */
+int fip_init_qp(struct vnic_port *port, struct ib_qp *qp,
+ u16 pkey_index, char *name);
+
+/*
+ * Alloc a single rx buffer (of the given size), map it to the pci bus
+ * and post it to the qp for receive. The id parameter is used
+ * to identify the entry in the receive queue when a completion is received.
+ * Kernel and bus addresses are returned in mem_entry.
+ * Returns 0 on success, an error code otherwise.
+ */
+int fip_post_receive(struct vnic_port *port, struct ib_qp *qp, int size,
+ int _id, struct fip_ring_entry *mem_entry, char *name);
+
+/* triggered by a core event */
+void fip_qp_to_reset(struct ib_qp *qp, char *name);
+void fip_flush_rings(struct vnic_port *port,
+ struct ib_cq *cq,
+ struct ib_qp *qp,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ char *name);
+void fip_free_rings(struct vnic_port *port,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ char *name);
+
+/*
+ * This function allocates the tx buffers and initializes the head and
+ * tail indexes.
+ */
+int fip_init_tx(int size, struct fip_ring *tx_ring, char *name);
+
+/*
+ * Configure the discover QP. This includes configuring rx+tx,
+ * moving the discover QP to RTS and creating the tx and rx rings
+ */
+int fip_init_rx(struct vnic_port *port, int ring_size, struct ib_qp *qp,
+ struct fip_ring *rx_ring, char *name);
+
+/*
+ * This is a general purpose CQ completion function that handles
+ * completions on RX and TX rings. It can serve all users that are
+ * using RX and TX rings.
+ * RX completions are distinguished from TX completions by the MSB, which is
+ * set for RX and clear for TX. For RX, the memory is unmapped from the PCI
+ * and the head is incremented. For TX, the memory is unmapped and then freed.
+ * The function returns the number of packets received.
+*/
+int fip_comp(struct vnic_port *port,
+ struct ib_cq *cq,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ char *name);
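+
+/*
+ * A minimal sketch (under #if 0, not compiled) of the wr_id convention
+ * described above: RX work requests are tagged with FIP_OP_RECV (MSB set)
+ * while TX work requests leave it clear, so a completion handler can tell
+ * them apart and recover the ring index. The helper names are assumptions.
+ */
+#if 0
+static inline int fip_wr_id_is_rx(u64 wr_id)
+{
+ return !!(wr_id & FIP_OP_RECV);
+}
+
+static inline unsigned int fip_wr_id_index(u64 wr_id)
+{
+ return wr_id & ~FIP_OP_RECV;
+}
+#endif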
+
+/*
+ * This function is the driving engine of the vnic logic. It manages the
+ * vnics' state machines.
+ * Some of the states in the state machine could have been removed because
+ * they contain "actions" and not states. Still, it is easier to maintain
+ * the code this way and it gives an easy mechanism for exception handling
+ * and retries.
+ * Only call this function from fip_wq context.
+*/
+void fip_vnic_fsm(struct work_struct *work);
+
+/*
+ * Mark the vnic for deletion and trigger a delayed call to the cleanup
+ * function. In the past the vnic was moved to another list but this
+ * might cause vnic duplication if new vnics are added to the GW. Even
+ * if the vnic is being flushed we need to know it is there.
+ *
+ * Note: This deletion method ensures that all pending vnic work requests
+ * are cleared without dependency on the calling context.
+*/
+void fip_vnic_close(struct fip_vnic_data *vnic, enum fip_flush flush);
+
+/*
+ * Free vnic resources. This includes closing the data vnic (data QPs etc)
+ * and the discovery resources. If the vnic can be totally destroyed (no
+ * pending work) the vnic will be removed from the GW list and its memory
+ * freed. If not, the vnic will not be freed and the function will return an
+ * error. The caller needs to recall this function to complete the operation.
+*/
+int fip_vnic_destroy(struct fip_vnic_data *vnic);
+
+struct fip_vnic_data *fip_vnic_alloc(struct vnic_port *port,
+ struct fip_gw_data *gw,
+ int hadmin,
+ u16 vnic_id);
+
+/*
+ * Look for a vnic in the GW vnic list. The search key used is either the vnic_id,
+ * which is unique, or the mac+vlan pair. A match on either key will result in the
+ * return of the vnic. Both keys are necessary because the host assigned delete
+ * flow might not have access to the vnic_id. The search disregards vnics that
+ * are undergoing full flush (they will be removed soon).
+*/
+struct fip_vnic_data *fip_vnic_find_in_list(struct fip_gw_data *gw,
+ u16 vnic_id, u8 *mac,
+ u16 vlan, u8 vlan_used);
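+
+/*
+ * A minimal sketch (under #if 0, not compiled) of the two lookup keys
+ * described above: by the unique vnic_id, or by mac+vlan when the id is not
+ * available (e.g. the host assigned delete flow). The argument values, and
+ * the assumption that an unused key may be passed as NULL/0, are
+ * illustrative only.
+ */
+#if 0
+static inline struct fip_vnic_data *
+fip_vnic_find_examples(struct fip_gw_data *gw, u16 vnic_id,
+ u8 *mac, u16 vlan)
+{
+ struct fip_vnic_data *vnic;
+
+ /* by the unique vnic_id */
+ vnic = fip_vnic_find_in_list(gw, vnic_id, NULL, 0, 0);
+ if (vnic)
+ return vnic;
+
+ /* by mac + vlan (host assigned delete flow) */
+ return fip_vnic_find_in_list(gw, 0, mac, vlan, 1 /* vlan_used */);
+}
+#endif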
+
+/*
+ * Process an incoming login ack packet. The packet was already parsed and
+ * its data was placed in *data. The function creates RX and TX rings for the
+ * vnic and starts the multicast join procedure.
+ * This function should not be called for packets other than login ack packets.
+*/
+void fip_vnic_login_ack_recv(struct fip_vnic_data *vnic,
+ struct fip_login_data *data);
+
+/*
+ * This function should be called when the building of a vhub context
+ * table is done and the vnic state should transition to CONNECTED.
+*/
+int fip_vnic_tbl_done(struct fip_vnic_data *vnic);
+int fip_vnic_mcast_recnct(struct fip_vnic_data *vnic);
+
+/*
+ * Init the vnic's vHub table data structures, before using them
+ */
+void vhub_ctx_init(struct fip_vnic_data *vnic);
+void vhub_table_free(struct vhub_elist *elist);
+
+/*
+ * Clear and free the vnic's vHub context table data structures.
+ */
+void vhub_ctx_free(struct fip_vnic_data *vnic);
+
+/*
+ * This function handles a vhub context table packet. The table will
+ * be processed only if we do not have an up-to-date local copy of
+ * our own. The table update supports multi-packet tables so care
+ * must be taken in building the complete table.
+*/
+int vhub_handle_tbl(struct fip_vnic_data *vnic, struct fip_content *fc,
+ u32 vhub_id, u32 tusn);
+
+/*
+ * This function handles vhub context update packets. There are three flows
+ * in handling update packets. The first is before the main table is up
+ * to date, the second is after the table is up to date but before it was
+ * passed to the ownership of the data vnic (login struct) and the local
+ * lists are freed, and the last is when the table maintenance is done
+ * by the data vnic. This function handles all cases.
+*/
+int vhub_handle_update(struct fip_vnic_data *vnic,
+ u32 vhub_id, u32 tusn,
+ struct vnic_table_entry *data);
+
+/*
+ * This function writes the main vhub table to the data (login) vnic.
+ * You should call it when the data vnic is ready for it and after the
+ * table is up to date (and the update list was applied to the main list)
+ */
+int fip_vnic_write_tbl(struct fip_vnic_data *vnic);
+
+/* sysfs entries for hadmin vNics*/
+int vnic_create_hadmin_dentry(struct fip_vnic_data *vnic);
+void vnic_delete_hadmin_dentry(struct fip_vnic_data *vnic);
+void extract_memb_extended(struct fip_ext_type_lag_members *ext_lag_membs,
+ int ext_length,
+ struct lag_members *lagm,
+ char *name);
+int handle_member_update(struct fip_vnic_data *vnic, struct lag_members *lm);
+int extract_vhub_extended(struct fip_ext_desc_tlv *fed,
+ struct fip_vnic_data *vnic);
+static inline int send_generic_ucast_pkt(struct vnic_port *port,
+ struct ib_ah *ah,
+ struct fip_ring *tx_ring,
+ void *mem, int pkt_size,
+ struct ib_qp *qp,
+ int pkey_index,
+ u32 dst_qpn, u16 dst_lid,
+ u32 qkey, u8 sl)
+{
+ int index, rc;
+ unsigned long flags;
+ unsigned long tail;
+
+ /*
+ * we are only allowed to update the head at task level so no need to
+ * perform any locks here
+ */
+ spin_lock_irqsave(&tx_ring->ring_lock, flags);
+ index = tx_ring->head & (tx_ring->size - 1);
+
+ vnic_dbg_fip(port->name, "send ucast packet\n");
+
+ spin_lock(&tx_ring->head_tail_lock);
+ tail = tx_ring->tail;
+ spin_unlock(&tx_ring->head_tail_lock);
+
+ /* ring full try again */
+ if (tx_ring->head - tail >= tx_ring->size) {
+ vnic_warn(port->name, "send_generic_pkt ring full: QPN 0x%x: tail=%ld head=%ld diff=%ld\n",
+ qp->qp_num, tx_ring->tail, tx_ring->head, tx_ring->head - tx_ring->tail);
+ rc = -EAGAIN;
+ goto err;
+ }
+
+
+ rc = _map_generic_pkt(port, &tx_ring->ring[index], mem, pkt_size);
+ if (rc)
+ goto err;
+
+ rc = fip_ucast_send(port, ah, qp, index,
+ tx_ring->ring[index].bus_addr,
+ pkt_size, pkey_index, dst_qpn, dst_lid,
+ qkey, sl);
+
+ if (rc) {
+ vnic_warn(port->name, "fip_ucast_send() failed (%d)\n", rc);
+ rc = -ENODEV;
+ goto error_unmap_dma;
+ }
+
+ tx_ring->head++;
+
+ spin_unlock_irqrestore(&tx_ring->ring_lock, flags);
+ return 0;
+
+error_unmap_dma:
+ ib_dma_unmap_single(port->dev->ca,
+ tx_ring->ring[index].bus_addr,
+ pkt_size, DMA_TO_DEVICE);
+err:
+ spin_unlock_irqrestore(&tx_ring->ring_lock, flags);
+ return rc;
+}
+
+static inline const char *eport_state_str(int state)
+{
+ switch (state) {
+ case EPORT_STATE_DOWN: return "Down";
+ case EPORT_STATE_UP: return "Up";
+ default:return "Invalid";
+ }
+}
+
+#endif /* _VNIC_FIP_H */
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+#include "vnic_fip_pkt.h"
+
+#define FIP_MAX_PKT_PRINT_LENGTH 120
+
+static void fip_purge_gws(struct work_struct *work);
+static void fip_discover_gw_fsm(struct work_struct *work);
+static void fip_discover_hadmin_update(struct work_struct *work);
+static void fip_discover_fsm(struct work_struct *work);
+void fip_close_gw(struct fip_gw_data *gw, enum fip_flush flush);
+
+/* TODO - remove this: for initial debug only */
+void fip_dbg_dump_raw_pkt(int level, void *buff,
+ int length, int is_tx, char *name)
+{
+ int i;
+ int tmp_len;
+ u32 *data_ptr;
+ unsigned char *tmp_data_ptr;
+
+ if (!(vnic_msglvl & VNIC_DEBUG_PKT_DUMP))
+ return;
+
+ printk(KERN_DEBUG "%s %s: packet length is %d\n",
+ is_tx ? "TX" : "RX", name, length);
+
+ length = (length > FIP_MAX_PKT_PRINT_LENGTH) ?
+ FIP_MAX_PKT_PRINT_LENGTH : length;
+
+ tmp_len = (length >> 2) + 1;
+ data_ptr = (u32 *)buff;
+ for (i = 0; i < tmp_len; i++) {
+ if (!is_tx && i == IB_GRH_BYTES >> 2)
+ printk(KERN_DEBUG "========================\n");
+ tmp_data_ptr = (unsigned char *)&data_ptr[i];
+ printk(KERN_DEBUG "%02x %02x %02x %02x \n",
+ tmp_data_ptr[0], tmp_data_ptr[1],
+ tmp_data_ptr[2], tmp_data_ptr[3]);
+ }
+}
+
+/*
+ * Configure the discover QP. This includes configuring rx+tx,
+ * moving the discover QP to RTS and creating the tx and rx rings
+ */
+int fip_discover_start_rings(struct fip_discover *discover,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ struct ib_cq *cq,
+ struct ib_qp *qp)
+{
+ int rc;
+
+ rc = fip_init_tx(tx_ring->size, tx_ring, discover->name);
+ if (rc) {
+ vnic_warn(discover->name, "fip_init_tx failed rc %d\n", rc);
+ /* set RX ring size to 0 as an indication of the failure
+ so the RX ring won't be freed; no need to set tx_ring->size
+ since the fip_init_tx error flow will handle it */
+ rx_ring->size = 0;
+ return rc;
+ }
+
+ rc = fip_init_rx(discover->port, rx_ring->size, qp, rx_ring, discover->name);
+ if (rc) {
+ vnic_warn(discover->name, "fip_init_rx returned %d\n", rc);
+ goto release_queues;
+ }
+
+ return 0;
+
+release_queues:
+ fip_flush_rings(discover->port, cq, qp, rx_ring, tx_ring, discover->name);
+ fip_free_rings(discover->port, rx_ring, tx_ring, discover->name);
+
+ return rc;
+}
+
+int fip_discover_init_rings(struct vnic_port *port,
+ struct fip_discover *discover,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ struct ib_cq **cq,
+ struct ib_qp **qp,
+ ib_comp_handler comp_handler)
+{
+ struct ib_qp_init_attr qp_init_attr;
+ struct ib_device *ca = port->dev->ca;
+
+
+ *cq = ib_create_cq(ca, comp_handler, NULL, discover,
+ rx_ring->size + tx_ring->size, 0);
+ if (IS_ERR(*cq)) {
+ vnic_warn(discover->name, "failed to create CQ\n");
+ goto out;
+ }
+
+ memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+ qp_init_attr.cap.max_send_wr = tx_ring->size;
+ qp_init_attr.cap.max_recv_wr = rx_ring->size;
+ qp_init_attr.cap.max_send_sge = 1;
+ qp_init_attr.cap.max_recv_sge = 1;
+ qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.qp_type = IB_QPT_UD;
+ qp_init_attr.send_cq = *cq;
+ qp_init_attr.recv_cq = *cq;
+
+ *qp = ib_create_qp(port->pd, &qp_init_attr);
+ if (IS_ERR(*qp)) {
+ vnic_warn(discover->name, "failed to create QP\n");
+ goto error_free_cq;
+ }
+
+ /* move QP to RTS */
+ if (fip_init_qp(discover->port, *qp, discover->pkey_index, discover->name)) {
+ vnic_warn(discover->name, "fip_init_qp failed for qp\n");
+ goto error_free_qp;
+ }
+
+ /* init RX + TX rings */
+ if (fip_discover_start_rings(discover, rx_ring, tx_ring, *cq, *qp)) {
+ vnic_warn(discover->name, "failed to start rings\n");
+ goto error_free_qp;
+ }
+
+ /* enable receiving CQ comps, triggers fip_discover_comp() */
+ if (ib_req_notify_cq(*cq, IB_CQ_NEXT_COMP)) {
+ vnic_warn(discover->name, "ib_req_notify_cq failed for cq\n");
+ goto error_release_rings;
+ }
+
+ return 0;
+
+error_release_rings:
+ fip_flush_rings(discover->port, *cq, *qp, rx_ring, tx_ring, discover->name);
+ fip_free_rings(discover->port, rx_ring, tx_ring, discover->name);
+error_free_qp:
+ ib_destroy_qp(*qp);
+error_free_cq:
+ ib_destroy_cq(*cq);
+out:
+ *qp = NULL;
+ *cq = NULL;
+ return -ENODEV;
+}
+
+/*
+ * This function handles completions of both TX and RX
+ * packets. RX packets are unmapped, lightly parsed, moved to a list
+ * and passed to thread processing. TX packets are unmapped and freed.
+ * Note: this function is called from interrupt context.
+ */
+static void fip_discover_comp(struct ib_cq *cq, void *discover_ptr)
+{
+ struct fip_discover *discover = discover_ptr;
+
+ /* handle completions. If RX packets were received, call
+ * fip_discover_process_rx() to queue them for thread context processing */
+ if (fip_comp(discover->port, discover->cq,
+ &discover->rx_ring, &discover->tx_ring,
+ discover->name))
+ fip_discover_process_rx(discover);
+}
+
+/*
+ * Alloc the discover CQ, QP. Configure the QP to RTS.
+ * alloc the RX + TX rings and queue work for discover
+ * finite state machine code.
+ */
+int fip_discover_init(struct vnic_port *port, struct fip_discover *discover,
+ u16 pkey, int complete)
+{
+ int rc;
+
+ discover->port = port;
+ discover->flush = FIP_NO_FLUSH;
+ discover->state = FIP_DISCOVER_INIT;
+ discover->rx_ring.size = FIP_PROTOCOL_RX_SIZE;
+ discover->tx_ring.size = FIP_PROTOCOL_TX_SIZE;
+ discover->new_prot_gws = 0;
+ discover->old_prot_gws = 0;
+
+ /* This is in preparation for pkey discovery */
+
+ init_completion(&discover->flush_complete);
+
+ INIT_DELAYED_WORK(&discover->fsm_task, fip_discover_fsm);
+ INIT_DELAYED_WORK(&discover->cleanup_task, fip_purge_gws);
+ INIT_DELAYED_WORK(&discover->hadmin_update_task, fip_discover_hadmin_update);
+ INIT_WORK(&discover->pkt_rcv_task_bh, fip_discover_process_rx_bh);
+ spin_lock_init(&discover->rcv_list.lock);
+ INIT_LIST_HEAD(&discover->rcv_list.list);
+ spin_lock_init(&discover->lock);
+
+
+ if (complete) {
+ discover->pkey = pkey;
+ INIT_LIST_HEAD(&discover->gw_list);
+ init_rwsem(&discover->l_rwsem);
+ sprintf(discover->name, "%s_P%x", port->name, discover->pkey);
+ }
+ INIT_LIST_HEAD(&discover->hadmin_cache);
+ vnic_mcast_root_init(&discover->mcast_tree);
+
+ if (!ib_find_pkey(port->dev->ca, port->num, discover->pkey, &discover->pkey_index)) {
+ rc = fip_discover_init_rings(port, discover, &discover->rx_ring,
+ &discover->tx_ring, &discover->cq,
+ &discover->qp, fip_discover_comp);
+ if (rc) {
+ vnic_warn(discover->name, "descovered init failed rc=%d\n", rc);
+ return rc;
+ }
+
+ /* start discover FSM code */
+ /* calls fip_discover_fsm() */
+ queue_delayed_work(fip_wq, &discover->fsm_task, 0);
+ } else {
+ vnic_warn(discover->name, "Configured PKEY 0x%X is not supported on port\n", discover->pkey);
+ discover->pkey_index = ILLEGAL_PKEY_INDEX;
+ }
+
+
+ return 0;
+}
+
+void fip_recv_list_flush(struct fip_discover *discover)
+{
+ struct list_head discov_recv_local;
+ struct fip_rcv_pkt *rcv, *rcv1;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&discov_recv_local);
+
+ spin_lock_irqsave(&discover->rcv_list.lock, flags);
+ list_replace_init(&discover->rcv_list.list, &discov_recv_local);
+ spin_unlock_irqrestore(&discover->rcv_list.lock, flags);
+
+ list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) {
+ list_del(&rcv->list);
+ kfree(rcv);
+ }
+ return;
+}
+
+/*
+ * free the discover TX and RX rings, QP and CQ.
+ * Must not be called from the fip_wq context.
+ */
+int fip_discover_cleanup(struct vnic_port *port, struct fip_discover *discover, int complt)
+{
+ if (discover->state == FIP_DISCOVER_OFF)
+ return -EINVAL;
+
+ /* move FSM to flush state and wait for the FSM
+ * to finish whatever it is doing before we continue
+ */
+ vnic_dbg_mark();
+ init_completion(&discover->flush_complete);
+ discover->flush = complt ? FIP_FULL_FLUSH : FIP_PARTIAL_FLUSH;
+ cancel_delayed_work(&discover->fsm_task);
+#ifndef _BP_WORK_SYNC
+ cancel_delayed_work_sync(&discover->hadmin_update_task);
+#else
+ cancel_delayed_work(&discover->hadmin_update_task);
+ flush_workqueue(fip_wq);
+#endif
+ /* flush any hadmin entries leftovers */
+ {
+ struct fip_hadmin_cache *hadmin, *hadmin_t;
+
+ spin_lock_irq(&discover->lock);
+ list_for_each_entry_safe(hadmin, hadmin_t,
+ &discover->hadmin_cache, next) {
+ list_del(&hadmin->next);
+ kfree(hadmin);
+ }
+ spin_unlock_irq(&discover->lock);
+ }
+
+ /* calls fip_discover_fsm() */
+ queue_delayed_work(fip_wq, &discover->fsm_task, 0);
+ vnic_dbg_mark();
+ /* calls fip_discover_fsm() */
+ wait_for_completion(&discover->flush_complete);
+ vnic_dbg_mark();
+
+ /* make sure that discover FSM is idle */
+#ifndef _BP_WORK_SYNC
+ cancel_delayed_work_sync(&discover->fsm_task);
+#else
+ cancel_delayed_work(&discover->fsm_task);
+ flush_workqueue(fip_wq);
+#endif
+
+ if (discover->pkey_index != ILLEGAL_PKEY_INDEX) {
+ fip_flush_rings(port, discover->cq, discover->qp,
+ &discover->rx_ring, &discover->tx_ring,
+ discover->name);
+ fip_free_rings(port, &discover->rx_ring, &discover->tx_ring,
+ discover->name);
+
+ fip_recv_list_flush(discover);
+ if (discover->qp)
+ ib_destroy_qp(discover->qp);
+ discover->qp = NULL;
+
+ if (discover->cq)
+ ib_destroy_cq(discover->cq);
+ discover->cq = NULL;
+ }
+
+ return 0;
+}
+
+/*
+ * This function runs in interrupt context.
+ * It does sanity checking of the packet, moves it to a list and passes
+ * handling to a thread.
+ */
+void fip_discover_process_rx(struct fip_discover *discover)
+{
+ struct vnic_port *port = discover->port;
+ int mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum);
+ int rc;
+ int queue_packet, one_or_more_queued = 0;
+ struct fip_rcv_pkt *rcv, *rcv1;
+ struct list_head discov_recv_local;
+ int index;
+ struct fip_content *fc;
+ int err;
+ struct fip_ring_entry *ring;
+
+ INIT_LIST_HEAD(&discov_recv_local);
+
+ if (discover->flush != FIP_NO_FLUSH)
+ return;
+
+ while (discover->rx_ring.head != discover->rx_ring.tail) {
+ fc = NULL;
+ queue_packet = 0;
+ index = discover->rx_ring.tail & (discover->rx_ring.size - 1);
+ ring = &discover->rx_ring.ring[index];
+
+ if (ring->entry_posted == 1 &&
+ discover->state == FIP_DISCOVER_SOLICIT) {
+ fc = kzalloc(sizeof *fc, GFP_ATOMIC);
+ if (likely(fc)) {
+ /* login is the first state we RX packets in */
+ rc = fip_packet_parse(port, ring->mem + IB_GRH_BYTES,
+ ring->length - IB_GRH_BYTES, fc);
+ if (!rc)
+ fip_discover_rx_packet(&queue_packet, fc);
+ } else
+ vnic_warn(discover->name, "allocation failed\n");
+ }
+ if (queue_packet) {
+ int length;
+
+ length = ring->length - IB_GRH_BYTES;
+ rcv = kmalloc(sizeof *rcv, GFP_ATOMIC);
+ if (!rcv) {
+ vnic_dbg_fip(discover->name, "failed kmalloc\n");
+ kfree(fc);
+ } else {
+ struct fip_ring_entry me;
+
+ err = alloc_map_fip_buffer(port->dev->ca, &me,
+ mtu_size, GFP_ATOMIC);
+ if (err) {
+ kfree(fc);
+ kfree(rcv);
+ } else {
+ rcv->length = length;
+ rcv->fc = fc;
+ rcv->mem = ring->mem;
+ list_add_tail(&rcv->list, &discov_recv_local);
+ one_or_more_queued++;
+ ib_dma_unmap_single(port->dev->ca,
+ ring->bus_addr,
+ mtu_size, DMA_FROM_DEVICE);
+ *ring = me;
+ }
+ }
+ } else
+ kfree(fc);
+
+ rc = fip_post_receive(port, discover->qp,
+ FIP_UD_BUF_SIZE(discover->port->max_mtu_enum),
+ index, ring, discover->name);
+ if (rc)
+ vnic_warn(discover->name, "fip_post_receive rc %d\n", rc);
+
+ discover->rx_ring.tail++;
+ }
+
+ if (one_or_more_queued) {
+ spin_lock(&discover->lock);
+ if (likely(discover->flush == FIP_NO_FLUSH)) {
+ spin_lock(&discover->rcv_list.lock);
+ list_splice_init(&discov_recv_local, discover->rcv_list.list.prev);
+ spin_unlock(&discover->rcv_list.lock);
+ /* calls fip_discover_process_rx_bh */
+ queue_work(fip_wq, &discover->pkt_rcv_task_bh);
+ spin_unlock(&discover->lock);
+ } else {
+ spin_unlock(&discover->lock);
+ list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) {
+ list_del(&rcv->list);
+ kfree(rcv->fc);
+ kfree(rcv->mem);
+ kfree(rcv);
+ }
+ }
+ }
+
+ return;
+}
+
+/*
+ * This function is the RX packet handler bottom half. It runs on the fip wq.
+*/
+void fip_discover_process_rx_bh(struct work_struct *work)
+{
+ struct fip_discover *discover =
+ container_of(work, struct fip_discover, pkt_rcv_task_bh);
+ int rc;
+ struct list_head discov_recv_local;
+ struct fip_rcv_pkt *rcv, *rcv1;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&discov_recv_local);
+
+ /* the irqsave is needed because debug kernel above 2.6.27 complains about
+ * hard irq safe to hard irq unsafe on discover.lock */
+ spin_lock_irqsave(&discover->rcv_list.lock, flags);
+ list_replace_init(&discover->rcv_list.list, &discov_recv_local);
+ spin_unlock_irqrestore(&discover->rcv_list.lock, flags);
+
+ if (discover->flush != FIP_NO_FLUSH) {
+ list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) {
+ list_del(&rcv->list);
+ kfree(rcv->fc);
+ kfree(rcv->mem);
+ kfree(rcv);
+ }
+ return;
+ }
+
+ list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) {
+ rc = fip_discover_rx_packet_bh(discover, rcv->fc);
+ if (rc)
+ vnic_warn(discover->name, "discover_rx_packet rc %d\n", rc);
+
+ list_del(&rcv->list);
+ kfree(rcv->fc);
+ kfree(rcv->mem);
+ kfree(rcv);
+ }
+ return;
+}
+
+static inline int fip_close_all_vnics(struct fip_gw_data *gw, enum fip_flush flush)
+{
+ struct fip_vnic_data *vnic;
+ int open_vnics = 0;
+
+ vnic_dbg_func(gw->discover->name);
+
+ list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+ open_vnics++;
+ fip_vnic_close(vnic, flush);
+ }
+ return open_vnics;
+}
+
+static int fip_gw_create_vnics(struct fip_gw_data *gw)
+{
+ struct fip_vnic_data *vnic;
+ unsigned long first_free_vnic;
+ struct fip_vnic_send_info gw_address;
+ int i;
+
+ gw->info.gw_num_vnics = (gw->info.gw_num_vnics > FIP_MAX_VNICS_PER_GW) ?
+ FIP_MAX_VNICS_PER_GW : gw->info.gw_num_vnics;
+
+
+ gw->info.gw_num_vnics = vnic_net_admin ? gw->info.gw_num_vnics : 0;
+ fip_vnic_create_gw_param(&gw_address, gw->info.gw_qpn, VNIC_FIP_QKEY,
+ gw->info.gw_lid, vnic_gw_ctrl_sl(gw));
+ /* for host admined */
+ list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+ if (vnic->hadmined) {
+ if (gw->info.hadmined_en)
+ fip_hadmin_vnic_refresh(vnic, &gw_address);
+ else {
+ vnic_dbg_fip(gw->discover->name,
+ "fip_gw_create_vnics hadmin disabled, "
+ "close open hadmin vnics\n");
+ fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+ }
+ }
+ }
+
+ /* for network admined */
+ for (i = gw->vnic_count; i < gw->info.gw_num_vnics; i++) {
+ vnic_dbg_fip(gw->discover->name, "fip_gw_create_vnics available"
+ " vnics %d needed %d\n",
+ gw->vnic_count, gw->info.gw_num_vnics);
+
+ /* start network assigned at half array. leave first half to host admin */
+ first_free_vnic = find_first_zero_bit(gw->n_bitmask,
+ FIP_MAX_VNICS_PER_GW);
+ if (first_free_vnic >= FIP_MAX_VNICS_PER_GW)
+ return -ENOMEM;
+
+ vnic = fip_vnic_alloc(gw->discover->port, gw, 0 /* hadmin */, first_free_vnic);
+ if (!vnic)
+ return -ENOMEM;
+
+ fip_vnic_set_gw_param(vnic, &gw_address);
+ set_bit(first_free_vnic, gw->n_bitmask);
+ list_add_tail(&vnic->gw_vnics, &gw->vnic_list);
+ gw->vnic_count++;
+
+ /* calls fip_vnic_fsm() */
+ cancel_delayed_work(&vnic->vnic_task);
+ fip_vnic_fsm(&vnic->vnic_task.work);
+ }
+
+ return 0;
+}
+
+/*
+ * This function goes over the vnics and closes network admined vNics
+ * that are not open and do not receive neighbor table info (there
+ * is no way for the BXM to tell the vNics to close before the
+ * vnic is listening to the neighbour tables).
+*/
+static int fip_gw_close_nonopen_vnics(struct fip_gw_data *gw)
+{
+ struct fip_vnic_data *vnic;
+ int closed_vnics = 0;
+
+ vnic_dbg_fip(gw->discover->name, "Try to close non open vnics\n");
+
+ list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+ vnic_dbg_fip(gw->discover->name, "check vnic %s, hadmin %d state %d\n",
+ vnic->name, vnic->hadmined, vnic->state);
+ if (!vnic->hadmined && vnic->state < FIP_VNIC_VHUB_DONE) {
+ vnic_dbg_fip(gw->discover->name, "closing vnic %s\n", vnic->name);
+ fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+ closed_vnics++;
+ }
+ }
+
+ return closed_vnics;
+}
+
+/* Permanently delete all vnics pending delete. The function goes over
+ * the list of vnics awaiting deletion and tries to delete them. If the
+ * vnic destructor returns an error value (currently busy) the function
+ * will requeue itself for another try. The function will also test if
+ * new vnics need to be added as a result of vnic removal.
+ */
+static void fip_purge_vnics(struct work_struct *work)
+{
+ struct fip_gw_data *curr_gw =
+ container_of(work,struct fip_gw_data, vnic_cleanup_task.work);
+ struct fip_vnic_data *vnic, *tmp_vnic;
+ int vnic_id, rc, del_cnt = 0, retry = 0;
+ unsigned long *bitmask;
+
+ vnic_dbg_fip(curr_gw->discover->name, "fip_purge_vnics\n");
+
+ list_for_each_entry_safe(vnic, tmp_vnic, &curr_gw->vnic_list, gw_vnics) {
+ enum fip_flush f;
+ vnic_id = vnic->vnic_id;
+ bitmask = vnic->hadmined ? NULL : curr_gw->n_bitmask;
+
+ /* If successful vnic is removed from list and destroyed */
+ f = vnic->flush;
+ if (f != FIP_NO_FLUSH) {
+ rc = fip_vnic_destroy(vnic);
+ if (!rc) {
+ del_cnt++;
+ if (f == FIP_FULL_FLUSH && bitmask)
+ clear_bit(vnic_id, bitmask);
+ } else {
+ retry |= rc;
+ }
+ }
+
+ /* limit the number of vnics to purge in each loop to let other
+ * tasks on same wq to run (i.e., avoid starvation).
+ */
+ if (del_cnt > 2) {
+ retry = 1;
+ break;
+ }
+ }
+
+ /* This means we still have vnics that refuse to close retry later */
+ if (retry){
+ vnic_dbg_mark();
+ /* calls fip_purge_vnics() */
+ queue_delayed_work(fip_wq, &curr_gw->vnic_cleanup_task, HZ / 10);
+ } else {
+ vnic_dbg_fip(curr_gw->discover->name, "fip_purge_vnics, all GW"
+ " vnics closed\n");
+
+ if (curr_gw->hadmin_gw && curr_gw->state == FIP_GW_HOST_ADMIN && list_empty(&curr_gw->vnic_list)) {
+ vnic_warn(curr_gw->discover->name,
+ "Removing Host admin GW %s with no vnics\n",
+ (char*)curr_gw->info.vol_info.gw_port_name);
+ fip_close_gw(curr_gw, FIP_FULL_FLUSH);
+ }
+ /* test and open new vnics if vnics are missing */
+ /* ALITODO: after GW timeout, a vnic is re-created! why is that?
+ if (fip_gw_create_vnics(curr_gw)) {
+ vnic_dbg_mark();
+ queue_delayed_work(fip_wq,
+ &curr_gw->vnic_cleanup_task, HZ);
+ }
+ */
+ }
+}
+
+/*
+ * This function adds or removes a single host admined vnic to a GW.
+ * First the function searches for the vnic. The search function
+ * disregards vnics that are undergoing a complete flush.
+*/
+int fip_gw_update_hadmin_gw(struct fip_gw_data *gw,
+ struct fip_hadmin_cache *hadmin_entry)
+{
+ struct fip_vnic_data *vnic;
+ int vnic_id = hadmin_entry->vnic_id, rc = 0;
+
+ /* set bit 16 for hadmin vNics (by spec) */
+ vnic_id |= (1 << (VNIC_ID_LEN - 1));
+
+ vnic = fip_vnic_find_in_list(gw, vnic_id, hadmin_entry->mac,
+ hadmin_entry->vlan,
+ hadmin_entry->vlan_used);
+
+ /* remove: if vNic found - remove it and exit */
+ if (hadmin_entry->remove) {
+ if (vnic)
+ fip_vnic_close(vnic, FIP_FULL_FLUSH);
+ else
+ vnic_dbg_fip(gw->discover->name, "vNic to remove is"
+ " not found (name:%s mac:"MAC_6_PRINT_FMT
+ " vlan:%d id:%d)\n",
+ hadmin_entry->interface_name,
+ MAC_6_PRINT_ARG(hadmin_entry->mac),
+ hadmin_entry->vlan, vnic_id);
+ goto out;
+ }
+
+ /* add: if vNic found - report error, otherwise add new vNic */
+ if (vnic) {
+ /* skip error reporting for conflicts between child vNics,
+ * as vnic_learn_mac() may learn the same child while it's still
+ * pending. TODO: improve this to avoid such cases.
+ */
+ if (hadmin_entry->parent_used && vnic->parent_used)
+ goto out;
+ vnic_warn(gw->discover->name, "vNic creation failed, duplicate"
+ " vNic detected (name:%s mac:"MAC_6_PRINT_FMT
+ " vlan:%d id:%d & existing name:%s mac:"
+ MAC_6_PRINT_FMT" vlan:%d id:%d)\n",
+ hadmin_entry->interface_name,
+ MAC_6_PRINT_ARG(hadmin_entry->mac),
+ hadmin_entry->vlan, vnic_id, vnic->interface_name,
+ MAC_6_PRINT_ARG(vnic->login_data.mac),
+ vnic->login_data.vlan, vnic->login_data.vnic_id);
+ goto out;
+ }
+
+#if 0
+ /* if the GW is in all_vlan mode,
+ * the host can only create vlans in this mode.
+ * However if it is not in all_vlan mode, the host must not create
+ * vlans in this mode */
+ if ((gw->info.all_vlan_gw && !hadmin_entry->all_vlan_gw
+ && hadmin_entry->vlan_used) ||
+ (!gw->info.all_vlan_gw && hadmin_entry->all_vlan_gw)) {
+ vnic_warn(gw->discover->name, "vnic creation failed, all_vlan"
+ " gateway policy must be enforced between the gateway"
+ " and the host\n");
+ rc = -EINVAL;
+ goto out;
+ }
+#endif
+
+ vnic = fip_vnic_alloc(gw->discover->port, gw, 1 /* hadmin */, vnic_id);
+ if (!vnic) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* hand over info from hadmin to vnic struct */
+ memcpy(vnic->login_data.mac, hadmin_entry->mac, sizeof(vnic->login_data.mac));
+ memcpy(vnic->interface_name, hadmin_entry->interface_name,
+ sizeof(vnic->interface_name));
+ vnic->login_data.vlan = hadmin_entry->vlan;
+ vnic->login_data.vp = hadmin_entry->vlan_used;
+ vnic->login_data.all_vlan_gw = hadmin_entry->all_vlan_gw;
+ memcpy(vnic->shared_vnic.ip, hadmin_entry->shared_vnic_ip,
+ sizeof(vnic->shared_vnic.ip));
+ memcpy(vnic->shared_vnic.emac, hadmin_entry->shared_vnic_mac,
+ sizeof(vnic->shared_vnic.emac));
+ vnic->shared_vnic.enabled = is_valid_ipv4(hadmin_entry->shared_vnic_ip);
+ vnic->vnic_id = vnic_id; /* will be overwritten later */
+ vnic->vlan_used = hadmin_entry->vlan_used;
+ vnic->parent_used = hadmin_entry->parent_used;
+ memcpy(vnic->parent_name, hadmin_entry->parent_name,
+ sizeof(vnic->parent_name));
+ vnic->qp_base_num = hadmin_entry->qp_base_num;
+ vnic->vlan = hadmin_entry->vlan;
+ vnic->cmd = hadmin_entry->cmd;
+ vnic->all_vlan_gw = hadmin_entry->all_vlan_gw;
+
+ /* create dentry */
+ rc = vnic_create_hadmin_dentry(vnic);
+ if (rc)
+ goto init_failed;
+
+ rc = fip_vnic_hadmin_init(gw->discover->port, vnic);
+ if (rc)
+ goto init_failed;
+
+ list_add_tail(&vnic->gw_vnics, &gw->vnic_list);
+
+ /* calls fip_vnic_fsm() */
+ fip_vnic_fsm(&vnic->vnic_task.work);
+
+ return 0;
+
+init_failed:
+ vnic_delete_hadmin_dentry(vnic);
+ kfree(vnic);
+out:
+ return rc;
+}
+
+/*
+ * Queue the GW for deletion and trigger a delayed call to the cleanup
+ * function.
+ * Note: This deletion method ensures that all pending GW work requests
+ * are cleared without dependency on the calling context.
+*/
+void fip_close_gw(struct fip_gw_data *gw, enum fip_flush flush)
+{
+ enum fip_flush tmp_flush = gw->hadmin_gw ? flush : FIP_FULL_FLUSH;
+
+ if (tmp_flush == FIP_PARTIAL_FLUSH && gw->state < FIP_GW_HOST_ADMIN)
+ return;
+
+ /* close already in progress, disregard */
+ if (gw->flush >= tmp_flush)
+ return;
+
+ gw->flush = tmp_flush;
+ gw->info.gw_num_vnics = 0;
+ cancel_delayed_work(&gw->gw_task);
+
+ /* This is not mandatory but will save us time because there is a
+ * better chance that all vnics would be destroyed before trying to
+ * destroy the GW */
+ fip_close_all_vnics(gw, tmp_flush);
+
+ /* calls fip_purge_gws() */
+ queue_delayed_work(fip_wq, &gw->discover->cleanup_task, DELAYED_WORK_CLEANUP_JIFFS);
+}
+
+/*
+ * Free GW resources. This includes destroying the vnics. If the GW can be
+ * totally destroyed (no pending work for the GW and all the vnics have been
+ * destroyed) the GW will be removed from the GWs list and its memory
+ * freed. If the GW can not be closed at this time it will not be freed
+ * and the function will return an error.
+ * In this case the caller needs to recall the function to complete the
+ * operation.
+ * Do not call this function directly, use fip_close_gw instead.
+ */
+static int fip_free_gw(struct fip_discover *discover, struct fip_gw_data *gw)
+{
+ struct fip_vnic_data *vnic;
+ int vnic_close_fail = 0;
+
+ gw->info.gw_num_vnics = 0;
+
+ if (delayed_work_pending(&gw->gw_task))
+ return -EBUSY;
+
+ list_for_each_entry(vnic, &gw->vnic_list, gw_vnics)
+ vnic_close_fail |= (vnic->flush != FIP_NO_FLUSH);
+
+ /* true if vnics need to be closed */
+ /* if some of the vnics are still open return and retry later */
+ if (vnic_close_fail)
+ return -EBUSY;
+
+ if (delayed_work_pending(&gw->vnic_cleanup_task))
+ return -EBUSY;
+
+ /*
+ * it is possible that during gw removal we added the GW again. Test GW
+ * list to ensure it is not in the list already before adding it again.
+ */
+ if (gw->state > FIP_GW_HOST_ADMIN) {
+ if (gw->info.gw_prot_new)
+ discover->new_prot_gws--;
+ else
+ discover->old_prot_gws--;
+ }
+ if (gw->flush == FIP_PARTIAL_FLUSH) {
+ gw->state = FIP_GW_HOST_ADMIN;
+ gw->flush = FIP_NO_FLUSH;
+ } else {
+ list_del(&gw->list);
+ if (!IS_ERR(gw->pquery) && gw->query_id >= 0)
+ ib_sa_cancel_query(gw->query_id, gw->pquery);
+ wait_for_completion(&gw->query_comp);
+ kfree(gw);
+ }
+ return 0;
+}
+
+/*
+ * Permanently delete all GWs pending delete. The function goes over
+ * the list of GWs awaiting deletion and tries to delete them. If the
+ * GW destructor returns an error value (currently busy) the function
+ * will requeue itself for another try.
+ */
+static void fip_purge_gws(struct work_struct *work)
+{
+ struct fip_discover *discover =
+ container_of(work, struct fip_discover, cleanup_task.work);
+ struct fip_gw_data *gw, *tmp_gw;
+ int gw_close_fail = 0;
+
+ down_write(&discover->l_rwsem);
+ list_for_each_entry_safe(gw, tmp_gw, &discover->gw_list, list) {
+ if (gw->flush != FIP_NO_FLUSH) {
+ gw_close_fail |= fip_free_gw(discover, gw);
+ }
+ }
+ up_write(&discover->l_rwsem);
+
+ /* This means we still have GWs that refuse to close, retry later */
+ if (gw_close_fail) {
+ vnic_dbg_fip(discover->name, "still have open GWs\n");
+ /* calls fip_purge_gws() */
+ queue_delayed_work(fip_wq, &discover->cleanup_task,
+ DELAYED_WORK_CLEANUP_JIFFS);
+ } else {
+ vnic_dbg_fip(discover->name, "fip_purge_gws all gws"
+ " closed and freed\n");
+ }
+}
+
+static int fip_free_gw_done(struct fip_discover *discover, enum fip_flush flush)
+{
+ struct fip_gw_data *curr_gw;
+ int rc;
+
+ down_read(&discover->l_rwsem);
+ if (flush == FIP_FULL_FLUSH) {
+ rc = list_empty(&discover->gw_list);
+ up_read(&discover->l_rwsem);
+ return rc;
+ }
+
+ list_for_each_entry(curr_gw, &discover->gw_list, list) {
+ if (curr_gw->flush != FIP_NO_FLUSH) {
+ up_read(&discover->l_rwsem);
+ return 0;
+ }
+ }
+
+ up_read(&discover->l_rwsem);
+ return 1;
+}
+
+/*
+ * Go over the GW list and try to close the GWs. It is possible that some
+ * of the GWs have pending work and therefore can not be closed. We can not
+ * sleep on this because we might be running on the same context as the one
+ * we are waiting for. The user should call this function once and then test
+ * whether the free is done by polling fip_free_gw_done (the wq context must
+ * be released between polls).
+ */
+static int fip_free_gw_list(struct fip_discover *discover, enum fip_flush flush)
+{
+ struct fip_gw_data *curr_gw;
+
+ down_read(&discover->l_rwsem);
+ list_for_each_entry(curr_gw, &discover->gw_list, list)
+ fip_close_gw(curr_gw, flush);
+ up_read(&discover->l_rwsem);
+
+ vnic_dbg_fip(discover->name, "fip_free_gw_list not done\n");
+ return 0;
+}
+
+static inline void update_gw_address(struct fip_gw_data *gw,
+ struct fip_gw_data_info *new_gw_data)
+{
+ gw->info.gw_qpn = new_gw_data->gw_qpn;
+ gw->info.gw_lid = new_gw_data->gw_lid;
+ gw->info.gw_port_id = new_gw_data->gw_port_id;
+ gw->info.gw_sl = new_gw_data->gw_sl;
+ memcpy(gw->info.gw_guid, new_gw_data->gw_guid, sizeof gw->info.gw_guid);
+
+ vnic_dbg_fip(gw->discover->name, "GW address was modified. "
+ "QPN: 0x%x, LID: 0x%x, guid: " GUID_FORMAT
+ "port id: %d, SL: %d\n", gw->info.gw_qpn,
+ gw->info.gw_lid, GUID_ARG(gw->info.gw_guid),
+ gw->info.gw_port_id, gw->info.gw_sl);
+ /* restart fsm to path query */
+ if (vnic_sa_query)
+ fip_discover_gw_fsm_move(gw, FIP_GW_CTRL_PATH_QUERY);
+}
+
+int fip_gw_modified(struct fip_gw_data *gw,
+ struct fip_gw_data_info *new_gw_data)
+{
+ char *name = gw->discover->name;
+ ASSERT(new_gw_data);
+
+ vnic_dbg_fip(name, "fip_gw_modified called, gw_num_vnics %d -> %d\n",
+ gw->info.gw_num_vnics, new_gw_data->gw_num_vnics);
+
+ if (memcmp(gw->info.gw_guid, new_gw_data->gw_guid,
+ sizeof(gw->info.gw_guid)) ||
+ gw->info.gw_lid != new_gw_data->gw_lid ||
+ gw->info.gw_port_id != new_gw_data->gw_port_id ||
+ gw->info.gw_qpn != new_gw_data->gw_qpn ||
+ (!vnic_sa_query && gw->info.gw_sl != new_gw_data->gw_sl)) {
+ /* TODO: Make sure that the GW doesn't change the sl sent in solicitation */
+ /* In this case the GW address might be modified even
+ in 'good flow' */
+ if (gw->info.gw_type == GW_TYPE_LAG &&
+ gw->info.ext_lag.ucast)
+ update_gw_address(gw, new_gw_data);
+ else {
+ vnic_dbg_fip(name, "fip_gw_modified changing "
+ "unsupported parameter closing GW\n");
+ fip_close_gw(gw, FIP_PARTIAL_FLUSH);
+ }
+ } else if (gw->info.gw_num_vnics < new_gw_data->gw_num_vnics) {
+ vnic_dbg_fip(name, "fip_gw_modified changing num "
+ "vnics from %d to %d\n", gw->info.gw_num_vnics,
+ new_gw_data->gw_num_vnics);
+ gw->info.gw_num_vnics = new_gw_data->gw_num_vnics;
+ if (fip_gw_create_vnics(gw))
+ vnic_err(name, "fip_gw_create_vnics failed\n");
+
+ } else if (gw->info.gw_num_vnics > new_gw_data->gw_num_vnics) {
+ gw->info.gw_num_vnics = new_gw_data->gw_num_vnics;
+ fip_gw_close_nonopen_vnics(gw);
+ if (gw->vnic_count < gw->info.gw_num_vnics)
+ fip_gw_create_vnics(gw);
+ vnic_dbg_fip(name, "fip_gw_modified changing num "
+ "vnics from %d to %d\n", gw->info.gw_num_vnics,
+ new_gw_data->gw_num_vnics);
+ } else if (gw->info.n_rss_qpn != new_gw_data->n_rss_qpn) {
+ gw->info.n_rss_qpn = new_gw_data->n_rss_qpn;
+ vnic_dbg_fip(name, "fip_gw_modified changing n_rss_qpn "
+ "from %d to %d\n", gw->info.n_rss_qpn,
+ new_gw_data->n_rss_qpn);
+ } else if (gw->info.hadmined_en != new_gw_data->hadmined_en) {
+ if (fip_gw_create_vnics(gw))
+ vnic_err(name, "fip_gw_create_vnics failed\n");
+ }
+
+ return 0;
+}
+
+static inline int is_none_zero_guid(u8 *guid)
+{
+ int i;
+ u8 ored = 0;
+
+ if (!guid)
+ return 0;
+
+ for (i = 0; i < 8; ++i)
+ ored |= guid[i];
+
+ return !!ored;
+}
+
+/*
+ * Look for a GW in the GW list.
+ * The search needs one identifier to identify the Box (either GUID or system name)
+ * and one identifier for the external port (port_id or eport_name).
+ * This function uses whatever data is available for the search since
+ * various callers do not have access to a single pair of ids.
+ * Use NULL for unknown strings and GW_PORT_ID_UNKNOWN for unknown port_id.
+ * GWs that are undergoing a complete flush are disregarded by the search.
+ */
+struct fip_gw_data *fip_find_gw_in_list(
+ struct fip_discover *discover,
+ int port_id,
+ u8 *eport_name,
+ u8 *gw_guid,
+ u8 *system_guid,
+ u8 *system_name,
+ int is_login)
+{
+ struct fip_gw_data *curr_gw;
+ int use_guid = is_none_zero_guid(gw_guid);
+ int use_system_name = system_name && strlen(system_name) > 0;
+ int use_system_guid = is_none_zero_guid(system_guid);
+ int use_eport = eport_name && strlen(eport_name) > 0;
+ int use_port_id = port_id >= 0;
+ int port_id_pass;
+ int eport_match;
+
+ if(!((use_eport || use_port_id) &&
+ (use_guid || use_system_name || use_system_guid))) {
+ vnic_dbg_fip_v(discover->name,
+ "fip_find_gw_in_list not enough param for search\n");
+ return NULL;
+ }
+
+ if (use_system_name)
+ vnic_dbg_fip_v(discover->name, "system name %s\n", system_name);
+
+ if (use_guid)
+ vnic_dbg_fip_v(discover->name, "gw guid "VNIC_GUID_FMT"\n",
+ VNIC_GUID_RAW_ARG(gw_guid));
+
+ if (use_system_guid)
+ vnic_dbg_fip_v(discover->name, "system guid "VNIC_GUID_FMT"\n",
+ VNIC_GUID_RAW_ARG(system_guid));
+
+ if (use_eport)
+ vnic_dbg_fip_v(discover->name, "eport %s\n", eport_name);
+
+ if (use_port_id)
+ vnic_dbg_fip_v(discover->name, "port_id 0x%x\n", port_id);
+
+ down_read(&discover->l_rwsem);
+ list_for_each_entry(curr_gw, &discover->gw_list, list) {
+ vnic_dbg_fip_v(discover->name, "check gw on eport %s, gw_guid "VNIC_GUID_FMT" "
+ "system_guid "VNIC_GUID_FMT", flush %d\n",
+ curr_gw->info.vol_info.gw_port_name,
+ VNIC_GUID_RAW_ARG(curr_gw->info.gw_guid),
+ VNIC_GUID_RAW_ARG(curr_gw->info.vol_info.system_guid),
+ curr_gw->flush);
+
+ if (curr_gw->flush == FIP_FULL_FLUSH)
+ continue;
+
+ /* for login ack, skip non connected GWs */
+ if (is_login && use_port_id && curr_gw->state == FIP_GW_HOST_ADMIN) /* skip dangling hadmined GWs */
+ continue;
+
+ /* use the eport names only if you don't have port_id indexes.
+ * This is in order to enable port_id changes.
+ * In case of a host admin GW, ignore gw_port_id since the old GW
+ * will never be flushed and the new GW id can change */
+ port_id_pass = use_port_id && (curr_gw->info.gw_port_id != (u16)-1) && !(curr_gw->hadmin_gw && use_eport);
+ eport_match = (use_eport && !port_id_pass &&
+ !strncmp(curr_gw->info.vol_info.gw_port_name,
+ eport_name,VNIC_GW_PORT_NAME_LEN)) ||
+ (port_id_pass && (port_id == curr_gw->info.gw_port_id));
+ if (!eport_match)
+ continue;
+
+ if (use_guid && !memcmp(curr_gw->info.gw_guid, gw_guid, GUID_LEN))
+ goto found;
+
+ if (use_system_guid &&
+ !memcmp(curr_gw->info.vol_info.system_guid,
+ system_guid, GUID_LEN))
+ goto found;
+
+ if(use_system_name &&
+ !strncmp(curr_gw->info.vol_info.system_name, system_name,
+ VNIC_SYSTEM_NAME_LEN))
+ goto found;
+ }
+
+ up_read(&discover->l_rwsem);
+ vnic_dbg_fip(discover->name, "gw not found!\n");
+ return NULL;
+found:
+ if (curr_gw->hadmin_gw && use_eport && use_port_id &&
+ !strncmp(curr_gw->info.vol_info.gw_port_name,eport_name,VNIC_GW_PORT_NAME_LEN) &&
+ curr_gw->info.gw_port_id != port_id) {
+ vnic_info("%s:["VNIC_GUID_FMT"] %s eport ID changed from %d to %d\n",
+ curr_gw->info.vol_info.system_name,
+ VNIC_GUID_RAW_ARG(curr_gw->info.vol_info.system_guid),
+ curr_gw->info.vol_info.gw_port_name,
+ curr_gw->info.gw_port_id, port_id);
+ }
+
+ up_read(&discover->l_rwsem);
+ return curr_gw;
+}
+
+/*
+ * Alloc and init a new GW struct
+ */
+static struct fip_gw_data *fip_discover_create_gw(struct fip_discover *discover)
+{
+ struct fip_gw_data *gw_data;
+
+ gw_data = kzalloc(sizeof(struct fip_gw_data), GFP_KERNEL);
+ if (!gw_data)
+ goto out;
+
+ INIT_DELAYED_WORK(&gw_data->gw_task, fip_discover_gw_fsm);
+ INIT_DELAYED_WORK(&gw_data->vnic_cleanup_task, fip_purge_vnics);
+ INIT_LIST_HEAD(&gw_data->vnic_list);
+ gw_data->discover = discover;
+ gw_data->pquery = ERR_PTR(-ENODATA);
+ gw_data->query_id = -1;
+ init_completion(&gw_data->query_comp);
+ complete(&gw_data->query_comp);
+ mutex_init(&gw_data->mlock);
+
+out:
+ return gw_data;
+}
+
+static void fip_discover_hadmin_update(struct work_struct *work)
+{
+ struct fip_discover *discover =
+ container_of(work, struct fip_discover,
+ hadmin_update_task.work);
+ struct fip_hadmin_cache *hadmin_entry;
+ struct fip_hadmin_cache *hadmin_tmp;
+ struct fip_gw_data *curr_gw;
+ struct list_head hadmin_head;
+ char *name;
+ int flush, used_guid, rc;
+
+ /* move list from hadmin_cache to a temporary list */
+ spin_lock_irq(&discover->lock);
+ list_replace(&discover->hadmin_cache, &hadmin_head);
+ INIT_LIST_HEAD(&discover->hadmin_cache);
+ flush = discover->flush;
+ spin_unlock_irq(&discover->lock);
+
+ if (flush != FIP_NO_FLUSH)
+ goto out;
+
+ /* process hadmin list */
+ list_for_each_entry_safe(hadmin_entry, hadmin_tmp, &hadmin_head, next) {
+ name = (char *)(hadmin_entry->interface_name);
+ vnic_dbg_mac(name, "parent_used %d, remove %d\n",
+ hadmin_entry->parent_used,
+ hadmin_entry->remove);
+ if (hadmin_entry->parent_used) {
+ rc = vnic_parent_update(discover->port, hadmin_entry->interface_name,
+ hadmin_entry->vnic_id, hadmin_entry->mac,
+ &(hadmin_entry->qp_base_num),
+ hadmin_entry->parent_name,
+ hadmin_entry->remove);
+ if (rc)
+ continue;
+ }
+
+ used_guid = is_valid_guid(hadmin_entry->system_guid);
+ curr_gw = fip_find_gw_in_list(discover, NOT_AVAILABLE_NUM,
+ hadmin_entry->eport_name,
+ NULL,
+ used_guid ? hadmin_entry->system_guid : NULL,
+ used_guid ? NULL : hadmin_entry->system_name, 0/* is_login */);
+ if (!hadmin_entry->remove) {
+ /* in case no GW or GW is being removed create a new one */
+ if (!curr_gw || curr_gw->flush == FIP_FULL_FLUSH) {
+ curr_gw = fip_discover_create_gw(discover);
+ if (!curr_gw) {
+ vnic_warn(discover->name, "failed to create hadmin GW\n");
+ continue;
+ } else {
+ down_write(&discover->l_rwsem);
+ list_add_tail(&curr_gw->list, &discover->gw_list);
+ up_write(&discover->l_rwsem);
+ }
+
+ memcpy(curr_gw->info.vol_info.system_guid,
+ hadmin_entry->system_guid, GUID_LEN);
+ memcpy(curr_gw->info.vol_info.gw_port_name,
+ hadmin_entry->eport_name,
+ VNIC_GW_PORT_NAME_LEN);
+ if (used_guid)
+ strcpy(curr_gw->info.vol_info.system_name,
+ NOT_AVAILABLE_STRING);
+ else
+ memcpy(curr_gw->info.vol_info.system_name,
+ hadmin_entry->system_name,
+ VNIC_SYSTEM_NAME_LEN);
+
+ curr_gw->info.gw_port_id = hadmin_entry->gw_port_id;
+ curr_gw->state = FIP_GW_HOST_ADMIN;
+ }
+
+ curr_gw->hadmin_gw = 1;
+ fip_gw_update_hadmin_gw(curr_gw, hadmin_entry);
+ } else if(curr_gw)
+ fip_gw_update_hadmin_gw(curr_gw, hadmin_entry);
+
+ list_del(&hadmin_entry->next);
+ kfree(hadmin_entry);
+ }
+
+out:
+ /* flush hadmin_tmp list and exit */
+ list_for_each_entry_safe(hadmin_entry, hadmin_tmp, &hadmin_head, next)
+ kfree(hadmin_entry);
+}
+
+static const char *gw_state_to_str(enum fip_gw_state state)
+{
+ switch (state) {
+ case FIP_GW_CONNECTED:
+ return "FIP_GW_CONNECTED";
+ case FIP_GW_CTRL_PATH_QUERY:
+ return "FIP_GW_CTRL_PATH_QUERY";
+ case FIP_GW_DATA_PATH_QUERY:
+ return "FIP_GW_DATA_PATH_QUERY";
+ case FIP_GW_HOST_ADMIN:
+ return "FIP_GW_HOST_ADMIN";
+ case FIP_GW_SEND_SOLICIT:
+ return "FIP_GW_SEND_SOLICIT";
+ default:
+ return "UNKNOWN";
+ }
+}
+
+int fip_gw_sysfs_show(struct vnic_port *port, char *buf)
+{
+ struct fip_gw_data *gw;
+ char *p = buf;
+ struct fip_discover *discover;
+
+ mutex_lock(&port->start_stop_lock);
+ list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+
+ down_read(&discover->l_rwsem);
+
+ list_for_each_entry(gw, &discover->gw_list, list) {
+ p += _sprintf(p, buf, "IOA_PORT %s:%d\n",
+ gw->discover->port->dev->ca->name,
+ gw->discover->port->num);
+ p += _sprintf(p, buf, "BX_NAME %s\n",
+ gw->info.vol_info.system_name);
+ if (!(*(u64 *)(gw->info.vol_info.system_guid)))
+ p += _sprintf(p, buf, "BX_GUID %s\n", NOT_AVAILABLE_STRING);
+ else
+ p += _sprintf(p, buf, "BX_GUID "VNIC_GUID_FMT"\n",
+ VNIC_GUID_RAW_ARG(gw->info.vol_info.system_guid));
+ p += _sprintf(p, buf, "EPORT_NAME %s\n", gw->info.vol_info.gw_port_name);
+ p += _sprintf(p, buf, "EPORT_ID %u\n", gw->info.gw_port_id);
+ p += _sprintf(p, buf, "STATE %s\n", gw_state_to_str(gw->state));
+ p += _sprintf(p, buf, "GW_TYPE %s\n", gw->info.gw_type == GW_TYPE_LAG ?
+ "AGGREGATED" : "LEGACY");
+ p += _sprintf(p, buf, "PKEY 0x%x\n", discover->pkey);
+ p += _sprintf(p, buf, "ALL_VLAN %s\n",
+ gw->state == FIP_GW_CONNECTED ?
+ (gw->info.all_vlan_gw ? "yes" : "no") : NOT_AVAILABLE_STRING);
+ p += _sprintf(p, buf, "CTRL_SL %d\n", gw->ctrl_prec.sl);
+ p += _sprintf(p, buf, "DATA_SL %d\n", gw->data_prec.sl);
+ p += _sprintf(p, buf, "\n");
+ }
+
+ up_read(&discover->l_rwsem);
+ }
+
+ mutex_unlock(&port->start_stop_lock);
+ return (p - buf);
+}
+
+static int fip_discover_rx_advertise_bh(struct fip_discover *discover,
+ struct fip_gw_data *advertise_data)
+{
+ struct fip_gw_data *gw_data;
+ int update_entry = 0;
+
+ /* see if we received advertise packets from this GW before */
+ gw_data = fip_find_gw_in_list(discover,
+ advertise_data->info.gw_port_id,
+ advertise_data->info.vol_info.gw_port_name,
+ advertise_data->info.gw_guid,
+ advertise_data->info.vol_info.system_guid,
+ advertise_data->info.vol_info.system_name, 0/* is_login */);
+
+ /*
+ * GW not found in GW list. Create a new GW structure
+ * and add it to the GW list.
+ */
+ if (!gw_data) {
+ gw_data = fip_discover_create_gw(discover);
+ if (!gw_data) {
+ vnic_dbg_fip(discover->name, "Could not create gw\n");
+ return -ENOMEM;
+ }
+ gw_data->keep_alive_jiffies = jiffies;
+
+ down_write(&discover->l_rwsem);
+ list_add_tail(&gw_data->list, &discover->gw_list);
+ up_write(&discover->l_rwsem);
+ update_entry = 1;
+ } else {
+ gw_data->keep_alive_jiffies = jiffies;
+ vnic_dbg_fip(discover->name, "gw_data->flush %d\n", gw_data->flush);
+ if (gw_data->flush != FIP_NO_FLUSH)
+ return 0;
+
+ if (gw_data->state <= FIP_GW_SEND_SOLICIT)
+ update_entry = 1;
+ }
+
+ /* If GW is in multicast state (based on received mcast packet),
+ * replace it with the newer up-to-date packet info.
+ */
+ if (update_entry) {
+ if (gw_data->state < FIP_GW_CTRL_PATH_QUERY) {
+ down_write(&discover->l_rwsem);
+ if (advertise_data->info.gw_prot_new)
+ discover->new_prot_gws++;
+ else
+ discover->old_prot_gws++;
+ up_write(&discover->l_rwsem);
+ }
+ memcpy(&gw_data->info, &advertise_data->info,
+ sizeof(struct fip_gw_data_info));
+ if (gw_data->state < FIP_GW_SEND_SOLICIT)
+ gw_data->state = vnic_sa_query? FIP_GW_CTRL_PATH_QUERY : FIP_GW_SEND_SOLICIT;
+ } else {
+ /* If the pc_id in the adv doesn't match the one
+ saved - there was a power cycle, so we want to close
+ the GW */
+ if (advertise_data->info.ext_pc_id.valid &&
+ (advertise_data->info.ext_pc_id.power_cycle_id !=
+ gw_data->info.ext_pc_id.power_cycle_id)) {
+ vnic_dbg_fip_p0(discover->name, "received advertisement with "
+ "pc_id %llu when expecting %llu. closing the GW",
+ advertise_data->info.ext_pc_id.power_cycle_id,
+ gw_data->info.ext_pc_id.power_cycle_id);
+ fip_close_gw(gw_data, FIP_PARTIAL_FLUSH);
+ goto no_repost;
+ }
+
+ /* TBD: enforce discard ?? */
+ if (gw_data->info.gw_type != advertise_data->info.gw_type)
+ vnic_dbg_fip_p0(discover->name, "gateway type must not change\n");
+
+ /* update GW descriptors that do not require additional processing.
+ These will be updated as part of GW_MODIFY flow */
+ mutex_lock(&gw_data->mlock);
+ if (advertise_data->info.ext_pc_id.valid)
+ memcpy(&gw_data->info.ext_pc_id, &advertise_data->info.ext_pc_id,
+ sizeof(gw_data->info.ext_pc_id));
+
+ memcpy(&gw_data->info.vol_info, &advertise_data->info.vol_info,
+ sizeof(gw_data->info.vol_info));
+ if (gw_data->info.ext_lag.valid) {
+ gw_data->info.ext_lag.hash = advertise_data->info.ext_lag.hash;
+ gw_data->info.ext_lag.ca = advertise_data->info.ext_lag.ca;
+ gw_data->info.ext_lag.ca_thresh = advertise_data->info.ext_lag.ca_thresh;
+ gw_data->info.ext_lag.weights_policy = advertise_data->info.ext_lag.weights_policy;
+ }
+ mutex_unlock(&gw_data->mlock);
+ }
+
+ /* if multicast advertisement received */
+ if (advertise_data->info.flags & FIP_RCV_MULTICAST) {
+ vnic_dbg_fip(discover->name, "FIP_RCV_MULTICAST ADVERTISE, state %d\n",
+ gw_data->state);
+ /* we are beyond accepting mcast advertisement */
+ if (gw_data->state > FIP_GW_SEND_SOLICIT)
+ goto out;
+
+ vnic_dbg_fip(discover->name, "received mcast advertise sending"
+ " ucast solicit to GW qpn %d lid %d flags 0x%x\n",
+ gw_data->info.gw_qpn, gw_data->info.gw_lid,
+ gw_data->info.flags);
+ } else { /* unicast advertisement received */
+ int ack_received = advertise_data->info.flags & FIP_GW_AVAILABLE;
+
+ vnic_dbg_fip(discover->name, "received ucast advertise from GW "
+ "qpn %d lid %d flags 0x%x, ack_received %s "
+ "gw_num_vnics %d gw->state=%d, "
+ VNIC_GUID_FMT"\n",
+ gw_data->info.gw_qpn, gw_data->info.gw_lid,
+ gw_data->info.flags, ack_received ? "yes" : "no",
+ gw_data->info.gw_num_vnics, gw_data->state,
+ VNIC_GUID_RAW_ARG(gw_data->info.gw_guid));
+
+ if (ack_received) {
+ /* if this is first ACK received */
+ switch (gw_data->state) {
+ case FIP_GW_CTRL_PATH_QUERY:
+ /*
+ * in case we are in FIP_GW_CTRL_PATH_QUERY we wait until it completes
+ * to move us to FIP_GW_SEND_SOLICIT
+ */
+ break;
+ case FIP_GW_SEND_SOLICIT:
+ /* in case we received an ack in this state we move to DATA_PATH_QUERY */
+ gw_data->state = vnic_sa_query ? FIP_GW_DATA_PATH_QUERY : FIP_GW_CONNECTED;
+ break;
+ case FIP_GW_CONNECTED:
+ /*
+ * received an ACK and we are connected. we need to
+ * check for changes in GW and apply them if needed
+ */
+ if (!fip_gw_modified(gw_data, &advertise_data->info))
+ gw_data->state = FIP_GW_CONNECTED;
+ goto no_repost;
+ default:
+ break;
+ }
+ } else /* !ack_received */ {
+ fip_close_gw(gw_data, FIP_PARTIAL_FLUSH);
+ goto no_repost;
+ }
+ /*
+ * We don't accept ACKs in transient states.
+ * This should not be a problem since bursts of multiple ACKs
+ * are not an expected flow, and if the packets are similar
+ * (no updates) it doesn't matter anyway.
+ */
+ }
+
+out:
+ vnic_dbg_fip(discover->name, "out gw->state=%d\n", gw_data->state);
+ /*
+ * call the GW FSM directly to handle the new state
+ */
+ cancel_delayed_work(&gw_data->gw_task);
+ fip_discover_gw_fsm(&gw_data->gw_task.work);
+no_repost:
+ return 0;
+}
+
+/*
+ * This function handles a single received packet that is expected to be
+ * a GW advertisement or a login ACK packet. It classifies the packet by
+ * its subcode and decides whether it should be queued for further
+ * processing. This function runs in ka_wq task context.
+ */
+void fip_discover_rx_packet(int *queue, struct fip_content *fc)
+{
+ *queue = 0;
+ switch (fc->fh->subcode) {
+ case FIP_GW_ADV_SUB_OPCODE:
+ case FIP_GW_LOGIN_SUB_OPCODE:
+ *queue = 1;
+ break;
+ default:
+ break;
+ }
+}
+
+/*
+ * Print FIP syndrome number and string
+ */
+static void fip_print_syndrome(struct fip_vnic_data *vnic, int synd)
+{
+ char *syndstr;
+
+ switch (synd) {
+ case FIP_SYNDROM_HADMIN_REJECT:
+ syndstr = "FIP_SYNDROM_HADMIN_REJECT";
+ break;
+ case FIP_SYNDROM_GW_RESRC:
+ syndstr = "FIP_SYNDROM_GW_RESRC";
+ break;
+ case FIP_SYNDROM_NO_NADMIN:
+ syndstr = "FIP_SYNDROM_NO_NADMIN";
+ break;
+ case FIP_SYNDROM_UNRECOGNISED_HOST:
+ syndstr = "FIP_SYNDROM_UNRECOGNISED_HOST";
+ break;
+ case FIP_SYNDROM_UNSUPPORTED_PARAM:
+ syndstr = "FIP_SYNDROM_UNSUPPORTED_PARAM";
+ break;
+ case FIP_SYNDROM_GW_IS_LAG_MEMBER:
+ syndstr = "FIP_SYNDROM_GW_IS_LAG_MEMBER";
+ break;
+ case FIP_SYNDROM_DUPLICATE_ADDRESS:
+ syndstr = "FIP_SYNDROM_DUPLICATE_ADDRESS";
+ break;
+ default:
+ syndstr = "FIP_OTHER";
+ }
+
+ vnic_warn(vnic->name, "SYNDROME 0x%x: %s\n",
+ synd, syndstr);
+}
+
+static void handle_login_packet(struct fip_discover *discover,
+ struct fip_login_data *login_data)
+{
+ struct fip_gw_data *gw;
+ struct fip_vnic_data *vnic;
+ int mac_vlan_refused = 0;
+ int synd;
+
+ /* find the GW that this login belongs to */
+ gw = fip_find_gw_in_list(discover,
+ login_data->port_id,
+ NULL,
+ login_data->guid,
+ NULL, NULL, 1/* is_login */);
+
+ if (!gw){
+ vnic_warn(discover->name,"dropping login ack with vnic_id:%d mac:"MAC_6_PRINT_FMT
+ " BX port_id:%d GUID: "VNIC_GUID_FMT", GW not found!\n",
+ login_data->vnic_id,
+ MAC_6_PRINT_ARG(login_data->mac),
+ login_data->port_id,
+ VNIC_GUID_RAW_ARG(login_data->guid));
+ return;
+ }
+ vnic = fip_vnic_find_in_list(gw, login_data->vnic_id,
+ login_data->mac,
+ login_data->vlan,
+ login_data->vp);
+ if (!vnic){
+ vnic_warn(discover->name,"dropping login ack with vnic_id:%d mac:"MAC_6_PRINT_FMT
+ " BX port_id:%d GUID: "VNIC_GUID_FMT", vnic not found!\n",
+ login_data->vnic_id,
+ MAC_6_PRINT_ARG(login_data->mac),
+ login_data->port_id,
+ VNIC_GUID_RAW_ARG(login_data->guid));
+ return;
+ }
+
+ /*
+ * For host administered vNICs the login and login ack MACs must be
+ * equal and non-zero. The login and login ack must agree on vlan
+ * presence, and if a vlan is present the vlans must be identical.
+ * Otherwise, the request is rejected.
+ */
+ if (vnic->hadmined) {
+ if (!IS_ZERO_MAC(vnic->login_data.mac) &&
+ memcmp(vnic->login_data.mac, login_data->mac, ETH_ALEN)) {
+ vnic_dbg_fip(discover->name, "fip_discover_rx_packet"
+ " host admined mac refused\n");
+ mac_vlan_refused = 1;
+ } else if (vnic->login_data.all_vlan_gw != login_data->all_vlan_gw)
+ vnic_dbg_fip(discover->name,
+ "fip_discover_rx_packet host"
+ " and GW disagree on all_vlan mode\n");
+ /* If the host is not working in all_vlan_gw policy -
+ check the requested vlan against the accepted */
+ else if (!gw->info.all_vlan_gw &&
+ (vnic->login_data.vp != login_data->vp ||
+ (login_data->vp == 1 &&
+ vnic->login_data.vlan != login_data->vlan))) {
+ vnic_dbg_fip(discover->name,
+ "fip_discover_rx_packet host"
+ " admined vlan refused\n");
+ mac_vlan_refused = 1;
+ }
+ }
+
+ /* process a login packet for the specific vnic */
+ synd = (int)login_data->syndrome;
+ if (synd || mac_vlan_refused) {
+ char *vnic_name = vnic->hadmined ?
+ (char *)vnic->interface_name : (char *)vnic->name;
+ /* print syndrome as long as backlog limit is not exceeded */
+ if (vnic->synd_backlog++ >= vnic_synd_backlog)
+ return;
+
+ vnic_warn(discover->name, "%s login failed "
+ "(mac "MAC_6_PRINT_FMT" vlan %d) "
+ "backlog %d/%d\n",
+ vnic_name,
+ MAC_6_PRINT_ARG(vnic->mac_cache),
+ (vnic->vlan_used ? vnic->vlan : -1),
+ vnic->synd_backlog, vnic_synd_backlog);
+
+ if (mac_vlan_refused)
+ vnic_warn(vnic->name, "MAC/VLAN refused\n");
+
+ fip_print_syndrome(vnic, synd);
+
+ if (synd == FIP_SYNDROM_UNRECOGNISED_HOST) {
+ vnic_info("%s %s sending ucast sloicit to Gateway\n",
+ discover->name, vnic_name);
+ if(fip_solicit_send(gw->discover,
+ FIP_DISCOVER_UCAST,
+ gw->info.gw_qpn,
+ gw->info.gw_lid,
+ vnic_gw_ctrl_sl(gw),
+ gw->info.gw_prot_new))
+ vnic_warn(discover->name, "%s Failed to send ucast solicit\n", vnic_name);
+ }
+ } else {
+ vnic->all_vlan_gw = !!((!vnic->hadmined && vnic->gw->info.all_vlan_gw) ||
+ (vnic->hadmined && vnic->login_data.all_vlan_gw));
+ fip_vnic_login_ack_recv(vnic, login_data);
+ }
+}
+
+/*
+ * This function handles a single received packet that is expected to be
+ * a GW advertisement or a login ACK packet. The function first parses the
+ * packet, decides what the packet type is, and then processes the packet
+ * according to its type. This function runs in task context.
+ */
+int fip_discover_rx_packet_bh(struct fip_discover *discover, struct fip_content *fc)
+{
+ struct fip_gw_data *advertise_data = NULL;
+ struct fip_login_data *login_data = NULL;
+ int rc;
+ int ret = 0;
+
+ switch (fc->fh->subcode) {
+ case FIP_GW_ADV_SUB_OPCODE:
+ advertise_data = kzalloc(sizeof *advertise_data, GFP_KERNEL);
+ if (!advertise_data) {
+ vnic_warn(discover->name,
+ "Failed to allocate %Zu bytes",
+ sizeof *advertise_data);
+ return -ENOMEM;
+ }
+
+ rc = fip_advertise_parse_bh(discover, fc, advertise_data);
+ if (!rc)
+ ret = fip_discover_rx_advertise_bh(discover,
+ advertise_data);
+ kfree(advertise_data);
+ break;
+
+ case FIP_GW_LOGIN_SUB_OPCODE:
+ login_data = kzalloc(sizeof *login_data, GFP_KERNEL);
+ if (!login_data) {
+ vnic_warn(discover->name,
+ "Failed to allocate %Zu bytes",
+ sizeof *login_data);
+ return -ENOMEM;
+ }
+
+ rc = fip_login_parse(discover, fc, login_data);
+ if (!rc)
+ handle_login_packet(discover, login_data);
+
+ kfree(login_data);
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * This function is a callback called upon successful join to a
+ * multicast group. The function checks if we have joined + attached
+ * to all required mcast groups and if so moves the discovery FSM to solicit.
+ */
+static void fip_discover_mcast_connect_cb(struct vnic_mcast *mcaste, void *ctx)
+{
+ struct fip_discover *discover = mcaste->priv_data;
+
+ if (mcaste->cur_attached && mcaste->req_attach) {
+ vnic_dbg_parse(discover->name, "attached mask = 0x%lx, req mask = 0x%lx\n",
+ *mcaste->cur_attached, *mcaste->req_attach);
+ if ((*mcaste->cur_attached & *mcaste->req_attach) !=
+ *mcaste->req_attach) {
+ return;
+ }
+ }
+
+ discover->discover_mcast_attached_jiffies = jiffies;
+ set_bit(MCAST_ATTACHED, &discover->discover_mcast_state);
+ /* in the case of a reconnect don't change state or send a solicit
+ * packet
+ */
+ if (discover->state < FIP_DISCOVER_SOLICIT) {
+ vnic_dbg_fip(discover->name, "fip_multicast_connected moved"
+ " state to solicit\n");
+ spin_lock_irq(&discover->lock);
+ if (discover->flush == FIP_NO_FLUSH) {
+ /* delay sending solicit packet by 0-100 mSec */
+ int rand_delay = jiffies % 100; /*get_random_int()*/
+ discover->state = FIP_DISCOVER_SOLICIT;
+ cancel_delayed_work(&discover->fsm_task);
+ /* This is really (rand_delay / 1000) * HZ*/
+ /* calls fip_discover_fsm() */
+ queue_delayed_work(fip_wq, &discover->fsm_task,
+ (rand_delay * HZ) / 1000);
+ }
+ spin_unlock_irq(&discover->lock);
+ }
+ vnic_dbg_fip(discover->name, "discover_mcast_connect_cb done\n");
+}
+
+/*
+ * This function is a callback called upon an mcast detach event.
+ * This event can be triggered by discovery teardown or by an async
+ * event. Currently this code does not participate in the discovery's FSM.
+*/
+void fip_discover_mcast_deattach_cb(struct vnic_mcast *mcast, void *ctx)
+{
+// struct vnic_mcast *mcast_other = ctx;
+ struct fip_discover *discover = mcast->priv_data;
+
+ discover->discover_mcast_detached_jiffies = jiffies;
+ clear_bit(MCAST_ATTACHED, &discover->discover_mcast_state);
+
+ vnic_dbg_fip(NULL, "fip_discover_mcast_deattach_cb\n");
+}
+
+/*
+ * Try to connect to the relevant mcast groups. If one of the mcast joins
+ * fails, the function should be called again to complete the join process
+ * for the mcast groups that were not joined yet.
+ * Note: a successful return of vnic_mcast_join means that the mcast join
+ * started, not that the join completed. Completion of the connection
+ * process is asynchronous and uses a supplied callback.
+ */
+static int fip_discover_mcast_connect(struct fip_discover *discover)
+{
+ struct vnic_mcast *mcaste_disc, *mcaste_sol, *mcaste;
+ int rc;
+
+ mcaste_disc = vnic_mcast_alloc(discover->port, &discover->req_attach, &discover->cur_attached);
+ if (IS_ERR(mcaste_disc))
+ return -EINVAL;
+
+ mcaste_sol = vnic_mcast_alloc(discover->port, &discover->req_attach, &discover->cur_attached);
+ if (IS_ERR(mcaste_sol)) {
+ vnic_mcast_dealloc(mcaste_disc);
+ return -EINVAL;
+ }
+
+ set_bit(FIP_MCAST_DISCOVER, &discover->req_attach);
+ set_bit(FIP_MCAST_SOLICIT, &discover->req_attach);
+
+ mcaste = mcaste_disc;
+ mcaste->priv_data = discover;
+ mcaste->attach_bit_nr = FIP_MCAST_DISCOVER;
+ memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN);
+ memcpy(&mcaste->gid, fip_discover_mgid, GID_LEN);
+ if (discover->pkey != 0xffff)
+ *(u16 *)&mcaste->gid.raw[6] = htons(discover->pkey | 0x8000);
+ memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN);
+ mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+ mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+ mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+ mcaste->attach_cb = fip_discover_mcast_connect_cb;
+ mcaste->detach_cb = fip_discover_mcast_deattach_cb;
+ mcaste->attach_cb_ctx = mcaste_sol;
+ mcaste->detach_cb_ctx = mcaste_sol;
+ mcaste->pkey = discover->pkey;
+ mcaste->qkey = VNIC_FIP_QKEY;
+ mcaste->qp = discover->qp;
+ mcaste->blocking = 0;
+ mcaste->join_state = 1;
+ rc = vnic_mcast_add(&discover->mcast_tree, mcaste);
+ ASSERT(!rc);
+ rc = vnic_mcast_attach(&discover->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */
+ ASSERT(!rc);
+
+ mcaste = mcaste_sol;
+ mcaste->priv_data = discover;
+ mcaste->attach_bit_nr = FIP_MCAST_SOLICIT;
+ memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN);
+ memcpy(&mcaste->gid, fip_solicit_mgid, GID_LEN);
+ if (discover->pkey != 0xffff)
+ *(u16 *)&mcaste->gid.raw[6] = htons(discover->pkey | 0x8000);
+ memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN);
+ mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+ mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+ mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+ mcaste->attach_cb = fip_discover_mcast_connect_cb;
+ mcaste->detach_cb = fip_discover_mcast_deattach_cb;
+ mcaste->attach_cb_ctx = mcaste_disc;
+ mcaste->detach_cb_ctx = mcaste_disc;
+ mcaste->pkey = discover->pkey;
+ mcaste->qkey = VNIC_FIP_QKEY;
+ mcaste->qp = discover->qp;
+ mcaste->blocking = 0;
+ mcaste->join_state = 1;
+ mcaste->sender_only = 1;
+ rc = vnic_mcast_add(&discover->mcast_tree, mcaste);
+ ASSERT(!rc);
+ rc = vnic_mcast_attach(&discover->mcast_tree, mcaste); /* MCAST_SEND_ONLY */
+ ASSERT(!rc);
+
+ return 0;
+}
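+
+/*
+ * Illustrative sketch (not compiled into the driver): how the pkey is folded
+ * into the well-known FIP MGID above. For a non-default pkey, the full
+ * membership bit (0x8000) is set and the result is stored big-endian in
+ * bytes 6..7 of the GID, e.g. pkey 0x0081 becomes 0x8081 at gid.raw[6..7].
+ * The helper name below is hypothetical.
+ */
+#if 0
+static void example_embed_pkey_in_mgid(union ib_gid *gid, u16 pkey)
+{
+ if (pkey != 0xffff)
+ *(u16 *)&gid->raw[6] = htons(pkey | 0x8000);
+}
+#endif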
+
+int fip_discover_mcast_reattach(struct fip_discover *discover,
+ struct vnic_port *port)
+{
+ int flush;
+
+ spin_lock_irq(&discover->lock);
+ flush = discover->flush;
+ spin_unlock_irq(&discover->lock);
+
+ if (flush == FIP_NO_FLUSH &&
+ discover->state > FIP_DISCOVER_INIT) {
+ vnic_tree_mcast_detach(&discover->mcast_tree);
+ vnic_tree_mcast_attach(&discover->mcast_tree);
+ }
+ return 0;
+}
+
+static void fip_discover_ctrl_path_query_complete(
+ int status,
+ struct ib_sa_path_rec *pathrec,
+ void *context)
+{
+ struct fip_gw_data *gw = context;
+ vnic_dbg_fip_p0(gw->discover->name, "fip ctrl path query complete status=%d\n", status);
+ if (!status) {
+ vnic_dbg_fip_p0(gw->discover->name, "fip ctrl path query success srcgid:"VNIC_GUID_FMT" dgid:"VNIC_GUID_FMT"\n",
+ VNIC_GUID_RAW_ARG(pathrec->sgid.raw+8),
+ VNIC_GUID_RAW_ARG(pathrec->dgid.raw+8));
+ gw->ctrl_prec = *pathrec;
+ fip_discover_gw_fsm_move(gw, FIP_GW_SEND_SOLICIT);
+ } else {
+ vnic_dbg_fip_p0(gw->discover->name, "fip ctrl path query FAILED ret=%d\n", status);
+ gw->query_id = -1; /* this will cause a retry */
+ }
+ complete(&gw->query_comp);
+}
+
+static void fip_discover_data_path_query_complete(
+ int status,
+ struct ib_sa_path_rec *pathrec,
+ void *context)
+{
+ struct fip_gw_data *gw = context;
+ vnic_dbg_fip_p0(gw->discover->name, "fip data path query complete status=%d\n", status);
+ if (!status) {
+ struct ib_sa_path_rec old_pathrec;
+ struct fip_vnic_data *vnic;
+ vnic_dbg_fip_p0(gw->discover->name, "fip data path query success srcgid:"VNIC_GUID_FMT" dgid:"VNIC_GUID_FMT"\n",
+ VNIC_GUID_RAW_ARG(pathrec->sgid.raw+8),
+ VNIC_GUID_RAW_ARG(pathrec->dgid.raw+8));
+ old_pathrec = gw->data_prec;
+ gw->data_prec = *pathrec;
+ if (old_pathrec.sl != gw->data_prec.sl) {
+ /* in case of SL change close the vnic to relogin with the new SL */
+ vnic_info("[%s] %s %s Data SL changed from %d to %d\n",
+ gw->info.vol_info.system_name,
+ gw->discover->port->name,
+ gw->info.vol_info.gw_port_name,
+ old_pathrec.sl, gw->data_prec.sl);
+ list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+ if (vnic->flush != FIP_FULL_FLUSH && vnic->state >= FIP_VNIC_LOGIN)
+ fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+ }
+ }
+ fip_discover_gw_fsm_move(gw, FIP_GW_CONNECTED);
+ } else {
+ vnic_dbg_fip_p0(gw->discover->name, "fip data path query FAILED ret=%d\n", status);
+ gw->query_id = -1; /* this will cause a retry */
+ }
+ complete(&gw->query_comp);
+}
+
+static int fip_discover_path_query(struct fip_gw_data *gw, int is_data_sl)
+{
+ ib_sa_comp_mask comp_mask;
+ struct ib_sa_path_rec p_rec;
+ void(*callback)(int status, struct ib_sa_path_rec *resp, void *context);
+
+ vnic_dbg_fip_p0(gw->discover->name, "fip path query %d of GW lid:%d sl=%d GID:"VNIC_GUID_FMT" SID=%llx data_path=%d!\n",
+ gw->query_path_cnt,
+ gw->info.gw_lid,
+ gw->info.gw_sl,
+ VNIC_GUID_RAW_ARG(gw->info.gw_guid),
+ is_data_sl ? EOIB_SERVICE_ID : EOIB_CTRL_SERVICE_ID,
+ is_data_sl);
+
+ comp_mask = IB_SA_PATH_REC_SERVICE_ID |
+ IB_SA_PATH_REC_DGID |
+ IB_SA_PATH_REC_SGID |
+ IB_SA_PATH_REC_REVERSIBLE |
+ IB_SA_PATH_REC_PKEY;
+
+ callback = is_data_sl ? fip_discover_data_path_query_complete : fip_discover_ctrl_path_query_complete;
+ memset(&p_rec, 0, sizeof(p_rec));
+
+ p_rec.service_id = is_data_sl ? cpu_to_be64(EOIB_SERVICE_ID) : cpu_to_be64(EOIB_CTRL_SERVICE_ID);
+ p_rec.sgid = gw->discover->port->gid;
+ /* copy the subnet prefix from source gid */
+ memcpy(p_rec.dgid.raw, p_rec.sgid.raw, 8);
+ /* copy gw dgid */
+ memcpy(p_rec.dgid.raw+8, gw->info.gw_guid,8);
+ p_rec.pkey = cpu_to_be16(gw->discover->pkey);
+ p_rec.reversible = cpu_to_be32(1);
+
+ if (gw->query_id >= 0 && !IS_ERR(gw->pquery) && gw->pquery) {
+ ib_sa_cancel_query(gw->query_id, gw->pquery);
+ return -1; /* retry later */
+ }
+
+ init_completion(&gw->query_comp);
+ gw->query_path_cnt++;
+ gw->query_id = -1;
+ gw->pquery = ERR_PTR(-ENODATA);
+
+ gw->query_id =
+ ib_sa_path_rec_get(&vnic_sa_client,
+ gw->discover->port->dev->ca,
+ gw->discover->port->num,
+ &p_rec,
+ comp_mask,
+ 2000 /*TOUT*/,
+ GFP_KERNEL,
+ callback,
+ gw,
+ &gw->pquery);
+ if (gw->query_id < 0) {
+ complete(&gw->query_comp);
+ vnic_dbg_fip_p0(gw->discover->name, "ib_sa_path_rec_get failed, error %d\n", gw->query_id);
+ gw->pquery = ERR_PTR(-ENODATA);
+ }
+ return gw->query_id;
+}
+
+void fip_discover_gw_fsm_move(struct fip_gw_data *gw, enum fip_gw_state state)
+{
+ cancel_delayed_work(&gw->gw_task);
+ if (gw->pquery && !IS_ERR(gw->pquery) && gw->query_id >= 0)
+ ib_sa_cancel_query(gw->query_id, gw->pquery);
+
+ gw->state = state;
+ gw->query_id = -1;
+ gw->query_path_cnt = 0;
+ queue_delayed_work(fip_wq, &gw->gw_task, 0);
+}
+
+
+static void fip_discover_gw_fsm(struct work_struct *work)
+{
+ struct fip_gw_data *curr_gw =
+ container_of(work, struct fip_gw_data, gw_task.work);
+ unsigned long next_wakeup = curr_gw->info.gw_adv_period;
+ unsigned long rand = jiffies % 100 + 1;
+ int ret;
+
+ if (curr_gw->flush != FIP_NO_FLUSH)
+ return;
+
+ if (test_bit(MCAST_ATTACHED,
+ &curr_gw->discover->discover_mcast_state)) {
+ if (time_after(jiffies, curr_gw->keep_alive_jiffies + next_wakeup)) {
+ if (time_after(jiffies,
+ curr_gw->discover->discover_mcast_attached_jiffies
+ + next_wakeup)) {
+ fip_close_gw(curr_gw, FIP_PARTIAL_FLUSH);
+ return;
+ }
+ }
+ } else {
+ /* close gw if 1 minute has elapsed since mcast detach */
+ if (time_after(jiffies,
+ curr_gw->discover->discover_mcast_detached_jiffies
+ + 60*HZ)) {
+ fip_close_gw(curr_gw, FIP_PARTIAL_FLUSH);
+ return;
+ }
+ }
+
+ switch (curr_gw->state) {
+ case FIP_GW_HOST_ADMIN:
+ break;
+ case FIP_GW_CTRL_PATH_QUERY:
+ if (curr_gw->query_path_cnt && curr_gw->query_id >= 0) {
+ /* PATH query is running */
+ next_wakeup = msecs_to_jiffies(100);
+ break;
+ }
+ ret = fip_discover_path_query(curr_gw, 0/*ctrl SL*/);
+ if (ret < 0)
+ vnic_dbg_fip_p0(curr_gw->discover->name, "Query ctrl path Failed : retry num %d ...\n", curr_gw->query_path_cnt);
+ next_wakeup = msecs_to_jiffies(100);
+ break;
+
+ case FIP_GW_SEND_SOLICIT:
+ curr_gw->query_path_cnt = 0;
+ curr_gw->query_id = -1;
+ curr_gw->pquery = ERR_PTR(-ENODATA);
+ vnic_dbg_fip(curr_gw->discover->name, "DISCOVER_LOGIN FIP_GW_SEND_SOLICIT\n");
+ vnic_dbg_parse(curr_gw->discover->name, "new protocol %d\n", curr_gw->info.gw_prot_new);
+ ret = fip_solicit_send(curr_gw->discover, FIP_DISCOVER_UCAST,
+ curr_gw->info.gw_qpn,
+ curr_gw->info.gw_lid,
+ vnic_gw_ctrl_sl(curr_gw),
+ curr_gw->info.gw_prot_new);
+ if (ret)
+ next_wakeup = (100 + rand * HZ) / 200;
+ else
+ next_wakeup = (100 + rand * HZ) / 25;
+ break;
+
+ case FIP_GW_DATA_PATH_QUERY:
+ if (curr_gw->query_path_cnt && curr_gw->query_id >= 0) {
+ /* PATH query is running */
+ next_wakeup = msecs_to_jiffies(100);
+ break;
+ }
+ ret = fip_discover_path_query(curr_gw, 1/*data SL*/);
+ if (ret < 0)
+ vnic_dbg_fip_p0(curr_gw->discover->name, "Query data path Failed : retry num %d ...\n", curr_gw->query_path_cnt);
+ next_wakeup = msecs_to_jiffies(100);
+ break;
+
+ case FIP_GW_CONNECTED:
+ vnic_dbg_fip(curr_gw->discover->name, "DISCOVER_LOGIN: GW_CONNECTED!!!\n");
+ /* test vnic status */
+ fip_gw_create_vnics(curr_gw);
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ /* go to sleep until timeout. We expect to be awakened by
+ * RX packets and never wake up due to the timeout
+ */
+ cancel_delayed_work(&curr_gw->gw_task);
+ queue_delayed_work(fip_wq, &curr_gw->gw_task, next_wakeup);
+}
+
+static int is_new_solicit_prot(struct fip_discover *discover)
+{
+ vnic_dbg_parse(discover->name, "new gw %d, old gw %d\n",
+ discover->new_prot_gws, discover->old_prot_gws);
+
+ if (!discover->old_prot_gws) {
+ if (!discover->new_prot_gws) {
+ /* mcast solicit sent before any
+ * advertise packets arrive. Use old format.
+ */
+ return 0;
+ } else
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * This is the discover finite state machine that runs the
+ * advertise and solicit packet exchange of the discovery
+ * process.
+ * It is assumed that this function is only called from work queue
+ * task context (for locking)
+ */
+static void fip_discover_fsm(struct work_struct *work)
+{
+ struct fip_discover *discover =
+ container_of(work, struct fip_discover, fsm_task.work);
+ struct vnic_port *port = discover->port;
+ int recall_time = -1, flush = discover->flush;
+
+ /* we got a flush request and we have not performed it yet */
+ if ((flush != FIP_NO_FLUSH) &&
+ discover->state != FIP_DISCOVER_OFF) {
+ vnic_dbg_fip(discover->name, "discover_fsm switching to OFF\n");
+
+ recall_time = DELAYED_WORK_CLEANUP_JIFFS * 2;
+
+
+ if (discover->state != FIP_DISCOVER_CLEAR) {
+ fip_free_gw_list(discover, flush);
+ discover->state = FIP_DISCOVER_CLEAR;
+ }
+
+ /* if there are still open GWs we will test again later */
+ if (!fip_free_gw_done(discover, flush)) {
+ vnic_dbg_fip(discover->name, "fip_free_gw_list not done, recalling \n");
+ goto recall_fsm;
+ }
+
+ if (delayed_work_pending(&discover->cleanup_task))
+ goto recall_fsm;
+
+ vnic_dbg_fip(discover->name, "fip_free_gw_list done \n");
+ vnic_dbg_mark();
+ vnic_mcast_del_all(&discover->mcast_tree);
+ vnic_dbg_mark();
+ discover->state = FIP_DISCOVER_OFF;
+
+ /* signal the unload to continue */
+ complete(&discover->flush_complete);
+ return;
+ }
+
+ if (discover->state == FIP_DISCOVER_OFF)
+ return;
+
+ if (!port->attr.lid) {
+ recall_time = 1 * HZ;
+ goto recall_fsm;
+ }
+
+ switch (discover->state) {
+ int new_prot;
+
+ case FIP_DISCOVER_INIT:
+ vnic_dbg_fip(discover->name, "FIP_DISCOVER_INIT\n");
+ /* in INIT try to join the discover multicast group.
+ * This is a prerequisite for all other progress and
+ * will eventually call fip_discover_mcast_connect_cb()
+ */
+ if (fip_discover_mcast_connect(discover)) {
+ vnic_warn(discover->name, "fip_discover_mcast_connect() "
+ "failed\n");
+ recall_time = 1 * HZ;
+ }
+ break;
+
+ case FIP_DISCOVER_SOLICIT:
+ new_prot = is_new_solicit_prot(discover);
+ vnic_dbg_fip(discover->name, "DISCOVER_SOLICIT\n");
+
+ /* send a multicast solicit of type FIP. If the send is
+ * successful, move to the login state and await advertise
+ * packets. If the TX fails, retry.
+ */
+ fip_solicit_send(discover, FIP_DISCOVER_MCAST, 0, 0, 0, new_prot);
+ recall_time = FIP_RESOLICIT_TIME * HZ;
+
+ break;
+
+ case FIP_DISCOVER_OFF:
+ default:
+ ASSERT(0);
+ break;
+
+ }
+
+recall_fsm:
+ if (recall_time >= 0)
+ queue_delayed_work(fip_wq, &discover->fsm_task, recall_time);
+
+ return;
+}
+
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _FIP_DISCOVER_H
+#define _FIP_DISCOVER_H
+
+#include "vnic.h"
+#include "vnic_fip.h"
+
+/* TODO - rethink this */
+#define FIP_UD_MTU(ib_mtu) (ib_mtu - FIP_ENCAP_LEN - FIP_ETH_HEADER_LEN)
+#define FIP_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES)
+
+#define FIP_MAX_BACKOFF_SECONDS 16
+#define FIP_MAX_VNICS_PER_GW (1 << 9)
+
+#define FIP_TIMEOUT_FACTOR(a) ((a)*5/2)
+
+enum fip_gw_state {
+ FIP_GW_HOST_ADMIN,
+ FIP_GW_CTRL_PATH_QUERY,
+ FIP_GW_SEND_SOLICIT, /* got mcast advertise & ctrl path query. sending solicit */
+ FIP_GW_DATA_PATH_QUERY,
+ FIP_GW_CONNECTED /* we are already connected. do nothing */
+};
+
+
+enum {
+ GW_TYPE_SINGLE_EPORT = 0,
+ GW_TYPE_LAG = 1,
+};
+
+struct gw_ext_boot {
+ int valid;
+ int boot_prio;
+ int timeout;
+};
+
+struct gw_ext_lag {
+ int valid;
+ int hash; /* enum gw_ext_lag_hash_policy */
+ int weights_policy;
+ int member_ka;
+ int ca; /* congestion aware */
+ int ca_thresh;
+ int ucast; /* gw supports unicast keep alives */
+};
+
+
+struct gw_ext_pc_id {
+ int valid;
+ u64 power_cycle_id;
+};
+
+struct fip_gw_data_info {
+ struct fip_gw_volatile_info vol_info;
+ long gw_adv_period; /* timeout in jiffies */
+ long gw_period; /* timeout in jiffies */
+ long vnic_ka_period; /* in jiffies */
+ int flags;
+ u32 gw_qpn;
+ u16 gw_lid;
+ u16 gw_port_id;
+ u16 gw_num_vnics;
+ u16 n_rss_qpn;
+ u8 gw_sl; /* GW ctrl SL */
+ u8 hadmined_en;
+ u8 all_vlan_gw;
+ u8 gw_vendor_id[VNIC_VENDOR_LEN+1];
+ u8 gw_guid[GUID_LEN];
+ int gw_type;
+ int gw_prot_new;
+ int ext_mask;
+ struct gw_ext_boot ext_boot;
+ struct gw_ext_lag ext_lag;
+ struct gw_ext_pc_id ext_pc_id;
+};
+
+struct fip_gw_data {
+ enum fip_flush flush;
+ int hadmin_gw;
+ struct mutex mlock;
+ struct fip_discover *discover;
+ struct list_head list;
+ unsigned long keep_alive_jiffies;
+ enum fip_gw_state state;
+ int vnic_count;
+ struct list_head vnic_list;
+ struct delayed_work gw_task;
+ struct delayed_work vnic_cleanup_task;
+ struct fip_gw_data_info info;
+ unsigned long n_bitmask[(FIP_MAX_VNICS_PER_GW >> 3) /
+ sizeof(unsigned long)];
+
+ struct ib_sa_path_rec ctrl_prec;
+ struct ib_sa_path_rec data_prec;
+ struct ib_sa_query *pquery;
+ int query_path_cnt;
+ int query_id;
+ struct completion query_comp;
+};
+
+enum fip_gw_data_flags {
+ FIP_IS_FIP = 1 << 0, /* protocol type */
+ FIP_RCV_MULTICAST = 1 << 1, /* received mcast packet */
+ FIP_GW_AVAILABLE = 1 << 2, /* GW available bit set in pkt */
+ FIP_HADMINED_VLAN = 1 << 3, /* H bit set in advertise pkt */
+};
+
+static inline u8 vnic_gw_ctrl_sl(struct fip_gw_data *gw)
+{
+ return vnic_sa_query? gw->ctrl_prec.sl : gw->info.gw_sl;
+}
+
+/*
+ * TODO - we can do a nicer job here. stage 2
+ * allocates memory and posts receives
+ */
+int fip_post_discovery_rcv(struct vnic_port *port,
+ int ring_size, struct ib_qp *qp,
+ struct fip_ring *rx_ring);
+
+int fip_discover_mcast_reattach(struct fip_discover *discover,
+ struct vnic_port *port);
+
+/*
+ * This function handles a single received packet that is expected to be
+ * a GW advertisement or a login ACK packet. The function first parses the
+ * packet, decides what the packet type is, and then handles the packet
+ * according to its type. This function runs in task context.
+*/
+void fip_discover_rx_packet(int *queue, struct fip_content *fc);
+int fip_discover_rx_packet_bh(struct fip_discover *discover, struct fip_content *fc);
+
+/*
+ * This function is the RX packet handler entry point at the thread level
+ * (unlike the completion handler that runs from interrupt context).
+ * The function calls a handler function and then reallocates the ring
+ * entry for the next receive.
+*/
+void fip_discover_process_rx(struct fip_discover *discover);
+void fip_discover_process_rx_bh(struct work_struct *work);
+void fip_discover_gw_fsm_move(struct fip_gw_data *gw, enum fip_gw_state state);
+
+/* This function creates an info string from GW attributes published
+ * by the GW in advertisement pkts */
+int fip_get_short_gw_info(struct fip_gw_data *gw, char *buff);
+
+
+int fip_packet_parse(struct vnic_port *port, void *packet, int size,
+ struct fip_content *fc);
+
+#endif /* _FIP_DISCOVER_H */
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+
+#define FIP_OP_RECV (1ul << 31)
+/* TODO - rethink this */
+#define FIP_UD_MTU(ib_mtu) (ib_mtu - FIP_ENCAP_LEN - FIP_ETH_HEADER_LEN)
+#define FIP_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES)
+
+static inline void fip_wr_pepare(struct vnic_port *port,
+ struct ib_send_wr *tx_wr,
+ struct ib_sge *tx_sge,
+ unsigned int wr_id, u64 mapping,
+ int size, u16 pkey_index)
+{
+ /* This is a fixed part */
+ memset(tx_wr, 0, sizeof(struct ib_send_wr));
+ tx_wr->num_sge = 1;
+ tx_wr->sg_list = tx_sge;
+ tx_wr->opcode = IB_WR_SEND;
+ tx_wr->send_flags = IB_SEND_SIGNALED;
+ tx_wr->wr.ud.pkey_index = pkey_index;
+ tx_wr->wr_id = wr_id;
+
+ memset(tx_sge, 0, sizeof(struct ib_sge));
+ tx_sge->lkey = port->mr->lkey;
+ tx_sge->addr = mapping;
+ tx_sge->length = size;
+}
+
+/*
+ * send a single multicast packet.
+ * return 0 on success, non-zero on failure.
+*/
+int fip_mcast_send(struct vnic_port *port,
+ struct ib_qp *qp,
+ unsigned int wr_id,
+ u64 mapping,
+ int size,
+ u16 pkey_index,
+ struct vnic_mcast *mcast)
+{
+ struct ib_send_wr *bad_wr;
+ struct ib_sge tx_sge;
+ struct ib_send_wr tx_wr;
+ int ret;
+
+ fip_wr_pepare(port, &tx_wr, &tx_sge, wr_id, mapping, size, pkey_index);
+
+ tx_wr.wr.ud.ah = mcast->ah;
+ tx_wr.wr.ud.remote_qpn = 0xFFFFFFFF; /*dest_qpn; */
+ tx_wr.wr.ud.remote_qkey = mcast->qkey;
+
+ ret = ib_post_send(qp, &tx_wr, &bad_wr);
+
+ return ret;
+}
+
+/*
+ * send a single unicast packet.
+ * return 0 on success, non-zero on failure.
+ */
+int fip_ucast_send(struct vnic_port *port,
+ struct ib_ah *ah,
+ struct ib_qp *qp,
+ unsigned int wr_id,
+ u64 mapping,
+ int size,
+ u16 pkey_index, u32 dest_qpn, u16 dlid,
+ u32 qkey, u8 sl)
+{
+ struct ib_send_wr *bad_wr;
+ struct ib_ah *new_ah = NULL;
+ struct ib_sge tx_sge;
+ struct ib_send_wr tx_wr;
+ int ret;
+
+ fip_wr_pepare(port, &tx_wr, &tx_sge, wr_id, mapping, size, pkey_index);
+
+ if (!ah) {
+ struct ib_ah_attr ah_attr = {
+ .dlid = dlid,
+ .port_num = port->num,
+ .sl = sl & 0xf,
+ };
+
+ new_ah = ib_create_ah(port->pd, &ah_attr);
+ if (IS_ERR(new_ah))
+ return -1;
+
+ tx_wr.wr.ud.ah = new_ah;
+ } else
+ tx_wr.wr.ud.ah = ah;
+
+ tx_wr.wr.ud.remote_qpn = dest_qpn;
+ tx_wr.wr.ud.remote_qkey = qkey;
+
+ ret = ib_post_send(qp, &tx_wr, &bad_wr);
+
+ if (new_ah)
+ ib_destroy_ah(new_ah);
+
+ return ret;
+}
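+
+/*
+ * Illustrative caller sketch (not compiled into the driver): sending a
+ * unicast FIP frame that was already copied into a mapped TX ring entry.
+ * Passing ah == NULL lets fip_ucast_send() create and destroy a temporary
+ * address handle for the given dlid/sl. The helper name and the way the
+ * ring index doubles as the wr_id are assumptions for illustration only.
+ */
+#if 0
+static int example_ucast_tx(struct vnic_port *port, struct ib_qp *qp,
+ struct fip_ring *tx_ring, int index,
+ u16 pkey_index, u32 dqpn, u16 dlid, u8 sl)
+{
+ return fip_ucast_send(port, NULL, qp, index,
+ tx_ring->ring[index].bus_addr,
+ tx_ring->ring[index].length,
+ pkey_index, dqpn, dlid,
+ VNIC_FIP_QKEY, sl);
+}
+#endif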
+
+/*
+ * This is a general purpose CQ completion function that handles
+ * completions on RX and TX rings. It can serve all users that are
+ * using RX and TX rings.
+ * RX completions are distinguished from TX completions by the MSB of the
+ * wr_id, which is set for RX and clear for TX. For RX, the memory is
+ * synced for the CPU and the head is incremented. For TX, the memory is
+ * unmapped and then freed.
+ * The function returns the number of packets received.
+*/
+int fip_comp(struct vnic_port *port,
+ struct ib_cq *cq,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ char *name)
+{
+#define FIP_DISCOVER_WC_COUNT 4
+ struct ib_wc ibwc[FIP_DISCOVER_WC_COUNT];
+ int wrid, n, i;
+ int mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum);
+ int rx_count = 0;
+ struct ib_device *dev = port->dev->ca;
+
+ do {
+ /*
+ * poll for up to FIP_DISCOVER_WC_COUNT in one request.
+ * returns the number of WC actually polled
+ */
+ n = ib_poll_cq(cq, FIP_DISCOVER_WC_COUNT, ibwc);
+ for (i = 0; i < n; ++i) {
+ /*
+ * use a mask on the id to decide if this is a receive
+ * or transmit WC
+ */
+ if (ibwc[i].wr_id & FIP_OP_RECV) {
+ wrid = ibwc[i].wr_id & ~FIP_OP_RECV;
+
+ ib_dma_sync_single_for_cpu(dev,
+ rx_ring->ring[wrid].bus_addr,
+ mtu_size,
+ DMA_FROM_DEVICE);
+
+ if (likely(ibwc[i].status == IB_WC_SUCCESS)) {
+ rx_ring->ring[wrid].length =
+ ibwc[i].byte_len;
+ rx_count++;
+ } else
+ rx_ring->ring[wrid].entry_posted = 0;
+
+ rx_ring->head++;
+ } else { /* TX completion */
+ unsigned long flags;
+ wrid = ibwc[i].wr_id;
+
+ /* unmap and free transmitted packet */
+ ib_dma_unmap_single(dev,
+ tx_ring->ring[wrid].
+ bus_addr, tx_ring->ring[wrid].length,
+ DMA_TO_DEVICE);
+
+ kfree(tx_ring->ring[wrid].mem);
+ tx_ring->ring[wrid].mem = NULL;
+ tx_ring->ring[wrid].length = 0;
+ spin_lock_irqsave(&tx_ring->head_tail_lock, flags);
+ tx_ring->tail++;
+ spin_unlock_irqrestore(&tx_ring->head_tail_lock, flags);
+ }
+ }
+ } while (n == FIP_DISCOVER_WC_COUNT);
+
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+ return rx_count;
+}
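+
+/*
+ * Illustrative sketch (not compiled into the driver): the wr_id tagging
+ * convention used by fip_comp() above. Receive work requests are posted
+ * with the FIP_OP_RECV bit set (see fip_post_receive() below), so the
+ * shared completion handler can tell RX from TX completions. The helper
+ * name is hypothetical.
+ */
+#if 0
+static inline int example_wrid_is_rx(u64 wr_id)
+{
+ return !!(wr_id & FIP_OP_RECV);
+}
+#endif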
+
+/* configure a newly allocated QP and move it
+ * from RESET->INIT->RTR->RTS
+ */
+int fip_init_qp(struct vnic_port *port, struct ib_qp *qp, u16 pkey_index, char *name)
+{
+ struct ib_qp_attr qp_attr;
+ int attr_mask;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.qkey = VNIC_FIP_QKEY;
+ qp_attr.port_num = port->num;
+ qp_attr.pkey_index = pkey_index;
+ attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE;
+
+ if (ib_modify_qp(qp, &qp_attr, attr_mask))
+ goto out_fail;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ attr_mask &= ~IB_QP_PORT;
+ if (ib_modify_qp(qp, &qp_attr, attr_mask))
+ goto out_fail;
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 0;
+ attr_mask |= IB_QP_SQ_PSN;
+ attr_mask &= ~IB_QP_PKEY_INDEX;
+ if (ib_modify_qp(qp, &qp_attr, attr_mask))
+ goto out_fail;
+
+ return 0;
+
+out_fail:
+ qp_attr.qp_state = IB_QPS_RESET;
+ if (ib_modify_qp(qp, &qp_attr, IB_QP_STATE))
+ vnic_warn(name, "failed to modify QP to RESET state\n");
+
+ return -EINVAL;
+}
+
+void fip_qp_to_reset(struct ib_qp *qp, char *name)
+{
+ struct ib_qp_attr qp_attr;
+
+ qp_attr.qp_state = IB_QPS_RESET;
+ if (ib_modify_qp(qp, &qp_attr, IB_QP_STATE))
+ vnic_warn(name, "Failed to modify QP to RESET state\n");
+ return;
+}
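+
+/*
+ * Illustrative pairing sketch (not compiled into the driver): a typical QP
+ * lifecycle with the two helpers above - bring the QP to RTS with
+ * fip_init_qp() and drop it back to RESET with fip_qp_to_reset() on
+ * teardown, before the rings are flushed and freed. The function name is
+ * hypothetical.
+ */
+#if 0
+static int example_qp_lifecycle(struct vnic_port *port, struct ib_qp *qp,
+ u16 pkey_index, char *name)
+{
+ int rc = fip_init_qp(port, qp, pkey_index, name);
+ if (rc)
+ return rc;
+
+ /* ... post receives and exchange FIP packets ... */
+
+ fip_qp_to_reset(qp, name);
+ return 0;
+}
+#endif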
+
+/*
+ * alloc a single buffer, map it and post it to the qp.
+ * id used to identify entry in receive queue.
+ */
+int fip_post_receive(struct vnic_port *port, struct ib_qp *qp, int size,
+ int _id, struct fip_ring_entry *mem_entry, char *name)
+{
+ struct ib_recv_wr rx_wr, *bad_wr;
+ struct ib_sge rx_sge;
+ int rc;
+
+ rx_wr.wr_id = _id | FIP_OP_RECV;
+ rx_wr.next = NULL;
+ rx_wr.sg_list = &rx_sge;
+ rx_wr.num_sge = 1;
+ rx_sge.addr = mem_entry->bus_addr;
+ rx_sge.length = size;
+ rx_sge.lkey = port->mr->lkey;
+
+ ib_dma_sync_single_for_device(port->dev->ca, rx_sge.addr,
+ FIP_UD_BUF_SIZE(port->max_mtu_enum),
+ DMA_FROM_DEVICE);
+
+ rc = ib_post_recv(qp, &rx_wr, &bad_wr);
+ if (unlikely(rc)) {
+ vnic_warn(name, "post receive failed for buf rc %d (id %d)\n", _id, rc);
+ goto post_recv_failed;
+ }
+ mem_entry->entry_posted = 1;
+ return 0;
+
+post_recv_failed:
+ mem_entry->entry_posted = 0;
+ return -EIO;
+}
+
+void fip_flush_rings(struct vnic_port *port,
+ struct ib_cq *cq,
+ struct ib_qp *qp,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ char *name)
+{
+ vnic_dbg_fip(name, "fip_qp_to_err called\n");
+ if (qp) {
+ fip_qp_to_reset(qp, name);
+ fip_comp(port, cq, rx_ring, tx_ring, name);
+ }
+}
+
+void fip_free_rings(struct vnic_port *port,
+ struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring,
+ char *name)
+{
+ struct ib_device *dev = port->dev->ca;
+ int i;
+
+ for (i = rx_ring->size - 1; i >= 0; --i) {
+ if (rx_ring->ring[i].mem) {
+ ib_dma_unmap_single(dev,
+ rx_ring->ring[i].bus_addr,
+ FIP_UD_BUF_SIZE(port->max_mtu_enum),
+ DMA_FROM_DEVICE);
+ kfree(rx_ring->ring[i].mem);
+ }
+ }
+ rx_ring->size = 0;
+
+ for (i = tx_ring->size - 1; i >= 0; --i)
+ if (tx_ring->ring[i].length != 0) {
+ ib_dma_unmap_single(dev,
+ tx_ring->ring[i].bus_addr,
+ tx_ring->ring[i].length,
+ DMA_TO_DEVICE);
+ kfree(tx_ring->ring[i].mem);
+ }
+ tx_ring->size = 0;
+
+ vnic_dbg_fip(name, "Done cleaning RX and TX queues\n");
+
+ kfree(rx_ring->ring);
+ rx_ring->ring = NULL;
+ kfree(tx_ring->ring);
+ tx_ring->ring = NULL;
+}
+
+/*
+ * TODO - we can do a nicer job here. stage 2
+ * allocates memory and posts receives
+ * TODO2: need to handle the bad flow to free all existing entries in the ring
+ */
+int fip_init_rx(struct vnic_port *port,
+ int ring_size,
+ struct ib_qp *qp,
+ struct fip_ring *rx_ring,
+ char *name)
+{
+ struct ib_device *dev = port->dev->ca;
+ int i, rc = 0, mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum);
+
+ rx_ring->size = ring_size;
+ rx_ring->ring = kzalloc(rx_ring->size *
+ sizeof(struct fip_ring_entry),
+ GFP_KERNEL);
+ if (!rx_ring->ring) {
+ vnic_warn(name, "failed to alloc fip RX ring, size %d\n", rx_ring->size);
+ rx_ring->size = 0;
+ return -ENOMEM;
+ }
+
+ /* allocate the ring entries */
+ for (i = 0; i < rx_ring->size; i++) {
+ rx_ring->ring[i].mem = kmalloc(mtu_size, GFP_KERNEL);
+ if (unlikely(!rx_ring->ring[i].mem)) {
+ rc = -ENOMEM;
+ goto error;
+ }
+
+ rx_ring->ring[i].entry_posted = 0;
+ rx_ring->ring[i].length = mtu_size;
+ rx_ring->ring[i].bus_addr = ib_dma_map_single(dev,
+ rx_ring->ring[i].mem,
+ mtu_size, DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(dev, rx_ring->ring[i].bus_addr))) {
+ rc = -ENODEV;
+ goto dma_error;
+ }
+
+ if (fip_post_receive(port, qp, FIP_UD_BUF_SIZE(port->max_mtu_enum),
+ i, rx_ring->ring + i, name)) {
+ rc = -EIO;
+ goto post_recv_failed;
+ }
+ }
+
+ rx_ring->head = 0;
+ rx_ring->tail = 0;
+ spin_lock_init(&rx_ring->head_tail_lock);
+ spin_lock_init(&rx_ring->ring_lock);
+ return 0;
+
+post_recv_failed:
+ ib_dma_unmap_single(dev, rx_ring->ring[i].bus_addr,
+ mtu_size, DMA_FROM_DEVICE);
+dma_error:
+ kfree(rx_ring->ring[i].mem);
+ rx_ring->ring[i].mem = NULL;
+error:
+ /* previous entries need to be freed after flushing the QP */
+ return rc;
+}
+
+/*
+ * This function allocates the tx buffers and initializes the head and
+ * tail indexes.
+ */
+int fip_init_tx(int size, struct fip_ring *tx_ring, char *name)
+{
+ tx_ring->size = size;
+ tx_ring->ring = kzalloc(tx_ring->size *
+ sizeof(struct fip_ring_entry),
+ GFP_KERNEL);
+
+ if (!tx_ring->ring) {
+ vnic_warn(name, "failed to alloc fip TX ring, size %d\n",
+ tx_ring->size);
+ tx_ring->size = 0;
+ return -ENOMEM;
+ }
+
+ tx_ring->head = 0;
+ tx_ring->tail = 0;
+ spin_lock_init(&tx_ring->head_tail_lock);
+ spin_lock_init(&tx_ring->ring_lock);
+ return 0;
+}
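+
+/*
+ * Illustrative bring-up/teardown order sketch (not compiled into the
+ * driver): TX ring first (no QP needed), then the RX ring, which also posts
+ * the receive buffers to the QP; on error or teardown, flush the QP and
+ * drain completions before freeing the rings. The function name is
+ * hypothetical.
+ */
+#if 0
+static int example_rings_setup(struct vnic_port *port, struct ib_cq *cq,
+ struct ib_qp *qp, struct fip_ring *rx_ring,
+ struct fip_ring *tx_ring, int size, char *name)
+{
+ int rc = fip_init_tx(size, tx_ring, name);
+ if (rc)
+ return rc;
+
+ rc = fip_init_rx(port, size, qp, rx_ring, name);
+ if (rc) {
+ fip_flush_rings(port, cq, qp, rx_ring, tx_ring, name);
+ fip_free_rings(port, rx_ring, tx_ring, name);
+ return rc;
+ }
+ return 0;
+}
+#endif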
+
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+#include "vnic_fip_pkt.h"
+
+#ifndef work_pending /* back-port */
+#define work_pending(_work) test_bit(0, &(_work)->pending)
+#endif
+
+enum {
+ VNIC_LOGIN_REG_NETDEV_PENDING,
+ VNIC_LOGIN_REG_NETDEV_DONE,
+ VNIC_LOGIN_DESTROY_PENDING,
+ VNIC_LOGIN_DESTROY_DONE,
+ VNIC_LOGIN_DESTROY_FULL
+};
+
+static int fip_vnic_rings_create(struct vnic_port *port,
+ struct fip_vnic_data *vnic);
+static void fip_vnic_rings_destroy(struct fip_vnic_data *vnic);
+static void fip_vnic_recv(struct fip_vnic_data *vnic);
+
+#ifdef _BP_HR_TIMER
+int fip_vnic_keepalive(struct hrtimer * timer);
+#else
+enum hrtimer_restart fip_vnic_keepalive(struct hrtimer * timer);
+#endif
+int fip_vnic_keepalive_send(struct fip_vnic_data *vnic, int source);
+
+
+#define QUEUE_VNIC_DWORK(vnic, task, time) \
+do { \
+ unsigned long flags; \
+ spin_lock_irqsave(&vnic->lock, flags); \
+ if (likely(vnic->flush == FIP_NO_FLUSH)) \
+ queue_delayed_work(fip_wq, task, time); \
+ spin_unlock_irqrestore(&vnic->lock, flags); \
+} while(0)
+
+#define REQUEUE_VNIC_DWORK(vnic, task, time) \
+do { \
+ cancel_delayed_work(task); \
+ QUEUE_VNIC_DWORK(vnic, task, time); \
+} while(0)
+
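+/*
+ * Illustrative usage sketch (not compiled into the driver): a periodic vnic
+ * delayed-work handler that re-arms itself via REQUEUE_VNIC_DWORK(). The
+ * macro silently drops the request once vnic->flush is set, so re-arming
+ * naturally stops during teardown. The handler name and the 1-second period
+ * are assumptions for illustration only.
+ */
+#if 0
+static void example_vnic_periodic(struct work_struct *work)
+{
+ struct fip_vnic_data *vnic =
+ container_of(work, struct fip_vnic_data, vnic_task.work);
+
+ /* ... periodic work (e.g. keep-alive bookkeeping) ... */
+
+ REQUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, 1 * HZ);
+}
+#endif
+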
+
+/*
+ * Look for a vnic in the GW vnic list. The search key is either the unique
+ * vnic_id, or the mac+vlan pair. A match on either key will result in the
+ * return of the vnic. Both keys are necessary because the host admined
+ * delete flow might not have access to the vnic_id. The search disregards
+ * vnics that are undergoing full flush (they will be removed soon).
+*/
+struct fip_vnic_data *fip_vnic_find_in_list(struct fip_gw_data *gw, u16 vnic_id,
+ u8 *mac, u16 vlan, u8 vlan_used)
+{
+ struct fip_vnic_data *vnic;
+ int use_mac = mac ? 1 : 0;
+ int vlan_match;
+
+ ASSERT(gw);
+
+ if (list_empty(&gw->vnic_list))
+ return NULL;
+
+ /* do not use MAC 0:..:0 for vnic matches */
+ if (use_mac)
+ use_mac = !IS_ZERO_MAC(mac);
+
+ list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+ if (vnic->flush == FIP_FULL_FLUSH)
+ continue;
+
+ if (vnic->vnic_id == vnic_id)
+ return vnic;
+
+ if (vlan_used != vnic->login_data.vp)
+ continue;
+
+ vlan_match = !vlan_used ||
+ (vlan_used && (vlan == vnic->login_data.vlan));
+
+ if ((use_mac && !memcmp(vnic->login_data.mac, mac, ETH_ALEN)) &&
+ vlan_match)
+ return vnic;
+ }
+ return NULL;
+}
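+
+/*
+ * Illustrative lookup sketch (not compiled into the driver): looking up a
+ * vnic by its unique vnic_id only. With mac == NULL the mac+vlan key is
+ * ignored, so only the vnic_id comparison can match. The wrapper name is
+ * hypothetical.
+ */
+#if 0
+static struct fip_vnic_data *example_find_by_id(struct fip_gw_data *gw,
+ u16 vnic_id)
+{
+ return fip_vnic_find_in_list(gw, vnic_id, NULL, 0, 0);
+}
+#endif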
+
+/*
+ * This function handles completions of both TX and RX
+ * packets of vnics. RX packets are unmapped, lightly parsed, moved to a
+ * list and passed to thread processing. TX packets are unmapped and freed.
+ * Note: this function is called from interrupt context
+ */
+static void fip_vnic_comp(struct ib_cq *cq, void *vnic_ptr)
+{
+ struct fip_vnic_data *vnic = vnic_ptr;
+
+ /* handle completions. On RX packets this will call vnic_recv
+ * from thread context to continue processing */
+ if (fip_comp(vnic->port, vnic->cq, &vnic->rx_ring,
+ &vnic->tx_ring, vnic->name))
+ fip_vnic_recv(vnic);
+
+ fip_vnic_keepalive_send(vnic, 0);
+}
+
+/*
+ * read the state of the gw eport. This can be done from any context and therefore
+ * requires protection.
+*/
+int fip_vnic_get_eport_state(struct fip_vnic_data *vnic)
+{
+ int i;
+
+ if (no_bxm)
+ return 1;
+
+ if (vnic->gw->info.gw_type == GW_TYPE_LAG) {
+ for (i = 0; i < MAX_LAG_MEMBERS; i++) {
+ if (!(vnic->lm.used_bitmask & 1 << i))
+ continue;
+
+ if (vnic->lm.memb[i].eport_state)
+ return 1;
+ }
+ return 0;
+ } else {
+ return atomic_read(&vnic->eport_state);
+ }
+}
+
+/*
+ * get GW info funcs.
+*/
+int fip_vnic_get_bx_name(struct fip_vnic_data *vnic, char *buff)
+{
+ struct fip_gw_data *gw = vnic->gw;
+ struct fip_gw_volatile_info tmp_info;
+ int rc;
+
+ if (!gw)
+ return -EINVAL;
+
+ mutex_lock(&gw->mlock);
+ memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info));
+ mutex_unlock(&gw->mlock);
+
+ rc = sprintf(buff, "%s", tmp_info.system_name);
+
+ return rc < 0 ? rc : 0;
+}
+
+int fip_vnic_get_bx_guid(struct fip_vnic_data *vnic, char *buff)
+{
+ struct fip_gw_data *gw = vnic->gw;
+ struct fip_gw_volatile_info tmp_info;
+ void *rc;
+
+ memset(buff, 0, GUID_LEN);
+
+ if (!gw)
+ return -EINVAL;
+
+ mutex_lock(&gw->mlock);
+ memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info));
+ mutex_unlock(&gw->mlock);
+
+ rc = memcpy(buff, tmp_info.system_guid, GUID_LEN);
+
+ return rc ? 0 : -EINVAL;
+}
+
+int fip_vnic_get_all_vlan_mode(struct fip_vnic_data *vnic, char *buff)
+{
+ struct fip_gw_data *gw = vnic->gw;
+ int rc;
+
+ if (!gw)
+ return -EINVAL;
+
+ rc = sprintf(buff, "%s", gw->info.all_vlan_gw ? "yes" : "no");
+
+ return rc < 0 ? rc : 0;
+}
+
+int fip_vnic_get_eport_name(struct fip_vnic_data *vnic, char *buff)
+{
+
+ struct fip_gw_data *gw = vnic->gw;
+ struct fip_gw_volatile_info tmp_info;
+ int rc;
+
+ if (!gw)
+ return -EINVAL;
+
+ mutex_lock(&gw->mlock);
+ memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info));
+ mutex_unlock(&gw->mlock);
+
+ rc = sprintf(buff, "%s", tmp_info.gw_port_name);
+
+ return rc < 0 ? rc : 0;
+}
+
+u8 fip_vnic_get_bx_sl(struct fip_vnic_data *vnic)
+{
+ return vnic->gw->info.gw_sl;
+}
+
+/*
+ * get GW info funcs.
+*/
+int fip_vnic_get_gw_type(struct fip_vnic_data *vnic)
+{
+ struct fip_gw_data *gw = vnic->gw;
+ int lag = 0;
+
+ if (!gw)
+ return -EINVAL;
+
+ lag = gw->info.gw_type == GW_TYPE_LAG;
+
+ return lag;
+}
+
+/*
+ * get GW info funcs.
+*/
+int fip_vnic_get_lag_eports(struct fip_vnic_data *vnic, char *buf)
+{
+ struct fip_gw_data *gw = vnic->gw;
+ int i;
+ struct lag_member *member;
+ char *p = buf;
+
+ if (!gw)
+ return -EINVAL;
+
+ if (gw->info.gw_type != GW_TYPE_LAG)
+ return -EINVAL;
+
+ p += _sprintf(p, buf, "LAG_MEMBER_INFORMATION:\n");
+ for (i=0; i<MAX_LAG_MEMBERS; i++) {
+ if (!(vnic->lm.used_bitmask & 1 << i))
+ continue;
+
+ member = &vnic->lm.memb[i];
+ p += _sprintf(p, buf, " %.2d ID=%.3X LID=%4X QPN=%8X STATE=%s\n",
+ i, member->gw_port_id, member->lid, member->qpn,
+ member->eport_state ? "UP" : "DOWN");
+ }
+
+ return p - buf;
+}
+
+/*
+ * process an incoming login ack packet. The packet was already parsed and
+ * its data was placed in *data. The function creates RX and TX rings for the
+ * vnic and starts the multicast join procedure.
+ * This function should not be called for packets other than login ack packets.
+ */
+void fip_vnic_login_ack_recv(struct fip_vnic_data *vnic,
+ struct fip_login_data *data)
+{
+ /* we allow login acks only in the wait-for-ack state;
+ * in other states we ignore them */
+ if (vnic->state != FIP_VNIC_WAIT_4_ACK) {
+ vnic_dbg_fip_v(vnic->name,
+ "vnic_login_ack_recv in state other"
+ " then FIP_VNIC_WAIT_4_ACK state %d\n",
+ vnic->state);
+ return;
+ }
+
+ /* For LAG vnics, process login ack member data */
+ if (vnic->gw->info.gw_type == GW_TYPE_LAG)
+ handle_member_update(vnic, &data->lagm);
+
+ memcpy(&vnic->login_data, data, sizeof(vnic->login_data));
+
+ vnic->state = FIP_VNIC_RINGS_INIT;
+
+ /* calls fip_vnic_fsm() */
+ cancel_delayed_work(&vnic->vnic_task);
+ fip_vnic_fsm(&vnic->vnic_task.work);
+ // REQUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, 0);
+ return;
+}
+
+/*
+ * This is a helper function we use in order to move the login create
+ * to another context so we don't block the fip thread for too long.
+ * The call stack triggered by this function calls register_netdev, which
+ * might block for some time when netdevs are removed in parallel. That
+ * would stall the fip_wq and cause keep-alives (KA) not to be sent.
+*/
+void fip_vnic_login_create(struct work_struct *work)
+{
+ struct fip_vnic_data *vnic =
+ container_of(work, struct fip_vnic_data, vnic_login_create_task);
+ char *name = NULL;
+ int rc;
+
+ if (vnic->hadmined)
+ name = vnic->interface_name;
+
+ rc = vnic_login_register_netdev(vnic, vnic->mac_cache, name);
+
+ spin_lock_irq(&vnic->lock);
+ clear_bit(VNIC_LOGIN_REG_NETDEV_PENDING, &vnic->login_status);
+ if (!rc)
+ set_bit(VNIC_LOGIN_REG_NETDEV_DONE, &vnic->login_status);
+ spin_unlock_irq(&vnic->lock);
+}
+
+/*
+ * Test whether the create request posted earlier has terminated.
+ * Returns 0 if it completed successfully, -EAGAIN if it is still
+ * pending, and -EINVAL if it failed. If retry is set, a new create
+ * attempt is queued and the function returns -EAGAIN.
+*/
+static int fip_vnic_test_login(struct fip_vnic_data *vnic, int retry)
+{
+ int ret = 0;
+
+ spin_lock_irq(&vnic->lock);
+
+ if (!test_bit(VNIC_LOGIN_REG_NETDEV_DONE, &vnic->login_status)) {
+ /* queue retry login create request */
+ if (retry) {
+ if (!test_and_set_bit(VNIC_LOGIN_REG_NETDEV_PENDING,
+ &vnic->login_status)) {
+ memcpy(vnic->mac_cache, vnic->login_data.mac, ETH_ALEN);
+ vnic->vlan_used = vnic->login_data.vp;
+ vnic->vlan = vnic->login_data.vlan;
+ vnic->all_vlan_gw = vnic->login_data.all_vlan_gw;
+
+ /* calls fip_vnic_login_create() */
+ if (vnic->flush == FIP_NO_FLUSH)
+ queue_work(login_wq, &vnic->vnic_login_create_task);
+ }
+ ret = -EAGAIN;
+ } else {
+ if (test_bit(VNIC_LOGIN_REG_NETDEV_PENDING,
+ &vnic->login_status))
+ ret = -EAGAIN;
+ else
+ ret = -EINVAL;
+ }
+ }
+ spin_unlock_irq(&vnic->lock);
+
+ return ret;
+}
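+
+/*
+ * Illustrative polling sketch (not compiled into the driver): how a caller
+ * in the vnic FSM might interpret fip_vnic_test_login() return codes -
+ * -EAGAIN means the deferred register_netdev work is still pending (or a
+ * retry was just queued), 0 means the netdev is registered, -EINVAL means
+ * the create failed. The wrapper name is hypothetical.
+ */
+#if 0
+static int example_login_ready(struct fip_vnic_data *vnic)
+{
+ int rc = fip_vnic_test_login(vnic, 1 /* retry */);
+
+ if (rc == -EAGAIN)
+ return 0; /* not ready yet; the FSM will poll again later */
+
+ return rc ? rc : 1; /* 1: ready, negative: failed */
+}
+#endif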
+
+
+/*
+ * This function should be called when the building of a vhub context
+ * table is done and the vnic state should transition to CONNECTED.
+ */
+int fip_vnic_tbl_done(struct fip_vnic_data *vnic)
+{
+ vnic->vhub_table.state = VHUB_TBL_UP2DATE;
+ vnic->vhub_table.tusn = vnic->vhub_table.main_list.tusn;
+
+ if (vnic->state <= FIP_VNIC_VHUB_DONE)
+ vnic->state = FIP_VNIC_VHUB_DONE;
+ else
+ vnic->state = FIP_VNIC_VHUB_WRITE;
+
+ cancel_delayed_work(&vnic->vnic_task);
+ fip_vnic_fsm(&vnic->vnic_task.work);
+ return 0;
+}
+
+/*
+ * This function runs in interrupt context.
+ * It does sanity checking of the packet, moves it to a list and passes
+ * handling to a thread.
+ */
+static void fip_vnic_recv(struct fip_vnic_data *vnic)
+{
+ struct fip_ring *rx_ring = &vnic->rx_ring;
+ int ret, length;
+ u32 vhub_id;
+ void *mem;
+ int queue_packet = 0;
+ int one_or_more_queued = 0;
+ int index;
+ int err;
+
+ while (rx_ring->head != rx_ring->tail) {
+ struct fip_content *fc;
+
+ queue_packet = 0;
+ index = rx_ring->tail & (vnic->rx_ring.size - 1);
+
+ if (rx_ring->ring[index].entry_posted == 0)
+ goto repost;
+
+ mem = rx_ring->ring[index].mem;
+ length = rx_ring->ring[index].length;
+
+
+ fc = kzalloc(sizeof *fc, GFP_ATOMIC);
+ if (!fc) {
+ vnic_warn(vnic->name, "kzalloc failed\n");
+ goto repost;
+ }
+
+ err = fip_packet_parse(vnic->port, mem + IB_GRH_BYTES, length - IB_GRH_BYTES, fc);
+ if (err) {
+ vnic_warn(vnic->name, "packet parse failed\n");
+ kfree(fc);
+ goto repost;
+ }
+
+ switch (fc->fh->subcode) {
+ case FIP_GW_UPDATE_SUB_OPCODE:
+ if (fc->fvu) {
+ vhub_id = be32_to_cpu(fc->fvu->state_vhub_id) & 0xffffff;
+ if (vnic->login_data.vhub_id == vhub_id)
+ queue_packet = 1;
+ }
+
+ break;
+ case FIP_GW_TABLE_SUB_OPCODE:
+ if (vnic->state >= FIP_VNIC_VHUB_INIT &&
+ vnic->vhub_table.state == VHUB_TBL_INIT) {
+ /* handle vhub context table packets */
+ if (fc->fvt) {
+ vhub_id = be32_to_cpu(fc->fvt->vp_vhub_id) & 0xffffff;
+ if (vnic->login_data.vhub_id == vhub_id)
+ queue_packet = 1;
+ }
+ }
+ break;
+ default:
+ vnic_dbg_fip_v(vnic->name,
+ "received unexpected format packet\n");
+ break;
+ }
+
+ if (queue_packet && (likely(vnic->flush == FIP_NO_FLUSH))) {
+ struct fip_rcv_pkt *rcv;
+ struct fip_ring_entry me;
+
+ /* record packet time for heart beat */
+ vnic->keep_alive_jiffs = jiffies;
+ length -= IB_GRH_BYTES;
+ rcv = kzalloc(sizeof *rcv, GFP_ATOMIC);
+ if (!rcv) {
+ vnic_warn(vnic->name, "failed kmalloc\n");
+ kfree(fc);
+ goto repost;
+ }
+
+ /* replace it with new entry, and queue old one */
+ err = alloc_map_fip_buffer(vnic->port->dev->ca, &me,
+ FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum),
+ GFP_ATOMIC);
+ if (err) {
+ vnic_warn(vnic->name, "alloc_map_fip_buffer failed\n");
+ kfree(fc);
+ kfree(rcv);
+ goto repost;
+ }
+
+ /* unmap old entry */
+ ib_dma_unmap_single(vnic->port->dev->ca,
+ rx_ring->ring[index].bus_addr,
+ FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum),
+ DMA_FROM_DEVICE);
+
+ rx_ring->ring[index] = me;
+ rcv->fc = fc;
+ rcv->length = length;
+ rcv->mem = mem;
+ spin_lock(&vnic->vnic_rcv_list.lock);
+ list_add_tail(&rcv->list, &vnic->vnic_rcv_list.list);
+ spin_unlock(&vnic->vnic_rcv_list.lock);
+ one_or_more_queued++;
+ } else
+ kfree(fc);
+repost:
+ ret = fip_post_receive(vnic->port, vnic->qp,
+ FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum),
+ index, rx_ring->ring + index, vnic->name);
+ if (ret)
+ vnic_warn(vnic->name, "fip_post_receive ret %d\n", ret);
+
+ rx_ring->tail++;
+ }
+
+ if (one_or_more_queued && (likely(vnic->flush == FIP_NO_FLUSH))) {
+ /* calls fip_vnic_recv_bh() */
+ queue_work(fip_wq, &vnic->vnic_pkt_rcv_task_bh);
+ }
+
+ return;
+}
+
+void fip_vnic_recv_list_flush(struct fip_vnic_data *vnic)
+{
+ struct list_head vnic_recv_local;
+ struct fip_rcv_pkt *rcv, *rcv1;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&vnic_recv_local);
+
+ spin_lock_irqsave(&vnic->vnic_rcv_list.lock, flags);
+ list_replace_init(&vnic->vnic_rcv_list.list, &vnic_recv_local);
+ spin_unlock_irqrestore(&vnic->vnic_rcv_list.lock, flags);
+
+ list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) {
+ list_del(&rcv->list);
+ kfree(rcv);
+ }
+ return;
+}
+
+void lag_ctx_clear(struct fip_vnic_data *vnic)
+{
+ memset(&vnic->lm, 0, sizeof (vnic->lm));
+}
+
+/*
+ * Handle the GW eport member info for a LAG GW. The function compares the
+ * member information to previous membership information that is stored in the
+ * vnic. The data path info is updated only after the login ack info was
+ * updated to prevent race conditions.
+ * The vnic contains a local cache of the member info. The cache is updated
+ * in all cases except when the write to the data path failed; in that case
+ * we do not update the cache and rely on periodic update packets for the
+ * retry.
+ * There are 4 possible flows per member entry:
+ * 1. the entry is cached in the vnic but not in the packet - remove from vnic
+ * 2. the entry is not cached in the vnic but is in the packet - add to vnic
+ * 3. the entry is in both but with different params - modify the vnic entry
+ * 4. the entry is in both with similar params - do nothing
+*/
+int handle_member_update(struct fip_vnic_data *vnic, struct lag_members *lm)
+{
+ int i, j;
+ char packet_used[MAX_LAG_MEMBERS];
+ char vnic_used[MAX_LAG_MEMBERS];
+ struct lag_member *vnic_mem, *pkt_mem;
+ int last_bit = 0;
+ #define EMPTY_ENTRY (char)0xff
+ /* we only update the data path with new info after a certain stage */
+ int write_through = !!(vnic->state >= FIP_VNIC_VHUB_WRITE);
+ int skip;
+ struct lag_properties lag_prop;
+ struct vnic_login *login = vnic->login;
+
+ memset(packet_used, EMPTY_ENTRY, sizeof(packet_used));
+ memset(vnic_used, EMPTY_ENTRY, sizeof(vnic_used));
+
+ /* if LAG is not enabled, or it's a child vNic, abort */
+ if (!vnic->gw->info.ext_lag.valid || vnic->parent_used)
+ return -EINVAL;
+
+ mutex_lock(&vnic->gw->mlock);
+ lag_prop.ca = vnic->gw->info.ext_lag.ca;
+ lag_prop.ca_thresh = vnic->gw->info.ext_lag.ca_thresh;
+ lag_prop.hash_mask = vnic->gw->info.ext_lag.hash;
+ lag_prop.weights_policy = vnic->gw->info.ext_lag.weights_policy;
+ mutex_unlock(&vnic->gw->mlock);
+ if (write_through)
+ vnic_member_prop(login, &lag_prop);
+
+ /* go over all known members, for each one search for a match in the
+ * packet member struct */
+ for (i=0; i<MAX_LAG_MEMBERS; i++) {
+ if (!(vnic->lm.used_bitmask & 1 << i))
+ continue;
+
+ vnic_mem = &vnic->lm.memb[i];
+ for (j=0; j<lm->num; j++) {
+
+ pkt_mem = &lm->memb[j];
+ /* find match for member in vnic data structure */
+ if (packet_used[j] == EMPTY_ENTRY &&
+ !memcmp(vnic_mem->guid, pkt_mem->guid, GUID_LEN) &&
+ vnic_mem->gw_port_id == pkt_mem->gw_port_id) {
+ /* found a match, check for change in parameters */
+ if (vnic->login) {
+ /* check for change in member parameters */
+ if (vnic_mem->lid != pkt_mem->lid ||
+ vnic_mem->qpn != pkt_mem->qpn ||
+ vnic_mem->eport_state != pkt_mem->eport_state ||
+ vnic_mem->sl != pkt_mem->sl ||
+ vnic_mem->link_utilization != pkt_mem->link_utilization) {
+
+ vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d modifying lid %d qpn %d state %d\n",
+ i, lm->memb[j].lid, lm->memb[j].qpn, lm->memb[j].eport_state);
+ /* update data path if required and store update info localy */
+ if (!write_through ||
+ (write_through && !vnic_member_modify(login, i, &lm->memb[j])))
+ *vnic_mem = lm->memb[j];
+ }
+ }
+ packet_used[j] = i;
+ vnic_used[i] = j;
+ break;
+ }
+ }
+ /* if the member does not appear in this packet, remove it from the vnic */
+ if (vnic_used[i] == EMPTY_ENTRY) {
+ if (!write_through ||
+ (write_through && !vnic_member_remove(login, i))) {
+ vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d removing lid %d qpn %d state %d\n",
+ i, lm->memb[j].lid, lm->memb[j].qpn, lm->memb[j].eport_state);
+ vnic->lm.used_bitmask &= ~(1 << i);
+ }
+ }
+ }
+
+ /* go over packet and look for any new members */
+ for (j=0; j<lm->num; j++) {
+ /* if entry was matched up already */
+ if (packet_used[j]!= EMPTY_ENTRY)
+ continue;
+
+ skip = 0;
+ /* skip the new member if its GW port ID is already used by a cached member */
+ for (i=0; i<MAX_LAG_MEMBERS; i++) {
+ if (!(vnic->lm.used_bitmask & 1 << i))
+ continue;
+ if (vnic->lm.memb[i].gw_port_id == lm->memb[j].gw_port_id)
+ skip = 1;
+ }
+ if (skip)
+ continue;
+
+ /* look for an empty member id and add the member to it */
+ for (i=last_bit; i<MAX_LAG_MEMBERS; i++) {
+ if (vnic->lm.used_bitmask & 1 << i)
+ continue;
+
+ vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d adding lid %d qpn %d state %d\n",
+ i, lm->memb[j].lid, lm->memb[j].qpn, lm->memb[j].eport_state);
+ if (!write_through ||
+ (write_through && !vnic_member_add(login, i, &lm->memb[j]))) {
+ vnic->lm.used_bitmask |= (1 << i);
+ vnic->lm.memb[i] = lm->memb[j];
+ }
+
+ break;
+ }
+ last_bit = i;
+ }
+
+ return 0;
+}
+
+/* Write the initial member table to the datapath. If a write fails we
+ * delete that entry from the local cache and rely on periodic update
+ * packets for the retry. */
+int fip_vnic_write_members(struct fip_vnic_data *vnic)
+{
+ int i;
+ struct lag_properties lag_prop;
+ struct vnic_login *login = vnic->login;
+
+ /* if LAG is not enabled, or it's a child vNic, abort */
+ if (!vnic->gw->info.ext_lag.valid || vnic->parent_used)
+ return -EINVAL;
+
+ lag_prop.ca = vnic->gw->info.ext_lag.ca;
+ lag_prop.ca_thresh = vnic->gw->info.ext_lag.ca_thresh;
+ lag_prop.hash_mask = vnic->gw->info.ext_lag.hash;
+ lag_prop.weights_policy = vnic->gw->info.ext_lag.weights_policy;
+ vnic_member_prop(login, &lag_prop);
+
+ /* go over all members; write each used one to the data path */
+ for (i=0; i<MAX_LAG_MEMBERS; i++) {
+ if (!(vnic->lm.used_bitmask & 1 << i))
+ continue;
+
+ /* if the update failed, delete the local entry; we will use
+ * the update packet flow for retries.
+ */
+ if (vnic_member_add(login, i, &vnic->lm.memb[i]))
+ vnic->lm.used_bitmask &= ~(1 << i);
+ }
+
+ return 0;
+}
+
+/* runs in the context of vnic->vnic_pkt_rcv_task_bh */
+void fip_vnic_recv_bh(struct work_struct *work)
+{
+ struct fip_vnic_data *vnic =
+ container_of(work, struct fip_vnic_data, vnic_pkt_rcv_task_bh);
+ int length;
+ u32 vhub_id, tusn;
+ int eport_state;
+ struct vnic_table_entry *vhub_entries;
+ struct list_head vnic_recv_local;
+ struct fip_rcv_pkt *rcv, *rcv1;
+ unsigned long flags;
+ int i, __eport_state;
+
+ INIT_LIST_HEAD(&vnic_recv_local);
+
+ spin_lock_irqsave(&vnic->vnic_rcv_list.lock, flags);
+ list_replace_init(&vnic->vnic_rcv_list.list, &vnic_recv_local);
+ spin_unlock_irqrestore(&vnic->vnic_rcv_list.lock, flags);
+
+ /* We are not interested in packets prior to FIP_VNIC_VHUB_INIT */
+ if (vnic->state < FIP_VNIC_VHUB_INIT ||
+ vnic->flush != FIP_NO_FLUSH) {
+ list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) {
+ kfree(rcv->fc);
+ kfree(rcv->mem);
+ list_del(&rcv->list);
+ kfree(rcv);
+ }
+ } else {
+ int err;
+
+ list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) {
+ length = rcv->length;
+
+ switch (rcv->fc->fh->subcode) {
+ case FIP_GW_UPDATE_SUB_OPCODE:
+ /* validate vhub id before processing packet */
+ vhub_id = be32_to_cpu(rcv->fc->fvu->state_vhub_id) & 0xffffff;
+ if(unlikely(vnic->login_data.vhub_id != vhub_id))
+ break;
+
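+ /* the eport state is carried in bits 27-28 of state_vhub_id;
+ * zero means the eport is down */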
+ eport_state = be32_to_cpu(rcv->fc->fvu->state_vhub_id) >> 27 & 3;
+ __eport_state = (eport_state == 0) ? EPORT_STATE_DOWN : EPORT_STATE_UP;
+ atomic_set(&vnic->eport_state, __eport_state);
+
+ /* handle vhub context update packets */
+ if (rcv->fc->fed.num) {
+ err = extract_vhub_extended(rcv->fc->fed.fed[0], vnic);
+ if (err)
+ vnic_warn(vnic->name, "extract_vhub_extended() failed\n");
+ }
+ if (rcv->fc->cte.num) {
+ vhub_entries = kmalloc(rcv->fc->cte.num * sizeof *vhub_entries, GFP_KERNEL);
+ if (!vhub_entries) {
+ vnic_warn(vnic->port->name, "failed to allocate memory for update CTEs\n");
+ goto free_entry;
+ }
+
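+ /* the packet's tusn refers to its last CTE; derive each
+ * entry's tusn by counting back from it */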
+ tusn = be32_to_cpu(rcv->fc->fvu->tusn);
+ for (i = 0; i < rcv->fc->cte.num; ++i) {
+ vhub_entries[i].lid = be16_to_cpu(rcv->fc->cte.cte[i].lid);
+ vhub_entries[i].qpn = be32_to_cpu(rcv->fc->cte.cte[i].qpn) & 0xffffff;
+ vhub_entries[i].sl = rcv->fc->cte.cte[i].sl & 0xf;
+ vhub_entries[i].rss = rcv->fc->cte.cte[i].v_rss_type & FIP_CONTEXT_RSS_FLAG ? 1 : 0;
+ vhub_entries[i].valid = rcv->fc->cte.cte[i].v_rss_type & FIP_CONTEXT_V_FLAG ? 1 : 0;
+ memcpy(vhub_entries[i].mac, rcv->fc->cte.cte[i].mac, sizeof(vhub_entries[i].mac));
+ vhub_handle_update(vnic, vhub_id, tusn - rcv->fc->cte.num + i + 1, &vhub_entries[i]);
+ }
+ kfree(vhub_entries);
+ }
+
+ /* update the vnic carrier only when the vnic is ready:
+ * not closing (flush is zero) and its netdev already registered
+ */
+ if (!vnic->flush && vnic->login &&
+ test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) {
+ vnic_carrier_update(vnic->login);
+ }
+ break;
+ case FIP_GW_TABLE_SUB_OPCODE:
+ /* handle vhub context table packets */
+ tusn = be32_to_cpu(rcv->fc->fvt->tusn);
+ vhub_id = be32_to_cpu(rcv->fc->fvt->vp_vhub_id) & 0xffffff;
+ vhub_handle_tbl(vnic, rcv->fc, vhub_id, tusn);
+ break;
+
+ default:
+ break;
+ }
+free_entry:
+ list_del(&rcv->list);
+ kfree(rcv->fc);
+ kfree(rcv->mem);
+ kfree(rcv);
+ }
+ }
+ return;
+}
+
+/*
+ * Mark the vnic for deletion and trigger a delayed call to the cleanup
+ * function. In the past the vnic was moved to another list but this
+ * might cause vnic duplication if new vnics are added to the GW. Even
+ * if the vnic is being flushed we need to know it is there.
+ *
+ * Note: This deletion method ensures that all pending vnic work requests
+ * are cleared without depending on the calling context.
+ */
+void fip_vnic_close(struct fip_vnic_data *vnic, enum fip_flush flush)
+{
+ int tmp_flush;
+
+ /* net admin -> full flush */
+ tmp_flush = vnic->hadmined ? flush : FIP_FULL_FLUSH;
+
+ /* child vNic -> full flush */
+ tmp_flush = (!vnic->parent_used) ? tmp_flush : FIP_FULL_FLUSH;
+
+ /* no need for partial cleanup in host admin idle */
+ if (tmp_flush == FIP_PARTIAL_FLUSH &&
+ vnic->state < FIP_VNIC_HADMIN_IDLE)
+ return;
+
+ /* close already in process, disregard */
+ spin_lock_irq(&vnic->lock);
+ if (vnic->flush >= tmp_flush){
+ spin_unlock_irq(&vnic->lock);
+ return;
+ }
+ if (vnic->flush == FIP_NO_FLUSH && vnic->state > FIP_VNIC_WAIT_4_ACK)
+ fip_update_send(vnic, 0, 1 /* logout */);
+
+ vnic->flush = tmp_flush;
+ cancel_delayed_work(&vnic->vnic_gw_alive_task);
+ cancel_delayed_work(&vnic->vnic_task);
+ spin_unlock_irq(&vnic->lock);
+ /* after this point we should have no work that is not already pending
+ * for execution, and no new work will be added
+ */
+
+ if (vnic->hadmined && tmp_flush == FIP_FULL_FLUSH)
+ vnic_delete_hadmin_dentry(vnic);
+ else if (!vnic->hadmined)
+ /* vnic_count is relevant for net admin only */
+ vnic->gw->vnic_count--;
+
+ vnic_dbg_mark();
+
+ /* calls fip_purge_vnics() */
+ queue_delayed_work(fip_wq, &vnic->gw->vnic_cleanup_task,
+ DELAYED_WORK_CLEANUP_JIFFS);
+}
+
+/*
+ * This is a helper function we use in order to move the login destroy
+ * to another context so we don't block the fip thread for too long.
+*/
+void fip_vnic_login_destroy(struct work_struct *work)
+{
+ struct fip_vnic_data *vnic =
+ container_of(work, struct fip_vnic_data,
+ vnic_login_destroy_task);
+ int flush = vnic->flush;
+
+ vnic_login_destroy_wq_stopped(vnic, flush);
+
+ /* we don't want to use a lock here so we will verify that the
+ * flush level did not change between the request and now */
+ if (flush == FIP_FULL_FLUSH)
+ set_bit(VNIC_LOGIN_DESTROY_FULL, &vnic->login_status);
+
+ set_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status);
+}
+
+/*
+ * Free vnic resources. This includes closing the data vnic (data QPs etc)
+ * and the discovery resources. If the vnic can be completely destroyed (no
+ * pending work) it is removed from the GW and its memory is freed. If not,
+ * the vnic is not freed and the function returns an error; the caller needs
+ * to call this function again to complete the operation.
+ * Note: Do not call this function to remove a vnic, use fip_vnic_close.
+*/
+int fip_vnic_destroy(struct fip_vnic_data *vnic)
+{
+ int pending;
+
+ vnic_dbg_func(vnic->name);
+ vnic_dbg_fip_p0(vnic->name, "fip_vnic_destroy called flow=%d state=%d mac" MAC_6_PRINT_FMT "\n",
+ vnic->flush, vnic->state, MAC_6_PRINT_ARG(vnic->login_data.mac));
+
+ pending = work_pending(&vnic->vnic_pkt_rcv_task_bh) ||
+ delayed_work_pending(&vnic->vnic_gw_alive_task) ||
+ delayed_work_pending(&vnic->vnic_task);
+
+ /* verify no pending packets before we start tearing down the rings */
+ if (pending || fip_vnic_test_login(vnic, 0) == -EAGAIN)
+ goto retry_later;
+
+ if (!test_and_set_bit(VNIC_LOGIN_DESTROY_PENDING,
+ &vnic->login_status)) {
+ vnic_login_destroy_stop_wq(vnic, vnic->flush);
+ /* calls fip_vnic_login_destroy() */
+ queue_work(login_wq, &vnic->vnic_login_destroy_task);
+ }
+
+ if (!test_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status))
+ goto retry_later;
+
+ clear_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status);
+ clear_bit(VNIC_LOGIN_DESTROY_PENDING, &vnic->login_status);
+
+ /* Check whether the destroy request was queued as a partial flush
+ * but has since been upgraded to a full flush; if so we need to
+ * try again */
+ if (vnic->flush == FIP_FULL_FLUSH &&
+ !test_bit(VNIC_LOGIN_DESTROY_FULL, &vnic->login_status))
+ goto retry_later;
+
+ hrtimer_cancel(&vnic->keepalive_timer);
+
+ if (vnic->state >= FIP_VNIC_VHUB_INIT) {
+ lag_ctx_clear(vnic);
+ vhub_ctx_free(vnic);
+ }
+
+ /* disconnect from mcast groups */
+ if (vnic->state >= FIP_VNIC_MCAST_INIT) {
+ vnic_mcast_del_all(&vnic->mcast_tree);
+ fip_vnic_rings_destroy(vnic);
+ }
+
+ if (vnic->state > FIP_VNIC_LOGIN)
+ ib_destroy_ah(vnic->ah);
+
+ if (vnic->flush == FIP_PARTIAL_FLUSH) {
+ if (vnic->hadmined) /* we close Host admin vnics so they won't do any login from fip_vnic_fsm */
+ vnic->state = FIP_VNIC_CLOSED;
+ else
+ vnic->state = FIP_VNIC_HADMIN_IDLE;
+
+ vnic->flush = FIP_NO_FLUSH;
+ vnic->last_send_jiffs = 0;
+
+ vnic_dbg_fip_v(vnic->name, "fip_vnic_remove partial done vnic->retry_count=%d\n", vnic->retry_count);
+ if (!VNIC_MAX_RETRIES || ++vnic->retry_count < VNIC_MAX_RETRIES)
+ QUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, FIP_LOGIN_TIMEOUT * HZ);
+
+ } else {
+ list_del(&vnic->gw_vnics);
+ vnic_dbg_fip_v(vnic->name, "fip_vnic_remove full done\n");
+ kfree(vnic);
+ }
+
+ return 0;
+
+retry_later:
+ return -EBUSY;
+}
+
+int fip_vnic_keepalive_send(struct fip_vnic_data *vnic, int source_timer)
+{
+ int update;
+ unsigned long flags;
+ int ret = 0;
+
+ if (vnic->flush != FIP_NO_FLUSH)
+ return ret;
+
+ if (vnic->last_send_jiffs > 1 && jiffies - vnic->last_send_jiffs > vnic->gw->info.vnic_ka_period * 3 / 2)
+ vnic_dbg_fip_p0(vnic->name, "Delaying in sending KA should be %ld actual time=%ld source=%d\n",
+ vnic->gw->info.vnic_ka_period, jiffies - vnic->last_send_jiffs, source_timer);
+
+ spin_lock_irqsave(&vnic->ka_lock, flags);
+ if (source_timer ||
+ (vnic->last_send_jiffs && jiffies - vnic->last_send_jiffs >
+ vnic->gw->info.vnic_ka_period * 6 / 5)) {
+
+ /* we need to have mcast attached before we ask for a table */
+ if (vnic->state >= FIP_VNIC_VHUB_INIT &&
+ vnic->vhub_table.state == VHUB_TBL_INIT)
+ update = 1;
+ else
+ update = 0;
+
+ /* send vnic keep alive to GW */
+ ret = fip_update_send(vnic, update, 0 /*not logout */);
+ if (!ret)
+ vnic->last_send_jiffs = jiffies;
+ }
+ spin_unlock_irqrestore(&vnic->ka_lock, flags);
+
+ return ret;
+
+}
+
+#ifdef _BP_HR_TIMER
+int fip_vnic_keepalive(struct hrtimer *timer)
+#else
+enum hrtimer_restart fip_vnic_keepalive(struct hrtimer *timer)
+#endif
+{
+ struct fip_vnic_data *vnic =
+ container_of(timer, struct fip_vnic_data, keepalive_timer);
+ unsigned long flags;
+ ktime_t ktime;
+ enum hrtimer_restart ret = HRTIMER_NORESTART;
+ int flush;
+
+ spin_lock_irqsave(&vnic->lock, flags);
+ flush = vnic->flush;
+ spin_unlock_irqrestore(&vnic->lock, flags);
+
+ if (flush != FIP_NO_FLUSH)
+ return ret;
+
+ fip_vnic_keepalive_send(vnic, 1);
+
+ ret = HRTIMER_RESTART;
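+ /* vnic_ka_period is in jiffies; convert it to nanoseconds for the hrtimer */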
+ ktime = ktime_set(0, vnic->gw->info.vnic_ka_period * (1000000000 / HZ));
+ hrtimer_forward(&vnic->keepalive_timer, vnic->keepalive_timer.base->get_time(), ktime);
+
+
+ return ret;
+
+}
+
+void fip_vnic_gw_alive(struct work_struct *work)
+{
+ struct fip_vnic_data *vnic =
+ container_of(work, struct fip_vnic_data,
+ vnic_gw_alive_task.work);
+ long time_to_timeout;
+
+ if (vnic->flush != FIP_NO_FLUSH)
+ return;
+
+ if (!test_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state)) {
+ if (time_after(jiffies, vnic->detached_ka_jiffs + 60*HZ)) {
+ vnic_dbg_fip_p0(vnic->name, "No GW keep alive timeout when mcast un attached "
+ "QPN 0x%06x, LID 0x%04x\n", vnic->qp->qp_num,
+ vnic->port->attr.lid);
+ fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+ return;
+ } else {
+ vnic_dbg_fip_p0(vnic->name, "Got ka poll when bcast not "
+ "attached QPN 0x%06x, LID 0x%04x, ka=%u\n",
+ vnic->qp->qp_num, vnic->port->attr.lid,
+ jiffies_to_msecs(jiffies - vnic->detached_ka_jiffs));
+ time_to_timeout = vnic->gw->info.gw_period;
+ }
+ } else {
+ long jiffs_from_last;
+ jiffs_from_last = (jiffies - vnic->keep_alive_jiffs);
+ time_to_timeout = vnic->gw->info.gw_period - jiffs_from_last;
+ }
+
+ /* TODO: rearm the work timer when an update is received so that an
+ * expiration indicates a true timeout */
+ if (time_to_timeout <= 0) {
+ vnic_dbg_fip_p0(vnic->name, "GW keep alives timed out for "
+ "QPN 0x%06x, LID 0x%04x timeout=%ld\n", vnic->qp->qp_num,
+ vnic->port->attr.lid, time_to_timeout);
+ fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+ } else
+ QUEUE_VNIC_DWORK(vnic, &vnic->vnic_gw_alive_task,
+ time_to_timeout + 1);
+}
+
+struct fip_vnic_data *fip_vnic_alloc(struct vnic_port *port,
+ struct fip_gw_data *gw,
+ int hadmin, u16 vnic_id)
+{
+ struct fip_vnic_data *vnic;
+
+ vnic = kzalloc(sizeof(struct fip_vnic_data), GFP_KERNEL);
+ if (!vnic) {
+ vnic_err(port->name, "failed to alloc vnic\n");
+ return NULL;
+ }
+
+ vnic->state = hadmin ? FIP_VNIC_HADMIN_IDLE : FIP_VNIC_LOGIN;
+ vnic->vnic_id = vnic_id;
+ vnic->gw = gw;
+ vnic->gw_info = gw->info.vol_info;
+ vnic->port = port;
+ vnic->hadmined = hadmin;
+ vnic->flush = FIP_NO_FLUSH;
+
+ sprintf(vnic->name, "vnic-%d", vnic_id); /* will be overwritten */
+
+ spin_lock_init(&vnic->lock);
+ spin_lock_init(&vnic->ka_lock);
+ INIT_DELAYED_WORK(&vnic->vnic_task, fip_vnic_fsm);
+ INIT_DELAYED_WORK(&vnic->vnic_gw_alive_task, fip_vnic_gw_alive);
+ INIT_WORK(&vnic->vnic_login_destroy_task, fip_vnic_login_destroy);
+ INIT_WORK(&vnic->vnic_login_create_task, fip_vnic_login_create);
+
+
+#ifdef _BP_HR_TIMER
+ hrtimer_init(&vnic->keepalive_timer, CLOCK_MONOTONIC, HRTIMER_REL);
+#else
+ hrtimer_init(&vnic->keepalive_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL );
+#endif
+ vnic->keepalive_timer.function = fip_vnic_keepalive;
+
+ vnic_mcast_root_init(&vnic->mcast_tree);
+ atomic_set(&vnic->eport_state,EPORT_STATE_DOWN);
+
+ return vnic;
+}
+
+int fip_vnic_hadmin_init(struct vnic_port *port, struct fip_vnic_data *vnic)
+{
+ int rc;
+
+ vnic_dbg_func(port->name);
+
+ rc = vnic_login_pre_create_1(port, vnic);
+ if (rc) {
+ vnic_warn(port->name, "vnic_login_pre_create_1 failed, rc %d\n", rc);
+ goto pre_create_failed;
+ }
+
+ strncpy(vnic->login_data.vnic_name, vnic->interface_name,
+ sizeof(vnic->interface_name));
+
+ /* queue login create request */
+ fip_vnic_test_login(vnic, 1);
+
+ return 0;
+
+pre_create_failed:
+ return -ENODEV;
+}
+
+void fip_vnic_create_gw_param(struct fip_vnic_send_info *gw_address, u32 gw_qpn,
+ u32 qkey, u16 gw_lid, u8 gw_sl)
+{
+ gw_address->gw_qpn = gw_qpn;
+ gw_address->qkey = qkey;
+ gw_address->gw_lid = gw_lid;
+ gw_address->gw_sl = gw_sl;
+}
+
+void fip_vnic_set_gw_param(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address)
+{
+ memcpy(&vnic->gw_address, gw_address, sizeof(vnic->gw_address));
+}
+
+int fip_hadmin_vnic_refresh(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address)
+{
+ vnic_dbg_fip(vnic->name, "fip_vnic_to_login host admin flow flush=%d"
+ " state=%d\n", vnic->flush, vnic->state);
+ if (likely(vnic->flush == FIP_NO_FLUSH) &&
+ vnic->state <= FIP_VNIC_HADMIN_IDLE &&
+ (!VNIC_MAX_RETRIES || vnic->retry_count < VNIC_MAX_RETRIES)) {
+ fip_vnic_set_gw_param(vnic, gw_address);
+ cancel_delayed_work(&vnic->vnic_task);
+ vnic->state = FIP_VNIC_LOGIN;
+ fip_vnic_fsm(&vnic->vnic_task.work);
+ }
+ return 0;
+}
+
+/*
+ * Call the data vnic precreate 1 + 2 in order to alloc and init the data vnic.
+ * This function updates qp numbers that the data vnic will use. These qp numbers
+ * are needed for the login.
+ * This function does not clean up on failure; it assumes the caller will
+ * call the login destroy.
+*/
+static int fip_vnic_login_init(struct vnic_port *port, struct fip_vnic_data *vnic)
+{
+ int qps_num;
+ int rc;
+
+ struct ib_ah_attr ah_attr = {
+ .dlid = vnic->gw_address.gw_lid,
+ .port_num = port->num,
+ .sl = vnic_gw_ctrl_sl(vnic->gw) & 0xf,
+ };
+
+ vnic_dbg_func(vnic->name);
+
+ /* If the driver wants to enable RSS (vnic_rss == 1) then the
+ * number of QPs is what the GW advertises: 1 << n_rss_qpn
+ */
+ qps_num = (port->rx_rings_num > 1) ? (1 << vnic->gw->info.n_rss_qpn) : 1;
+ qps_num = (qps_num == 0) ? 1 : qps_num;
+
+ /* However, we don't support any qps_num, if the GW asks for more than
+ * VNIC_MAX_NUM_CPUS QPs, then we're not going to enable RSS
+ * -- qps_num == 1 means RSS is disabled, otherwise it's enabled
+ */
+ qps_num = qps_num <= VNIC_MAX_NUM_CPUS ? qps_num : 1;
+
+ /* set in vnic, so it can be reported back to the BXM */
+ vnic->qps_num = qps_num;
+
+ /* in host admin vnic->login should be non NULL */
+ if (!vnic->hadmined) {
+ rc = vnic_login_pre_create_1(port, vnic);
+ if (rc) {
+ vnic_warn(vnic->name,
+ "vnic_login_pre_create_1 failed, "
+ "rc %d\n", rc);
+ goto failed;
+ }
+ }
+
+ /* in host admin vnic->login should be non NULL */
+ rc = vnic_login_pre_create_2(vnic, qps_num,
+ vnic->gw->info.gw_type == GW_TYPE_LAG);
+ if (rc) {
+ vnic_warn(port->name, "vnic_login_pre_create_2 failed\n");
+ goto failed;
+ }
+
+ /* if parent_used, you must already have the base QPN */
+ ASSERT(!vnic->parent_used || vnic->qp_base_num);
+
+ vnic->ah = ib_create_ah(port->pd, &ah_attr);
+ if (IS_ERR(vnic->ah)) {
+ vnic_warn(vnic->name, "fip_vnic_login_init failed to create ah\n");
+ vnic->ah = NULL;
+ goto failed;
+ }
+
+ vhub_ctx_init(vnic);
+
+ return 0;
+
+failed:
+ return -ENODEV;
+}
+
+/*
+ * create a CQ and QP for the new vNic. Create RX and TX rings for this
+ * QP. Move QP to RTS and connect it to the CQ.
+*/
+static int fip_vnic_rings_create(struct vnic_port *port,
+ struct fip_vnic_data *vnic)
+{
+ struct ib_qp_init_attr qp_init_attr;
+ int ret;
+
+ vnic->rx_ring.size = FIP_LOGIN_RX_SIZE;
+ vnic->tx_ring.size = FIP_LOGIN_TX_SIZE;
+
+ INIT_WORK(&vnic->vnic_pkt_rcv_task_bh, fip_vnic_recv_bh);
+ spin_lock_init(&vnic->vnic_rcv_list.lock);
+ INIT_LIST_HEAD(&vnic->vnic_rcv_list.list);
+
+ if (ib_find_pkey(port->dev->ca, port->num, vnic->login_data.pkey,
+ &vnic->login_data.pkey_index)) {
+ vnic_warn(vnic->name,
+ "fip_vnic_rings_create PKey 0x%04x not found."
+ " Check configuration in SM/BX\n", vnic->login_data.pkey);
+ goto out_w_err;
+ }
+
+ vnic->pkey = vnic->login_data.pkey;
+ vnic->pkey_index = vnic->login_data.pkey_index;
+
+ vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create pkey id %d "
+ "for pkey 0x%x\n", (int)vnic->pkey_index,
+ (int)vnic->pkey);
+
+ vnic->cq = ib_create_cq(port->dev->ca, fip_vnic_comp, NULL, vnic,
+ vnic->rx_ring.size + vnic->tx_ring.size, 0);
+ if (IS_ERR(vnic->cq)) {
+ vnic_dbg_fip(vnic->name, "failed to create receive CQ\n");
+ goto out_w_err;
+ }
+
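+ /* a single CQ is shared by the RX and TX rings of the UD QP below */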
+ memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+ qp_init_attr.cap.max_send_wr = vnic->tx_ring.size;
+ qp_init_attr.cap.max_recv_wr = vnic->rx_ring.size;
+ qp_init_attr.cap.max_send_sge = 1;
+ qp_init_attr.cap.max_recv_sge = 1;
+ qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.qp_type = IB_QPT_UD;
+ qp_init_attr.send_cq = vnic->cq;
+ qp_init_attr.recv_cq = vnic->cq;
+
+ vnic->qp = ib_create_qp(port->pd, &qp_init_attr);
+ if (IS_ERR(vnic->qp)) {
+ vnic_dbg_fip(vnic->name, "failed to create QP\n");
+ goto error_free_cq;
+ }
+
+ vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create QPN %d,"
+ " LID %d\n", (int)vnic->qp->qp_num, (int)port->attr.lid);
+
+ /* move QP from reset to RTS */
+ if (fip_init_qp(vnic->port, vnic->qp, vnic->pkey_index, vnic->name)) {
+ vnic_dbg_fip(vnic->name, "fip_init_qp returned with error\n");
+ goto error_free_qp;
+ }
+
+ ret = fip_init_tx(vnic->tx_ring.size, &vnic->tx_ring, vnic->name);
+ if (ret) {
+ vnic_dbg_fip(vnic->name, "fip_init_tx failed ret %d\n", ret);
+ goto error_free_qp;
+ }
+
+ ret = fip_init_rx(port, vnic->rx_ring.size, vnic->qp,
+ &vnic->rx_ring, vnic->name);
+ if (ret) {
+ vnic_dbg_fip(vnic->name, "fip_init_rx returned %d\n", ret);
+ goto error_release_rings;
+ }
+
+ /* enable receiving CQ completions */
+ if (ib_req_notify_cq(vnic->cq, IB_CQ_NEXT_COMP))
+ goto error_release_rings;
+ vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create done OK\n");
+
+ return 0;
+
+error_release_rings:
+ fip_flush_rings(port, vnic->cq, vnic->qp, &vnic->rx_ring,
+ &vnic->tx_ring, vnic->name);
+ fip_free_rings(port, &vnic->rx_ring, &vnic->tx_ring, vnic->name);
+error_free_qp:
+ ib_destroy_qp(vnic->qp);
+error_free_cq:
+ ib_destroy_cq(vnic->cq);
+out_w_err:
+ vnic->qp = NULL;
+ vnic->cq = NULL;
+ vnic->rx_ring.size = 0;
+ vnic->tx_ring.size = 0;
+ return -ENODEV;
+}
+
+static void fip_vnic_rings_destroy(struct fip_vnic_data *vnic)
+{
+ fip_flush_rings(vnic->port, vnic->cq, vnic->qp, &vnic->rx_ring,
+ &vnic->tx_ring, vnic->name);
+ fip_free_rings(vnic->port, &vnic->rx_ring, &vnic->tx_ring, vnic->name);
+ fip_vnic_recv_list_flush(vnic);
+ ib_destroy_qp(vnic->qp);
+ ib_destroy_cq(vnic->cq);
+ vnic->qp = NULL;
+ vnic->cq = NULL;
+}
+
+/*
+ * This function is a callback called upon successful join to a
+ * multicast group. The function checks whether we have joined and attached
+ * to all required mcast groups and if so advances the vnic FSM.
+*/
+void fip_vnic_mcast_cnct_cb(struct vnic_mcast *mcast, void *ctx)
+{
+ struct fip_vnic_data *vnic = mcast->priv_data;
+
+ vnic_dbg_fip(vnic->name, "fip_vnic_mcast_cnct_cb\n");
+ vnic_dbg_parse(vnic->name, "attached mask = 0x%lx, req mask = 0x%lx\n",
+ *mcast->cur_attached, *mcast->req_attach);
+
+ if ((*mcast->cur_attached & *mcast->req_attach) != *mcast->req_attach)
+ return;
+
+ vnic->keep_alive_jiffs = jiffies;
+ set_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state);
+ /* in case of a new mcast connection switch to VHUB_INIT, for a
+ * reconnection stay in the current state */
+ if (vnic->state < FIP_VNIC_VHUB_INIT) {
+ vnic_dbg_fip(vnic->name,
+ "fip_vnic_mcast_cnct_cb done joining mcasts\n");
+ vnic->state = FIP_VNIC_VHUB_INIT;
+ cancel_delayed_work(&vnic->vnic_task);
+ REQUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, 0);
+ }
+}
+
+/*
+ * This function is a callback called upon a mcast detach event.
+ * This event can be triggered due to vnic request or due to an async
+ * event. Currently this code does not participate in the vnic's FSM.
+*/
+void fip_vnic_mcast_deattach_cb(struct vnic_mcast *mcast, void *ctx)
+{
+ struct fip_vnic_data *vnic = mcast->priv_data;
+
+ vnic->detached_ka_jiffs = jiffies;
+ clear_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state);
+
+ vnic_dbg_fip(vnic->name, "fip_vnic_mcast_deattach_cb\n");
+}
+
+/*
+ * Try to connect to the relevant mcast groups. If one of the mcast joins
+ * fails, the function should be called again to complete the join process
+ * (for the mcast groups whose join was not performed).
+ * Note: A successful return of vnic_mcast_join means that the mcast join
+ * started, not that it completed. Completion of the connection process
+ * is asynchronous and uses a supplied callback.
+ */
+int fip_vnic_mcast_cnct(struct fip_vnic_data *vnic)
+{
+ struct vnic_port *port = vnic->port;
+ union vhub_mgid mgid;
+ struct vnic_mcast *mcaste, *mcaste_upd, *mcaste_tbl;
+ struct vnic_mcast *uninitialized_var(mcaste_ka);
+ int rc;
+
+ vnic_dbg_fip(port->name, "fip_vnic_mcast_cnct called\n");
+
+ mcaste_upd = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached);
+ if (IS_ERR(mcaste_upd))
+ return -EINVAL;
+
+ mcaste_tbl = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached);
+ if (IS_ERR(mcaste_tbl)) {
+ rc = -EINVAL;
+ goto free_upd;
+ }
+
+ set_bit(FIP_MCAST_VHUB_UPDATE, &vnic->req_attach);
+ set_bit(FIP_MCAST_TABLE, &vnic->req_attach);
+
+ vnic_dbg_fip(port->name, "gw type is %d\n", vnic->gw->info.gw_type);
+ if (vnic->gw->info.gw_type == GW_TYPE_LAG) {
+ mcaste_ka = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached);
+ if (IS_ERR(mcaste_ka)) {
+ rc = -EINVAL;
+ goto free_tbl;
+ }
+ set_bit(FIP_MCAST_VHUB_KA, &vnic->req_attach);
+ }
+
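+ /* vHub UPDATE mcast group: carries vhub context update packets */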
+ mcaste = mcaste_upd;
+ mcaste->priv_data = vnic;
+ mcaste->attach_bit_nr = FIP_MCAST_VHUB_UPDATE;
+ memset(mcaste->mac, 0, ETH_ALEN);
+ vhub_mgid_create(vnic->login_data.mgid_prefix,
+ mcaste->mac,
+ vnic->login_data.n_mac_mcgid,
+ vnic->login_data.vhub_id, VHUB_MGID_UPDATE,
+ 0, &mgid);
+ mcaste->gid = mgid.ib_gid;
+ mcaste->port_gid = mcaste->gid;
+ mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+ mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+ mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+ mcaste->attach_cb = fip_vnic_mcast_cnct_cb;
+ mcaste->detach_cb = fip_vnic_mcast_deattach_cb;
+ mcaste->attach_cb_ctx = NULL;
+ mcaste->detach_cb_ctx = NULL;
+ mcaste->blocking = 0;
+ mcaste->qkey = VNIC_FIP_QKEY;
+ mcaste->pkey = vnic->pkey;
+ mcaste->qp = vnic->qp;
+ mcaste->create = vnic_mcast_create;
+ mcaste->blocking = 0;
+ mcaste->join_state = 1;
+ rc = vnic_mcast_add(&vnic->mcast_tree, mcaste);
+ ASSERT(!rc);
+ rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */
+ ASSERT(!rc);
+
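+ /* vHub TABLE mcast group: carries vhub context table packets */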
+ mcaste = mcaste_tbl;
+ mcaste->priv_data = vnic;
+ mcaste->attach_bit_nr = FIP_MCAST_TABLE;
+ memset(mcaste->mac, 0, ETH_ALEN);
+ vhub_mgid_create(vnic->login_data.mgid_prefix,
+ mcaste->mac,
+ vnic->login_data.n_mac_mcgid,
+ vnic->login_data.vhub_id, VHUB_MGID_TABLE,
+ 0, &mgid);
+ mcaste->gid = mgid.ib_gid;
+ mcaste->port_gid = mcaste->gid;
+ mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+ mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+ mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+ mcaste->attach_cb = fip_vnic_mcast_cnct_cb;
+ mcaste->detach_cb = fip_vnic_mcast_deattach_cb;
+ mcaste->attach_cb_ctx = NULL;
+ mcaste->detach_cb_ctx = NULL;
+ mcaste->blocking = 0;
+ mcaste->qkey = VNIC_FIP_QKEY;
+ mcaste->pkey = vnic->pkey;
+ mcaste->qp = vnic->qp;
+ mcaste->create = vnic_mcast_create;
+ mcaste->blocking = 0;
+ mcaste->join_state = 1;
+ rc = vnic_mcast_add(&vnic->mcast_tree, mcaste);
+ ASSERT(!rc);
+ rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */
+ ASSERT(!rc);
+
+ if (vnic->gw->info.gw_type != GW_TYPE_LAG)
+ return 0;
+
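+ /* vHub KA mcast group (LAG GW only): used for sending keep-alives,
+ * hence sender_only */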
+ mcaste = mcaste_ka;
+ mcaste->priv_data = vnic;
+ mcaste->attach_bit_nr = FIP_MCAST_VHUB_KA;
+ memset(mcaste->mac, 0, ETH_ALEN);
+ vhub_mgid_create(vnic->login_data.mgid_prefix,
+ mcaste->mac,
+ vnic->login_data.n_mac_mcgid,
+ vnic->login_data.vhub_id, VHUB_MGID_KA,
+ 0, &mgid);
+ mcaste->gid = mgid.ib_gid;
+ mcaste->port_gid = mcaste->gid;
+ mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+ mcaste->backoff_factor = 1;
+ mcaste->retry = VNIC_MCAST_MAX_RETRY;
+ mcaste->attach_cb = fip_vnic_mcast_cnct_cb;
+ mcaste->detach_cb = fip_vnic_mcast_deattach_cb;
+ mcaste->attach_cb_ctx = NULL;
+ mcaste->detach_cb_ctx = NULL;
+ mcaste->blocking = 0;
+ mcaste->qkey = VNIC_FIP_QKEY;
+ mcaste->pkey = vnic->pkey;
+ mcaste->qp = vnic->qp;
+ mcaste->create = vnic_mcast_create;
+ mcaste->blocking = 0;
+ mcaste->join_state = 1;
+ mcaste->sender_only = 1;
+ vnic->ka_mcast_gid = mcaste->gid;
+ rc = vnic_mcast_add(&vnic->mcast_tree, mcaste);
+ ASSERT(!rc);
+ rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste);
+ ASSERT(!rc);
+
+ return 0;
+
+free_tbl:
+ vnic_mcast_dealloc(mcaste_tbl);
+
+free_upd:
+ vnic_mcast_dealloc(mcaste_upd);
+
+ return rc;
+}
+
+/*
+ * This function is the driving engine of the vnic logic. It manages the
+ * vnic's state machine.
+ * Some of the states in the state machine could have been removed because
+ * they contain "actions" rather than states. Still, it is easier to maintain
+ * the code this way and it gives an easy mechanism for exception handling
+ * and retries.
+ * Only call this function from fip_wq context.
+*/
+void fip_vnic_fsm(struct work_struct *work)
+{
+ struct fip_vnic_data *vnic =
+ container_of(work, struct fip_vnic_data, vnic_task.work);
+ struct vnic_port *port = vnic->port;
+ int rc, recall_time = 0;
+ const long int msec_in_sec = 1000;
+ struct fip_vnic_send_info gw_address;
+ ktime_t ktime;
+
+ vnic_dbg_fip(port->name, "fip_vnic_fsm called vnic %d\n",
+ vnic->vnic_id);
+
+ if (vnic->flush != FIP_NO_FLUSH)
+ return;
+
+ switch (vnic->state) {
+ case FIP_VNIC_CLOSED:
+ break;
+ case FIP_VNIC_HADMIN_IDLE:
+ if (vnic->gw->state < FIP_GW_CONNECTED)
+ break;
+ fip_vnic_create_gw_param(&gw_address, vnic->gw->info.gw_qpn, VNIC_FIP_QKEY,
+ vnic->gw->info.gw_lid, vnic_gw_ctrl_sl(vnic->gw));
+ fip_vnic_set_gw_param(vnic, &gw_address);
+ /* fall through */
+
+ case FIP_VNIC_LOGIN:
+ vnic_dbg_fip(port->name, "FIP_VNIC_LOGIN vnic %d\n",
+ vnic->vnic_id);
+ /* get data QP numbers needed for login request packet. If we fail
+ * we will close the vnic entirely */
+ rc = fip_vnic_login_init(vnic->port, vnic);
+ if (rc) {
+ fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+ vnic_warn(vnic->name, "fip_vnic_login_init failed, "
+ "closing vnic rc %d\n", rc);
+ break;
+ }
+ vnic->state = FIP_VNIC_WAIT_4_ACK;
+ /* fall through */
+
+ case FIP_VNIC_WAIT_4_ACK:
+ vnic_dbg_fip(port->name, "FIP_VNIC_WAIT_4_ACK vnic %d\n",
+ vnic->vnic_id);
+ /* resend login request every timeout */
+ vnic_dbg_fip(port->name, "fip_login_send vnic %d\n",vnic->vnic_id);
+ rc = fip_login_send(vnic);
+ if (!rc)
+ recall_time = FIP_LOGIN_TIMEOUT * msec_in_sec;
+ else
+ recall_time = 1 * msec_in_sec;
+
+ goto queue_vnic_work;
+
+ case FIP_VNIC_RINGS_INIT:
+ /* create QP and rings */
+ rc = fip_vnic_rings_create(vnic->port, vnic);
+ if (rc) {
+ fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+ vnic_warn(vnic->name, "fip_vnic_rings_create failed, "
+ "closing vnic rc=%d\n", rc);
+ break;
+ }
+
+ vnic->last_send_jiffs = 1; /* use a non zero value to start transmission */
+ {
+ /* start vnic UCAST KA packets; this will also cause the BXM to send us
+ * the neighbor table */
+ if (vnic->gw->info.gw_type != GW_TYPE_LAG) {
+ ktime = ktime_set(0, 0);
+#ifdef _BP_HR_TIMER
+ hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_REL );
+#else
+ hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_MODE_REL );
+#endif
+ }
+ }
+
+ vnic->state = FIP_VNIC_MCAST_INIT;
+ /* fall through */
+
+ case FIP_VNIC_MCAST_INIT:
+ rc = fip_vnic_mcast_cnct(vnic);
+ if (rc) {
+ vnic_warn(vnic->name,
+ "fip_vnic_mcast_cnct failed, rc %d\n", rc);
+ /* try again later */
+ recall_time = 1 * msec_in_sec;
+ goto queue_vnic_work;
+ }
+ vnic->state = FIP_VNIC_MCAST_INIT_DONE;
+ /* fall through */
+
+ case FIP_VNIC_MCAST_INIT_DONE:
+ /* wait for mcast attach CB before continuing */
+ break;
+
+ case FIP_VNIC_VHUB_INIT:
+
+ /* a previous KA, if sent, did not request a table because MCASTs were
+ * not available. Send an extra KA packet that should trigger a table
+ * request and speed things up */
+ fip_vnic_keepalive_send(vnic, 1);
+
+ if (vnic->gw->info.gw_type == GW_TYPE_LAG) {
+ /* start vnic MCAST KA packets; this will also cause the BXM to send us
+ * the neighbor table */
+ ktime = ktime_set(0, 0);
+#ifdef _BP_HR_TIMER
+ hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_REL );
+#else
+ hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_MODE_REL );
+#endif
+ }
+
+ /* start tracking GW keep alives, calls fip_vnic_gw_alive() */
+ QUEUE_VNIC_DWORK(vnic, &vnic->vnic_gw_alive_task,
+ vnic->gw->info.gw_period);
+
+ vnic->state = FIP_VNIC_VHUB_INIT_DONE;
+ /* fall through */
+
+ case FIP_VNIC_VHUB_INIT_DONE:
+ /* we are waiting to receive a full vhub table. The KA will handle
+ * retries if we do not get the table we are expecting */
+
+ /* queue login create request */
+ if (fip_vnic_test_login(vnic, 1)) {
+ recall_time = 1 * msec_in_sec;
+ goto queue_vnic_work;
+ }
+
+ break;
+
+ case FIP_VNIC_VHUB_DONE:
+ if (fip_vnic_test_login(vnic, 1)) {
+ recall_time = 1 * msec_in_sec;
+ goto queue_vnic_work;
+ }
+
+ if (vnic_login_complete_ack(vnic, &vnic->login_data, &vnic->shared_vnic)) {
+ vnic_warn(vnic->name,
+ "vnic_login_complete_ack failed\n");
+ recall_time = 1 * msec_in_sec;
+ goto queue_vnic_work;
+ }
+
+ /* for LAG write member info */
+ fip_vnic_write_members(vnic);
+
+ vnic->state = FIP_VNIC_VHUB_WRITE;
+ /* fall through */
+
+ case FIP_VNIC_VHUB_WRITE:
+ /* write the vhub table to login */
+ fip_vnic_write_tbl(vnic);
+ vnic->state = FIP_VNIC_CONNECTED;
+ /* fall through */
+
+ case FIP_VNIC_CONNECTED:
+ vnic->retry_count = 0;
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+
+ vnic_dbg_fip(port->name, "state %d gw_lid %d gw_qpn %d\n",
+ vnic->state, vnic->gw_address.gw_lid, vnic->gw_address.gw_qpn);
+ return;
+
+queue_vnic_work:
+ QUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, recall_time * HZ / msec_in_sec);
+}
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+
+struct workqueue_struct *fip_wq;
+
+void fip_refresh_mcasts(struct fip_discover *discover)
+{
+ struct fip_gw_data *gw;
+ struct fip_vnic_data *vnic;
+
+ fip_discover_mcast_reattach(discover, discover->port);
+
+ down_read(&discover->l_rwsem);
+ list_for_each_entry(gw, &discover->gw_list, list)
+ list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+ if (vnic->flush != FIP_FULL_FLUSH && vnic->state > FIP_VNIC_MCAST_INIT)
+ vnic_tree_mcast_detach(&vnic->mcast_tree);
+ }
+
+ list_for_each_entry(gw, &discover->gw_list, list) {
+ list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+ if (vnic->flush != FIP_FULL_FLUSH && vnic->state > FIP_VNIC_MCAST_INIT)
+ vnic_tree_mcast_attach(&vnic->mcast_tree);
+ }
+ /* restart path query */
+ if (vnic_sa_query && gw->state >= FIP_GW_CTRL_PATH_QUERY && gw->flush == FIP_NO_FLUSH)
+ fip_discover_gw_fsm_move(gw, FIP_GW_CTRL_PATH_QUERY);
+ }
+ up_read(&discover->l_rwsem);
+
+}
+
+void port_fip_discover_restart(struct work_struct *work)
+{
+ struct vnic_port *port =
+ container_of(work, struct vnic_port, discover_restart_task.work);
+ struct fip_discover *discover;
+ struct vnic_login *login;
+
+ vnic_dbg_mark();
+ mutex_lock(&port->start_stop_lock);
+ vnic_dbg_mark();
+ mutex_lock(&port->mlock);
+ if (vnic_port_query(port))
+ vnic_warn(port->name, "vnic_port_query failed\n");
+
+ /* bring vnics links down */
+ list_for_each_entry(login, &port->login_list, list)
+ vnic_mcast_del_all(&login->mcast_tree);
+
+ mutex_unlock(&port->mlock);
+ list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+ if (fip_discover_cleanup(port, discover, 0)) {
+ vnic_dbg(port->name, "fip_discover_cleanup flushed\n");
+ goto out;
+ }
+ }
+
+ list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+ if (fip_discover_init(port, discover, discover->pkey, 0)) {
+ vnic_warn(port->name, "failed to alloc discover resources\n");
+ }
+ }
+out:
+ mutex_unlock(&port->start_stop_lock);
+ return;
+}
+
+void vnic_port_fip_cleanup(struct vnic_port *port, int lock)
+{
+ struct fip_discover *discover, *tmp_discover;
+
+ if (lock)
+ mutex_lock(&port->start_stop_lock);
+
+ list_for_each_entry_safe(discover, tmp_discover, &port->fip.discover_list, discover_list) {
+ vnic_dbg_fip_p0(port->name, "Discovery cleanup of PKEY=0x%x\n", discover->pkey);
+
+ list_del(&discover->discover_list);
+ vnic_info("Removed fip discovery %s port %d pkey 0x%x\n",
+ port->dev->ca->name, port->num, discover->pkey);
+ fip_discover_cleanup(port, discover, 1);
+ kfree(discover);
+ }
+
+ if (lock)
+ mutex_unlock(&port->start_stop_lock);
+}
+
+
+int vnic_port_fip_init(struct vnic_port *port)
+{
+ int rc;
+ struct fip_discover *discover;
+ int i;
+
+ if (no_bxm)
+ return 0;
+
+ vnic_discovery_pkeys_count = vnic_discovery_pkeys_count > MAX_NUM_PKEYS_DISCOVERY ?
+ MAX_NUM_PKEYS_DISCOVERY : vnic_discovery_pkeys_count;
+
+ if (vnic_discovery_pkeys_count == 0 ||
+ (vnic_discovery_pkeys_count == MAX_NUM_PKEYS_DISCOVERY &&
+ vnic_discovery_pkeys[0] == 0)) {
+ vnic_discovery_pkeys[0] = 0xffff;
+ vnic_discovery_pkeys_count = 1;
+ vnic_dbg_fip_p0(port->name, "Creating default PKEY for Discovery\n");
+ }
+
+ mutex_lock(&port->start_stop_lock);
+
+ for (i = 0; i < vnic_discovery_pkeys_count; i++) {
+ vnic_discovery_pkeys[i] &= 0xffff;
+ vnic_discovery_pkeys[i] |= 0x8000;
+
+ vnic_dbg_fip_p0(port->name, "Init Discovery=%d on PKEY=0x%x\n", i, vnic_discovery_pkeys[i]);
+
+ discover = kzalloc(sizeof(struct fip_discover), GFP_KERNEL);
+ if (!discover) {
+ vnic_warn(port->name, "discover alloc failed\n");
+ rc = -ENOMEM;
+ goto fail;
+ }
+
+ INIT_LIST_HEAD(&discover->discover_list);
+
+ vnic_info("Added fip discovery %s port %d PKEY 0x%x\n",
+ port->dev->ca->name, port->num,
+ vnic_discovery_pkeys[i]);
+
+ list_add_tail(&discover->discover_list, &port->fip.discover_list);
+ rc = fip_discover_init(port, discover, vnic_discovery_pkeys[i], 1);
+ if (rc) {
+ vnic_warn(port->name, "fip_discover_init pkey=0x%x "
+ "failed\n", discover->pkey);
+ list_del(&discover->discover_list);
+ kfree(discover);
+ goto fail;
+ }
+ }
+ mutex_unlock(&port->start_stop_lock);
+ return 0;
+
+fail:
+ mutex_unlock(&port->start_stop_lock);
+ vnic_port_fip_cleanup(port, 1);
+ return rc;
+}
+
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/uts.h>
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+#include "vnic_fip_pkt.h"
+
+const struct eoib_host_update base_update_pkt = {
+ .fip.subcode = FIP_HOST_ALIVE_SUB_OPCODE,
+ .fip.type.type = FIP_FIP_HDR_TYPE,
+ .fip.type.length = FIP_FIP_HDR_LENGTH,
+ .fip.vendor_id = FIP_VENDOR_MELLANOX,
+
+ .type_1.type = FIP_HOST_UPDATE_TYPE,
+ .type_1.length = FIP_HOST_UPDATE_LENGTH,
+ .vendor_id = FIP_VENDOR_MELLANOX,
+};
+
+const struct eoib_host_update base_logout_pkt = {
+ .fip.subcode = FIP_HOST_LOGOUT_SUB_OPCODE,
+ .fip.type.type = FIP_FIP_HDR_TYPE,
+ .fip.type.length = FIP_FIP_HDR_LENGTH,
+ .fip.vendor_id = FIP_VENDOR_MELLANOX,
+
+ .type_1.type = FIP_LOGOUT_TYPE_1,
+ .type_1.length = FIP_LOGOUT_LENGTH_1,
+ .vendor_id = FIP_VENDOR_MELLANOX,
+};
+
+static int extract_adv_extended(struct fip_ext_desc_tlv *fed,
+ struct fip_gw_data_info *info)
+{
+ struct fip_ext_type_cap *extended_cap;
+ struct fip_ext_type_boot *extended_boot;
+ struct fip_ext_type_power_cycle_id *extended_pc_id;
+ struct fip_ext_type_lag_props *extended_lag = NULL;
+ struct fip_extended_type *ext_hdr;
+ int length_to_go, ext_length;
+
+ vnic_dbg_parse("", "extracting extended descriptor\n");
+
+ length_to_go = (((int)fed->ft.length) << 2) - sizeof(*fed);
+ ext_hdr = (struct fip_extended_type *)(fed + 1);
+
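+ /* walk the extended sub-TLVs; header lengths are in 4-byte units */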
+ while (length_to_go > 0) {
+ ext_length = ((int)ext_hdr->len) << 2;
+
+ vnic_dbg_parse(NULL, "Advertise parse, sub-tlv "
+ "type %d length %d address=%p\n",
+ ext_hdr->ext_type, ext_length, ext_hdr);
+
+ if (ext_length < sizeof(*ext_hdr) ||
+ ext_length > length_to_go) {
+ vnic_dbg_parse(NULL, "Extended length error. "
+ "Length=%d\n", ext_length);
+ return -EINVAL;
+ }
+
+ if (ext_hdr->ext_type == ADV_EXT_TYPE(CAP) &&
+ ext_length == sizeof(*extended_cap)) { /* capabilities*/
+ /* do nothing */
+ } else if (ext_hdr->ext_type == ADV_EXT_TYPE(LAG) && /* LAG */
+ ext_length == sizeof(*extended_lag)) {
+ extended_lag = (struct fip_ext_type_lag_props *)ext_hdr;
+ info->gw_type = extended_lag->gw_type;
+ info->ext_lag.hash = be16_to_cpu(extended_lag->lag_hash);
+ info->ext_lag.weights_policy = extended_lag->weight_policy_flags >> 4;
+ info->ext_lag.member_ka = (extended_lag->weight_policy_flags & 0x8) >> 3;
+ info->ext_lag.ca = !!(extended_lag->weight_policy_flags &
+ FIP_EXT_LAG_W_POLICY_HOST);
+ info->ext_lag.ca_thresh = extended_lag->ca_threshold;
+ info->ext_lag.ucast = !!(extended_lag->weight_policy_flags &
+ FIP_EXT_LAG_W_POLICY_UCAST);
+ info->ext_lag.valid = 1;
+ } else if (ext_hdr->ext_type == ADV_EXT_TYPE(BOOT) &&
+ ext_length == sizeof(*extended_boot)) { /* boot */
+ extended_boot = (struct fip_ext_type_boot *)ext_hdr;
+ info->ext_boot.boot_prio = extended_boot->boot_prio;
+ info->ext_boot.timeout = extended_boot->discovery_timeout;
+ info->ext_boot.valid = 1;
+ } else if (ext_hdr->ext_type == ADV_EXT_TYPE(PC_ID) &&
+ ext_length == sizeof(*extended_pc_id)) { /* Power Cycle ID */
+ extended_pc_id = (struct fip_ext_type_power_cycle_id *)ext_hdr;
+ info->ext_pc_id.power_cycle_id =
+ be64_to_cpu(extended_pc_id->power_cycle_id);
+ info->ext_pc_id.valid = 1;
+ } else if (ext_hdr->mandatory & 0x01) {
+ vnic_dbg_parse(NULL, "Advertise parse, unknown"
+ " mandatory extended type %d length %d\n",
+ ext_hdr->ext_type, ext_length);
+ return -EINVAL;
+ } else
+ vnic_dbg_parse(NULL, "Advertise parse, unknown "
+ "non-mandatory extended. Skipping, type"
+ " %d length %d\n",
+ ext_hdr->ext_type, ext_length);
+
+ ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length);
+ length_to_go -= ext_length;
+ }
+
+ return 0;
+}
+
+int fip_advertise_parse_bh(struct fip_discover *discover, struct fip_content *fc,
+ struct fip_gw_data *data)
+{
+ long ka_time;
+ int err = 0;
+
+ /* make sure we have at least a single address descriptor */
+ if (fc->fa.num < 1 || !fc->fgwi || !fc->fgid || !fc->fka)
+ return -EINVAL;
+
+ data->info.flags = be16_to_cpu(fc->fh->flags) & FIP_FIP_ADVRTS_FLAG ? FIP_GW_AVAILABLE : 0;
+
+ data->info.flags |=
+ (be16_to_cpu(fc->fh->flags) & FIP_FIP_SOLICITED_FLAG) ? 0 :
+ FIP_RCV_MULTICAST;
+
+ data->info.flags |= FIP_IS_FIP;
+ data->info.flags |= (fc->fh->flags & FIP_ADVERTISE_HOST_VLANS) ?
+ FIP_HADMINED_VLAN : 0;
+
+ data->info.gw_qpn = be32_to_cpu(fc->fa.fa[0]->gwtype_qpn) & 0xffffff;
+ data->info.gw_lid = be16_to_cpu(fc->fa.fa[0]->lid);
+ data->info.gw_port_id = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) &
+ FIP_ADVERTISE_GW_PORT_ID_MASK;
+ data->info.gw_sl = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT; /*ignore this value.*/
+ memcpy(data->info.gw_guid, fc->fa.fa[0]->guid, sizeof(data->info.gw_guid));
+ data->info.gw_num_vnics = be16_to_cpu(fc->fgwi->n_rss_qpn_vnics) &
+ FIP_ADVERTISE_NUM_VNICS_MASK;
+
+ data->info.n_rss_qpn = be16_to_cpu(fc->fgwi->n_rss_qpn_vnics) >>
+ FIP_ADVERTISE_N_RSS_SHIFT;
+ data->info.hadmined_en = (fc->fgwi->h_nmac_mgid & FIP_ADVERTISE_HOST_EN_MASK);
+ data->info.all_vlan_gw = !!(fc->fgwi->h_nmac_mgid & FIP_ADVERTISE_ALL_VLAN_GW_MASK);
+
+ TERMINATED_MEMCPY(data->info.gw_vendor_id, fc->fgwi->vendor_id);
+ memcpy(data->info.vol_info.system_guid, fc->fgid->sys_guid,
+ sizeof(data->info.vol_info.system_guid));
+ TERMINATED_MEMCPY(data->info.vol_info.system_name,
+ fc->fgid->sys_name);
+ TERMINATED_MEMCPY(data->info.vol_info.gw_port_name, fc->fgid->gw_port_name);
+
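+ /* keep-alive periods are received in milliseconds; convert to jiffies
+ * (the GW periods also get the timeout factor applied) */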
+ ka_time = be32_to_cpu(fc->fka->adv_period);
+ ka_time = ka_time ? ka_time : FKA_ADV_PERIOD;
+ /* do not let KA go under 2 secs */
+ ka_time = (ka_time < 2000) ? 2000 : ka_time;
+ data->info.gw_adv_period = FIP_TIMEOUT_FACTOR(msecs_to_jiffies(ka_time));
+
+ ka_time = be32_to_cpu(fc->fka->ka_period);
+ ka_time = ka_time ? ka_time : FKA_ADV_PERIOD;
+ data->info.gw_period = FIP_TIMEOUT_FACTOR(msecs_to_jiffies(ka_time));
+
+ ka_time = be32_to_cpu(fc->fka->vnic_ka_period);
+ ka_time = ka_time ? ka_time : FKA_ADV_PERIOD;
+ data->info.vnic_ka_period = msecs_to_jiffies(ka_time);
+
+ data->info.gw_type = GW_TYPE_SINGLE_EPORT;
+ if (fc->fed.num > 0) {
+ if (fc->fed.num == 1) {
+ /* new version bxm mode */
+ data->info.gw_prot_new = 1;
+ err = extract_adv_extended(fc->fed.fed[0], &data->info);
+ if (err)
+ vnic_dbg_parse(discover->name, "invalid extended descripotr\n");
+ } else {
+ vnic_dbg_parse(discover->name, "too many extended descripotrs\n");
+ return -EINVAL;
+ }
+ }
+
+ return err;
+}
+
+static int send_generic_mcast_pkt(struct vnic_port *port,
+ struct fip_ring *tx_ring,
+ void *mem, int pkt_size,
+ struct ib_qp *qp,
+ int pkey_index,
+ struct vnic_mcast *mcast)
+{
+ int index, rc;
+ unsigned long flags;
+ unsigned long tail;
+
+ /*
+ * only task-level code updates the head, so the ring lock is enough for
+ * it; the head/tail lock below is only needed to sample the tail
+ */
+ spin_lock_irqsave(&tx_ring->ring_lock, flags);
+ index = tx_ring->head & (tx_ring->size - 1);
+ vnic_dbg_fip(port->name, "mcast packet\n");
+
+ spin_lock(&tx_ring->head_tail_lock);
+ tail = tx_ring->tail;
+ spin_unlock(&tx_ring->head_tail_lock);
+
+ /* ring full try again */
+ if (tx_ring->head - tail >= tx_ring->size) {
+ vnic_warn(port->name, "send_generic_mcast_pkt ring full: QPN 0x%x: tail=%ld head=%ld diff=%ld\n",
+ qp->qp_num, tx_ring->tail, tx_ring->head, tx_ring->head - tx_ring->tail);
+ rc = -EAGAIN;
+ goto err;
+ }
+
+ rc = _map_generic_pkt(port, &tx_ring->ring[index], mem, pkt_size);
+ if (rc)
+ goto err;
+
+ rc = fip_mcast_send(port, qp, index,
+ tx_ring->ring[index].bus_addr,
+ pkt_size, pkey_index, mcast);
+
+ if (rc) {
+ vnic_warn(port->name,
+ "send_generic_mcast_pkt: fip_mcast_send ret %d\n",
+ rc);
+ rc = -ENODEV;
+ goto error_unmap_dma;
+ }
+
+ tx_ring->head++;
+
+ spin_unlock_irqrestore(&tx_ring->ring_lock, flags);
+ return 0;
+
+error_unmap_dma:
+ ib_dma_unmap_single(port->dev->ca,
+ tx_ring->ring[index].bus_addr,
+ pkt_size, DMA_TO_DEVICE);
+
+err:
+ spin_unlock_irqrestore(&tx_ring->ring_lock, flags);
+ return rc;
+}
+
+static void *alloc_solicit_pkt(int new_prot, char *node_desc)
+{
+ void *ptr;
+ struct fip_solicit_new *nptr;
+ struct fip_solicit_legacy *optr;
+ int size = new_prot ? sizeof *nptr : sizeof *optr;
+
+ ptr = kzalloc(size, GFP_KERNEL);
+ if (!ptr)
+ return ERR_PTR(-ENOMEM);
+ optr = ptr;
+ optr->version.version = 1;
+ optr->fh.opcode = cpu_to_be16(EOIB_FIP_OPCODE);
+ optr->fh.subcode = FIP_HOST_SOL_SUB_OPCODE;
+ optr->fh.list_length = cpu_to_be16((size - offsetof(typeof(*optr), fvend)) / 4);
+ optr->fvend.ft.type = FIP_TYPE(VENDOR_ID);
+ optr->fvend.ft.length = sizeof optr->fvend / 4;
+ strncpy(optr->fvend.vendor_id, "mellanox", sizeof optr->fvend.vendor_id);
+ optr->addr.ft.type = FIP_TYPE(ADDRESS);
+ optr->addr.ft.length = sizeof optr->addr / 4;
+ strncpy(optr->addr.vendor_id, "mellanox", sizeof optr->addr.vendor_id);
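+ /* new-protocol hosts append an extended descriptor carrying a
+ * capability sub-TLV and the host name */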
+ if (new_prot) {
+ nptr = ptr;
+ nptr->ext.ft.type = 254;
+ nptr->ext.ft.length = sizeof nptr->ext / 4;
+ strncpy(nptr->ext.vendor_id, "mellanox", sizeof nptr->ext.vendor_id);
+ nptr->ext_cap.et.ext_type = 40;
+ nptr->ext_cap.et.len = sizeof nptr->ext_cap / 4;
+ nptr->ext_cap.et.mandatory = 1;
+ nptr->ext_hostname.et.ext_type = 39;
+ nptr->ext_hostname.et.len = sizeof nptr->ext_hostname / 4;
+ strncpy(nptr->ext_hostname.hostname, node_desc, sizeof nptr->ext_hostname.hostname);
+ }
+
+ return ptr;
+}
+
+int fip_solicit_send(struct fip_discover *discover,
+ enum fip_packet_type multicast,
+ u32 dqpn, u16 dlid, u8 sl, int new_prot)
+{
+ int rc = 0;
+ unsigned long flags, flags1;
+ struct fip_solicit_legacy *optr;
+ int size = new_prot ? sizeof(struct fip_solicit_new) : sizeof *optr;
+
+ ASSERT(discover);
+
+ /* alloc packet to be sent */
+ optr = alloc_solicit_pkt(new_prot, discover->port->dev->ca->node_desc);
+ if (IS_ERR(optr))
+ return PTR_ERR(optr);
+
+ /* we set bit 24 to signify that we're a new host */
+ optr->addr.gwtype_qpn = cpu_to_be32(discover->qp->qp_num | 0x1000000);
+ optr->addr.lid = cpu_to_be16(discover->port->attr.lid);
+ /* send the SL to the GW*/
+ optr->addr.sl_gwportid = cpu_to_be16(sl << FIP_ADVERTISE_SL_SHIFT);
+
+ memcpy(optr->addr.guid, &discover->port->gid.global.interface_id, sizeof(optr->addr.guid));
+ vnic_dbg_fip(discover->name, "fip_solicit_send creating multicast %d"
+ " solicit packet\n", multicast);
+
+ fip_dbg_dump_raw_pkt(0, optr, size, 1, "sending solicit packet");
+
+ if (multicast) {
+ struct vnic_mcast *mcaste;
+ union ib_gid gid;
+
+ memcpy(&gid, fip_solicit_mgid, GID_LEN);
+ spin_lock_irqsave(&discover->mcast_tree.mcast_rb_lock, flags);
+ mcaste = vnic_mcast_search(&discover->mcast_tree, &gid);
+ /* it is possible for the MCAST entry or AH to be missing in
+ * transient states (after events). This is a valid condition,
+ * but we cannot send the packet
+ */
+ if (!IS_ERR(mcaste) && mcaste->ah) {
+ spin_lock_irqsave(&mcaste->lock, flags1);
+ rc = send_generic_mcast_pkt(discover->port, &discover->tx_ring,
+ optr, size, discover->qp,
+ discover->pkey_index,
+ mcaste);
+ spin_unlock_irqrestore(&mcaste->lock, flags1);
+ } else
+ kfree(optr);
+
+ spin_unlock_irqrestore(&discover->mcast_tree.mcast_rb_lock, flags);
+ } else {
+ rc = send_generic_ucast_pkt(discover->port, NULL, &discover->tx_ring,
+ optr, size, discover->qp,
+ discover->pkey_index,
+ dqpn, dlid, VNIC_FIP_QKEY, sl);
+ }
+ if (rc)
+ goto error_free_mem;
+
+ return 0;
+
+error_free_mem:
+ vnic_warn(discover->name, "fip_solicit_send error ret %d\n", rc);
+ kfree(optr);
+ return rc;
+}
+
+static void *alloc_login_pkt(struct fip_vnic_data *vnic)
+{
+ struct eoib_login *ptr;
+ int size = sizeof *ptr;
+
+ ptr = kzalloc(size, GFP_KERNEL);
+ if (!ptr)
+ return ERR_PTR(-ENOMEM);
+
+ ptr->eoib_ver.version = 1;
+ ptr->fh.opcode = cpu_to_be16(EOIB_FIP_OPCODE);
+ ptr->fh.subcode = FIP_HOST_LOGIN_SUB_OPCODE;
+ ptr->fh.list_length = cpu_to_be16((size - offsetof(typeof(*ptr), fvend)) / 4);
+ ptr->fvend.ft.type = FIP_TYPE(VENDOR_ID);
+ ptr->fvend.ft.length = sizeof ptr->fvend / 4;
+ strncpy(ptr->fvend.vendor_id, "mellanox", sizeof ptr->fvend.vendor_id);
+ ptr->fa.ft.type = FIP_TYPE(ADDRESS);
+ ptr->fa.ft.length = sizeof ptr->fa / 4;
+ strncpy(ptr->fa.vendor_id, "mellanox", sizeof ptr->fa.vendor_id);
+ ptr->fa.gwtype_qpn = cpu_to_be32(vnic->qp_base_num);
+ ptr->fa.sl_gwportid = cpu_to_be16(vnic->gw->info.gw_port_id);
+ /* sl will be taken from the data path record query */
+ ptr->fa.sl_gwportid |= cpu_to_be16(vnic->gw->data_prec.sl << FIP_ADVERTISE_SL_SHIFT);
+ ptr->fa.lid = cpu_to_be16(vnic->port->attr.lid);
+ memcpy(ptr->fa.guid, &vnic->port->gid.global.interface_id, sizeof ptr->fa.guid);
+ ptr->fl.ft.type = FIP_TYPE(LOGIN);
+ ptr->fl.ft.length = sizeof ptr->fl / 4;
+ strncpy(ptr->fl.vendor_id, "mellanox", sizeof ptr->fl.vendor_id);
+ ptr->fl.vnic_id = cpu_to_be16(vnic->vnic_id);
+
+ if (vnic->hadmined) {
+ int mac_valid = !IS_ZERO_MAC(vnic->login_data.mac);
+ u16 flags = (mac_valid ? FIP_LOGIN_M_FLAG : 0) |
+ FIP_LOGIN_H_FLAG |
+ (vnic->login_data.vp ? FIP_LOGIN_VP_FLAG | FIP_LOGIN_V_FLAG : 0);
+ ptr->fl.flags_vlan = cpu_to_be16(vnic->login_data.vlan | flags );
+ memcpy(ptr->fl.mac, vnic->login_data.mac, sizeof ptr->fl.mac);
+ memcpy(ptr->fl.vnic_name, vnic->login_data.vnic_name, sizeof ptr->fl.vnic_name);
+
+ /* TODO: remove this when BXM handles zero MAC addresses */
+ if (!mac_valid)
+ ptr->fl.mac[ETH_ALEN-1] = 1;
+ }
+
+ /* all_vlan mode must be enforced on both the host and the GW side.
+ * For a host-admin vnic with a VLAN we let the host choose the work
+ * mode; if the GW is not working in that same mode, the login will
+ * fail and the host will enter a login-retry loop.
+ * For a net-admin vnic, or a host-admin vnic without a VLAN, we work
+ * in the mode published by the GW.
+ */
+ if (vnic->gw->info.all_vlan_gw &&
+ (!vnic->hadmined || !vnic->login_data.vp))
+ ptr->fl.vfields |= cpu_to_be16(FIP_LOGIN_ALL_VLAN_GW_FLAG);
+
+ ptr->fl.syndrom_ctrl_qpn = cpu_to_be32(vnic->gw->discover->qp->qp_num);
+ ptr->fl.vfields |= cpu_to_be16((vnic->qps_num > 1) << 12);
+
+ /* for child vNics, allow implicit logout */
+ if (vnic->parent_used) {
+ ptr->fl.vfields |= cpu_to_be16(1 << 14);
+ ptr->fl.vfields |= cpu_to_be16(1 << 13);
+ }
+
+ return ptr;
+}
+
+/*
+ * Send a unicast login packet. This function supports both host-admin
+ * and network-admin logins. Returns 0 on success and an error code
+ * on failure.
+*/
+int fip_login_send(struct fip_vnic_data *vnic)
+{
+ int ret;
+ struct eoib_login *ptr;
+
+ ASSERT(vnic);
+ ASSERT(vnic->port);
+
+ /* don't send packet because GW does not support this */
+ if (vnic->hadmined && !vnic->gw->hadmin_gw)
+ return 0;
+
+ /* alloc packet to be sent */
+ ptr = alloc_login_pkt(vnic);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+
+ fip_dbg_dump_raw_pkt(0, ptr, sizeof *ptr, 1, "sending login packet");
+
+ ret = send_generic_ucast_pkt(vnic->port, NULL/*ah : create a new ah inside*/,
+ &vnic->gw->discover->tx_ring,
+ ptr, sizeof *ptr, vnic->gw->discover->qp,
+ vnic->gw->discover->pkey_index,
+ vnic->gw_address.gw_qpn,
+ vnic->gw_address.gw_lid,
+ vnic->gw_address.qkey,
+ vnic_gw_ctrl_sl(vnic->gw));
+ if (ret) {
+ vnic_warn(vnic->port->name,
+ "fip_login_send: fip_ucast_send ret %d\n", ret);
+ goto error_free_mem;
+ }
+
+ return 0;
+
+error_free_mem:
+ kfree(ptr);
+ return ret;
+}
+
+/*
+ * This function creates and sends a few types of packets (all ucast):
+ * vHub context request - new=1, logout=0
+ * vHub context update / vnic keep alive - new=0, logout=0
+ * vnic logout - new=0, logout=1
+*/
+int fip_update_send(struct fip_vnic_data *vnic, int request_new, int logout)
+{
+ struct eoib_host_update *pkt;
+ struct ib_qp *send_qp;
+ struct fip_ring *tx_ring;
+ int pkey_index;
+ int ret = 0;
+
+ ASSERT(vnic);
+ ASSERT(vnic->port);
+
+ /* alloc packet to be sent */
+ pkt = kmalloc(sizeof *pkt, GFP_ATOMIC);
+ if (!pkt) {
+ vnic_warn(vnic->port->name, "fip_update_send malloc failed\n");
+ return -EAGAIN;
+ }
+
+ /* copy keep alive packet template */
+ if (logout)
+ memcpy(pkt, &base_logout_pkt, sizeof(struct eoib_host_update));
+ else
+ memcpy(pkt, &base_update_pkt, sizeof(struct eoib_host_update));
+
+ pkt->fip.opcode = cpu_to_be16(EOIB_FIP_OPCODE);
+ pkt->fip.list_length =
+ cpu_to_be16((sizeof(struct eoib_host_update) >> 2) - 3);
+ pkt->vnic_id = cpu_to_be16(vnic->vnic_id);
+ memcpy(pkt->mac, vnic->login_data.mac, sizeof(pkt->mac));
+ memcpy(pkt->vnic_name, vnic->login_data.vnic_name,
+ sizeof(pkt->vnic_name));
+ memcpy(pkt->port_guid, &vnic->port->gid.global.interface_id,
+ sizeof(pkt->port_guid));
+
+ pkt->vhub_id.vhub_id = cpu_to_be32(vnic->login_data.vhub_id);
+
+ if (!logout) {
+ pkt->tusn = cpu_to_be32(vnic->vhub_table.main_list.tusn);
+ send_qp = vnic->qp;
+ tx_ring = &vnic->tx_ring;
+ pkey_index = vnic->pkey_index;
+
+ if (vnic->login_data.vp)
+ pkt->vhub_id.flags.flags |= FIP_HOST_VP_FLAG;
+
+ if (request_new)
+ pkt->vhub_id.flags.flags |= FIP_HOST_R_FLAG;
+ else
+ pkt->vhub_id.flags.flags |= FIP_HOST_U_FLAG;
+ } else {
+ send_qp = vnic->gw->discover->qp;
+ tx_ring = &vnic->gw->discover->tx_ring;
+ pkey_index = vnic->gw->discover->pkey_index;
+ }
+
+ if (vnic->gw->info.gw_type == GW_TYPE_LAG &&
+ !vnic->gw->info.ext_lag.ucast && !logout) {
+ struct vnic_mcast *mcaste;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vnic->mcast_tree.mcast_rb_lock, flags);
+ mcaste = vnic_mcast_search(&vnic->mcast_tree, &vnic->ka_mcast_gid);
+ if (!IS_ERR(mcaste)) {
+ if (mcaste->ah) {
+ ret = send_generic_mcast_pkt(vnic->port, &vnic->tx_ring,
+ pkt, sizeof *pkt, vnic->qp,
+ vnic->pkey_index, mcaste);
+ vnic_dbg_parse(vnic->name, "sent multicast keep alive\n");
+ }
+ else {
+ vnic_dbg_parse(vnic->name, "mcaste %p: ah is null\n", mcaste);
+ kfree(pkt);
+ }
+ } else {
+ vnic_dbg_parse(vnic->name, "ka mcast not found\n");
+ ret = -ENOMEM;
+ }
+ spin_unlock_irqrestore(&vnic->mcast_tree.mcast_rb_lock, flags);
+
+ } else
+ /* For LAG gateway the ah is not up to date and therefore
+ should not be used */
+ ret = send_generic_ucast_pkt(vnic->port, NULL/*ah : create a new ah inside*/,
+ tx_ring, pkt, sizeof *pkt,
+ send_qp,
+ pkey_index,
+ vnic->gw_address.gw_qpn,
+ vnic->gw_address.gw_lid,
+ vnic->gw_address.qkey,
+ vnic_gw_ctrl_sl(vnic->gw));
+ if (ret) {
+ vnic_warn(vnic->port->name,
+ "fip_update_send: ret %d\n", ret);
+ goto error_free_mem;
+ }
+
+ return 0;
+
+error_free_mem:
+ kfree(pkt);
+ return ret;
+}
+
+static void dump_lag_member(struct lag_member *m)
+{
+ vnic_dbg_lag("", "QPN 0x%x, SL %d, gw_portid 0x%x, LID 0x%x, guid " GUID_FORMAT
+ ", eport_state %s, weight %d, link_utilization %d\n",
+ m->qpn, m->sl, m->gw_port_id, m->lid, GUID_ARG(m->guid),
+ eport_state_str(m->eport_state), m->weight, m->link_utilization);
+}
+
+static inline int handle_lag_member(struct fip_vnic_data *vnic,
+ struct fip_ext_type_lag_members *ext_lag_membs,
+ int ext_length)
+{
+ struct lag_members lag_members;
+
+ extract_memb_extended(ext_lag_membs, ext_length, &lag_members, vnic->name);
+
+ /* propagate change in member state as needed */
+ return handle_member_update(vnic, &lag_members);
+}
+
+int extract_vhub_extended(struct fip_ext_desc_tlv *fed,
+ struct fip_vnic_data *vnic)
+{
+ struct fip_ext_type_ctrl_iport *ext_ctrl_iport;
+ struct fip_ext_type_lag_members *ext_lag_memb;
+ struct fip_extended_type *ext_hdr;
+ struct fip_vnic_send_info *gw_addr;
+ int length_to_go, ext_length;
+
+ if (fed->ft.type != 254)
+ return -EINVAL;
+
+ length_to_go = ((int)(fed->ft.length) << 2) - sizeof(*fed);
+ ext_hdr = (struct fip_extended_type *)(fed + 1);
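+ /* sub-TLVs follow the extended descriptor header back to back; each
+ * sub-TLV length is expressed in 4-byte words
+ */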
+
+ while (length_to_go > 0) {
+ ext_length = ((int)ext_hdr->len) << 2;
+
+ vnic_dbg_parse(vnic->name, "Table Update parse, sub-tlv "
+ "type %d length %d address=%p\n",
+ ext_hdr->ext_type, ext_length, ext_hdr);
+
+ if (ext_length < sizeof(*ext_hdr) ||
+ ext_length > length_to_go) {
+ vnic_dbg_parse(vnic->name, "Extended length error."
+ " Length=%d\n", ext_length);
+ return -EINVAL;
+ }
+
+ switch (ext_hdr->ext_type) {
+ case ADV_EXT_TYPE(MEMBER):
+ ext_lag_memb = (struct fip_ext_type_lag_members *)ext_hdr;
+
+ if (handle_lag_member(vnic, ext_lag_memb, ext_length))
+ vnic_dbg_parse(vnic->name, "handle_lag_member() failed");
+ break;
+ case ADV_EXT_TYPE(CTRL_IPORT):
+ if (ext_length != sizeof(*ext_ctrl_iport)) {
+ vnic_dbg_parse(vnic->name, "Extended length %d is"
+ " different than expected\n",
+ ext_length);
+ return -EINVAL;
+ }
+
+ gw_addr = &vnic->gw_address;
+ ext_ctrl_iport = (struct fip_ext_type_ctrl_iport *)ext_hdr;
+ gw_addr->gw_qpn = be32_to_cpu(ext_ctrl_iport->gwtype_qpn);
+ gw_addr->gw_lid = be16_to_cpu(ext_ctrl_iport->lid);
+ gw_addr->gw_sl = be16_to_cpu(ext_ctrl_iport->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT;
+ break;
+ default:
+ if (ext_hdr->mandatory & 0x01) {
+ vnic_dbg_parse(vnic->name, "Unknown mandatory extended type %d length %d\n",
+ ext_hdr->ext_type, ext_length);
+ return -EINVAL;
+ } else {
+ vnic_dbg_parse(vnic->name, "Unknown non-mandatory extended. Skipping, type %d length %d\n",
+ ext_hdr->ext_type, ext_length);
+ ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length);
+ length_to_go -= ext_length;
+ continue;
+ }
+ }
+
+ ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length);
+ length_to_go -= ext_length;
+ }
+
+ return 0;
+}
+
+static int extract_login_extended(struct fip_ext_desc_tlv *fed,
+ struct lag_members *lagm,
+ char *name)
+{
+ struct fip_ext_type_lag_members *ext_lag_membs;
+ struct fip_extended_type *ext_hdr;
+ int length_to_go, ext_length;
+
+ if (fed->ft.type != 254)
+ return -EINVAL;
+
+ length_to_go = ((int)(fed->ft.length) << 2) - sizeof(*fed);
+ ext_hdr = (struct fip_extended_type *)(fed + 1);
+
+ while (length_to_go > 0) {
+ ext_length = ((int)ext_hdr->len) << 2;
+
+ vnic_dbg_parse(name, "Table Update parse, sub-tlv "
+ "type %d length %d address=%p\n",
+ ext_hdr->ext_type, ext_length, ext_hdr);
+
+ if (ext_length < sizeof(*ext_hdr) ||
+ ext_length > length_to_go) {
+ vnic_dbg_parse(name, "Extended length error."
+ " Length=%d\n", ext_length);
+ return -EINVAL;
+ }
+
+ switch (ext_hdr->ext_type) {
+ case ADV_EXT_TYPE(MEMBER):
+ ext_lag_membs = (struct fip_ext_type_lag_members *)ext_hdr;
+
+ extract_memb_extended(ext_lag_membs, ext_length, lagm, name);
+
+ break;
+ default:
+ if (ext_hdr->mandatory & 0x01) {
+ vnic_dbg_parse(name, "Unknown mandatory extended type %d length %d\n",
+ ext_hdr->ext_type, ext_length);
+ return -EINVAL;
+ } else {
+ vnic_dbg_parse(name, "Unknown non-mandatory extended. Skipping, type %d length %d\n",
+ ext_hdr->ext_type, ext_length);
+ ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length);
+ length_to_go -= ext_length;
+ continue;
+ }
+ }
+ ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length);
+ length_to_go -= ext_length;
+ }
+
+ return 0;
+}
+
+void extract_memb_extended(struct fip_ext_type_lag_members *ext_lag_membs,
+ int ext_length,
+ struct lag_members *lagm,
+ char *name)
+{
+ struct lag_member *m;
+ struct fip_ext_type_lag_member *lm;
+ int nmemb = 0;
+ int i;
+
+ nmemb = (ext_length - sizeof ext_lag_membs->et) / sizeof *lm;
+ if (nmemb > MAX_LAG_MEMBERS) {
+ vnic_dbg_parse(name, "recieved %d members but max supported is %d. "
+ "Using only %d\n", nmemb, MAX_LAG_MEMBERS,
+ MAX_LAG_MEMBERS);
+ nmemb = MAX_LAG_MEMBERS;
+ }
+
+ m = lagm->memb;
+ lm = ext_lag_membs->lagm;
+
+ for (i = 0; i < nmemb; ++i, ++lm, ++m) {
+ m->qpn = be32_to_cpu(lm->qpn) & 0xffffff;
+ m->sl = be16_to_cpu(lm->sl_gw_portid) >> 12;
+ m->gw_port_id = be16_to_cpu(lm->sl_gw_portid) & 0xfff;
+ m->lid = be16_to_cpu(lm->lid);
+ memcpy(m->guid, lm->guid, sizeof m->guid);
+ m->eport_state = lm->eport_state >> 6;
+ m->weight = lm->weight;
+ m->link_utilization = lm->link_utilization;
+ dump_lag_member(m);
+ }
+ lagm->num = nmemb;
+
+ vnic_dbg_parse(name, "Table Update extended parse finished OK. Num members=%d\n",
+ lagm->num);
+ return;
+}
+
+/*
+ * Parse a packet that is suspected of being a login ack packet. Returns 0
+ * for a valid login ack packet and an error code otherwise. The packet's
+ * relevant details are returned in data.
+ */
+int fip_login_parse(struct fip_discover *discover, struct fip_content *fc,
+ struct fip_login_data *data)
+{
+ u32 vfields;
+ int err = 0;
+
+ data->syndrome = be32_to_cpu(fc->fl->syndrom_ctrl_qpn) >> 24;
+ data->vnic_id = be16_to_cpu(fc->fl->vnic_id);
+ data->lid = be16_to_cpu(fc->fa.fa[0]->lid);
+ data->port_id = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) & 0xfff;
+ data->sl = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT;
+ data->qpn = be32_to_cpu(fc->fa.fa[0]->gwtype_qpn) & 0xffffff;
+ memcpy(data->guid, fc->fa.fa[0]->guid, sizeof(data->guid));
+
+ if (be16_to_cpu(fc->fl->flags_vlan) & FIP_LOGIN_VP_FLAG) {
+ data->vp = 1;
+ data->vlan = be16_to_cpu(fc->fl->flags_vlan) & 0xfff;
+ }
+ data->all_vlan_gw = !!(be16_to_cpu(fc->fl->vfields) & FIP_LOGIN_ALL_VLAN_GW_FLAG);
+
+ data->vhub_id = CREATE_VHUB_ID(cpu_to_be16(data->vlan), data->port_id);
+
+ data->ctl_qpn = be32_to_cpu(fc->fl->syndrom_ctrl_qpn) & FIP_LOGIN_CTRL_QPN_MASK;
+ vfields = be16_to_cpu(fc->fl->vfields);
+ data->n_mac_mcgid = vfields & FIP_LOGIN_DMAC_MGID_MASK;
+ data->n_rss_mgid = vfields >> 8 & 0xf;
+ /* data->rss = pkt->rss & FIP_LOGIN_RSS_MASK; it's redundant in login ack */
+ data->pkey = be16_to_cpu(fc->fp->pkey);
+ data->mtu = be16_to_cpu(fc->fl->mtu);
+
+ memcpy(data->mac, fc->fl->mac, sizeof(data->mac));
+ memcpy(data->mgid_prefix, fc->fl->eth_gid_prefix, sizeof(data->mgid_prefix));
+ memcpy(data->vnic_name, fc->fl->vnic_name, sizeof(data->vnic_name));
+ memcpy(data->vendor_id, fc->fl->vendor_id, sizeof(data->vendor_id));
+
+ if (fc->fed.num)
+ err = extract_login_extended(fc->fed.fed[0], &data->lagm, discover->name);
+
+ return err;
+}
+
+/*
+ * Check if a received packet is a FIP packet, and if so return its subtype.
+ * The FIP opcode is also returned in fip_type and can be either EOIB_FIP_OPCODE
+ * or FCOIB_FIP_OPCODE. If the packet is not a FIP packet, -EINVAL is returned.
+*/
+int fip_pkt_parse(char *buffer, int length, int *fip_type)
+{
+ struct fip_fip_header *fip_header;
+ u16 fip_opcode;
+
+ fip_header = (struct fip_fip_header *)
+ (buffer + IB_GRH_BYTES + sizeof(struct fip_eoib_ver));
+
+ fip_opcode = be16_to_cpu(fip_header->opcode);
+
+ if (fip_opcode != EOIB_FIP_OPCODE) {
+ *fip_type = 0;
+ return -EINVAL;
+ }
+
+ *fip_type = fip_opcode;
+
+ return fip_header->subcode;
+}
+
+/*
+ * The caller already knows this is a FIP packet (the GRH has already been
+ * stripped from the buffer); return its subtype.
+*/
+int fip_pkt_get_subtype_bh(char *buffer)
+{
+ struct fip_fip_header *fip_header;
+
+ fip_header = (struct fip_fip_header *)
+ (buffer + sizeof(struct fip_eoib_ver));
+
+ return fip_header->subcode;
+}
+
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _FIP_DISCOVER_PKT_H
+#define _FIP_DISCOVER_PKT_H
+
+#include <linux/kref.h>
+
+
+
+#endif /* _FIP_DISCOVER_PKT_H */
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+#include "vnic_fip_pkt.h"
+
+/*
+ * construct an mgid address based on vnic login information and the type
+ * variable (data mcast / vhub update / vhub table). The resulting mgid
+ * is returned in *mgid.
+ */
+void vhub_mgid_create(const char *mgid_prefix,
+ const char *mmac, /* mcast mac for bcast 0xFF.. */
+ u64 n_mac, /* bits to take from mmac */
+ u32 vhub_id,
+ enum vhub_mgid_type type,
+ u8 rss_hash,
+ union vhub_mgid *mgid)
+{
+ u32 vhub_id_be;
+ u64 mac_mask;
+ u64 *mac_ptr;
+ u64 one = 1; /* use a 64-bit constant so the shift below is done in 64 bits */
+
+ memcpy(mgid->mgid.mgid_prefix, mgid_prefix,
+ sizeof(mgid->mgid.mgid_prefix));
+ mgid->mgid.type = (u8)type;
+ memcpy(mgid->mgid.dmac, mmac, sizeof(mgid->mgid.dmac));
+ mac_mask = cpu_to_le64(((one << n_mac) - one) | 0xFFFF000000000000ULL);
+ mac_ptr = (u64*)(mgid->mgid.dmac);
+ *mac_ptr &= mac_mask;
+ mgid->mgid.rss_hash = rss_hash;
+ vhub_id_be = cpu_to_be32(vhub_id);
+ memcpy(mgid->mgid.vhub_id, ((u8 *) &vhub_id_be) + 1,
+ sizeof(mgid->mgid.vhub_id));
+}
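+/*
+ * Illustrative note: assuming the field order of union vhub_mgid matches
+ * the assignment order above, the resulting 16-byte MGID is laid out as
+ * mgid_prefix | type | masked dmac | rss_hash | low 24 bits of vhub_id
+ * (big endian).
+ */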
+
+/*
+ * Init the vnic's vHub table data structures, before using them
+ */
+void vhub_ctx_init(struct fip_vnic_data *vnic)
+{
+ INIT_LIST_HEAD(&vnic->vhub_table.main_list.vnic_list);
+ vnic->vhub_table.main_list.tusn = 0;
+ vnic->vhub_table.main_list.count = 0;
+ vnic->vhub_table.main_list.total_count = 0;
+
+ INIT_LIST_HEAD(&vnic->vhub_table.update_list.vnic_list);
+ vnic->vhub_table.update_list.tusn = 0;
+ vnic->vhub_table.update_list.count = 0;
+ vnic->vhub_table.update_list.total_count = 0;
+
+ vnic->vhub_table.checksum = 0;
+ vnic->vhub_table.tusn = 0;
+ vnic->vhub_table.state = VHUB_TBL_INIT;
+}
+
+/* print vhub context table */
+static void vhub_ctx_prnt(struct fip_vnic_data *vnic,
+ struct vhub_elist *vhub_list, int level)
+{
+ struct vnic_table_entry *vnic_entry;
+
+ if (!(vnic_msglvl & VNIC_DEBUG_VHUB_V))
+ return;
+
+ vnic_dbg_vhub_v(vnic->name, "Dumping context table. Count %d tusn %d\n",
+ vhub_list->count, vhub_list->tusn);
+
+ list_for_each_entry(vnic_entry, &vhub_list->vnic_list, list) {
+ vnic_dbg_vhub_v(vnic->name, "lid 0x%04x qpn 0x%06x, mac "
+ MAC_6_PRINT_FMT"\n", vnic_entry->lid,
+ vnic_entry->qpn,
+ MAC_6_PRINT_ARG(vnic_entry->mac));
+ }
+}
+
+void vhub_table_free(struct vhub_elist *elist)
+{
+ struct vnic_table_entry *del_vnic, *tmp_vnic;
+
+ list_for_each_entry_safe(del_vnic, tmp_vnic, &elist->vnic_list, list) {
+ list_del(&del_vnic->list);
+ kfree(del_vnic);
+ }
+}
+
+/*
+ * Clear and free the vnic's vHub context table data structures.
+ */
+void vhub_ctx_free(struct fip_vnic_data *vnic)
+{
+ vnic_dbg_fip_v(vnic->name, "vhub_ctx_free called\n");
+
+ vhub_table_free(&vnic->vhub_table.main_list);
+ vhub_table_free(&vnic->vhub_table.update_list);
+
+ vhub_ctx_init(vnic);
+}
+
+static struct vnic_table_entry *vhub_find_entry(struct vhub_elist *vnic_list,
+ u16 lid, u32 qpn)
+{
+ struct vnic_table_entry *tmp_vnic;
+
+ list_for_each_entry(tmp_vnic, &vnic_list->vnic_list, list) {
+ if (tmp_vnic->lid == lid && tmp_vnic->qpn == qpn)
+ return tmp_vnic;
+ }
+ return NULL;
+}
+
+/*
+ * Move vHub context entries from the update list to the main list. The
+ * update list accumulates updates received while the main table is still
+ * being built; once the table is complete, this function applies those
+ * queued entries to the main list.
+*/
+static int vhub_update_main(struct fip_vnic_data *vnic,
+ struct vhub_elist *main_list,
+ struct vhub_elist *update_list)
+{
+ struct vnic_table_entry *new_entry, *tmp_vnic, *del_vnic;
+ int first_tusn = (u32) update_list->tusn - (update_list->count - 1);
+ int extra_tusn;
+
+ /* update list is usually empty */
+ if (likely(update_list->count == 0))
+ return 0;
+
+ if (first_tusn > main_list->tusn + 1) {
+ vnic_warn(vnic->name, "Info, vhub_to_main_tbl sync main to"
+ " update list failed. update tusn %d update "
+ "first %d main %d\n",
+ update_list->tusn, first_tusn, main_list->tusn);
+ return -1;
+ }
+
+ extra_tusn = main_list->tusn + 1 - first_tusn;
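+ /* Illustrative example: main list at tusn 10, update list holding
+ * tusn 11..13 (count 3, tusn 13) gives first_tusn 11 and extra_tusn 0,
+ * so all three updates are applied below; had the main list already
+ * been at tusn 12, extra_tusn would be 2 and the two stale entries
+ * would be dropped first.
+ */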
+
+ /* go over update list and move / remove entries in it */
+ list_for_each_entry_safe(new_entry, tmp_vnic,
+ &update_list->vnic_list, list) {
+ if (extra_tusn > 0) {
+ list_del(&new_entry->list);
+ kfree(new_entry);
+ extra_tusn--;
+ } else {
+ /* remove from update list and apply to main list */
+ list_del(&new_entry->list);
+ main_list->tusn++;
+
+ /* Check valid bit, if set add to main list */
+ if (new_entry->valid) {
+ list_add_tail(&new_entry->list,
+ &main_list->vnic_list);
+ main_list->count++;
+ } else { /* remove from main list */
+ del_vnic = vhub_find_entry(main_list,
+ new_entry->lid,
+ new_entry->qpn);
+ if (del_vnic) {
+ list_del(&del_vnic->list);
+ kfree(del_vnic);
+
+ main_list->count--;
+ }
+ vnic_dbg_fip_v(vnic->name,
+ "vhub_to_main_tbl removed "
+ "vnic lid %d qpn 0x%x found %d\n",
+ (int)new_entry->lid,
+ (int)new_entry->qpn,
+ (del_vnic != 0));
+ kfree(new_entry);
+ }
+ }
+ update_list->count--;
+ }
+ return 0;
+}
+
+int fip_vnic_search_mac(struct fip_vnic_data *vnic, struct vhub_elist *elist)
+{
+ struct vnic_table_entry *vlist_entry;
+
+ list_for_each_entry(vlist_entry, &elist->vnic_list, list)
+ /* find matching entry based on mac */
+ if(!memcmp(vnic->login_data.mac, vlist_entry->mac, ETH_ALEN)) {
+ /* verify lid/qpn match */
+ if (vnic->port->attr.lid == vlist_entry->lid &&
+ vnic->qp_base_num == vlist_entry->qpn)
+ return 1;
+ else {
+ vnic_dbg_vhub(vnic->name,
+ "vnic LID=0x%x or QPN=0x%x "
+ "in vhub tbl is different than "
+ "expected LID=0x%x, QPN=0x%x\n",
+ vlist_entry->lid,
+ vlist_entry->qpn,
+ vnic->port->attr.lid,
+ vnic->qp_base_num);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * This function handles a vhub context table packet. The table will
+ * be processed only if we do not have an up to date local copy of
+ * our own. The table update supports multi-packet tables so care
+ * must be taken in building the complete table.
+ */
+int vhub_handle_tbl(struct fip_vnic_data *vnic, struct fip_content *fc,
+ u32 vhub_id, u32 tusn)
+{
+ struct context_table_entry *entry;
+ struct vnic_table_entry *new_entry;
+ struct vhub_elist *table;
+ int i, j, count_in_pkt;
+ int reason = 0;
+ int hdr_type;
+
+ /* we already have a table. disregard this one */
+ if (vnic->vhub_table.state != VHUB_TBL_INIT) {
+ vnic_dbg_vhub_v(vnic->name,
+ "vhub_handle_tbl context not in init\n");
+ return 0;
+ }
+
+ /* number of vnic entries in this packet; the trailing checksum
+ * word is handled separately when the last packet arrives
+ */
+ count_in_pkt = fc->cte.num;
+ table = &vnic->vhub_table.main_list;
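+ /* the top two bits of the vhub table header encode this packet's
+ * position in the table sequence (first / last / only / intermediate)
+ */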
+ hdr_type = be16_to_cpu(fc->fvt->hdr) >> 14;
+
+ /* first or only packet in sequence */
+ if (hdr_type == FIP_TABLE_HDR_FIRST || hdr_type == FIP_TABLE_HDR_ONLY) {
+ table->total_count = be16_to_cpu(fc->fvt->table_size);
+ table->tusn = tusn;
+ }
+ if (table->tusn != tusn) {
+ vnic_warn(vnic->name, "Info, vhub_handle_tbl got unexpected "
+ "tusn. Expect=%d received=%d\n", table->tusn, tusn);
+ if (!table->tusn)
+ goto drop_silently;
+ reason = 1;
+ goto reset_table;
+ }
+
+ if ((table->count + count_in_pkt > table->total_count) ||
+ ((table->count + count_in_pkt < table->total_count) &&
+ (hdr_type == FIP_TABLE_HDR_LAST || hdr_type == FIP_TABLE_HDR_ONLY))) {
+ vnic_dbg_vhub(vnic->name,
+ "vhub_handle_tbl got unexpected entry count. "
+ "count %d, in packet %d total expected %d\n",
+ table->count, count_in_pkt, table->total_count);
+ reason = 2;
+ goto reset_table;
+ }
+
+ entry = fc->cte.cte;
+ for (i = 0; i < count_in_pkt; ++i, ++entry) {
+ new_entry = kzalloc(sizeof *new_entry, GFP_KERNEL);
+ if (!new_entry)
+ goto reset_table;
+
+ for (j = 0; j < (sizeof *entry) >> 2; ++j)
+ vnic->vhub_table.checksum += ((u32 *) entry)[j];
+
+ new_entry->lid = be16_to_cpu(entry->lid);
+ new_entry->qpn = be32_to_cpu(entry->qpn) & 0xffffff;
+ new_entry->sl = entry->sl & 0xf;
+ new_entry->rss = !!(entry->v_rss_type & FIP_CONTEXT_RSS_FLAG);
+ new_entry->valid = !!(entry->v_rss_type & FIP_CONTEXT_V_FLAG);
+ memcpy(new_entry->mac, entry->mac, sizeof(new_entry->mac));
+
+ list_add_tail(&new_entry->list, &table->vnic_list);
+ table->count++;
+ }
+
+ /* last packet */
+ if (hdr_type == FIP_TABLE_HDR_LAST || hdr_type == FIP_TABLE_HDR_ONLY) {
+ ASSERT(table->count == table->total_count);
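+ /* the 32-bit checksum trails the last context entry; compare it
+ * against the sum of all context-table words accumulated above
+ */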
+ if (vnic->vhub_table.checksum != be32_to_cpu(*(u32 *) entry)) {
+ vnic_dbg_fip_v(vnic->name,
+ "vhub_handle_tbl checksum mismatch. "
+ "expected 0x%x, in packet 0x%x\n",
+ vnic->vhub_table.checksum,
+ be32_to_cpu(*(u32 *) entry));
+ /* TODO: request checksum match in final code */
+ /* goto reset_table; */
+ }
+
+ if (vhub_update_main(vnic, &vnic->vhub_table.main_list,
+ &vnic->vhub_table.update_list)) {
+ vnic_dbg_fip_v(vnic->name,
+ "vhub_handle_tbl moving update list to main "
+ "list failed\n");
+ reason = 3;
+ goto reset_table;
+ }
+
+ /* we are done receiving the context table */
+ vnic_dbg_fip_v(vnic->name,
+ "vhub_handle_tbl updated with %d entries\n",
+ vnic->vhub_table.main_list.count);
+ vhub_ctx_prnt(vnic, &vnic->vhub_table.main_list, 0);
+
+ /* we are not in the main vHub list, close ourselves */
+ if (!fip_vnic_search_mac(vnic, &vnic->vhub_table.main_list)) {
+ vnic_dbg_fip_p0(vnic->name, "We are not in the main table, closing ourselves\n");
+ fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+ reason = 4;
+ goto reset_table;
+ }
+
+ if (fip_vnic_tbl_done(vnic)) {
+ vnic_warn(vnic->name, "vhub_handle_tbl done failed, reseting table\n");
+ reason = 5;
+ goto reset_table;
+ }
+ }
+
+drop_silently:
+ return 0;
+
+reset_table:
+ vnic_dbg_fip_p0(vnic->name, "We are not in the main table close our selves reason=%d\n", reason);
+ vhub_ctx_free(vnic);
+ /* TODO: re-enable tx of update request, fip_update_send() */
+ return -EINVAL;
+}
+
+/*
+ * This function writes the main vhub table to the data (login) vnic.
+ * Call it when the data vnic is ready for it and after the table is
+ * up to date (i.e. the update list has been applied to the main list).
+ */
+int fip_vnic_write_tbl(struct fip_vnic_data *vnic)
+{
+ struct vnic_table_entry *vlist_entry;
+ int rc;
+
+ if (vnic->login)
+ sprintf(vnic->name, "%s", vnic->login->name);
+
+ /* update table in neigh tree */
+ list_for_each_entry(vlist_entry,
+ &vnic->vhub_table.main_list.vnic_list, list) {
+ rc = vnic_vhube_add(vnic, vlist_entry);
+ if (rc) {
+ vnic_warn(vnic->name, "vnic_vhube_add failed for mac "
+ MAC_6_PRINT_FMT" (rc %d)\n",
+ MAC_6_PRINT_ARG(vlist_entry->mac), rc);
+ vhub_ctx_free(vnic);
+ vnic_vhube_flush(vnic);
+ return -1;
+ }
+ }
+
+ vnic_dbg_fip(vnic->name, "fip_vnic_tbl_done: creation of vnic done\n");
+
+ vnic->vhub_table.tusn = vnic->vhub_table.main_list.tusn;
+ vnic->vhub_table.state = VHUB_TBL_UPDATED;
+
+ /* free table memory */
+ vhub_table_free(&vnic->vhub_table.main_list);
+ return 0;
+}
+
+/*
+ * This function handles vhub context update packets received AFTER
+ * we have a valid vhub table. For update additions the code adds an
+ * entry to the neighbour tree. For update removals we either remove
+ * the entry from the neighbour list or, if the removed entry is "this
+ * vnic", we close the vnic itself.
+*/
+static int vhub_update_updated(struct fip_vnic_data *vnic,
+ u32 vhub_id, u32 pkt_tusn,
+ struct vnic_table_entry *data)
+{
+ int curr_tusn;
+
+ curr_tusn = vnic->vhub_table.tusn;
+
+ /* if vnic is being flushed, return */
+ if (vnic->flush)
+ return 0;
+
+ /* we got a GW keep alive packet */
+ if (pkt_tusn == curr_tusn)
+ return 0;
+
+ /* if we got an out of order update clear list and request new table */
+ if (pkt_tusn != curr_tusn + 1) {
+ vnic_warn(vnic->name, "Info, vhub_update_up2date received out"
+ " of order update. Recvd=%d Expect=%d\n",
+ pkt_tusn, curr_tusn);
+ goto error_in_update;
+ }
+
+ /* new entry added */
+ if (data->valid) {
+ if (vnic_vhube_add(vnic, data)) {
+ vnic_dbg_fip(vnic->name, "vnic_vhube_add "
+ "failed to update vnic neigh tree\n");
+ goto error_in_update;
+ }
+ } else { /* remove entry */
+ /* the remove request is for this vnic :-o */
+ if (!memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) {
+ vnic_dbg_fip_p0(vnic->name, "remove this vnic "MAC_6_PRINT_FMT"\n",
+ MAC_6_PRINT_ARG(vnic->login_data.mac));
+ fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+ } else {
+ vnic_dbg_fip(vnic->name, "remove neigh vnic\n");
+ vnic_vhube_del(vnic, data->mac);
+ }
+ }
+
+ vnic->vhub_table.tusn = pkt_tusn;
+
+ return 0;
+
+error_in_update:
+ vhub_ctx_free(vnic);
+ vnic_vhube_flush(vnic);
+ fip_update_send(vnic, 1 /* new */, 0 /* logout */);
+ return -1;
+}
+
+/*
+ * This function handles vhub context update packets received BEFORE
+ * we have a valid vhub table. The function adds the update request
+ * to an update list to be processed after the entire vhub table is received
+ * and processed.
+ */
+static int vhub_update_init(struct fip_vnic_data *vnic,
+ u32 vhub_id, u32 pkt_tusn,
+ struct vnic_table_entry *data)
+{
+ struct vnic_table_entry *new_vnic;
+ struct vhub_elist *vnic_list;
+ int curr_tusn;
+
+ vnic_list = &vnic->vhub_table.update_list;
+ curr_tusn = vnic_list->tusn;
+
+ /* if we got an out of order update clear list and request new table */
+ if ((pkt_tusn < curr_tusn || pkt_tusn > curr_tusn + 1)
+ && curr_tusn != 0) {
+ vnic_warn(vnic->name, "Info, vhub_update_init received out of"
+ " order update. got %d my %d\n", pkt_tusn, curr_tusn);
+ goto error_in_update;
+ }
+
+ /* we got a GW keep alive packet */
+ if (pkt_tusn == curr_tusn) {
+ vnic_dbg_fip_v(vnic->name, "Received GW keep alive update."
+ " tusn %d\n", curr_tusn);
+ return 0;
+ }
+
+ /* got remove request for this vnic don't wait */
+ if (!(data->valid) &&
+ !memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) {
+ vhub_ctx_free(vnic);
+ vnic_dbg_fip_p0(vnic->name, "got request to close vNic vhub_update_init\n");
+ fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+ goto err;
+ }
+
+ new_vnic = kzalloc(sizeof *new_vnic, GFP_KERNEL);
+ if (!new_vnic)
+ goto error_in_update;
+
+ memcpy(new_vnic, data, sizeof *data);
+ list_add_tail(&new_vnic->list, &vnic_list->vnic_list);
+ vnic_list->count++;
+ vnic_list->tusn = pkt_tusn;
+ vhub_ctx_prnt(vnic, vnic_list, 0);
+ return 0;
+
+error_in_update:
+ vhub_ctx_free(vnic);
+ fip_update_send(vnic, 1 /* new */, 0 /* logout */);
+err:
+ return -1;
+}
+
+/*
+ * This function handles vhub context update packets received after
+ * we have a valid vhub table but before it was passed to the data rbtree.
+ * The function applies the update request to the main vhub table.
+ */
+static int vhub_update_inter(struct fip_vnic_data *vnic,
+ u32 vhub_id, u32 pkt_tusn,
+ struct vnic_table_entry *data)
+{
+ struct vnic_table_entry *new_vnic, *del_vnic;
+ struct vhub_elist *vnic_list;
+ int curr_tusn;
+
+ vnic_list = &vnic->vhub_table.main_list;
+ curr_tusn = vnic_list->tusn;
+
+ /* if we got an out of order update clear list and request new table */
+ if ((pkt_tusn < curr_tusn || pkt_tusn > curr_tusn + 1)
+ && curr_tusn != 0) {
+ vnic_warn(vnic->name, "Info, vhub_update_init received out"
+ " of order update. got %d my %d\n", pkt_tusn, curr_tusn);
+ goto error_in_update;
+ }
+
+ /* we got a GW keep alive packet */
+ if (pkt_tusn == curr_tusn) {
+ vnic_dbg_fip_v(vnic->name, "Received GW keep alive update."
+ " tusn %d\n", curr_tusn);
+ return 0;
+ }
+
+ /* we got an add request */
+ if (data->valid) {
+ new_vnic = kzalloc(sizeof *new_vnic, GFP_KERNEL);
+ if (!new_vnic)
+ goto error_in_update;
+
+ memcpy(new_vnic, data, sizeof *data);
+ list_add_tail(&new_vnic->list, &vnic_list->vnic_list);
+ vnic_list->count++;
+ vnic_list->tusn = pkt_tusn;
+ } else { /* we got a remove request */
+ /* remove is for this vnic */
+ if (!memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) {
+ vhub_ctx_free(vnic);
+ vnic_dbg_fip_p0(vnic->name, "got request to close vNic vhub_update_inter\n");
+ fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+ goto err;
+ }
+
+ /* search and delete the vnic */
+ del_vnic = vhub_find_entry(vnic_list,
+ data->lid,
+ data->qpn);
+ if (del_vnic) {
+ list_del(&del_vnic->list);
+ kfree(del_vnic);
+ vnic_list->count--;
+ }
+ vnic_dbg_fip_v(vnic->name,
+ "vhub_update_inter removed "
+ "vnic lid %d qpn 0x%x found %d\n",
+ (int)data->lid, (int)data->qpn,
+ (del_vnic != 0));
+ }
+
+ vhub_ctx_prnt(vnic, vnic_list, 0);
+ return 0;
+
+error_in_update:
+ vhub_ctx_free(vnic);
+ fip_update_send(vnic, 1 /* new */, 0 /* logout */);
+err:
+ return -1;
+}
+
+/*
+ * This function handles vhub context update packets. There are three flows
+ * in handling update packets. The first is before the main table is up
+ * to date, the second is after the table is up to date but before it was
+ * passed to the ownership of the data vnic (login struct) and the local
+ * lists were freed, and the last is when the table maintenance is done
+ * by the data vnic. This function handles all three cases.
+*/
+int vhub_handle_update(struct fip_vnic_data *vnic,
+ u32 vhub_id, u32 tusn,
+ struct vnic_table_entry *data)
+{
+ int ret = 0;
+
+ /*
+ * If we do not have an up-to-date table, queue the update on the
+ * update list. If we do have an up-to-date table, apply the update
+ * to the main table list.
+ */
+ switch (vnic->vhub_table.state) {
+ case VHUB_TBL_INIT: /* No full table yet, keep updates for later */
+ ret = vhub_update_init(vnic, vhub_id, tusn, data);
+ break;
+ case VHUB_TBL_UP2DATE: /* full table available, not yet written to data half */
+ ret = vhub_update_inter(vnic, vhub_id, tusn, data);
+ break;
+ case VHUB_TBL_UPDATED: /* full table available and written to data half */
+ ret = vhub_update_updated(vnic, vhub_id, tusn, data);
+ break;
+ default:
+ break;
+ }
+
+ return ret;
+}
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+#include "vnic_fip.h"
+
+MODULE_AUTHOR(DRV_AUTH);
+MODULE_DESCRIPTION(DRV_DESC);
+MODULE_LICENSE(DRV_LIC);
+MODULE_VERSION(DRV_VER);
+
+static int __init mlx4_ib_init(void)
+{
+ vnic_dbg_func("module_init");
+
+ if (vnic_param_check())
+ goto err;
+ if (vnic_mcast_init())
+ goto err;
+ if (vnic_ports_init())
+ goto free_mcast;
+
+ return 0;
+
+free_mcast:
+ vnic_mcast_cleanup();
+err:
+ return -EINVAL;
+}
+
+static void __exit mlx4_ib_cleanup(void)
+{
+ vnic_dbg_func("module_exit");
+ vnic_ports_cleanup();
+ vnic_dbg_mark();
+ vnic_mcast_cleanup();
+}
+
+module_init(mlx4_ib_init);
+module_exit(mlx4_ib_cleanup);
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+
+struct workqueue_struct *mcast_wq;
+struct ib_sa_client vnic_sa_client;
+
+//static void vnic_mcast_detach_task(struct work_struct *work);
+static void vnic_mcast_attach_task(struct work_struct *work);
+static void vnic_port_mcast_leave_task(struct work_struct *work);
+static void vnic_port_mcast_join_task(struct work_struct *work);
+
+static void vnic_port_mcast_release(struct vnic_port_mcast *mcaste);
+static struct vnic_port_mcast *vnic_port_mcast_update(struct vnic_mcast
+ *_mcaste);
+
+/*
+ * A helper function to prevent code duplication. Fills vnic_mcast struct with
+ * common values.
+ *
+ * in: mcaste - mcaste to fill
+ * in: gw_id - used to create the MGID address
+ * in: mac - used to create the MGID address
+ * in: create - value of create field in mcaste
+ */
+void __vnic_mcaste_fill(struct vnic_login *login, struct vnic_mcast *mcaste,
+ u16 gw_id, const u8 *mac, u8 rss_hash, int create)
+{
+ union vhub_mgid mgid;
+
+ memcpy(mcaste->mac, mac, ETH_ALEN);
+ vhub_mgid_create(login->mgid_prefix, mcaste->mac,
+ login->n_mac_mcgid,
+ CREATE_VHUB_ID(login->vid, gw_id),
+ VHUB_MGID_DATA, rss_hash, &mgid);
+ memcpy(&mcaste->gid, mgid.ib_gid.raw, GID_LEN);
+ memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN);
+ mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+ mcaste->backoff_factor = 1;
+ mcaste->retry = VNIC_MCAST_MAX_RETRY;
+ mcaste->blocking = 0;
+ mcaste->qkey = login->qkey;
+ mcaste->pkey = login->pkey;
+ mcaste->create = create;
+ mcaste->qp = login->qp_res[0].qp; /* mcast/bcast is only on first QP */
+ mcaste->join_state = 1;
+}
+
+/*
+ * A helper function to prevent code duplication. Receives a multicast mac
+ * and a gw_id and attaches it (join + attach). The function also receives
+ * a default_mcaste (used for the MGID over default MLID hack) and a user list.
+ * Returns 0 on success and non-zero on failure.
+ *
+ * in: mmac - used to create the MGID address
+ * in: default_mcaste - mcaste entry of the default MGID. Can be NULL
+ * in: user_list - a user list to hang the new mcaste on. Can be NULL
+ * in: gw_id - used to create the MGID address
+ */
+int _vnic_mcast_attach_mgid(struct vnic_login *login,
+ char *mmac,
+ struct vnic_mcast *default_mcaste,
+ void *private_data,
+ u16 gw_id)
+{
+ struct vnic_mcast *mcaste;
+ int rc = 0;
+ int rss_hash;
+
+ mcaste = vnic_mcast_alloc(login->port, NULL, NULL);
+ if (IS_ERR(mcaste)) {
+ vnic_warn(login->name, "vnic_mcast_alloc for "MAC_6_PRINT_FMT" failed\n",
+ MAC_6_PRINT_ARG(mmac));
+ vnic_dbg_mark();
+ return -ENOMEM;
+ }
+ memcpy(mcaste->mac, mmac, ETH_ALEN);
+
+ /* if the mcast mac has a mcast IP embedded in it: */
+ rss_hash = 0;
+ if ((mcaste->mac[0] & 0xf0) == 0xe0 &&
+ mcaste->mac[4] == 0x00 &&
+ mcaste->mac[5] == 0x00) {
+ /* calculate the mcast rss_hash from the IP octets */
+ rss_hash = mcaste->mac[0] ^ mcaste->mac[1] ^
+ mcaste->mac[2] ^ mcaste->mac[3];
+ /* and build the corresponding mcast MAC using the IEEE
+ * multicast OUI 01:00:5e
+ */
+ mcaste->mac[5] = mcaste->mac[3];
+ mcaste->mac[4] = mcaste->mac[2];
+ mcaste->mac[3] = mcaste->mac[1] & 0x7f;
+ mcaste->mac[2] = 0x5e;
+ mcaste->mac[1] = 0x00;
+ mcaste->mac[0] = 0x01;
+ }
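+ /* illustrative example: a mac field carrying IP 224.1.2.3 as
+ * e0:01:02:03:00:00 yields rss_hash 0xe0 ^ 0x01 ^ 0x02 ^ 0x03 and the
+ * standard IPv4 multicast MAC 01:00:5e:01:02:03
+ */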
+
+ __vnic_mcaste_fill(login, mcaste, gw_id, mcaste->mac, rss_hash, 0);
+ mcaste->priv_data = private_data;
+
+ if (default_mcaste)
+ memcpy(&mcaste->port_gid, &default_mcaste->gid, GID_LEN);
+
+ rc = vnic_mcast_add(&login->mcast_tree, mcaste); /* add holds mcast_rb_lock */
+ if (!rc) {
+ rc = vnic_mcast_attach(&login->mcast_tree, mcaste);
+ ASSERT(!rc);
+ } else if (rc == -EEXIST) {
+ /* the MGID may already be in the tree when n_mac_mcgid > 0 (ok) */
+ vnic_dbg_mcast(login->name, "vnic_mcast_add for "
+ MAC_6_PRINT_FMT" already exist, rc %d\n",
+ MAC_6_PRINT_ARG(mcaste->mac), rc);
+ vnic_mcast_dealloc(mcaste);
+ rc = 0;
+ } else {
+ vnic_warn(login->name, "vnic_mcast_add for "
+ MAC_6_PRINT_FMT" failed, rc %d\n",
+ MAC_6_PRINT_ARG(mcaste->mac), rc);
+ vnic_mcast_dealloc(mcaste);
+ }
+ return rc;
+}
+
+struct vnic_mcast *vnic_mcast_alloc(struct vnic_port *port,
+ unsigned long *req_attach,
+ unsigned long *cur_attached)
+{
+ struct vnic_mcast *mcaste;
+
+ mcaste = kzalloc(sizeof *mcaste, GFP_ATOMIC);
+ if (!mcaste)
+ return ERR_PTR(-ENOMEM);
+ /* set mcaste fields */
+ init_completion(&mcaste->attach_complete);
+ INIT_DELAYED_WORK(&mcaste->attach_task, vnic_mcast_attach_task);
+ spin_lock_init(&mcaste->lock);
+ mcaste->port = port;
+ mcaste->req_attach = req_attach;
+ mcaste->cur_attached = cur_attached;
+
+ return mcaste;
+}
+
+void vnic_mcast_dealloc(struct vnic_mcast *mcaste)
+{
+ struct vnic_port *port;
+
+ ASSERT(mcaste);
+ port = mcaste->port;
+ vnic_dbg_mcast_vv(port->name, "dealloc vnic_mcast: MAC "MAC_6_PRINT_FMT
+ " GID "VNIC_GID_FMT"\n",
+ MAC_6_PRINT_ARG(mcaste->mac),
+ VNIC_GID_ARG(mcaste->gid));
+ kfree(mcaste);
+}
+
+/*
+ * This function grabs the mcast_tree->mcast_rb_lock
+*/
+int vnic_mcast_add(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste)
+{
+ struct rb_node **n = &mcast_tree->mcast_tree.rb_node, *pn = NULL;
+ struct vnic_mcast *mcaste_t;
+ unsigned long flags;
+ int rc;
+
+ spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags);
+ while (*n) {
+ pn = *n;
+ mcaste_t = rb_entry(pn, struct vnic_mcast, rb_node);
+ rc = memcmp(mcaste->gid.raw, mcaste_t->gid.raw, GID_LEN);
+ if (rc < 0)
+ n = &pn->rb_left;
+ else if (rc > 0)
+ n = &pn->rb_right;
+ else {
+ rc = -EEXIST;
+ goto out;
+ }
+ }
+
+ rb_link_node(&mcaste->rb_node, pn, n);
+ rb_insert_color(&mcaste->rb_node, &mcast_tree->mcast_tree);
+
+ rc = 0;
+
+out:
+ vnic_dbg_mcast_v(mcaste->port->name,
+ "added (rc %d) vnic_mcast: MAC "MAC_6_PRINT_FMT
+ " GID "VNIC_GID_FMT"\n", rc,
+ MAC_6_PRINT_ARG(mcaste->mac),
+ VNIC_GID_ARG(mcaste->gid));
+
+ spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags);
+ return rc;
+}
+
+/*
+ * The caller must hold mcast_tree->mcast_rb_lock before calling
+ */
+void vnic_mcast_del(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste)
+{
+ rb_erase(&mcaste->rb_node, &mcast_tree->mcast_tree);
+}
+
+/*
+ * The caller must hold mcast_tree->mcast_rb_lock before calling
+*/
+struct vnic_mcast *vnic_mcast_search(struct mcast_root *mcast_tree,
+ union ib_gid *gid)
+{
+ struct rb_node *n = mcast_tree->mcast_tree.rb_node;
+ struct vnic_mcast *mcaste_t;
+ int rc;
+
+ while (n) {
+ mcaste_t = rb_entry(n, struct vnic_mcast, rb_node);
+ rc = memcmp(gid->raw, mcaste_t->gid.raw, GID_LEN);
+ if (rc < 0)
+ n = n->rb_left;
+ else if (rc > 0)
+ n = n->rb_right;
+ else {
+ vnic_dbg_mcast_v(mcaste_t->port->name,
+ "found: MAC "MAC_6_PRINT_FMT" GID "
+ VNIC_GID_FMT"\n",
+ MAC_6_PRINT_ARG(mcaste_t->mac),
+ VNIC_GID_ARG(mcaste_t->gid));
+ goto out;
+ }
+ }
+ mcaste_t = ERR_PTR(-ENODATA);
+
+out:
+ return mcaste_t;
+}
+
+static void vnic_mcast_detach_ll(struct vnic_mcast *mcaste, struct mcast_root *mcast_tree)
+{
+ struct vnic_port *port = mcaste->port;
+ struct ib_ah *tmp_ih;
+ unsigned long flags;
+ int rc;
+
+ vnic_dbg_mcast_v(port->name,
+ "mcaste->attached %d for mac "MAC_6_PRINT_FMT"\n",
+ test_bit(MCAST_ATTACHED, &mcaste->state),
+ MAC_6_PRINT_ARG(mcaste->mac));
+
+ spin_lock_irqsave(&mcaste->lock, flags);
+ if (!test_and_clear_bit(MCAST_ATTACHED, &mcaste->state)) {
+ spin_unlock_irqrestore(&mcaste->lock, flags);
+ return;
+ }
+
+ tmp_ih = mcaste->ah;
+ mcaste->ah = NULL;
+ spin_unlock_irqrestore(&mcaste->lock, flags);
+
+ /* callback */
+ if (mcaste->detach_cb) {
+ vnic_dbg_mcast(port->name, "calling detach_cb\n");
+ mcaste->detach_cb(mcaste, mcaste->detach_cb_ctx);
+ }
+
+ if (!mcaste->sender_only)
+ rc = ib_detach_mcast(mcaste->qp, &mcaste->gid, port->attr.lid);
+ else
+ rc = 0;
+
+ ASSERT(tmp_ih);
+ rc = ib_destroy_ah(tmp_ih);
+ if (rc)
+ vnic_warn(port->name,
+ "ib_destroy_ah failed (rc %d) for mcaste mac "
+ MAC_6_PRINT_FMT"\n", rc,
+ MAC_6_PRINT_ARG(mcaste->mac));
+ vnic_dbg_mcast(port->name, "GID "VNIC_GID_FMT" detached!\n",
+ VNIC_GID_ARG(mcaste->gid));
+}
+
+int vnic_mcast_detach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste)
+{
+ struct vnic_port *port = mcaste->port;
+ unsigned long flags;
+
+ /* detach inline; first make sure no attach task is left pending */
+ vnic_dbg_mcast_v(port->name, "vnic_mcast_detach (backoff %lu)\n",
+ mcaste->backoff);
+
+ /* cancel any pending/queued tasks. We cannot use the sync variant
+ * under the spinlock because it might hang; we need the spinlock
+ * here to ensure the requeueing is atomic
+ */
+ vnic_dbg_mcast_v(port->name, "cancel attach_task\n");
+ spin_lock_irqsave(&mcaste->lock, flags);
+ clear_bit(MCAST_ATTACH_RUNNING, &mcaste->state);
+ spin_unlock_irqrestore(&mcaste->lock, flags);
+#ifndef _BP_WORK_SYNC
+ cancel_delayed_work_sync(&mcaste->attach_task);
+#else
+ cancel_delayed_work(&mcaste->attach_task);
+ flush_workqueue(mcast_wq);
+#endif
+ vnic_mcast_detach_ll(mcaste, mcast_tree);
+
+ if (mcaste->port_mcaste)
+ vnic_port_mcast_release(mcaste->port_mcaste);
+
+ return 0;
+}
+
+static void vnic_mcast_attach_task(struct work_struct *work)
+{
+ struct ib_ah_attr av;
+ struct vnic_mcast *mcaste =
+ container_of(work, struct vnic_mcast, attach_task.work);
+ struct vnic_port *port = mcaste->port;
+ unsigned long flags;
+ int rc;
+ u16 mlid;
+
+ if ((++mcaste->attach_task_cnt > mcaste->retry && mcaste->retry) ||
+ !test_bit(MCAST_ATTACH_RUNNING, &mcaste->state)) {
+ vnic_dbg_mcast_v(port->name,
+ "attach_task stopped, tried %ld times\n",
+ mcaste->retry);
+ goto out;
+ }
+
+ /* update backoff time */
+ mcaste->backoff = min(mcaste->backoff * mcaste->backoff_factor,
+ msecs_to_jiffies(VNIC_MCAST_BACKOFF_MAX_MSEC));
+
+ if (!test_bit(MCAST_JOINED, &mcaste->port_mcaste->state)) {
+ vnic_dbg_mcast_v(port->name, "joined %d, retry %ld from %ld\n",
+ test_bit(MCAST_JOINED, &mcaste->port_mcaste->state),
+ mcaste->attach_task_cnt, mcaste->retry);
+ goto retry;
+ }
+
+ /* attach QP */
+ ASSERT(mcaste);
+ ASSERT(mcaste->port_mcaste);
+ ASSERT(mcaste->port_mcaste->sa_mcast);
+ mlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid);
+ vnic_dbg_mcast(port->name, "QPN 0x%06x attaching MGID "VNIC_GID_FMT
+ " LID 0x%04x\n", mcaste->qp->qp_num,
+ VNIC_GID_ARG(mcaste->gid), mlid);
+ if (!mcaste->sender_only)
+ rc = ib_attach_mcast(mcaste->qp, &mcaste->gid, mlid);
+ else
+ rc = 0;
+
+ if (rc) {
+ int attach_count = atomic_read(&mcaste->port_mcaste->ref_cnt);
+
+ vnic_err(port->name, "failed to attach (rc %d) to multicast "
+ "group, MGID "VNIC_GID_FMT"\n",
+ rc, VNIC_GID_ARG(mcaste->gid));
+
+ if (port->dev->attr.max_mcast_qp_attach <= attach_count) {
+ vnic_err(port->name, "Attach failed. Too many vnics are on the same"
+ " vhub on this port. vnics count=%d, max=%d\n",
+ attach_count,
+ port->dev->attr.max_mcast_qp_attach);
+ }
+
+ goto retry;
+ } else {
+ /* create mcast ah */
+ memset(&av, 0, sizeof(av));
+ av.dlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid);
+ av.port_num = mcaste->port->num;
+ av.ah_flags = IB_AH_GRH;
+ av.static_rate = mcaste->port_mcaste->rec.rate;
+ av.sl = mcaste->port_mcaste->rec.sl;
+ memcpy(&av.grh.dgid, mcaste->gid.raw, GID_LEN);
+ spin_lock_irqsave(&mcaste->lock, flags);
+ mcaste->ah = ib_create_ah(port->pd, &av);
+ if (IS_ERR(mcaste->ah)) {
+ rc = (int)PTR_ERR(mcaste->ah);
+ mcaste->ah = NULL;
+ vnic_err(port->name,
+ "ib_create_ah failed (rc %d)\n", rc);
+ spin_unlock_irqrestore(&mcaste->lock, flags);
+ /* for such a failure, no need to retry */
+ goto out;
+ }
+ vnic_dbg_mcast(mcaste->port->name, "created mcast ah for %p\n", mcaste);
+
+ /* callback */
+ set_bit(MCAST_ATTACHED, &mcaste->state);
+ spin_unlock_irqrestore(&mcaste->lock, flags);
+
+ if (mcaste->cur_attached)
+ set_bit(mcaste->attach_bit_nr, mcaste->cur_attached);
+ vnic_dbg_mcast(mcaste->port->name,
+ "attached GID "VNIC_GID_FMT"\n",
+ VNIC_GID_ARG(mcaste->gid));
+ if (mcaste->attach_cb) {
+ vnic_dbg_mcast(mcaste->port->name,
+ "calling attach_cb\n");
+ mcaste->attach_cb(mcaste, mcaste->attach_cb_ctx);
+ }
+ }
+
+out:
+ mcaste->attach_task_cnt = 0; /* for next time */
+ mcaste->backoff = mcaste->backoff_init;
+ clear_bit(MCAST_ATTACH_RUNNING, &mcaste->state);
+ complete(&mcaste->attach_complete);
+ return;
+
+retry:
+ spin_lock_irqsave(&mcaste->lock, flags);
+ if (test_bit(MCAST_ATTACH_RUNNING, &mcaste->state)) {
+ /* calls vnic_mcast_attach_task() */
+ queue_delayed_work(mcast_wq, &mcaste->attach_task, mcaste->backoff);
+ }
+ spin_unlock_irqrestore(&mcaste->lock, flags);
+}
+
+int vnic_mcast_attach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste)
+{
+ struct vnic_port_mcast *pmcaste;
+ struct vnic_port *port = mcaste->port;
+ int rc = 0;
+ ASSERT(mcaste);
+
+ mcaste->backoff_init = mcaste->backoff;
+
+ pmcaste = vnic_port_mcast_update(mcaste);
+ if (IS_ERR(pmcaste)) {
+ vnic_err(port->name, "vnic_port_mcast_update failed GID "
+ VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid));
+ rc = PTR_ERR(pmcaste);
+ goto out;
+ }
+
+ mcaste->port_mcaste = pmcaste;
+
+ set_bit(MCAST_ATTACH_RUNNING, &mcaste->state);
+
+ /* must be a task, to sample the joined flag */
+ vnic_dbg_mcast_v(port->name, "queue delayed task (%lu) "
+ "vnic_mcast_join_task\n", mcaste->backoff);
+ init_completion(&mcaste->attach_complete);
+ /* calls vnic_mcast_attach_task() */
+ queue_delayed_work(mcast_wq, &mcaste->attach_task, 0);
+ if (mcaste->blocking) {
+ wait_for_completion(&mcaste->attach_complete);
+ if (test_bit(MCAST_ATTACHED, &mcaste->state))
+ goto out;
+ vnic_mcast_detach(mcast_tree, mcaste);
+ rc = 1;
+ }
+
+out:
+ return rc;
+}
+
+#if 0
+static int vnic_mcast_attach_all(struct mcast_root *mcast_tree)
+{
+ int fails = 0;
+ struct vnic_mcast *mcaste;
+ struct rb_node *n;
+
+ n = rb_first(&mcast_tree->mcast_tree);
+ while (n) {
+ mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+ n = rb_next(n);
+ /* async call */
+ if (vnic_mcast_attach(mcast_tree, mcaste))
+ fails++;
+ }
+
+ return fails;
+}
+#endif
+
+int vnic_mcast_del_all(struct mcast_root *mcast_tree)
+{
+ struct rb_node *n;
+ struct vnic_mcast *mcaste, *mcaste_t;
+ unsigned long flags;
+ int fails = 0;
+ LIST_HEAD(local_list);
+
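+ /* detaching may sleep (it flushes the attach work), so unlink all
+ * entries under the lock first and then detach/free them from a
+ * private list outside of it
+ */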
+ spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags);
+ n = rb_first(&mcast_tree->mcast_tree);
+ while (n) {
+ mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+ vnic_mcast_del(mcast_tree, mcaste);
+ list_add_tail(&mcaste->list, &local_list);
+ n = rb_first(&mcast_tree->mcast_tree);
+ }
+ spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags);
+
+ list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) {
+ list_del(&mcaste->list);
+ vnic_mcast_detach(mcast_tree, mcaste);
+ vnic_mcast_dealloc(mcaste);
+ }
+
+ return fails;
+}
+
+int vnic_mcast_del_user(struct mcast_root *mcast_tree, void *owner)
+{
+ struct rb_node *n;
+ struct vnic_mcast *mcaste, *mcaste_t;
+ unsigned long flags;
+ int fails = 0;
+ LIST_HEAD(local_list);
+
+ spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags);
+ n = rb_first(&mcast_tree->mcast_tree);
+ while (n) {
+ mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+ n = rb_next(&mcaste->rb_node);
+ if (mcaste->priv_data == owner) {
+ list_add_tail(&mcaste->list, &local_list);
+ vnic_mcast_del(mcast_tree, mcaste);
+ }
+ }
+ spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags);
+
+ list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) {
+ list_del(&mcaste->list);
+ vnic_mcast_detach(mcast_tree, mcaste);
+ vnic_mcast_dealloc(mcaste);
+ }
+
+ return fails;
+}
+
+/* PORT MCAST FUNCTIONS */
+static struct vnic_port_mcast *vnic_port_mcast_alloc(struct vnic_port *port,
+ union ib_gid *gid)
+{
+ struct vnic_port_mcast *mcaste;
+
+ mcaste = kzalloc(sizeof *mcaste, GFP_ATOMIC);
+ if (!mcaste)
+ return ERR_PTR(-ENOMEM);
+
+ mcaste->gid = *gid;
+ mcaste->port = port;
+ init_completion(&mcaste->leave_complete);
+ atomic_set(&mcaste->ref_cnt, 1);
+ INIT_DELAYED_WORK(&mcaste->join_task, vnic_port_mcast_join_task);
+ INIT_WORK(&mcaste->leave_task, vnic_port_mcast_leave_task);
+ mcaste->sa_mcast = ERR_PTR(-EINVAL);
+ memset(&mcaste->rec,0,sizeof(mcaste->rec));
+ vnic_dbg_mcast_v(mcaste->port->name, "allocated port_mcast GID "
+ VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid));
+ spin_lock_init(&mcaste->lock);
+ set_bit(MCAST_JOIN_RUNNING, &mcaste->state);
+
+ return mcaste;
+}
+
+static void vnic_port_mcast_dealloc(struct vnic_port_mcast *mcaste)
+{
+ ASSERT(mcaste);
+ vnic_dbg_mcast_v(NULL, "dealloc port_mcast GID "
+ VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid));
+ kfree(mcaste);
+}
+
+/*
+ * This function accesses the port mcast tree. Please make sure
+ * to call it only while holding the port mcast_rb_lock
+*/
+static int vnic_port_mcast_add(struct vnic_port_mcast *mcaste)
+{
+ struct rb_node **n = &mcaste->port->mcast_tree.mcast_tree.rb_node;
+ struct rb_node *pn = NULL;
+ struct vnic_port_mcast *mcaste_t;
+ int rc;
+
+ while (*n) {
+ pn = *n;
+ mcaste_t = rb_entry(pn, struct vnic_port_mcast, rb_node);
+ rc = memcmp(mcaste->gid.raw, mcaste_t->gid.raw, GID_LEN);
+ if (rc < 0)
+ n = &pn->rb_left;
+ else if (rc > 0)
+ n = &pn->rb_right;
+ else {
+ rc = -EEXIST;
+ goto out;
+ }
+ }
+
+ rb_link_node(&mcaste->rb_node, pn, n);
+ rb_insert_color(&mcaste->rb_node, &mcaste->port->mcast_tree.mcast_tree);
+ rc = 0;
+
+out:
+ vnic_dbg_mcast_v(mcaste->port->name, "added (rc %d) port_mcast GID "
+ VNIC_GID_FMT"\n", rc, VNIC_GID_ARG(mcaste->gid));
+ return rc;
+}
+
+/*
+ * This function accesses the port mcast tree. Please make sure
+ * to call it only while holding the port mcast_rb_lock.
+ */
+static void vnic_port_mcast_del(struct vnic_port_mcast *mcaste)
+{
+ ASSERT(mcaste);
+ vnic_dbg_mcast_v(mcaste->port->name, "del port_mcast GID "
+ VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid));
+ rb_erase(&mcaste->rb_node, &mcaste->port->mcast_tree.mcast_tree);
+}
+
+/*
+ * This function accesses the port mcast tree. Please make sure
+ * to call it only while holding the port mcast_rb_lock.
+ */
+struct vnic_port_mcast *vnic_port_mcast_search(struct vnic_port *port,
+ union ib_gid *gid)
+{
+ struct rb_node *n = port->mcast_tree.mcast_tree.rb_node;
+ struct vnic_port_mcast *mcaste_t;
+ int rc;
+
+ while (n) {
+ mcaste_t = rb_entry(n, struct vnic_port_mcast, rb_node);
+ rc = memcmp(gid->raw, mcaste_t->gid.raw, GID_LEN);
+ if (rc < 0)
+ n = n->rb_left;
+ else if (rc > 0)
+ n = n->rb_right;
+ else {
+ vnic_dbg_mcast_v(mcaste_t->port->name,
+ "found: GID "VNIC_GID_FMT"\n",
+ VNIC_GID_ARG(mcaste_t->gid));
+ goto out;
+ }
+ }
+ mcaste_t = ERR_PTR(-ENODATA);
+
+out:
+ return mcaste_t;
+}
+/*
+static void vnic_port_mcast_leave_task(struct work_struct *work)
+{
+ struct vnic_port_mcast *mcaste =
+ container_of(work, struct vnic_port_mcast, leave_task.work);
+
+ vnic_dbg_mcast_v(mcaste->port->name, "leave GID "VNIC_GID_FMT"\n",
+ VNIC_GID_ARG(mcaste->gid));
+
+ if (!IS_ERR(mcaste->sa_mcast) && test_bit(MCAST_JOINED, &mcaste->port_mcaste->state))
+ vnic_dbg_mcast(mcaste->port->name,
+ "mcast left: GID "VNIC_GID_FMT"\n",
+ VNIC_GID_ARG(mcaste->gid));
+ if (!IS_ERR(mcaste->sa_mcast))
+ ib_sa_free_multicast(mcaste->sa_mcast);
+ mcaste->sa_mcast = ERR_PTR(-EINVAL);
+ clear_bit(MCAST_JOINED, &mcaste->port_mcaste->state);
+}
+*/
+
+static int vnic_port_mcast_leave(struct vnic_port_mcast *mcaste,
+ unsigned long backoff)
+{
+ unsigned long flags;
+
+ ASSERT(mcaste);
+ vnic_dbg_mcast(NULL, "leave mcast group, cancel join task "
+ "(backoff %lu)\n", backoff);
+
+ /* Cancel any pending/queued join task. The _sync cancel cannot
+ * be called under the spinlock because it might hang; the lock is
+ * only needed to make clearing MCAST_JOIN_RUNNING atomic with
+ * respect to any requeueing done by the join paths.
+ */
+ spin_lock_irqsave(&mcaste->lock, flags);
+ clear_bit(MCAST_JOIN_RUNNING, &mcaste->state);
+ spin_unlock_irqrestore(&mcaste->lock, flags);
+#ifndef _BP_WORK_SYNC
+ cancel_delayed_work_sync(&mcaste->join_task);
+#else
+ cancel_delayed_work(&mcaste->join_task);
+ if (delayed_work_pending(&mcaste->join_task)) {
+ return -EBUSY;
+ }
+#endif
+
+ if (test_and_clear_bit(MCAST_JOIN_STARTED, &mcaste->state)
+ && !IS_ERR(mcaste->sa_mcast)) {
+ ib_sa_free_multicast(mcaste->sa_mcast);
+ mcaste->sa_mcast = ERR_PTR(-EINVAL);
+ }
+
+ return 0;
+}
+
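+/*
+ * SA join completion handler. A -ENETRESET status typically means the SA
+ * client is being flushed, so the handler just bails out. It always returns
+ * 0 so that the SA layer does not release sa_mcast on its own; the driver
+ * frees it explicitly via ib_sa_free_multicast().
+ */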
+static int vnic_port_mcast_join_comp(int status, struct ib_sa_multicast *sa_mcast)
+{
+ struct vnic_port_mcast *mcaste = sa_mcast->context;
+ unsigned long flags;
+
+ vnic_dbg_mcast(mcaste->port->name, "join completion for GID "
+ VNIC_GID_FMT" (status %d)\n",
+ VNIC_GID_ARG(mcaste->gid), status);
+
+ if (status == -ENETRESET)
+ return 0;
+
+ if (status)
+ goto retry;
+
+ /* same as mcaste->rec = mcaste->sa_mcast->rec; */
+ mcaste->rec = sa_mcast->rec;
+
+ set_bit(MCAST_JOINED, &mcaste->state);
+ vnic_dbg_mcast(mcaste->port->name, "joined GID "VNIC_GID_FMT"\n",
+ VNIC_GID_ARG(mcaste->gid));
+#if 0
+ vnic_dbg_mcast_v(mcaste->port->name, "mcast record dump:\n");
+ vnic_dbg_mcast_v(mcaste->port->name, "mgid "VNIC_GID_FMT"\n",
+ VNIC_GID_ARG(rec->mgid));
+ vnic_dbg_mcast_v(mcaste->port->name, "port_gid "VNIC_GID_FMT"\n",
+ VNIC_GID_ARG(rec->port_gid));
+ vnic_dbg_mcast_v(mcaste->port->name, "pkey 0x%x\n", rec->pkey);
+ vnic_dbg_mcast_v(mcaste->port->name, "qkey 0x%x\n", rec->qkey);
+ vnic_dbg_mcast_v(mcaste->port->name, "mtu_slct 0x%x\n",
+ rec->mtu_selector);
+ vnic_dbg_mcast_v(mcaste->port->name, "mtu 0x%x\n", rec->mtu);
+ vnic_dbg_mcast_v(mcaste->port->name, "rate_slct 0x%x\n",
+ rec->rate_selector);
+ vnic_dbg_mcast_v(mcaste->port->name, "rate 0x%x\n", rec->rate);
+ vnic_dbg_mcast_v(mcaste->port->name, "sl 0x%x\n", rec->sl);
+ vnic_dbg_mcast_v(mcaste->port->name, "flow_label 0x%x\n",
+ rec->flow_label);
+ vnic_dbg_mcast_v(mcaste->port->name, "hop_limit 0x%x\n",
+ rec->hop_limit);
+#endif
+
+ goto out;
+retry:
+ /* calls vnic_port_mcast_join_task() */
+ spin_lock_irqsave(&mcaste->lock, flags);
+ if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state))
+ queue_delayed_work(mcast_wq, &mcaste->join_task, mcaste->backoff);
+ spin_unlock_irqrestore(&mcaste->lock, flags);
+
+out:
+ /* always return 0 so the SA layer does not free sa_mcast; we call ib_sa_free_multicast ourselves */
+ return 0;
+}
+
+static void vnic_port_mcast_join_task(struct work_struct *work)
+{
+ struct vnic_port_mcast *mcaste =
+ container_of(work, struct vnic_port_mcast, join_task.work);
+ struct ib_sa_mcmember_rec rec = {
+ .join_state = mcaste->join_state
+ };
+ int rc;
+ ib_sa_comp_mask comp_mask;
+ unsigned long flags;
+
+ if (++mcaste->join_task_cnt > mcaste->retry && mcaste->retry) {
+ vnic_dbg_mcast(mcaste->port->name,
+ "join_task stopped, tried %ld times\n",
+ mcaste->retry);
+ goto out;
+ }
+
+ /* update backoff time */
+ mcaste->backoff = min(mcaste->backoff * mcaste->backoff_factor,
+ msecs_to_jiffies(VNIC_MCAST_BACKOFF_MAX_MSEC));
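+ /* illustrative example: with backoff_init = 1s and backoff_factor = 2
+ * (both values come from the caller), the retry delay doubles on every
+ * attempt until it is capped at VNIC_MCAST_BACKOFF_MAX_MSEC.
+ */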
+
+ rec.mgid.global = mcaste->gid.global;
+ rec.port_gid.global = mcaste->port->gid.global;
+ rec.pkey = cpu_to_be16(mcaste->pkey);
+
+ comp_mask =
+ IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID |
+ /*IB_SA_MCMEMBER_REC_PKEY | */
+ IB_SA_MCMEMBER_REC_JOIN_STATE;
+
+ if (mcaste->create) {
+ comp_mask |=
+ IB_SA_MCMEMBER_REC_QKEY |
+ IB_SA_MCMEMBER_REC_MTU_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU |
+ IB_SA_MCMEMBER_REC_TRAFFIC_CLASS |
+ IB_SA_MCMEMBER_REC_RATE_SELECTOR |
+ IB_SA_MCMEMBER_REC_RATE |
+ IB_SA_MCMEMBER_REC_SL |
+ IB_SA_MCMEMBER_REC_FLOW_LABEL |
+ IB_SA_MCMEMBER_REC_HOP_LIMIT |
+ IB_SA_MCMEMBER_REC_PKEY;
+
+ rec.qkey = cpu_to_be32(mcaste->qkey);
+ rec.mtu_selector = IB_SA_EQ;
+ rec.rate_selector = IB_SA_EQ;
+ /* when no_bxm is set, use min values to let everybody in */
+ rec.mtu = no_bxm ? IB_MTU_2048 : mcaste->port->attr.max_mtu;
+ rec.rate = no_bxm ? IB_RATE_10_GBPS : mcaste->port->rate_enum;
+ rec.sl = 0;
+ rec.flow_label = 0;
+ rec.hop_limit = 0;
+ }
+
+ vnic_dbg_mcast(mcaste->port->name, "joining MGID "VNIC_GID_FMT
+ " create %d, comp_mask %lu\n",
+ VNIC_GID_ARG(mcaste->gid), mcaste->create, (unsigned long)comp_mask);
+
+ if (!IS_ERR(mcaste->sa_mcast))
+ ib_sa_free_multicast(mcaste->sa_mcast);
+
+ mcaste->sa_mcast =
+ ib_sa_join_multicast(&vnic_sa_client, mcaste->port->dev->ca,
+ mcaste->port->num, &rec, comp_mask,
+ GFP_KERNEL, vnic_port_mcast_join_comp, mcaste);
+ set_bit(MCAST_JOIN_STARTED, &mcaste->state);
+
+ if (IS_ERR(mcaste->sa_mcast)) {
+ rc = PTR_ERR(mcaste->sa_mcast);
+ vnic_warn(mcaste->port->name,
+ "ib_sa_join_multicast failed, status %d\n", rc);
+ /* calls vnic_port_mcast_join_task() */
+ spin_lock_irqsave(&mcaste->lock, flags);
+ if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state))
+ queue_delayed_work(mcast_wq, &mcaste->join_task, mcaste->backoff);
+ spin_unlock_irqrestore(&mcaste->lock, flags);
+ }
+
+ return;
+
+out:
+ mcaste->join_task_cnt = 0; /* for next time */
+ mcaste->backoff = mcaste->backoff_init;
+ return;
+}
+
+static int vnic_port_mcast_join(struct vnic_port_mcast *mcaste)
+{
+ unsigned long flags;
+
+ ASSERT(mcaste);
+ vnic_dbg_mcast_v(mcaste->port->name, "queue delayed task (%lu) "
+ "vnic_port_mcast_join_task\n", mcaste->backoff);
+
+ /* calls vnic_port_mcast_join_task() */
+ spin_lock_irqsave(&mcaste->lock, flags);
+ if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state))
+ queue_delayed_work(mcast_wq, &mcaste->join_task, 0);
+ spin_unlock_irqrestore(&mcaste->lock, flags);
+
+ return 0;
+}
+
+#if 0
+static int vnic_port_mcast_join_all(struct vnic_port *port)
+{
+ int fails = 0;
+ struct vnic_port_mcast *mcaste;
+ struct rb_node *n;
+
+ n = rb_first(&port->mcast_tree.mcast_tree);
+ while (n) {
+ mcaste = rb_entry(n, struct vnic_port_mcast, rb_node);
+ n = rb_next(n);
+ if (vnic_port_mcast_join(mcaste))
+ fails++;
+ }
+
+ return fails;
+}
+#endif
+
+static void vnic_port_mcast_leave_task(struct work_struct *work)
+{
+ struct vnic_port_mcast *mcaste =
+ container_of(work, struct vnic_port_mcast, leave_task);
+
+#ifndef _BP_WORK_SYNC
+ vnic_port_mcast_leave(mcaste, 0);
+#else
+ if (vnic_port_mcast_leave(mcaste, 0)) {
+ queue_work(mcast_wq, &mcaste->leave_task);
+ return;
+ }
+#endif
+ vnic_port_mcast_dealloc(mcaste);
+}
+
+static void vnic_port_mcast_release(struct vnic_port_mcast *mcaste)
+{
+ unsigned long flags;
+
+ struct vnic_port *port = mcaste->port;
+
+ vnic_dbg_mcast(port->name, "update mcaste->ref_cnt %d -> %d\n",
+ atomic_read(&mcaste->ref_cnt),
+ atomic_read(&mcaste->ref_cnt) - 1);
+
+ spin_lock_irqsave(&port->mcast_tree.mcast_rb_lock, flags);
+ if (atomic_dec_and_test(&mcaste->ref_cnt)) {
+ vnic_port_mcast_del(mcaste);
+ spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags);
+
+ /* Don't wait for the leave to complete; just queue it and
+ * move on. Calls vnic_port_mcast_leave_task().
+ */
+ queue_work(mcast_wq, &mcaste->leave_task);
+ } else
+ spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags);
+}
+
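+/*
+ * Look up the per-port mcast entry for _mcaste's group; take a reference if
+ * it already exists, otherwise allocate, insert and kick off the SA join.
+ * The rb-lock is held across the search-or-insert so concurrent callers
+ * cannot create duplicate entries for the same GID.
+ */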
+static struct vnic_port_mcast *vnic_port_mcast_update(struct vnic_mcast *_mcaste)
+{
+ union ib_gid *gid = &_mcaste->port_gid;
+ u32 qkey = _mcaste->qkey;
+ u16 pkey = _mcaste->pkey;
+ struct vnic_port *port = _mcaste->port;
+ struct vnic_port_mcast *mcaste;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->mcast_tree.mcast_rb_lock, flags);
+ mcaste = vnic_port_mcast_search(port, gid);
+ /* entry found */
+ if (PTR_ERR(mcaste) != -ENODATA) {
+ ASSERT(!IS_ERR(mcaste));
+ atomic_inc(&mcaste->ref_cnt);
+ spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags);
+ vnic_dbg_mcast(mcaste->port->name,
+ "found, add GID "VNIC_GID_FMT" \n",
+ VNIC_GID_ARG(*gid));
+ vnic_dbg_mcast(mcaste->port->name,
+ "update mcaste->ref_cnt %d -> %d\n",
+ atomic_read(&mcaste->ref_cnt),
+ atomic_read(&mcaste->ref_cnt) + 1);
+ } else { /* not found, add it */
+ mcaste = vnic_port_mcast_alloc(port, gid);
+ if (IS_ERR(mcaste)) {
+ spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags);
+ return mcaste;
+ }
+ vnic_dbg_mcast(mcaste->port->name,
+ "not found, add GID "VNIC_GID_FMT" \n",
+ VNIC_GID_ARG(*gid));
+ vnic_dbg_mcast(mcaste->port->name,
+ "update mcaste->ref_cnt %d -> %d\n",
+ atomic_read(&mcaste->ref_cnt),
+ atomic_read(&mcaste->ref_cnt) + 1);
+ mcaste->qkey = qkey;
+ mcaste->pkey = pkey;
+ mcaste->backoff_init = _mcaste->backoff_init;
+ mcaste->backoff = _mcaste->backoff;
+ mcaste->backoff_factor = _mcaste->backoff_factor;
+ mcaste->retry = _mcaste->retry;
+ mcaste->create = _mcaste->create;
+ mcaste->join_state = _mcaste->join_state;
+ vnic_port_mcast_add(mcaste);
+ spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags);
+
+ vnic_port_mcast_join(mcaste);
+ vnic_dbg_mcast(mcaste->port->name, "added\n");
+ }
+
+ return mcaste;
+}
+
+#if 0
+void vnic_port_mcast_del_all(struct vnic_port *port)
+{
+
+ struct rb_node *n;
+ struct vnic_port_mcast *mcaste, *mcaste_t;
+ LIST_HEAD(local_list);
+
+ ASSERT(port);
+
+ n = rb_first(&port->mcast_tree.mcast_tree);
+ while (n) {
+ mcaste = rb_entry(n, struct vnic_port_mcast, rb_node);
+ list_add_tail(&mcaste->list, &local_list);
+ n = rb_next(&mcaste->rb_node);
+ }
+
+ list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) {
+ list_del(&mcaste->list);
+ vnic_warn(port->name, "shouldn't find gid "VNIC_GID_FMT"\n",
+ VNIC_GID_ARG(mcaste->gid));
+ vnic_port_mcast_release(mcaste);
+ }
+
+ return;
+}
+#endif
+
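+/*
+ * Detach/attach helpers for a whole mcast tree. The detach variant moves
+ * every entry onto reattach_list and leaves the groups; the attach variant
+ * re-adds and re-joins them. They are presumably used as a pair to force a
+ * rejoin of all groups, e.g. after an SM change or port event.
+ */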
+void vnic_tree_mcast_detach(struct mcast_root *mcast_tree)
+{
+ struct vnic_mcast *mcaste, *mcaste_t;
+ struct rb_node *n;
+ unsigned long flags;
+ INIT_LIST_HEAD(&mcast_tree->reattach_list);
+
+ spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags);
+ n = rb_first(&mcast_tree->mcast_tree);
+ while (n) {
+ mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+ list_add_tail(&mcaste->list, &mcast_tree->reattach_list);
+ n = rb_next(&mcaste->rb_node);
+ vnic_mcast_del(mcast_tree, mcaste);
+ mcaste->attach_task_cnt = 0;
+ }
+ spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags);
+
+ list_for_each_entry_safe(mcaste, mcaste_t, &mcast_tree->reattach_list, list) {
+ vnic_mcast_detach(mcast_tree, mcaste);
+ }
+
+ return;
+}
+
+void vnic_tree_mcast_attach(struct mcast_root *mcast_tree)
+{
+ struct vnic_mcast *mcaste, *mcaste_t;
+ int rc;
+
+ /* The add function grabs the mcast_rb_lock no need to take it */
+ list_for_each_entry_safe(mcaste, mcaste_t, &mcast_tree->reattach_list, list) {
+ rc = vnic_mcast_add(mcast_tree, mcaste);
+ ASSERT(!rc);
+ rc = vnic_mcast_attach(mcast_tree, mcaste);
+ ASSERT(!rc);
+ list_del(&mcaste->list);
+ }
+
+ return;
+}
+
+int vnic_mcast_init(void)
+{
+ ib_sa_register_client(&vnic_sa_client);
+
+ mcast_wq = create_singlethread_workqueue("mcast_wq");
+ if (!mcast_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void vnic_mcast_cleanup(void)
+{
+ ASSERT(mcast_wq);
+ vnic_dbg_mark();
+ flush_workqueue(mcast_wq);
+ vnic_dbg_mark();
+ destroy_workqueue(mcast_wq);
+ vnic_dbg_mark();
+ ib_sa_unregister_client(&vnic_sa_client);
+
+ return;
+}
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+
+u32 vnic_lro_num = VNIC_MAX_LRO_DESCS;
+u32 vnic_net_admin = 1;
+u32 vnic_child_max = VNIC_CHILD_MAX;
+u32 vnic_tx_rings_num = 0;
+u32 vnic_rx_rings_num = 0;
+u32 vnic_tx_rings_len = VNIC_TX_QUEUE_LEN;
+u32 vnic_rx_rings_len = VNIC_RX_QUEUE_LEN;
+u32 vnic_mgid_data_type = 0;
+u32 vnic_encap_headroom = 1;
+u32 vnic_tx_polling = 1;
+u32 vnic_rx_linear = 0;
+u32 vnic_change_mac = 0;
+u32 vnic_learn_mac_enabled = 1;
+u32 vnic_synd_backlog = 4;
+u32 vnic_eport_state_enforce = 0;
+u32 vnic_src_mac_enforce = 0;
+u32 vnic_inline_tshold = 0;
+u32 vnic_discovery_pkeys[MAX_NUM_PKEYS_DISCOVERY];
+u32 vnic_discovery_pkeys_count = MAX_NUM_PKEYS_DISCOVERY;
+u32 vnic_sa_query = 0;
+
+/* these params are enabled only in debug mode */
+u32 no_bxm = 0;
+u32 vnic_msglvl = 0x80000000;
+u32 vnic_max_tx_outs = VNIC_MAX_TX_OUTS;
+u32 vnic_linear_small_pkt = 1;
+u32 vnic_mcast_create = 0;
+u32 vnic_napi_weight = VNIC_MAX_RX_CQE;
+
+module_param_named(tx_rings_num, vnic_tx_rings_num, int, 0444);
+MODULE_PARM_DESC(tx_rings_num, "Number of TX rings, use 0 for #cpus [default 0, max 32]");
+
+module_param_named(tx_rings_len, vnic_tx_rings_len, int, 0444);
+MODULE_PARM_DESC(tx_rings_len, "Length of TX rings, must be power of two [default 1024, max 8K]");
+
+module_param_named(rx_rings_num, vnic_rx_rings_num, int, 0444);
+MODULE_PARM_DESC(rx_rings_num, "Number of RX rings, use 0 for #cpus [default 0, max 32]");
+
+module_param_named(rx_rings_len, vnic_rx_rings_len, int, 0444);
+MODULE_PARM_DESC(rx_rings_len, "Length of RX rings, must be power of two [default 2048, max 8K]");
+
+module_param_named(eport_state_enforce, vnic_eport_state_enforce, int, 0644);
+MODULE_PARM_DESC(eport_state_enforce, "Bring interface up only when corresponding EPort is up [default 0]");
+
+module_param_named(src_mac_enforce, vnic_src_mac_enforce, int, 0644);
+MODULE_PARM_DESC(src_mac_enforce, "Enforce source MAC address [default 0]");
+
+module_param_named(vnic_net_admin, vnic_net_admin, int, 0644);
+MODULE_PARM_DESC(vnic_net_admin, "Enable Network Administration mode [default 1]");
+
+module_param_named(vnic_child_max, vnic_child_max, int, 0644);
+MODULE_PARM_DESC(vnic_child_max, "Max child vNics (per interface), use 0 to disable [default 128]");
+
+module_param_named(mgid_data_type, vnic_mgid_data_type, int, 0444);
+MODULE_PARM_DESC(mgid_data_type, "Set MGID data type for multicast traffic [default 0, max 1]");
+
+module_param_named(encap_headroom, vnic_encap_headroom, int, 0444);
+MODULE_PARM_DESC(encap_headroom, "Use SKB headroom for protocol encapsulation [default 1]");
+
+module_param_named(inline_tshold, vnic_inline_tshold, int, 0444);
+MODULE_PARM_DESC(inline_tshold, "Packets smaller than this threshold (in bytes) use inline & blue flame [default 0, max 512]");
+
+module_param_named(tx_polling, vnic_tx_polling, int, 0444);
+MODULE_PARM_DESC(tx_polling, "Enable TX polling mode [default 1]");
+
+module_param_named(rx_linear, vnic_rx_linear, int, 0444);
+MODULE_PARM_DESC(rx_linear, "Enable linear RX buffers [default 0]");
+
+module_param_named(change_mac, vnic_change_mac, int, 0444);
+MODULE_PARM_DESC(change_mac, "Enable MAC change using child vNics [default 0]");
+
+module_param_named(learn_tx_mac, vnic_learn_mac_enabled, int, 0644);
+MODULE_PARM_DESC(learn_tx_mac, "Enable TX MAC learning in promisc mode [default 1]");
+
+module_param_named(synd_backlog, vnic_synd_backlog, int, 0644);
+MODULE_PARM_DESC(synd_backlog, "Syndrome error reporting backlog limit [default 4]");
+
+module_param_array_named(discovery_pkeys, vnic_discovery_pkeys, int, &vnic_discovery_pkeys_count, 0444);
+MODULE_PARM_DESC(discovery_pkeys, "Vector of PKeys to be used for discovery [default 0xffff, max vector length 24]");
+
+module_param_named(sa_query, vnic_sa_query, int, 0644);
+MODULE_PARM_DESC(sa_query, "Query SA for each IB address and ignore gateway assigned SLs [default 0]");
+
+
+#if !(defined(NETIF_F_GRO) && !defined(_BP_NO_GRO))
+module_param_named(lro_num, vnic_lro_num, int, 0444);
+MODULE_PARM_DESC(lro_num, "Number of LRO sessions per ring, use 0 to disable [default 32, max 32]");
+#endif
+
+#ifdef CONFIG_MLX4_VNIC_DEBUG
+module_param_named(no_bxm, no_bxm, int, 0444);
+MODULE_PARM_DESC(no_bxm, "Enable NO BXM mode [default 0]");
+
+module_param_named(msglvl, vnic_msglvl, uint, 0644);
+MODULE_PARM_DESC(msglvl, "Debug message level [default 0]");
+
+module_param_named(max_tx_outs, vnic_max_tx_outs, int, 0644);
+MODULE_PARM_DESC(max_tx_outs, "Max outstanding TX packets [default 16]");
+
+module_param_named(linear_small_pkt, vnic_linear_small_pkt, int, 0644);
+MODULE_PARM_DESC(linear_small_pkt, "Use linear buffer for small packets [default 1]");
+
+module_param_named(mcast_create, vnic_mcast_create, int, 0444);
+MODULE_PARM_DESC(mcast_create, "Create multicast group during join request [default 0]");
+
+module_param_named(napi_weight, vnic_napi_weight, int, 0444);
+MODULE_PARM_DESC(napi_weight, "NAPI weight [default 32]");
+#endif /* CONFIG_MLX4_VNIC_DEBUG */
+
+int vnic_param_check(void)
+{
+#ifdef CONFIG_MLX4_VNIC_DEBUG
+ vnic_info("VNIC_DEBUG flag is set\n");
+#endif
+
+ vnic_mcast_create = vnic_mcast_create ? 1 : 0;
+ vnic_mcast_create = no_bxm ? 1 : vnic_mcast_create;
+ no_bxm = no_bxm ? 1 : 0;
+ vnic_sa_query = vnic_sa_query ? 1 : 0;
+
+ vnic_mgid_data_type = max_t(u32, vnic_mgid_data_type, 0);
+ vnic_mgid_data_type = min_t(u32, vnic_mgid_data_type, 1);
+
+ vnic_rx_rings_num = max_t(u32, vnic_rx_rings_num, 0);
+ vnic_rx_rings_num = min_t(u32, vnic_rx_rings_num, VNIC_MAX_NUM_CPUS);
+
+ vnic_tx_rings_num = max_t(u32, vnic_tx_rings_num, 0);
+ vnic_tx_rings_num = min_t(u32, vnic_tx_rings_num, VNIC_MAX_NUM_CPUS);
+
+ vnic_tx_rings_len = rounddown_pow_of_two(vnic_tx_rings_len);
+ vnic_tx_rings_len = max_t(u32, vnic_tx_rings_len, VNIC_TX_QUEUE_LEN_MIN);
+ vnic_tx_rings_len = min_t(u32, vnic_tx_rings_len, VNIC_TX_QUEUE_LEN_MAX);
+
+ vnic_rx_rings_len = rounddown_pow_of_two(vnic_rx_rings_len);
+ vnic_rx_rings_len = max_t(u32, vnic_rx_rings_len, VNIC_RX_QUEUE_LEN_MIN);
+ vnic_rx_rings_len = min_t(u32, vnic_rx_rings_len, VNIC_RX_QUEUE_LEN_MAX);
+
+ vnic_max_tx_outs = min_t(u32, vnic_tx_rings_len, vnic_max_tx_outs);
+
+ vnic_napi_weight = min_t(u32, vnic_napi_weight, VNIC_MAX_NUM_CPUS);
+
+ vnic_lro_num = max_t(u32, vnic_lro_num, 0);
+ vnic_lro_num = min_t(u32, vnic_lro_num, VNIC_MAX_LRO_DESCS);
+
+ vnic_inline_tshold = max_t(u32, vnic_inline_tshold, 0);
+ vnic_inline_tshold = min_t(u32, vnic_inline_tshold, VNIC_MAX_INLINE_TSHOLD);
+
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+/* globals */
+struct workqueue_struct *port_wq;
+struct workqueue_struct *login_wq;
+
+/* functions */
+static void vnic_port_event(struct ib_event_handler *handler,
+ struct ib_event *record)
+{
+ struct vnic_port *port =
+ container_of(handler, struct vnic_port, event_handler);
+
+ if (record->element.port_num != port->num)
+ return;
+
+ vnic_info("Received event 0x%x (device %s port %d)\n",
+ record->event, record->device->name,
+ record->element.port_num);
+
+ switch (record->event) {
+ case IB_EVENT_SM_CHANGE:
+ case IB_EVENT_CLIENT_REREGISTER:
+ /* calls vnic_port_event_task_light() */
+ queue_delayed_work(fip_wq, &port->event_task_light, msecs_to_jiffies(VNIC_SM_HEADSTART));
+ break;
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_PORT_ACTIVE:
+ /* calls vnic_port_event_task() */
+ queue_delayed_work(fip_wq, &port->event_task, msecs_to_jiffies(VNIC_SM_HEADSTART));
+ break;
+ case IB_EVENT_PKEY_CHANGE:
+ case IB_EVENT_LID_CHANGE:
+ /* calls port_fip_discover_restart() */
+ if (no_bxm)
+ queue_delayed_work(fip_wq, &port->event_task, 0);
+ else
+ queue_delayed_work(port_wq, &port->discover_restart_task, msecs_to_jiffies(VNIC_SM_HEADSTART));
+ break;
+ case IB_EVENT_SRQ_ERR:
+ case IB_EVENT_SRQ_LIMIT_REACHED:
+ case IB_EVENT_QP_LAST_WQE_REACHED:
+ case IB_EVENT_DEVICE_FATAL:
+ default:
+ vnic_warn(port->name, "event 0x%x unhandled\n", record->event);
+ break;
+ }
+
+}
+
+static inline u8 vnic_mcast_rate_enum(struct vnic_port *port, int rate)
+{
+ u8 ret;
+
+ switch (rate) {
+ case 10:
+ ret = IB_RATE_10_GBPS;
+ break;
+ case 20:
+ ret = IB_RATE_20_GBPS;
+ break;
+ case 40:
+ ret = IB_RATE_40_GBPS;
+ break;
+ case 80:
+ ret = IB_RATE_80_GBPS;
+ break;
+ default:
+ ret = IB_RATE_10_GBPS;
+ }
+ return ret;
+}
+
+int vnic_port_query(struct vnic_port *port)
+{
+ if (ib_query_gid(port->dev->ca, port->num, 0, &port->gid)) {
+ vnic_err(port->name, "ib_query_gid failed\n");
+ return -EINVAL;
+ }
+
+ if (ib_query_port(port->dev->ca, port->num, &port->attr)) {
+ vnic_err(port->name, "ib_query_port failed\n");
+ return -EINVAL;
+ }
+
+ port->max_mtu_enum = ib_mtu_enum_to_int(port->attr.max_mtu);
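+ /* e.g. a QDR (active_speed 4) 4x link yields 4 * 4 * 25 / 10 = 40 Gb/s;
+ * the enums encode lanes of 2.5 Gb/s each.
+ */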
+ port->rate = ((int)port->attr.active_speed *
+ ib_width_enum_to_int(port->attr.active_width) * 25) / 10;
+ port->rate_enum = vnic_mcast_rate_enum(port, port->rate);
+
+ if (ib_query_pkey(port->dev->ca, port->num, port->pkey_index,
+ &port->pkey)) {
+ vnic_err(port->name, "ib_query_pkey failed for index %d\n",
+ port->pkey_index);
+ return -EINVAL;
+ }
+ port->pkey |= 0x8000; /* set the full-membership bit */
+
+ return 0;
+}
+
+void vnic_port_event_task(struct work_struct *work)
+{
+ struct vnic_port *port =
+ container_of(work, struct vnic_port, event_task.work);
+ struct fip_discover *discover;
+
+ /* refresh port attr, TODO: check what else needs to be refreshed */
+ vnic_dbg_mark();
+ mutex_lock(&port->mlock);
+ if (vnic_port_query(port))
+ vnic_warn(port->name, "vnic_port_query failed\n");
+ mutex_unlock(&port->mlock);
+
+ /* refresh login mcasts */
+ vnic_login_refresh_mcasts(port);
+
+ list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+ /* refresh FIP mcasts */
+ if ((!no_bxm) && (discover->state != FIP_DISCOVER_OFF))
+ fip_refresh_mcasts(discover);
+ }
+}
+
+void vnic_port_event_task_light(struct work_struct *work)
+{
+ struct vnic_port *port =
+ container_of(work, struct vnic_port, event_task_light.work);
+ unsigned long flags, mc_flags;
+ struct fip_discover *discover;
+ struct rb_node *node;
+ struct vnic_port_mcast *mcaste;
+ struct mcast_root *mcast_tree = &port->mcast_tree;
+ struct vnic_login *login;
+ vnic_dbg_mark();
+ mutex_lock(&port->mlock);
+
+ if (vnic_port_query(port))
+ vnic_warn(port->name, "vnic_port_query failed\n");
+
+ spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags);
+ for (node = rb_first(&mcast_tree->mcast_tree); node; node = rb_next(node)) {
+ mcaste = rb_entry(node, struct vnic_port_mcast, rb_node);
+ clear_bit(MCAST_JOINED, &mcaste->state);
+ set_bit(MCAST_JOIN_RUNNING, &mcaste->state);
+ vnic_dbg_mcast(mcaste->port->name, "Rejoin GID="VNIC_GID_FMT"\n",
+ VNIC_GID_ARG(mcaste->gid));
+ spin_lock_irqsave(&mcaste->lock, mc_flags);
+ queue_delayed_work(mcast_wq, &mcaste->join_task, 0);
+ spin_unlock_irqrestore(&mcaste->lock, mc_flags);
+ }
+
+ spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags);
+
+ vnic_dbg_mark();
+ if (vnic_sa_query)
+ list_for_each_entry(login, &port->login_list, list) {
+ /* take the tx lock to make sure no delete function is called at the time */
+ netif_tx_lock_bh(login->dev);
+ vnic_neigh_invalidate(login);
+ netif_tx_unlock_bh(login->dev);
+ }
+
+ mutex_unlock(&port->mlock);
+
+ list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+ if ((!no_bxm) && (discover->state != FIP_DISCOVER_OFF))
+ fip_refresh_mcasts(discover);
+ }
+}
+
+struct vnic_port *vnic_port_alloc(struct vnic_ib_dev *vnic_dev, u8 num)
+{
+ struct vnic_port *port;
+ int def_rings_num;
+ int max_num_cpus;
+
+ port = kzalloc(sizeof *port, GFP_KERNEL);
+ if (!port)
+ return ERR_PTR(-ENOMEM);
+
+ /* pre-init fields */
+ port->num = num;
+ port->dev = vnic_dev;
+
+ max_num_cpus = min((int)num_online_cpus(), VNIC_MAX_NUM_CPUS);
+ def_rings_num = min(vnic_dev->ca->num_comp_vectors, max_num_cpus);
+ port->rx_rings_num = vnic_rx_rings_num ? vnic_rx_rings_num : def_rings_num;
+ port->tx_rings_num = vnic_tx_rings_num ? vnic_tx_rings_num : def_rings_num;
+
+ sprintf(port->name, "%s:%d", port->dev->ca->name, port->num);
+ INIT_LIST_HEAD(&port->login_list);
+ INIT_LIST_HEAD(&port->fip.discover_list);
+ INIT_DELAYED_WORK(&port->event_task, vnic_port_event_task);
+ INIT_DELAYED_WORK(&port->event_task_light, vnic_port_event_task_light);
+ INIT_DELAYED_WORK(&port->discover_restart_task, port_fip_discover_restart);
+ INIT_IB_EVENT_HANDLER(&port->event_handler, vnic_dev->ca,
+ vnic_port_event);
+ mutex_init(&port->mlock);
+ mutex_init(&port->start_stop_lock);
+ vnic_mcast_root_init(&port->mcast_tree);
+ atomic_set(&port->vnic_child_ids, 0);
+
+ port->pkey_index = 0; /* used by fip qps, TBD */
+
+ if (ib_register_event_handler(&port->event_handler)) {
+ vnic_err(port->name, "ib_register_event_handler failed\n");
+ goto err;
+ }
+
+ vnic_dbg_mark();
+ mutex_lock(&port->mlock);
+ if (vnic_port_query(port)) {
+ vnic_err(port->name, "vnic_port_query failed\n");
+ mutex_unlock(&port->mlock);
+ if (ib_unregister_event_handler(&port->event_handler))
+ vnic_err(port->name, "ib_unregister_event_handler failed!\n");
+ goto err;
+ }
+ mutex_unlock(&port->mlock);
+
+ return port;
+err:
+ kfree(port);
+ return ERR_PTR(-EINVAL);
+}
+
+int vnic_port_init(struct vnic_port *port)
+{
+ return vnic_port_ib_init(port);
+}
+
+void vnic_port_cleanup(struct vnic_port *port)
+{
+ /* should be empty list */
+ vnic_port_ib_cleanup(port);
+ return;
+}
+
+static void vnic_ib_dev_add_one(struct ib_device *device);
+static void vnic_ib_dev_remove_one(struct ib_device *device);
+static struct ib_client vnic_init_client = {
+ .name = DRV_NAME,
+ .add = vnic_ib_dev_add_one,
+ .remove = vnic_ib_dev_remove_one,
+};
+
+static void vnic_ib_dev_add_one(struct ib_device *device)
+{
+ struct vnic_port *ib_port;
+ struct vnic_ib_dev *ib_dev;
+ int s, e, p, rc;
+
+ vnic_dbg(NULL, "ib_dev %s\n", device->name);
+
+ if (memcmp(device->name, "mlx4", 4))
+ return;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ s = 1;
+ e = device->phys_port_cnt;
+
+ /* alloc ib device */
+ ib_dev = kzalloc(sizeof *ib_dev, GFP_KERNEL);
+ if (!ib_dev)
+ return;
+
+ /* init ib dev */
+ mutex_init(&ib_dev->mlock);
+ ib_dev->ca = device;
+ mutex_lock(&ib_dev->mlock);
+ /* TODO: remove mdev once all mlx4 caps are standard */
+ ib_dev->mdev = to_mdev(device);
+ ASSERT(ib_dev->ca);
+ sprintf(ib_dev->name, "%s", device->name);
+ if (ib_query_device(device, &ib_dev->attr)) {
+ vnic_err(ib_dev->name, "ib_query_device failed on %s\n",
+ device->name);
+ goto abort;
+ }
+
+ VNIC_FW_STR(ib_dev->attr.fw_ver, ib_dev->fw_ver_str);
+ INIT_LIST_HEAD(&ib_dev->port_list);
+ vnic_dbg_mark();
+ for (p = s; p <= e; ++p) {
+ /* skip non IB link layers */
+ if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
+ continue;
+
+ /* alloc IB port */
+ ib_port = vnic_port_alloc(ib_dev, p);
+ if (IS_ERR(ib_port)) {
+ vnic_err(ib_dev->name,
+ "vnic_port_alloc failed %d from %d\n", p, e);
+ continue;
+ }
+ /* init IB port */
+ rc = vnic_port_init(ib_port);
+ if (rc) {
+ vnic_err(ib_port->name,
+ "vnic_port_init failed, rc %d\n", rc);
+ if (ib_unregister_event_handler(&ib_port->event_handler))
+ vnic_err(ib_port->name,
+ "ib_unregister_event_handler failed!\n");
+ kfree(ib_port);
+ continue;
+ }
+ if (no_bxm) {
+ rc = vnic_port_data_init(ib_port);
+ if (rc)
+ vnic_err(ib_port->name,
+ "vnic_port_data_init failed, rc %d\n", rc);
+ } else {
+ rc = vnic_port_fip_init(ib_port);
+ if (rc)
+ vnic_err(ib_port->name,
+ "vnic_port_fip_init failed, rc %d\n", rc);
+ else {
+ rc = port_fs_init(ib_port);
+ if (rc)
+ vnic_warn(ib_port->name, "port_fs_init sysfs:"
+ "entry creation failed, %d\n", rc);
+ }
+ }
+ if (rc) {
+ if (ib_unregister_event_handler(&ib_port->event_handler))
+ vnic_err(ib_port->name,
+ "ib_unregister_event_handler failed!\n");
+ vnic_port_cleanup(ib_port);
+ kfree(ib_port);
+ continue;
+
+ }
+ vnic_dbg_mark();
+ mutex_lock(&ib_port->start_stop_lock);
+ list_add_tail(&ib_port->list, &ib_dev->port_list);
+ mutex_unlock(&ib_port->start_stop_lock);
+ }
+
+ /* set device ctx */
+ ib_set_client_data(device, &vnic_init_client, ib_dev);
+ mutex_unlock(&ib_dev->mlock);
+ return;
+
+abort:
+ mutex_unlock(&ib_dev->mlock);
+ kfree(ib_dev);
+}
+
+static void vnic_ib_dev_remove_one(struct ib_device *device)
+{
+ struct vnic_port *port, *port_t;
+ struct vnic_ib_dev *ib_dev =
+ ib_get_client_data(device, &vnic_init_client);
+
+ vnic_dbg(NULL, "ib_dev %s\n", device->name);
+
+ if (!ib_dev)
+ return;
+
+ vnic_dbg_mark();
+ mutex_lock(&ib_dev->mlock);
+ list_for_each_entry_safe(port, port_t, &ib_dev->port_list, list) {
+ vnic_dbg(port->name, "port %d\n", port->num);
+ if (ib_unregister_event_handler(&port->event_handler))
+ vnic_err(port->name, "ib_unregister_event_handler failed!\n");
+ /* make sure we don't have any more pending events */
+#ifndef _BP_WORK_SYNC
+ cancel_delayed_work_sync(&port->event_task_light);
+ cancel_delayed_work_sync(&port->event_task);
+ cancel_delayed_work_sync(&port->discover_restart_task);
+#else
+ cancel_delayed_work(&port->event_task_light);
+ cancel_delayed_work(&port->event_task);
+ cancel_delayed_work(&port->discover_restart_task);
+ flush_workqueue(port_wq);
+ flush_workqueue(fip_wq);
+#endif
+ /* remove sysfs entries related to FIP
+ * we want to do this outside the lock
+ */
+ port_fs_exit(port);
+
+ /* cleanup any pending vnics */
+ vnic_dbg_mark();
+ mutex_lock(&port->start_stop_lock);
+ list_del(&port->list);
+ if (no_bxm)
+ vnic_port_data_cleanup(port);
+ else {
+ vnic_port_fip_cleanup(port, 0);
+ }
+ mutex_unlock(&port->start_stop_lock);
+ vnic_port_cleanup(port);
+ kfree(port);
+ }
+ mutex_unlock(&ib_dev->mlock);
+
+ kfree(ib_dev);
+}
+
+int vnic_ports_init(void)
+{
+ int rc;
+
+ /* create global wq */
+ port_wq = create_singlethread_workqueue("port_wq");
+ if (!port_wq) {
+ vnic_err(NULL, "create_singlethread_workqueue failed for %s\n",
+ "port_wq");
+ return -EINVAL;
+ }
+
+ login_wq = create_singlethread_workqueue("login_wq");
+ if (!login_wq) {
+ vnic_err(NULL, "create_singlethread_workqueue failed for %s\n",
+ "login_wq");
+ goto free_wq0;
+ }
+
+ fip_wq = create_singlethread_workqueue("fip");
+ if (!fip_wq) {
+ vnic_err(NULL, "create_singlethread_workqueue failed for %s\n",
+ "fip");
+ goto free_wq1;
+ }
+
+ /* calls vnic_ib_dev_add_one() */
+ rc = ib_register_client(&vnic_init_client);
+ if (rc) {
+ vnic_err(NULL, "ib_register_client failed %d\n", rc);
+ goto free_wq2;
+ }
+
+ return 0;
+
+free_wq2:
+ destroy_workqueue(fip_wq);
+free_wq1:
+ destroy_workqueue(login_wq);
+free_wq0:
+ destroy_workqueue(port_wq);
+
+ return -EINVAL;
+}
+
+void vnic_ports_cleanup(void)
+{
+ vnic_dbg(NULL, "calling ib_unregister_client\n");
+ /* calls vnic_ib_dev_remove_one() */
+ ib_unregister_client(&vnic_init_client);
+ vnic_dbg(NULL, "calling destroy_workqueue\n");
+ destroy_workqueue(fip_wq);
+ destroy_workqueue(login_wq);
+ destroy_workqueue(port_wq);
+ vnic_dbg(NULL, "vnic_data_cleanup done\n");
+}
--- /dev/null
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/log2.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_addr.h>
+
+#include <linux/mlx4/qp.h>
+#include <linux/io.h>
+
+#include "vnic.h"
+
+/* compare with drivers/infiniband/hw/mlx4/qp.c */
+#define mlx4_ib_dbg(format, arg...) vnic_dbg(NULL, format, ## arg)
+
+enum {
+ MLX4_IB_ACK_REQ_FREQ = 8,
+};
+
+enum {
+ MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83,
+ MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f,
+ MLX4_IB_LINK_TYPE_IB = 0,
+ MLX4_IB_LINK_TYPE_ETH = 1,
+};
+
+enum {
+ /*
+ * Largest possible UD header: send with GRH and immediate data.
+ * 4 bytes added to accommodate for eth header instead of lrh
+ */
+ MLX4_IB_UD_HEADER_SIZE = 76,
+ MLX4_IB_MAX_RAW_ETY_HDR_SIZE = 12
+};
+
+enum {
+ MLX4_IBOE_ETHERTYPE = 0x8915
+};
+
+struct mlx4_ib_sqp {
+ struct mlx4_ib_qp qp;
+ int pkey_index;
+ u32 qkey;
+ u32 send_psn;
+ struct ib_ud_header ud_header;
+ u8 header_buf[MLX4_IB_UD_HEADER_SIZE];
+};
+
+enum {
+ MLX4_IB_MIN_SQ_STRIDE = 6
+};
+
+static const __be32 mlx4_ib_opcode[] = {
+ [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND),
+ [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO),
+ [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+ [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+ [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+ [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+ [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+ [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
+ [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
+ [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR),
+ [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
+ [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
+};
+
+#ifndef wc_wmb
+ #if defined(__i386__)
+ #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
+ #elif defined(__x86_64__)
+ #define wc_wmb() asm volatile("sfence" ::: "memory")
+ #elif defined(__ia64__)
+ #define wc_wmb() asm volatile("fwb" ::: "memory")
+ #else
+ #define wc_wmb() wmb()
+ #endif
+#endif
+
+#if 0
+static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
+{
+ return container_of(mqp, struct mlx4_ib_sqp, qp);
+}
+#endif
+
+static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
+{
+ return mlx4_buf_offset(&qp->buf, offset);
+}
+
+static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
+{
+ return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
+}
+
+static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
+{
+ return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
+}
+
+/*
+ * Stamp a SQ WQE so that it is invalid if prefetched by marking the
+ * first four bytes of every 64 byte chunk with
+ * 0x7FFFFFFF | (invalid_ownership_value << 31).
+ *
+ * When the max work request size is less than or equal to the WQE
+ * basic block size, as an optimization, we can stamp all WQEs with
+ * 0xffffffff, and skip the very first chunk of each WQE.
+ */
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+ __be32 *wqe;
+ int i;
+ int s;
+ int ind;
+ void *buf;
+ __be32 stamp;
+ struct mlx4_wqe_ctrl_seg *ctrl;
+
+ if (qp->sq_max_wqes_per_wr > 1) {
+ s = roundup(size, 1U << qp->sq.wqe_shift);
+ for (i = 0; i < s; i += 64) {
+ ind = (i >> qp->sq.wqe_shift) + n;
+ stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
+ cpu_to_be32(0xffffffff);
+ buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
+ *wqe = stamp;
+ }
+ } else {
+ ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+ s = (ctrl->fence_size & 0x3f) << 4;
+ for (i = 64; i < s; i += 64) {
+ wqe = buf + i;
+ *wqe = cpu_to_be32(0xffffffff);
+ }
+ }
+}
+
+static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ struct mlx4_wqe_inline_seg *inl;
+ void *wqe;
+ int s;
+
+ ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+ s = sizeof(struct mlx4_wqe_ctrl_seg);
+
+ if (qp->ibqp.qp_type == IB_QPT_UD) {
+ struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
+ struct mlx4_av *av = (struct mlx4_av *)dgram->av;
+ memset(dgram, 0, sizeof *dgram);
+ av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
+ s += sizeof(struct mlx4_wqe_datagram_seg);
+ }
+
+ /* Pad the remainder of the WQE with an inline data segment. */
+ if (size > s) {
+ inl = wqe + s;
+ inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+ }
+ ctrl->srcrb_flags = 0;
+ ctrl->fence_size = size / 16;
+ /*
+ * Make sure descriptor is fully written before setting ownership bit
+ * (because HW can start executing as soon as we do).
+ */
+ wmb();
+
+ ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
+ (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+ stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
+}
+
+/* Post NOP WQE to prevent wrap-around in the middle of WR */
+static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
+{
+ unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+ if (unlikely(s < qp->sq_max_wqes_per_wr)) {
+ post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
+ ind += s;
+ }
+ return ind;
+}
+
+static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct mlx4_ib_qp *mqp = to_mibqp(qp);
+ struct ib_qp *ibqp = &mqp->ibqp;
+
+ if (type == MLX4_EVENT_TYPE_PATH_MIG)
+ to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+
+ if (ibqp->event_handler) {
+ event.device = ibqp->device;
+ switch (type) {
+ case MLX4_EVENT_TYPE_PATH_MIG:
+ event.event = IB_EVENT_PATH_MIG;
+ break;
+ case MLX4_EVENT_TYPE_COMM_EST:
+ event.event = IB_EVENT_COMM_EST;
+ break;
+ case MLX4_EVENT_TYPE_SQ_DRAINED:
+ event.event = IB_EVENT_SQ_DRAINED;
+ break;
+ case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+ event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ break;
+ case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+ event.event = IB_EVENT_QP_FATAL;
+ break;
+ case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+ event.event = IB_EVENT_PATH_MIG_ERR;
+ break;
+ case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+ event.event = IB_EVENT_QP_REQ_ERR;
+ break;
+ case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+ event.event = IB_EVENT_QP_ACCESS_ERR;
+ break;
+ default:
+ printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+ "on QP %06x\n", type, qp->qpn);
+ return;
+ }
+
+ event.element.qp = ibqp;
+ ibqp->event_handler(&event, ibqp->qp_context);
+ }
+}
+
+static int send_wqe_overhead(enum ib_qp_type type, u32 flags)
+{
+ /*
+ * UD WQEs must have a datagram segment.
+ * RC and UC WQEs might have a remote address segment.
+ * MLX WQEs need two extra inline data segments (for the UD
+ * header and space for the ICRC).
+ */
+ switch (type) {
+ case IB_QPT_UD:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg) +
+ ((flags & MLX4_IB_QP_LSO) ? 128 : 0);
+ case IB_QPT_UC:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg);
+ case IB_QPT_XRC_TGT:
+ case IB_QPT_RC:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_atomic_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg);
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ ALIGN(MLX4_IB_UD_HEADER_SIZE +
+ DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
+ MLX4_INLINE_ALIGN) *
+ sizeof (struct mlx4_wqe_inline_seg),
+ sizeof (struct mlx4_wqe_data_seg)) +
+ ALIGN(4 +
+ sizeof (struct mlx4_wqe_inline_seg),
+ sizeof (struct mlx4_wqe_data_seg));
+ case IB_QPT_RAW_ETHERTYPE:
+ return sizeof(struct mlx4_wqe_ctrl_seg) +
+ ALIGN(MLX4_IB_MAX_RAW_ETY_HDR_SIZE +
+ sizeof(struct mlx4_wqe_inline_seg),
+ sizeof(struct mlx4_wqe_data_seg));
+
+ default:
+ return sizeof (struct mlx4_wqe_ctrl_seg);
+ }
+}
+
+static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+ int is_user, int has_rq, struct mlx4_ib_qp *qp)
+{
+ /* Sanity check RQ size before proceeding */
+ if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE ||
+ cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg))
+ return -EINVAL;
+
+ if (!has_rq) {
+ if (cap->max_recv_wr)
+ return -EINVAL;
+
+ qp->rq.wqe_cnt = qp->rq.max_gs = 0;
+ } else {
+ /* HW requires >= 1 RQ entry with >= 1 gather entry */
+ if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge))
+ return -EINVAL;
+
+ qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr));
+ qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge));
+ qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg));
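+ /* e.g. max_recv_sge = 3 rounds up to max_gs = 4; assuming the usual
+ * 16-byte mlx4_wqe_data_seg this gives a 64-byte stride (wqe_shift = 6).
+ */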
+ }
+
+ /* leave userspace return values as they were, so as not to break ABI */
+ if (is_user) {
+ cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt;
+ cap->max_recv_sge = qp->rq.max_gs;
+ } else {
+ cap->max_recv_wr = qp->rq.max_post =
+ min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt);
+ cap->max_recv_sge = min(qp->rq.max_gs,
+ min(dev->dev->caps.max_sq_sg,
+ dev->dev->caps.max_rq_sg));
+ }
+
+ return 0;
+}
+
+static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+ enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+ int s;
+
+ /* Sanity check SQ size before proceeding */
+ if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) ||
+ cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) ||
+ cap->max_inline_data + send_wqe_overhead(type, qp->flags) +
+ sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
+ return -EINVAL;
+
+ /*
+ * For MLX transport we need 2 extra S/G entries:
+ * one for the header and one for the checksum at the end
+ */
+ if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI ||
+ type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) &&
+ cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
+ return -EINVAL;
+
+ s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
+ cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
+ send_wqe_overhead(type, qp->flags);
+
+ if (s > dev->dev->caps.max_sq_desc_sz)
+ return -EINVAL;
+
+ /*
+ * Hermon supports shrinking WQEs, such that a single work
+ * request can include multiple units of 1 << wqe_shift. This
+ * way, work requests can differ in size, and do not have to
+ * be a power of 2 in size, saving memory and speeding up send
+ * WR posting. Unfortunately, if we do this then the
+ * wqe_index field in CQEs can't be used to look up the WR ID
+ * anymore, so we do this only if selective signaling is off.
+ *
+ * Further, on 32-bit platforms, we can't use vmap() to make
+ * the QP buffer virtually contiguous. Thus we have to use
+ * constant-sized WRs to make sure a WR is always fully within
+ * a single page-sized chunk.
+ *
+ * Finally, we use NOP work requests to pad the end of the
+ * work queue, to avoid wrap-around in the middle of WR. We
+ * set NEC bit to avoid getting completions with error for
+ * these NOP WRs, but since NEC is only supported starting
+ * with firmware 2.2.232, we use constant-sized WRs for older
+ * firmware.
+ *
+ * And, since MLX QPs only support SEND, we use constant-sized
+ * WRs in this case.
+ *
+ * We look for the smallest value of wqe_shift such that the
+ * resulting number of wqes does not exceed device
+ * capabilities.
+ *
+ * We set WQE size to at least 64 bytes, this way stamping
+ * invalidates each WQE.
+ */
+ if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+ qp->sq_signal_bits && BITS_PER_LONG == 64 &&
+ type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI &&
+ !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI |
+ MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER)))
+ qp->sq.wqe_shift = ilog2(64);
+ else
+ qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
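+ /*
+ * Rough example (assuming ~16B ctrl and ~48B datagram segments): a UD
+ * QP without LSO and 4 SGEs has s = 64 + 64 = 128 bytes per WR, so
+ * either wqe_shift = 7 (one 128B WQE per WR) or, with shrinking
+ * enabled, wqe_shift = 6 and sq_max_wqes_per_wr = 2.
+ */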
+ for (;;) {
+ qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
+
+ /*
+ * We need to leave 2 KB + 1 WR of headroom in the SQ to
+ * allow HW to prefetch.
+ */
+ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
+ qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
+ qp->sq_max_wqes_per_wr +
+ qp->sq_spare_wqes);
+
+ if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
+ break;
+
+ if (qp->sq_max_wqes_per_wr <= 1)
+ return -EINVAL;
+
+ ++qp->sq.wqe_shift;
+ }
+
+ qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz,
+ (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) -
+ send_wqe_overhead(type, qp->flags)) /
+ sizeof (struct mlx4_wqe_data_seg);
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+ if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+ qp->rq.offset = 0;
+ qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+ } else {
+ qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+ qp->sq.offset = 0;
+ }
+
+ cap->max_send_wr = qp->sq.max_post =
+ (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
+ cap->max_send_sge = min(qp->sq.max_gs,
+ min(dev->dev->caps.max_sq_sg,
+ dev->dev->caps.max_rq_sg));
+ qp->max_inline_data = cap->max_inline_data;
+
+ return 0;
+}
+
+
+
+static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
+{
+ switch (state) {
+ case IB_QPS_RESET: return MLX4_QP_STATE_RST;
+ case IB_QPS_INIT: return MLX4_QP_STATE_INIT;
+ case IB_QPS_RTR: return MLX4_QP_STATE_RTR;
+ case IB_QPS_RTS: return MLX4_QP_STATE_RTS;
+ case IB_QPS_SQD: return MLX4_QP_STATE_SQD;
+ case IB_QPS_SQE: return MLX4_QP_STATE_SQER;
+ case IB_QPS_ERR: return MLX4_QP_STATE_ERR;
+ default: return -1;
+ }
+}
+
+static void del_gid_entries(struct mlx4_ib_qp *qp)
+{
+ struct mlx4_ib_gid_entry *ge, *tmp;
+
+ list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+ list_del(&ge->list);
+ kfree(ge);
+ }
+}
+
+static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+ struct ib_qp_init_attr *init_attr)
+{
+ if (qp->state != IB_QPS_RESET)
+ if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
+ MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
+ printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n",
+ qp->mqp.qpn);
+
+ mlx4_qp_remove(dev->dev, &qp->mqp);
+ mlx4_qp_free(dev->dev, &qp->mqp);
+ mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+ mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+ kfree(qp->sq.wrid);
+ kfree(qp->rq.wrid);
+ mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+ if (qp->max_inline_data)
+ mlx4_bf_free(dev->dev, &qp->bf);
+ if (!init_attr->srq)
+ mlx4_db_free(dev->dev, &qp->db);
+
+ del_gid_entries(qp);
+}
+
+static int qp_has_rq(struct ib_qp_init_attr *attr)
+{
+ if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
+ return 0;
+
+ return !attr->srq;
+}
+
+
+static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
+{
+ int qpn;
+ int err;
+ enum mlx4_ib_qp_type qp_type =
+ (enum mlx4_ib_qp_type) init_attr->qp_type;
+ qp->mlx4_ib_qp_type = qp_type;
+ qp->pri.vid = qp->alt.vid = 0xFFFF;
+ mutex_init(&qp->mutex);
+ spin_lock_init(&qp->sq.lock);
+ spin_lock_init(&qp->rq.lock);
+ INIT_LIST_HEAD(&qp->gid_list);
+ INIT_LIST_HEAD(&qp->steering_rules);
+
+ qp->state = IB_QPS_RESET;
+ if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+ qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+ err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
+ qp_has_rq(init_attr), qp);
+ if (err)
+ goto err;
+
+ if (pd->uobject) {
+ /* userspace QP path is not used by this driver */
+ } else {
+ qp->sq_no_prefetch = 0;
+
+ if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+ qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
+ if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
+ qp->flags |= MLX4_IB_QP_LSO;
+
+ if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP &&
+ dev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED &&
+ !mlx4_is_mfunc(dev->dev))
+ qp->flags |= MLX4_IB_QP_NETIF;
+
+ err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
+ if (err)
+ goto err;
+
+ if (qp_has_rq(init_attr)) {
+ err = mlx4_db_alloc(dev->dev, &qp->db, 0, GFP_KERNEL);
+ if (err)
+ goto err;
+
+ *qp->db.db = 0;
+ }
+
+ if (qp->max_inline_data) {
+ err = mlx4_bf_alloc(dev->dev, &qp->bf, 0);
+ if (err) {
+ mlx4_ib_dbg("failed to allocate blue flame register (%d)", err);
+ qp->bf.uar = &dev->priv_uar;
+ }
+ } else
+ qp->bf.uar = &dev->priv_uar;
+
+ if (mlx4_buf_alloc(dev->dev, qp->buf_size,
+ PAGE_SIZE * 2, &qp->buf, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto err_db;
+ }
+
+ err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
+ &qp->mtt);
+ if (err) {
+ mlx4_ib_dbg("kernel qp mlx4_mtt_init error (%d)", err);
+ goto err_buf;
+ }
+
+ err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, GFP_KERNEL);
+ if (err) {
+ mlx4_ib_dbg("mlx4_buf_write_mtt error (%d)", err);
+ goto err_mtt;
+ }
+
+ /* these are big chunks that may fail, added __GFP_NOWARN */
+ qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64),
+ GFP_KERNEL | __GFP_NOWARN);
+ qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64),
+ GFP_KERNEL | __GFP_NOWARN);
+
+ if (!qp->sq.wrid || !qp->rq.wrid) {
+ printk(KERN_WARNING "%s:%d: not enough memory\n",
+ __func__, __LINE__);
+ err = -ENOMEM;
+ goto err_wrid;
+ }
+ }
+
+ qpn = sqpn;
+
+ err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, GFP_KERNEL);
+ if (err)
+ goto err_qpn;
+
+ if (init_attr->qp_type == IB_QPT_XRC_TGT)
+ qp->mqp.qpn |= (1 << 23);
+
+ /*
+ * Hardware wants QPN written in big-endian order (after
+ * shifting) for send doorbell. Precompute this value to save
+ * a little bit when posting sends.
+ */
+ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+ qp->mqp.event = mlx4_ib_qp_event;
+
+ return 0;
+
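+ /*
+ * Error unwind: resources are released in reverse order of allocation;
+ * kernel QPs free the wrid arrays, MTT, WQ buffer, doorbell record and
+ * (if allocated) the blue flame register, while userspace QPs release
+ * their umem instead.
+ */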
+err_qpn:
+err_wrid:
+ if (pd->uobject) {
+ } else {
+ kfree(qp->sq.wrid);
+ kfree(qp->rq.wrid);
+ }
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+err_buf:
+ if (pd->uobject)
+ ib_umem_release(qp->umem);
+ else
+ mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+
+err_db:
+ if (!pd->uobject && !init_attr->srq
+ && init_attr->qp_type != IB_QPT_XRC_TGT)
+ mlx4_db_free(dev->dev, &qp->db);
+
+ if (qp->max_inline_data)
+ mlx4_bf_free(dev->dev, &qp->bf);
+
+err:
+ return err;
+}
+
+#if 0
+static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+ void *wqe, unsigned *mlx_seg_len)
+{
+ struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
+ struct mlx4_wqe_mlx_seg *mlx = wqe;
+ struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+ struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+ u16 pkey;
+ int send_size;
+ int header_size;
+ int spc;
+ int i;
+ union ib_gid sgid;
+ int is_eth;
+ int is_grh;
+ int is_vlan = 0;
+ int err;
+ u16 vlan;
+
+ send_size = 0;
+ for (i = 0; i < wr->num_sge; ++i)
+ send_size += wr->sg_list[i].length;
+
+ is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
+ is_grh = mlx4_ib_ah_grh_present(ah);
+ err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index, &sgid);
+ if (err)
+ return err;
+
+ if (is_eth) {
+ is_vlan = rdma_get_vlan_id(&sgid) < 0x1000;
+ vlan = rdma_get_vlan_id(&sgid);
+ }
+
+ ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
+ if (!is_eth) {
+ sqp->ud_header.lrh.service_level =
+ be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+ sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid;
+ sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+ }
+
+ if (is_grh) {
+ sqp->ud_header.grh.traffic_class =
+ (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+ sqp->ud_header.grh.flow_label =
+ ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+ sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit;
+ ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid);
+ memcpy(sqp->ud_header.grh.destination_gid.raw,
+ ah->av.ib.dgid, 16);
+ }
+
+ mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+ if (!is_eth) {
+ mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
+ (sqp->ud_header.lrh.destination_lid ==
+ IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
+ (sqp->ud_header.lrh.service_level << 8));
+ mlx->rlid = sqp->ud_header.lrh.destination_lid;
+ }
+
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ sqp->ud_header.immediate_present = 0;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ sqp->ud_header.immediate_present = 1;
+ sqp->ud_header.immediate_data = wr->ex.imm_data;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (is_eth) {
+ u8 *smac;
+
+ memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
+ smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; /* fixme: cache this value */
+ memcpy(sqp->ud_header.eth.smac_h, smac, 6);
+ if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
+ mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
+ if (!is_vlan)
+ sqp->ud_header.eth.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE);
+ else {
+ u16 pcp;
+
+ sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE);
+ pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 27 & 3) << 13;
+ sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
+ }
+ } else {
+ sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
+ if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+ sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+ }
+ sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+ if (!sqp->qp.ibqp.qp_num)
+ ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
+ else
+ ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
+ sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+ sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+ sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+ sqp->qkey : wr->wr.ud.remote_qkey);
+ sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+ header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+ if (0) {
+ printk(KERN_ERR "built UD header of size %d:\n", header_size);
+ for (i = 0; i < header_size / 4; ++i) {
+ if (i % 8 == 0)
+ printk(" [%02x] ", i * 4);
+ printk(" %08x",
+ be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
+ if ((i + 1) % 8 == 0)
+ printk("\n");
+ }
+ printk("\n");
+ }
+
+ /*
+ * Inline data segments may not cross a 64 byte boundary. If
+ * our UD header is bigger than the space available up to the
+ * next 64 byte boundary in the WQE, use two inline data
+ * segments to hold the UD header.
+ */
+ spc = MLX4_INLINE_ALIGN -
+ ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+ if (header_size <= spc) {
+ inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+ memcpy(inl + 1, sqp->header_buf, header_size);
+ i = 1;
+ } else {
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
+ memcpy(inl + 1, sqp->header_buf, spc);
+
+ inl = (void *) (inl + 1) + spc;
+ memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+ /*
+ * Need a barrier here to make sure all the data is
+ * visible before the byte_count field is set.
+ * Otherwise the HCA prefetcher could grab the 64-byte
+ * chunk with this inline segment and get a valid (!=
+ * 0xffffffff) byte count but stale data, and end up
+ * generating a packet with bad headers.
+ *
+ * The first inline segment's byte_count field doesn't
+ * need a barrier, because it comes after a
+ * control/MLX segment and therefore is at an offset
+ * of 16 mod 64.
+ */
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+ i = 2;
+ }
+
+ *mlx_seg_len =
+ ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+ return 0;
+}
+#endif
+
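+/*
+ * Check whether posting nreq more WQEs would overflow the work queue.
+ * The lock-free check covers the common case; when it fails we re-read
+ * head/tail under the CQ lock so that wq->tail, which is advanced by
+ * completion processing, is sampled consistently.
+ */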
+static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+ unsigned cur;
+ struct mlx4_ib_cq *cq;
+
+ cur = wq->head - wq->tail;
+ if (likely(cur + nreq < wq->max_post))
+ return 0;
+
+ cq = to_mcq(ib_cq);
+ spin_lock(&cq->lock);
+ cur = wq->head - wq->tail;
+ spin_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max_post;
+}
+
+#if 0
+static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
+{
+ iseg->flags = 0;
+ iseg->mem_key = cpu_to_be32(rkey);
+ iseg->guest_id = 0;
+ iseg->pa = 0;
+}
+#endif
+
+static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
+ u64 remote_addr, u32 rkey)
+{
+ rseg->raddr = cpu_to_be64(remote_addr);
+ rseg->rkey = cpu_to_be32(rkey);
+ rseg->reserved = 0;
+}
+
+#if 0
+static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr)
+{
+ if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+ aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add);
+ } else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+ aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add_mask);
+ } else {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+ aseg->compare = 0;
+ }
+
+}
+#endif
+
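+/*
+ * Build the UD datagram segment from the address handle: copy the
+ * address vector, destination QPN and Q_Key, plus the EoIB MAC/VLAN
+ * fields, and report the VLAN tag back to the caller.
+ */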
+static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
+ struct ib_send_wr *wr, __be16 *vlan)
+{
+ memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
+ dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+ dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan;
+ memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6);
+ *vlan = dseg->vlan;
+}
+
+#if 0
+static void set_mlx_icrc_seg(void *dseg)
+{
+ u32 *t = dseg;
+ struct mlx4_wqe_inline_seg *iseg = dseg;
+
+ t[1] = 0;
+
+ /*
+ * Need a barrier here before writing the byte_count field to
+ * make sure that all the data is visible before the
+ * byte_count field is set. Otherwise, if the segment begins
+ * a new cacheline, the HCA prefetcher could grab the 64-byte
+ * chunk and get a valid (!= 0xffffffff) byte count but
+ * stale data, and end up sending the wrong data.
+ */
+ wmb();
+
+ iseg->byte_count = cpu_to_be32((1 << 31) | 4);
+}
+#endif
+
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+ dseg->lkey = cpu_to_be32(sg->lkey);
+ dseg->addr = cpu_to_be64(sg->addr);
+
+ /*
+ * Need a barrier here before writing the byte_count field to
+ * make sure that all the data is visible before the
+ * byte_count field is set. Otherwise, if the segment begins
+ * a new cacheline, the HCA prefetcher could grab the 64-byte
+ * chunk and get a valid (!= 0xffffffff) byte count but
+ * stale data, and end up sending the wrong data.
+ */
+ wmb();
+
+ dseg->byte_count = cpu_to_be32(sg->length);
+}
+
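+/*
+ * Unlike set_data_seg() above, no barrier is needed here: this variant
+ * is used on the receive path (mlx4_ib_post_recv), where the WQE is not
+ * visible to the HCA until the doorbell record is updated.
+ */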
+static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+ dseg->byte_count = cpu_to_be32(sg->length);
+ dseg->lkey = cpu_to_be32(sg->lkey);
+ dseg->addr = cpu_to_be64(sg->addr);
+}
+
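+/*
+ * Build the LSO segment: copy the header inline, rounded up to a
+ * 16-byte multiple, and encode (mss - hlen) and hlen into lso_hdr_sz.
+ * blh is set when the segment exceeds 64 bytes, so the caller can use
+ * the "big LSO header" opcode (0x4e) instead of the regular one.
+ */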
+static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
+ struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
+ __be32 *lso_hdr_sz, int *blh)
+{
+ unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
+
+ *blh = unlikely(halign > 64) ? 1 : 0;
+
+ if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
+ wr->num_sge > qp->sq.max_gs - (halign >> 4)))
+ return -EINVAL;
+
+ memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
+
+ *lso_hdr_sz = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
+ wr->wr.ud.hlen);
+ *lso_seg_len = halign;
+ return 0;
+}
+
+static __be32 send_ieth(struct ib_send_wr *wr)
+{
+ switch (wr->opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ return wr->ex.imm_data;
+
+ case IB_WR_SEND_WITH_INV:
+ return cpu_to_be32(wr->ex.invalidate_rkey);
+
+ default:
+ return 0;
+ }
+}
+
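+/*
+ * Copy the gather list into inline segments in the WQE. A new inline
+ * segment is started whenever the copy reaches a 64-byte
+ * (MLX4_INLINE_ALIGN) boundary, and each segment's byte_count is only
+ * written after its data (see the barrier comment below). Returns the
+ * total size in *sz in units of 16 bytes, or -1 if the data does not
+ * fit within qp->max_inline_data.
+ */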
+static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr,
+ void *wqe, int *sz)
+{
+ struct mlx4_wqe_inline_seg *seg;
+ void *addr;
+ int len, seg_len;
+ int num_seg;
+ int off, to_copy;
+ int i;
+ int inl = 0;
+
+ seg = wqe; /* current inline segment header */
+ wqe += sizeof *seg; /* data starts right after the header */
+ off = ((unsigned long)wqe) & (unsigned long)(MLX4_INLINE_ALIGN - 1);
+ num_seg = 0;
+ seg_len = 0;
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ addr = (void *) (unsigned long)(wr->sg_list[i].addr);
+ len = wr->sg_list[i].length;
+ inl += len;
+
+ if (inl > qp->max_inline_data) {
+ inl = 0;
+ return -1;
+ }
+
+ while (len >= MLX4_INLINE_ALIGN - off) {
+ to_copy = MLX4_INLINE_ALIGN - off;
+ memcpy(wqe, addr, to_copy);
+ len -= to_copy;
+ wqe += to_copy;
+ addr += to_copy;
+ seg_len += to_copy;
+ wmb(); /* see comment below */
+ seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+ seg_len = 0;
+ seg = wqe;
+ wqe += sizeof *seg;
+ off = sizeof *seg;
+ ++num_seg;
+ }
+
+ memcpy(wqe, addr, len);
+ wqe += len;
+ seg_len += len;
+ off += len;
+ }
+
+ if (seg_len) {
+ ++num_seg;
+ /*
+ * Need a barrier here to make sure
+ * all the data is visible before the
+ * byte_count field is set. Otherwise
+ * the HCA prefetcher could grab the
+ * 64-byte chunk with this inline
+ * segment and get a valid (!=
+ * 0xffffffff) byte count but stale
+ * data, and end up sending the wrong
+ * data.
+ */
+ wmb();
+ seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+ }
+
+ *sz = (inl + num_seg * sizeof *seg + 15) / 16;
+
+ return 0;
+}
+
+/*
+ * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
+ * implementations may use move-string-buffer assembler instructions,
+ * which do not guarantee order of copying.
+ */
+static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
+{
+ __iowrite64_copy(dst, src, bytecnt / 8);
+}
+
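+/*
+ * Post a list of receive WRs: scatter entries are written with
+ * __set_data_seg(), the list is terminated with an MLX4_INVALID_LKEY
+ * sentinel when fewer than max_gs entries are used, and the doorbell
+ * record is updated once at the end, after a write barrier.
+ */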
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct mlx4_wqe_data_seg *scat;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int ind;
+ int i;
+
+ spin_lock_irqsave(&qp->rq.lock, flags);
+
+ ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)",
+ ibqp->qp_num, wr->num_sge);
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ scat = get_recv_wqe(qp, ind);
+
+ for (i = 0; i < wr->num_sge; ++i)
+ __set_data_seg(scat + i, wr->sg_list + i);
+
+ if (i < qp->rq.max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+
+ qp->rq.wrid[ind] = wr->wr_id;
+
+ ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+ }
+
+out:
+ if (likely(nreq)) {
+ qp->rq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+ }
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+ return err;
+}
+
+static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
+{
+ switch (mlx4_state) {
+ case MLX4_QP_STATE_RST: return IB_QPS_RESET;
+ case MLX4_QP_STATE_INIT: return IB_QPS_INIT;
+ case MLX4_QP_STATE_RTR: return IB_QPS_RTR;
+ case MLX4_QP_STATE_RTS: return IB_QPS_RTS;
+ case MLX4_QP_STATE_SQ_DRAINING:
+ case MLX4_QP_STATE_SQD: return IB_QPS_SQD;
+ case MLX4_QP_STATE_SQER: return IB_QPS_SQE;
+ case MLX4_QP_STATE_ERR: return IB_QPS_ERR;
+ default: return -1;
+ }
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
+{
+ switch (mlx4_mig_state) {
+ case MLX4_QP_PM_ARMED: return IB_MIG_ARMED;
+ case MLX4_QP_PM_REARM: return IB_MIG_REARM;
+ case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED;
+ default: return -1;
+ }
+}
+
+static int to_ib_qp_access_flags(int mlx4_flags)
+{
+ int ib_flags = 0;
+
+ if (mlx4_flags & MLX4_QP_BIT_RRE)
+ ib_flags |= IB_ACCESS_REMOTE_READ;
+ if (mlx4_flags & MLX4_QP_BIT_RWE)
+ ib_flags |= IB_ACCESS_REMOTE_WRITE;
+ if (mlx4_flags & MLX4_QP_BIT_RAE)
+ ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+ return ib_flags;
+}
+
+static void to_ib_ah_attr(struct mlx4_ib_dev *ib_dev, struct ib_ah_attr *ib_ah_attr,
+ struct mlx4_qp_path *path)
+{
+ struct mlx4_dev *dev = ib_dev->dev;
+ int is_eth;
+
+ memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
+ ib_ah_attr->port_num = path->sched_queue & 0x40 ? 2 : 1;
+
+ if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
+ return;
+
+ is_eth = rdma_port_get_link_layer(&ib_dev->ib_dev, ib_ah_attr->port_num) ==
+ IB_LINK_LAYER_ETHERNET;
+ if (is_eth)
+ ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) |
+ ((path->sched_queue & 4) << 1);
+ else
+ ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf;
+
+ ib_ah_attr->dlid = be16_to_cpu(path->rlid);
+
+ ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f;
+ ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0;
+ ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
+ if (ib_ah_attr->ah_flags) {
+ ib_ah_attr->grh.sgid_index = path->mgid_index;
+ ib_ah_attr->grh.hop_limit = path->hop_limit;
+ ib_ah_attr->grh.traffic_class =
+ (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
+ ib_ah_attr->grh.flow_label =
+ be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
+ memcpy(ib_ah_attr->grh.dgid.raw,
+ path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
+ }
+}
+
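+/*
+ * Read the QP context back from the firmware and translate it into
+ * ib_qp_attr/ib_qp_init_attr form. A QP in the RESET state is not
+ * queried in hardware; its state is reported directly.
+ */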
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct mlx4_qp_context context;
+ int mlx4_state;
+ int err = 0;
+
+ mutex_lock(&qp->mutex);
+
+ if (qp->state == IB_QPS_RESET) {
+ qp_attr->qp_state = IB_QPS_RESET;
+ goto done;
+ }
+
+ err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
+ if (err) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ mlx4_state = be32_to_cpu(context.flags) >> 28;
+
+ qp->state = to_ib_qp_state(mlx4_state);
+ qp_attr->qp_state = qp->state;
+ qp_attr->path_mtu = context.mtu_msgmax >> 5;
+ qp_attr->path_mig_state =
+ to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
+ qp_attr->qkey = be32_to_cpu(context.qkey);
+ qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
+ qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff;
+ qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff;
+ qp_attr->qp_access_flags =
+ to_ib_qp_access_flags(be32_to_cpu(context.params2));
+
+ if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC ||
+ qp->ibqp.qp_type == IB_QPT_XRC_TGT) {
+ to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
+ to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path);
+ qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
+ qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num;
+ }
+
+ qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
+ if (qp_attr->qp_state == IB_QPS_INIT)
+ qp_attr->port_num = qp->port;
+ else
+ qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
+
+ /* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+ qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
+
+ qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
+
+ qp_attr->max_dest_rd_atomic =
+ 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
+ qp_attr->min_rnr_timer =
+ (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
+ qp_attr->timeout = context.pri_path.ackto >> 3;
+ qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7;
+ qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7;
+ qp_attr->alt_timeout = context.alt_path.ackto >> 3;
+
+done:
+ qp_attr->cur_qp_state = qp_attr->qp_state;
+ qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt;
+ qp_attr->cap.max_recv_sge = qp->rq.max_gs;
+
+ if (!ibqp->uobject) {
+ qp_attr->cap.max_send_wr = qp->sq.wqe_cnt;
+ qp_attr->cap.max_send_sge = qp->sq.max_gs;
+ } else {
+ qp_attr->cap.max_send_wr = 0;
+ qp_attr->cap.max_send_sge = 0;
+ }
+
+ /*
+ * We don't support inline sends for kernel QPs (yet), and we
+ * don't know what userspace's value should be.
+ */
+ qp_attr->cap.max_inline_data = 0;
+
+ qp_init_attr->cap = qp_attr->cap;
+
+ qp_init_attr->create_flags = 0;
+ if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+ qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+ if (qp->flags & MLX4_IB_QP_LSO)
+ qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+
+int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr,
+ u32 *qp_num)
+{
+ return -ENOSYS;
+}
+
+int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
+ struct ib_qp_attr *attr, int attr_mask)
+{
+ return -ENOSYS;
+}
+
+int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
+ struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ return -ENOSYS;
+}
+
+int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
+{
+ return -ENOSYS;
+}
+
+int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
+{
+ return -ENOSYS;
+}
+
+/**** VNIC IB VERBS ****/
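+/*
+ * Post a single UD send WQE for the EoIB data path: optionally builds
+ * an LSO segment or lays the data inline, and sets the SWP
+ * checksum-offload bits from the ip/ip6/tcp/udp offsets supplied by
+ * the caller. No SQ lock is taken here, so posting is presumably
+ * serialized by the caller.
+ */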
+int vnic_ib_post_send(struct ib_qp *ibqp,
+ struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr,
+ u8 ip_off, u8 ip6_off,
+ u8 tcp_off, u8 udp_off)
+{
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ struct mlx4_wqe_data_seg *dseg;
+ __be32 owner_opcode = 0;
+ int nreq;
+ int err = 0;
+ unsigned ind;
+ int uninitialized_var(stamp);
+ int uninitialized_var(size);
+ unsigned uninitialized_var(seglen);
+ __be32 dummy;
+ __be32 *lso_wqe;
+ __be32 uninitialized_var(lso_hdr_sz);
+ int i;
+ int blh = 0;
+ __be16 vlan = 0;
+ int inl = 0;
+
+ ind = qp->sq_next_wqe;
+
+ nreq = 0;
+ lso_wqe = &dummy;
+
+ if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+ mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)",
+ ibqp->qp_num, wr->num_sge);
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ *((u32 *) (&ctrl->vlan_tag)) = 0;
+ qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+ ctrl->srcrb_flags =
+ (wr->send_flags & IB_SEND_SIGNALED ?
+ cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+ (wr->send_flags & IB_SEND_SOLICITED ?
+ cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
+ qp->sq_signal_bits;
+
+ ctrl->imm = send_ieth(wr);
+
+ wqe += sizeof *ctrl;
+ size = sizeof *ctrl / 16;
+
+ set_datagram_seg(wqe, wr, &vlan);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+ if (wr->opcode == IB_WR_LSO) {
+ err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ lso_wqe = (__be32 *) wqe;
+ wqe += seglen;
+ size += seglen / 16;
+ }
+ dseg = wqe;
+ dseg += wr->num_sge - 1;
+
+ if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
+ int sz;
+
+ err = lay_inline_data(qp, wr, wqe, &sz);
+ if (!err) {
+ inl = 1;
+ size += sz;
+ }
+ } else {
+ size += wr->num_sge * (sizeof(struct mlx4_wqe_data_seg) / 16);
+ for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+ set_data_seg(dseg, wr->sg_list + i);
+ }
+
+ wmb();
+ *lso_wqe = lso_hdr_sz;
+
+ ctrl->fence_size = size;
+
+ /* set SWP bits based on ip/ip6/tcp/udp offsets */
+ if (wr->send_flags & IB_SEND_IP_CSUM) {
+ /* SWP bit */
+ owner_opcode |= cpu_to_be32(1 << 24);
+
+ /* The IP offset is counted from the beginning of the IB
+ * packet (not the Ethernet packet), in units of 2 bytes.
+ * In the control segment we use cases (c) and (d):
+ * (a) tcp=0, ip=0 => calc TCP/UDP csum over IPv4
+ * (b) tcp=0, ip=1 => calc IP csum only over IPv4
+ * (c) tcp=1, ip=0 => calc TCP/UDP csum over IPv6
+ * (d) tcp=1, ip=1 => calc TCP/UDP and IP csum over IPv4
+ */
+ if (ip_off) {
+ ip_off += (IB_LRH_BYTES + IB_BTH_BYTES +
+ IB_DETH_BYTES) >> 1;
+ ip_off += (to_mah(wr->wr.ud.ah)->av.ib.g_slid
+ & 0x80) ? (IB_GRH_BYTES >> 1) : 0;
+ owner_opcode |= cpu_to_be32((ip_off) << 8);
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
+ } else if (ip6_off) {
+ ip6_off += (IB_LRH_BYTES + IB_BTH_BYTES +
+ IB_DETH_BYTES) >> 1;
+ ip6_off += (to_mah(wr->wr.ud.ah)->av.ib.g_slid
+ & 0x80) ? (IB_GRH_BYTES >> 1) : 0;
+ owner_opcode |= cpu_to_be32((ip6_off) << 8);
+ }
+
+ if (udp_off) { /* UDP offset and bit */
+ owner_opcode |= cpu_to_be32(udp_off << 16);
+ owner_opcode |= cpu_to_be32(1 << 25);
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
+ } else if (tcp_off) { /* TCP offset */
+ owner_opcode |= cpu_to_be32(tcp_off << 16);
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
+ }
+ }
+
+ /* set opcode, use 0x4e for BIG_LSO */
+ if (!blh)
+ owner_opcode |= mlx4_ib_opcode[wr->opcode];
+ else
+ owner_opcode |= cpu_to_be32(0x4e);
+
+ /* set ownership bit */
+ owner_opcode |= (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+ /* Make sure descriptor is fully written */
+ wmb();
+ ctrl->owner_opcode = owner_opcode;
+
+ stamp = ind + qp->sq_spare_wqes;
+ ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
+
+ /* only one WR is posted here; count it as the posting loop would */
+ nreq++;
+
+out:
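+ /*
+ * Ring the hardware: a single small inline WQE that fits in the
+ * BlueFlame buffer is copied straight to the BlueFlame page
+ * (avoiding the doorbell), otherwise the QP doorbell is written.
+ */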
+ if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) {
+ ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8);
+ *(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
+ /*
+ * Make sure that descriptor is written to memory
+ * before writing to BlueFlame page.
+ */
+ wmb();
+
+ ++qp->sq.head;
+
+ mlx4_bf_copy(qp->bf.reg + qp->bf.offset, (unsigned long *) ctrl,
+ ALIGN(size * 16, 64));
+ wc_wmb();
+
+ qp->bf.offset ^= qp->bf.buf_size;
+
+ } else if (nreq) {
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL);
+
+ /*
+ * Make sure doorbells don't leak out of SQ spinlock
+ * and reach the HCA out of order.
+ */
+ mmiowb();
+
+ }
+
+ /* on the error path nothing was posted and stamp/size are unset */
+ if (nreq) {
+ stamp_send_wqe(qp, stamp, size * 16);
+
+ ind = pad_wraparound(qp, ind);
+ qp->sq_next_wqe = ind;
+ }
+ return err;
+}
+
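+/*
+ * Create nqps kernel QPs over a contiguous, aligned range of QP
+ * numbers reserved with mlx4_qp_reserve_range(). On any failure the
+ * QPs created so far are destroyed and the whole range is released.
+ */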
+int __vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata, int nqps,
+ int align, struct ib_qp *list[])
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_qp *qp;
+ int err;
+ int base_qpn, qpn;
+ int i;
+
+ for (i = 0; i < nqps; ++i) {
+ if (init_attr[i].create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
+ IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
+ return -EINVAL;
+ if (init_attr[i].create_flags & (IB_QP_CREATE_IPOIB_UD_LSO |
+ IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) &&
+ (pd->uobject || init_attr[i].qp_type != IB_QPT_UD))
+ return -EINVAL;
+
+ /* Userspace is not allowed to create special QPs: */
+ if (pd->uobject && (init_attr[i].qp_type == IB_QPT_SMI ||
+ init_attr[i].qp_type == IB_QPT_GSI))
+ return -EINVAL;
+
+ if (nqps > 1 && (init_attr[i].qp_type == IB_QPT_SMI ||
+ init_attr[i].qp_type == IB_QPT_GSI))
+ return -EINVAL;
+ }
+
+ err = mlx4_qp_reserve_range(dev->dev, nqps, align, &base_qpn, 0);
+ if (err)
+ return err;
+
+ for (i = 0, qpn = base_qpn; i < nqps; ++i, ++qpn) {
+ qp = kzalloc(sizeof *qp, GFP_KERNEL);
+ if (!qp) {
+ err = -ENOMEM;
+ goto exit_fail;
+ }
+
+ err = create_qp_common(dev, pd, init_attr + i, udata, qpn, qp);
+ if (err) {
+ kfree(qp);
+ goto exit_fail;
+ }
+ qp->xrcdn = 0;
+ qp->ibqp.qp_num = qp->mqp.qpn;
+ list[i] = &qp->ibqp;
+ }
+ return 0;
+
+exit_fail:
+ for (--i; i >= 0; --i) {
+ destroy_qp_common(dev, to_mqp(list[i]), init_attr + i);
+ kfree(to_mqp(list[i]));
+ }
+
+ mlx4_qp_release_range(dev->dev, base_qpn, nqps);
+ return err;
+}
+
+/* compare with ib_create_qp() in infiniband/core/verbs.c */
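+/*
+ * The QPs are created through the low-level path above, so fill in the
+ * ib_qp bookkeeping that ib_create_qp() would normally do: device, PD,
+ * CQ/SRQ/XRCD pointers and the corresponding usecnt references.
+ */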
+int vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata, int nqps,
+ int align, struct ib_qp *list[])
+{
+ struct ib_qp *qp;
+ struct ib_qp_init_attr *qp_init_attr;
+ int rc, i;
+
+ rc = __vnic_ib_create_qp_range(pd, init_attr, udata, nqps, align, list);
+
+ if (rc)
+ return rc;
+
+ for (i = 0; i < nqps; ++i) {
+ qp = list[i];
+ qp_init_attr = &init_attr[i];
+ qp->device = pd->device;
+ qp->real_qp = qp;
+ qp->pd = pd;
+ qp->send_cq = qp_init_attr->send_cq;
+ qp->recv_cq = qp_init_attr->recv_cq;
+ qp->srq = qp_init_attr->srq;
+ qp->uobject = NULL;
+ qp->event_handler = qp_init_attr->event_handler;
+ qp->qp_context = qp_init_attr->qp_context;
+ qp->qp_type = qp_init_attr->qp_type;
+ qp->xrcd = qp->qp_type == IB_QPT_XRC_TGT ?
+ qp_init_attr->xrcd : NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&qp_init_attr->send_cq->usecnt);
+ atomic_inc(&qp_init_attr->recv_cq->usecnt);
+ if (qp_init_attr->srq)
+ atomic_inc(&qp_init_attr->srq->usecnt);
+ if (qp->qp_type == IB_QPT_XRC_TGT)
+ atomic_inc(&qp->xrcd->usecnt);
+ }
+ return 0;
+}
+
--- /dev/null
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _VNIC_UTILS_H
+#define _VNIC_UTILS_H
+
+/* #define CONFIG_MLX4_VNIC_DEBUG */ /* comment out in RELEASE and PERFORMANCE modes */
+/* #define VNIC_PROFILLNG */ /* comment out in RELEASE and PERFORMANCE modes */
+#define VNIC_EXTRA_STATS /* comment out in PERFORMANCE mode */
+
+enum {
+ VNIC_DEBUG_GENERAL = 1 << 0, /* 0x1 */
+ VNIC_DEBUG_MCAST = 1 << 1, /* 0x2 */
+ VNIC_DEBUG_MCAST_V = 1 << 2, /* 0x4 */
+ VNIC_DEBUG_DATA = 1 << 3, /* 0x8 */
+ VNIC_DEBUG_DATA_V = 1 << 4, /* 0x10 */
+ VNIC_DEBUG_FIP = 1 << 5, /* 0x20 */
+ VNIC_DEBUG_FIP_V = 1 << 6, /* 0x40 */
+ VNIC_DEBUG_SKB = 1 << 7, /* 0x80 */
+ VNIC_DEBUG_SKB_V = 1 << 8, /* 0x100 */
+ VNIC_DEBUG_VHUB = 1 << 9, /* 0x200 */
+ VNIC_DEBUG_VHUB_V = 1 << 10, /* 0x400 */
+ VNIC_DEBUG_ETHTOOL = 1 << 11, /* 0x800 */
+ VNIC_DEBUG_ETHTOOL_V = 1 << 12, /* 0x1000 */
+ VNIC_DEBUG_FUNC = 1 << 13, /* 0x2000 */
+ VNIC_DEBUG_MARK = 1 << 14, /* 0x4000 */
+ VNIC_DEBUG_MODER = 1 << 15, /* 0x8000 */
+ VNIC_DEBUG_MODER_V = 1 << 16, /* 0x10000 */
+ VNIC_DEBUG_PKT_DUMP = 1 << 17, /* 0x20000 */
+ VNIC_DEBUG_FIP_P0 = 1 << 18, /* 0x40000 */
+ VNIC_DEBUG_SYSFS = 1 << 19, /* 0x80000 */
+ VNIC_DEBUG_MAC = 1 << 20, /* 0x100000 */
+ VNIC_DEBUG_TSTAMP = 1 << 21, /* 0x200000 */
+ VNIC_DEBUG_PARSER = 1 << 22, /* 0x400000 */
+ VNIC_DEBUG_LAG = 1 << 23, /* 0x800000 */
+ VNIC_DEBUG_LAG_V = 1 << 24, /* 0x1000000 */
+ VNIC_DEBUG_MCAST_VV = 1 << 25, /* 0x2000000 */
+ VNIC_DEBUG_DEBUG = 1 << 31, /* 0x80000000 */
+};
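+
+/*
+ * The debug macros below test the vnic_msglvl mask (presumably a module
+ * parameter) against these bits; e.g. a mask of 0x21
+ * (VNIC_DEBUG_GENERAL | VNIC_DEBUG_FIP) enables only vnic_dbg() and
+ * vnic_dbg_fip() output.
+ */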
+
+/* always defined */
+#define vnic_printk(level, prefix, format, arg...) \
+ do { printk(level "T%.4ld [%s] %s:%s:%d: " format, \
+ jiffies * 1000 / HZ, \
+ DRV_NAME, prefix ? prefix : "", __func__, __LINE__ , \
+ ## arg); \
+} while (0)
+
+#define vnic_info(format, arg...) \
+do { printk(KERN_INFO "[%s] " format, DRV_NAME, ## arg); } \
+while (0)
+
+#define vnic_warn(prefix, format, arg...) \
+do { vnic_printk(KERN_WARNING, prefix, format, ## arg); } \
+while (0)
+
+#define vnic_err(prefix, format, arg...) \
+do { vnic_printk(KERN_ERR, prefix, format, ## arg); } \
+while (0)
+
+#define _sprintf(p, buf, format, arg...) \
+ (PAGE_SIZE - (int)(p - buf)) <= 0 ? 0 : \
+ scnprintf(p, PAGE_SIZE - (int)(p - buf), format, ## arg)
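+
+/*
+ * _sprintf() is meant for chained writes into a PAGE_SIZE sysfs buffer;
+ * a typical pattern (illustrative, not taken from this driver) is:
+ *
+ *    p += _sprintf(p, buf, "name %s\n", name);
+ *    p += _sprintf(p, buf, "qpn 0x%x\n", qpn);
+ *    return (ssize_t)(p - buf);
+ */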
+
+/* debug functions */
+#ifndef CONFIG_MLX4_VNIC_DEBUG
+#define ASSERT(x) do { (void)(x); } while (0)
+#define vnic_dbg_mark(void) do { } while (0)
+#define vnic_dbg_func(prefix) do { } while (0)
+#define vnic_dbg(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_mcast(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_mcast_v(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_mcast_vv(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_debug(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_ethtool(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_ethtool_v(prefix, format, arg...) \
+ do { (void)(prefix); } while (0)
+#define vnic_dbg_data(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_data_v(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_fip(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_parse(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_lag(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_lag_v(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_fip_p0(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_sysfs(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_mac(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_fip_v(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_vhub(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_vhub_v(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_moder(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_moder_v(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_printk_skb(prefix, skb, o1, o2) do { (void)(prefix); } while (0)
+#define vnic_dbg_skb(prefix, skb, o1, o2) do { (void)(prefix); } while (0)
+#else
+#define ASSERT(x) \
+do { if (x) break; \
+ printk(KERN_EMERG "### ASSERTION FAILED %s: %s: %d: %s\n", \
+ __FILE__, __func__, __LINE__, #x); dump_stack(); BUG(); \
+} while (0)
+
+#define vnic_dbg(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_GENERAL)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_mcast(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_MCAST)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_mcast_v(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_MCAST_V)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_mcast_vv(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_MCAST_VV)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_debug(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_DEBUG)) break; \
+ vnic_printk(KERN_WARNING, prefix, format, ## arg); \
+} while (0)
+
+
+#define vnic_dbg_data(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_DATA)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_data_v(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_DATA_V)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_fip_p0(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_FIP_P0)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_sysfs(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_SYSFS)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_mac(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_MAC)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_fip(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_FIP)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_parse(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_PARSER)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_lag(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_LAG)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_lag_v(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_LAG_V)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_fip_v(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_FIP_V)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_vhub(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_VHUB)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_vhub_v(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_VHUB_V)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_moder(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_MODER)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_moder_v(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_MODER_V)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_ethtool(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_ETHTOOL)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_ethtool_v(prefix, format, arg...) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_ETHTOOL_V)) break; \
+ vnic_printk(KERN_DEBUG, prefix, format, ## arg); \
+} while (0)
+
+#define vnic_dbg_mark(void) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_MARK)) break; \
+ vnic_printk(KERN_DEBUG, NULL, "###\n"); \
+} while (0)
+
+#define vnic_dbg_func(prefix) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_FUNC)) break; \
+ vnic_printk(KERN_DEBUG, prefix, "function called\n"); \
+} while (0)
+
+#define ethp2str(p, str) \
+do { \
+ switch (ntohs(p)) { \
+ case ETH_P_RARP: sprintf(str, "%s", "ETH_P_RARP"); break; \
+ case ETH_P_ARP: sprintf(str, "%s", "ETH_P_ARP"); break; \
+ case ETH_P_IP: sprintf(str, "%s", "ETH_P_IP"); break; \
+ case ETH_P_IPV6: sprintf(str, "%s", "ETH_P_IPV6"); break; \
+ case ETH_P_8021Q: sprintf(str, "%s", "ETH_P_8021Q"); break; \
+ default: sprintf(str, "0x%x", ntohs(p)); break; \
+ } \
+} while (0)
+
+#define skb_printk(prefix, format, arg...) \
+ printk(KERN_DEBUG "[%s] " format, prefix, ## arg)
+
+#define vnic_dbg_skb(_prefix, skb, eoib_off, eth_off) \
+do { if (!(vnic_msglvl & VNIC_DEBUG_SKB)) break; \
+ vnic_printk_skb(_prefix, skb, eoib_off, eth_off); \
+} while (0)
+
+#define VNIC_SYSLOG_LLEN 64
+#define vnic_printk_skb(_prefix, skb, eoib_off, eth_off) \
+do { \
+ char pr[VNIC_SYSLOG_LLEN]; \
+ char h_proto_str[VNIC_SYSLOG_LLEN]; \
+ struct eoibhdr *eoib_hdr = (struct eoibhdr *) \
+ (skb->data + eoib_off); \
+ struct ethhdr *ethh = (struct ethhdr *) \
+ (skb->data + eth_off); \
+ struct net_device *dev = skb->dev; \
+ ASSERT(dev); \
+ snprintf(pr, VNIC_SYSLOG_LLEN, "%s:skb-%s", dev->name, _prefix);\
+ skb_printk(pr, "\n"); \
+ skb_printk(pr, "--- skb dump ---\n"); \
+ skb_printk(pr, "len : %d\n", skb->len); \
+ skb_printk(pr, "data_len : %d\n", skb->data_len); \
+ skb_printk(pr, "frags : %d\n", \
+ skb_shinfo(skb)->nr_frags); \
+ skb_printk(pr, "gso : %d\n", skb_is_gso(skb)); \
+ skb_printk(pr, "head_len : %d\n", (int)skb_headlen(skb)); \
+ skb_printk(pr, "data : %p\n", skb->data); \
+ skb_printk(pr, "head : %p\n", skb->head); \
+ skb_printk(pr, "tail : %lu\n", \
+ (unsigned long)(skb->tail)); \
+ skb_printk(pr, "end : %lu\n", \
+ (unsigned long)(skb->end)); \
+ skb_printk(pr, "eoib_off : %lu\n", eoib_off); \
+ skb_printk(pr, "eth_off : %lu\n", eth_off); \
+ if (eth_off < 0 || !skb_headlen(skb)) \
+ break; \
+ ethp2str(ethh->h_proto, h_proto_str); \
+ skb_printk(pr, "eth_proto : %s\n", h_proto_str); \
+ skb_printk(pr, "eth_dest : "MAC_6_PRINT_FMT"\n", \
+ MAC_6_PRINT_ARG(ethh->h_dest)); \
+ skb_printk(pr, "eth_source : "MAC_6_PRINT_FMT"\n", \
+ MAC_6_PRINT_ARG(ethh->h_source)); \
+ if (eoib_off < 0) \
+ break; \
+ skb_printk(pr, "eoib_seg_id : 0x%04x\n", eoib_hdr->seg_id); \
+ skb_printk(pr, "eoib_seg_off : 0x%02x\n", eoib_hdr->seg_off); \
+ skb_printk(pr, "eoib_ip_chk : 0x%02x\n", \
+ VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr)); \
+ skb_printk(pr, "eoib_tcp_chk : 0x%02x\n", \
+ VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)); \
+ skb_printk(pr, "eoib_ver : 0x%02x\n", \
+ VNIC_EOIB_HDR_GET_VER(eoib_hdr)); \
+ skb_printk(pr, "eoib_sig : 0x%02x\n", \
+ VNIC_EOIB_HDR_GET_SIG(eoib_hdr)); \
+} while (0)
+
+#endif /* CONFIG_MLX4_VNIC_DEBUG */
+#endif /* _VNIC_UTILS_H */