mlx4_vnic: add mlx4_vnic
author Saeed Mahameed <saeedm@mellanox.com>
Wed, 17 Apr 2013 20:21:12 +0000 (23:21 +0300)
committer Mukesh Kacker <mukesh.kacker@oracle.com>
Tue, 7 Jul 2015 21:38:11 +0000 (14:38 -0700)
Add mlx4_vnic code

Also squash the following porting commits for compilation
of the integrated commit (without squashing they won't compile):

mlx4_vnic: adapt vnic to ofed2 mlx4 implementation
mlx4_vnic: align with OFED2 upstream 3.7 kernel
mlx4_vnic: Fix reference path to hw/mlx4 header files
mlx4_vnic: remove mlx4_vnic_helper module
mlx4_vnic: use ib_modify_cq() in upstream kernel
        Modify the code to use ib_modify_cq() from the upstream
        kernel (rather than a modified Mellanox version); see the
        sketch after this list.
mlx4_vnic: removed reference to mlx4_ib_qp->rules_list in vnic_qp.c
        Remove a field introduced by the Mellanox OFED 2.4 flow
        steering patches, which are not in the upstream kernel.
mlx4_vnic: used an older version of mlx4_qp_reserve_range()
        Use the mlx4_qp_reserve_range() variant found in Linux 3.18
        (we can switch to the newer API once it is available
        upstream); see the sketch after this list.
mlx4_vnic: port to Linux 3.18*
        The mlx4_vnic code is based on the original port of
        mlx4_vnic in UEK3. Make the changes needed to compile on
        UEK4 (based on Linux 3.18): use upstream APIs (not
        Mellanox-specific ones) where the two conflict, plus other
        changes required to build on Linux 3.18.
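
As an illustration of the ib_modify_cq() and mlx4_qp_reserve_range()
points above, the following is a rough, untested sketch (not code from
this patch) of what the upstream 3.18-era calls look like; the helper
names, moderation values and QP counts are made up for the example, and
the header paths are approximate:

    #include <rdma/ib_verbs.h>
    #include <linux/mlx4/device.h>
    #include <linux/mlx4/qp.h>

    /* hypothetical helper: ask for a CQ event after 64 completions
     * or 16 usecs, using the plain upstream ib_modify_cq() */
    static int vnic_example_tune_cq(struct ib_cq *cq)
    {
            return ib_modify_cq(cq, 64, 16);
    }

    /* hypothetical helper: reserve a range of 8 QPNs aligned to 8,
     * using the Linux 3.18 mlx4_qp_reserve_range() (no flags arg) */
    static int vnic_example_reserve_qps(struct mlx4_dev *dev, int *base_qpn)
    {
            return mlx4_qp_reserve_range(dev, 8, 8, base_qpn);
    }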

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Ajaykumar Hotchandani <ajaykumar.hotchandani@oracle.com>
Signed-off-by: Qing Huang <qing.huang@oracle.com>
(Ported from UEK3 and Mellanox OFED 2.4)

Signed-off-by: Mukesh Kacker <mukesh.kacker@oracle.com>
33 files changed:
drivers/net/ethernet/mellanox/mlx4_vnic/Makefile [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/doc/gw_state.dot [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/doc/regndev.dot [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_flush.dot [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_state.dot [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/fip_parser.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic.h [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data.h [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ethtool.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_fs.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ib.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_mac.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_main.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_neigh.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_netdev.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_rx.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_tx.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip.h [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.h [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_ib.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_login.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_main.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.h [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_vhub.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_main.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.h [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_param.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_port.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_qp.c [new file with mode: 0644]
drivers/net/ethernet/mellanox/mlx4_vnic/vnic_utils.h [new file with mode: 0644]

diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/Makefile b/drivers/net/ethernet/mellanox/mlx4_vnic/Makefile
new file mode 100644 (file)
index 0000000..09d022a
--- /dev/null
@@ -0,0 +1,9 @@
+obj-$(CONFIG_MLX4_VNIC)        += mlx4_vnic.o
+
+mlx4_vnic-y :=  vnic_data_main.o    vnic_data_ib.o    vnic_data_netdev.o    vnic_data_neigh.o   \
+               vnic_data_fs.o      vnic_data_tx.o    vnic_data_ethtool.o   vnic_data_rx.o      \
+               vnic_fip_main.o     vnic_fip_ib.o     vnic_fip_discover.o   vnic_fip_pkt.o      \
+               vnic_fip_login.o    vnic_fip_vhub.o   vnic_mcast.o          vnic_port.o         \
+               vnic_param.o        vnic_qp.o         vnic_main.o           fip_parser.o        \
+               vnic_data_mac.o
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/doc/gw_state.dot b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/gw_state.dot
new file mode 100644 (file)
index 0000000..44f5956
--- /dev/null
@@ -0,0 +1,5 @@
+digraph {
+       FIP_GW_HOST_ADMIN;
+       FIP_GW_MCAST_RCVD;
+       FIP_GW_CONNECTED;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/doc/regndev.dot b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/regndev.dot
new file mode 100644 (file)
index 0000000..ea10aba
--- /dev/null
@@ -0,0 +1,54 @@
+digraph {
+
+       vnic_login_create_1 -> register_netdev; //
+       __vnic_login_create -> vnic_login_create_1; //
+       vnic_new_intf_store -> __vnic_login_create; //
+       vnic_port_data_init -> __vnic_login_create; //
+       vnic_ib_dev_add_one -> vnic_port_data_init; //
+       fip_vnic_login_create -> vnic_login_create_1; //
+       fip_vnic_test_login -> fip_vnic_login_create [label="login_wq", color=blue]; //
+       fip_vnic_destroy -> fip_vnic_test_login; //
+       fip_purge_vnics -> fip_vnic_destroy; //
+       fip_purge_vnics -> fip_purge_vnics [label="fip_wq", color=blue]; //
+       fip_vnic_close -> fip_purge_vnics [label="fip_wq", color=blue];
+       fip_vnic_hadmin_init -> fip_vnic_test_login; //
+       fip_gw_update_hadmin_gw -> fip_vnic_hadmin_init; //
+       fip_discover_hadmin_update -> fip_gw_update_hadmin_gw; //
+       fip_hadmin_sysfs_update -> fip_discover_hadmin_update [label="fip_wq", color=blue]; //
+       fip_vnic_fsm -> fip_vnic_test_login; //
+       fip_gw_create_vnics -> fip_vnic_fsm; //
+
+
+       fip_gw_update_hadmin_gw -> fip_vnic_fsm;
+       fip_vnic_login_ack_recv -> fip_vnic_fsm; //
+       fip_discover_rx_packet_bh -> fip_vnic_login_ack_recv;
+       fip_vnic_tbl_done -> fip_vnic_fsm; //
+       vhub_handle_tbl -> fip_vnic_tbl_done; //
+       fip_vnic_recv_bh -> vhub_handle_tbl; //
+       fip_vnic_recv -> fip_vnic_recv_bh [label="fip_wq", color=blue]; //
+       fip_vnic_comp -> fip_vnic_recv;
+
+       fip_discover_rx_advertise_bh -> fip_discover_gw_fsm;
+
+       fip_hadmin_vnic_refresh -> fip_vnic_fsm; //
+       fip_gw_create_vnics -> fip_hadmin_vnic_refresh //
+       fip_gw_modified -> fip_gw_create_vnics; //
+       fip_discover_rx_advertise_bh -> fip_gw_modified; //
+       fip_discover_rx_packet_bh -> fip_discover_rx_advertise_bh; //
+       fip_discover_process_rx_bh -> fip_discover_rx_packet_bh; //
+       fip_discover_process_rx -> fip_discover_process_rx_bh [label="fip_wq", color=blue]; //
+       fip_discover_comp -> fip_discover_process_rx;
+
+
+
+       fip_discover_rx_advertise_bh -> fip_gw_create_vnics;
+       fip_discover_gw_fsm -> fip_gw_create_vnics;
+
+       vnic_login_pre_create_1 -> vnic_alloc_netdev; //
+       __vnic_login_create -> vnic_login_pre_create_1;
+       fip_vnic_hadmin_init -> vnic_login_pre_create_1;
+       fip_vnic_login_init -> vnic_login_pre_create_1;
+       fip_vnic_fsm -> fip_vnic_login_init;
+       
+
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_flush.dot b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_flush.dot
new file mode 100644 (file)
index 0000000..fc2a8fd
--- /dev/null
@@ -0,0 +1,5 @@
+digraph {
+       -> FIP_NO_FLUSH [label="fip_vnic_alloc"];
+       FIP_PARTIAL_FLUSH;
+       FIP_FULL_FLUSH;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_state.dot b/drivers/net/ethernet/mellanox/mlx4_vnic/doc/vnic_state.dot
new file mode 100644 (file)
index 0000000..6adcd89
--- /dev/null
@@ -0,0 +1,15 @@
+digraph {
+       FIP_VNIC_CLOSED;
+       fip_vnic_alloc [shape=regular];
+       fip_vnic_alloc -> FIP_VNIC_HADMIN_IDLE [label="hadmin"];
+       fip_vnic_alloc -> FIP_VNIC_LOGIN [label="none hadmin"];
+       FIP_VNIC_WAIT_4_ACK;
+       FIP_VNIC_RINGS_INIT;
+       FIP_VNIC_MCAST_INIT;
+       FIP_VNIC_MCAST_INIT_DONE;
+       FIP_VNIC_VHUB_INIT;
+       FIP_VNIC_VHUB_INIT_DONE;
+       FIP_VNIC_VHUB_DONE;
+       FIP_VNIC_VHUB_WRITE;
+       FIP_VNIC_CONNECTED;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/fip_parser.c b/drivers/net/ethernet/mellanox/mlx4_vnic/fip_parser.c
new file mode 100644 (file)
index 0000000..e178299
--- /dev/null
@@ -0,0 +1,510 @@
+/*
+ * Copyright (c) 2010 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_pkt.h"
+
+static const struct subcode_rules {
+       u64     req_mask;
+       u64     opt_mask;
+} subcodes_array[FIP_MAX_SUBCODES] = {
+       [FIP_HOST_SOL_SUB_OPCODE] = {
+               .req_mask = FIP_MASK(VENDOR_ID) |
+                           FIP_MASK(ADDRESS),
+               .opt_mask = FIP_MASK(EXT_DESC),
+       },
+       [FIP_GW_ADV_SUB_OPCODE] = {
+               .req_mask = FIP_MASK(VENDOR_ID) |
+                           FIP_MASK(ADDRESS) |
+                           FIP_MASK(GW_INFORMATION) |
+                           FIP_MASK(GW_IDENTIFIER) |
+                           FIP_MASK(KA_PARAMS),
+               .opt_mask = FIP_MASK(EXT_DESC),
+       },
+       [FIP_HOST_LOGIN_SUB_OPCODE] = {
+               .req_mask = FIP_MASK(VENDOR_ID) |
+                           FIP_MASK(ADDRESS) |
+                           FIP_MASK(LOGIN) |
+                           FIP_MASK(PARTITION),
+               .opt_mask = FIP_MASK(EXT_DESC),
+       },
+       [FIP_GW_LOGIN_SUB_OPCODE] = {
+               .req_mask = FIP_MASK(VENDOR_ID) |
+                           FIP_MASK(ADDRESS) |
+                           FIP_MASK(LOGIN) |
+                           FIP_MASK(PARTITION),
+               .opt_mask = FIP_MASK(EXT_DESC),
+       },
+       [FIP_HOST_LOGOUT_SUB_OPCODE] = {
+               .req_mask = FIP_MASK(VENDOR_ID) |
+                           FIP_MASK(VNIC_IDENTITY),
+       },
+       [FIP_GW_UPDATE_SUB_OPCODE] = {
+               .req_mask = FIP_MASK(VENDOR_ID) |
+                           FIP_MASK(VHUB_UPDATE),
+               .opt_mask = FIP_MASK(EXT_DESC),
+       },
+       [FIP_GW_TABLE_SUB_OPCODE] = {
+               .req_mask = FIP_MASK(VENDOR_ID) |
+                           FIP_MASK(VHUB_TABLE),
+       },
+       [FIP_HOST_ALIVE_SUB_OPCODE] = {
+               .req_mask = FIP_MASK(VENDOR_ID) |
+                           FIP_MASK(VNIC_IDENTITY),
+       },
+};
+
+static int type2idx(struct fip_content *fc, struct fip_fip_type *ft)
+{
+       void *p = ft;
+
+       switch (ft->type) {
+       case FIP_TYPE(VENDOR_ID):
+               fc->fvend = p;
+               return FIP_TYPE_IDX(VENDOR_ID);
+       case FIP_TYPE(ADDRESS):
+               fc->fa.fa[fc->fa.num++] = p;
+               return FIP_TYPE_IDX(ADDRESS);
+       case FIP_TYPE(GW_INFORMATION):
+               fc->fgwi = p;
+               return FIP_TYPE_IDX(GW_INFORMATION);
+       case FIP_TYPE(LOGIN):
+               fc->fl = p;
+               return FIP_TYPE_IDX(LOGIN);
+       case FIP_TYPE(VHUB_UPDATE):
+               fc->fvu = p;
+               return FIP_TYPE_IDX(VHUB_UPDATE);
+       case FIP_TYPE(VHUB_TABLE):
+               fc->fvt = p;
+               return FIP_TYPE_IDX(VHUB_TABLE);
+       case FIP_TYPE(VNIC_IDENTITY):
+               fc->fvi = p;
+               return FIP_TYPE_IDX(VNIC_IDENTITY);
+       case FIP_TYPE(PARTITION):
+               fc->fp = p;
+               return FIP_TYPE_IDX(PARTITION);
+       case FIP_TYPE(GW_IDENTIFIER):
+               fc->fgid = p;
+               return FIP_TYPE_IDX(GW_IDENTIFIER);
+       case FIP_TYPE(KA_PARAMS):
+               fc->fka = p;
+               return FIP_TYPE_IDX(KA_PARAMS);
+       case FIP_TYPE(EXT_DESC):
+               fc->fed.fed[fc->fed.num++] = p;
+               return FIP_TYPE_IDX(EXT_DESC);
+       default:
+               return -1;
+       }
+}
+
+#ifdef CONFIG_MLX4_VNIC_DEBUG
+static const char *fip_type_str(int type)
+{
+       switch (type) {
+       FIP_CASE_STR(VENDOR_ID);
+       FIP_CASE_STR(ADDRESS);
+       FIP_CASE_STR(GW_INFORMATION);
+       FIP_CASE_STR(LOGIN);
+       FIP_CASE_STR(VHUB_UPDATE);
+       FIP_CASE_STR(VHUB_TABLE);
+       FIP_CASE_STR(VNIC_IDENTITY);
+       FIP_CASE_STR(PARTITION);
+       FIP_CASE_STR(GW_IDENTIFIER);
+       FIP_CASE_STR(KA_PARAMS);
+       FIP_CASE_STR(EXT_DESC);
+       default:
+               return "Unknown";
+       }
+}
+
+static const char *fip_subcode_str(int subcode)
+{
+       switch (subcode) {
+       FIP_SUBCODE_CASE_STR(FIP_HOST_SOL_SUB_OPCODE);
+       FIP_SUBCODE_CASE_STR(FIP_GW_ADV_SUB_OPCODE);
+       FIP_SUBCODE_CASE_STR(FIP_HOST_LOGIN_SUB_OPCODE);
+       FIP_SUBCODE_CASE_STR(FIP_GW_LOGIN_SUB_OPCODE);
+       FIP_SUBCODE_CASE_STR(FIP_HOST_LOGOUT_SUB_OPCODE);
+       FIP_SUBCODE_CASE_STR(FIP_GW_UPDATE_SUB_OPCODE);
+       FIP_SUBCODE_CASE_STR(FIP_GW_TABLE_SUB_OPCODE);
+       FIP_SUBCODE_CASE_STR(FIP_HOST_ALIVE_SUB_OPCODE);
+       default:
+               return "Unknown";
+       }
+}
+#endif
+
+static int verify_mlx_sig(void *p)
+{
+       static const char *mlx4_str = "mellanox";
+       __be64 mlx_str_64 = *(__be64 *)mlx4_str;
+       __be64 *sig = p;
+
+       return *sig != mlx_str_64;
+}
+
+static int next_type(struct vnic_port *port, void *tlv, int len,
+                    struct fip_content *fc, int *sz, int *idx)
+{
+        struct fip_fip_type *ft;
+
+       if (sizeof *ft > len) {
+               vnic_dbg_parse(port->name, "message too short\n");
+               return -1;
+       }
+       ft = tlv;
+
+        vnic_dbg_parse(port->name, "TLV: type %s(%d)\n", fip_type_str(ft->type),
+                    ft->type);
+
+       if (!ft->length || (ft->length << 2 > len)) {
+               vnic_dbg_parse(port->name, "TLV does not fit in message: %s(%d) "
+                            "tlv->len %d, remaining %d\n", fip_type_str(ft->type),
+                            ft->type, ft->length << 2, len);
+               return -1;
+       }
+
+       *sz = (ft->length << 2);
+
+       *idx = type2idx(fc, ft);
+       if (*idx < 0) {
+               vnic_dbg_parse(port->name, "unkown type %d\n", ft->type);
+               return -1;
+       }
+
+       if (ft->type == FIP_TYPE(VENDOR_ID) && verify_mlx_sig(fc->fvend->vendor_id)) {
+                vnic_dbg_parse(port->name, "mellanox signature check failed\n");
+               return -1;
+       }
+
+        if (ft->type == FIP_TYPE(VHUB_TABLE) || ft->type == FIP_TYPE(VHUB_UPDATE)) {
+               int cte_list_sz;
+               struct context_table_entry *cte_start;
+
+               if (ft->type == FIP_TYPE(VHUB_TABLE)) {
+                       unsigned hdr = be16_to_cpu(fc->fvt->hdr) >> 14;
+
+                       if (hdr > FIP_TABLE_HDR_ONLY) {
+                               vnic_dbg_parse(port->name, "invalid table header %d\n", hdr);
+                               return -1;
+                       }
+                       cte_list_sz = *sz - sizeof(struct fip_vhub_table_tlv);
+                       /* TODO: the next 2 lines are commented out because the size of the tbl tlv is
+                          miscomputed in BXM versions 1.3.6-5 and it causes tables to be discarded.
+                          In reality the size should be used with the lines intact. */
+                       /*if (hdr == FIP_TABLE_HDR_LAST)
+                               cte_list_sz -= 4;
+                       */
+
+                       cte_start = (struct context_table_entry *)(fc->fvt + 1);
+               } else {
+                       cte_list_sz = *sz - sizeof(struct fip_vhub_update_tlv);
+                       cte_start = (struct context_table_entry *)(fc->fvu + 1);
+               }
+
+
+               fc->cte.num = cte_list_sz / sizeof(struct context_table_entry);
+               fc->cte.cte = cte_start;
+       }
+
+
+       return 0;
+}
+
+static inline int check_eoib_ver(struct vnic_port *port,
+                                struct fip_eoib_ver *eoib_ver, int sz, int *len)
+{
+       if (unlikely(sz < sizeof *eoib_ver)) {
+               vnic_dbg_parse(port->name, "message too short\n");
+               *len = sz;
+               return -ENOMEM;
+       }
+       *len = sizeof *eoib_ver;
+       if (unlikely(eoib_ver->version >> 4)) {
+               vnic_dbg_parse(port->name, "eoib version check failed: %d\n", eoib_ver->version >> 4);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+static void dump_raw(struct vnic_port *port, void *buf, int len)
+{
+       int i;
+
+       for (i = 0; i < len / 4; ++i)
+               vnic_dbg_parse(port->name, "0x%08x\n", be32_to_cpu(((__be32 *)(buf))[i]));
+}
+
+static inline int check_fip_hdr(struct vnic_port *port,
+                               struct fip_header_simple *fh, int sz, int *len)
+{
+       if (unlikely(sizeof *fh > sz)) {
+               vnic_dbg_parse(port->name, "message too short\n");
+               return -1;
+       }
+
+       if (unlikely(fh->opcode != cpu_to_be16(EOIB_FIP_OPCODE))) {
+               vnic_dbg_parse(port->name, "not fip opcode\n");
+               return -1;
+       }
+
+       if (unlikely((be16_to_cpu(fh->list_length) << 2) > (sz - sizeof *fh))) {
+               vnic_dbg_parse(port->name, "message too short: header length = %u, "
+                              "left length = %lu\n",
+                              be16_to_cpu(fh->list_length) << 2, sz - sizeof *fh);
+               return -1;
+       }
+
+        *len = sizeof *fh;
+
+       return 0;
+}
+
+static int check_fip_mask(struct vnic_port *port, struct fip_content *fc)
+{
+       u64 req_mask = subcodes_array[fc->fh->subcode].req_mask;
+       u64 opt_mask = subcodes_array[fc->fh->subcode].opt_mask;
+
+       if (((fc->mask & req_mask) != req_mask) ||
+           ((fc->mask & ~opt_mask) & ~req_mask)) {
+               vnic_dbg_parse(port->name, "%s: mask check failed: mask 0x%llx,"
+                            "req_mask 0x%llx, opt_mask 0x%llx\n",
+                            fip_subcode_str(fc->fh->subcode), fc->mask, req_mask, opt_mask);
+               return -1;
+       }
+
+       return 0;
+}
+
+static void dump_cte(struct vnic_port *port, struct context_table_entry *cte)
+{
+        vnic_dbg_parse(port->name, "CTE: V(%d) RSS(%d) type(%d) MAC(%pM) QPN(0x%06x) SL(%d) LID(0x%04x)\n",
+                      (0x1 & (cte->v_rss_type >> 7)),
+                      (0x1 & (cte->v_rss_type >> 6)),
+                      (cte->v_rss_type & 0xf),
+                      cte->mac, be32_to_cpu(cte->qpn) & 0xffffff,
+                      (cte->sl & 0xf), be16_to_cpu(cte->lid));
+}
+
+static void dump_vnic_identity(struct vnic_port *port,
+                              struct fip_vnic_identity_tlv *fvi)
+{
+#define VHUB_ID        be32_to_cpu(fvi->flags_vhub_id)
+
+        vnic_dbg_parse(port->name, "%s: U(%d) R(%d) VP(%d) VHUBID(x%x) TUSN(0x%x) VNIC_ID(0x%x)"
+                      "MAC(%pM) GUID("GUID_FORMAT") VNIC NAME (%s)\n",
+                      fip_type_str(fvi->ft.type), (VHUB_ID >> 31), (0x01 & (VHUB_ID >> 30)),
+                      (0x01 & (VHUB_ID >> 24)), VHUB_ID & 0xffffff, be32_to_cpu(fvi->tusn),
+                      be16_to_cpu(fvi->vnic_id), fvi->mac, GUID_ARG(fvi->port_guid), fvi->vnic_name);
+}
+
+static void dump_vnic_partition(struct vnic_port *port, struct fip_partition_tlv *fp)
+{
+       vnic_dbg_parse(port->name, "%s: PKEY(0x%x)\n", fip_type_str(fp->ft.type),
+                      be16_to_cpu(fp->pkey));
+}
+
+
+static void dump_gw_identifier(struct vnic_port *port, struct fip_gw_identifier_tlv *fgid)
+{
+       vnic_dbg_parse(port->name, "%s: SYS GUID("GUID_FORMAT") SYS NAME(%s) GW PORT NAME(%s)\n",
+                    fip_type_str(fgid->ft.type), GUID_ARG(fgid->sys_guid), fgid->sys_name, fgid->sys_name);
+}
+
+static void dump_ka_params(struct vnic_port *port, struct fip_ka_params_tlv *fka)
+{
+       vnic_dbg_parse(port->name, "%s: GW_ADV_PERIOD(%d) GW_KA_PERIOD(%d) VNIC_KA_PERIOD(%d)\n",
+                      fip_type_str(fka->ft.type), be32_to_cpu(fka->adv_period),
+                      be32_to_cpu(fka->ka_period), be32_to_cpu(fka->vnic_ka_period));
+}
+
+static void dump_vhub_table(struct vnic_port *port, struct fip_content *fc)
+{
+       int i;
+
+       vnic_dbg_parse(port->name, "%s: VP(%d) vhub id(0x%x) TUSN(0x%x) HDR(%d) table size (%d)\n",
+                      fip_type_str(fc->fvt->ft.type), be32_to_cpu(fc->fvt->vp_vhub_id) >> 24 & 1,
+                      be32_to_cpu(fc->fvt->vp_vhub_id) & 0xffffff, be32_to_cpu(fc->fvt->tusn),
+                      be16_to_cpu(fc->fvt->hdr) >> 14, be16_to_cpu(fc->fvt->table_size));
+       for (i = 0; i < fc->cte.num; ++i)
+               dump_cte(port, &fc->cte.cte[i]);
+}
+
+static void dump_fip_login(struct vnic_port *port, struct fip_login_tlv *p)
+{
+       vnic_dbg_parse(port->name, "%s: mtu(%d) vnic_id(0x%x) v_m_vp_h(0x%x) vlan(0x%x) mac(%pM)"
+                      "mgid_prefix("MGID_PREFIX_FMT") vfields(0x%0x) syndrom(%d) QPN(0x%x)"
+                      " vnic_name(%s)\n", fip_type_str(p->ft.type), be16_to_cpu(p->mtu),
+                      be16_to_cpu(p->vnic_id), be16_to_cpu(p->flags_vlan) >> 12,
+                      be16_to_cpu(p->flags_vlan) & 0xfff, p->mac, MGID_PRE_ARG(p->eth_gid_prefix),
+                      be16_to_cpu(p->vfields), be32_to_cpu(p->syndrom_ctrl_qpn) >> 24,
+                      be32_to_cpu(p->syndrom_ctrl_qpn) & 0xffffff, p->vnic_name);
+}
+
+static void dump_fip_address(struct vnic_port *port, struct fip_address_tlv *fa)
+{
+       vnic_dbg_parse(port->name, "%s: GW_TYPE(%d) QPN(0x%x)  SL(%d), GW_PORT_ID(0x%x),"
+                      " LID(0x%x) GUID(" GUID_FORMAT ")\n", fip_type_str(fa->ft.type),
+                      be32_to_cpu(fa->gwtype_qpn) >> 24, be32_to_cpu(fa->gwtype_qpn) & 0xffffff,
+                      be16_to_cpu(fa->sl_gwportid) >> 12, be16_to_cpu(fa->sl_gwportid) & 0xfff,
+                      be16_to_cpu(fa->lid), GUID_ARG(fa->guid));
+}
+
+static void dump_vhub_update(struct vnic_port *port, struct fip_content *fc)
+{
+#define VHUB_ID_1      be32_to_cpu(fc->fvu->state_vhub_id)
+       int i;
+
+       vnic_dbg_parse((port->name), "%s: eport_state(%s) vp(%d) vhub_id(0x%x) tusn(0x%x)\n",
+                      fip_type_str(fc->fvu->ft.type), eport_state_str(VHUB_ID_1 >> 28 & 3),
+                      VHUB_ID_1 >> 24 & 1, VHUB_ID_1 & 0xffffff, be32_to_cpu(fc->fvu->tusn));
+       for (i = 0; i < fc->cte.num; ++i)
+               dump_cte(port, &fc->cte.cte[i]);
+}
+
+static void dump_gateway_information(struct vnic_port *port,
+                                    struct fip_gw_information_tlv *fgwi)
+{
+       vnic_dbg_parse(port->name, "%s: accept host administered(%s) nmac_mgid(%d) "
+                      "nrss_mgid(%d) ntss_qpn(%d), n_rss(%d), num_net_vnics(%d)\n",
+                      fip_type_str(fgwi->ft.type), (fgwi->h_nmac_mgid >> 7) ? "Yes" : "No",
+                      fgwi->h_nmac_mgid & 0x3f, fgwi->n_rss_mgid_tss_qpn >> 4,
+                      fgwi->n_rss_mgid_tss_qpn & 0xf, be16_to_cpu(fgwi->n_rss_qpn_vnics) >> 12,
+                      be16_to_cpu(fgwi->n_rss_qpn_vnics) & 0xfff);
+}
+
+static void dump_fip_packet(struct vnic_port *port, struct fip_content *fc)
+{
+       int i;
+
+       for (i = 0; i < fc->fa.num; ++i)
+               dump_fip_address(port, fc->fa.fa[i]);
+
+       if (fc->fgwi)
+               dump_gateway_information(port, fc->fgwi);
+
+       if (fc->fvu)
+               dump_vhub_update(port, fc);
+
+       if (fc->fl)
+               dump_fip_login(port, fc->fl);
+
+       if (fc->fvt)
+               dump_vhub_table(port, fc);
+
+       if (fc->fvi)
+               dump_vnic_identity(port, fc->fvi);
+
+       if (fc->fp)
+               dump_vnic_partition(port, fc->fp);
+
+       if (fc->fgid)
+                dump_gw_identifier(port, fc->fgid);
+
+       if (fc->fka)
+                dump_ka_params(port, fc->fka);
+}
+
+int fip_packet_parse(struct vnic_port *port, void *packet, int pkt_size, struct fip_content *fc)
+{
+       void *ptr = packet;
+       int len;
+       int err;
+       int idx;
+       u16 offset = 0;
+       int size = pkt_size;
+
+       vnic_dbg_parse(port->name, "size = %d\n", size);
+       err = check_eoib_ver(port, ptr, size, &len);
+       if (err) {
+               if (err != -EINVAL)
+                       goto out_err;
+               else
+                       vnic_dbg_parse(port->name, "version check failed\n");
+       }
+
+       fc->eoib_ver = ptr;
+       size -= len;
+       ptr += len;
+       offset += len;
+       fc->fh = ptr;
+
+       err = check_fip_hdr(port, ptr, size, &len);
+       if (err)
+               goto out_err;
+
+       ptr += len;
+       offset += len;
+
+       fc->fa.num = 0;
+       fc->num = 0;
+       fc->mask = 0;
+
+       /* workaround a BXM bug not reporting the correct descriptor length */
+       if (fc->fh->subcode != FIP_GW_ADV_SUB_OPCODE)
+               size = be16_to_cpu(fc->fh->list_length) << 2;
+       else
+               size -= len;
+
+       vnic_dbg_parse(port->name, "subcode = %s, size %d\n",
+                    fip_subcode_str(fc->fh->subcode), size);
+       while (size > 0) {
+               err = next_type(port, ptr, size, fc, &len, &idx);
+               if (err)
+                       break;
+
+               fc->offsets[fc->num] = offset;
+               fc->mask |= ((u64)1 << idx);
+               ptr += len;
+               size -= len;
+               offset += len;
+               fc->num++;
+       }
+
+       if (err)
+               goto out_err;
+
+       err = check_fip_mask(port, fc);
+       if (err) {
+               vnic_dbg_parse(port->name, "check mask: failed\n");
+               goto out_err;
+       }
+
+       dump_fip_packet(port, fc);
+
+       return 0;
+
+out_err:
+       dump_raw(port, packet, pkt_size);
+       return err;
+}
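
As a usage illustration (not part of this patch), a caller could drive
fip_packet_parse() roughly as sketched below; the handler name is
hypothetical, while struct fip_content, fip_packet_parse() and the
fc.fh/fc.fa fields are the ones defined and filled in by the parser
above:

    /* hypothetical caller: parse a received FIP control packet and
     * walk the address descriptors it carried */
    static void example_handle_fip_pkt(struct vnic_port *port,
                                       void *pkt, int len)
    {
            struct fip_content fc;
            int i;

            if (fip_packet_parse(port, pkt, len, &fc))
                    return; /* malformed packet was already dumped raw */

            /* fc.fh->subcode identifies the message; its required and
             * optional descriptors were validated by check_fip_mask() */
            for (i = 0; i < fc.fa.num; ++i)
                    vnic_dbg_parse(port->name, "address tlv %d: qpn 0x%x\n", i,
                                   be32_to_cpu(fc.fa.fa[i]->gwtype_qpn) & 0xffffff);
    }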
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic.h
new file mode 100644 (file)
index 0000000..04a5e83
--- /dev/null
@@ -0,0 +1,1437 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef VNIC_H
+#define VNIC_H
+
+#include <linux/netdevice.h>
+#include <linux/ethtool.h>
+#include <linux/if_vlan.h>
+#include <linux/if_arp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/ktime.h>
+#include <linux/hrtimer.h>
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#include <linux/inet_lro.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/rwsem.h>
+#include <linux/vmalloc.h>
+#include <linux/version.h>
+#include <net/dst.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_sa.h>
+
+/* for mlx4_ib dev attr, used also in vnic_qp.c */
+#include "../../../../infiniband/hw/mlx4/mlx4_ib.h"
+#include "../../../../infiniband/hw/mlx4/user.h"
+
+#include "vnic_utils.h"
+
+/* driver info definition */
+#define DRV_NAME  "mlx4_vnic"
+#define DRV_VER   "1.4.0"
+#define DRV_LIC   "Dual BSD/GPL"
+#define DRV_DESC  "Mellanox BridgeX Virtual NIC Driver"
+#define DRV_AUTH  "Ali Ayoub & Gabi Liron"
+
+/* backports */
+
+/* for kernel >= 3.17 */
+#define alloc_netdev_mqs(a, b, c, d, e) alloc_netdev_mqs(a, b, NET_NAME_UNKNOWN, c, d, e)
+
+#ifdef alloc_netdev_mq
+#undef alloc_netdev_mq
+#define alloc_netdev_mq(sizeof_priv, name, setup, count) \
+    alloc_netdev_mqs(sizeof_priv, name, setup, count, count)
+#endif
+
+#ifndef SET_ETHTOOL_OPS
+#define SET_ETHTOOL_OPS(netdev,ops) \
+    ( (netdev)->ethtool_ops = (ops) )
+#endif
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,35))
+#define _BP_NO_MC_LIST
+
+// Not sure this should be here; at least this is OK for 2.6.39
+#define _BP_NO_ATT_OWNER
+#endif
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0))
+#define _BP_NO_GRO
+#endif
+
+#ifndef NETIF_F_HW_VLAN_FILTER
+#define NETIF_F_HW_VLAN_FILTER NETIF_F_HW_VLAN_CTAG_FILTER
+#endif
+
+/* externs */
+extern u32 vnic_msglvl;
+extern u32 vnic_max_tx_outs;
+extern u32 vnic_lro_num;
+extern u32 vnic_mcast_create;
+extern u32 vnic_net_admin;
+extern u32 vnic_child_max;
+extern u32 vnic_napi_weight;
+extern u32 vnic_linear_small_pkt;
+extern u32 vnic_tx_rings_num;
+extern u32 vnic_rx_rings_num;
+extern u32 vnic_tx_rings_len;
+extern u32 vnic_rx_rings_len;
+extern u32 vnic_mgid_data_type;
+extern u32 vnic_encap_headroom;
+extern u32 vnic_tx_polling;
+extern u32 vnic_rx_linear;
+extern u32 vnic_change_mac;
+extern u32 vnic_learn_mac_enabled;
+extern u32 vnic_synd_backlog;
+extern u32 vnic_eport_state_enforce;
+extern u32 vnic_src_mac_enforce;
+extern u32 vnic_inline_tshold;
+
+#define MAX_NUM_PKEYS_DISCOVERY        (24)
+#define ILLEGAL_PKEY_INDEX     (0xFFFF)
+extern u32 vnic_discovery_pkeys[MAX_NUM_PKEYS_DISCOVERY];
+extern u32 vnic_discovery_pkeys_count;
+extern u32 vnic_sa_query;
+
+
+extern u32 no_bxm;
+
+extern struct workqueue_struct *port_wq;
+extern struct workqueue_struct *fip_wq;
+extern struct workqueue_struct *mcast_wq;
+extern struct workqueue_struct *login_wq;
+
+extern struct ib_sa_client vnic_sa_client;
+
+/* definitions */
+#define EOIB_SERVICE_ID ((0x10ULL << 56) | (0x0002C9E01B0000ULL))
+#define EOIB_CTRL_SERVICE_ID (EOIB_SERVICE_ID | 0x00FFULL)
+#define VNIC_SKB_QUEUE_LEN     32
+#define VNIC_CNT_MAX           32
+#define VNIC_DESC_LEN          (64 + 4)
+#define VNIC_NAME_LEN          16 /* by spec, use IFNAMSIZ for OS */
+#define VNIC_SYSFS_FLEN                (VNIC_NAME_LEN * 2) /* SYSFS file name len, allow pre/suffix (32)*/
+#define VNIC_SYSFS_LLEN                64
+#define VNIC_VENDOR_LEN                8
+#define GID_LEN                        16
+#define GUID_LEN               8
+#define IPV4_LEN               4
+#define IPV6_LEN               16
+#define VNIC_SYSTEM_NAME_LEN   32
+#define VNIC_GW_PORT_NAME_LEN  8
+#define GID_PREFIX_LEN         5
+#define VNIC_MAX_DENTRIES      16
+#define VNIC_ID_LEN            16
+#define VNIC_CHILD_MAX         128
+#define VNIC_MAX_RETRIES       0 /* zero = unlimited */
+#define VNIC_WATCHDOG_TIMEOUT  (25 * HZ) /* 25 sec */
+#define VNIC_NAPI_SCHED_TIMEOUT (5)
+#define FIP_MAX_VNICS_PER_GW   (1 << 9)
+#define NOT_AVAILABLE_NUM      (-1)
+#define NOT_AVAILABLE_STRING   "N/A"
+#define is_valid_str(str)      (strcmp(str, NOT_AVAILABLE_STRING))
+#define is_valid_num(num)      (num != NOT_AVAILABLE_NUM)
+#define is_valid_guid(arr)     (!!(*((u64 *)(arr))))
+#define is_valid_ipv4(arr)     (!!(*((u32 *)(arr))))
+#define is_mcast_promisc(login)        (!(login->n_mac_mcgid))
+#define is_ucast_promisc(login) (!!(login->dev->flags & IFF_PROMISC))
+#define ARRAY_LEN(_x)          (sizeof(_x)/sizeof(_x[0]))
+
+/* TODO: cleanup VNIC_GID_RAW_ARG and friends */
+#define VNIC_GID_RAW_ARG(gid)  ((u8 *)(gid))[0], \
+                               ((u8 *)(gid))[1], \
+                               ((u8 *)(gid))[2], \
+                               ((u8 *)(gid))[3], \
+                               ((u8 *)(gid))[4], \
+                               ((u8 *)(gid))[5], \
+                               ((u8 *)(gid))[6], \
+                               ((u8 *)(gid))[7], \
+                               ((u8 *)(gid))[8], \
+                               ((u8 *)(gid))[9], \
+                               ((u8 *)(gid))[10],\
+                               ((u8 *)(gid))[11],\
+                               ((u8 *)(gid))[12],\
+                               ((u8 *)(gid))[13],\
+                               ((u8 *)(gid))[14],\
+                               ((u8 *)(gid))[15]
+#define VNIC_GUID_RAW_ARG(gid) ((u8 *)(gid))[0], \
+                               ((u8 *)(gid))[1], \
+                               ((u8 *)(gid))[2], \
+                               ((u8 *)(gid))[3], \
+                               ((u8 *)(gid))[4], \
+                               ((u8 *)(gid))[5], \
+                               ((u8 *)(gid))[6], \
+                               ((u8 *)(gid))[7]
+
+#define VNIC_GID_ARG(gid)      VNIC_GID_RAW_ARG((gid).raw)
+#define VNIC_GID_FMT           "%.2x:%.2x:%.2x:%.2x:" \
+                               "%.2x:%.2x:%.2x:%.2x:" \
+                               "%.2x:%.2x:%.2x:%.2x:" \
+                               "%.2x:%.2x:%.2x:%.2x"
+#define VNIC_GUID_FMT          "%.2x:%.2x:%.2x:%.2x:" \
+                               "%.2x:%.2x:%.2x:%.2x"
+
+#define MAC_6_PRINT_FMT                "%.2x:%.2x:%.2x:%.2x:" \
+                               "%.2x:%.2x"
+#define MAC_6_PRINT_ARG(mac)   (mac)[0], (mac)[1], (mac)[2], \
+                               (mac)[3], (mac)[4], (mac)[5]
+
+#define IP_4_PRINT_FMT         "%d.%d.%d.%d"
+#define IP_4_PRINT_ARG(ip)     (ip)[0], (ip)[1], (ip)[2], (ip)[3]
+
+#define CREATE_VHUB_ID(be_vlan, port_id) \
+       ((be16_to_cpu(be_vlan) & 0xFFF) | (((port_id) & 0xFFF) << 12))
+#define CREATE_VHUB_ID_BE(vlan, port_id) \
+       cpu_to_be32(CREATE_VHUB_ID(vlan, port_id))
+#define ROUNDUP_LOG2(x)                ilog2(roundup_pow_of_two(x))
+
+#define VNIC_RX_COAL_TARGET    0x20000
+#define VNIC_RX_COAL_TIME      0x10
+#define VNIC_TX_COAL_PKTS      64
+#define VNIC_TX_COAL_TIME      0x80
+#define VNIC_RX_RATE_LOW       400000
+#define VNIC_RX_COAL_TIME_LOW  0
+#define VNIC_RX_RATE_HIGH      450000
+#define VNIC_RX_COAL_TIME_HIGH 128
+#define VNIC_RX_SIZE_THRESH    1024
+#define VNIC_RX_RATE_THRESH    (1000000 / VNIC_RX_COAL_TIME_HIGH)
+#define VNIC_SAMPLE_INTERVAL   0
+#define VNIC_AVG_PKT_SMALL     256
+#define VNIC_AUTO_CONF         0xffff
+#define VNIC_MCAST_MAX_RETRY   60
+#define VNIC_MCAST_ULIMIT_RETRY        0
+#define VNIC_MCAST_BACKOF_FAC  2
+#define MLX4_DEV_CAP_FLAG_UD_SWP (1 << 28)
+#define VNIC_ETHTOOL_LINE_MAX  32
+#define VNIC_ENCAP_LEN         4
+#define VNIC_MAX_TX_SIZE       2048
+#define VNIC_MAX_RX_SIZE       4096
+#define ETH_LLC_SNAP_SIZE      8
+
+#define VNIC_SM_HEADSTART                      250 /* msecs to actually start handling SM events */
+#define VNIC_MCAST_BACKOFF_MSEC                1000
+#define VNIC_MCAST_BACKOFF_MAX_MSEC    16000
+
+#define SYSFS_VLAN_ID_NO_VLAN          (-1)
+
+#define VNIC_MAX_PAYLOAD_SIZE          4096
+#define VNIC_BUF_SIZE(_port)           (min(_port->max_mtu_enum + \
+                                       IB_GRH_BYTES, VNIC_MAX_PAYLOAD_SIZE))
+
+#define VNIC_TX_QUEUE_LEN              1024 /* default, tuneable */
+#define VNIC_TX_QUEUE_LEN_MIN          64
+#define VNIC_TX_QUEUE_LEN_MAX          (8 * 1024)
+
+#define VNIC_RX_QUEUE_LEN              2048 /* default, tuneable */
+#define VNIC_RX_QUEUE_LEN_MIN          64
+#define VNIC_RX_QUEUE_LEN_MAX          (8 * 1024)
+
+
+#define VNIC_MODER_DELAY               (HZ / 4)
+#define VNIC_STATS_DELAY               VNIC_MODER_DELAY
+
+#define VNIC_AH_SL_DEFAULT             0x0
+
+#define VNIC_DATA_QKEY                 0x80020003
+#define VNIC_FIP_QKEY                  0x80020002
+#define VNIC_VLAN_OFFSET(login)                (login->vlan_used ? VLAN_HLEN : 0)
+#define VNIC_VLAN_ENABLED(login)       (login->vlan_used ? 1 : 0)
+#define VNIC_MAX_TX_CQE                        32      /* default, tuneable */
+#define VNIC_MAX_RX_CQE                        64      /* default, tuneable */
+#define VNIC_MAX_NUM_CPUS              32
+#define VNIC_MAX_INLINE_TSHOLD         512
+
+#define VNIC_EOIB_HDR_VER              0x0
+#define VNIC_EOIB_HDR_SIG              0x3
+#define VNIC_EOIB_HDR_UDP_CHK_OK       0x2
+#define VNIC_EOIB_HDR_TCP_CHK_OK       0x1
+#define VNIC_EOIB_HDR_IP_CHK_OK                0x1
+
+#define VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr)     (eoib_hdr->encap_data & 0x3)
+#define VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)        ((eoib_hdr->encap_data >> 2) & 0x3)
+#define VNIC_EOIB_HDR_GET_VER(eoib_hdr)                ((eoib_hdr->encap_data >> 4) & 0x3)
+#define VNIC_EOIB_HDR_GET_SIG(eoib_hdr)        ((eoib_hdr->encap_data >> 6) & 0x3)
+
+#define VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_hdr)  (eoib_hdr->encap_data = \
+                                               (eoib_hdr->encap_data & 0xFC) | VNIC_EOIB_HDR_IP_CHK_OK)
+#define VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \
+                                               (eoib_hdr->encap_data & 0xF3) | (VNIC_EOIB_HDR_TCP_CHK_OK << 2))
+#define VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_hdr) (eoib_hdr->encap_data = \
+                                               (eoib_hdr->encap_data & 0xF3) | (VNIC_EOIB_HDR_UDP_CHK_OK << 2))
+
+#define VNIC_IP_CSUM_OK(eoib_hdr)      ((VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr))  == VNIC_EOIB_HDR_IP_CHK_OK)
+#define VNIC_TCP_CSUM_OK(eoib_hdr)     ((VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_TCP_CHK_OK)
+#define VNIC_UDP_CSUM_OK(eoib_hdr)     ((VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr)) == VNIC_EOIB_HDR_UDP_CHK_OK)
+#define VNIC_CSUM_OK(eoib_hdr)         (VNIC_IP_CSUM_OK(eoib_hdr)  && \
+                                       (VNIC_TCP_CSUM_OK(eoib_hdr) || \
+                                        VNIC_UDP_CSUM_OK(eoib_hdr)))
+#define VNIC_EOIB_ZLEN_MAX             (ETH_ZLEN + VNIC_ENCAP_LEN + VLAN_HLEN)
+
+#define VNIC_SKB_GET_HASH(_skb, _max)  ((*(u32 *)(_skb->cb + sizeof _skb->cb - 4)) % _max)
+#define VNIC_SKB_SET_HASH(_skb, _hash)  ((*(u32 *)(_skb->cb + sizeof _skb->cb - 4)) = _hash)
+#define VNIC_SKB_GET_ENCAP_CB(_skb)    ((struct eoibhdr *)(_skb->cb + sizeof _skb->cb - 12))
+#define VNIC_SKB_GET_ENCAP(_skb)       (vnic_encap_headroom ? (struct eoibhdr *)(_skb->data) : VNIC_SKB_GET_ENCAP_CB(_skb))
+#define VNIC_SKB_GET_ENCAP_OFFSET      (vnic_encap_headroom ? VNIC_ENCAP_LEN :0)
+
+#define VNIC_NEIGH_GET_DQPN(_skb, _neighe) ((_neighe->rss) ? (_neighe->qpn + \
+       VNIC_SKB_GET_HASH(_skb, _neighe->login->qps_num)) : (_neighe->qpn))
+
+#define vnic_netdev_priv(netdev)       (((struct vnic_login_info *)netdev_priv(netdev))->login)
+#ifndef _BP_NETDEV_NO_TMQ /* >= 2.6.27 */
+#define VNIC_TXQ_GET_HASH(_skb, _max)  (skb_get_queue_mapping(_skb))
+#define VNIC_TXQ_ALLOC_NETDEV(sz, nm, sp, qm) alloc_netdev_mq(sz, nm, sp, qm)
+#define VNIC_TXQ_SET_ACTIVE(login, num)        (login->dev->real_num_tx_queues = \
+                                       login->real_tx_rings_num = \
+                                       login->ndo_tx_rings_num = num)
+#define VNIC_TXQ_GET_ACTIVE(login)     (login->real_tx_rings_num)
+#define VNIC_TXQ_GET(tx_res)           netdev_get_tx_queue(tx_res->login->dev, tx_res->index)
+#define VNIC_TXQ_STOP(tx_res)          netif_tx_stop_queue(VNIC_TXQ_GET(tx_res))
+#define VNIC_TXQ_STOP_ALL(login)       netif_tx_stop_all_queues(login->dev)
+#define VNIC_TXQ_START(tx_res)         netif_tx_start_queue(VNIC_TXQ_GET(tx_res))
+#define VNIC_TXQ_START_ALL(login)      netif_tx_start_all_queues(login->dev)
+#define VNIC_TXQ_STOPPED(tx_res)       netif_tx_queue_stopped(VNIC_TXQ_GET(tx_res))
+#define VNIC_TXQ_WAKE(tx_res)          netif_tx_wake_queue(VNIC_TXQ_GET(tx_res))
+#else
+#define VNIC_TXQ_GET_HASH(skb, _max)   VNIC_SKB_GET_HASH(skb, _max)
+#define VNIC_TXQ_ALLOC_NETDEV(sz, nm, sp, qm) alloc_netdev(sz, nm, sp)
+#define VNIC_TXQ_SET_ACTIVE(login, num)        do { login->real_tx_rings_num = num; \
+                                            login->ndo_tx_rings_num = 1;    \
+                                       } while (0)
+#define VNIC_TXQ_GET_ACTIVE(login)     (login->real_tx_rings_num)
+#define VNIC_TXQ_STOP(tx_res)          netif_stop_queue(tx_res->login->dev)
+#define VNIC_TXQ_STOP_ALL(login)       netif_stop_queue(login->dev)
+#define VNIC_TXQ_START(tx_res)         netif_start_queue(tx_res->login->dev)
+#define VNIC_TXQ_START_ALL(login)      netif_start_queue(login->dev)
+#define VNIC_TXQ_STOPPED(tx_res)       netif_queue_stopped(tx_res->login->dev)
+#define VNIC_TXQ_WAKE(tx_res)          netif_wake_queue(tx_res->login->dev)
+#endif
+
+#define VNIC_ALLOC_ORDER               2
+#define VNIC_ALLOC_SIZE                        (PAGE_SIZE << VNIC_ALLOC_ORDER)
+#define VNIC_MAX_LRO_AGGR              64
+#define VNIC_MAX_RX_FRAGS              4
+#define VNIC_MAX_TX_FRAGS              (MAX_SKB_FRAGS + 2)
+#define VNIC_MGID_PREFIX_LEN           5
+
+/* TODO: when VNIC_MAX_TX_OUTS is set to 16, we
+ * noticed that the last CQE overwrites the first one
+ */
+#define VNIC_MAX_TX_OUTS               8  /* default, tuneable */
+#define VNIC_MAX_LRO_DESCS             32 /* default, tuneable */
+#define VNIC_EOIB_HDR_SIZE             (IB_GRH_BYTES + VNIC_ENCAP_LEN)
+#define SMALL_PACKET_SIZE              (256 - NET_IP_ALIGN)
+#define HEADER_COPY_SIZE               (128 - NET_IP_ALIGN)
+#define MAX_HEADER_SIZE                        64
+
+#define LAG_MAP_TABLE_SIZE             32
+#define        MAX_LAG_MEMBERS                 16
+
+#define VNIC_FW_STR_MAX                        VNIC_ETHTOOL_LINE_MAX
+#define VNIC_FW_STR(u64_fw_ver, str)                                   \
+do {                                                                   \
+       snprintf(str, VNIC_FW_STR_MAX, "%d.%d.%d",                      \
+       (int)(u64_fw_ver >> 32),                                        \
+       (int)(u64_fw_ver >> 16) & 0xffff,                               \
+       (int)(u64_fw_ver & 0xffff));                                    \
+} while (0);
+#define VNIC_STR_STRIP(str)                                            \
+do {                                                                   \
+       int i;                                                          \
+       for (i = 0; i < strlen(str); ++i)                               \
+               str[i] = str[i] == '\n' ? ' ' : str[i];                 \
+} while (0);
+
+/* well known addresses */
+static const u8 ETH_BCAST_MAC[] = {
+       0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+};
+
+static const u8 ETH_ZERO_MAC[] = {
+       0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+};
+
+/* this is used in no_bxm mode only */
+static const u8 NO_BXM_MGID_PREFIX[] = {
+       0xff, 0x13, 0xe0, 0x1b, 0x00
+};
+
+#define IS_ZERO_MAC(mac) (!memcmp((mac), ETH_ZERO_MAC, ETH_ALEN))
+#define IS_BCAST_MAC(mac) (!memcmp((mac), ETH_BCAST_MAC, ETH_ALEN))
+#define IS_MCAST_MAC(mac) (((unsigned char *)(mac))[0] & 0x01)
+#define IS_UCAST_MAC(mac) (!(IS_MCAST_MAC(mac)))
+#define IS_NEIGH_QUERY_RUNNING(neigh) \
+       (neigh->query_id >= 0 && !IS_ERR(neigh->pquery) && neigh->pquery)
+
+struct mcast_root {
+       struct rb_root  mcast_tree;
+       spinlock_t      mcast_rb_lock;
+       struct list_head reattach_list;
+};
+
+/* structs */
+struct vnic_port_stats {
+       unsigned long gro_held;
+       unsigned long gro_merged;
+       unsigned long gro_normal;
+       unsigned long gro_drop;
+       unsigned long lro_aggregated;
+       unsigned long lro_flushed;
+       unsigned long lro_no_desc;
+       unsigned long tso_packets;
+       unsigned long queue_stopped;
+       unsigned long wake_queue;
+       unsigned long tx_timeout;
+       unsigned long rx_chksum_good;
+       unsigned long rx_chksum_none;
+       unsigned long tx_chksum_offload;
+       unsigned long sig_ver_err;
+       unsigned long vlan_err;
+       unsigned long shared_packets;
+       unsigned long runt_packets;
+       unsigned long realloc_packets;
+       unsigned long gw_tx_packets;
+       unsigned long gw_tx_bytes;
+};
+
+#define VNIC_STATS_DO_ADD(var, val) ((var) += (unsigned long)(val))
+#define VNIC_STATS_DO_INC(var)      (++(var))
+#ifdef VNIC_EXTRA_STATS /* for performance */
+#define VNIC_STATS_ADD(var, val)    ((var) += (unsigned long)(val))
+#define VNIC_STATS_INC(var)         (++(var))
+#else
+#define VNIC_STATS_ADD(var, val)    do { } while (0)
+#define VNIC_STATS_INC(var)         do { } while (0)
+#endif
+
+enum {
+       MCAST_ATTACHED,
+       MCAST_JOINED,
+       MCAST_JOIN_STARTED,
+       MCAST_JOIN_RUNNING,
+       MCAST_ATTACH_RUNNING,
+};
+
+struct vnic_port_mcast {
+       struct rb_node rb_node;
+       struct list_head list;
+       union ib_gid gid;
+       struct vnic_port *port;
+       struct completion leave_complete;
+       struct completion join_event_complete;
+       struct ib_sa_multicast *sa_mcast;
+       struct ib_sa_mcmember_rec rec;
+
+       atomic_t ref_cnt;
+       struct delayed_work join_task;
+       struct work_struct leave_task;
+       unsigned long join_task_cnt;
+       long int state;
+       spinlock_t lock;
+       u8 join_state;
+       /* IN */
+       unsigned long backoff;
+       unsigned long backoff_init;
+       unsigned long backoff_factor;
+       unsigned long retry;
+       u16 pkey;
+       u32 qkey;
+       u8 create;
+};
+
+struct vnic_mcast {
+       struct vnic_port_mcast *port_mcaste;
+       u32 qkey;
+       u16 pkey;
+       struct ib_qp *qp;
+       struct vnic_port *port;
+       struct ib_ah *ah;
+       struct completion attach_complete;
+       struct delayed_work attach_task;
+       struct delayed_work detach_task;
+       unsigned long attach_task_cnt;
+       struct rb_node rb_node;
+       struct list_head list; /* used when delete all */
+       /* IN */
+       u8 mac[ETH_ALEN];
+       union ib_gid gid;
+       union ib_gid port_gid;
+       unsigned long backoff;
+       unsigned long backoff_init;
+       unsigned backoff_factor;
+       unsigned long retry;
+       unsigned long state;
+       u8 blocking;
+       void *attach_cb_ctx;
+       void *detach_cb_ctx;
+       void (*attach_cb) (struct vnic_mcast *mcaste, void *ctx);
+       void (*detach_cb) (struct vnic_mcast *mcaste, void *ctx);
+       u8 create;
+       u8 join_state;
+       void *priv_data;
+       spinlock_t lock;
+       int attach_bit_nr;
+       unsigned long *req_attach;
+       unsigned long *cur_attached;
+       int sender_only;
+};
+
+struct vnic_mac {
+       struct rb_node rb_node; /* list or RB tree */
+       struct list_head list;
+       u16 vnic_id;            /* needed for vnic child removal */
+       u8 mac[ETH_ALEN];       /* key */
+       unsigned long created;
+       unsigned long last_tx; // use jiffies_to_timeval
+};
+
+struct lag_properties {
+       u16     hash_mask;
+       u8      weights_policy;
+       u8      ca;             /* congestion aware */
+       u8      ca_thresh;
+};
+
+struct vnic_neigh {
+       struct neighbour *neighbour;
+       struct ib_ah *ah;
+       struct vnic_login *login;
+       struct rb_node rb_node;
+       struct ib_sa_query *pquery;
+       struct completion query_comp;
+       int query_id;
+       struct sk_buff_head pkt_queue;
+       struct delayed_work destroy_task;
+       u8 valid;
+       u32 qpn;
+       u16 lid;
+       u8 sl; /* only for debug */
+       u8 mac[ETH_ALEN];
+       u8 rss;
+       u16 info;
+};
+
+enum lag_gw_state {
+       GW_MEMBER_INFO_CREATED  = 1 << 0,
+       GW_MEMBER_INFO_EPORT_UP = 1 << 1,
+       GW_MEMBER_INFO_MCAST    = 1 << 2,
+       GW_MEMBER_INFO_MAPPED   = 1 << 3,
+};
+
+struct vnic_gw_info {
+       enum lag_gw_state info;
+       int member_id;
+       u16 gw_id;
+       struct vnic_neigh neigh;
+};
+
+struct vnic_sysfs_attr {
+       void *ctx;
+       struct kobject *kobj;
+       unsigned long data;
+       char name[VNIC_SYSFS_FLEN];
+       struct module_attribute dentry;
+       struct device *dev;
+};
+
+enum gw_ext_lag_hash_policy {
+       GW_LAG_HASH_DMAC        = 1 << 0,
+       GW_LAG_HASH_SMAC        = 1 << 1,
+       GW_LAG_HASH_TPID        = 1 << 2,       /* ethertype */
+       GW_LAG_HASH_VID         = 1 << 3,
+       GW_LAG_HASH_SIP         = 1 << 4,
+       GW_LAG_HASH_DIP         = 1 << 5,
+       GW_LAG_HASH_IP_NEXT     = 1 << 6,
+       GW_LAG_HASH_SPORT       = 1 << 7,
+       GW_LAG_HASH_DPORT       = 1 << 8,
+       GW_LAG_LAYER_2_3        = 0x1f0
+};
+
+struct vnic_tx_buf {
+       struct sk_buff *skb;
+       u64 mapping[VNIC_MAX_TX_FRAGS];
+       u8 ip_off;
+       u8 ip6_off;
+       u8 tcp_off;
+       u8 udp_off;
+       void *phead;
+       int hlen;
+};
+
+enum {
+#if 1
+       FRAG_SZ0 = 536 - NET_IP_ALIGN, /* so 1500 mtu fits in first 2 frags */
+       FRAG_SZ1 = 1024,
+       FRAG_SZ2 = 2048,
+       FRAG_SZ3 = 4096 - FRAG_SZ2 - FRAG_SZ1 - FRAG_SZ0
+#else
+       FRAG_SZ0 = 512 - NET_IP_ALIGN,
+       FRAG_SZ1 = 1024,
+       FRAG_SZ2 = 2048,
+       FRAG_SZ3 = 4096 << VNIC_ALLOC_ORDER
+#endif
+};
+
+struct vnic_frag_info {
+       u16 frag_size;
+       u16 frag_prefix_size;
+       u16 frag_stride;
+       u16 frag_align;
+       u16 last_offset;
+};
+
+struct vnic_rx_alloc {
+       struct page *page;
+       u16 offset;
+};
+
+struct vnic_frag_data {
+       struct skb_frag_struct frags[VNIC_MAX_RX_FRAGS];
+       u64 dma_addr[VNIC_MAX_RX_FRAGS];
+       struct sk_buff *skb; /* used only for linear buffers mode */
+};
+
+struct vnic_rx_ring {
+       struct vnic_port *port;
+       int index;
+       struct vnic_rx_alloc page_alloc[VNIC_MAX_RX_FRAGS];
+
+       u32 size; /* number of RX descs */
+       spinlock_t lock;
+       struct vnic_frag_data *rx_info;
+
+       struct vnic_frag_info frag_info[VNIC_MAX_RX_FRAGS];
+       u32 rx_skb_size;
+       u16 log_rx_info;
+       u16 num_frags;
+
+       struct ib_recv_wr wr;
+       struct ib_sge sge[VNIC_MAX_RX_FRAGS];
+
+       struct ib_srq *srq;
+       struct net_device_stats stats;
+};
+
+/* vnic states
+   these values can be used only in struct fip_vnic_data.login_state */
+enum {
+       VNIC_STATE_LOGIN_OFF = 0,
+       VNIC_STATE_LOGIN_PRECREATE_1,
+       VNIC_STATE_LOGIN_PRECREATE_2,
+       VNIC_STATE_LOGIN_CREATE_1,
+       VNIC_STATE_LOGIN_CREATE_2,
+       VNIC_STATE_LOGIN_BCAST_ATTACH = 31
+};
+
+/* netdevice open state, depends on calls to open/stop
+   these values can be used only in struct vnic_login.netdev_state */
+enum {
+       VNIC_STATE_NETDEV_OFF = 0,
+       VNIC_STATE_NETDEV_OPEN_REQ,
+       VNIC_STATE_NETDEV_OPEN,
+       VNIC_STATE_NETDEV_CARRIER_ON,
+       VNIC_STATE_NETDEV_NO_TX_ENABLE = 31
+};
+
+struct vnic_rx_res {
+       struct vnic_login *login;
+       struct ib_cq *cq;
+       struct net_lro_mgr lro;
+        struct net_lro_desc lro_desc[VNIC_MAX_LRO_DESCS];
+       struct ib_wc recv_wc[VNIC_MAX_RX_CQE];
+       int index;
+       int stopped;
+#ifndef _BP_NAPI_POLL
+       struct napi_struct napi;
+#else
+       struct net_device *poll_dev;
+#endif
+};
+
+struct vnic_tx_res {
+       struct vnic_tx_buf *tx_ring;
+       struct ib_sge tx_sge[VNIC_MAX_TX_FRAGS];
+       struct ib_wc send_wc[VNIC_MAX_TX_CQE];
+       struct ib_send_wr tx_wr;
+       struct vnic_login *login;
+       struct ib_cq *cq;
+       unsigned tx_head;
+       unsigned tx_tail;
+       unsigned tx_outstanding;
+       unsigned tx_stopped_cnt;
+       struct net_device_stats stats;
+       struct ib_ah_attr mcast_av;
+       u8 lso_hdr[VNIC_MAX_PAYLOAD_SIZE];
+       int index;
+       int stopped;
+       spinlock_t lock;
+};
+
+#ifdef VNIC_PROFILLNG
+#define VNIC_PROFILLNG_SKB_MAX 100
+struct vnic_prof_skb_entry {
+       struct sk_buff skb;
+       struct timespec tstamp;
+       unsigned long jiffies;
+       int cnt;
+       u8 nr_frags;
+};
+#endif
+
+struct vnic_qp_res {
+       struct vnic_login *login;
+       struct ib_qp *qp;
+       struct completion last_wqe_complete;
+       int tx_index;
+       int rx_index;
+};
+
+/*
+ * Wrapper struct for vnic_login, used as netdev private data.
+ * Some kernels (such as 2.6.18-194.26.1) don't allow a private
+ * data struct longer than 64KB (NETDEV_PRIV_LEN_MAX).
+ * We allocate the private data separately to work around this limit.
+ */
+struct vnic_login_info {
+       struct vnic_login *login;
+};
+
+struct vnic_login {
+       spinlock_t lock;
+       spinlock_t stats_lock;
+       struct net_device *dev;
+       struct ethtool_drvinfo drvinfo;
+       struct vnic_port *port;
+       char desc[VNIC_DESC_LEN];
+       struct fip_vnic_data *fip_vnic; /* for ethtool/sysfs*/
+       int queue_stopped;
+       unsigned long netdev_state;
+       char name[VNIC_NAME_LEN];
+       char vnic_name[VNIC_NAME_LEN];
+       char vendor_id[VNIC_VENDOR_LEN];
+       struct vnic_neigh *gw_neigh;
+       struct vnic_gw_info lag_gw_neigh[MAX_LAG_MEMBERS];
+       struct  lag_properties lag_prop;
+       int is_lag;
+       int lag_gw_map[LAG_MAP_TABLE_SIZE];
+       int lag_member_count;
+       int lag_member_active_count;
+       union ib_gid gw_mgid;
+       int promisc;
+       union ib_gid gid;
+       __be16 vid;
+       u8 vlan_used;
+       u32 qkey;
+       u16 pkey;
+       u16 pkey_index;
+       u64 gw_guid;
+       u8 mgid_prefix[VNIC_MGID_PREFIX_LEN];
+       u8 n_mac_mcgid;
+       u8 sl;
+       u16 gw_port_id;
+       u16 vnic_id;
+       unsigned int max_mtu;
+       int zlen;
+       int cnt;
+       unsigned qps_num;
+       u32 qp_base_num;
+       u8 dev_addr[ETH_ALEN];
+       u8 all_vlan_gw;
+
+       /* statistics */
+       struct net_device_stats stats;
+       struct vnic_port_stats port_stats;
+
+       /* tasks */
+       struct work_struct mcast_restart;
+       struct delayed_work stats_task;
+       struct delayed_work mcast_task;
+       struct delayed_work restart_task;
+       struct mutex moder_lock;
+       struct mutex state_lock;
+
+       /* data structures */
+       struct workqueue_struct *neigh_wq;
+       struct rb_root neigh_tree;
+       struct rb_root mac_tree;
+       atomic_t vnic_child_cnt;
+       rwlock_t mac_rwlock;
+       struct mcast_root mcast_tree;
+       struct vnic_sysfs_attr dentries[VNIC_MAX_DENTRIES];
+       struct list_head list;
+
+       /* QP resources */
+       struct vnic_qp_res qp_res[VNIC_MAX_NUM_CPUS];
+
+       /* RX resources */
+       struct vnic_rx_res rx_res[VNIC_MAX_NUM_CPUS];
+       struct ib_recv_wr rx_wr;
+       u32 lro_num;
+       unsigned lro_mng_num;
+       int rx_csum;
+       unsigned napi_num;
+       unsigned rx_rings_num;
+
+       /* TX resources */
+       struct vnic_tx_res tx_res[VNIC_MAX_NUM_CPUS];
+       unsigned tx_rings_num;
+       unsigned real_tx_rings_num;
+       unsigned ndo_tx_rings_num;
+       u8 *pad_va;
+       u64 pad_dma;
+
+       /* for profiling */
+#ifdef VNIC_PROFILLNG
+       struct vnic_prof_skb_entry prof_arr[VNIC_PROFILLNG_SKB_MAX];
+       int prof_arr_it;
+#endif
+       /* interrupt coalescence */
+       u16 rx_usecs;
+       u16 rx_frames;
+       u32 pkt_rate_low;
+       u16 rx_usecs_low;
+       u32 pkt_rate_high;
+       u16 rx_usecs_high;
+       u16 sample_interval;
+       u16 adaptive_rx_coal;
+       unsigned long last_moder_packets;
+       unsigned long last_moder_tx_packets;
+       unsigned long last_moder_bytes;
+       unsigned long last_moder_jiffies;
+       unsigned long last_moder_time;
+       u16 tx_usecs;
+       u16 tx_frames;
+       u8 shared_vnic;
+       u8 shared_mac[ETH_ALEN];
+};
+
+struct eoibhdr {
+       __u8 encap_data;
+       __u8 seg_off;
+       __be16 seg_id;
+};
+
+struct vnic_ib_dev {
+       char name[VNIC_DESC_LEN];
+       struct mutex mlock;
+       struct list_head list;
+       struct list_head port_list;
+       struct ib_device *ca;
+       struct mlx4_ib_dev *mdev;
+       struct ib_device_attr attr;
+       char fw_ver_str[VNIC_FW_STR_MAX];
+};
+
+struct fip_ring_entry {
+       void *mem;
+       u64 bus_addr;
+       int length;
+       int entry_posted;
+};
+
+struct fip_ring {
+       int size;
+       struct fip_ring_entry *ring;
+       unsigned long head;
+       unsigned long tail;
+       spinlock_t ring_lock;
+       spinlock_t head_tail_lock;
+};
+
+enum fip_discover_state {
+       FIP_DISCOVER_OFF,
+       FIP_DISCOVER_INIT,
+       FIP_DISCOVER_SOLICIT,
+       FIP_DISCOVER_CLEAR
+};
+
+#define MAX_INPUT_LEN 64
+#define MAX_INPUT_ARG 12
+struct fip_hadmin_cmd {
+       u8 c_name    [MAX_INPUT_LEN];
+       u8 c_mac     [MAX_INPUT_LEN];
+       u8 c_vnic_id [MAX_INPUT_LEN];
+       u8 c_vid     [MAX_INPUT_LEN];
+       u8 c_bxname  [MAX_INPUT_LEN];
+       u8 c_bxguid  [MAX_INPUT_LEN];
+       u8 c_eport   [MAX_INPUT_LEN];
+       u8 c_ipv4    [MAX_INPUT_LEN];
+       u8 c_ipv6    [MAX_INPUT_LEN];
+       u8 c_emac    [MAX_INPUT_LEN];
+       u8 c_pkey    [MAX_INPUT_LEN];
+       u8 c_parent  [MAX_INPUT_LEN];
+};
+
+struct fip_hadmin_cache {
+       struct fip_hadmin_cmd cmd;
+       u8 system_guid[GUID_LEN];
+       u8 system_name[VNIC_SYSTEM_NAME_LEN];
+       u8 eport_name[VNIC_GW_PORT_NAME_LEN];
+       u8 mac[ETH_ALEN];
+       u16 vnic_id;
+       u16 gw_port_id;
+       u16 vlan;
+       u8 vlan_used;
+       u8 all_vlan_gw;
+       u8 interface_name[VNIC_NAME_LEN];
+       u8 parent_name[VNIC_NAME_LEN];
+       int parent_used;
+       int remove;
+       struct list_head next;
+       u32 qp_base_num;
+       u8 shared_vnic_ip[IPV4_LEN];
+       u8 shared_vnic_mac[ETH_ALEN];
+};
+
+struct pkt_rcv_list {
+       struct list_head list;
+       spinlock_t lock;
+};
+
+struct fip_discover {
+       char name[VNIC_NAME_LEN];
+       struct vnic_port *port;
+       struct list_head discover_list;
+       spinlock_t lock;
+       struct list_head gw_list;
+       struct rw_semaphore l_rwsem;    /* gw list rw semaphore */
+       int hadmin_update;
+       struct list_head hadmin_cache;
+       enum fip_discover_state state;
+       int flush;
+       struct completion flush_complete;
+       struct ib_cq *cq;
+       struct ib_qp *qp;
+       struct fip_ring rx_ring;
+       struct fip_ring tx_ring;
+       struct mcast_root mcast_tree;
+       struct delayed_work fsm_task;
+       struct delayed_work cleanup_task;
+       struct delayed_work hadmin_update_task;
+       struct work_struct pkt_rcv_task_bh;
+       struct pkt_rcv_list rcv_list;
+
+       int mcast_dest_mask;
+       unsigned long discover_mcast_attached_jiffies;
+       unsigned long discover_mcast_detached_jiffies;
+       unsigned long discover_mcast_state;
+       u16 pkey;
+       u16 pkey_index;
+       unsigned long   req_attach;
+       unsigned long   cur_attached;
+       unsigned new_prot_gws;
+       unsigned old_prot_gws;
+};
+
+struct fip_root {
+       struct list_head discover_list;
+};
+
+struct port_fs_dentry {
+       struct module_attribute fs_entry;
+       struct vnic_port *port;
+};
+
+struct vnic_port {
+       char name[VNIC_DESC_LEN];
+       u8 num;
+       int rx_rings_num;
+       int tx_rings_num;
+       struct vnic_ib_dev *dev;
+       struct mcast_root mcast_tree;
+       struct list_head list;
+       struct list_head login_list;
+       struct delayed_work event_task;
+       struct delayed_work event_task_light;
+       struct delayed_work discover_restart_task;
+       struct ib_event_handler event_handler;
+       struct ib_port_attr attr;
+       union ib_gid gid;
+       int rate;
+       u8 rate_enum;
+       atomic_t vnic_child_ids;
+
+       /* IB resources per port */
+       struct vnic_rx_ring *rx_ring[VNIC_MAX_NUM_CPUS];
+       struct ib_pd *pd;
+       struct ib_mr *mr;
+
+       /* for FIP */
+       struct mutex mlock;
+       struct mutex start_stop_lock;
+       u16 pkey_index;
+       u16 pkey;
+       int max_mtu_enum;
+       struct fip_root fip;
+       struct vnic_sysfs_attr dentries[VNIC_MAX_DENTRIES];
+};
+
+enum fip_vnic_state {
+       FIP_VNIC_CLOSED         = 0,
+       FIP_VNIC_HADMIN_IDLE    = 1<<0,
+       FIP_VNIC_LOGIN          = 1<<1,
+       FIP_VNIC_WAIT_4_ACK     = 1<<2,
+       FIP_VNIC_RINGS_INIT     = 1<<3, /* temporary, create rings */
+       FIP_VNIC_MCAST_INIT     = 1<<4, /* temporary, start mcast attach */
+       FIP_VNIC_MCAST_INIT_DONE= 1<<5, /* wait for mcast cb */
+       FIP_VNIC_VHUB_INIT      = 1<<6,
+       FIP_VNIC_VHUB_INIT_DONE = 1<<7, /* wait for vhub table */
+       FIP_VNIC_VHUB_DONE      = 1<<8,
+       FIP_VNIC_VHUB_WRITE     = 1<<9,
+       FIP_VNIC_CONNECTED      = 1<<10
+};
+
+enum vhub_table_state {
+       VHUB_TBL_INIT,
+       VHUB_TBL_UP2DATE,
+       VHUB_TBL_UPDATED
+};
+
+struct vhub_elist {
+       u32 tusn;
+       int count;
+       int total_count;
+       struct list_head vnic_list;     /* chain vnics */
+};
+
+struct vnic_table_entry {
+       u32 qpn;
+       u16 lid;
+       u8 mac[ETH_ALEN];
+       u8 sl;
+
+       struct list_head list;
+       u8 rss;
+       u8 valid;
+};
+
+struct vhub_table {
+       enum vhub_table_state state;
+       u32 checksum;
+       u32 tusn;
+       struct vhub_elist main_list;
+       struct vhub_elist update_list;
+};
+
+struct fip_shared_vnic_data {
+       u8 ip[IPV4_LEN];
+       u8 emac[ETH_ALEN];
+       u8 enabled;
+       u8 arp_proxy;
+};
+
+struct lag_member {
+       u32     qpn;
+       u8      sl;
+       u16     gw_port_id;
+       u16     lid;
+       u8      guid[GUID_LEN];
+       u8      eport_state;
+       u8      weight;
+       u8      link_utilization;
+};
+
+struct lag_members {
+       int     num;
+       long    used_bitmask;
+       struct  lag_properties prop;
+       struct  lag_member memb[MAX_LAG_MEMBERS];
+};
+
+struct fip_login_data {
+       u32 qpn;
+       u32 ctl_qpn;
+       u16 port_id;            /* must always be uptodate */
+       u16 lid;                /* must always be uptodate */
+       u16 vlan;
+       u16 pkey;
+       u16 pkey_index;
+       u16 vnic_id;            /* must always be uptodate */
+       u32 vhub_id;
+       u16 mtu;
+
+       u8 sl;                  /* service level -- 4 bits */
+       u8 guid[GUID_LEN];
+       u8 mac[ETH_ALEN];
+       u8 mgid_prefix[VNIC_MGID_PREFIX_LEN];
+       u8 vnic_name[VNIC_NAME_LEN];
+       u8 vendor_id[VNIC_VENDOR_LEN];
+       u8 n_mac_mcgid;
+       u8 n_rss_mgid;
+       u8 syndrome;            /* must always be uptodate */
+
+       u8 vp;                  /* 1 bit: do we use vlan */
+       u8 all_vlan_gw;         /* 1 bit: is promisc vlan
+                                  supported on this vnic */
+       struct lag_members lagm;
+};
+
+enum fip_flush {
+       FIP_NO_FLUSH,
+       FIP_PARTIAL_FLUSH,      /* use this for events caused by vnic/gw logic */
+       FIP_FULL_FLUSH          /* use this for events caused by unload, host admin destroy */
+};
+
+struct fip_vnic_send_info {
+       u32 gw_qpn;
+       u32 qkey;
+       u16 gw_lid;
+       u8 gw_sl;
+};
+
+/*
+ * This struct holds informational data about the GW that can change without
+ * implications for GW or vnic logic (only reported to the user).
+ */
+struct fip_gw_volatile_info {
+       u8 system_guid[GUID_LEN];
+       u8 system_name[VNIC_SYSTEM_NAME_LEN+1];
+       u8 gw_port_name[VNIC_GW_PORT_NAME_LEN+1];
+};
+
+struct fip_vnic_data {
+       char name[VNIC_NAME_LEN];
+       enum fip_vnic_state state;
+       enum fip_flush flush;
+       spinlock_t lock;
+       spinlock_t ka_lock;
+       struct vnic_sysfs_attr dentry;
+       unsigned long login_state;
+
+       /* data structures maintenance */
+       struct fip_gw_data *gw;
+       struct vnic_port *port;
+       struct list_head gw_vnics;
+       struct vhub_table vhub_table;
+
+       /* execution maintenance */
+       unsigned long update_jiffs;
+       unsigned long keep_alive_jiffs;
+       unsigned long detached_ka_jiffs;
+       unsigned long vnic_mcaste_state;
+       struct delayed_work vnic_task;
+       struct hrtimer keepalive_timer;
+       struct list_head timer;
+       struct delayed_work vnic_gw_alive_task;
+       struct work_struct vnic_pkt_rcv_task_bh;
+       struct work_struct vnic_login_destroy_task;
+       struct work_struct vnic_login_create_task;
+       struct pkt_rcv_list vnic_rcv_list;
+       struct fip_vnic_send_info gw_address;
+
+       /* vnic driver API */
+       struct vnic_login *login;
+       unsigned long login_status;
+       int qps_num;
+       u32 qp_base_num;
+       int parent_used;
+       u8 parent_name[VNIC_NAME_LEN];
+
+       /* rx + tx data structures */
+       struct ib_cq *cq;
+       struct ib_qp *qp;
+       struct fip_ring rx_ring;
+       struct fip_ring tx_ring;
+       struct ib_ah *ah;
+
+       /* data domain */
+       union ib_gid mgid;
+
+       /* vHub context update mcast groups */
+       struct mcast_root mcast_tree;
+       struct fip_login_data login_data;
+       struct fip_shared_vnic_data shared_vnic;
+       u16 mlid;
+       /* u16 pkey_index; not used for now */
+
+       u16 vnic_id; /* unique id for GW */
+       u16 vlan;
+       u8 vlan_used;
+       u8 all_vlan_gw;
+       u16 pkey;
+       u16 pkey_index;
+       u8 hadmined; /* todo, use the state for this */
+       u8 interface_name[VNIC_NAME_LEN];
+       u8 mac_cache[ETH_ALEN];
+       atomic_t eport_state;
+       unsigned long last_send_jiffs;
+       int retry_count;
+       int synd_backlog;
+       struct fip_hadmin_cmd cmd;
+       struct fip_gw_volatile_info gw_info;
+       struct lag_members lm;
+       unsigned long   req_attach;
+       unsigned long   cur_attached;
+       union ib_gid    ka_mcast_gid;
+};
+
+enum vhub_mgid_type {
+       VHUB_MGID_DATA = 0,
+       VHUB_MGID_UPDATE = 2,
+       VHUB_MGID_TABLE = 3,
+       VHUB_MGID_KA = 5,
+};
+
+enum fip_all_mgids {
+       FIP_MCAST_DISCOVER,
+       FIP_MCAST_SOLICIT,
+       FIP_MCAST_VHUB_DATA,
+       FIP_MCAST_VHUB_UPDATE,
+       FIP_MCAST_TABLE,
+       FIP_MCAST_VHUB_KA,
+};
+
+union vhub_mgid {
+       struct mgid {
+               u8 mgid_prefix[VNIC_MGID_PREFIX_LEN];
+               u8 type;
+               u8 dmac[ETH_ALEN];
+               u8 rss_hash;
+               u8 vhub_id[3];
+       } mgid;
+       union ib_gid ib_gid;
+};
+
+void vnic_carrier_update(struct vnic_login *login);
+int vnic_param_check(void);
+
+/* mac table funcs */
+void vnic_learn_mac(struct net_device *dev, u8 *mac, int remove);
+void vnic_child_flush(struct vnic_login *login, int all);
+int vnic_child_update(struct vnic_login *login, u8 *mac, int remove);
+int vnic_mace_update(struct vnic_login *login, u8 *mac, u16 vnic_id, int remove);
+int vnic_parent_update(struct vnic_port *port, char *name, u16 vnic_id,
+                      u8 *mac, u32 *qp_base_num_ptr, char *parent_name,
+                      int remove);
+
+/* mcast funcs */
+int vnic_mcast_init(void);
+void vnic_mcast_cleanup(void);
+
+/*
+ * A helper function to prevent code duplication. Receives a multicast mac
+ * and a gw_id and attaches it (join + attach). The function also receives
+ * a default_mcaste (used for the MGID over default MLID hack) and a user
+ * pointer. Returns 0 on success and non-zero on failure (see the usage
+ * sketch after the prototype below).
+ *
+ * in: mmac - to be used in creating the MGID address
+ * in: default_mcaste - mcaste entry of the default MGID. Can be NULL
+ * in: private_data - A user pointer that can be used to identify the owner
+ * in: gw_id - to be used in creating the MGID address
+ */
+int _vnic_mcast_attach_mgid(struct vnic_login *login,
+                          char *mmac,
+                          struct vnic_mcast *default_mcaste,
+                          void *private_data,
+                          u16 gw_id);
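+
+/*
+ * Usage sketch for the helper above (illustrative values only; the owner
+ * cookie and gw_id chosen here are assumptions, not actual call sites):
+ *
+ *     u8 bcast_mac[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ *
+ *     if (_vnic_mcast_attach_mgid(login, bcast_mac, default_mcaste,
+ *                                 login, login->gw_port_id))
+ *             vnic_err(login->name, "bcast MGID attach failed\n");
+ */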
+
+struct vnic_mcast *vnic_mcast_alloc(struct vnic_port *port,
+                                   unsigned long *req_attach,
+                                   unsigned long *cur_attach);
+/*
+ * A helper function to prevent code duplication. Fills a vnic_mcast struct
+ * with common values.
+ *
+ * in: mcaste - mcaste to fill
+ * in: gw_id - to be used in creating the MGID address
+ * in: mac - to be used in creating the MGID address
+ * in: rss_hash - to be used in creating the MGID address (usually 0)
+ * in: create - value of the create field in mcaste
+ */
+void __vnic_mcaste_fill(struct vnic_login *login, struct vnic_mcast *mcaste,
+                       u16 gw_id, const u8 *mac, u8 rss_hash, int create);
+
+void vnic_mcast_dealloc(struct vnic_mcast *mcaste);
+
+int vnic_mcast_attach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste);
+int vnic_mcast_detach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste);
+
+/*
+ * This function grabs the mcast_tree->mcast_rb_lock
+*/
+int vnic_mcast_add(struct mcast_root *mcast_tree,
+                  struct vnic_mcast *mcaste);
+int vnic_mcast_del_all(struct mcast_root *mcast_tree);
+int vnic_mcast_del_user(struct mcast_root *mcast_tree, void *owner);
+
+void vnic_tree_mcast_detach(struct mcast_root *mcast_tree);
+void vnic_tree_mcast_attach(struct mcast_root *mcast_tree);
+
+/*void vnic_port_mcast_del_all(struct mcast_root *port); */
+static inline void vnic_mcast_root_init(struct mcast_root *mcast_tree)
+{
+       spin_lock_init(&mcast_tree->mcast_rb_lock);
+       INIT_LIST_HEAD(&mcast_tree->reattach_list);
+}
+
+/* port funcs */
+int vnic_ports_init(void);
+void vnic_ports_cleanup(void);
+
+/*
+ * The caller must hold mcast_tree->mcast_rb_lock before calling
+ * (see the locking sketch below).
+ */
+void vnic_mcast_del(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste);
+struct vnic_mcast *vnic_mcast_search(struct mcast_root *mcast_tree,
+                                    union ib_gid *gid);
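+
+/*
+ * Locking sketch for the add/del/search variants above (illustrative only):
+ *
+ *     vnic_mcast_add(tree, mcaste);       - takes mcast_rb_lock internally
+ *
+ *     spin_lock_irqsave(&tree->mcast_rb_lock, flags);
+ *     mcaste = vnic_mcast_search(tree, &gid);
+ *     if (mcaste)
+ *             vnic_mcast_del(tree, mcaste);   - caller already holds the lock
+ *     spin_unlock_irqrestore(&tree->mcast_rb_lock, flags);
+ */
+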
+void port_fip_discover_restart(struct work_struct *work);
+int vnic_port_fip_init(struct vnic_port *port);
+void vnic_port_fip_cleanup(struct vnic_port *port, int lock);
+
+/* others */
+void fip_refresh_mcasts(struct fip_discover *discover);
+void vnic_login_refresh_mcasts(struct vnic_port *port);
+
+/* There are 2 different create flows, for host admin and net admin.
+ * In net admin we always create the vnic after connecting with the GW, but we
+ * do not yet know the vnic details (mac, vlan etc). We do know the ring
+ * parameters and will need to create the RX/TX rings (before login).
+ * To accomplish this we call vnic_login_pre_create_1, vnic_login_pre_create_2
+ * and, after the login ACK, vnic_login_register_netdev and vnic_login_complete_ack.
+ * In host admin, we know the vnic info but not the GW info when we create the
+ * vnic, so we call vnic_login_pre_create_1 and vnic_login_register_netdev; after
+ * getting the login ACK we call vnic_login_pre_create_2 and vnic_login_complete_ack
+ * (see the call-order sketch after these prototypes).
+ */
+int vnic_login_register_netdev(struct fip_vnic_data *vnic,
+                              const char *mac,
+                              const char *name);
+int vnic_login_complete_ack(struct fip_vnic_data *vnic,
+                           struct fip_login_data *login_data,
+                           struct fip_shared_vnic_data *shared_vnic);
+int vnic_login_pre_create_1(struct vnic_port *port,
+                           struct fip_vnic_data *vnic);
+int vnic_login_pre_create_2(struct fip_vnic_data *vnic, int qps_num, int is_lag);
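+
+/*
+ * Rough call-order sketch of the two flows described above (illustrative;
+ * the actual sequencing is driven by the FIP vnic state machine):
+ *
+ *   net admin:  vnic_login_pre_create_1() -> vnic_login_pre_create_2()
+ *               ... login ACK received ...
+ *               -> vnic_login_register_netdev() -> vnic_login_complete_ack()
+ *
+ *   host admin: vnic_login_pre_create_1() -> vnic_login_register_netdev()
+ *               ... login ACK received ...
+ *               -> vnic_login_pre_create_2() -> vnic_login_complete_ack()
+ */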
+
+/*
+ * When destroying a login, call this to stop the login wq tasks. Do not call
+ * from login_wq context.
+ */
+void vnic_login_destroy_stop_wq(struct fip_vnic_data *vnic, enum fip_flush flush);
+/*
+ * Destroys the login data struct. Assumes all login wq tasks are stopped.
+ * Can be called from any context, might block for a few seconds.
+ */
+void vnic_login_destroy_wq_stopped(struct fip_vnic_data *vnic, enum fip_flush flush);
+
+/*
+ * Destroy a login data structure.
+ * This function cannot be called from login_wq context. If you need to run
+ * from login_wq, use the split functions vnic_login_destroy_stop_wq/wq_stopped
+ * instead.
+ */
+static inline
+void vnic_login_destroy(struct fip_vnic_data *vnic, enum fip_flush flush)
+{
+       vnic_login_destroy_stop_wq(vnic, flush);
+       vnic_login_destroy_wq_stopped(vnic, flush);
+}
+
+/* add / remove members eports from LAG GW */
+void vnic_member_prop(struct vnic_login *login, struct lag_properties *prop);
+int vnic_member_add(struct vnic_login *login, int member_id,
+                   struct lag_member *emember);
+int vnic_member_remove(struct vnic_login *login, int member_id);
+int vnic_member_modify(struct vnic_login *login, int member_id,
+                      struct lag_member *emember);
+void vnic_member_remove_all(struct vnic_login *login);
+
+int vnic_vhube_add(struct fip_vnic_data *vnic, struct vnic_table_entry *vhube);
+void vnic_vhube_flush(struct fip_vnic_data *vnic);
+void vnic_vhube_del(struct fip_vnic_data *vnic, u8 *mac);
+int vnic_neighe_path_query(struct vnic_neigh *neighe);
+
+void vhub_mgid_create(const char *mgid_prefix,
+                     const char *mmac, /* mcast mac for bcast 0xFF.. */
+                     u64 n_mac,        /* bits to take from mmac */
+                     u32 vhub_id,
+                     enum vhub_mgid_type type,
+                     u8 rss_hash,
+                     union vhub_mgid *mgid);
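+
+/*
+ * Example call (sketch only; the vhub_id and bit-count values used here are
+ * placeholders, not taken from an actual call site):
+ *
+ *     union vhub_mgid mgid;
+ *     u8 bcast[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+ *
+ *     vhub_mgid_create(login->mgid_prefix, bcast, login->n_mac_mcgid,
+ *                      vhub_id, VHUB_MGID_DATA, 0, &mgid);
+ */
+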
+/*
+ * Read the state of the GW eport. Can be called from any context.
+ */
+int fip_vnic_get_eport_state(struct fip_vnic_data *vnic);
+/*
+ * get GW info funcs.
+*/
+int fip_vnic_get_eport_name(struct fip_vnic_data *vnic, char *buff);
+int fip_vnic_get_bx_name(struct fip_vnic_data *vnic, char *buff);
+int fip_vnic_get_bx_guid(struct fip_vnic_data *vnic, char *buff);
+u8 fip_vnic_get_bx_sl(struct fip_vnic_data *vnic);
+int fip_vnic_get_gw_type(struct fip_vnic_data *vnic);
+int fip_vnic_get_lag_eports(struct fip_vnic_data *vnic, char *buf);
+int fip_vnic_get_all_vlan_mode(struct fip_vnic_data *vnic, char *buff);
+
+
+/*
+ * Return a short-format string of GW info. Can be called from any context.
+ */
+int fip_vnic_get_short_gw_info(struct fip_vnic_data *vnic, char *buff);
+
+void vnic_data_cleanup(void);
+
+/*
+ * This function is called from the sysfs update callback. It parses the
+ * request, adds it to a list, and then queues a work request to process
+ * the list from the fip_wq context.
+ */
+int fip_hadmin_sysfs_update(struct vnic_port *port,
+                           const char *buffer, int count, int remove);
+int fip_gw_sysfs_show(struct vnic_port *port, char *buffer);
+int vnic_login_cmd_set(char *buf, struct fip_hadmin_cmd *cmd);
+void vnic_login_cmd_init(struct fip_hadmin_cmd *cmd);
+
+int fip_hadmin_vnic_refresh(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address);
+void fip_vnic_set_gw_param(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address);
+void fip_vnic_create_gw_param(struct fip_vnic_send_info *gw_address, u32 gw_qpn,
+                             u32 qkey, u16 gw_lid, u8 gw_sl);
+
+int fip_vnic_hadmin_init(struct vnic_port *port, struct fip_vnic_data *vnic);
+
+int port_fs_init(struct vnic_port *port);
+void port_fs_exit(struct vnic_port *port);
+
+int vnic_port_query(struct vnic_port *port);
+
+#endif /* VNIC_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data.h
new file mode 100644 (file)
index 0000000..d21517f
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _VNIC_DATA_H
+#define _VNIC_DATA_H
+
+#include "vnic.h"
+
+enum {
+       VNIC_SEND_INLINE_FLAG_POS = 63,
+};
+
+#define        VNIC_SEND_INLINE_FLAG ((u64)1 << VNIC_SEND_INLINE_FLAG_POS)
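+
+/*
+ * The flag marks an inline send in the top bit of the 64-bit wr_id, e.g.
+ * (sketch only; assuming the low bits carry a buffer/ring index):
+ *
+ *     wr_id = index | VNIC_SEND_INLINE_FLAG;
+ *     ...
+ *     if (wr_id & VNIC_SEND_INLINE_FLAG)
+ *             wr_id &= ~VNIC_SEND_INLINE_FLAG;
+ */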
+
+/* main funcs */
+int vnic_port_data_init(struct vnic_port *port);
+void vnic_port_data_cleanup(struct vnic_port *port);
+
+/* ib funcs */
+struct sk_buff *vnic_alloc_rx_skb(struct vnic_rx_ring *ring, int buf_ind,
+                                 gfp_t gfp_flag);
+int vnic_post_recv(struct vnic_rx_ring *ring, u64 wr_id);
+int vnic_post_recvs(struct vnic_rx_ring *ring);
+int vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+                           struct ib_udata *udata, int nqps,
+                           int align, struct ib_qp *list[]);
+int vnic_ib_destroy_qp(struct ib_qp *qp);
+int vnic_ib_post_send(struct ib_qp *ibqp,
+                     struct ib_send_wr *wr,
+                     struct ib_send_wr **bad_wr,
+                     u8 ip_off, u8 ip6_off,
+                     u8 tcp_off, u8 udp_off);
+struct vnic_rx_ring *vnic_create_rx_ring(struct vnic_port *port, int index);
+void vnic_destroy_rx_ring(struct vnic_rx_ring *ring);
+int vnic_init_qp(struct vnic_login *login, int qp_index);
+int vnic_create_qp(struct vnic_login *login, int qp_index);
+int vnic_create_qp_range(struct vnic_login *login);
+void vnic_destroy_qp(struct vnic_login *login, int qp_index);
+int vnic_create_tx_res(struct vnic_login *login, int tx_res_index);
+int vnic_create_rx_res(struct vnic_login *login, int rx_res_index);
+void vnic_destroy_tx_res(struct vnic_login *login, int tx_res_index);
+void vnic_destroy_rx_res(struct vnic_login *login, int rx_res_index);
+
+int vnic_ib_up(struct net_device *dev);
+int vnic_ib_down(struct net_device *dev);
+int vnic_ib_open(struct net_device *dev);
+int vnic_ib_stop(struct net_device *dev);
+
+int vnic_ib_set_moder(struct vnic_login *login,
+                     u16 rx_usecs, u16 rx_frames, u16 tx_usecs, u16 tx_frames);
+int vnic_port_ib_init(struct vnic_port *port);
+void vnic_port_ib_cleanup(struct vnic_port *port);
+void vnic_ib_dispatch_event(struct ib_event *event);
+#ifndef _BP_NAPI_POLL
+int vnic_poll_cq_rx(struct napi_struct *napi, int budget);
+#else
+int vnic_poll_cq_rx(struct net_device *poll_dev, int *budget);
+#endif
+void vnic_send(struct vnic_login *login, struct sk_buff *skb,
+              struct ib_ah *ah, u32 dqpn, int tx_res_index);
+void vnic_ib_free_ring(struct vnic_rx_ring *ring);
+int vnic_ib_init_ring(struct vnic_rx_ring *ring);
+
+/* netdev funcs */
+struct net_device *vnic_alloc_netdev(struct vnic_port *port);
+void vnic_free_netdev(struct vnic_login *login);
+int vnic_restart(struct net_device *dev);
+void __bcast_attach_cb(struct vnic_mcast *mcaste, void *login_ptr);
+void __bcast_detach_cb(struct vnic_mcast *mcaste, void *login_ptr);
+
+/* rx funcs */
+int vnic_rx(struct vnic_login *login, struct sk_buff *skb, struct ib_wc *wc);
+int vnic_unmap_and_replace_rx(struct vnic_rx_ring *ring, struct ib_device *dev,
+                             struct skb_frag_struct *skb_frags_rx,
+                             u64 wr_id, int length);
+int vnic_rx_skb(struct vnic_login *login, struct vnic_rx_ring *ring,
+               struct ib_wc *wc, int ip_summed, char *eth_hdr_va);
+
+/* tx funcs */
+int vnic_tx(struct sk_buff *skb, struct net_device *dev);
+
+/* sysfs funcs */
+int vnic_create_dentry(struct vnic_login *login);
+void vnic_delete_dentry(struct vnic_login *login);
+
+/* ethtool funcs */
+void vnic_set_ethtool_ops(struct net_device *dev);
+
+/* neigh funcs */
+void vnic_neigh_del_all(struct vnic_login *login);
+struct vnic_neigh *vnic_neighe_search(struct vnic_login *login, u8 *mac);
+void vnic_neighe_dealloc_task(struct work_struct *work);
+void vnic_neighe_dealloc(struct vnic_neigh *neighe);
+struct vnic_neigh *vnic_neighe_alloc(struct vnic_login *login,
+                                    const u8 *mac, u16 dlid, u32 dqpn, u8 rss);
+void vnic_neighe_del(struct vnic_login *login, struct vnic_neigh *neighe);
+int vnic_neighe_add(struct vnic_login *login, struct vnic_neigh *neighe);
+struct ib_ah *vnic_ah_alloc(struct vnic_login *login, u16 dlid);
+void vnic_neigh_invalidate(struct vnic_login *login);
+
+
+
+struct vnic_login *__vnic_login_create(struct vnic_port *port, int index);
+u32 vnic_hash(struct net_device *dev, struct sk_buff *skb);
+#endif /* _VNIC_DATA_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ethtool.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ethtool.c
new file mode 100644 (file)
index 0000000..16ff551
--- /dev/null
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+static struct ethtool_ops vnic_ethtool_ops;
+
+static const char vnic_strings[][ETH_GSTRING_LEN] = {
+       /* public statistics */
+       "rx_packets", "tx_packets", "rx_bytes",
+       "tx_bytes", "rx_errors", "tx_errors",
+       "rx_dropped", "tx_dropped", "multicast",
+       "collisions", "rx_length_errors", "rx_over_errors",
+       "rx_crc_errors", "rx_frame_errors", "rx_fifo_errors",
+       "rx_missed_errors", "tx_aborted_errors", "tx_carrier_errors",
+       "tx_fifo_errors", "tx_heartbeat_errors", "tx_window_errors",
+#define VNIC_PUB_STATS_LEN     21
+
+       /* private statistics */
+       "gro_held", "gro_merged", "gro_normal", "gro_drop",
+       "lro_aggregated", "lro_flushed", "lro_no_desc",
+       "tso_packets", "queue_stopped", "wake_queue",
+       "tx_timeout", "rx_chksum_good", "rx_chksum_none",
+       "tx_chksum_offload", "sig_ver_err", "vlan_err",
+       "shared_packets", "runt_packets", "realloc_packets",
+       "gw_tx_packets", "gw_tx_bytes",
+#define VNIC_PORT_STATS_LEN    21
+
+       /* packet statistics rx_prio_X (TODO) */
+#define VNIC_PKT_STATS_LEN     0
+};
+
+#define VNIC_STATS_LEN (sizeof(vnic_strings) / ETH_GSTRING_LEN)
+
+static void vnic_get_drvinfo(struct net_device *dev,
+                            struct ethtool_drvinfo *drvinfo)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+
+       *drvinfo = login->drvinfo;
+}
+
+static u32 vnic_get_msglevel(struct net_device *dev)
+{
+       return vnic_msglvl;
+}
+
+static void vnic_set_msglevel(struct net_device *dev, u32 mlevel)
+{
+       vnic_msglvl = mlevel;
+}
+
+static int vnic_get_coalesce(struct net_device *dev,
+                            struct ethtool_coalesce *coal)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+
+       vnic_dbg_ethtool(login->name, "get coalescing params for mtu:%d "
+                        "rx_frames:%d rx_usecs:%d, "
+                        "tx_frames:%d tx_usecs:%d, "
+                        "adaptive_rx_coal:%d, "
+                        "adaptive_tx_coal:%d\n",
+                        login->dev->mtu,
+                        login->rx_frames, login->rx_usecs,
+                        login->tx_frames, login->tx_usecs,
+                        login->adaptive_rx_coal, 0);
+
+       coal->tx_coalesce_usecs = login->tx_usecs;
+       coal->tx_max_coalesced_frames = login->tx_frames;
+       coal->rx_coalesce_usecs = login->rx_usecs;
+       coal->rx_max_coalesced_frames = login->rx_frames;
+
+       coal->pkt_rate_low = login->pkt_rate_low;
+       coal->rx_coalesce_usecs_low = login->rx_usecs_low;
+       coal->pkt_rate_high = login->pkt_rate_high;
+       coal->rx_coalesce_usecs_high = login->rx_usecs_high;
+       coal->rate_sample_interval = login->sample_interval;
+       coal->use_adaptive_rx_coalesce = login->adaptive_rx_coal;
+
+       return 0;
+}
+
+static int vnic_set_coalesce(struct net_device *dev,
+                            struct ethtool_coalesce *coal)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+
+       login->rx_frames = (coal->rx_max_coalesced_frames ==
+                           VNIC_AUTO_CONF) ?
+           VNIC_RX_COAL_TARGET /
+           login->dev->mtu + 1 : coal->rx_max_coalesced_frames;
+       login->rx_usecs = (coal->rx_coalesce_usecs ==
+                          VNIC_AUTO_CONF) ?
+           VNIC_RX_COAL_TIME : coal->rx_coalesce_usecs;
+       login->tx_frames = coal->tx_max_coalesced_frames;
+       login->tx_usecs = coal->tx_coalesce_usecs;
+
+       /* Set adaptive coalescing params */
+       login->pkt_rate_low = coal->pkt_rate_low;
+       login->rx_usecs_low = coal->rx_coalesce_usecs_low;
+       login->pkt_rate_high = coal->pkt_rate_high;
+       login->rx_usecs_high = coal->rx_coalesce_usecs_high;
+       login->sample_interval = coal->rate_sample_interval;
+       login->adaptive_rx_coal = coal->use_adaptive_rx_coalesce;
+       login->last_moder_time = VNIC_AUTO_CONF;
+
+       if (login->adaptive_rx_coal)
+               return 0;
+
+       vnic_ib_set_moder(login,
+                         login->rx_usecs, login->rx_frames,
+                         login->tx_usecs, login->tx_frames);
+
+       return 0;
+}
+
+static int vnic_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+       cmd->autoneg = AUTONEG_DISABLE;
+       cmd->supported = SUPPORTED_10000baseT_Full;
+       cmd->advertising = SUPPORTED_10000baseT_Full;
+       if (netif_carrier_ok(dev)) {
+               cmd->speed = SPEED_10000;
+               cmd->duplex = DUPLEX_FULL;
+       } else {
+               cmd->speed = -1;
+               cmd->duplex = -1;
+       }
+       return 0;
+}
+
+static int vnic_set_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+       if ((cmd->autoneg == AUTONEG_ENABLE) ||
+           (cmd->speed != SPEED_10000) || (cmd->duplex != DUPLEX_FULL))
+               return -EINVAL;
+
+       /* Nothing to change */
+       return 0;
+}
+
+static void vnic_get_strings(struct net_device *dev,
+                            uint32_t stringset, uint8_t *data)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+       int index = 0, stats_off = 0, i;
+
+       if (stringset != ETH_SS_STATS)
+               return;
+
+       /* Add main counters */
+       for (i = 0; i < VNIC_PUB_STATS_LEN; i++)
+               strcpy(data + (index++) * ETH_GSTRING_LEN,
+                      vnic_strings[i + stats_off]);
+       stats_off += VNIC_PUB_STATS_LEN;
+
+       for (i = 0; i < VNIC_PORT_STATS_LEN; i++)
+               strcpy(data + (index++) * ETH_GSTRING_LEN,
+                      vnic_strings[i + stats_off]);
+       stats_off += VNIC_PORT_STATS_LEN;
+
+       for (i = 0; i < VNIC_PKT_STATS_LEN; i++)
+               strcpy(data + (index++) * ETH_GSTRING_LEN,
+                      vnic_strings[i + stats_off]);
+       stats_off += VNIC_PKT_STATS_LEN;
+
+       for (i = 0; i < login->tx_rings_num; i++) {
+               sprintf(data + (index++) * ETH_GSTRING_LEN,
+                       "tx%d_packets", i);
+               sprintf(data + (index++) * ETH_GSTRING_LEN,
+                       "tx%d_bytes", i);
+       }
+       for (i = 0; i < login->rx_rings_num; i++) {
+               sprintf(data + (index++) * ETH_GSTRING_LEN,
+                       "rx%d_packets", i);
+               sprintf(data + (index++) * ETH_GSTRING_LEN,
+                       "rx%d_bytes", i);
+       }
+}
+
+static void vnic_get_ethtool_stats(struct net_device *dev,
+                                  struct ethtool_stats *stats, uint64_t *data)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+       int index = 0, i;
+
+       spin_lock_bh(&login->stats_lock);
+
+       for (i = 0; i < VNIC_PUB_STATS_LEN; i++)
+               data[index++] = ((unsigned long *) &login->stats)[i];
+       for (i = 0; i < VNIC_PORT_STATS_LEN; i++)
+               data[index++] = ((unsigned long *) &login->port_stats)[i];
+       for (i = 0; i < VNIC_PKT_STATS_LEN; i++)
+               data[index++] = 0;
+       for (i = 0; i < login->tx_rings_num; i++) {
+               data[index++] = login->tx_res[i].stats.tx_packets;
+               data[index++] = login->tx_res[i].stats.tx_bytes;
+       }
+       for (i = 0; i < login->rx_rings_num; i++) {
+               data[index++] = login->port->rx_ring[i]->stats.rx_packets;
+               data[index++] = login->port->rx_ring[i]->stats.rx_bytes;
+       }
+       spin_unlock_bh(&login->stats_lock);
+}
+
+#ifndef _BP_ETHTOOL_NO_SSETC
+static int vnic_get_sset_count(struct net_device *dev, int sset)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+
+       switch (sset) {
+       case ETH_SS_STATS:
+               return VNIC_STATS_LEN + /* static stats + stats per ring */
+                      (login->tx_rings_num + login->rx_rings_num) * 2;
+       default:
+               return -EOPNOTSUPP;
+       }
+}
+
+#else
+static int vnic_get_stats_count(struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+
+       return VNIC_STATS_LEN +
+              (login->tx_rings_num + login->rx_rings_num) * 2;
+}
+#endif
+
+static void vnic_get_wol(struct net_device *dev, struct ethtool_wolinfo *wol)
+{
+       wol->supported = wol->wolopts = 0;
+
+       return;
+}
+
+void vnic_get_ringparam(struct net_device *dev, struct ethtool_ringparam *param)
+{
+       memset(param, 0, sizeof *param);
+       param->rx_max_pending = VNIC_MAX_RX_SIZE;
+       param->tx_max_pending = VNIC_MAX_TX_SIZE;
+       param->rx_pending = vnic_rx_rings_len;
+       param->tx_pending = vnic_tx_rings_len;
+}
+
+void vnic_set_ethtool_ops(struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+       struct mlx4_ib_dev *mlx4_ibdev = login->port->dev->mdev;
+
+       ASSERT(login);
+       ASSERT(login->port->dev->ca);
+       ASSERT(login->port->dev->ca->dma_device);
+
+       SET_ETHTOOL_OPS(dev, &vnic_ethtool_ops);
+       strncpy(login->drvinfo.driver, DRV_NAME, VNIC_ETHTOOL_LINE_MAX);
+       strncpy(login->drvinfo.version, DRV_VER, VNIC_ETHTOOL_LINE_MAX);
+       login->drvinfo.n_stats = 0;
+       login->drvinfo.regdump_len = 0;
+       login->drvinfo.eedump_len = 0;
+
+       sprintf(login->drvinfo.bus_info, "%s [%s:%d]",
+               pci_name(to_pci_dev(login->port->dev->ca->dma_device)),
+               login->port->dev->ca->name, login->port->num);
+       sprintf(login->drvinfo.fw_version, "%s [%.*s]",
+               login->port->dev->fw_ver_str, MLX4_BOARD_ID_LEN,
+               mlx4_ibdev->dev->board_id);
+       vnic_dbg_ethtool(login->name, "bus %s, port %d, fw_ver %s\n",
+                        login->drvinfo.bus_info, login->port->num,
+                        login->drvinfo.fw_version);
+
+       return;
+}
+
+static struct ethtool_ops vnic_ethtool_ops = {
+       .get_link = ethtool_op_get_link,
+       .get_drvinfo = vnic_get_drvinfo,
+       .get_msglevel = vnic_get_msglevel,
+       .set_msglevel = vnic_set_msglevel,
+       .get_coalesce = vnic_get_coalesce,
+       .set_coalesce = vnic_set_coalesce,
+       .get_strings = vnic_get_strings,
+       .get_ethtool_stats = vnic_get_ethtool_stats,
+#ifndef _BP_ETHTOOL_NO_SSETC
+       .get_sset_count = vnic_get_sset_count,
+#else
+       .get_stats_count = vnic_get_stats_count,
+#endif
+       .get_settings = vnic_get_settings,
+       .set_settings = vnic_set_settings,
+       .get_wol = vnic_get_wol,
+       .get_ringparam = vnic_get_ringparam,
+       .set_ringparam = NULL,
+};
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_fs.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_fs.c
new file mode 100644 (file)
index 0000000..95d7ef7
--- /dev/null
@@ -0,0 +1,993 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/version.h>
+
+#include "vnic.h"
+#include "vnic_data.h"
+#include "vnic_fip_discover.h"
+
+#define ALL_VLAN_GW_VID "all"
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0))
+#define __MODULE_KOBJ_TYPE struct module_kobject
+#else
+#define __MODULE_KOBJ_TYPE struct module
+#endif
+
+char *login_dentry_name(char *buf, struct vnic_login *login, char *str)
+{
+       snprintf(buf, VNIC_SYSFS_FLEN, "%s%d-%s", "vnic",
+                login->cnt, str);
+       return buf;
+}
+
+char *port_dentry_name(char *buf, struct vnic_port *port, char *str)
+{
+       snprintf(buf, VNIC_SYSFS_FLEN, "%s_%s_%d",
+                str, port->dev->name, port->num);
+       return buf;
+}
+
+char *vnic_dentry_name(char *buf, struct fip_vnic_data *vnic, char *str)
+{
+       snprintf(buf, VNIC_SYSFS_FLEN, "%s-%s-%s", "vnic",
+                vnic->interface_name, str);
+       return buf;
+}
+
+#ifndef _BP_NO_ATT_OWNER
+#define DENTRY_OWNER(_vdentry)                                         \
+       (_vdentry)->dentry.attr.owner = THIS_MODULE;                    \
+       (_vdentry)->kobj = &vdentry->dentry.attr.owner->mkobj.kobj;
+#else
+#define DENTRY_OWNER(_vdentry)                                         \
+       (_vdentry)->kobj = &(THIS_MODULE)->mkobj.kobj;
+#endif
+
+#define DENTRY_REMOVE(_dentry)                                         \
+do {                                                                   \
+       vnic_dbg_sysfs((_dentry)->name, "deleted\n");                   \
+       sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr);    \
+       (_dentry)->ctx = NULL;                                          \
+} while (0);
+
+#define DENTRY_CREATE(_ctx, _dentry, _name, _show, _store)             \
+do {                                                                   \
+       struct vnic_sysfs_attr *vdentry = _dentry;                      \
+       vdentry->ctx = _ctx;                                            \
+       vdentry->dentry.show = _show;                                   \
+       vdentry->dentry.store = _store;                                 \
+       vdentry->dentry.attr.name = vdentry->name;                      \
+       vdentry->dentry.attr.mode = 0;                                  \
+       DENTRY_OWNER(vdentry);                                          \
+       snprintf(vdentry->name, VNIC_SYSFS_FLEN, "%s", _name);          \
+       if (vdentry->dentry.store)                                      \
+               vdentry->dentry.attr.mode |= S_IWUSR;                   \
+       if (vdentry->dentry.show)                                       \
+               vdentry->dentry.attr.mode |= S_IRUGO;                   \
+       vnic_dbg_sysfs(_ctx->name, "creating %s\n",                     \
+               vdentry->name);                                         \
+       if (strlen(_name) > VNIC_SYSFS_FLEN) {                          \
+               vnic_err(_ctx->name, "name too long %d > %d\n",         \
+                        (int)strlen(_name), VNIC_SYSFS_FLEN);          \
+               vdentry->ctx = NULL;                                    \
+               break;                                                  \
+       }                                                               \
+       if (sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr)) {  \
+               vnic_err(_ctx->name, "failed to create %s\n",           \
+                        vdentry->dentry.attr.name);                    \
+               vdentry->ctx = NULL;                                    \
+               break;                                                  \
+       }                                                               \
+       vnic_dbg_sysfs(_ctx->name, "created %s\n", vdentry->name);      \
+} while (0);
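+
+/*
+ * Usage sketch (hypothetical entry name and slot; the actual call sites in
+ * this file differ):
+ *
+ *     DENTRY_CREATE(login, &login->dentries[0], "neigh",
+ *                   vnic_neigh_show, NULL);
+ *     ...
+ *     DENTRY_REMOVE(&login->dentries[0]);
+ */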
+
+/* helper functions */
+static const char *port_phys_state_str(enum ib_port_state pstate)
+{
+       switch (pstate) {
+       case 0:
+               return "no_state_change";
+       case 1:
+               return "sleep";
+       case 2:
+               return "polling";
+       case 3:
+               return "disabled";
+       case 4:
+               return "port_configuration_training";
+       case 5:
+               return "up";
+       case 6:
+               return "error_recovery";
+       case 7:
+               return "phy_test";
+       default:
+               return "invalid_state";
+       }
+}
+static const char *port_state_str(enum ib_port_state pstate)
+{
+       switch (pstate) {
+       case IB_PORT_DOWN:
+               return "down";
+       case IB_PORT_INIT:
+               return "initializing";
+       case IB_PORT_ARMED:
+               return "armed";
+       case IB_PORT_ACTIVE:
+               return "active";
+       case IB_PORT_NOP:
+               return "nop";
+       case IB_PORT_ACTIVE_DEFER:
+               return "defer";
+       default:
+               return "invalid_state";
+       }
+}
+
+/* store/show functions */
+static ssize_t vnic_neigh_show(struct module_attribute *attr,
+                              __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+       char *p = buf;
+       struct vnic_sysfs_attr *vnic_dentry =
+               container_of(attr, struct vnic_sysfs_attr, dentry);
+       struct vnic_login *login = vnic_dentry->ctx;
+       struct vnic_neigh *neighe;
+       struct vnic_mcast *mcaste;
+       struct rb_node *n;
+       unsigned long flags;
+
+       /* check if GW entry is ready */
+       if (!login->gw_neigh)
+               goto out;
+       ASSERT(login->gw_neigh);
+
+       /* print GW entry */
+       neighe = login->gw_neigh;
+       p += _sprintf(p, buf, "G:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] "
+                    "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d] SL[%d] VALID[%d]\n",
+                    MAC_6_PRINT_ARG(neighe->mac),
+                    be16_to_cpu(login->vid), login->vlan_used, neighe->qpn,
+                    neighe->lid, neighe->rss, neighe->sl, neighe->valid);
+
+       /* print neigh tree entries */
+       n = rb_first(&login->neigh_tree);
+       while (n) {
+               neighe = rb_entry(n, struct vnic_neigh, rb_node);
+               p += _sprintf(p, buf, "U:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] "
+                            "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d] SL[%d] VALID[%d]\n",
+                            MAC_6_PRINT_ARG(neighe->mac),
+                            be16_to_cpu(login->vid), login->vlan_used,
+                            neighe->qpn, neighe->lid, neighe->rss, neighe->sl, neighe->valid);
+               n = rb_next(n);
+       }
+
+       /* print mcast tree entries */
+       spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags);
+       n = rb_first(&login->mcast_tree.mcast_tree);
+       while (n) {
+               u16 lid = 0xFFFF;
+               mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+               n = rb_next(n);
+               if (test_bit(MCAST_ATTACHED, &mcaste->state))
+                       lid = mcaste->port_mcaste->rec.mlid;
+               p += _sprintf(p, buf, "M:MAC["MAC_6_PRINT_FMT"] VID[0x%04x] "
+                            "VID_USED[%d] QPN[0x%06x] LID[0x%04x] RSS[%d] SL[%d]\n",
+                            MAC_6_PRINT_ARG(mcaste->mac),
+                            0, login->vlan_used, IB_MULTICAST_QPN, lid, 0, mcaste->port_mcaste->sa_mcast->rec.sl);
+       }
+       spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+
+out:
+       return (ssize_t)(p - buf);
+}
+
+/* store/show functions */
+static ssize_t vnic_member_show(struct module_attribute *attr,
+                              __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+       char *p = buf;
+       struct vnic_sysfs_attr *vnic_dentry =
+               container_of(attr, struct vnic_sysfs_attr, dentry);
+       struct vnic_login *login = vnic_dentry->ctx;
+       int i;
+
+       if (!login->is_lag)
+               goto out;
+
+       netif_tx_lock_bh(login->dev);
+       p += _sprintf(p, buf, "GW member count=%d active count=%d hash bitmask=0x%X\n",
+                    login->lag_member_count, login->lag_member_active_count, login->lag_prop.hash_mask);
+
+       p += _sprintf(p, buf, "GW hash mapping table:\n");
+
+       for (i=0; i<LAG_MAP_TABLE_SIZE; i+=8) {
+               p += _sprintf(p, buf, "%3d %3d %3d %3d %3d %3d %3d %3d\n",
+                      login->lag_gw_map[i], login->lag_gw_map[i+1], login->lag_gw_map[i+2], login->lag_gw_map[i+3],
+                      login->lag_gw_map[i+4], login->lag_gw_map[i+5], login->lag_gw_map[i+6], login->lag_gw_map[i+7]);
+       }
+
+       p += _sprintf(p, buf, "\nGW member state info:   (0x1-created, 0x2-eport up, 0x4-mcast join complete, 0x8-member in use)\n");
+
+       for (i=0; i<MAX_LAG_MEMBERS; i++) {
+               p += _sprintf(p, buf, "%.2d GW id=%.3d State=0x%.3x LID=%.3d QPN=0x%.6x SL[%d] VALID[%d]\n", i,
+                             login->lag_gw_neigh[i].gw_id,
+                             login->lag_gw_neigh[i].info,
+                             login->lag_gw_neigh[i].neigh.lid,
+                             login->lag_gw_neigh[i].neigh.qpn,
+                             login->lag_gw_neigh[i].neigh.sl,
+                             login->lag_gw_neigh[i].neigh.valid);
+       }
+       netif_tx_unlock_bh(login->dev);
+
+out:
+       return (ssize_t)(p - buf);
+}
+
+static ssize_t vnic_login_show(struct module_attribute *attr,
+                            __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+       char *p = buf, tmp_line[VNIC_SYSFS_LLEN];
+       struct vnic_sysfs_attr *vnic_dentry =
+           container_of(attr, struct vnic_sysfs_attr, dentry);
+       struct vnic_login *login = vnic_dentry->ctx;
+       struct fip_vnic_data *vnic_fip = login->fip_vnic;
+       int rc, eport_connected = test_bit(VNIC_STATE_LOGIN_CREATE_2, &vnic_fip->login_state);
+       u16 pkey_used = 0;
+       int lag_gw;
+       int ret;
+
+       ASSERT(login->dev);
+       ASSERT(login->port->dev->ca);
+
+       /* NETDEV attributes */
+       p += _sprintf(p, buf, "NETDEV_NAME   %s\n", login->dev->name);
+       p += _sprintf(p, buf, "NETDEV_LINK   %s\n",
+                    netif_carrier_ok(login->dev) ? "up" : "down");
+       p += _sprintf(p, buf, "NETDEV_OPEN   %s\n",
+                    (login->dev->flags & IFF_UP) ? "yes" : "no");
+       p += _sprintf(p, buf, "NETDEV_QSTOP  %s\n",
+                    netif_queue_stopped(login->dev) ? "yes" : "no");
+       p += _sprintf(p, buf, "NETDEV_MTU    %d/%d\n",
+                     (int)login->dev->mtu,
+                     (int)login->max_mtu);
+
+       /* IOA attributes */
+       p += _sprintf(p, buf, "IOA_PORT      %s:%d\n",
+                     login->port->dev->ca->name,
+                     login->port->num);
+       p += _sprintf(p, buf, "IOA_NAME      %s\n",
+                     login->desc);
+       p += _sprintf(p, buf, "IOA_LID       0x%04x\n", login->port->attr.lid);
+       p += _sprintf(p, buf, "IOA_GUID      "VNIC_GUID_FMT"\n",
+                    VNIC_GUID_RAW_ARG(login->port->gid.raw + 8));
+       p += _sprintf(p, buf, "IOA_LOG_LINK  %s\n",
+                    port_phys_state_str(login->port->attr.phys_state));
+       p += _sprintf(p, buf, "IOA_PHY_LINK  %s\n",
+                    port_state_str(login->port->attr.state));
+       p += _sprintf(p, buf, "IOA_MTU       %d\n", login->port->max_mtu_enum);
+
+
+       /* EPORT and BX attributes */
+       if (no_bxm) {
+               p += _sprintf(p, buf, "EPORT_STATE   %s\n", "bridgeless");
+       } else if (vnic_fip) {
+               p += _sprintf(p, buf, "EPORT_STATE   %s\n",
+                             !eport_connected ? "disconnected" :
+                             (fip_vnic_get_eport_state(vnic_fip) ?
+                              "up" : "down"));
+               p += _sprintf(p, buf, "EPORT_NAME    %s\n",
+                             fip_vnic_get_eport_name(vnic_fip, tmp_line) ?
+                             NOT_AVAILABLE_STRING : tmp_line);
+               p += _sprintf(p, buf, "EPORT_QPN     0x%06x\n",
+                             login->gw_neigh ? login->gw_neigh->qpn : 0);
+               p += _sprintf(p, buf, "EPORT_LID     0x%04x\n",
+                             login->gw_neigh ? login->gw_neigh->lid : 0);
+               p += _sprintf(p, buf, "EPORT_ID      %u\n", login->gw_port_id);
+
+               p += _sprintf(p, buf, "BX_NAME       %s\n",
+                             fip_vnic_get_bx_name(vnic_fip, tmp_line) ?
+                             NOT_AVAILABLE_STRING : tmp_line);
+               fip_vnic_get_bx_guid(vnic_fip, tmp_line);
+               if (*((u64 *)tmp_line) == 0)
+                       p += _sprintf(p, buf, "BX_GUID       %s\n", NOT_AVAILABLE_STRING);
+               else
+                       p += _sprintf(p, buf, "BX_GUID       "VNIC_GUID_FMT"\n",
+                                     VNIC_GUID_RAW_ARG(tmp_line));
+
+               lag_gw = fip_vnic_get_gw_type(vnic_fip);
+               if (lag_gw) {
+                       p += _sprintf(p, buf, "GW_TYPE       LAG\n");
+                       ret = fip_vnic_get_lag_eports(vnic_fip, p);
+                       p += (ret > 0) ? ret : 0;
+               } else
+                       p += _sprintf(p, buf, "GW_TYPE       LEGACY\n");
+
+               rc = fip_vnic_get_all_vlan_mode(vnic_fip, tmp_line);
+               p += _sprintf(p, buf, "ALL_VLAN      %s\n",
+                             rc < 0 ? NOT_AVAILABLE_STRING : tmp_line);
+
+       } else {
+               p += _sprintf(p, buf, "EPORT_STATE %s\n", "error");
+       }
+
+       /* misc attributes*/
+       p += _sprintf(p, buf, "SW_RSS        %s\n",
+                     !eport_connected ? NOT_AVAILABLE_STRING :
+                     ((login->qps_num > 1) ? "yes" : "no"));
+       p += _sprintf(p, buf, "SW_RSS_SIZE   %u\n", login->qps_num);
+       p += _sprintf(p, buf, "RX_RINGS_NUM  %d\n", login->rx_rings_num);
+       p += _sprintf(p, buf, "RX_RINGS_LIN  %s\n",
+                     login->port->rx_ring[0]->log_rx_info ? "no" : "yes");
+       p += _sprintf(p, buf, "TX_RINGS_NUM  %d\n", login->tx_rings_num);
+       p += _sprintf(p, buf, "TX_RINGS_ACT  %d\n",
+                     VNIC_TXQ_GET_ACTIVE(login));
+       p += _sprintf(p, buf, "NDO_TSS       %s\n",
+                     (login->ndo_tx_rings_num > 1) ? "yes" : "no");
+       p += _sprintf(p, buf, "NDO_TSS_SIZE  %u\n", login->ndo_tx_rings_num);
+       p += _sprintf(p, buf, "MCAST_PROMISC %s\n",
+                     !eport_connected ? NOT_AVAILABLE_STRING :
+                     (is_mcast_promisc(login) ? "yes" : "no"));
+       p += _sprintf(p, buf, "UCAST_PROMISC %s\n",
+                     (is_ucast_promisc(login) ? "yes" : "no"));
+       p += _sprintf(p, buf, "MCAST_MASK    %d\n", login->n_mac_mcgid);
+       p += _sprintf(p, buf, "CHILD_VNICS   %d/%d\n",
+                     atomic_read(&login->vnic_child_cnt),
+                     vnic_child_max);
+       p += _sprintf(p, buf, "PKEY          0x%04x\n", login->pkey);
+       p += _sprintf(p, buf, "PKEY_INDEX    0x%04x\n", login->pkey_index);
+       rc = ib_query_pkey(login->port->dev->ca, login->port->num,
+                          login->pkey_index, &pkey_used);
+       p += _sprintf(p, buf, "PKEY_MEMBER   %s\n",
+                     (rc || !eport_connected) ? NOT_AVAILABLE_STRING :
+                     ((pkey_used & 0x8000) ? "full" : "partial"));
+       p += _sprintf(p, buf, "SL_DATA       %u\n", login->sl);
+       p += _sprintf(p, buf, "SL_CONTROL    %u\n",
+                     vnic_fip ? fip_vnic_get_bx_sl(vnic_fip) : 0);
+#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)
+       p += _sprintf(p, buf, "GRO           %s\n",
+                     login->dev->features & NETIF_F_GRO ? "yes" : "no");
+#elif defined(NETIF_F_LRO)
+       p += _sprintf(p, buf, "LRO           %s\n",
+                     login->dev->features & NETIF_F_LRO ? "yes" : "no");
+       p += _sprintf(p, buf, "LRO_NUM       %d\n", login->lro_num);
+#endif
+       p += _sprintf(p, buf, "NAPI          %s\n",
+                     login->napi_num ? "yes" : "no");
+       p += _sprintf(p, buf, "NAPI_WEIGHT   %u\n",
+                     login->napi_num ? vnic_napi_weight : 0);
+       p += _sprintf(p, buf, "QPN           0x%x\n",
+                     login->qp_base_num);
+       p += _sprintf(p, buf, "MAC           "MAC_6_PRINT_FMT"\n",
+                    MAC_6_PRINT_ARG(login->dev_addr));
+       p += _sprintf(p, buf, "VNIC_ID       %d\n",
+                     vnic_fip ? vnic_fip->vnic_id : 0);
+       p += _sprintf(p, buf, "ADMIN_MODE    %s\n",
+                     !vnic_fip ? NOT_AVAILABLE_STRING :
+                     (vnic_fip->hadmined ? "host" : "network"));
+
+       if (vnic_fip && vnic_fip->vlan_used)
+               p += _sprintf(p, buf, "VLAN          0x%03x\n", vnic_fip->vlan);
+       else
+               p += _sprintf(p, buf, "VLAN          %s\n", NOT_AVAILABLE_STRING);
+
+       if (vnic_fip && vnic_fip->shared_vnic.enabled) {
+               p += _sprintf(p, buf, "SHARED_MAC    "MAC_6_PRINT_FMT"\n",
+                             MAC_6_PRINT_ARG(vnic_fip->shared_vnic.emac));
+               p += _sprintf(p, buf, "SHARED_IP     "IP_4_PRINT_FMT"\n",
+                             IP_4_PRINT_ARG(vnic_fip->shared_vnic.ip));
+       } else {
+               p += _sprintf(p, buf, "SHARED_MAC    %s\n", NOT_AVAILABLE_STRING);
+               p += _sprintf(p, buf, "SHARED_IP     %s\n", NOT_AVAILABLE_STRING);
+       }
+
+       return (ssize_t)(p - buf);
+}
+
+static ssize_t vnic_qps_show(struct module_attribute *attr,
+                            __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+       char *p = buf;
+       struct vnic_sysfs_attr *vnic_dentry =
+           container_of(attr, struct vnic_sysfs_attr, dentry);
+       struct vnic_login *login = vnic_dentry->ctx;
+       struct ib_qp *qp;
+       struct ib_qp_attr query_attr;
+       struct ib_qp_init_attr query_init_attr;
+       int i, mask = -1;
+
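+       /* query each data QP with attr mask -1 (all attributes); QPs whose
+        * query fails are simply skipped in the dump
+        */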
+       for (i = 0; i < login->qps_num; ++i) {
+               qp = login->qp_res[i].qp;
+               if (ib_query_qp(qp, &query_attr, mask, &query_init_attr))
+                       continue;
+               p += _sprintf(p, buf, "QP_INDEX         %d\n", i);
+               p += _sprintf(p, buf, "QP_NUM           0x%06x\n", qp->qp_num);
+               p += _sprintf(p, buf, "QP_QKEY          0x%08x\n", query_attr.qkey);
+               p += _sprintf(p, buf, "QP_STATE         0x%02x\n", query_attr.qp_state);
+               p += _sprintf(p, buf, "QP_RX_RING       %d\n", i % login->rx_rings_num);
+               p += _sprintf(p, buf, "QP_PTR           %p\n", qp);
+               p += _sprintf(p, buf, "QP_RX_SRQ_PTR    %p\n", qp->srq);
+               p += _sprintf(p, buf, "QP_RX_CQ_PTR     %p\n", qp->recv_cq);
+               p += _sprintf(p, buf, "QP_TX_CQ_PTR     %p\n", qp->send_cq);
+               p += _sprintf(p, buf, "\n");
+       }
+
+       return (ssize_t)(p - buf);
+}
+
+static char *vnic_state_2str(enum fip_vnic_state state)
+{
+       switch (state) {
+       case FIP_VNIC_CLOSED: return "CLOSED";
+       case FIP_VNIC_CONNECTED: return "CONNECTED";
+       case FIP_VNIC_HADMIN_IDLE: return "HADMIN_IDLE";
+       case FIP_VNIC_LOGIN: return "LOGIN";
+       case FIP_VNIC_MCAST_INIT: return "MCAST_INIT";
+       case FIP_VNIC_MCAST_INIT_DONE: return "MCAST_INIT_DONE";
+       case FIP_VNIC_RINGS_INIT: return "RINGS_INIT";
+       case FIP_VNIC_VHUB_DONE: return "VHUB_DONE";
+       case FIP_VNIC_VHUB_INIT: return "VHUB_INIT";
+       case FIP_VNIC_VHUB_INIT_DONE: return "VHUB_INIT_DONE";
+       case FIP_VNIC_VHUB_WRITE: return "VHUB_WRITE";
+       case FIP_VNIC_WAIT_4_ACK: return "WAIT_4_ACK";
+       }
+
+       return "UNKNOWN";
+}
+
+int port_vnics_sysfs_show(struct vnic_port *port, char *buf)
+{
+       struct fip_gw_data *gw;
+       char *p = buf;
+       struct fip_discover *discover;
+       struct fip_vnic_data *vnic;
+
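+       /* one line per vNic: BX name, eport name, device:port, vNic name,
+        * vNic id, admin mode and FSM state
+        */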
+       mutex_lock(&port->start_stop_lock);
+       list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+
+               down_read(&discover->l_rwsem);
+
+               list_for_each_entry(gw, &discover->gw_list, list) {
+                       list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+                               p += _sprintf(p, buf, "%-15s\t%-10s\t%10s:%d  %-10s\t%.7d\t%-10s\t%s\n",
+                                             gw->info.vol_info.system_name,
+                                             gw->info.vol_info.gw_port_name,
+                                             gw->discover->port->dev->ca->name,
+                                             gw->discover->port->num,
+                                             vnic->name,
+                                             vnic->vnic_id,
+                                             vnic->hadmined ? "HOSTADMIN" : "NETADMIN",
+                                             vnic_state_2str(vnic->state));
+                       }
+               }
+
+               up_read(&discover->l_rwsem);
+       }
+
+       mutex_unlock(&port->start_stop_lock);
+       return (p - buf);
+}
+
+#ifdef VNIC_PROFILLNG
+static ssize_t vnic_dentry_prof_skb_show(struct module_attribute *attr,
+                                    __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+       char *p = buf;
+       struct vnic_sysfs_attr *vnic_dentry =
+              container_of(attr, struct vnic_sysfs_attr, dentry);
+       struct vnic_login *login = vnic_dentry->ctx;
+       struct sk_buff *skb;
+       int i;
+
+       for (i = 0; i < VNIC_PROFILLNG_SKB_MAX; ++i) {
+               if (!login->prof_arr[i].cnt)
+                       continue;
+               skb = &login->prof_arr[i].skb;
+               p += _sprintf(p, buf, "==============\n");
+               p += _sprintf(p, buf, "SKB[%d] CNT %d\n", i, login->prof_arr[i].cnt);
+               p += _sprintf(p, buf, "len         %d\n", skb->len);
+               p += _sprintf(p, buf, "data_len    %d\n", skb->data_len);
+               p += _sprintf(p, buf, "head_len    %d\n", skb_headlen(skb));
+               p += _sprintf(p, buf, "gso         %d\n", skb_is_gso(skb));
+               p += _sprintf(p, buf, "nr_frags    %d\n", login->prof_arr[i].nr_frags);
+               p += _sprintf(p, buf, "jiffies     %lu\n", login->prof_arr[i].jiffies);
+               p += _sprintf(p, buf, "msecs       %u\n",
+                             jiffies_to_msecs(login->prof_arr[i].jiffies));
+               p += _sprintf(p, buf, "msecs_diff  %u\n",
+                             jiffies_to_msecs(login->prof_arr[i].jiffies) -
+                             jiffies_to_msecs(login->prof_arr[i ? i - 1 : 0].jiffies));
+       }
+
+       return (ssize_t)(p - buf);
+}
+
+#endif
+
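+/* sysfs input parsers: a GUID is 8 colon-separated hex bytes, a MAC is 6,
+ * an IPv4 address is dotted decimal
+ */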
+static int get_guid(u8 *guid, char *s)
+{
+       if (sscanf(s, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+                  guid + 0, guid + 1, guid + 2, guid + 3, guid + 4,
+                  guid + 5, guid + 6, guid + 7) != 8)
+               return -1;
+
+       return 0;
+}
+
+static int get_mac(u8 *mac, char *s)
+{
+       if (sscanf(s, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx",
+                  mac + 0, mac + 1, mac + 2, mac + 3, mac + 4,
+                  mac + 5) != 6)
+               return -1;
+
+       return 0;
+}
+
+static int get_ipv4(short unsigned int *ip, char *s)
+{
+       if (sscanf(s, "%hu.%hu.%hu.%hu", ip + 0, ip + 1, ip + 2, ip + 3) != 4)
+               return -1;
+
+       return 0;
+}
+
+static int get_parent(struct vnic_port *port, char *parent)
+{
+       struct net_device *parent_netdev;
+
+       /* check parent syntax */
+       if (!dev_valid_name(parent))
+               return -EINVAL;
+
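+       /* only verify that the parent netdev exists; the reference is
+        * dropped right away
+        */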
+       parent_netdev = dev_get_by_name(&init_net, parent);
+       if (parent_netdev)
+               dev_put(parent_netdev);
+
+       return parent_netdev ? 0 : -ENODATA;
+}
+
+static struct fip_hadmin_cache *get_hadmin_entry(void)
+{
+       struct fip_hadmin_cache *hadmin_entry;
+
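+       /* GFP_ATOMIC: this may be called from a non-sleeping context
+        * (see the child vNic flow in fip_hadmin_sysfs_update())
+        */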
+       hadmin_entry = kzalloc(sizeof *hadmin_entry, GFP_ATOMIC);
+       if (!hadmin_entry)
+               return NULL;
+
+       hadmin_entry->vnic_id = NOT_AVAILABLE_NUM;
+       hadmin_entry->gw_port_id = NOT_AVAILABLE_NUM;
+
+       return hadmin_entry;
+}
+
+void vnic_login_cmd_init(struct fip_hadmin_cmd *cmd)
+{
+       char *buf = (char *)cmd;
+       u8 i;
+
+       for (i = 0; i < MAX_INPUT_ARG; ++i)
+               sprintf(buf + (i * MAX_INPUT_LEN),  NOT_AVAILABLE_STRING);
+}
+
+int vnic_login_cmd_set(char *buf, struct fip_hadmin_cmd *cmd)
+{
+       int count;
+
+       if (cmd) {
+               count = sprintf(buf, "name=%s mac=%s vnic_id=%s vid=%s "
+                               "bxname=%s bxguid=%s eport=%s ipv4=%s ipv6=%s "
+                               "emac=%s pkey=%s parent=%s\n",
+                               cmd->c_name, cmd->c_mac, cmd->c_vnic_id,
+                               cmd->c_vid, cmd->c_bxname, cmd->c_bxguid,
+                               cmd->c_eport, cmd->c_ipv4, cmd->c_ipv6,
+                               cmd->c_emac, cmd->c_pkey, cmd->c_parent);
+               vnic_dbg_sysfs((char *)(cmd->c_name), "cmd: %s", buf);
+       } else /* print the cmd syntax */
+               count = sprintf(buf, "name=%%s mac=%%s vnic_id=%%s vid=%%s "
+                               "bxname=%%s bxguid=%%s eport=%%s ipv4=%%s "
+                               "ipv6=%%s emac=%%s pkey=%%s parent=%%s\n");
+
+       return count;
+}
+
+/* create/destroy child vNic; syntax example:
+ * +00:11:22:33:44:55
+ */
+static ssize_t vnic_child_write(struct module_attribute *attr,
+                               __MODULE_KOBJ_TYPE *mod,
+                               const char *buf, size_t count)
+{
+       struct vnic_sysfs_attr *vnic_dentry =
+           container_of(attr, struct vnic_sysfs_attr, dentry);
+       struct vnic_login *login = vnic_dentry->ctx;
+       char action = buf[0];
+       char *buf_mac = (char *)buf + 1;
+       int remove = -1;
+       u8 mac[ETH_ALEN];
+
+       if (action == '-')
+               remove = 1;
+       if (action == '+')
+               remove = 0;
+
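+       /* '+' learns (adds) the child MAC, '-' unlearns (removes) it;
+        * anything else is rejected
+        */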
+       if (remove < 0 || get_mac(mac, buf_mac) || !is_valid_ether_addr(mac))
+               return -EINVAL;
+
+       vnic_learn_mac(login->dev, mac, remove);
+       return count;
+}
+
+int fip_hadmin_sysfs_update(struct vnic_port *port,
+                           const char *buf, int count, int remove)
+{
+       struct fip_discover *discover;
+       struct fip_hadmin_cache *hadmin_entry, *hadmin_it;
+       struct fip_hadmin_cmd *cmd;
+       char *name = NULL;
+       int rc, num;
+       u16 pkey;
+
+       hadmin_entry = get_hadmin_entry();
+       if (!hadmin_entry) {
+               rc = -ENOMEM;
+               vnic_dbg_sysfs(port->name, "get_hadmin_entry failed\n");
+               goto err;
+       }
+
+       cmd = &hadmin_entry->cmd;
+       rc = sscanf(buf, "name=%s mac=%s vnic_id=%s vid=%s bxname=%s bxguid=%s "
+                   "eport=%s ipv4=%s ipv6=%s emac=%s pkey=%s parent=%s",
+                   cmd->c_name, cmd->c_mac, cmd->c_vnic_id, cmd->c_vid,
+                   cmd->c_bxname, cmd->c_bxguid, cmd->c_eport, cmd->c_ipv4,
+                   cmd->c_ipv6, cmd->c_emac, cmd->c_pkey, cmd->c_parent);
+       if (rc != MAX_INPUT_ARG) {
+               vnic_dbg_sysfs(port->name, "sscanf failed, rc %d\n", rc);
+               rc = -EINVAL;
+               goto err;
+       } else
+               name = (char *)(cmd->c_name);
+
+       /* get parent name */
+       if (!dev_valid_name(cmd->c_parent))
+               hadmin_entry->parent_used = 0;
+       else if (remove || !get_parent(port, cmd->c_parent)) {
+               vnic_dbg_sysfs(name, "parent set %s\n", cmd->c_parent);
+               strncpy(hadmin_entry->parent_name, cmd->c_parent,
+                       sizeof(hadmin_entry->parent_name));
+               hadmin_entry->parent_used = 1;
+       } else {
+               vnic_warn(name, "invalid parent name %s\n", cmd->c_parent);
+               rc = -EINVAL;
+               goto err;
+       }
+
+       /* get vNic ID dec (must) */
+       if (sscanf(cmd->c_vnic_id, "%d", &num) != 1) {
+               /* abort on failure */
+               vnic_warn(name, "invalid vNic ID %s\n", cmd->c_vnic_id);
+               rc = -EINVAL;
+               goto err;
+       }
+       hadmin_entry->vnic_id = (u16)num;
+
+       /* get vNic MAC (must) */
+       if (get_mac(hadmin_entry->mac, cmd->c_mac)) {
+               vnic_warn(name, "invalid vNic MAC %s\n", cmd->c_mac);
+               rc = -EINVAL;
+               goto err;
+       }
+
+       /* get interface name (must) */
+       if ((!dev_valid_name(cmd->c_name) && !hadmin_entry->parent_used) ||
+           ((strlen(cmd->c_name) > VNIC_NAME_LEN) && hadmin_entry->parent_used)) {
+               vnic_warn(name, "invalid vNic name %s\n", cmd->c_name);
+               rc = -EINVAL;
+               goto err;
+       }
+
+       strncpy(hadmin_entry->interface_name, cmd->c_name,
+               sizeof(hadmin_entry->interface_name));
+
+       /* get BX GUID, if fails, get BX NAME */
+       if (get_guid(hadmin_entry->system_guid, cmd->c_bxguid)) {
+               strncpy(hadmin_entry->system_name, cmd->c_bxname,
+                       sizeof(hadmin_entry->system_name));
+               vnic_dbg_sysfs(name, "use BX NAME %s\n", cmd->c_bxname);
+       }
+
+       /* get shared emac/ip */
+       if (!get_ipv4((short unsigned int *)hadmin_entry->shared_vnic_ip,
+                     cmd->c_ipv4)) {
+               /* TODO, add IPv6 support for shared vNic */
+               get_mac(hadmin_entry->shared_vnic_mac, cmd->c_emac);
+               vnic_dbg_sysfs(name, "use shared ip/mac\n");
+       }
+
+#ifndef VLAN_GROUP_ARRAY_LEN
+#define VLAN_GROUP_ARRAY_LEN VLAN_N_VID
+#endif
+
+       /* get VLAN field (dec) */
+       if ((sscanf(cmd->c_vid, "%d", &num) == 1) &&
+           num < VLAN_GROUP_ARRAY_LEN && num >= 0) {
+               /* set other fields on success, skip on failure */
+               hadmin_entry->vlan_used = 1;
+               hadmin_entry->vlan = (u16)num;
+               vnic_dbg_sysfs(name, "vlan set 0x%x\n", hadmin_entry->vlan);
+       } else if (!strcmp(cmd->c_vid, ALL_VLAN_GW_VID)) {
+               /* don't set 'vlan_used'; the code relies on it staying 0 for
+                * host admin vNics in all_vlan mode, when VLANs are used */
+               hadmin_entry->vlan = 0;
+               hadmin_entry->all_vlan_gw = 1;
+       }
+
+       /* get eport name */
+       if (!strlen(cmd->c_eport)) {
+               vnic_warn(name, "invalid eport name %s\n", cmd->c_eport);
+               rc = -EINVAL;
+               goto err;
+       }
+       strncpy(hadmin_entry->eport_name, cmd->c_eport,
+               sizeof(hadmin_entry->eport_name));
+
+       /* set remove/add flag */
+       vnic_dbg_sysfs(name, "%s hadmin vNic\n", remove ? "remove" : "add");
+       hadmin_entry->remove = remove;
+
+       /* set pkey (hex) */
+       if ((sscanf(cmd->c_pkey, "%x", &num) != 1) || !num)
+               pkey = 0xffff; /* default */
+       else
+               pkey = (u16)num | 0x8000;
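+       /* 0x8000 is the pkey full-membership bit; 0xffff is the default
+        * full-membership pkey
+        */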
+       vnic_dbg_sysfs(name, "pkey 0x%x\n", pkey);
+
+       /* cannot sleep in this function for the child vNics flow
+        * (avoid a "scheduling while atomic" oops)
+        * TODO: check if holding start_stop_lock is needed here
+        */
+       //mutex_lock(&port->start_stop_lock);
+
+       list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+               if (discover->pkey == pkey) {
+                       spin_lock_irq(&discover->lock);
+
+                       if (discover->flush != FIP_NO_FLUSH) {
+                               rc = -EBUSY;
+                               spin_unlock_irq(&discover->lock);
+                               goto skip;
+                       }
+
+                       /* check that this mac/vlan is not in the cache list
+                        * (saves redundant queue_delayed_work call during
+                        * vnic_learn_mac bursts)
+                        */
+                       list_for_each_entry_reverse(hadmin_it, &discover->hadmin_cache, next) {
+                               if (!memcmp(hadmin_entry->mac, hadmin_it->mac, ETH_ALEN) &&
+                                   hadmin_entry->vlan == hadmin_it->vlan &&
+                                   hadmin_entry->remove == hadmin_it->remove) {
+                                       rc = -EEXIST;
+                                       spin_unlock_irq(&discover->lock);
+                                       goto skip;
+                               }
+                       }
+                       list_add_tail(&hadmin_entry->next, &discover->hadmin_cache);
+                       /* calls fip_discover_hadmin_update() */
+                       queue_delayed_work(fip_wq, &discover->hadmin_update_task, HZ/10);
+                       spin_unlock_irq(&discover->lock);
+                       goto updated_discover;
+               }
+       }
+
+       //mutex_unlock(&port->start_stop_lock);
+       vnic_dbg_sysfs(name, "Requested PKEY=0x%x is not configured\n", pkey);
+       goto skip;
+
+err:
+       vnic_dbg_sysfs(name, "Invalid host admin request format string. Request rejected\n");
+skip:
+       kfree(hadmin_entry);
+       return rc;
+
+updated_discover:
+       //mutex_unlock(&port->start_stop_lock);
+       return count;
+}
+
+static ssize_t vnic_login_cmd(struct module_attribute *attr,
+                             __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+       char *p = buf;
+       struct vnic_sysfs_attr *vnic_dentry =
+               container_of(attr, struct vnic_sysfs_attr, dentry);
+       struct fip_vnic_data *vnic_fip = vnic_dentry->ctx;
+       struct fip_hadmin_cmd *cmd;
+
+       if (!vnic_fip || !vnic_fip->hadmined)
+               goto out;
+
+       cmd = &vnic_fip->cmd;
+       p += _sprintf(p, buf, "name=%s mac=%s vnic_id=%s vid=%s bxname=%s bxguid=%s "
+                     "eport=%s ipv4=%s ipv6=%s emac=%s pkey=%s parent=%s ",
+                     cmd->c_name, cmd->c_mac, cmd->c_vnic_id, cmd->c_vid,
+                     cmd->c_bxname, cmd->c_bxguid, cmd->c_eport, cmd->c_ipv4,
+                     cmd->c_ipv6, cmd->c_emac, cmd->c_pkey, cmd->c_parent);
+       p += _sprintf(p, buf, "ib_port=%s", vnic_fip->port->name);
+       p += _sprintf(p, buf, "\n");
+
+out:
+       return (ssize_t)(p - buf);
+}
+
+int vnic_create_hadmin_dentry(struct fip_vnic_data *vnic)
+{
+       char name[VNIC_SYSFS_FLEN];
+
+       DENTRY_CREATE(vnic, &vnic->dentry,
+                     vnic_dentry_name(name, vnic, "cmd"),
+                     vnic_login_cmd, NULL);
+       return 0;
+}
+
+void vnic_delete_hadmin_dentry(struct fip_vnic_data *vnic)
+{
+       if (vnic->dentry.ctx)
+               DENTRY_REMOVE(&vnic->dentry);
+}
+
+int vnic_create_dentry(struct vnic_login *login)
+{
+       int i = 0;
+       char name[VNIC_SYSFS_FLEN];
+
+       DENTRY_CREATE(login, &login->dentries[i++],
+                     login_dentry_name(name, login, "info"),
+                     vnic_login_show, NULL);
+       DENTRY_CREATE(login, &login->dentries[i++],
+                     login_dentry_name(name, login, "child"),
+                     NULL, vnic_child_write);
+       DENTRY_CREATE(login, &login->dentries[i++],
+                     login_dentry_name(name, login, "neigh"),
+                     vnic_neigh_show, NULL);
+       DENTRY_CREATE(login, &login->dentries[i++],
+                     login_dentry_name(name, login, "qps"),
+                     vnic_qps_show, NULL);
+       DENTRY_CREATE(login, &login->dentries[i++],
+                     login_dentry_name(name, login, "member"),
+                     vnic_member_show, NULL);
+
+#ifdef VNIC_PROFILLNG
+       DENTRY_CREATE(login, &login->dentries[i++],
+                     login_dentry_name(name, login, "prof_skb"),
+                     vnic_dentry_prof_skb_show, NULL);
+#endif
+       return 0;
+}
+
+void vnic_delete_dentry(struct vnic_login *login)
+{
+       int i;
+
+       for (i = 0; i < VNIC_MAX_DENTRIES; ++i) {
+               if (login->dentries[i].ctx)
+                       DENTRY_REMOVE(&login->dentries[i]);
+       }
+}
+
+static ssize_t port_gw_fs_show(struct module_attribute *attr,
+                              __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+       struct vnic_sysfs_attr *vnic_dentry =
+               container_of(attr, struct vnic_sysfs_attr, dentry);
+       struct vnic_port *port = vnic_dentry->ctx;
+
+       return fip_gw_sysfs_show(port, buf);
+}
+
+static ssize_t port_vnics_fs_show(struct module_attribute *attr,
+                              __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+       struct vnic_sysfs_attr *vnic_dentry =
+               container_of(attr, struct vnic_sysfs_attr, dentry);
+       struct vnic_port *port = vnic_dentry->ctx;
+
+       return port_vnics_sysfs_show(port, buf);
+}
+
+static ssize_t port_hadmin_syntax(struct module_attribute *attr,
+                                 __MODULE_KOBJ_TYPE *mod, char *buf)
+{
+       /* print cmd syntax only (for usage) */
+       return vnic_login_cmd_set(buf, NULL);
+}
+
+static ssize_t port_hadmin_add_write(struct module_attribute *attr,
+                                    __MODULE_KOBJ_TYPE *mod,
+                                    const char *buf, size_t count)
+{
+       struct vnic_sysfs_attr *vnic_dentry =
+               container_of(attr, struct vnic_sysfs_attr, dentry);
+       struct vnic_port *port = vnic_dentry->ctx;
+
+       return fip_hadmin_sysfs_update(port, buf, count, 0);
+}
+
+static ssize_t port_hadmin_del_write(struct module_attribute *attr,
+                                    __MODULE_KOBJ_TYPE *mod,
+                                    const char *buf, size_t count)
+{
+       struct vnic_sysfs_attr *vnic_dentry =
+               container_of(attr, struct vnic_sysfs_attr, dentry);
+       struct vnic_port *port = vnic_dentry->ctx;
+
+       return fip_hadmin_sysfs_update(port, buf, count, 1);
+}
+
+int port_fs_init(struct vnic_port *port)
+{
+       int i = 0;
+       char name[VNIC_SYSFS_FLEN];
+
+       DENTRY_CREATE(port, &port->dentries[i++],
+                     port_dentry_name(name, port, "host_add"),
+                     port_hadmin_syntax, port_hadmin_add_write);
+
+       DENTRY_CREATE(port, &port->dentries[i++],
+                     port_dentry_name(name, port, "host_del"),
+                     port_hadmin_syntax, port_hadmin_del_write);
+
+       DENTRY_CREATE(port, &port->dentries[i++],
+                     port_dentry_name(name, port, "gws"),
+                     port_gw_fs_show, NULL);
+
+       DENTRY_CREATE(port, &port->dentries[i++],
+                     port_dentry_name(name, port, "vnics"),
+                     port_vnics_fs_show, NULL);
+       return 0;
+}
+
+void port_fs_exit(struct vnic_port *port)
+{
+       int i;
+
+       for (i = 0; i < VNIC_MAX_DENTRIES; ++i) {
+               if (port->dentries[i].ctx)
+                       DENTRY_REMOVE(&port->dentries[i]);
+       }
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ib.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_ib.c
new file mode 100644 (file)
index 0000000..ba6e93b
--- /dev/null
@@ -0,0 +1,1649 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+#include <rdma/ib_cache.h>
+#include <net/ip6_checksum.h>
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+int vnic_post_recv(struct vnic_rx_ring *ring, u64 wr_id)
+{
+       struct ib_recv_wr *bad_wr;
+       int i, rc;
+
+       ring->wr.wr_id = wr_id;
+
+       for (i = 0; i < ring->num_frags; i++)
+               ring->sge[i].addr = ring->rx_info[wr_id].dma_addr[i];
+
+       rc = ib_post_srq_recv(ring->srq, &ring->wr, &bad_wr);
+       if (unlikely(rc)) {
+               /* we will not use a lock here. In the worst case we will have
+                * an incorrect value of need_refill. Not a biggie
+                */
+
+               /*ring->rx_info[wr_id].info = VNIC_FRAG_NOT_POSTED;
+                  ring->need_refill = 1;
+                */
+               vnic_dbg_data(ring->port->name, "receive failed for buf %llu (%d)\n",
+                             wr_id, rc);
+       }
+
+       return rc;
+}
+
+static void vnic_dealloc_tx_skb(struct vnic_login *login, unsigned cq_index,
+                               u64 wr_id)
+{
+       struct vnic_tx_res *tx_res = &login->tx_res[cq_index];
+       int is_inline = !!(wr_id & VNIC_SEND_INLINE_FLAG);
+       struct sk_buff *skb;
+       u64 *mapping;
+       int i, off = 0;
+
+       wr_id &= ~VNIC_SEND_INLINE_FLAG;
+       skb = tx_res->tx_ring[wr_id].skb;
+       ASSERT(skb);
+       mapping = tx_res->tx_ring[wr_id].mapping;
+
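+       /* for non-inline sends, undo the DMA mappings created at xmit time:
+        * optional encap header, linear head, then the page fragments
+        */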
+       if (!is_inline) {
+               if (!vnic_encap_headroom && !skb_is_gso(skb)) {
+                       ib_dma_unmap_single(login->port->dev->ca, mapping[off],
+                                           VNIC_ENCAP_LEN, DMA_TO_DEVICE);
+                       off++;
+               }
+               if (skb_headlen(skb)) {
+                       ib_dma_unmap_single(login->port->dev->ca, mapping[off],
+                                           skb_headlen(skb), DMA_TO_DEVICE);
+                       off++;
+               }
+               for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+                       skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+                       ib_dma_unmap_page(login->port->dev->ca,
+                                         mapping[i + off], frag->size,
+                                         DMA_TO_DEVICE);
+               }
+       }
+
+       /* dealloc skb */
+       dev_kfree_skb_any(skb);
+       tx_res->tx_ring[wr_id].skb = NULL;
+}
+
+static void vnic_ib_handle_tx_wc(struct vnic_login *login,
+                                int tx_res_index, struct ib_wc *wc)
+{
+       struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+       u64 wr_id = wc->wr_id & ~VNIC_SEND_INLINE_FLAG;
+
+       vnic_dbg_data(login->name, "send completion: wr_id %llu, status: %d "
+                     "[head %d - tail %d]\n", wr_id, wc->status,
+                     tx_res->tx_head, tx_res->tx_tail);
+
+       ASSERT(wr_id < vnic_tx_rings_len);
+       vnic_dealloc_tx_skb(login, tx_res_index, wc->wr_id);
+
+       ++tx_res->tx_tail;
+       --tx_res->tx_outstanding;
+
+       if (unlikely(wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)) {
+               vnic_warn(login->name, "failed send event "
+                         "(status %d, wr_id %llu, vend_err 0x%x)\n",
+                         wc->status, wr_id, wc->vendor_err);
+               vnic_warn(login->name, "TX CQE error, queueing rings restart\n");
+               if (!login->queue_stopped)
+                       queue_delayed_work(login_wq, &login->restart_task, HZ / 100);
+       }
+}
+
+int vnic_post_recvs(struct vnic_rx_ring *ring)
+{
+       int i, rc;
+
+       for (i = 0; i < ring->size; i++) {
+               rc = vnic_post_recv(ring, i);
+               if (rc) {
+                       vnic_err(ring->port->name, "Failed post receive %d\n", rc);
+                       return rc;
+               }
+       }
+
+       return 0;
+}
+
+static int vnic_vlan_is_valid(struct vnic_login *login,
+                             struct vlan_ethhdr *veth)
+{
+       ASSERT(veth->h_vlan_proto == htons(ETH_P_8021Q));
+       if ((be16_to_cpu(veth->h_vlan_TCI) & 0xfff) !=
+           be16_to_cpu(login->vid)) {
+               vnic_dbg_data(login->name, "invalid vlan, ingress vid "
+                             "0x%x, login: vid 0x%x vlan_used %d\n",
+                             be16_to_cpu(veth->h_vlan_TCI),
+                             be16_to_cpu(login->vid),
+                             login->vlan_used);
+               return 0;
+       }
+
+       return 1;
+}
+
+/* If a vlan tag should exist in the eth_hdr - validate it.
+   is_vlan_proto is set if the vlan protocol is present in the eth header.
+   Return values: 0 on success, 1 on error.
+   For the all-vlans gateway (promisc vlan):
+       0 - there is no vlan, or there is a vlan and it is valid
+       1 - a vlan is present and not valid.
+   For all other vlans:
+       0 - there shouldn't be a vlan, or a vlan is present and is valid.
+       1 - a vlan should be present and it is not, or it is not valid. */
+static int validate_vnic_vlan(struct vnic_login *login,
+                             struct vlan_ethhdr *veth,
+                             int *is_vlan_proto)
+{
+       int is_vlan = !!(veth->h_vlan_proto == htons(ETH_P_8021Q));
+
+       *is_vlan_proto = is_vlan;
+
+       if (login->all_vlan_gw)
+               return 0;
+
+       if (VNIC_VLAN_ENABLED(login) && login->vid && !is_vlan) {
+               vnic_dbg_data(login->name, "missing vlan tag\n");
+               VNIC_STATS_INC(login->port_stats.vlan_err);
+               return 1;
+       }
+
+       if (is_vlan && unlikely(!vnic_vlan_is_valid(login, veth))) {
+               vnic_dbg_data(login->name, "invalid vlan tag\n");
+               VNIC_STATS_INC(login->port_stats.vlan_err);
+               return 1;
+       }
+
+       return 0;
+}
+
+static void vnic_ib_handle_rx_wc_linear(struct vnic_login *login,
+                                       struct ib_wc *wc, int rx_ring_index)
+{
+       struct vnic_rx_ring *ring = login->port->rx_ring[rx_ring_index];
+       struct eoibhdr *eoib_hdr;
+       struct sk_buff *skb;
+       struct vlan_ethhdr *veth;
+       int rc, wr_id = wc->wr_id, checksum_ok, ip_summed,
+           buf_size = VNIC_BUF_SIZE(ring->port);
+       int is_vlan_proto;
+       u64 mapping;
+       u16 eth_type;
+       u8 *va, *eth_hdr;
+
+       spin_lock_bh(&ring->lock);
+       ASSERT(wr_id < ring->size);
+
+       skb = ring->rx_info[wr_id].skb;
+       mapping = ring->rx_info[wr_id].dma_addr[0];
+
+       /* termination with error */
+       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               if(wc->status != IB_WC_REM_ABORT_ERR &&
+                  wc->status != IB_WC_LOC_LEN_ERR) {
+                       vnic_dbg_data(login->name, "RX CQE error "
+                                     "(status %d, vend_err 0x%x), "
+                                     "queueing rings restart\n",
+                                     wc->status, wc->vendor_err);
+                       if (!login->queue_stopped)
+                               queue_delayed_work(login_wq,
+                                                  &login->restart_task,
+                                                  HZ / 10);
+               }
+               goto repost;
+       }
+
+       ASSERT(skb);
+       ASSERT(mapping);
+
+       /* If we can't allocate a new RX buffer, dump
+        * this packet and reuse the old buffer.
+        */
+       if (unlikely(!vnic_alloc_rx_skb(ring, wr_id, GFP_ATOMIC))) {
+               VNIC_STATS_DO_INC(login->stats.rx_dropped);
+               goto repost;
+       }
+
+       ib_dma_unmap_single(login->port->dev->ca, mapping,
+                           buf_size, DMA_FROM_DEVICE);
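+       /* wc->byte_len includes the GRH space that prefixes every UD
+        * receive buffer; strip it before parsing the EoIB header
+        */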
+       skb_put(skb, wc->byte_len);
+       skb_pull(skb, IB_GRH_BYTES);
+
+       /* check EoIB header signature and version */
+       va = skb->data;
+       eoib_hdr = (struct eoibhdr *)va;
+       if (unlikely(VNIC_EOIB_HDR_GET_SIG(eoib_hdr) != VNIC_EOIB_HDR_SIG ||
+                    VNIC_EOIB_HDR_GET_VER(eoib_hdr) != VNIC_EOIB_HDR_VER)) {
+               vnic_dbg_data(login->name, "bad sig (0x%x) or ver (0x%x)\n",
+                             VNIC_EOIB_HDR_GET_SIG(eoib_hdr),
+                             VNIC_EOIB_HDR_GET_VER(eoib_hdr));
+               VNIC_STATS_INC(login->port_stats.sig_ver_err);
+               goto repost;
+       }
+
+       /* check EoIB CSUM */
+       checksum_ok = login->rx_csum && VNIC_CSUM_OK(eoib_hdr);
+       ip_summed = checksum_ok ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+       if (likely((checksum_ok)))
+               VNIC_STATS_INC(login->port_stats.rx_chksum_good);
+       else
+               VNIC_STATS_INC(login->port_stats.rx_chksum_none);
+
+       /* Ethernet header */
+       skb_pull(skb, VNIC_ENCAP_LEN);
+       va += VNIC_ENCAP_LEN;
+       veth = (struct vlan_ethhdr *)(va);
+
+       eth_hdr = va;
+       eth_type = be16_to_cpu(((struct ethhdr *)(va))->h_proto);
+
+       /* validate VLAN tag, strip it if valid */
+       if (validate_vnic_vlan(login, veth, &is_vlan_proto))
+               goto repost;
+
+       /* for all_vlan_gw - we don't strip the packet but send it as is */
+       if (!login->all_vlan_gw && is_vlan_proto) {
+               eth_type = be16_to_cpu(veth->h_vlan_encapsulated_proto);
+               eth_hdr += VLAN_HLEN;
+               skb_pull(skb, VLAN_HLEN);
+               memmove(eth_hdr, va, ETH_ALEN * 2);
+       }
+
+       /* update skb fields, keep this before LRO/GRO funcs */
+       skb->dev = login->dev;
+       skb->protocol = eth_type_trans(skb, skb->dev);
+       skb->ip_summed = ip_summed;
+
+#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)
+       if ((login->dev->features & NETIF_F_GRO) && checksum_ok) {
+               struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index];
+               int ret;
+
+               ret = napi_gro_receive(&rx_res->napi, skb);
+               if (ret == GRO_HELD)
+                       VNIC_STATS_INC(login->port_stats.gro_held);
+               else if (ret == GRO_NORMAL)
+                       VNIC_STATS_INC(login->port_stats.gro_normal);
+               else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE)
+                       VNIC_STATS_INC(login->port_stats.gro_merged);
+               else
+                       VNIC_STATS_INC(login->port_stats.gro_drop);
+
+               goto rx_repost;
+       }
+#elif defined(NETIF_F_LRO)
+       if (login->dev->features & NETIF_F_LRO && checksum_ok) {
+               struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index];
+
+               /* processed for LRO */
+               lro_receive_skb(&rx_res->lro, skb, NULL);
+               VNIC_STATS_INC(login->port_stats.lro_aggregated);
+
+               goto rx_repost;
+       }
+#endif
+
+       rc = vnic_rx(login, skb, wc);
+       if (unlikely(rc)) {
+               vnic_dbg_data(login->name, "vnic_rx failed, rc %d\n", rc);
+               goto repost;
+       }
+
+rx_repost:
+       VNIC_STATS_INC(ring->stats.rx_packets);
+       VNIC_STATS_ADD(ring->stats.rx_bytes, wc->byte_len);
+
+       VNIC_STATS_DO_INC(login->stats.rx_packets);
+       VNIC_STATS_DO_ADD(login->stats.rx_bytes, wc->byte_len);
+
+       if (unlikely(vnic_post_recv(ring, wr_id)))
+               vnic_dbg_data(login->name, "failed to post RX WQE id %d\n",
+                             (int)wr_id);
+       spin_unlock_bh(&ring->lock);
+
+       return;
+
+repost:
+       login->dev->last_rx = jiffies;
+       if (unlikely(vnic_post_recv(ring, wr_id)))
+               vnic_dbg_data(login->name, "failed to post RX WQE id %d\n",
+                             (int)wr_id);
+
+       VNIC_STATS_INC(ring->stats.rx_dropped);
+       VNIC_STATS_DO_INC(login->stats.rx_dropped);
+       spin_unlock_bh(&ring->lock);
+
+       return;
+}
+
+static void vnic_ib_handle_rx_wc(struct vnic_login *login,
+                                struct ib_wc *wc, int rx_ring_index)
+{
+       struct vnic_rx_ring *ring = login->port->rx_ring[rx_ring_index];
+       struct ib_device *ib_device = login->port->dev->ca;
+       struct vnic_frag_data *frags_entry;
+       struct skb_frag_struct frags[VNIC_MAX_RX_FRAGS] = {};
+       struct eoibhdr *eoib_hdr;
+       struct vlan_ethhdr *veth;
+       struct iphdr *ip_hdr;
+       u64 wr_id = wc->wr_id;
+       u16 eth_type;
+       u8 *va, *eth_hdr, ip_type;
+       int rc, checksum_ok, ip_offset = ETH_HLEN,
+               packet_length = wc->byte_len - VNIC_EOIB_HDR_SIZE,
+               page_offset = VNIC_EOIB_HDR_SIZE, ip_summed;
+       int is_vlan_proto;
+
+       spin_lock_bh(&ring->lock);
+       ASSERT(wr_id < ring->size);
+
+       /* termination with error */
+       if (unlikely(wc->status != IB_WC_SUCCESS)) {
+               if(wc->status != IB_WC_REM_ABORT_ERR &&
+                  wc->status != IB_WC_LOC_LEN_ERR) {
+                       vnic_dbg_data(login->name, "RX CQE error "
+                                     "(status %d, vend_err 0x%x), "
+                                     "queueing rings restart\n",
+                                     wc->status, wc->vendor_err);
+                       if (!login->queue_stopped)
+                               queue_delayed_work(login_wq, &login->restart_task, HZ / 10);
+                       goto out;
+               }
+               goto drop_repost;
+       }
+
+       frags_entry = &ring->rx_info[wr_id];
+
+       /* ensure cache coherency for packet headers and get vq */
+       ib_dma_sync_single_for_cpu(ib_device,
+                                  ring->rx_info[wr_id].dma_addr[0] + IB_GRH_BYTES,
+                                  MAX_HEADER_SIZE, DMA_FROM_DEVICE);
+
+       va = page_address(ring->rx_info[wr_id].frags[0].page.p) +
+               ring->rx_info[wr_id].frags[0].page_offset + IB_GRH_BYTES;
+
+       /* check EoIB header signature and version */
+       eoib_hdr = (struct eoibhdr *)va;
+       if (unlikely(VNIC_EOIB_HDR_GET_SIG(eoib_hdr) != VNIC_EOIB_HDR_SIG ||
+                    VNIC_EOIB_HDR_GET_VER(eoib_hdr) != VNIC_EOIB_HDR_VER)) {
+               vnic_dbg_data(login->name, "bad sig (0x%x) or ver (0x%x)\n",
+                             VNIC_EOIB_HDR_GET_SIG(eoib_hdr),
+                             VNIC_EOIB_HDR_GET_VER(eoib_hdr));
+               VNIC_STATS_INC(login->port_stats.sig_ver_err);
+               goto unmap_repost;
+       }
+
+       /* check EoIB CSUM */
+       checksum_ok = login->rx_csum && VNIC_CSUM_OK(eoib_hdr);
+       ip_summed = checksum_ok ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
+       if (likely((checksum_ok)))
+               VNIC_STATS_INC(login->port_stats.rx_chksum_good);
+       else
+               VNIC_STATS_INC(login->port_stats.rx_chksum_none);
+
+       /* Ethernet header */
+       va += VNIC_ENCAP_LEN;
+       veth = (struct vlan_ethhdr *)(va);
+
+       eth_hdr = va;
+       eth_type = be16_to_cpu(((struct ethhdr *)(va))->h_proto);
+
+       /* validate VLAN tag, strip it if valid
+        * - if VID is set and !0, then VLAN tag must exist
+        *   note: VID zero can accept untagged packets
+        * - if ingress VID exists: validate it, and update the packet
+        *   note: rx user prio is ignored
+        * - else; it's valid untagged packet
+        */
+       if (validate_vnic_vlan(login, veth, &is_vlan_proto))
+               goto unmap_repost;
+
+       /* for all_vlan_gw - we don't strip the packet but send it as is */
+       if (!login->all_vlan_gw && is_vlan_proto) {
+               ip_offset += VLAN_HLEN;
+               page_offset += VLAN_HLEN;
+               packet_length -= VLAN_HLEN;
+               eth_hdr += VLAN_HLEN;
+               eth_type = be16_to_cpu(veth->h_vlan_encapsulated_proto);
+               memmove(eth_hdr, va, ETH_ALEN * 2);
+       }
+
+       /* IP header */
+       va += ip_offset;
+       ip_hdr = (struct iphdr *)va;
+       ip_type = ip_hdr->protocol;
+
+       ib_dma_sync_single_for_device(ib_device,
+                                     frags_entry->dma_addr[0] + IB_GRH_BYTES,
+                                     MAX_HEADER_SIZE, DMA_FROM_DEVICE);
+
+#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)
+       if ((login->dev->features & NETIF_F_GRO) && checksum_ok) {
+               struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index];
+               struct sk_buff *gro_skb;
+               struct skb_frag_struct *gro_frags;
+               int nr_frags, ret;
+
+               gro_skb = napi_get_frags(&rx_res->napi);
+               if (!gro_skb)
+                       goto drop_repost;
+
+               gro_frags = skb_shinfo(gro_skb)->frags;
+               nr_frags = vnic_unmap_and_replace_rx(ring, ib_device,
+                                                    gro_frags, wr_id,
+                                                    wc->byte_len);
+               if (unlikely(!nr_frags))
+                       goto drop_repost;
+
+               /* disregard GRH and eoib headers */
+               gro_frags[0].page_offset += page_offset;
+               gro_frags[0].size -= page_offset;
+
+               skb_shinfo(gro_skb)->nr_frags = nr_frags;
+               gro_skb->len = packet_length;
+               gro_skb->data_len = packet_length;
+               gro_skb->truesize += packet_length;
+               gro_skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+               /* processed for GRO */
+               skb_record_rx_queue(gro_skb, rx_res->index);
+               ret = napi_gro_frags(&rx_res->napi);
+               if (ret == GRO_HELD)
+                       VNIC_STATS_INC(login->port_stats.gro_held);
+               else if (ret == GRO_NORMAL)
+                       VNIC_STATS_INC(login->port_stats.gro_normal);
+               else if (ret == GRO_MERGED || ret == GRO_MERGED_FREE)
+                       VNIC_STATS_INC(login->port_stats.gro_merged);
+               else
+                       VNIC_STATS_INC(login->port_stats.gro_drop);
+
+               goto rx_repost;
+       }
+#elif defined(NETIF_F_LRO)
+       if (login->dev->features & NETIF_F_LRO && checksum_ok &&
+           eth_type == ETH_P_IP && ip_type == IPPROTO_TCP) {
+               struct vnic_rx_res *rx_res = &login->rx_res[rx_ring_index];
+               int nr_frags;
+
+               /* unmap the needed fragments and reallocate them; fragments
+                * that were not used will be reused as is. */
+               nr_frags = vnic_unmap_and_replace_rx(ring, ib_device, frags,
+                                                    wr_id, wc->byte_len);
+               if (unlikely(!nr_frags))
+                       goto drop_repost;
+
+               /* disregard GRH and eoib headers */
+               frags[0].page_offset += page_offset;
+               frags[0].size -= page_offset;
+
+               /* processed for LRO */
+#if defined(CONFIG_COMPAT_LRO_ENABLED)
+               lro_receive_frags(&rx_res->lro, frags, packet_length,
+                                 packet_length, NULL, 0);
+#endif
+               VNIC_STATS_INC(login->port_stats.lro_aggregated);
+
+               goto rx_repost;
+       }
+#endif
+
+       rc = vnic_rx_skb(login, ring, wc, ip_summed, eth_hdr);
+       if (unlikely(rc)) {
+               vnic_dbg_data(login->name, "vnic_rx_skb failed, rc %d\n", rc);
+               goto drop_repost;
+       }
+
+rx_repost:
+       /* must hold lock when touching login->stats so the stats
+        * task won't read invalid values
+        */
+       spin_lock(&login->stats_lock);
+       VNIC_STATS_INC(ring->stats.rx_packets);
+       VNIC_STATS_ADD(ring->stats.rx_bytes, packet_length);
+
+       VNIC_STATS_DO_INC(login->stats.rx_packets);
+       VNIC_STATS_DO_ADD(login->stats.rx_bytes, packet_length);
+       spin_unlock(&login->stats_lock);
+
+       login->dev->last_rx = jiffies;
+       if (vnic_post_recv(ring, wr_id))
+               vnic_dbg_data(login->name, "vnic_post_recv failed, "
+                             "wr_id %llu\n", wr_id);
+       spin_unlock_bh(&ring->lock);
+
+       return;
+
+unmap_repost:
+       /* ignore rc of vnic_unmap_and_replace_rx() */
+       vnic_unmap_and_replace_rx(ring, ib_device, frags,
+                                 wr_id, wc->byte_len);
+drop_repost:
+       VNIC_STATS_INC(ring->stats.rx_dropped);
+
+       spin_lock(&login->stats_lock);
+       VNIC_STATS_DO_INC(login->stats.rx_dropped);
+       spin_unlock(&login->stats_lock);
+
+       if (vnic_post_recv(ring, wr_id))
+               vnic_dbg_data(login->name, "vnic_post_recv failed, "
+                             "wr_id %llu\n", wr_id);
+out:
+       spin_unlock_bh(&ring->lock);
+       return;
+}
+
+static inline void vnic_drain_tx_cq(struct vnic_login *login,
+                                   int tx_res_index)
+{
+       struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+       int n, i;
+
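+       /* poll in batches; a batch shorter than VNIC_MAX_TX_CQE means the
+        * CQ has been drained
+        */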
+       do {
+               n = ib_poll_cq(tx_res->cq, VNIC_MAX_TX_CQE, tx_res->send_wc);
+               for (i = 0; i < n; ++i)
+                       vnic_ib_handle_tx_wc(login, tx_res_index,
+                                            tx_res->send_wc + i);
+       } while (n == VNIC_MAX_TX_CQE);
+}
+
+static void vnic_drain_arm_tx_cq(struct vnic_login *login, int tx_res_index)
+{
+       struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+
+       ASSERT(login);
+       ASSERT(login->dev);
+
+       /* drain the CQ, then (conditionally) arm it */
+       vnic_drain_tx_cq(login, tx_res_index);
+
+       /* in tx interrupt mode, arm TX CQ after every interrupt */
+       if (!vnic_tx_polling && ib_req_notify_cq(tx_res->cq, IB_CQ_NEXT_COMP))
+               vnic_dbg(login->name, "ib_req_notify_cq failed\n");
+       else if (unlikely(VNIC_TXQ_STOPPED(tx_res) &&
+                    test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state))) {
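+               /* wake a stopped TX queue only once at least half of the
+                * ring has been reclaimed
+                */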
+               if ((tx_res->tx_outstanding <= vnic_tx_rings_len >> 1)) {
+                       if (!test_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state)) {
+                               VNIC_STATS_DO_INC(login->port_stats.wake_queue);
+                               VNIC_TXQ_WAKE(tx_res);
+                       }
+               /* make sure that after arming the cq, there is no access to
+                * login fields to avoid conflict with cq event handler.
+                * i.e., ib_req_notify_cq() must come at the end of this func
+                */
+               } else if (ib_req_notify_cq(tx_res->cq, IB_CQ_NEXT_COMP)) {
+                       vnic_dbg(login->name, "ib_req_notify_cq failed\n");
+                       /* TODO: have to reset the device here */
+               }
+       }
+}
+
+static inline void vnic_comp_handler_tx(struct ib_cq *cq, void *ctx)
+{
+       struct vnic_tx_res *tx_res = ctx;
+
+       if (!vnic_tx_polling) {
+               spin_lock(&tx_res->lock);
+               vnic_drain_arm_tx_cq(tx_res->login, tx_res->index);
+               spin_unlock(&tx_res->lock);
+       } else
+               vnic_drain_arm_tx_cq(tx_res->login, tx_res->index);
+}
+
+static int vnic_drain_rx_cq(struct vnic_login *login, int max_poll,
+                           int rx_res_index)
+{
+       struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index];
+       int polled, i;
+
+       ASSERT(max_poll <= vnic_napi_weight);
+       polled = ib_poll_cq(rx_res->cq, max_poll, rx_res->recv_wc);
+
+       for (i = 0; vnic_rx_linear && i < polled; ++i)
+               vnic_ib_handle_rx_wc_linear(login, &rx_res->recv_wc[i],
+                                           rx_res_index);
+
+       for (i = 0; !vnic_rx_linear && i < polled; ++i)
+               vnic_ib_handle_rx_wc(login, &rx_res->recv_wc[i],
+                                    rx_res_index);
+
+#ifdef NETIF_F_LRO
+       /* Done CQ handling: flush all LRO sessions unconditionally */
+       if (login->dev->features & NETIF_F_LRO) {
+               VNIC_STATS_INC(login->port_stats.lro_flushed);
+               lro_flush_all(&rx_res->lro);
+       }
+#endif
+
+       return polled;
+}
+
+/* RX CQ polling - called by NAPI */
+#ifndef _BP_NAPI_POLL
+int vnic_poll_cq_rx(struct napi_struct *napi, int budget)
+{
+       struct vnic_rx_res *rx_res = container_of(napi, struct vnic_rx_res, napi);
+       struct vnic_login *login = rx_res->login;
+       struct ib_cq *cq_rx = rx_res->cq;
+       int rx_res_index = rx_res->index, polled;
+
+       /* shouldn't happen, since when stopped=1 NAPI is disabled */
+       if (unlikely(rx_res->stopped)) {
+#ifndef _BP_NAPI_NETIFRX
+               napi_complete(napi);
+#else
+               netif_rx_complete(login->dev, napi);
+#endif
+               return 0;
+       }
+
+       polled = vnic_drain_rx_cq(login, min(budget, VNIC_MAX_RX_CQE), rx_res_index);
+       vnic_dbg_data(login->name, "after vnic_drain_rx_cq budget %d,"
+                     " done %d, index %d\n", budget, polled, rx_res_index);
+
+       /* If we used up all the quota - we're probably not done yet... */
+       ASSERT(polled <= budget);
+       if (polled < budget) {
+               /* ATTENTION: ARM CQ must come after napi_complete() */
+#ifndef _BP_NAPI_NETIFRX
+               napi_complete(napi);
+#else
+               netif_rx_complete(login->dev, napi);
+#endif
+               /* Eventually calls vnic_comp_handler_rx() */
+               if (ib_req_notify_cq(cq_rx, IB_CQ_NEXT_COMP))
+                       vnic_err(login->name, "ib_req_notify_cq failed\n");
+       }
+
+       return polled;
+}
+#else
+int vnic_poll_cq_rx(struct net_device *poll_dev, int *budget)
+{
+       struct vnic_rx_res *rx_res = poll_dev->priv;
+       struct vnic_login *login = rx_res->login;
+       struct ib_cq *cq_rx = rx_res->cq;
+       int rx_res_index = rx_res->index, polled, max_poll = min(*budget, poll_dev->quota);
+
+       /* shouldn't happen, since when stopped=1 NAPI is disabled */
+       if (unlikely(rx_res->stopped)) {
+               netif_rx_complete(poll_dev);
+               return 0;
+       }
+
+       while (max_poll >= 0) {
+               polled = vnic_drain_rx_cq(login, min(max_poll, VNIC_MAX_RX_CQE), rx_res_index);
+               if (polled <= 0)
+                       break;
+               else {
+                       poll_dev->quota -= polled;
+                       *budget -= polled;
+               }
+               max_poll -= polled;
+       }
+
+       if (!max_poll)
+               return 1;
+
+       netif_rx_complete(poll_dev);
+       ib_req_notify_cq(cq_rx, IB_CQ_NEXT_COMP);
+
+       return 0;
+}
+#endif
+
+static void vnic_comp_handler_rx(struct ib_cq *cq, void *rx_res_ptr)
+{
+       struct vnic_rx_res *rx_res = rx_res_ptr;
+       struct vnic_login *login = rx_res->login;
+
+       ASSERT(rx_res->cq == cq);
+       ASSERT(login->dev);
+
+       /* if this happens, we will re-arm later in vnic_open */
+       if (unlikely(rx_res->stopped))
+               return;
+
+#ifndef _BP_NAPI_POLL
+       /* calls vnic_poll_cq_rx() */
+#ifndef _BP_NAPI_NETIFRX
+       napi_schedule(&rx_res->napi);
+#else
+       netif_rx_schedule(login->dev, &rx_res->napi);
+#endif
+#else
+       netif_rx_schedule(rx_res->poll_dev);
+#endif /* _BP_NAPI_POLL */
+}
+
+static void vnic_stop_qp(struct vnic_login *login, int qp_index)
+{
+       struct ib_qp_attr qp_attr = { .qp_state = IB_QPS_ERR };
+       struct vnic_qp_res *qp_res = &login->qp_res[qp_index];
+       struct vnic_rx_res *rx_res = &login->rx_res[qp_res->rx_index];
+       struct vnic_tx_res *tx_res = &login->tx_res[qp_res->tx_index];
+       struct vnic_rx_ring *ring = login->port->rx_ring[rx_res->index];
+       unsigned long flags;
+       int polled, attr_mask, rc, i;
+
+       /* move QP to ERR, wait for last WQE async event to drain the SRQ */
+       rc = ib_modify_qp(qp_res->qp, &qp_attr, IB_QP_STATE);
+       if (rc) {
+               /* calls vnic_qp_event_handler() */
+               vnic_warn(login->name, "failed to modify QP 0x%x to ERR state"
+                         " (err = %d)\n", qp_res->qp->qp_num, rc);
+               /* continue anyway, but don't wait for completion */
+       } else {
+               wait_for_completion(&qp_res->last_wqe_complete);
+       }
+
+       /* === at this point, no NAPI/RX comps === */
+
+       /* drain TX CQ before moving to RESET, must hold tx_res->lock to
+        * protect from vnic_comp_handler_tx() after this call, all CQEs
+        * are polled (either by this direct call, or by CQ handlers)
+        */
+       spin_lock_irqsave(&tx_res->lock, flags);
+       vnic_drain_tx_cq(login, tx_res->index);
+       spin_unlock_irqrestore(&tx_res->lock, flags);
+
+       /* drain RX CQ before moving to RESET drop and re-post all comps */
+       spin_lock_bh(&ring->lock);
+       do {
+               polled = ib_poll_cq(rx_res->cq, VNIC_MAX_RX_CQE, rx_res->recv_wc);
+               for (i = 0; i < polled; ++i)
+                       if (vnic_post_recv(ring, rx_res->recv_wc[i].wr_id))
+                               vnic_dbg_data(login->name, "vnic_post_recv failed, "
+                                             "wr_id %llu\n", rx_res->recv_wc[i].wr_id);
+       } while (polled == VNIC_MAX_RX_CQE);
+       spin_unlock_bh(&ring->lock);
+
+       /* move QP to RESET */
+       qp_attr.qp_state = IB_QPS_RESET;
+       rc = ib_modify_qp(qp_res->qp, &qp_attr, IB_QP_STATE);
+       if (rc)
+               vnic_warn(login->name, "failed to modify QP 0x%x to RESET"
+                         " state (err = %d)\n", qp_res->qp->qp_num, rc);
+
+       /* move QP to INIT to avoid multicast qp cache misses */
+       qp_attr.qp_state = IB_QPS_INIT;
+       qp_attr.qkey = login->qkey;
+       qp_attr.port_num = login->port->num;
+       qp_attr.pkey_index = login->pkey_index;
+       attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE;
+
+       rc = ib_modify_qp(qp_res->qp, &qp_attr, attr_mask);
+       if (rc)
+               vnic_warn(login->name, "failed to modify QP 0x%x to INIT state"
+                         " (err = %d)\n", qp_res->qp->qp_num, rc);
+}
+
+int vnic_ib_stop(struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+       struct vnic_tx_res *tx_res;
+       unsigned long begin = jiffies;
+       int wr_id, i;
+
+       /* flush tx and rx comps */
+       for (i = 0; i < login->qps_num; ++i)
+               vnic_stop_qp(login, i);
+
+       /* check any pending tx comps */
+       for (i = 0; i < login->tx_rings_num; i++) {
+               tx_res = &login->tx_res[i];
+               /* if tx_outstanding is non-zero, give it a chance to complete */
+               if (!tx_res->tx_outstanding)
+                       continue;
+               msleep(10);
+
+               /* else, drain the tx cq. This indicates that something is
+                * wrong, thus we won't protect vnic_comp_handler_tx() here
+                */
+               while (tx_res->tx_outstanding &&
+                      time_before(jiffies, begin + 5 * HZ)) {
+                       vnic_drain_tx_cq(login, i);
+                       msleep(1);
+               }
+
+               /* if they're still not complete, force skb deallocation */
+               if (!tx_res->tx_outstanding)
+                       continue;
+               vnic_warn(login->name, "timing out: %d sends not completed\n",
+                         tx_res->tx_outstanding);
+               while (tx_res->tx_outstanding) {
+                       wr_id = tx_res->tx_tail & (vnic_tx_rings_len - 1);
+                       vnic_dealloc_tx_skb(login, i, wr_id);
+                       ++tx_res->tx_tail;
+                       --tx_res->tx_outstanding;
+               }
+       }
+
+       return 0;
+}
+
+int vnic_ib_open(struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+       int i;
+
+       /* move QP to RTS and attach to bcast group */
+       for (i = 0; i < login->qps_num; ++i) {
+               if (vnic_init_qp(login, i)) {
+                       vnic_err(login->name, "vnic_init_qp failed\n");
+                       goto stop_qps;
+               }
+       }
+
+       return 0;
+
+stop_qps:
+       for (--i ; i >= 0; --i)
+               vnic_stop_qp(login, i);
+
+       return -EINVAL;
+}
+
+void vnic_destroy_qp(struct vnic_login *login, int qp_index)
+{
+       struct ib_qp *qp = login->qp_res[qp_index].qp;
+
+       if (!qp)
+               return;
+       if (ib_destroy_qp(qp))
+               vnic_warn(login->name, "ib_destroy_qp failed\n");
+       return;
+}
+
+void vnic_qp_to_reset(struct vnic_login *login, struct ib_qp *qp)
+{
+       struct ib_qp_attr qp_attr;
+       int rc;
+
+       qp_attr.qp_state = IB_QPS_RESET;
+       rc = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+       if (rc)
+               vnic_err(login->name, "ib_modify_qp 0x%06x to RESET err %d\n",
+                        qp->qp_num, rc);
+}
+
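+/* move a QP to INIT with the given qkey and the login's port/pkey index;
+ * roll the QP back to RESET on failure
+ */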
+int vnic_qp_to_init(struct vnic_login *login, struct ib_qp *qp, u32 qkey)
+{
+       struct ib_qp_attr qp_attr;
+       int attr_mask, rc;
+
+       /* move QP to INIT */
+       qp_attr.qp_state = IB_QPS_INIT;
+       qp_attr.qkey = qkey;
+       qp_attr.port_num = login->port->num;
+       /* pkey will be overwritten later by login->pkey_index */
+       qp_attr.pkey_index = login->port->pkey_index;
+       attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE;
+
+       rc = ib_modify_qp(qp, &qp_attr, attr_mask);
+       if (rc) {
+               vnic_err(login->name, "ib_modify_qp 0x%06x to INIT err %d\n",
+                        qp->qp_num, rc);
+               goto out_qp_reset;
+       }
+
+       return 0;
+
+out_qp_reset:
+       vnic_qp_to_reset(login, qp);
+       return rc;
+}
+
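+/* vnic_init_qp --
+ * walk the UD QP through the standard INIT -> RTR -> RTS sequence so it
+ * can start posting sends; any failure rolls the QP back to RESET
+ */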
+int vnic_init_qp(struct vnic_login *login, int qp_index)
+{
+       struct ib_qp_attr qp_attr;
+       int attr_mask, rc, rc1;
+       struct ib_qp *qp = login->qp_res[qp_index].qp;
+
+       init_completion(&login->qp_res[qp_index].last_wqe_complete);
+       /* move QP to INIT */
+       qp_attr.qp_state = IB_QPS_INIT;
+       qp_attr.qkey = login->qkey;
+       qp_attr.port_num = login->port->num;
+       qp_attr.pkey_index = login->pkey_index;
+       attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE;
+
+       rc = ib_modify_qp(qp, &qp_attr, attr_mask);
+       if (rc) {
+               vnic_err(login->name, "ib_modify_qp to INIT err %d\n", rc);
+               goto out_qp_reset;
+       }
+
+       /* move QP to RTR */
+       qp_attr.qp_state = IB_QPS_RTR;
+       attr_mask &= ~IB_QP_PORT;
+       rc = ib_modify_qp(qp, &qp_attr, attr_mask);
+       if (rc) {
+               vnic_err(login->name, "ib_modify_qp to RTR err %d\n", rc);
+               goto out_qp_reset;
+       }
+
+       /* move QP to RTS */
+       qp_attr.qp_state = IB_QPS_RTS;
+       qp_attr.sq_psn = 0;
+       attr_mask |= IB_QP_SQ_PSN;
+       attr_mask &= ~IB_QP_PKEY_INDEX;
+       rc = ib_modify_qp(qp, &qp_attr, attr_mask);
+       if (rc) {
+               vnic_err(login->name, "ib_modify_qp to RTS err, rc %d\n", rc);
+               goto out_qp_reset;
+       }
+
+       /* What a Good QP! */
+       vnic_dbg_data(login->name, "qpn 0x%06x moved to RTS\n",
+                     qp->qp_num);
+
+       return 0;
+
+out_qp_reset:
+       qp_attr.qp_state = IB_QPS_RESET;
+       rc1 = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+       if (rc1)
+               vnic_err(login->name, "ib_modify_qp to RESET err %d\n", rc1);
+
+       return rc;
+}
+
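+/* QP async event handler: completes last_wqe_complete on
+ * IB_EVENT_QP_LAST_WQE_REACHED, which the QP stop path waits for before
+ * draining the RX completions
+ */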
+static void vnic_qp_event_handler(struct ib_event *event, void *ctx)
+{
+       struct vnic_qp_res *qp_res = ctx;
+       struct vnic_login *login = qp_res->login;
+
+       ASSERT(login);
+       vnic_dbg_data(login->name, "[%s] qpn %d got event %d\n",
+                     event->device->name, event->element.qp->qp_num,
+                     event->event);
+       if (event->event == IB_EVENT_QP_LAST_WQE_REACHED)
+               complete(&qp_res->last_wqe_complete);
+}
+
+void vnic_destroy_rx_res(struct vnic_login *login, int rx_res_index)
+{
+       struct ib_cq *cq = login->rx_res[rx_res_index].cq;
+       int rc = 0;
+
+       if (cq)
+               rc = ib_destroy_cq(cq);
+       if (rc)
+               vnic_warn(login->name, "ib_destroy_cq() index %d failed\n",
+                         rx_res_index);
+}
+
+void vnic_destroy_tx_res(struct vnic_login *login, int tx_res_index)
+{
+       struct ib_cq *cq = login->tx_res[tx_res_index].cq;
+       struct vnic_tx_buf *tx_ring = login->tx_res[tx_res_index].tx_ring;
+       int rc = 0;
+
+       if (tx_ring)
+               vfree(tx_ring);
+       if (cq)
+               rc = ib_destroy_cq(cq);
+       if (rc)
+               vnic_warn(login->name, "ib_destroy_cq() index %d failed\n",
+                         tx_res_index);
+}
+
+#if 0
+static inline int get_comp_vector(int index, struct vnic_port *port)
+{
+       int vector;
+       int num_cpus = roundup_pow_of_two(num_online_cpus());
+       int port_for_eq;
+
+       port_for_eq = (((index / port->dev->mdev->eq_per_port) %
+                       port->dev->mdev->dev->caps.num_ports) + 1);
+       vector = (index % port->dev->mdev->eq_per_port) +
+                (port_for_eq * num_cpus);
+
+       return vector;
+}
+#endif
+
+int vnic_create_rx_res(struct vnic_login *login, int rx_res_index)
+{
+       struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index];
+       int comp_vector = rx_res_index % login->port->dev->ca->num_comp_vectors;
+       struct ib_cq *cq =
+               ib_create_cq(login->port->dev->ca,
+                            vnic_comp_handler_rx,
+                            NULL, &login->rx_res[rx_res_index],
+                            vnic_rx_rings_len, comp_vector);
+       if (IS_ERR(cq)) {
+               vnic_err(login->name, "ib_create_cq failed, index %d, "
+                        "comp_vector %d, rc %d\n",
+                        rx_res_index, comp_vector, (int)PTR_ERR(cq));
+               return -EINVAL;
+       }
+
+       rx_res->cq = cq;
+       rx_res->index = rx_res_index;
+       rx_res->login = login;
+
+       return 0;
+}
+
+int vnic_create_tx_res(struct vnic_login *login, int tx_res_index)
+{
+       struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+       struct ib_cq *cq;
+       struct vnic_tx_buf *tx_ring;
+       int i, comp_vector;
+
+       tx_ring = vmalloc(vnic_tx_rings_len * sizeof *tx_res->tx_ring);
+       if (!tx_ring) {
+               vnic_err(login->name, "vmalloc failed to allocate %u * %lu\n",
+                        vnic_tx_rings_len,
+                        (long unsigned int) (sizeof *tx_res->tx_ring));
+               return -ENOMEM;
+       }
+       memset(tx_ring, 0, vnic_tx_rings_len * sizeof *tx_res->tx_ring);
+
+       /* set the TX work request and SGE templates */
+       tx_res->tx_wr.sg_list = tx_res->tx_sge;
+       tx_res->tx_wr.send_flags = IB_SEND_SIGNALED;
+       tx_res->tx_wr.wr.ud.remote_qkey = login->qkey;
+
+       for (i = 0; i < VNIC_MAX_TX_FRAGS; ++i)
+               tx_res->tx_sge[i].lkey = login->port->mr->lkey;
+
+       /* set the mcast address vector template */
+       memset(&tx_res->mcast_av, 0, sizeof(struct ib_ah_attr));
+       tx_res->mcast_av.port_num = login->port->num;
+       tx_res->mcast_av.ah_flags = IB_AH_GRH;
+
+       /* create tx cq */
+       comp_vector = tx_res_index % login->port->dev->ca->num_comp_vectors;
+       cq = ib_create_cq(login->port->dev->ca,
+                         vnic_comp_handler_tx,
+                         NULL, &login->tx_res[tx_res_index],
+                         vnic_tx_rings_len, comp_vector);
+       if (IS_ERR(cq)) {
+               vnic_err(login->name, "ib_create_cq failed, index %d, "
+                        "comp_vector %d, rc %d\n",
+                        tx_res_index, comp_vector, (int)PTR_ERR(cq));
+               vfree(tx_ring);
+               return -EINVAL;
+       }
+
+       tx_res->tx_ring = tx_ring;
+       tx_res->cq = cq;
+       tx_res->index = tx_res_index;
+       tx_res->login = login;
+
+       return 0;
+}
+
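+/* vnic_create_qp_range --
+ * create login->qps_num UD QPs as a single range (the first QP number is
+ * later published as qp_base_num); each QP shares the port SRQ and is
+ * mapped to a TX/RX resource by index, then moved to INIT
+ */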
+int vnic_create_qp_range(struct vnic_login *login)
+{
+       int qp_index, create_flags = 0, rc;
+       struct ib_qp_init_attr *attr;
+       struct ib_qp *qps[VNIC_MAX_NUM_CPUS];
+       struct vnic_qp_res *qp_res;
+
+       attr = kzalloc(VNIC_MAX_NUM_CPUS * sizeof *attr, GFP_KERNEL);
+       if (!attr)
+               return -ENOMEM;
+
+       create_flags |= login->port->dev->attr.device_cap_flags &
+               IB_DEVICE_BLOCK_MULTICAST_LOOPBACK ?
+               IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK : 0;
+
+       /* TODO: rename IB_QP_CREATE_IPOIB_UD_LSO */
+       create_flags |= login->port->dev->attr.device_cap_flags &
+               IB_DEVICE_UD_TSO ?
+               IB_QP_CREATE_IPOIB_UD_LSO : 0;
+
+       for (qp_index = 0; qp_index < login->qps_num; ++qp_index) {
+               qp_res = &login->qp_res[qp_index];
+               qp_res->tx_index = qp_index % login->tx_rings_num;
+               qp_res->rx_index = qp_index % login->rx_rings_num;
+               memset(&attr[qp_index], 0, sizeof(struct ib_qp_init_attr));
+               attr[qp_index].cap.max_send_wr = vnic_tx_rings_len;
+               attr[qp_index].cap.max_send_sge = VNIC_MAX_TX_FRAGS;
+               attr[qp_index].cap.max_recv_wr = 0; /* we use SRQ */
+               attr[qp_index].cap.max_recv_sge = 0;
+               attr[qp_index].sq_sig_type = IB_SIGNAL_ALL_WR;
+               attr[qp_index].qp_type = IB_QPT_UD;
+               attr[qp_index].send_cq = login->tx_res[qp_res->tx_index].cq;
+               attr[qp_index].recv_cq = login->rx_res[qp_res->rx_index].cq;
+               attr[qp_index].srq = login->port->rx_ring[qp_res->rx_index]->srq;
+               attr[qp_index].event_handler = vnic_qp_event_handler;
+               attr[qp_index].qp_context = &login->qp_res[qp_index];
+               attr[qp_index].create_flags = create_flags;
+               attr[qp_index].cap.max_inline_data = vnic_inline_tshold;
+       }
+
+       rc = vnic_ib_create_qp_range(login->port->pd, attr, NULL,
+                                    login->qps_num, login->qps_num, qps);
+       if (rc) {
+               vnic_err(login->name, "vnic_ib_create_qp_range failed, rc %d\n", rc);
+               goto err;
+       }
+
+       for (qp_index = 0; qp_index < login->qps_num; ++qp_index) {
+               qp_res = &login->qp_res[qp_index];
+               qp_res->qp = qps[qp_index];
+               qp_res->login = login;
+       }
+
+       for (qp_index = 0; qp_index < login->qps_num; ++qp_index) {
+               rc = vnic_qp_to_init(login, qps[qp_index], login->qkey);
+               if (rc) {
+                       vnic_err(login->name, "vnic_qp_to_init failed, rc %d\n", rc);
+                       goto destroy_qps;
+               }
+       }
+
+       kfree(attr);
+       return 0;
+
+destroy_qps:
+       for (qp_index--; qp_index >= 0; qp_index--)
+               vnic_qp_to_reset(login, qps[qp_index]);
+
+       for (qp_index = 0; qp_index < login->qps_num; ++qp_index)
+               vnic_destroy_qp(login, qp_index);
+
+err:
+       kfree(attr);
+       return rc;
+}
+
+static inline int use_inline(struct sk_buff *skb)
+{
+       return skb->len <= vnic_inline_tshold && !skb_shinfo(skb)->nr_frags;
+}
+
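+/* vnic_post_send --
+ * build the UD send WR for an already mapped skb: optional out-of-band
+ * EoIB encap segment, inline or DMA-mapped linear part, page fragments,
+ * padding for runt packets, checksum-offload hints and LSO parameters,
+ * then post it on the QP associated with this TX ring
+ */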
+int vnic_post_send(struct vnic_login *login, int tx_res_index,
+                  u64 wr_id, struct ib_ah *ah, u32 dqpn)
+{
+       struct ib_send_wr *bad_wr;
+       struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+       struct vnic_qp_res *qp_res = &login->qp_res[tx_res_index % login->qps_num];
+       struct vnic_tx_buf *tx_req = &tx_res->tx_ring[wr_id];
+       skb_frag_t *frags = skb_shinfo(tx_req->skb)->frags;
+       int nr_frags = skb_shinfo(tx_req->skb)->nr_frags, i, off = 0;
+
+       ASSERT(qp_res);
+       ASSERT(tx_res);
+       ASSERT(qp_res->tx_index == tx_res->index);
+       ASSERT(qp_res->qp->send_cq == tx_res->cq);
+
+       if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) {
+               tx_res->tx_sge[off].addr = tx_req->mapping[off];
+               tx_res->tx_sge[off].length = VNIC_ENCAP_LEN;
+               off++;
+       }
+
+       if (likely(skb_headlen(tx_req->skb))) {
+               if (vnic_encap_headroom && use_inline(tx_req->skb)) {
+                       tx_res->tx_wr.send_flags |= IB_SEND_INLINE;
+                       wr_id |= VNIC_SEND_INLINE_FLAG;
+                       tx_res->tx_sge[off].addr = (unsigned long)tx_req->skb->data;
+               } else {
+                       tx_res->tx_wr.send_flags &= ~IB_SEND_INLINE;
+                       tx_res->tx_sge[off].addr = tx_req->mapping[off];
+               }
+               tx_res->tx_sge[off].length = skb_headlen(tx_req->skb);
+               off++;
+       }
+
+       for (i = 0; i < nr_frags; ++i) {
+               tx_res->tx_sge[i + off].addr = tx_req->mapping[i + off];
+               tx_res->tx_sge[i + off].length = frags[i].size;
+       }
+
+       /* handle runt packets using additional SG */
+       if (unlikely(tx_req->skb->len < login->zlen)) {
+               /* Note: always pad runt packets (both internal and
+                * external) for virtualization; some emulators drop runt
+                * packets, so we must avoid them even when the traffic
+                * does not pass through the bridge
+                */
+               vnic_dbg_data(login->name, "runt packet, skb %p len %d => %d\n",
+                             tx_req->skb, tx_req->skb->len, login->zlen);
+               /* if there are frags, the packet is longer than 60B */
+               if (use_inline(tx_req->skb))
+                       tx_res->tx_sge[i + off].addr = (u64)(unsigned long)login->pad_va;
+               else
+                       tx_res->tx_sge[i + off].addr = login->pad_dma;
+
+               tx_res->tx_sge[i + off].length = login->zlen - tx_req->skb->len;
+               ++nr_frags;
+               VNIC_STATS_INC(login->port_stats.runt_packets);
+       }
+
+       tx_res->tx_wr.num_sge = nr_frags + off;
+       tx_res->tx_wr.wr_id = wr_id;
+       tx_res->tx_wr.wr.ud.remote_qpn = dqpn;
+       tx_res->tx_wr.wr.ud.ah = ah;
+
+       /* check if we need to calc csum */
+       if (tx_req->skb->ip_summed == CHECKSUM_PARTIAL) {
+               u16 csum_pseudo;
+
+               /* calculate the pseudo-header csum without the length
+                * and put it in the transport header's checksum field;
+                * the HW will calculate the rest of it (SWP)
+                */
+               if (tx_req->ip_off)
+                       csum_pseudo = ~csum_tcpudp_magic(ip_hdr(tx_req->skb)->saddr,
+                                                         ip_hdr(tx_req->skb)->daddr,
+                                                         0, /* length */
+                                                         ip_hdr(tx_req->skb)->protocol,
+                                                         0);
+               else
+                       csum_pseudo = ~csum_ipv6_magic(&ipv6_hdr(tx_req->skb)->saddr,
+                                                       &ipv6_hdr(tx_req->skb)->daddr,
+                                                       0, /* length */
+                                                       ipv6_hdr(tx_req->skb)->nexthdr,
+                                                       0);
+
+               /* place the calculated csum in the checksum field in
+                * tcp/udp header
+                */
+               if (tx_req->tcp_off)
+                       tcp_hdr(tx_req->skb)->check = csum_pseudo;
+               else
+                       udp_hdr(tx_req->skb)->check = csum_pseudo;
+
+               /* set CSUM flag in ib_send_wr */
+               tx_res->tx_wr.send_flags |= IB_SEND_IP_CSUM;
+       } else {
+               /* csum already calculated in SW */
+               tx_res->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+       }
+
+       /* prepare TSO header */
+       if (skb_is_gso(tx_req->skb)) {
+               tx_res->tx_wr.wr.ud.mss = skb_shinfo(tx_req->skb)->gso_size + tx_req->hlen;
+               tx_res->tx_wr.wr.ud.header = tx_req->phead;
+               tx_res->tx_wr.wr.ud.hlen = tx_req->hlen;
+               tx_res->tx_wr.opcode = IB_WR_LSO;
+       } else {
+               tx_res->tx_wr.opcode = IB_WR_SEND;
+       }
+
+       vnic_dbg_data(login->name,
+                     "skb %p wr_id %llu sqpn 0x%06x dqpn 0x%06x num_sge "
+                     "%d phead %p was sent\n", tx_req->skb, wr_id, qp_res->qp->qp_num,
+                     dqpn, tx_res->tx_wr.num_sge, tx_req->phead);
+
+       /* if EoIB encap is OOB, copy the encap and the LSO header into lso_hdr */
+       if (!vnic_encap_headroom && skb_is_gso(tx_req->skb)) {
+               memcpy(tx_res->lso_hdr, VNIC_SKB_GET_ENCAP(tx_req->skb),
+                      VNIC_ENCAP_LEN);
+               memcpy((u8 *)(tx_res->lso_hdr) + VNIC_ENCAP_LEN,
+                      tx_res->tx_wr.wr.ud.header,
+                      tx_res->tx_wr.wr.ud.hlen);
+               tx_res->tx_wr.wr.ud.header = tx_res->lso_hdr;
+               tx_res->tx_wr.wr.ud.mss += VNIC_ENCAP_LEN;
+               tx_res->tx_wr.wr.ud.hlen += VNIC_ENCAP_LEN;
+       }
+
+       return vnic_ib_post_send(qp_res->qp, &tx_res->tx_wr, &bad_wr,
+                                tx_req->ip_off,
+                                tx_req->ip6_off,
+                                tx_req->tcp_off,
+                                tx_req->udp_off);
+}
+
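+/* DMA-map an skb for TX: the optional OOB encap header, the linear part
+ * and every page fragment; on partial failure unmap whatever was already
+ * mapped and return -EIO
+ */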
+static int vnic_dma_map_tx(struct ib_device *ca, struct vnic_tx_buf *tx_req)
+{
+       struct sk_buff *skb = tx_req->skb;
+       struct skb_shared_info *shinfo = skb_shinfo(skb);
+       u64 *mapping = tx_req->mapping;
+       int i = 0, off = 0, headlen = skb_headlen(skb);
+
+       if (vnic_encap_headroom && use_inline(skb))
+               return 0;
+
+       if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb)) {
+               mapping[off] = ib_dma_map_single(ca, VNIC_SKB_GET_ENCAP(skb),
+                                                VNIC_ENCAP_LEN, DMA_TO_DEVICE);
+               if (unlikely(ib_dma_mapping_error(ca, mapping[off])))
+                       return -EIO;
+               off++;
+       }
+
+       if (likely(headlen)) {
+               mapping[off] = ib_dma_map_single(ca, skb->data,
+                                                headlen, DMA_TO_DEVICE);
+               if (unlikely(ib_dma_mapping_error(ca, mapping[off])))
+                       goto partial_error;
+               off++;
+       }
+
+       for (i = 0; i < shinfo->nr_frags; ++i) {
+               skb_frag_t *frag = &shinfo->frags[i];
+               mapping[i + off] = ib_dma_map_page(ca, frag->page.p,
+                                                  frag->page_offset,
+                                                  frag->size, DMA_TO_DEVICE);
+               if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
+                       goto partial_error;
+       }
+
+       return 0;
+
+partial_error:
+       for (--i; i >= 0; i--) {
+               skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+               ib_dma_unmap_page(ca, mapping[i + off], frag->size,
+                                 DMA_TO_DEVICE);
+       }
+
+       if (headlen)
+               ib_dma_unmap_single(ca, mapping[--off], skb_headlen(skb),
+                                   DMA_TO_DEVICE);
+
+       if (!vnic_encap_headroom && !skb_is_gso(tx_req->skb))
+               ib_dma_unmap_single(ca, mapping[--off], VNIC_ENCAP_LEN,
+                                   DMA_TO_DEVICE);
+
+       return -EIO;
+}
+
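+/* vnic_send --
+ * fast-path TX: pick a slot in the TX ring, record checksum/TSO offsets
+ * from the EoIB header, DMA-map the skb, stop the queue when the ring
+ * fills up and post the send; in polling mode the TX CQ is drained
+ * inline instead of waiting for completion interrupts
+ */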
+void vnic_send(struct vnic_login *login, struct sk_buff *skb,
+              struct ib_ah *ah, u32 dqpn, int tx_res_index)
+{
+       struct eoibhdr *_eoib_hdr = VNIC_SKB_GET_ENCAP(skb);
+       struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+       struct vnic_tx_buf *tx_req;
+       unsigned long flags = 0;
+       u64 wr_id;
+       int tx_pkt_num = 1;
+       u8 ip_off;
+
+       if (!vnic_tx_polling)
+               spin_lock_irqsave(&tx_res->lock, flags);
+
+       ASSERT(tx_res_index < login->tx_rings_num);
+       wr_id = tx_res->tx_head & (vnic_tx_rings_len - 1);
+       tx_req = &tx_res->tx_ring[wr_id];
+       tx_req->skb = skb;
+
+       if (skb->ip_summed == CHECKSUM_PARTIAL) {
+               tx_req->ip_off = tx_req->ip6_off = tx_req->tcp_off = tx_req->udp_off = 0;
+               if (VNIC_IP_CSUM_OK(_eoib_hdr)) {
+                       ip_off = vnic_encap_headroom ?
+                               ((skb_network_header(skb) - skb->data) >> 1) :
+                               /* skb_network_header doesn't count the encap since it's OOB */
+                               ((skb_network_header(skb) - skb->data + VNIC_ENCAP_LEN) >> 1);
+                       switch (ntohs(skb->protocol)) {
+                       case ETH_P_IP:
+                               tx_req->ip_off = ip_off;
+                               break;
+                       case ETH_P_IPV6:
+                               tx_req->ip6_off = ip_off;
+                       }
+               }
+               if (VNIC_TCP_CSUM_OK(_eoib_hdr))
+                       tx_req->tcp_off =
+                           (skb_transport_header(skb) - skb_network_header(skb)) >> 2;
+               else if (VNIC_UDP_CSUM_OK(_eoib_hdr))
+                       tx_req->udp_off =
+                           (skb_transport_header(skb) - skb_network_header(skb)) >> 2;
+               ASSERT(!tx_req->udp_off || !tx_req->tcp_off);
+               vnic_dbg_data(login->name, "ip_off = %d, tcp_off = %d, udp_off = %d\n",
+                             tx_req->ip_off, tx_req->tcp_off, tx_req->udp_off);
+               VNIC_STATS_INC(login->port_stats.tx_chksum_offload);
+       }
+
+       /* TSO skb */
+       if (skb_is_gso(skb)) {
+               tx_req->hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
+               tx_req->phead = skb->data;
+               ASSERT(skb_pull(skb, tx_req->hlen));
+               VNIC_STATS_INC(login->port_stats.tso_packets);
+               tx_pkt_num = skb_shinfo(tx_req->skb)->gso_segs;
+       }
+
+       /* map tx skb */
+       if (unlikely(vnic_dma_map_tx(login->port->dev->ca, tx_req)))
+               goto err;
+
+       /* send.. unmap.. free skb.. drain tx cq.. [pray] */
+       if (unlikely(++tx_res->tx_outstanding == vnic_tx_rings_len)) {
+               if (++tx_res->tx_stopped_cnt % 100 == 0)
+                       vnic_dbg(login->name, "tx queue %d stopped cnt %d, outs %d\n",
+                                tx_res->index,
+                                tx_res->tx_stopped_cnt,
+                                tx_res->tx_outstanding);
+               ASSERT(!VNIC_TXQ_STOPPED(tx_res));
+               VNIC_TXQ_STOP(tx_res);
+               /* vnic_drain_arm_tx_cq() will arm the cq OR resume the ring */
+               VNIC_STATS_DO_INC(login->port_stats.queue_stopped);
+       }
+
+       ASSERT(tx_res->tx_outstanding <= vnic_tx_rings_len);
+
+       if (unlikely(vnic_post_send(login, tx_res_index, wr_id, ah, dqpn))) {
+               vnic_warn(login->name, "vnic_post_send failed\n");
+               VNIC_STATS_DO_INC(tx_res->stats.tx_errors);
+               VNIC_STATS_DO_INC(tx_res->stats.tx_dropped);
+               --tx_res->tx_outstanding;
+               vnic_dealloc_tx_skb(login, tx_res->index, wr_id);
+               /* no need to netif_wake_queue() here, because
+                * vnic_comp_handler_tx() will eventually be called for the
+                * armed CQ and will wake up the queue when it's ready
+                */
+       } else {
+               VNIC_STATS_DO_ADD(tx_res->stats.tx_packets, tx_pkt_num);
+               VNIC_STATS_DO_ADD(tx_res->stats.tx_bytes, skb->len);
+               login->dev->trans_start = jiffies;
+               ++tx_res->tx_head;
+
+               if (vnic_tx_polling) {
+                       if (likely(!skb_shared(skb)))
+                               skb_orphan(skb);
+                       else
+                               VNIC_STATS_DO_INC(login->port_stats.shared_packets);
+               }
+       }
+
+       /* poll every vnic_max_tx_outs packets */
+       if (vnic_tx_polling) {
+               if (tx_res->tx_outstanding > vnic_max_tx_outs ||
+                   VNIC_TXQ_STOPPED(tx_res))
+                       vnic_drain_arm_tx_cq(login, tx_res_index);
+       } else
+               spin_unlock_irqrestore(&tx_res->lock, flags);
+
+       return;
+
+err:
+       VNIC_STATS_DO_INC(tx_res->stats.tx_dropped);
+       VNIC_STATS_DO_INC(tx_res->stats.tx_errors);
+       dev_kfree_skb_any(skb);
+
+       if (!vnic_tx_polling)
+               spin_unlock_irqrestore(&tx_res->lock, flags);
+
+       return;
+}
+
+void vnic_ib_free_ring(struct vnic_rx_ring *ring)
+{
+       ASSERT(ring->srq);
+       ib_destroy_srq(ring->srq);
+}
+
+int vnic_ib_init_ring(struct vnic_rx_ring *ring)
+{
+       struct ib_srq_init_attr srq_attr;
+       struct vnic_port *port = ring->port;
+       int rc = 0, headroom = 10;
+
+       /* alloc SRQ */
+       memset(&srq_attr, 0, sizeof(struct ib_srq_init_attr));
+       srq_attr.attr.max_sge = VNIC_MAX_RX_FRAGS;
+       srq_attr.attr.max_wr = vnic_rx_rings_len + headroom;
+       srq_attr.attr.srq_limit = vnic_rx_rings_len + headroom;
+       ring->srq = ib_create_srq(port->pd, &srq_attr);
+       if (IS_ERR(ring->srq)) {
+               vnic_err(ring->port->name, "ib_create_srq failed, index %d, rc %d\n",
+                        ring->index, (int)PTR_ERR(ring->srq));
+               rc = (int)PTR_ERR(ring->srq);
+       }
+
+       return rc;
+}
+
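+/* allocate the per-port IB resources: a PD, a local DMA MR and the RX
+ * rings; everything is released in reverse order on failure
+ */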
+int vnic_port_ib_init(struct vnic_port *port)
+{
+       int i;
+
+       /* alloc PD */
+       port->pd = ib_alloc_pd(port->dev->ca);
+       if (IS_ERR(port->pd)) {
+               vnic_err(port->name, "failed to allocate PD\n");
+               goto err;
+       }
+       vnic_dbg_data(port->name, "port->pd %p\n", port);
+
+       /* alloc MR */
+       port->mr = ib_get_dma_mr(port->pd, IB_ACCESS_LOCAL_WRITE);
+       if (IS_ERR(port->mr)) {
+               vnic_err(port->name, "failed to allocate MR\n");
+               goto free_pd;
+       }
+       vnic_dbg_data(port->name, "port->mr %p\n", port->mr);
+
+       /* alloc RX RING */
+       for (i = 0; i < port->rx_rings_num; ++i) {
+               port->rx_ring[i] = vnic_create_rx_ring(port, i);
+               if (IS_ERR(port->rx_ring[i])) {
+                       vnic_err(port->name, "failed to allocate rx_ring %d\n", i);
+                       port->rx_ring[i] = NULL;
+                       goto free_rx_ring;
+               }
+       }
+       vnic_dbg_data(port->name, "allocated %d RX rings\n", port->rx_rings_num);
+
+       return 0;
+
+free_rx_ring:
+       for (i = 0; i < port->rx_rings_num; ++i)
+               vnic_destroy_rx_ring(port->rx_ring[i]);
+/* free_mr: */
+       ib_dereg_mr(port->mr);
+free_pd:
+       ib_dealloc_pd(port->pd);
+err:
+       return -EINVAL;
+
+}
+
+void vnic_port_ib_cleanup(struct vnic_port *port)
+{
+       int i;
+
+       for (i = 0; i < port->rx_rings_num; ++i)
+               vnic_destroy_rx_ring(port->rx_ring[i]);
+
+       ib_dereg_mr(port->mr);
+       ib_dealloc_pd(port->pd);
+
+       return;
+}
+
+void vnic_ib_dispatch_event(struct ib_event *event)
+{
+       return;
+}
+
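+/* apply interrupt moderation: program the requested frame/usec limits on
+ * every TX and RX CQ via ib_modify_cq(); -ENOSYS (moderation not
+ * supported by the device) is ignored
+ */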
+int vnic_ib_set_moder(struct vnic_login *login, u16 rx_usecs, u16 rx_frames,
+                     u16 tx_usecs, u16 tx_frames)
+{
+       int rc, i;
+
+       vnic_dbg_moder(login->name, "set coalescing params for mtu:%d to "
+                      "rx_frames:%d rx_usecs:%d, "
+                      "tx_frames:%d tx_usecs:%d, "
+                      "adaptive_rx_coal:%d, "
+                      "adaptive_tx_coal:%d, "
+                      "sample_interval:%d, "
+                      "port.state: %d\n",
+                      login->dev->mtu,
+                      rx_frames, rx_usecs,
+                      tx_frames, tx_usecs,
+                      login->adaptive_rx_coal, 0,
+                      login->sample_interval, login->port->attr.state);
+
+       for (i = 0; i < login->tx_rings_num; ++i) {
+               rc = ib_modify_cq(login->tx_res[i].cq, tx_frames, tx_usecs);
+               if (rc && rc != -ENOSYS) {
+                       vnic_warn(login->name, "failed modifying tx_res,"
+                                 " rc %d, tx ring index %d\n", rc, i);
+                       return rc;
+               }
+       }
+
+       for (i = 0; i < login->rx_rings_num; ++i) {
+               rc = ib_modify_cq(login->rx_res[i].cq, rx_frames, rx_usecs);
+               if (rc && rc != -ENOSYS) {
+                       vnic_warn(login->name, "failed modifying rx_res,"
+                                 " rc %d, rx ring index %d\n", rc, i);
+                       return rc;
+               }
+       }
+
+       return 0;
+}
+
+int vnic_ib_down(struct net_device *dev)
+{
+       return 0;
+}
+
+int vnic_ib_up(struct net_device *dev)
+{
+       return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_mac.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_mac.c
new file mode 100644 (file)
index 0000000..996d70d
--- /dev/null
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+#include "vnic_fip_discover.h"
+
+static void vnic_mace_dealloc(struct vnic_mac *mace)
+{
+       ASSERT(mace);
+       kfree(mace);
+}
+
+static struct vnic_mac *vnic_mace_alloc(const u8 *mac, u16 vnic_id)
+{
+       struct vnic_mac *mace;
+
+       mace = kzalloc(sizeof *mace, GFP_ATOMIC);
+       if (!mace)
+               return ERR_PTR(-ENOMEM);
+
+       /* set mac entry fields */
+       memcpy(mace->mac, mac, ETH_ALEN);
+       mace->created = jiffies;
+       mace->last_tx = jiffies;
+       mace->vnic_id = vnic_id;
+
+       return mace;
+}
+
+static void vnic_mace_del(struct vnic_login *login, struct vnic_mac *mace)
+{
+       ASSERT(mace);
+       rb_erase(&mace->rb_node, &login->mac_tree);
+}
+
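+/* insert a mac entry into the login's mac_tree; the red-black tree is
+ * keyed by the 6-byte mac (memcmp order) and duplicates are rejected
+ * with -EEXIST
+ */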
+static int vnic_mace_add(struct vnic_login *login, struct vnic_mac *mace)
+{
+       struct rb_node **n = &login->mac_tree.rb_node, *pn = NULL;
+       struct vnic_mac *mace_t;
+       int rc;
+
+       while (*n) {
+               pn = *n;
+               mace_t = rb_entry(pn, struct vnic_mac, rb_node);
+               rc = memcmp(mace->mac, mace_t->mac, ETH_ALEN);
+               if (rc < 0)
+                       n = &pn->rb_left;
+               else if (rc > 0)
+                       n = &pn->rb_right;
+               else {
+                       rc = -EEXIST;
+                       goto out;
+               }
+       }
+
+       rb_link_node(&mace->rb_node, pn, n);
+       rb_insert_color(&mace->rb_node, &login->mac_tree);
+       rc = 0;
+
+out:
+       return rc;
+}
+
+/* vnic_mace_search --
+ * Return entry pointer if found, or ERR_PTR(-ENODATA) if not found.
+ */
+static struct vnic_mac *vnic_mace_search(struct vnic_login *login, u8 *mac)
+{
+       struct rb_node *n = login->mac_tree.rb_node;
+       struct vnic_mac *mace_t;
+       int rc;
+
+       ASSERT(login);
+       ASSERT(mac);
+
+       while (n) {
+               mace_t = rb_entry(n, struct vnic_mac, rb_node);
+               ASSERT(mace_t);
+               rc = memcmp(mac, mace_t->mac, ETH_ALEN);
+               if (rc < 0)
+                       n = n->rb_left;
+               else if (rc > 0)
+                       n = n->rb_right;
+               else
+                       goto out;
+       }
+
+       mace_t = ERR_PTR(-ENODATA);
+
+out:
+       return mace_t;
+}
+
+/* vnic_mace_update --
+ * Remove: -ENODATA if not found, if removed, update ref_cnt, return 0
+ * Add:    -ENOMEM if no mem, -EEXIST if already exists,
+ *         if added, update ref_cnt, return 0
+ * NOTE: ref counters must be updated here, as this function is
+ *       shared among multiple entry points
+ */
+int vnic_mace_update(struct vnic_login *login, u8 *mac, u16 vnic_id, int remove)
+{
+       struct vnic_mac *mace;
+       int rc;
+
+       mace = vnic_mace_search(login, mac);
+       if (remove) {
+               if (IS_ERR(mace))
+                       return -ENODATA;
+               vnic_mace_del(login, mace);
+               vnic_mace_dealloc(mace);
+               /* update ref cnt */
+               ASSERT(atomic_read(&login->vnic_child_cnt));
+               atomic_dec(&login->vnic_child_cnt);
+       } else {
+               if (PTR_ERR(mace) != -ENODATA)
+                       return -EEXIST;
+
+               /* test ref cnt */
+               if (atomic_read(&login->vnic_child_cnt) + 1 > vnic_child_max) {
+                       vnic_warn(login->name, "too many child vNics, max %d\n",
+                                 vnic_child_max);
+                       return -EUSERS; /* too many users */
+               }
+
+               mace = vnic_mace_alloc(mac, vnic_id);
+               if (!mace)
+                       return -ENOMEM;
+
+               rc = vnic_mace_add(login, mace);
+               if (rc) {
+                       vnic_mace_dealloc(mace);
+                       return rc;
+               }
+               /* update ref cnt */
+               atomic_inc(&login->vnic_child_cnt);
+               vnic_dbg_mac(login->name,
+                            "updated mac "MAC_6_PRINT_FMT" remove %d\n",
+                            MAC_6_PRINT_ARG(mac), remove);
+       }
+
+       return 0;
+}
+
+/* vnic_child_update --
+ * may be called from the fast data path, so the login instance must be
+ * protected by the caller; the likely/unlikely hints below match the
+ * hard_start_xmit fast flow
+ * + caller must hold login->mac_rwlock (read_lock is enough because we only
+ *   queue the job here)
+ * + queues a job that will create (or remove) a child vNic
+ */
+int vnic_child_update(struct vnic_login *login, u8 *mac, int remove)
+{
+       struct vnic_mac *mace;
+       char *cmd_str;
+       struct fip_hadmin_cmd *cmd_hadmin;
+       int count, rc = -EINVAL;
+       u16 vnic_id = 0;
+
+       vnic_dbg_func(login->name);
+
+       mace = vnic_mace_search(login, mac);
+
+       /* if asked to add, and data already exists, abort */
+       if (likely(!remove && !IS_ERR(mace))) {
+               mace->last_tx = jiffies;
+               return -EEXIST;
+       }
+
+       if (!remove) {
+               /* test whether there are too many child vNics; the same
+                * check exists in vnic_mace_update(), but we repeat it here
+                * so vnic_set_mac can return a friendly rc
+                */
+               if (atomic_read(&login->vnic_child_cnt) + 1 > vnic_child_max) {
+                       vnic_warn(login->name, "too many child vNics, "
+                                 "max %d\n", vnic_child_max);
+                       return -EUSERS; /* too many users */
+               }
+
+               ASSERT(mace);
+               /* generate new vnic_id only when new child is being added */
+               vnic_id = atomic_inc_return(&login->port->vnic_child_ids);
+               /* set bit 14 so we avoid conflict with normal host/net admin */
+               vnic_id %= (1 << (VNIC_ID_LEN - 2));
+               vnic_id |= (1 << (VNIC_ID_LEN - 2));
+
+               /* TODO: update hadmin user-script and manual to make hadmin
+                * vnic_id interval >= 16K (1<<14 == 16384) so bit 14 is clear
+                * for parent host admin.
+                * to avoid atomic counter wrap around, move to bitmap array
+                */ 
+       } else {
+               /* if asked to remove, and data not found, abort */
+               if (IS_ERR(mace))
+                       return -ENODATA;
+
+               ASSERT(mace);
+               vnic_id = mace->vnic_id;
+       }
+
+       /* allocate cmd structs (too big to be local vars); use GFP_ATOMIC
+        * because this function can be called from the data path
+        */
+       cmd_str = kmalloc(sizeof *cmd_str * PAGE_SIZE, GFP_ATOMIC);
+       if (!cmd_str)
+               return -ENOMEM;
+
+       cmd_hadmin = kmalloc(sizeof *cmd_hadmin, GFP_ATOMIC);
+       if (!cmd_hadmin) {
+               kfree(cmd_str);
+               return -ENOMEM;
+       }
+
+       /* inherit command from parent, change:
+        * name, parent, mac, vnic_id and source
+        * Note: cannot use parent login->fip_vnic->cmd here
+        * in order to support net-admin-vnics
+        */
+       vnic_login_cmd_init(cmd_hadmin);
+
+       /* child vNic name scheme:
+        * eth<parent-cnt>.c<child-vnic-id>
+        * Note: avoid sysfs files conflict (that's why parent unique cnt must
+        * be included in the name here)
+        */
+       snprintf(cmd_hadmin->c_name, MAX_INPUT_LEN, "%s%u.c%u",
+                "eth", login->cnt, vnic_id);
+       snprintf(cmd_hadmin->c_mac, MAX_INPUT_LEN, MAC_6_PRINT_FMT,
+                MAC_6_PRINT_ARG(mac));
+       snprintf(cmd_hadmin->c_vnic_id, MAX_INPUT_LEN, "%u",
+                vnic_id);
+       snprintf(cmd_hadmin->c_eport, MAX_INPUT_LEN, "%s",
+                login->fip_vnic->gw_info.gw_port_name);
+       snprintf(cmd_hadmin->c_parent, MAX_INPUT_LEN, "%s",
+                login->dev->name);
+       snprintf(cmd_hadmin->c_bxname, MAX_INPUT_LEN, "%s",
+                login->fip_vnic->gw_info.system_name);
+       snprintf(cmd_hadmin->c_bxguid, MAX_INPUT_LEN, VNIC_GUID_FMT,
+                VNIC_GUID_RAW_ARG(login->fip_vnic->gw_info.system_guid));
+
+       /* all hadmin vNics must use same BX format (guid vs. name) */
+       if (login->fip_vnic->hadmined) {
+               snprintf(cmd_hadmin->c_bxname, MAX_INPUT_LEN, "%s",
+                        login->fip_vnic->cmd.c_bxname);
+               snprintf(cmd_hadmin->c_bxguid, MAX_INPUT_LEN, "%s",
+                        login->fip_vnic->cmd.c_bxguid);
+       }
+
+       /* VLAN is optional, set it only when used by parent */
+       if (login->vlan_used)
+               snprintf(cmd_hadmin->c_vid, MAX_INPUT_LEN, "%d",
+                        login->fip_vnic->vlan);
+
+       /* ready to set the command */
+       count = vnic_login_cmd_set(cmd_str, cmd_hadmin);
+       if (!count)
+               goto out;
+
+       /* queue the job (similar to the sysfs write function); it will
+        * eventually call fip_discover_hadmin_update_parent() ->
+        * vnic_mace_update()
+        */
+       count = fip_hadmin_sysfs_update(login->port, cmd_str, count, remove);
+       if (count <= 0 && count != -EEXIST)
+               goto out;
+
+       /* at this point, job queued, return success */
+       rc = 0;
+
+out:
+       kfree(cmd_str);
+       kfree(cmd_hadmin);
+       return rc;
+}
+
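+/* flush child vNics from the mac_tree; unless 'all' is set, the entry
+ * matching the parent's own dev_addr is kept
+ */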
+void vnic_child_flush(struct vnic_login *login, int all)
+{
+       struct rb_node *n;
+       struct vnic_mac *mace, *mace_t;
+       LIST_HEAD(local_list);
+
+       vnic_dbg_func(login->name);
+
+       n = rb_first(&login->mac_tree);
+       while (n) {
+               mace = rb_entry(n, struct vnic_mac, rb_node);
+               list_add_tail(&mace->list, &local_list);
+               n = rb_next(n);
+       }
+
+       list_for_each_entry_safe(mace, mace_t, &local_list, list) {
+               list_del(&mace->list);
+               /* if not-flush-all, and mac is dev_addr mac, skip this entry */
+               if (!all && !memcmp(login->dev->dev_addr, mace->mac, ETH_ALEN))
+                       continue;
+               vnic_child_update(login, mace->mac, 1);
+               vnic_mace_del(login, mace);
+               vnic_mace_dealloc(mace);
+       }
+}
+
+/* vnic_parent_update --
+ * find the parent vNic, add the child vNic to its mac_tree and sync the
+ * child's qp_base_num with the parent's.
+ * For child removal it is OK not to find the parent or the child mac entry.
+ */
+int vnic_parent_update(struct vnic_port *port, char *name, u16 vnic_id,
+                      u8 *mac, u32 *qp_base_num_ptr, char *parent_name,
+                      int remove)
+{
+       struct vnic_login *login;
+       int rc = -ENODATA;
+
+       vnic_dbg_func(name);
+
+       mutex_lock(&port->mlock);
+       list_for_each_entry(login, &port->login_list, list) {
+               vnic_dbg_mac(name, "checking parent %s for child %s (expect %s)\n",
+                            login->dev->name, name, parent_name);
+               /* check if parent vnic has valid QPN and not being destroyed */
+               if (!strcmp(login->dev->name, parent_name) &&
+                   test_bit(VNIC_STATE_LOGIN_PRECREATE_2, &login->fip_vnic->login_state) &&
+                   !login->fip_vnic->flush) {
+                       /* sync qp_base_num with parent */
+                       if (qp_base_num_ptr)
+                               *qp_base_num_ptr = login->qp_base_num;
+
+                       /* update mac_tree and mace vnic_id */
+                       write_lock_bh(&login->mac_rwlock);
+                       rc = vnic_mace_update(login, mac, vnic_id, remove);
+                       write_unlock_bh(&login->mac_rwlock);
+
+                       break;
+               }
+       }
+
+       mutex_unlock(&port->mlock);
+
+       /* for vNic removal, ignore rc */
+       return remove ? 0 : rc;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_main.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_main.c
new file mode 100644 (file)
index 0000000..7e17e8d
--- /dev/null
@@ -0,0 +1,1179 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+void vnic_login_refresh_mcasts(struct vnic_port *port)
+{
+       struct vnic_login *login;
+
+       vnic_dbg_mark();
+       mutex_lock(&port->mlock);
+       list_for_each_entry(login, &port->login_list, list)
+               vnic_tree_mcast_detach(&login->mcast_tree);
+       list_for_each_entry(login, &port->login_list, list) {
+               if (vnic_sa_query) {
+                       /* take the tx lock to make sure no delete function
+                        * is called concurrently
+                        */
+                       netif_tx_lock_bh(login->dev);
+                       vnic_neigh_invalidate(login);
+                       netif_tx_unlock_bh(login->dev);
+               }
+
+               vnic_tree_mcast_attach(&login->mcast_tree);
+       }
+       mutex_unlock(&port->mlock);
+}
+
+int vnic_login_pre_create_1(struct vnic_port *port,
+                           struct fip_vnic_data *vnic)
+{
+       struct vnic_login *login;
+       struct net_device *dev;
+
+       /* set login to zero first (for parent_used case) */
+       vnic->login = NULL;
+
+       /* if parent_used, skip */
+       if (vnic->parent_used) {
+               vnic_dbg_mac(vnic->name, "function skipped\n");
+               return 0;
+       } else {
+               vnic_dbg_func(vnic->name);
+       }
+
+       /* create netdev per login, vlan configuration is done from outside */
+       dev = vnic_alloc_netdev(port);
+       if (IS_ERR(dev)) {
+               vnic_err(port->name, "vnic_alloc_netdev failed\n");
+               goto err;
+       }
+
+       login = vnic_netdev_priv(dev);
+       login->fip_vnic = vnic;
+       vnic->login = login;
+       login->vlan_used = vnic->vlan_used;
+       login->dev->hard_header_len += (vnic->vlan_used && vnic->hadmined) ? VLAN_HLEN : 0;
+       vnic_dbg_fip(vnic->name, "creating vnic, hadmin=%d vlan_used=%d hard_header_len += %d\n",
+                    vnic->hadmined, vnic->vlan_used,
+                    (vnic->vlan_used && vnic->hadmined) ? VLAN_HLEN : 0);
+       set_bit(VNIC_STATE_LOGIN_PRECREATE_1, &vnic->login_state);
+
+       return 0;
+
+err:
+       return -ENODEV;
+}
+
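+/* second pre-create stage: allocate the runt-padding buffer, the TX and
+ * RX completion resources and the QP range, then publish the base QP
+ * number and kick off the periodic stats task
+ */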
+int vnic_login_pre_create_2(struct fip_vnic_data *vnic, int qps_num, int is_lag)
+{
+       struct vnic_login *login = vnic->login;
+       int i, j;
+
+       if (vnic->parent_used) {
+               vnic_dbg_mac(vnic->name, "function skipped\n");
+               return 0;
+       } else {
+               ASSERT(login);
+               vnic_dbg_func(login->name);
+       }
+
+       login->qps_num = qps_num;
+       login->qkey = VNIC_DATA_QKEY;
+       login->is_lag = is_lag;
+       VNIC_TXQ_SET_ACTIVE(login, min(login->tx_rings_num, login->qps_num));
+
+       /* prepare padding for runt packets */
+       login->pad_va = kzalloc(VNIC_EOIB_ZLEN_MAX, GFP_KERNEL);
+       if (!login->pad_va)
+               return -ENOMEM;
+
+       login->pad_dma = ib_dma_map_single(login->port->dev->ca, login->pad_va,
+                                          VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE);
+       if (ib_dma_mapping_error(login->port->dev->ca, login->pad_dma))
+               goto err;
+
+       /* create TX resources */
+       for (i = 0; i < login->tx_rings_num; ++i) {
+               if (vnic_create_tx_res(login, i)) {
+                       vnic_err(login->name, "vnic_create_tx_res failed,"
+                                " index %d\n", i);
+                       goto free_tx_res;
+               }
+       }
+
+       /* create RX resources */
+       for (j = 0; j < login->rx_rings_num; ++j) {
+               if (vnic_create_rx_res(login, j)) {
+                       vnic_err(login->name, "vnic_create_rx_res failed,"
+                                " index %d\n", j);
+                       goto free_rx_res;
+               }
+       }
+
+       /* create QPs */
+       if (vnic_create_qp_range(login)) {
+               vnic_err(login->name, "vnic_create_qp_range failed\n");
+               goto free_rx_res;
+       }
+
+       /* first QP is the base QP */
+       login->qp_base_num = login->qp_res[0].qp->qp_num;
+       vnic->qp_base_num = login->qp_base_num;
+
+       /* update state */
+       set_bit(VNIC_STATE_LOGIN_PRECREATE_2, &vnic->login_state);
+
+       login->queue_stopped = 0;
+
+       /* calls vnic_do_get_stats() */
+       queue_delayed_work(login_wq, &login->stats_task, VNIC_STATS_DELAY);
+
+       return 0;
+
+free_rx_res:
+       for (--j; j >= 0; --j)
+               vnic_destroy_rx_res(login, j);
+
+       i = login->tx_rings_num;
+free_tx_res:
+       for (--i; i >= 0; --i)
+               vnic_destroy_tx_res(login, i);
+/*free_pad:*/
+       ib_dma_unmap_single(login->port->dev->ca, login->pad_dma,
+                           VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE);
+err:
+       kfree(login->pad_va);
+       return -ENODEV;
+}
+
+int vnic_login_register_netdev(struct fip_vnic_data *vnic,
+                              const char *mac,
+                              const char *name)
+{
+       struct vnic_login *login = vnic->login;
+
+       if (vnic->parent_used) {
+               vnic_dbg_mac(vnic->name, "function skipped\n");
+               vnic_info("%s created (parent %s mac "MAC_6_PRINT_FMT")\n",
+                         name, vnic->parent_name,
+                         MAC_6_PRINT_ARG(vnic->mac_cache));
+               return 0;
+       } else {
+               ASSERT(login);
+               vnic_dbg_func(login->name);
+       }
+
+       /* set netdev name and mac */
+       if (name)
+               strncpy(login->dev->name, name, IFNAMSIZ);
+       if (mac) {
+               memcpy(login->dev->dev_addr, mac, ETH_ALEN);
+               /* save original mac */
+               memcpy(login->dev_addr, mac, ETH_ALEN);
+       }
+
+       /* set device features according to all_vlan mode */
+       login->dev->features |= NETIF_F_HIGHDMA;
+
+       /* without an all-VLAN gateway the vNic cannot carry stacked VLAN
+        * devices (VLAN_CHALLENGED) and HW VLAN filtering is disabled;
+        * with an all-VLAN gateway, expose HW VLAN filtering
+        */
+       if (!vnic->all_vlan_gw) {
+               login->dev->features |= NETIF_F_VLAN_CHALLENGED;
+               login->dev->features &= ~NETIF_F_HW_VLAN_FILTER;
+       } else {
+               login->dev->features |= NETIF_F_HW_VLAN_FILTER;
+       }
+
+       /* register netdev */
+       if (register_netdev(login->dev)) {
+               vnic_err(login->name, "register_netdev failed name=%s mac="
+                        MAC_6_PRINT_FMT" login->dev=%p\n",
+                        name ? name : "net_admin",
+                        MAC_6_PRINT_ARG(login->dev->dev_addr), login->dev);
+               goto err;
+       }
+
+       /* encode the port number in dev_id:
+        * This allows us to associate the net device
+        * with the underlying device's port.
+        */
+       login->dev->dev_id = login->port->num - 1;
+
+       if (vnic_create_dentry(login)) {
+               vnic_err(login->name, "vnic_create_dentry failed\n");
+               goto err;
+       }
+
+       /* print info only after register_netdev so dev->name is valid */
+       sprintf(login->name, "%s", login->dev->name);
+       vnic_info("%s created (%s port %d)\n",
+                 login->dev->name,
+                 login->port->dev->ca->name, login->port->num);
+
+       /* disable tx queues and carrier. They will be started
+        * after create 2 is called and the mcast is attached
+        */
+       netif_tx_disable(login->dev);
+       netif_carrier_off(login->dev);
+
+       mutex_lock(&login->port->mlock);
+       vnic_dbg_mac(login->name, "added to login_list\n");
+       list_add_tail(&login->list, &login->port->login_list);
+       mutex_unlock(&login->port->mlock);
+
+       set_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state);
+
+       return 0;
+
+err:
+       return -EINVAL;
+}
+
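+/* vnic_login_complete_ack --
+ * apply the parameters carried in the gateway's login ACK (pkey, SL,
+ * vnic_id, VLAN, mgid prefix), create the gateway neighbour entry and
+ * join the default/broadcast (and optional shared-vnic) multicast groups
+ */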
+int vnic_login_complete_ack(struct fip_vnic_data *vnic,
+                           struct fip_login_data *login_data,
+                           struct fip_shared_vnic_data *shared_vnic)
+{
+       struct vnic_mcast *mcaste, *mcaste_bcast, *mcast_shared = NULL;
+       struct vnic_login *login = vnic->login;
+       int rc;
+       int first_time_vlan = 0;
+
+       if (vnic->parent_used) {
+               vnic_dbg_mac(vnic->name, "function skipped\n");
+               return 0;
+       } else {
+               ASSERT(login);
+               vnic_dbg_func(login->name);
+       }
+
+       /*
+        * TODO: check whether all of these are needed, check overlap with
+        * gw_neigh, and check how the pkey is passed from FIP
+        */
+       login->pkey = login_data->pkey;
+       login->pkey_index = login_data->pkey_index;
+       login->n_mac_mcgid = login_data->n_mac_mcgid;
+       login->gw_port_id = login_data->port_id;
+
+       /*GW should send the data SL from the login packet*/
+       login->sl = login_data->sl;
+
+       login->vnic_id = login_data->vnic_id;
+
+       memcpy(login->mgid_prefix, login_data->mgid_prefix, VNIC_MGID_PREFIX_LEN);
+       memcpy(login->vnic_name, login_data->vnic_name, sizeof(login_data->vnic_name));
+       memcpy(login->vendor_id, login_data->vendor_id, sizeof(login_data->vendor_id));
+
+       VNIC_STR_STRIP(login->vnic_name);
+       VNIC_STR_STRIP(login->vendor_id);
+
+       /* set VLAN and ZLEN (the minimum frame length varies with VLAN support) */
+       login->zlen = ETH_ZLEN + (vnic_encap_headroom ? VNIC_ENCAP_LEN : 0);
+       first_time_vlan = !login->vlan_used; /* always false for hadmin vnics with vlans */
+       login->vlan_used = login_data->vp;
+       login->all_vlan_gw = login_data->all_vlan_gw;
+       if ((VNIC_VLAN_ENABLED(login))) {
+               login->vid = cpu_to_be16(login_data->vlan);
+               if (first_time_vlan) {
+                       vnic_dbg_fip(login->dev->name,"Updating hard_header_len %d+%d=%d\n",
+                                                login->dev->hard_header_len, VLAN_HLEN,
+                                                login->dev->hard_header_len + VLAN_HLEN);
+                       login->dev->hard_header_len += VLAN_HLEN;
+               }
+               login->zlen = ETH_ZLEN + VLAN_HLEN + (vnic_encap_headroom ? VNIC_ENCAP_LEN : 0);
+       }
+
+       /* create gw_neigh (no RSS when sending to the GW);
+        * use a zero mac to describe the GW L2 address
+        */
+       login->gw_neigh = 
+               vnic_neighe_alloc(login, NULL, login_data->lid,
+                                 login_data->qpn, 0);
+       if (IS_ERR(login->gw_neigh)) {
+               vnic_err(login->name, "failed to alloc gw neigh\n");
+               goto err;
+       }
+
+       /* alloc mcast entries here to simplify the error flow */
+       mcaste = vnic_mcast_alloc(login->port, NULL, NULL);
+       if (IS_ERR(mcaste))
+               goto err_free_gw_ah;
+       mcaste_bcast = vnic_mcast_alloc(login->port, NULL, NULL);
+       if (IS_ERR(mcaste_bcast)) {
+               vnic_mcast_dealloc(mcaste);
+               goto err_free_gw_ah;
+       }
+       /* used by shared vnic mcast group */
+       if (shared_vnic && shared_vnic->enabled) {
+               mcast_shared = vnic_mcast_alloc(login->port, NULL, NULL);
+               if (IS_ERR(mcast_shared)) {
+                       vnic_mcast_dealloc(mcaste);
+                       vnic_mcast_dealloc(mcaste_bcast);
+                       goto err_free_gw_ah;
+               }
+       }
+
+       /* attach to default mgid */
+       __vnic_mcaste_fill(login, mcaste, login->gw_port_id, ETH_ZERO_MAC, 0, vnic_mcast_create);
+       mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+       mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+       mcaste->attach_cb = __bcast_attach_cb;
+       mcaste->detach_cb = __bcast_detach_cb;
+       mcaste->attach_cb_ctx = login;
+       mcaste->detach_cb_ctx = login;
+       rc = vnic_mcast_add(&login->mcast_tree, mcaste);
+       ASSERT(!rc);
+       rc = vnic_mcast_attach(&login->mcast_tree, mcaste);
+       ASSERT(!rc);
+
+       /* attach to bcast mgid (use default mlid) */
+       if (login->n_mac_mcgid || vnic_mgid_data_type) {
+               __vnic_mcaste_fill(login, mcaste_bcast, login->gw_port_id, ETH_BCAST_MAC, 0, 0);
+               mcaste_bcast->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+               mcaste_bcast->retry = VNIC_MCAST_ULIMIT_RETRY;
+               /* The port gid is overrun by the default gid as part of the
+                * mgid-over-same-mlid hack */
+               memcpy(&mcaste_bcast->port_gid, &mcaste->port_gid, GID_LEN);
+               rc = vnic_mcast_add(&login->mcast_tree, mcaste_bcast);
+               ASSERT(!rc);
+               rc = vnic_mcast_attach(&login->mcast_tree, mcaste_bcast);
+               ASSERT(!rc);
+       } else {
+               vnic_mcast_dealloc(mcaste_bcast);
+       }
+
+       login->shared_vnic = 0;
+       /* attach to the shared vnic mcast group (use default mlid) */
+       if (shared_vnic && shared_vnic->enabled) {
+               u8 rss_hash = shared_vnic->ip[0] ^  shared_vnic->ip[1] ^
+                       shared_vnic->ip[2] ^ shared_vnic->ip[3];
+
+               login->shared_vnic = 1;
+               __vnic_mcaste_fill(login, mcast_shared, login->gw_port_id, shared_vnic->emac, 0, 0);
+               mcast_shared->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+               mcast_shared->retry = VNIC_MCAST_ULIMIT_RETRY;
+               memcpy(&mcast_shared->port_gid, &mcaste->port_gid, GID_LEN);
+               mcast_shared->gid.raw[12]= rss_hash;
+
+               vnic_dbg_mcast(login->name, "vnic %s attaching shared vnic 1 "
+                              "MGID "VNIC_GID_FMT"\n", login->name,
+                              VNIC_GID_RAW_ARG(mcast_shared->gid.raw));
+               mcaste = mcast_shared;
+               memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN);
+               rc = vnic_mcast_add(&login->mcast_tree, mcaste);
+               ASSERT(!rc);
+               rc = vnic_mcast_attach(&login->mcast_tree, mcaste);
+               ASSERT(!rc);
+       }
+
+       /* set state */
+       set_bit(VNIC_STATE_LOGIN_CREATE_2, &vnic->login_state);
+
+       /* call vnic_open() if open was called when we were not ready to handle it */
+       if (test_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state))
+#ifndef _BP_NO_NDO_OPS
+               login->dev->netdev_ops->ndo_open(login->dev);
+#else
+               login->dev->open(login->dev);
+#endif
+
+       return 0;
+
+err_free_gw_ah:
+       vnic_neighe_dealloc(login->gw_neigh);
+err:
+       return -EINVAL;
+}
+
+/*
+ * When destroying a login, call this to stop the login wq tasks. Do not call
+ * from login_wq context.
+ */
+void vnic_login_destroy_stop_wq(struct fip_vnic_data *vnic, enum fip_flush flush)
+{
+       struct vnic_login *login = vnic->login;
+
+       if (vnic->parent_used) {
+               vnic_dbg_mac(vnic->name, "function skipped\n");
+               return;
+       } else {
+               ASSERT(login);
+               vnic_dbg_func(login->name);
+       }
+
+       if (test_bit(VNIC_STATE_LOGIN_PRECREATE_1, &vnic->login_state)) {
+               /* cancel vnic_auto_moder() */
+               vnic_dbg_mark();
+               mutex_lock(&login->moder_lock);
+               login->queue_stopped = 1;
+               mutex_unlock(&login->moder_lock);
+#ifndef _BP_WORK_SYNC
+               cancel_delayed_work_sync(&login->stats_task);
+               if (cancel_delayed_work_sync(&login->mcast_task))
+                       dev_put(login->dev);
+               cancel_delayed_work_sync(&login->restart_task);
+#else
+               cancel_delayed_work(&login->stats_task);
+               if (cancel_delayed_work(&login->mcast_task))
+                       dev_put(login->dev);
+               cancel_delayed_work(&login->restart_task);
+               flush_workqueue(login_wq);
+#endif
+       }
+}
+
+/*
+ * Destroy the login data struct. Assumes all login wq tasks are stopped.
+ * Can be called from any context, might block for a few secs.
+ */
+void vnic_login_destroy_wq_stopped(struct fip_vnic_data *vnic, enum fip_flush flush)
+{
+       struct vnic_login *login = vnic->login;
+       unsigned long flags;
+       int i;
+
+       if (vnic->parent_used) {
+               vnic_dbg_mac(vnic->name, "function skipped\n");
+               vnic_info("%s destroyed (parent %s mac "MAC_6_PRINT_FMT")\n",
+                         vnic->interface_name, vnic->parent_name,
+                         MAC_6_PRINT_ARG(vnic->mac_cache));
+               /* Note: vNics can be logged out by BXM (bypass sysfs calls)
+                * so we need to cleanup the parent here as well
+                * if we reach this function from sysfs calls,
+                * then vnic_parent_update will have no effect here (ok)
+                */
+               vnic_parent_update(vnic->port, vnic->name, vnic->vnic_id,
+                                  vnic->mac_cache, NULL, vnic->parent_name, 1);
+               return;
+       } else {
+               ASSERT(login);
+               vnic_dbg_func(login->name);
+       }
+
+       /* the cleanup procedure depends on our state, our vnic type
+        * (host/network admin), and the cleanup level required. Network admined
+        * vnics have a single create state and only one cleanup level (full).
+        * Host admined vnics have two create states (init, regular) and two
+        * cleanup levels. The flow depends on the reason for the cleanup. */
+       vnic_dbg_data(login->name, "vnic_login_destroy flush=%d\n", flush);
+
+       /* we need to change state to prevent from completion to re-open the TX
+        * queue once we close it. Before calling stop() function, need to make
+        * sure that all on-going hard_start_xmit() calls are done.
+        */
+
+       if (test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) {
+               set_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state);
+               netif_tx_disable(login->dev);
+               vnic_dbg_mark();
+       }
+
+       if (test_and_clear_bit(VNIC_STATE_LOGIN_CREATE_2, &vnic->login_state)) {
+               if (test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state)) {
+                       /* calls vnic_stop() */
+#ifndef _BP_NO_NDO_OPS
+                       login->dev->netdev_ops->ndo_stop(login->dev);
+#else
+                       login->dev->stop(login->dev);
+#endif
+                       set_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state);
+                       vnic_dbg_mark();
+               }
+               vnic_mcast_del_all(&login->mcast_tree);
+               vnic_member_remove_all(login);
+               vnic_neighe_dealloc(login->gw_neigh);
+               vnic_dbg_mark();
+       }
+       if (test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state))
+               clear_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state);
+
+       if (flush == FIP_FULL_FLUSH &&
+           test_and_clear_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) {
+               mutex_lock(&login->port->mlock);
+               vnic_dbg_mac(login->name, "delete from login_list\n");
+               list_del(&login->list);
+               mutex_unlock(&login->port->mlock);
+
+               /* print info if register_netdev was called before so
+                * dev->name is valid
+                */
+               vnic_info("%s destroyed (%s port %d)\n", login->dev->name,
+                         login->port->dev->ca->name, login->port->num);
+
+               /* use irq save so caller function supports any context */
+               write_lock_irqsave(&login->mac_rwlock, flags);
+               vnic_child_flush(login, 1);
+               write_unlock_irqrestore(&login->mac_rwlock, flags);
+
+               vnic_delete_dentry(login);
+               unregister_netdev(login->dev);
+               vnic_dbg_mark();
+       }
+
+       vnic_dbg_mark();
+       /* login_ctx was in pre created state [always true] */
+       spin_lock_bh(&login->stats_lock);
+       if (test_and_clear_bit(VNIC_STATE_LOGIN_PRECREATE_2, &vnic->login_state)) {
+               spin_unlock_bh(&login->stats_lock);
+               vnic_dbg_mark();
+               /* take port->mlock in case a refresh event (vnic_refresh_mcasts) is in progress */
+               mutex_lock(&login->port->mlock);
+               /* tx queues are already stopped here */
+               vnic_neigh_del_all(login);
+               vnic_mcast_del_all(&login->mcast_tree);
+               for (i = 0; i < login->qps_num; ++i)
+                       vnic_destroy_qp(login, i);
+               mutex_unlock(&login->port->mlock);
+
+               for (i = 0; i < login->rx_rings_num; ++i)
+                       vnic_destroy_rx_res(login, i);
+               for (i = 0; i < login->tx_rings_num; ++i)
+                       vnic_destroy_tx_res(login, i);
+               ib_dma_unmap_single(login->port->dev->ca, login->pad_dma,
+                                   VNIC_EOIB_ZLEN_MAX, DMA_TO_DEVICE);
+               kfree(login->pad_va);
+       } else
+               spin_unlock_bh(&login->stats_lock);
+
+       if (flush == FIP_FULL_FLUSH &&
+           test_and_clear_bit(VNIC_STATE_LOGIN_PRECREATE_1, &vnic->login_state)) {
+               vnic_free_netdev(login);
+       }
+}
+
+int vnic_vhube_add(struct fip_vnic_data *vnic, struct vnic_table_entry *vhube)
+{
+       struct vnic_neigh *neighe;
+       struct vnic_login *login = vnic->login;
+       int rc;
+
+       if (vnic->parent_used) {
+               vnic_dbg_mac(vnic->name, "function skipped\n");
+               return 0;
+       } else {
+               ASSERT(login);
+               vnic_dbg_func(login->name);
+       }
+
+       vnic_dbg_data(login->name, "adding vhube lid 0x%02x qpn 0x%x, mac "
+                     MAC_6_PRINT_FMT"\n", vhube->lid, vhube->qpn,
+                     MAC_6_PRINT_ARG(vhube->mac));
+
+       neighe = vnic_neighe_alloc(login, vhube->mac, vhube->lid,
+                                  vhube->qpn, vhube->rss);
+       if (IS_ERR(neighe))
+               return (int)PTR_ERR(neighe);
+
+       vnic_dbg_mark();
+       /* when adding new neighe, make sure that TX queues are not running. */
+       netif_tx_lock_bh(login->dev);
+       rc = vnic_neighe_add(login, neighe);
+       netif_tx_unlock_bh(login->dev);
+       if (rc) {
+               vnic_neighe_dealloc(neighe);
+               return rc;
+       }
+
+       return 0;
+}
+
+void vnic_vhube_flush(struct fip_vnic_data *vnic)
+{
+       struct vnic_login *login = vnic->login;
+
+       if (vnic->parent_used) {
+               vnic_dbg_mac(vnic->name, "function skipped\n");
+               return;
+       } else {
+               ASSERT(login);
+               vnic_dbg_func(login->name);
+       }
+
+       /* when deleting neigh entries, make sure that TX queues are not running. */
+       vnic_dbg_mark();
+       netif_tx_lock_bh(login->dev);
+       vnic_neigh_del_all(login);
+       netif_tx_unlock_bh(login->dev);
+
+       return;
+}
+
+void vnic_vhube_del(struct fip_vnic_data *vnic, u8* mac)
+{
+       struct vnic_neigh *neighe;
+       struct vnic_login *login = vnic->login;
+
+       if (vnic->parent_used) {
+               vnic_dbg_mac(vnic->name, "function skipped\n");
+               return;
+       } else {
+               ASSERT(login);
+               vnic_dbg_func(login->name);
+       }
+
+       vnic_dbg_mark();
+       /* when deleting a neighe, make sure that TX queues are not running. */
+       netif_tx_lock_bh(login->dev);
+       neighe = vnic_neighe_search(login, mac);
+       if (IS_ERR(neighe)) {
+               vnic_warn(login->name, "couldn't find "MAC_6_PRINT_FMT"\n",
+                         MAC_6_PRINT_ARG(mac));
+       } else {
+               vnic_neighe_del(login, neighe);
+               vnic_neighe_dealloc(neighe);
+       }
+       netif_tx_unlock_bh(login->dev);
+       return;
+}
+
+struct fip_login_data login_data;
+struct fip_vnic_data vnic;
+struct vnic_login *__vnic_login_create(struct vnic_port *port, int index)
+{
+       struct vnic_login *login;
+       int rc, no_bxm_n_rss = 0x4;
+       int qps_num = (port->rx_rings_num > 1) ? (1 << no_bxm_n_rss) : 1;
+
+       /* pre create vnic */
+       rc = vnic_login_pre_create_1(port, &vnic);
+       if (rc) {
+               vnic_err(port->name, "vnic_login_pre_create_1 failed"
+                        " for %s port %d index %d\n",
+                        port->dev->ca->name, port->num, index);
+               goto err;
+       }
+
+       login = vnic.login;
+
+       rc = vnic_login_pre_create_2(&vnic, qps_num, 0);
+       if (rc) {
+               vnic_err(port->name, "vnic_login_pre_create_2 failed"
+                        " for %s port %d index %d\n",
+                        port->dev->ca->name, port->num, index);
+               goto create_fail;
+       }
+
+       /* create vnic */
+       memset(&login_data, 0, sizeof(struct fip_login_data));
+       sprintf(login_data.vendor_id, "%s", NOT_AVAILABLE_STRING);
+       sprintf(login_data.vnic_name, "%s", NOT_AVAILABLE_STRING);
+       memcpy(login_data.mgid_prefix, NO_BXM_MGID_PREFIX, VNIC_MGID_PREFIX_LEN);
+       login_data.qpn = 0xa00000;
+       login_data.lid = 1;
+       login_data.pkey = 0xffff;
+       login_data.mtu = 1500;
+
+       /* random_ether_addr(mac); */
+       memcpy(login_data.mac, port->gid.raw + 10, ETH_ALEN);
+       login_data.mac[0] += index * 0x10;
+       /* mcast bit must be zero */
+       login_data.mac[0] &= 0xfe;
+       vnic_dbg_mark();
+       if (vnic_login_register_netdev(&vnic, login_data.mac, NULL)) {
+               vnic_err(login->name, "vnic_login_register_netdev failed\n");
+               goto create_fail;
+       }
+       if (vnic_login_complete_ack(&vnic, &login_data, NULL)) {
+               vnic_err(login->name, "vnic_login_complete_ack failed\n");
+               goto create_fail;
+       }
+
+       return login;
+
+create_fail:
+       vnic_login_destroy(login->fip_vnic, FIP_FULL_FLUSH);
+err:
+       return ERR_PTR(-ENODEV);
+}
+
+int vnic_port_data_init(struct vnic_port *port)
+{
+       int i, no_bxm_vnic_per_port = 1;
+
+       vnic_dbg_mark();
+       mutex_lock(&port->start_stop_lock);
+       for (i = 0; i < no_bxm_vnic_per_port; ++i) {
+               __vnic_login_create(port, i);
+       }
+       mutex_unlock(&port->start_stop_lock);
+
+       /* TODO - JPM: handle vnic_login_create failure */
+       return 0;
+}
+
+void vnic_port_data_cleanup(struct vnic_port *port)
+{
+       struct vnic_login *login, *login_t;
+
+       vnic_dbg_mark();
+       /* vnic_login_destroy() acquires the port->mlock, cannot hold it here */
+       list_for_each_entry_safe(login, login_t,
+                                &port->login_list, list) {
+               vnic_dbg_data(login->name, "login %s\n", login->name);
+               vnic_login_destroy(login->fip_vnic, FIP_FULL_FLUSH);
+       }
+}
+
+/* ALI TODO: check if need to replace login ptr with vnic */
+void debug_dump_members(struct vnic_login *login, struct vnic_gw_info *member)
+{
+       int i;
+
+       vnic_warn(login->name, "error, dumping LAG members: "
+                 "member_id=%d gw_id=%d active_count=%d\n",
+                 member->member_id, member->gw_id,
+                 login->lag_member_active_count);
+
+       /* go over map and count how many entries are mapped to each member*/
+       for (i=0; i<MAX_LAG_MEMBERS; i++) {
+               vnic_warn(login->name, "%d member %d used %x gw_id %d\n",
+                         i, login->lag_gw_neigh[i].member_id,
+                         login->lag_gw_neigh[i].info,
+                         login->lag_gw_neigh[i].gw_id);
+       }
+}
+
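+/*
+ * Count how many LAG hash-map entries currently point at each member;
+ * hist must be an array of MAX_LAG_MEMBERS counters.
+ */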
+static void vnic_build_map_histogram(struct vnic_login *login, int member_id, int *hist)
+{
+       int i;
+
+       memset(hist, 0, sizeof(int) * MAX_LAG_MEMBERS);
+
+       /* go over map and count how many entries are mapped to each member*/
+       for (i=0; i<LAG_MAP_TABLE_SIZE; i++) {
+               ASSERT(login->lag_gw_map[i] >= 0 && login->lag_gw_map[i] < MAX_LAG_MEMBERS);
+               hist[login->lag_gw_map[i]]++;
+       }
+}
+
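+/*
+ * Remove an eport member from the LAG TX hash map: every map entry that
+ * pointed at it is handed to another mapped member that is below the current
+ * usage threshold; the threshold is raised whenever a full pass finds no
+ * such member.
+ */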
+static void _vnic_remove_member_from_map(struct vnic_login *login, int member_id)
+{
+       int user_count[MAX_LAG_MEMBERS] = {0};
+       int i, j;
+       int continue_flag;
+       int thresh;
+
+       login->lag_member_active_count--;
+       if (login->lag_member_active_count > 0) {
+               /* go over map and count how many entries are mapped to each member*/
+               vnic_build_map_histogram(login, member_id, user_count);
+
+               thresh = 2; /* it might be possible to find a better lower boundary */
+
+               for (i=0; i<LAG_MAP_TABLE_SIZE; i++) {
+                       /* entries that use the removed member must be remapped */
+                       if (login->lag_gw_map[i] != member_id)
+                               continue;
+
+                       continue_flag = 1;
+                       while (continue_flag) {
+                               for (j = 0; j < MAX_LAG_MEMBERS; j++) {
+                                       if (j == member_id)
+                                               continue;
+
+                                       /* Only use members that are connected, and are short of members */
+                                       if (login->lag_gw_neigh[j].info & GW_MEMBER_INFO_MAPPED &&
+                                           user_count[j] < thresh) {
+                                               login->lag_gw_map[i] = j;
+                                               user_count[j]++;
+                                               continue_flag = 0;
+                                               break;
+                                       }
+                               }
+                               if (j == MAX_LAG_MEMBERS)
+                                       thresh++;
+                       }
+               }
+       }
+}
+
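+/*
+ * Add an eport member to the LAG TX hash map: the first active member takes
+ * the whole table; otherwise entries are taken from members holding more than
+ * their fair share until the new member owns roughly
+ * LAG_MAP_TABLE_SIZE / active_count entries.
+ */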
+static void _vnic_add_member_to_map(struct vnic_login *login, int member_id)
+{
+       int i;
+       int expected;
+       int user_count[MAX_LAG_MEMBERS] = {0};
+       int continue_flag;
+       int thresh;
+
+       /* this is the first active port use it for all maps */
+       if (!login->lag_member_active_count) {
+               for (i=0; i<LAG_MAP_TABLE_SIZE; i++)
+                       login->lag_gw_map[i] = member_id;
+               login->lag_member_active_count++;
+       } else {
+               /* go over map and count how many entries are mapped to each member;
+                * we will use the count to reassign entries from the most heavily
+                * used members */
+               vnic_build_map_histogram(login, member_id, user_count);
+
+               /* when adding new member, make sure that TX queues are not running. */
+               login->lag_member_active_count++;
+               expected = LAG_MAP_TABLE_SIZE / login->lag_member_active_count;
+               thresh = LAG_MAP_TABLE_SIZE % login->lag_member_active_count;
+               continue_flag = 1;
+               while (continue_flag) {
+                       for (i = 0; i < LAG_MAP_TABLE_SIZE; i++) {
+                               if (user_count[login->lag_gw_map[i]] > expected + thresh) {
+                                       user_count[login->lag_gw_map[i]]--;
+                                       login->lag_gw_map[i] = member_id;
+                                       user_count[login->lag_gw_map[i]]++;
+                                       if (user_count[member_id] >= expected) {
+                                               continue_flag = 0;
+                                               break;
+                                       }
+                               }
+                       }
+                       thresh--;
+               }
+       }
+}
+
+void __bcast_member_attach_cb(struct vnic_mcast *mcaste, void *gw_ptr)
+{
+       struct vnic_gw_info *member = gw_ptr;
+
+       /* When SA is local, mcast join works even when port is down */
+       if (member->neigh.login->port->attr.state != IB_PORT_ACTIVE)
+               return;
+
+       vnic_dbg_lag(member->neigh.login->name, "__bcast_member_attach_cb for member id %d and "
+                    "gw_id=%d\n", member->member_id, member->gw_id);
+
+       netif_tx_lock_bh(member->neigh.login->dev);
+       member->info |= GW_MEMBER_INFO_MCAST;
+
+       if (member->info & GW_MEMBER_INFO_EPORT_UP &&
+           !(member->info & GW_MEMBER_INFO_MAPPED)) {
+               _vnic_add_member_to_map(member->neigh.login, member->member_id);
+               member->info |= GW_MEMBER_INFO_MAPPED;
+       }
+       netif_tx_unlock_bh(member->neigh.login->dev);
+}
+
+void __bcast_member_detach_cb(struct vnic_mcast *mcaste, void *gw_ptr)
+{
+       struct vnic_gw_info *member = gw_ptr;
+
+       vnic_dbg_lag(member->neigh.login->name, "__bcast_member_detach_cb for member id %d and "
+                    "gw_id=%d\n", member->member_id, member->gw_id);
+
+       netif_tx_lock_bh(member->neigh.login->dev);
+       if (member->info & GW_MEMBER_INFO_MAPPED)
+               _vnic_remove_member_from_map(member->neigh.login, member->member_id);
+
+       member->info &= ~(GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_MCAST);
+       netif_tx_unlock_bh(member->neigh.login->dev);
+}
+
+/*
+ * Create MGIDs and join the default MCAST addresses. The mcaste entries are
+ * added to the list contained within the member struct. If more MGIDs are in
+ * use by the vnic when a member is added, we join those too using the
+ * member's GW_ID.
+ */
+static int _vnic_add_member_mgid(struct vnic_login *login, struct vnic_gw_info *member)
+{
+       struct vnic_mcast *mcaste, *mcaste_bcast;
+       int rc;
+#ifndef _BP_NO_MC_LIST
+       struct dev_mc_list *mclist;
+#else
+       struct netdev_hw_addr *ha;
+#endif
+
+       mcaste = vnic_mcast_alloc(login->port, NULL, NULL);
+       if (IS_ERR(mcaste))
+               return (-ENOMEM);
+
+       /* attach to default mgid */
+       __vnic_mcaste_fill(login, mcaste, member->gw_id, ETH_ZERO_MAC, 0, vnic_mcast_create);
+       mcaste->attach_cb = __bcast_member_attach_cb;
+       mcaste->detach_cb = __bcast_member_detach_cb;
+       mcaste->attach_cb_ctx = member;
+       mcaste->detach_cb_ctx = member;
+       mcaste->priv_data = member;
+       rc = vnic_mcast_add(&login->mcast_tree, mcaste);
+       if (rc) {
+               debug_dump_members(login, member);
+               ASSERT(!rc);
+       }
+
+       rc = vnic_mcast_attach(&login->mcast_tree, mcaste);
+       if (rc) {
+               debug_dump_members(login, member);
+               ASSERT(!rc);
+       }
+
+       if (login->n_mac_mcgid) {
+               mcaste_bcast = vnic_mcast_alloc(login->port, NULL, NULL);
+               if (IS_ERR(mcaste_bcast))
+                       goto  free_mcasts;
+
+               __vnic_mcaste_fill(login, mcaste_bcast, member->gw_id, ETH_BCAST_MAC, 0, 0);
+               /* The port gid is overrun by the default gid as part of the
+                * mgid-over-same-mlid hack */
+               memcpy(&mcaste_bcast->port_gid, &mcaste->port_gid, GID_LEN);
+               mcaste_bcast->priv_data = member;
+               rc = vnic_mcast_add(&login->mcast_tree, mcaste_bcast);
+               ASSERT(!rc);
+               rc = vnic_mcast_attach(&login->mcast_tree, mcaste_bcast);
+               ASSERT(!rc);
+       }
+
+
+       /* hold the tx lock so set_multicast_list() won't change mc_list */
+       netif_tx_lock_bh(login->dev);
+#ifndef _BP_NO_MC_LIST
+       for (mclist = login->dev->mc_list; mclist; mclist = mclist->next) {
+               u8* mmac = mclist->dmi_addr;
+#else
+       netdev_for_each_mc_addr(ha, login->dev) {
+               u8* mmac = ha->addr;
+#endif
+               /* do not add the default MGIDS because they are always used */
+               if (IS_ZERO_MAC(mmac))
+                       continue;
+               if (IS_BCAST_MAC(mmac))
+                       continue;
+
+               vnic_dbg_lag(login->name, "_vnic_add_member_mgid for "
+                         MAC_6_PRINT_FMT" and member gw_id=%d\n",
+                         MAC_6_PRINT_ARG(mcaste->mac), member->gw_id);
+
+               if (_vnic_mcast_attach_mgid(login, mmac, mcaste, member,
+                                           member->gw_id))
+                       goto attach_failed;
+       }
+       netif_tx_unlock_bh(login->dev);
+
+       return 0;
+
+attach_failed:
+       netif_tx_unlock_bh(login->dev);
+free_mcasts:
+       vnic_mcast_del_user(&login->mcast_tree, member);
+       return -ENOMEM;
+}
+
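+/*
+ * Create a LAG eport member entry: init its neigh, join the default MGIDs
+ * and mark it as created (and eport-up when the GW reports the link up).
+ * Returns -1 if member_id is out of range or the slot is already in use.
+ */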
+int vnic_member_add(struct vnic_login *login, int member_id, struct lag_member *member)
+{
+       struct vnic_gw_info *member_e;
+       int ret;
+
+       if (member_id >= MAX_LAG_MEMBERS || member_id < 0)
+               return -1;
+
+       member_e = &login->lag_gw_neigh[member_id];
+
+       /* member_e->gw_id is not set yet, report the requested gw_port_id */
+       vnic_dbg_lag(login->name, "vnic_member_add id:%d gw_id:%d lid:%x qpn:%x sl:%d\n",
+                    member_id, member->gw_port_id, member->lid, member->qpn, member->sl);
+
+       /* member id is already in use */
+       if (member_e->info & GW_MEMBER_INFO_CREATED)
+               return -1;
+
+       /* create new entry */
+       member_e->member_id = member_id;
+       member_e->neigh.lid = member->lid;
+       member_e->neigh.qpn = member->qpn;
+       member_e->gw_id = member->gw_port_id;
+       member_e->neigh.login = login;
+       INIT_DELAYED_WORK(&member_e->neigh.destroy_task, vnic_neighe_dealloc_task);
+       skb_queue_head_init(&member_e->neigh.pkt_queue);
+       init_completion(&member_e->neigh.query_comp);
+       complete(&member_e->neigh.query_comp); /* mark as complete since no query is running */
+       member_e->neigh.valid = 0;
+       member_e->neigh.pquery = ERR_PTR(-ENODATA);
+       member_e->neigh.query_id = -1;
+       member_e->neigh.ah = ERR_PTR(-ENODATA); /* ah query will be done via datapath */
+       if (!vnic_sa_query) {
+               member_e->neigh.ah = vnic_ah_alloc(login, member->lid);
+               if (IS_ERR(member_e->neigh.ah))
+                       return -ENOMEM;
+       }
+       /* need to add multicast code */
+       ret = _vnic_add_member_mgid(login, member_e);
+       if (ret)
+               goto free_ah;
+
+       netif_tx_lock_bh(login->dev);
+       member_e->info = GW_MEMBER_INFO_CREATED;
+       if (member->eport_state)
+               member_e->info |= GW_MEMBER_INFO_EPORT_UP;
+       login->lag_member_count++;
+       netif_tx_unlock_bh(login->dev);
+
+       return 0;
+free_ah:
+       if (!IS_ERR(member_e->neigh.ah))
+               ib_destroy_ah(member_e->neigh.ah);
+       return ret;
+}
+
+void vnic_member_remove_all(struct vnic_login *login)
+{
+       int i;
+
+       if (!login->is_lag)
+               return;
+
+       for (i=0; i<MAX_LAG_MEMBERS; i++)
+               vnic_member_remove(login, i);
+}
+
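+/*
+ * Tear down a LAG eport member: cancel any pending SA query, pull the member
+ * out of the TX hash map, wait for in-flight path queries to complete, leave
+ * its MGIDs and destroy its address handle.
+ */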
+int vnic_member_remove(struct vnic_login *login, int member_id)
+{
+       struct vnic_gw_info *member_e;
+
+       vnic_dbg_lag(login->name, "vnic_member_remove for id %d\n", member_id);
+
+       if (member_id >= MAX_LAG_MEMBERS || member_id < 0)
+               return -1;
+
+       member_e = &login->lag_gw_neigh[member_id];
+
+       vnic_dbg_lag(login->name,"vnic_member_remove id:%d gw_id:%d lid:%x qpn:%x sl:%d\n",
+                         member_id, member_e->gw_id, member_e->neigh.lid, member_e->neigh.qpn, member_e->neigh.sl);
+
+       /* member id is not in use */
+       if (!(member_e->info & GW_MEMBER_INFO_CREATED))
+               return -1;
+
+       if (member_e->neigh.query_id >=0 && member_e->neigh.pquery && !IS_ERR(member_e->neigh.pquery))
+               ib_sa_cancel_query(member_e->neigh.query_id, member_e->neigh.pquery);
+
+       netif_tx_lock_bh(login->dev);
+       if (member_e->info & GW_MEMBER_INFO_MAPPED)
+               _vnic_remove_member_from_map(login, member_e->member_id);
+       member_e->info &= ~(GW_MEMBER_INFO_MAPPED);
+       member_e->neigh.valid = 0;
+       netif_tx_unlock_bh(login->dev);
+
+       /* wait for completion after the entry was removed from login data path */
+       wait_for_completion(&member_e->neigh.query_comp);
+
+       /* modification of map will be done through mcast CB if needed */
+       vnic_mcast_del_user(&login->mcast_tree, member_e);
+
+       if (member_e->neigh.ah && !IS_ERR(member_e->neigh.ah))
+               ib_destroy_ah(member_e->neigh.ah);
+       member_e->neigh.ah = ERR_PTR(-ENODATA);
+       member_e->info = 0;
+       login->lag_member_count--;
+
+       return 0;
+}
+
+void vnic_member_prop(struct vnic_login *login, struct lag_properties *prop)
+{
+       if (login->lag_prop.hash_mask != prop->hash_mask) {
+               netif_tx_lock_bh(login->dev);
+               memcpy(&login->lag_prop, prop,
+                      sizeof(login->lag_prop));
+               netif_tx_unlock_bh(login->dev);
+       }
+}
+
+/*
+ * Modify a specific LAG eport member's parameters. The parameters might not
+ * be "interesting" and might not affect data traffic. They might require
+ * creating a new ah, or might even result in a modification of the transmit
+ * hash mapping function.
+ */
+int vnic_member_modify(struct vnic_login *login, int member_id, struct lag_member *member)
+{
+       struct vnic_gw_info *member_e;
+
+       if (member_id >= MAX_LAG_MEMBERS || member_id < 0)
+               return -1;
+
+       member_e = &login->lag_gw_neigh[member_id];
+
+       vnic_dbg_lag(login->name,"vnic_member_modify id:%d gw_id:%d lid:%x qpn:%x sl:%d\n",
+                  member_id, member_e->gw_id, member_e->neigh.lid, member_e->neigh.qpn, member_e->neigh.sl);
+
+       /* member id is not in use */
+       if (!(member_e->info & GW_MEMBER_INFO_CREATED))
+               return -1;
+
+       /* change in LID requires new ah */
+       /* TODO Test this */
+       if (member_e->neigh.lid != member->lid) {
+               /* take tx lock to make sure ah is not being used */
+               if (vnic_sa_query) {
+                       /* Cancel SA query in case */
+                       if (member_e->neigh.query_id >=0 && member_e->neigh.pquery && !IS_ERR(member_e->neigh.pquery))
+                               ib_sa_cancel_query(member_e->neigh.query_id, member_e->neigh.pquery);
+                       netif_tx_lock_bh(login->dev);
+                       member_e->neigh.lid = member->lid;
+                       member_e->neigh.valid = 0;
+                       if (member_e->neigh.ah && !IS_ERR(member_e->neigh.ah)) {
+                               /* lid is not the same: destroy AH */
+                               ib_destroy_ah(member_e->neigh.ah);
+                               member_e->neigh.ah = ERR_PTR(-ENODATA);
+                       }
+                       netif_tx_unlock_bh(login->dev);
+               } else {
+                       struct ib_ah *ah, *ah1;
+                       ah = member_e->neigh.ah;
+                       ah1 = vnic_ah_alloc(login, member->lid);
+                       if (IS_ERR(ah1))
+                                 return -ENOMEM;
+                       netif_tx_lock_bh(login->dev);
+                       member_e->neigh.lid = member->lid;
+                       member_e->neigh.ah = ah1;
+                       netif_tx_unlock_bh(login->dev);
+                       ib_destroy_ah(ah);
+               }
+       }
+
+       if (member_e->neigh.qpn != member->qpn)
+               member_e->neigh.qpn = member->qpn;
+
+       netif_tx_lock_bh(login->dev);
+       /* link changed from up to down */
+       if (member_e->info & GW_MEMBER_INFO_MAPPED && !member->eport_state) {
+               _vnic_remove_member_from_map(login, member_id);
+               member_e->info &= ~(GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_EPORT_UP);
+       } 
+
+       /* link changed from down to up and mcast are connected */
+       if (!(member_e->info & GW_MEMBER_INFO_MAPPED) &&
+           member->eport_state) {
+               if (member_e->info & GW_MEMBER_INFO_MCAST) {
+                       _vnic_add_member_to_map(login, member_id);
+                       member_e->info |= (GW_MEMBER_INFO_MAPPED | GW_MEMBER_INFO_EPORT_UP);
+               } else
+                       member_e->info |= GW_MEMBER_INFO_EPORT_UP;
+       }
+       netif_tx_unlock_bh(login->dev);
+
+       return 0;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_neigh.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_neigh.c
new file mode 100644 (file)
index 0000000..a331aeb
--- /dev/null
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+void vnic_neighe_dealloc_task(struct work_struct *work)
+{
+       struct vnic_neigh *neighe =
+               container_of(work, struct vnic_neigh, destroy_task.work);
+       if (IS_NEIGH_QUERY_RUNNING(neighe))
+               ib_sa_cancel_query(neighe->query_id, neighe->pquery);
+       wait_for_completion(&neighe->query_comp);
+       if (neighe->ah && !IS_ERR(neighe->ah))
+               ib_destroy_ah(neighe->ah);
+       kfree(neighe);
+}
+
+void vnic_neighe_dealloc(struct vnic_neigh *neighe)
+{
+       ASSERT(neighe);
+       /* calls vnic_neighe_dealloc_task */
+       queue_delayed_work(neighe->login->neigh_wq, &neighe->destroy_task, 0);
+}
+
+struct ib_ah *vnic_ah_alloc(struct vnic_login *login, u16 dlid)
+{
+       struct ib_ah_attr av;
+       struct ib_ah *ah;
+
+       memset(&av, 0, sizeof(av));
+       av.dlid = dlid;
+       av.port_num = login->port->num;
+       av.sl = login->sl; /* a PATH query is needed here to obtain the data SL */
+       ah = ib_create_ah(login->port->pd, &av);
+       if (IS_ERR(ah))
+               return ERR_PTR(-ENOMEM);
+
+       return ah;
+}
+
+struct vnic_neigh *vnic_neighe_alloc(struct vnic_login *login,
+                                    const u8 *mac,
+                                    u16 dlid, u32 dqpn, u8 rss)
+{
+       struct vnic_neigh *neighe;
+       neighe = kzalloc(sizeof *neighe, GFP_ATOMIC);
+       if (!neighe)
+               return ERR_PTR(-ENOMEM);
+       INIT_DELAYED_WORK(&neighe->destroy_task, vnic_neighe_dealloc_task);
+       skb_queue_head_init(&neighe->pkt_queue);
+       if (mac)
+               memcpy(neighe->mac, mac, ETH_ALEN);
+       neighe->rss = rss;
+       neighe->ah = ERR_PTR(-ENODATA);
+       if (!vnic_sa_query) {
+               neighe->ah = vnic_ah_alloc(login, dlid);
+               if (IS_ERR(neighe->ah)) {
+                       kfree(neighe);
+                       return ERR_PTR(-ENOMEM);
+               }
+       }
+       init_completion(&neighe->query_comp);
+       complete(&neighe->query_comp); /* mark as complete since no query is running */
+       neighe->pquery = ERR_PTR(-ENODATA);
+       neighe->query_id = -1;
+       neighe->qpn = dqpn;
+       neighe->lid = dlid;
+       neighe->login = login;
+
+       return neighe;
+}
+
+void vnic_neighe_del(struct vnic_login *login, struct vnic_neigh *neighe)
+{
+       ASSERT(neighe);
+       rb_erase(&neighe->rb_node, &login->neigh_tree);
+}
+
+int vnic_neighe_add(struct vnic_login *login, struct vnic_neigh *neighe)
+{
+       struct rb_node **n = &login->neigh_tree.rb_node, *pn = NULL;
+       struct vnic_neigh *neighe_t;
+       int rc;
+
+       while (*n) {
+               pn = *n;
+               neighe_t = rb_entry(pn, struct vnic_neigh, rb_node);
+               rc = memcmp(neighe->mac, neighe_t->mac, ETH_ALEN);
+               if (rc < 0)
+                       n = &pn->rb_left;
+               else if (rc > 0)
+                       n = &pn->rb_right;
+               else {
+                       rc = -EEXIST;
+                       goto out;
+               }
+       }
+
+       rb_link_node(&neighe->rb_node, pn, n);
+       rb_insert_color(&neighe->rb_node, &login->neigh_tree);
+       rc = 0;
+
+out:
+       return rc;
+}
+
+struct vnic_neigh *vnic_neighe_search(struct vnic_login *login, u8 *mac)
+{
+       struct rb_node *n = login->neigh_tree.rb_node;
+       struct vnic_neigh *neighe_t;
+       int rc;
+
+       while (n) {
+               neighe_t = rb_entry(n, struct vnic_neigh, rb_node);
+               rc = memcmp(mac, neighe_t->mac, ETH_ALEN);
+               if (rc < 0)
+                       n = n->rb_left;
+               else if (rc > 0)
+                       n = n->rb_right;
+               else {
+                       vnic_dbg_data(login->name,
+                                     "found: mac "MAC_6_PRINT_FMT" vid %d "
+                                     "qpn 0x%06x lid 0x%02x\n",
+                                     MAC_6_PRINT_ARG(neighe_t->mac),
+                                     be16_to_cpu(login->vid), neighe_t->qpn,
+                                     neighe_t->lid);
+                       goto out;
+               }
+       }
+       neighe_t = ERR_PTR(-ENODATA);
+
+out:
+       return neighe_t;
+}
+
+void vnic_neigh_del_all(struct vnic_login *login)
+{
+       struct rb_node *n;
+       struct vnic_neigh *neighe;
+
+       ASSERT(login);
+       n = rb_first(&login->neigh_tree);
+       while (n) {
+               neighe = rb_entry(n, struct vnic_neigh, rb_node);
+               vnic_neighe_del(login, neighe);
+               n = rb_first(&login->neigh_tree);
+               vnic_neighe_dealloc(neighe);
+       }
+}
+
+void vnic_neigh_invalidate(struct vnic_login *login)
+{
+       struct vnic_neigh *neighe;
+       struct rb_node *n;
+       int i;
+
+       if (login->gw_neigh && !IS_ERR(login->gw_neigh))
+               login->gw_neigh->valid = 0;
+
+       n = rb_first(&login->neigh_tree);
+       while (n) {
+               neighe = rb_entry(n, struct vnic_neigh, rb_node);
+               neighe->valid = 0;
+               n = rb_next(n);
+       }
+
+       if (login->is_lag)
+               for (i=0; i<MAX_LAG_MEMBERS; i++)
+                       login->lag_gw_neigh[i].neigh.valid = 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_netdev.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_netdev.c
new file mode 100644 (file)
index 0000000..abfd2e2
--- /dev/null
@@ -0,0 +1,1085 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+extern struct net_device_stats *mlx4_vnic_stats_func_container(struct net_device *n);
+
+static int mlx4_vnic_vlan_rx_add_vid(struct net_device *dev, __be16 proto,
+                                    unsigned short vid)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+
+       vnic_dbg_data(login->name, "add VLAN:%d was called\n", vid);
+       return 0;
+}
+
+static int mlx4_vnic_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
+                                     unsigned short vid)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+
+       vnic_dbg_data(login->name, "Kill VID:%d was called\n", vid);
+       return 0;
+}
+
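+/*
+ * Sync the netdev carrier with the vnic state: carrier goes up once the
+ * bcast group is attached (and the eport is up when eport enforcement is
+ * enabled), and goes down when either condition is lost.
+ */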
+void vnic_carrier_update(struct vnic_login *login)
+{
+       int attached, eport_up, eport_enforce, carrier_ok;
+
+       ASSERT(login);
+       attached = test_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state);
+       eport_up = fip_vnic_get_eport_state(login->fip_vnic);
+       eport_enforce = vnic_eport_state_enforce;
+       carrier_ok = netif_carrier_ok(login->dev);
+
+       /* bring carrier up */
+       if (!carrier_ok && attached && (!eport_enforce || eport_up)) {
+               set_bit(VNIC_STATE_NETDEV_CARRIER_ON, &login->netdev_state);
+               netif_carrier_on(login->dev);
+               vnic_info("%s link is up\n", login->dev->name);
+               return;
+       }
+
+       /* bring carrier down */
+       if (carrier_ok && (!attached || (!eport_up && eport_enforce))) {
+               clear_bit(VNIC_STATE_NETDEV_CARRIER_ON, &login->netdev_state);
+               netif_carrier_off(login->dev);
+               vnic_info("%s link is down\n", login->dev->name);
+               return;
+       }
+
+}
+
+void __bcast_attach_cb(struct vnic_mcast *mcaste, void *login_ptr)
+{
+       struct vnic_login *login = login_ptr;
+
+       /* When SA is local, mcast join works even when port is down */
+       if (login->port->attr.state != IB_PORT_ACTIVE)
+               return;
+       set_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state);
+       vnic_carrier_update(login);
+}
+
+void __bcast_detach_cb(struct vnic_mcast *mcaste, void *login_ptr)
+{
+       struct vnic_login *login = login_ptr;
+
+       clear_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state);
+       vnic_carrier_update(login);
+}
+
+/* this function cannot sleep, avoid any mutex() in subsequent calls */
+static int vnic_set_mac(struct net_device *dev, void *_mac)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+       struct sockaddr *saddr = _mac;
+       u8 *mac = (u8 *)(saddr->sa_data);
+       int rc = 0;
+
+       vnic_dbg_func(login->name);
+
+       vnic_dbg_mac(login->name, "mac "MAC_6_PRINT_FMT" => "MAC_6_PRINT_FMT"\n",
+                    MAC_6_PRINT_ARG((u8 *)(dev->dev_addr)),
+                    MAC_6_PRINT_ARG(mac));
+
+       /* must support child vNics for mac modification */
+       if (!vnic_child_max)
+               return -ENOSYS;
+
+       /* skip if invalid address */
+       if (unlikely(!is_valid_ether_addr(mac)))
+               return -EINVAL;
+
+       /* skip if same mac was already set */
+       if (!(memcmp((u8 *)(dev->dev_addr), mac, ETH_ALEN)))
+               return 0;
+
+       /* already in bh, calls vnic_child_update that queues a job,
+        * so read_lock is enough
+        */
+       read_lock(&login->mac_rwlock);
+
+       /* if mac same as original, delete child, set mac and return */
+       if (!(memcmp(mac, login->dev_addr, ETH_ALEN)))
+               goto out;
+
+       /* else, this is a new child vNic, add it.
+        * NOTE: the GC should not destroy a child vNic that is being used
+        * for a mac change, even if it was created by a different source.
+        */
+       rc = vnic_child_update(login, mac, 0);
+       if (rc && rc != -EEXIST)
+               goto err;
+
+out:
+       memcpy(dev->dev_addr, mac, ETH_ALEN);
+       vnic_child_update(login, (u8 *)(dev->dev_addr), 1);
+       vnic_dbg_mac(login->name, "mac changed successfully to "
+                    MAC_6_PRINT_FMT"\n", MAC_6_PRINT_ARG(mac));
+
+err:
+       read_unlock(&login->mac_rwlock);
+       return rc;
+}
+
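+/*
+ * Handle netdev RX-mode changes: enter/leave unicast promiscuous mode
+ * (backed by child vNics) and schedule mcast_task to re-sync the multicast
+ * attachments.
+ */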
+static void vnic_set_multicast_list(struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+
+       vnic_dbg_func(login->name);
+
+       /* test promisc flag changes */
+       if (is_ucast_promisc(login) && !login->promisc) {
+               /* promisc is being set */
+               if (!vnic_child_max) {
+                       /* must support child vNics for promisc mode */
+                       vnic_info("%s promisc mode cannot be set "
+                                 "(vnic_child_max %u)\n",
+                                 dev->name, vnic_child_max);
+                } else if (vnic_src_mac_enforce) {
+                       /* cannot support promisc if source mac is enforced
+                        * because sender should be able to use any smac
+                        */
+                       vnic_info("%s promisc mode cannot be set "
+                                 "(vnic_src_mac_enforce %u)\n",
+                                 dev->name, vnic_src_mac_enforce);
+                } else {
+                        login->promisc = 1;
+                        vnic_dbg_mac(dev->name,
+                                     "entered promiscuous mode: confirmed\n");
+                }
+       } else if (!is_ucast_promisc(login) && login->promisc) {
+               /* promisc is being cleared */
+               login->promisc = 0;
+               write_lock(&login->mac_rwlock);
+               vnic_child_flush(login, 0);
+               write_unlock(&login->mac_rwlock);
+               vnic_dbg_mac(dev->name,
+                            "left promiscuous mode: confirmed\n");
+       }
+
+       /* test mcast changes */
+       if (!no_bxm && !login->queue_stopped) {
+               dev_hold(dev);
+               if (!queue_delayed_work(login_wq, &login->mcast_task, HZ / 100))
+                       dev_put(dev);
+       }
+}
+
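+/*
+ * Adaptive RX coalescing: once per sample interval derive the packet rate
+ * from the stats counters and scale rx-usecs between rx_usecs_low and
+ * rx_usecs_high; heavily unbalanced tx/rx rates are treated as BW-bound
+ * traffic and get maximum moderation.
+ */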
+static void vnic_auto_moder(struct vnic_login *login)
+{
+       unsigned long period =
+               (unsigned long)(jiffies - login->last_moder_jiffies);
+       unsigned long packets;
+       unsigned long rate;
+       unsigned long avg_pkt_size;
+       unsigned long rx_packets;
+       unsigned long rx_bytes;
+       unsigned long tx_packets;
+       unsigned long tx_pkt_diff;
+       unsigned long rx_pkt_diff;
+       int moder_time;
+
+       period = (unsigned long)(jiffies - login->last_moder_jiffies);
+#if 0
+       vnic_dbg_moder_v(login->name, "adaptive_rx_coal %d, period %d, "
+                        "sample_interval %d, state %d\n",
+                        login->adaptive_rx_coal, period,
+                        login->sample_interval, login->port->attr.state);
+#endif
+
+       if (!login->adaptive_rx_coal || period < login->sample_interval * HZ)
+               return;
+
+       /* TODO: when NAPI is disabled, the RX completion will be called from
+        * IRQ context (and not BH context) and thus spin_lock_bh should be
+        * replaced with spin_lock_irq
+        */
+       spin_lock_bh(&login->stats_lock);
+       rx_packets = login->stats.rx_packets;
+       rx_bytes = login->stats.rx_bytes;
+       tx_packets = login->stats.tx_packets;
+       spin_unlock_bh(&login->stats_lock);
+
+       if (!login->last_moder_jiffies || !period)
+               goto out_set;
+
+       tx_pkt_diff = ((unsigned long)(tx_packets -
+                                      login->last_moder_tx_packets));
+       rx_pkt_diff = ((unsigned long)(rx_packets - login->last_moder_packets));
+       packets = max(tx_pkt_diff, rx_pkt_diff);
+       rate = packets * HZ / period;
+       avg_pkt_size = packets ? ((unsigned long)(rx_bytes -
+                                                 login->last_moder_bytes)) /
+           packets : 0;
+
+       if (rate > VNIC_RX_RATE_THRESH && avg_pkt_size > VNIC_AVG_PKT_SMALL) {
+               /* If tx and rx packet rates are not balanced, assume that
+                * traffic is mainly BW bound and apply maximum moderation.
+                * Otherwise, moderate according to packet rate */
+               if (2 * tx_pkt_diff > 3 * rx_pkt_diff ||
+                   2 * rx_pkt_diff > 3 * tx_pkt_diff) {
+                       moder_time = login->rx_usecs_high;
+               } else {
+                       if (rate < login->pkt_rate_low)
+                               moder_time = login->rx_usecs_low;
+                       else if (rate > login->pkt_rate_high)
+                               moder_time = login->rx_usecs_high;
+                       else
+                               moder_time = (rate - login->pkt_rate_low) *
+                                       (login->rx_usecs_high - login->rx_usecs_low) /
+                                       (login->pkt_rate_high - login->pkt_rate_low) +
+                                       login->rx_usecs_low;
+               }
+       } else {
+               moder_time = login->rx_usecs_low;
+       }
+
+       if (moder_time != login->last_moder_time) {
+               vnic_dbg_moder(login->name, "tx rate:%lu rx_rate:%lu\n",
+                              tx_pkt_diff * HZ / period,
+                              rx_pkt_diff * HZ / period);
+               vnic_dbg_moder(login->name,
+                              "Rx moder_time changed from:%lu to %d period:%lu"
+                              " [jiff] packets:%lu avg_pkt_size:%lu rate:%lu"
+                              " [p/s]\n", login->last_moder_time, moder_time,
+                             period, packets, avg_pkt_size, rate);
+               login->last_moder_time = moder_time;
+               vnic_ib_set_moder(login,
+                                 login->last_moder_time, login->rx_frames,
+                                 login->tx_usecs, login->tx_frames);
+       }
+
+out_set:
+       login->last_moder_packets = rx_packets;
+       login->last_moder_tx_packets = tx_packets;
+       login->last_moder_bytes = rx_bytes;
+       login->last_moder_jiffies = jiffies;
+}
+
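+/*
+ * Fold the per-TX-ring counters into login->stats while keeping the RX
+ * counters, which are updated directly in login->stats.
+ */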
+void vnic_dump_stats(struct vnic_login *login)
+{
+       unsigned long *stats, *login_stats = (unsigned long *)(&login->stats);
+       int i, j, len = sizeof(struct net_device_stats) / sizeof(unsigned long);
+       struct net_device_stats stats_tmp;
+
+       spin_lock_bh(&login->stats_lock);
+       /* tx stats are distributed between tx_res entries */
+       stats_tmp = login->stats;
+       memset(&login->stats, 0, sizeof(struct net_device_stats));
+       for (i = 0; i < login->tx_rings_num; ++i) {
+               stats = (unsigned long *)(&login->tx_res[i].stats);
+               for (j = 0; j < len; ++j)
+                       login_stats[j] += stats[j];
+       }
+
+       /* rx stats are in login->stats */
+       login->stats.rx_bytes = stats_tmp.rx_bytes;
+       login->stats.rx_packets = stats_tmp.rx_packets;
+       login->stats.rx_errors = stats_tmp.rx_errors;
+       login->stats.rx_dropped = stats_tmp.rx_dropped;
+        spin_unlock_bh(&login->stats_lock);
+}
+
+static void vnic_do_get_stats(struct work_struct *work)
+{
+       struct vnic_login *login =
+               container_of(work, struct vnic_login, stats_task.work);
+
+       mutex_lock(&login->moder_lock);
+       vnic_dump_stats(login);
+
+       if (login->queue_stopped)
+               goto out;
+
+       if (!(test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state)))
+               goto resched;
+
+       if (login->port->attr.state == IB_PORT_ACTIVE)
+               vnic_auto_moder(login);
+
+resched:
+       /* calls vnic_do_get_stats() */
+       if (!login->queue_stopped)
+               queue_delayed_work(login_wq, &login->stats_task, VNIC_STATS_DELAY);
+out:
+       mutex_unlock(&login->moder_lock);
+}
+
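+/*
+ * Re-sync multicast attachments after an mc_list change: detach every
+ * non-default MGID, then re-attach one MGID per mc_list entry towards the
+ * legacy GW and, on LAG vNics, towards every created member; reschedules
+ * itself while an earlier detach event is still being processed.
+ */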
+static void vnic_mcast_reattach(struct work_struct *work)
+{
+       struct vnic_mcast *mcaste, *mcaste_t;
+       struct rb_node *n;
+       unsigned long flags;
+       union vhub_mgid mgid;
+       LIST_HEAD(local_list);
+       int i;
+       struct vnic_gw_info *lag_member;
+       struct vnic_login *login;
+       struct net_device *dev;
+#ifndef _BP_NO_MC_LIST
+       struct dev_mc_list *mclist;
+#else
+       struct netdev_hw_addr *ha;
+#endif
+
+       login = container_of(work, struct vnic_login, mcast_task.work);
+       dev = login->dev;
+
+       vnic_dbg_mcast(login->name, "set_multicast_list was notified\n");
+       if (login->queue_stopped) {
+               dev_put(dev);
+               return;
+       }
+
+       /* detach all mcast (except default and bcast mcasts) */
+       spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags);
+       if (!list_empty(&login->mcast_tree.reattach_list)) {
+               /* an event is being processed */
+               spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+               goto retry;
+       }
+               
+       for (n = rb_first(&login->mcast_tree.mcast_tree); n; n = rb_next(n)) {
+               mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+               if (IS_ZERO_MAC(mcaste->mac))
+                       continue;
+               if (IS_BCAST_MAC(mcaste->mac))
+                       continue;               
+               list_add_tail(&mcaste->list, &local_list);
+       }
+
+       list_for_each_entry(mcaste, &local_list, list) {
+               vnic_mcast_del(&login->mcast_tree, mcaste);
+               mcaste->attach_task_cnt = 0;
+       }
+
+       spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+
+       vnic_dbg_mcast(login->name, "local_list is %s empty n_mac_mcgid %u\n",
+                      (list_empty(&local_list) ? "" : "not"),
+                      login->n_mac_mcgid);
+
+       list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) {
+               list_del(&mcaste->list);
+               vnic_mcast_detach(&login->mcast_tree, mcaste);
+               vnic_mcast_dealloc(mcaste);
+       }
+
+       /* attach all mcasts in mc_list */
+       vhub_mgid_create(login->mgid_prefix, ETH_ZERO_MAC, login->n_mac_mcgid,
+                        CREATE_VHUB_ID(login->vid, login->gw_port_id),
+                        VHUB_MGID_DATA, 0, &mgid);
+
+       spin_lock_irqsave(&login->mcast_tree.mcast_rb_lock, flags);
+       mcaste_t = vnic_mcast_search(&login->mcast_tree, &mgid.ib_gid);
+       if (IS_ERR(mcaste_t) || !test_bit(VNIC_STATE_LOGIN_BCAST_ATTACH, &login->fip_vnic->login_state)) {
+               vnic_dbg_data(login->name, "default mgid not ready\n");
+               spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+               dev_put(dev);
+               return;
+       }
+       spin_unlock_irqrestore(&login->mcast_tree.mcast_rb_lock, flags);
+
+       /* hold the tx lock so set_multicast_list() won't change mc_list */
+       netif_tx_lock_bh(dev);
+#ifndef _BP_NO_MC_LIST
+       for (mclist = login->dev->mc_list; mclist; mclist = mclist->next) {
+               u8* mmac = mclist->dmi_addr;
+#else
+       netdev_for_each_mc_addr(ha, login->dev) {
+               u8* mmac = ha->addr;
+#endif
+               /* do not add the default MGIDS because they are always used */
+               if (IS_ZERO_MAC(mmac))
+                       continue;
+               if (IS_BCAST_MAC(mmac))
+                       continue;
+
+               /* attach to the legacy GW / LAG gw id MGID */
+               if (_vnic_mcast_attach_mgid(login, mmac, mcaste_t, login,
+                                           login->gw_port_id))
+                       goto attach_failed;
+
+               if (!login->is_lag)
+                       continue;
+
+               for (i=0; i<MAX_LAG_MEMBERS; i++) {
+                       lag_member = &login->lag_gw_neigh[i];
+                       /* member id is already in use */
+                       if (lag_member->info & GW_MEMBER_INFO_CREATED)
+                               /* attach to the legacy GW / LAG gw id MGID */
+                               if (_vnic_mcast_attach_mgid(login, mmac,
+                                                           mcaste_t,
+                                                           lag_member,
+                                                           lag_member->gw_id))
+                                       goto attach_failed;
+               }
+       }
+       netif_tx_unlock_bh(dev);
+       dev_put(dev);
+       return;
+
+attach_failed:
+       netif_tx_unlock_bh(dev);
+       vnic_mcast_del_all(&login->mcast_tree);
+
+retry:
+       if (!login->queue_stopped) {
+               if (!queue_delayed_work(login_wq, &login->mcast_task, HZ / 100))
+                       dev_put(dev);
+       } else
+               dev_put(dev);
+}
+
+static int vnic_change_mtu(struct net_device *dev, int new_mtu)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+
+       if (new_mtu > login->max_mtu) {
+               vnic_warn(login->name, "failed: new_mtu %d > %d\n", new_mtu,
+                         login->max_mtu);
+               return -EINVAL;
+       }
+
+       vnic_dbg_data(login->name, "mtu %d -> %d\n", dev->mtu, new_mtu);
+       dev->mtu = new_mtu;
+
+       return 0;
+}
+
+static void vnic_set_default_moder(struct vnic_login *login)
+{
+
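+       /* Scale the frame budget inversely with the MTU so that roughly
+        * VNIC_RX_COAL_TARGET bytes are aggregated per RX interrupt
+        * (assuming the target is a byte count). Illustration only: a 64KB
+        * target with a 1500-byte MTU yields ~44 frames per interrupt.
+        */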
+       login->rx_frames = VNIC_RX_COAL_TARGET / login->dev->mtu + 1;
+       login->rx_usecs = VNIC_RX_COAL_TIME;
+       login->tx_frames = VNIC_TX_COAL_PKTS;
+       login->tx_usecs = VNIC_TX_COAL_TIME;
+       login->pkt_rate_low = VNIC_RX_RATE_LOW;
+       login->rx_usecs_low = VNIC_RX_COAL_TIME_LOW;
+       login->pkt_rate_high = VNIC_RX_RATE_HIGH;
+       login->rx_usecs_high = VNIC_RX_COAL_TIME_HIGH;
+       login->sample_interval = VNIC_SAMPLE_INTERVAL;
+       login->adaptive_rx_coal = 1;
+       login->last_moder_time = VNIC_AUTO_CONF;
+       login->last_moder_jiffies = 0;
+       login->last_moder_packets = 0;
+       login->last_moder_tx_packets = 0;
+       login->last_moder_bytes = 0;
+
+       vnic_dbg_data(login->name, "default coalescing params for mtu:%d to "
+                     "rx_frames:%d rx_usecs:%d "
+                     "tx_frames:%d tx_usecs:%d\n",
+                     login->dev->mtu,
+                     login->rx_frames, login->rx_usecs,
+                     login->tx_frames, login->tx_usecs);
+}
+
+#ifndef _BP_NAPI_POLL
+int vnic_napi_alloc(struct vnic_login *login, int rx_res_index)
+{
+
+       struct napi_struct *napi = &login->rx_res[rx_res_index].napi;
+
+       netif_napi_add(login->dev, napi, vnic_poll_cq_rx, vnic_napi_weight);
+
+       return 0;
+}
+
+void vnic_napi_enable(struct vnic_login *login, int rx_res_index)
+{
+
+       struct napi_struct *napi = &login->rx_res[rx_res_index].napi;
+       napi_enable(napi);
+}
+
+static void vnic_napi_disable(struct vnic_login *login, int rx_res_index)
+{
+       struct napi_struct *napi = &login->rx_res[rx_res_index].napi;
+
+       if (!napi->poll)
+               return;
+
+       napi_disable(napi);
+}
+
+static void vnic_napi_dealloc(struct vnic_login *login, int rx_res_index)
+{
+#ifndef _BP_NAPI_NO_DEL
+       struct napi_struct *napi = &login->rx_res[rx_res_index].napi;
+
+       netif_napi_del(napi);
+#else
+       return;
+#endif
+}
+
+#else
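+/* Backport path for kernels without struct napi_struct: emulate per-ring
+ * NAPI with a dummy net_device whose ->poll hook drives the RX CQ.
+ */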
+int vnic_napi_alloc(struct vnic_login *login, int rx_res_index)
+{
+       struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index];
+       char name[IFNAMSIZ];
+
+       snprintf(name, IFNAMSIZ, "%s-N%d", login->name, rx_res_index);
+       rx_res->poll_dev =
+               alloc_netdev(0, name, ether_setup);
+       if (!rx_res->poll_dev)
+               return -ENOMEM;
+
+       rx_res->poll_dev->priv = rx_res;
+       rx_res->poll_dev->weight = vnic_napi_weight;
+       rx_res->poll_dev->poll = vnic_poll_cq_rx;
+
+       return 0;
+}
+
+void vnic_napi_enable(struct vnic_login *login, int rx_res_index)
+{
+       struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index];
+
+       ASSERT(rx_res->poll_dev);
+       set_bit(__LINK_STATE_START, &rx_res->poll_dev->state);
+}
+
+static void vnic_napi_disable(struct vnic_login *login, int rx_res_index)
+{
+       struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index];
+       struct net_device *poll_dev = rx_res->poll_dev;
+
+       if (!poll_dev)
+               return;
+
+       while (test_bit(__LINK_STATE_RX_SCHED, &poll_dev->state))
+               msleep(VNIC_NAPI_SCHED_TIMEOUT);
+}
+
+static void vnic_napi_dealloc(struct vnic_login *login, int rx_res_index)
+{
+       struct vnic_rx_res *rx_res = &login->rx_res[rx_res_index];
+       struct net_device *poll_dev = rx_res->poll_dev;
+
+       if (!poll_dev)
+               return;
+
+       free_netdev(poll_dev);
+       rx_res->poll_dev = NULL;
+}
+#endif
+
+static int _vnic_open(struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+       int i;
+
+       /* Todo add locks here */
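+       /* If the login handshake has not reached its second stage yet, only
+        * record that an open was requested; the open is expected to be
+        * replayed once the login completes.
+        */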
+       if (!(test_bit(VNIC_STATE_LOGIN_CREATE_2, &login->fip_vnic->login_state))) {
+               set_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state);
+               return 0;
+       }
+
+       if (test_and_set_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state))
+               return 0;
+
+       clear_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state);
+
+       /* ARM RX handlers */
+       for (i = 0; i < login->rx_rings_num; ++i) {
+               login->rx_res[i].stopped = 0;
+               if (ib_req_notify_cq(login->rx_res[i].cq, IB_CQ_NEXT_COMP)) {
+                       vnic_err(login->name, "ib_req_notify_cq failed\n");
+                       goto err;
+               }
+       }
+
+       /* ARM TX handlers */
+       for (i = 0; i < login->tx_rings_num; ++i) {
+               login->tx_res[i].stopped = 0;
+               spin_lock_init(&login->tx_res[i].lock);
+               if (!vnic_tx_polling &&
+                   ib_req_notify_cq(login->tx_res[i].cq, IB_CQ_NEXT_COMP)) {
+                       vnic_err(login->name, "ib_req_notify_cq failed\n");
+                       goto err;
+               }
+       }
+
+       /* enable napi*/
+       for (i = 0; i < login->napi_num; ++i)
+               vnic_napi_enable(login, i);
+
+       /* move QP to RTS, post recv skb */
+       if (vnic_ib_open(dev))
+               goto err_napi;
+
+       /* dummy call */
+       if (vnic_ib_up(dev))
+               goto err_ib_stop;
+
+       /* configure */
+       vnic_set_default_moder(login);
+       if (vnic_ib_set_moder(login, login->last_moder_time, login->rx_frames,
+                             login->tx_usecs, login->tx_frames))
+               vnic_warn(login->name, "vnic_ib_set_moder failed!\n");
+
+       /* start interface TX queue */
+       VNIC_TXQ_START_ALL(login);
+
+       /* report and return */
+       vnic_info("%s is opened\n", dev->name);
+
+       return 0;
+
+err_ib_stop:
+       vnic_ib_stop(dev);
+err_napi:
+       /* disable napi*/
+       for (i = 0; i < login->napi_num; ++i)
+               vnic_napi_disable(login, i);
+err:
+       clear_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state);
+       return -EINVAL;
+}
+
+static int vnic_open(struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+       int ret;
+
+       vnic_dbg_func(login->name);
+
+       mutex_lock(&login->state_lock);
+       ret = _vnic_open(dev);
+       mutex_unlock(&login->state_lock);
+       return ret;
+}
+
+static int _vnic_stop(struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+       int i, _watchdog_timeo = dev->watchdog_timeo;
+
+       /* check if already stopped */
+       if (!(test_and_clear_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state)))
+               return 0;
+
+       /* Set trans_start to jiffies and watchdog_timeo to max
+        * to avoid spurious transmit timeouts in the interval between
+        * tx queue stopped and carrier down.
+        */
+       dev->trans_start = jiffies;
+       dev->watchdog_timeo = 0x7fffffff;
+
+       VNIC_TXQ_STOP_ALL(login);
+
+       /* disable rx handlers */
+       for (i = 0; i < login->rx_rings_num; ++i)
+               login->rx_res[i].stopped = 1;
+
+       /* disable tx handlers */
+       for (i = 0; i < login->tx_rings_num; ++i)
+               login->tx_res[i].stopped = 1;
+
+       /* disable napi managers */
+       for (i = 0; i < login->napi_num; ++i)
+               vnic_napi_disable(login, i);
+
+       vnic_ib_down(dev);
+       vnic_ib_stop(dev);
+
+       /* restore watchdog_timeo */
+       dev->watchdog_timeo = _watchdog_timeo;
+
+       vnic_info("%s is stopped\n", dev->name);
+
+       return 0;
+}
+
+static int vnic_stop(struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+       int ret;
+
+       vnic_dbg_func(login->name);
+
+       mutex_lock(&login->state_lock);
+       ret = _vnic_stop(dev);
+       mutex_unlock(&login->state_lock);
+
+       return ret;
+}
+
+int vnic_restart(struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+       int rc = 0;
+
+       if (login->queue_stopped || !test_bit(VNIC_STATE_NETDEV_OPEN, &login->netdev_state))
+               return rc;
+
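+       /* Keep TX disabled across the whole stop/open cycle; the
+        * NO_TX_ENABLE bit appears to tell other paths not to re-enable
+        * the queues while the rings are torn down and rebuilt.
+        */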
+       set_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state);
+       netif_tx_disable(login->dev);
+
+       mutex_lock(&login->state_lock);
+       _vnic_stop(login->dev);
+
+       clear_bit(VNIC_STATE_NETDEV_NO_TX_ENABLE, &login->netdev_state);
+       set_bit(VNIC_STATE_NETDEV_OPEN_REQ, &login->netdev_state);
+
+       rc = _vnic_open(login->dev);
+       mutex_unlock(&login->state_lock);
+
+       return rc;
+}
+
+static void vnic_restart_task(struct work_struct *work)
+{
+       struct vnic_login *login =
+               container_of(work, struct vnic_login, restart_task.work);
+
+       vnic_restart(login->dev);
+}
+
+struct net_device_stats *vnic_get_stats(struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+
+       if (dev->reg_state != NETREG_REGISTERED)
+               return &dev->stats;
+
+       spin_lock_bh(&login->stats_lock);
+       if (test_bit(VNIC_STATE_LOGIN_PRECREATE_2, &login->fip_vnic->login_state))
+               memcpy(&dev->stats, &login->stats, sizeof(login->stats));
+       spin_unlock_bh(&login->stats_lock);
+
+       return &dev->stats;
+}
+
+static void vnic_tx_timeout(struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+
+       vnic_warn(login->name, "TX timeout called on port: %d, "
+                 "latency: %d msec, stopped: %d, carrier_ok: %d, "
+                 "queue_stopped: %d, watchdog_timeo: %d msec\n",
+                 login->port->num,
+                 jiffies_to_msecs(jiffies - dev->trans_start),
+                 netif_queue_stopped(dev), netif_carrier_ok(dev),
+                 login->queue_stopped,
+                 jiffies_to_msecs(dev->watchdog_timeo));
+
+       if (netif_carrier_ok(dev)) {
+               VNIC_STATS_DO_INC(login->port_stats.tx_timeout);
+               if (!login->queue_stopped) {
+                       vnic_warn(login->name, "TX timeout, queueing rings restart\n");
+                       queue_delayed_work(login_wq, &login->restart_task, HZ / 100);
+               }
+       }
+}
+
+#ifndef _BP_NETDEV_NO_TMQ
+u16 vnic_select_queue(struct net_device *dev, struct sk_buff *skb,
+                     void *accel_priv, select_queue_fallback_t fallback)
+{
+       /* Notes:
+        * - In kernel 2.6.32, skb->mac_header (0x1a) is not set when
+        *   select_queue() is called
+        * - In OVM Server 3.0, DomU TX skb network and transport
+        *   headers are not set
+        */
+       skb_reset_mac_header(skb);
+       skb_set_network_header(skb, ETH_HLEN);
+       skb_set_transport_header(skb,
+                                ETH_HLEN +
+                                (skb->protocol == htons(ETH_P_IPV6) ?
+                                 sizeof(struct ipv6hdr) : ip_hdrlen(skb)));
+
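+       /* vnic_hash() mixes the flow's L3 addresses and L4 ports, so all
+        * packets of a single flow map to the same TX ring and keep their
+        * ordering.
+        */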
+       return vnic_hash(dev, skb) % dev->real_num_tx_queues;
+}
+
+#endif
+
+#ifndef _BP_NO_NDO_OPS
+static struct net_device_ops vnic_netdev_ops = {
+       .ndo_open = vnic_open,
+       .ndo_stop = vnic_stop,
+       .ndo_start_xmit = vnic_tx,
+       .ndo_get_stats = vnic_get_stats,
+       .ndo_set_rx_mode = vnic_set_multicast_list,
+       .ndo_change_mtu = vnic_change_mtu,
+       .ndo_tx_timeout = vnic_tx_timeout,
+       .ndo_set_mac_address = vnic_set_mac,
+       .ndo_vlan_rx_add_vid = mlx4_vnic_vlan_rx_add_vid,
+       .ndo_vlan_rx_kill_vid = mlx4_vnic_vlan_rx_kill_vid,
+#ifndef _BP_NETDEV_NO_TMQ
+       .ndo_select_queue = vnic_select_queue,
+#endif
+};
+#endif
+
+static void vnic_setup(struct net_device *dev)
+{
+       ether_setup(dev);
+
+       dev->hard_header_len += VNIC_SKB_GET_ENCAP_OFFSET;
+       dev->watchdog_timeo = VNIC_WATCHDOG_TIMEOUT;
+
+#ifndef _BP_NO_NDO_OPS
+       if (!vnic_change_mac)
+               vnic_netdev_ops.ndo_set_mac_address = NULL;
+
+       dev->netdev_ops = &vnic_netdev_ops;
+#else
+       dev->open = vnic_open;
+       dev->stop = vnic_stop;
+       dev->hard_start_xmit = vnic_tx;
+       dev->get_stats = mlx4_vnic_stats_func_container;
+       dev->set_multicast_list = vnic_set_multicast_list;
+       dev->change_mtu = vnic_change_mtu;
+       dev->tx_timeout = vnic_tx_timeout;
+       dev->set_mac_address = vnic_set_mac;
+       dev->vlan_rx_add_vid = mlx4_vnic_vlan_rx_add_vid;
+       dev->vlan_rx_kill_vid = mlx4_vnic_vlan_rx_kill_vid;
+
+       if (!vnic_change_mac)
+               dev->set_mac_address = NULL;
+
+#ifndef _BP_NETDEV_NO_TMQ
+       dev->select_queue = vnic_select_queue;
+#endif
+#endif // _BP_NO_NDO_OPS
+}
+
+static int vnic_get_frag_header(struct skb_frag_struct *frags, void **mac_hdr,
+                               void **ip_hdr, void **tcpudp_hdr,
+                               u64 *hdr_flags, void *priv)
+{
+       struct iphdr *iph;
+       *mac_hdr = page_address(frags->page.p) + frags->page_offset;
+       *ip_hdr = iph = (struct iphdr *)(*mac_hdr + ETH_HLEN);
+       *tcpudp_hdr = (struct tcphdr *)((u8 *)iph + (iph->ihl << 2));
+       *hdr_flags = LRO_IPV4 | LRO_TCP;
+
+       return 0;
+}
+
+static int vnic_get_skb_header(struct sk_buff *skb, void **iphdr,
+                              void **tcphdr, u64 *hdr_flags, void *priv)
+{
+       struct iphdr *iph;
+       struct tcphdr *tcph;
+
+       if (unlikely(skb->protocol != htons(ETH_P_IP)))
+               return -1;
+
+       if (unlikely(skb->ip_summed != CHECKSUM_UNNECESSARY))
+               return -1;
+
+       iph = (struct iphdr *)(skb->data + ETH_HLEN);
+       if (iph->protocol != IPPROTO_TCP)
+               return -1;
+
+       tcph = (struct tcphdr *)((u8 *)iph + (iph->ihl << 2));
+
+       if (ntohs(iph->tot_len) < (iph->ihl * 4 + tcph->doff * 4))
+               return -1;
+
+       *hdr_flags = LRO_IPV4 | LRO_TCP;
+       *iphdr = iph;
+       *tcphdr = tcph;
+
+       return 0;
+}
+
+static int vnic_lro_enable(struct vnic_login *login, int rx_res_index)
+{
+       struct net_lro_mgr *lro = &login->rx_res[rx_res_index].lro;
+
+       lro->dev = login->dev;
+       lro->features = login->napi_num ? LRO_F_NAPI : 0;
+       lro->frag_align_pad = NET_IP_ALIGN;
+       lro->ip_summed = CHECKSUM_UNNECESSARY;
+       lro->ip_summed_aggr = CHECKSUM_UNNECESSARY;
+       lro->max_desc = login->lro_num;
+       lro->max_aggr = VNIC_MAX_LRO_AGGR;
+       lro->lro_arr = login->rx_res[rx_res_index].lro_desc;
+
+       if (lro->max_aggr > MAX_SKB_FRAGS)
+               lro->max_aggr = MAX_SKB_FRAGS;
+
+       if (!vnic_rx_linear)
+               lro->get_frag_header = vnic_get_frag_header;
+       else
+               lro->get_skb_header = vnic_get_skb_header;
+
+       return 0;
+}
+
+static void vnic_lro_disable(struct vnic_login *login, int rx_res_index)
+{
+       /* nop */
+       return;
+}
+
+struct net_device *vnic_alloc_netdev(struct vnic_port *port)
+{
+       struct vnic_login_info *info;
+       struct vnic_login *login;
+       struct net_device *dev;
+       static int vnic_cnt = 0;
+       int i;
+
+       dev = VNIC_TXQ_ALLOC_NETDEV(sizeof *info, "eth%d", vnic_setup, port->tx_rings_num);
+       if (!dev) {
+               vnic_err(port->name, "VNIC_TXQ_ALLOC_NETDEV failed "
+                        "(size %zu, tx_rings_num %d)\n",
+                        sizeof *info, port->tx_rings_num);
+               goto err;
+       }
+
+       /* this is a *very* large beast... */
+       login = vmalloc(sizeof *login);
+       if (!login) {
+               vnic_err(port->name, "failed to allocate login struct (%zu)\n",
+                        sizeof *login);
+               goto free_netdev;
+       }
+
+       /* init fields */
+       memset(login, 0, sizeof *login);
+       info = netdev_priv(dev);
+       info->login = login;
+       login->dev = dev;
+       login->port = port;
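+       /* The largest MTU we can honour is the IB receive buffer minus the
+        * GRH, the EoIB encapsulation header, the Ethernet header and a
+        * possible VLAN tag.
+        */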
+       login->max_mtu = VNIC_BUF_SIZE(login->port) - IB_GRH_BYTES -
+                        VNIC_ENCAP_LEN - ETH_HLEN - VLAN_HLEN;
+       login->cnt = ++vnic_cnt;
+       /* name will be overwritten later */
+       sprintf(login->name, "%s-%d", "vnic", login->cnt);
+       sprintf(login->desc, "%s-P%d",
+               login->port->dev->ca->node_desc, port->num);
+
+       login->neigh_wq = create_singlethread_workqueue(login->name);
+       if (!login->neigh_wq) {
+               vnic_err(NULL, "create_singlethread_workqueue failed for %s\n",
+                                login->name);
+               goto free_login;
+       }
+
+       login->rx_csum = 1;
+       login->rx_rings_num = port->rx_rings_num;
+       login->tx_rings_num = port->tx_rings_num;
+#ifdef _BP_NETDEV_NO_TMQ
+       /* if the kernel doesn't support Multiple TX queues,
+        * then use only one TX queue */
+       login->tx_rings_num = 1;
+#endif
+       vnic_dbg_mark();
+       spin_lock_init(&login->lock);
+       spin_lock_init(&login->stats_lock);
+       rwlock_init(&login->mac_rwlock);
+       atomic_set(&login->vnic_child_cnt, 0);
+       vnic_mcast_root_init(&login->mcast_tree);
+       mutex_init(&login->moder_lock);
+       mutex_init(&login->state_lock);
+       SET_NETDEV_DEV(login->dev, login->port->dev->ca->dma_device);
+       INIT_DELAYED_WORK(&login->stats_task, vnic_do_get_stats);
+       INIT_DELAYED_WORK(&login->mcast_task, vnic_mcast_reattach);
+       INIT_DELAYED_WORK(&login->restart_task, vnic_restart_task);
+
+       vnic_set_ethtool_ops(dev);
+       /* init ethtool */
+       dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+       dev->hw_features |= NETIF_F_RXCSUM | NETIF_F_RXHASH;
+       dev->hw_features |= NETIF_F_TSO | NETIF_F_TSO6;
+       dev->features |= dev->hw_features;
+
+       /* init NAPI (must be before LRO init) */
+       login->napi_num = login->rx_rings_num;
+       for (i = 0; i < login->napi_num; ++i) {
+               if (vnic_napi_alloc(login, i)) {
+                       vnic_err(login->name, "NAPI alloc %d failed\n", i);
+                       goto free_napi;
+               }
+       }
+
+#if defined(NETIF_F_GRO) && !defined(_BP_NO_GRO)
+       login->dev->features |= NETIF_F_GRO;
+#elif defined(NETIF_F_LRO)
+       login->lro_num = vnic_lro_num;
+       login->lro_mng_num = vnic_lro_num ? login->rx_rings_num : 0;
+       login->dev->features |= vnic_lro_num ? NETIF_F_LRO : 0;
+#endif
+       for (i = 0; i < login->lro_mng_num; ++i) {
+               if (vnic_lro_enable(login, i)) {
+                       vnic_err(login->name, "vnic_lro_enable %d failed\n", i);
+                       goto free_lro;
+               }
+       }
+
+       return dev;
+
+free_lro:
+       for (--i; i >= 0; --i)
+               vnic_lro_disable(login, i);
+
+       i = login->napi_num;
+free_napi:
+       for (--i; i >= 0; --i)
+               vnic_napi_dealloc(login, i);
+free_login:
+       vfree(login);
+free_netdev:
+       free_netdev(dev);
+err:
+       return ERR_PTR(-ENODEV);
+}
+
+void vnic_free_netdev(struct vnic_login *login)
+{
+       int i;
+
+       vnic_dbg_func(login->name);
+
+       for (i = 0; i < login->lro_mng_num; ++i)
+               vnic_lro_disable(login, i);
+       for (i = 0; i < login->napi_num; ++i)
+               vnic_napi_dealloc(login, i);
+       flush_workqueue(login->neigh_wq);
+       destroy_workqueue(login->neigh_wq);
+       free_netdev(login->dev);
+       vfree(login);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_rx.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_rx.c
new file mode 100644 (file)
index 0000000..0051dee
--- /dev/null
@@ -0,0 +1,677 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+static inline void free_single_frag(struct vnic_rx_ring *ring, int e,int i)
+{
+               ib_dma_unmap_single(ring->port->dev->ca,
+                       ring->rx_info[e].dma_addr[i],
+                       ring->frag_info[i].frag_size,
+                       PCI_DMA_FROMDEVICE);
+               ring->rx_info[e].dma_addr[i] = 0;
+               put_page(ring->rx_info[e].frags[i].page.p);
+}
+
+#ifndef _BP_NETDEV_NO_TMQ
+/* This function is used only in no_bxm mode; it is not provided by
+ * netdevice.h, so we implement it here based on netif_tx_lock().
+ */
+static inline int vnic_netif_tx_trylock(struct net_device *dev)
+{
+       int i;
+
+       spin_lock(&dev->tx_global_lock);
+       for (i = 0; i < dev->num_tx_queues; ++i) {
+               struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+               if (__netif_tx_trylock(txq)) {
+                       set_bit(__QUEUE_STATE_FROZEN, &txq->state);
+                       __netif_tx_unlock(txq);
+               } else {
+                       goto unlock;
+               }
+       }
+
+       return 1;
+
+unlock:
+       /* based on netif_tx_unlock() */
+       for (--i; i >= 0; --i) {
+               struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
+               clear_bit(__QUEUE_STATE_FROZEN, &txq->state);
+               if (!test_bit(QUEUE_STATE_ANY_XOFF, &txq->state))
+                       __netif_schedule(txq->qdisc);
+       }
+       spin_unlock(&dev->tx_global_lock);
+
+       return 0;
+}
+#else
+#define vnic_netif_tx_trylock(dev) netif_tx_trylock(dev)
+#endif
+
+int vnic_rx(struct vnic_login *login, struct sk_buff *skb, struct ib_wc *wc)
+{
+       ASSERT(skb);
+       vnic_dbg_skb("RX", skb, (unsigned long)-1, (unsigned long)0);
+
+       if (no_bxm) {
+               /* In no_bxm mode, we update the neigh table based on ARP
+                * replies; the QPN & LID are retrieved from the IB completion.
+                * ATTENTION: in RSS mode, make sure that ARPs are sent on the
+                * base QPN.
+                */
+               struct vnic_neigh *neighe;
+               struct ethhdr *eth_hdr = (struct ethhdr *)skb->data;
+               struct arphdr *arp_hdr = (struct arphdr *)(skb->data + ETH_HLEN);
+               u16 eth_proto = ntohs(eth_hdr->h_proto);
+               u16 arp_proto = ntohs(arp_hdr->ar_op);
+
+               if (eth_proto != ETH_P_ARP)
+                       goto out;
+               if (arp_proto == ARPOP_REQUEST)
+                       vnic_dbg_data(login->name, "ARP REQUEST\n");
+               else
+                       vnic_dbg_data(login->name, "ARP REPLY\n");
+
+               /* Don't stop the TX queue, only try to lock it; this way we
+                * avoid blocking IRQs in the TX flow (performance wise).
+                * Other vnic_neighe_* functions are not called in parallel
+                * to this flow (in no_bxm mode).
+                */
+               if (!vnic_netif_tx_trylock(login->dev))
+                       goto out;
+
+               neighe = vnic_neighe_search(login, eth_hdr->h_source);
+               if (!IS_ERR(neighe)) {
+                       /* if IB address didn't change, do nothing */
+                       if (neighe->qpn == wc->src_qp &&
+                           neighe->lid == wc->slid)
+                               goto unlock;
+                       /* else, del old neigh entry, and add a new one */
+                       vnic_neighe_del(login, neighe);
+                       vnic_neighe_dealloc(neighe);
+               }
+
+               /* RSS: assume that your neighbours are like you */
+               neighe = vnic_neighe_alloc(login, eth_hdr->h_source,
+                                          wc->slid, wc->src_qp,
+                                          login->rx_rings_num > 1 ? 1 : 0);
+               if (IS_ERR(neighe))
+                       goto unlock;
+               if (vnic_neighe_add(login, neighe))
+                       vnic_neighe_dealloc(neighe);
+unlock:
+               netif_tx_unlock(login->dev);
+       }
+out:
+
+       /* shared_vnic may receive PACKET_OTHERHOST
+        * we 'fix' the pkt_type here so the kernel
+        * won't drop it
+        */
+       if (skb->pkt_type == PACKET_OTHERHOST && login->shared_vnic)
+               skb->pkt_type = PACKET_HOST;
+
+       netif_receive_skb(skb);
+
+       return 0;
+
+}
+
+struct sk_buff *vnic_alloc_rx_skb(struct vnic_rx_ring *ring, int buf_ind,
+                                 gfp_t gfp_flag)
+{
+       struct ib_device *ca = ring->port->dev->ca;
+       struct sk_buff *skb;
+       u64 mapping;
+       int buf_size = VNIC_BUF_SIZE(ring->port);
+
+       skb = alloc_skb(buf_size, gfp_flag);
+       if (!skb) {
+               vnic_dbg_data(ring->port->name,
+                             "alloc_skb for size %d failed\n", buf_size);
+               goto err_alloc;
+       }
+
+       mapping = ib_dma_map_single(ca, skb->data, buf_size, DMA_FROM_DEVICE);
+       if (unlikely(ib_dma_mapping_error(ca, mapping))) {
+               vnic_dbg_data(ring->port->name,
+                             "ib_dma_map_single len %d failed\n", buf_size);
+               goto err_map;
+       }
+
+       ring->rx_info[buf_ind].skb = skb;
+       ring->rx_info[buf_ind].dma_addr[0] = mapping;
+
+       return skb;
+
+err_map:
+       dev_kfree_skb_any(skb);
+err_alloc:
+       return NULL;
+}
+
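+/* Per-fragment size budget for non-linear RX buffers; vnic_calc_rx_buf()
+ * walks this table until the effective MTU is covered.
+ */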
+static int frag_sizes[] = {
+       FRAG_SZ0,
+       FRAG_SZ1,
+       FRAG_SZ2,
+       FRAG_SZ3
+};
+
+/* Calculate the last offset position that accommodates a full fragment
+ * (assuming fragment size = stride-align)
+ */
+static int vnic_last_alloc_offset(struct vnic_rx_ring *ring, u16 stride, u16 align)
+{
+       u16 res = VNIC_ALLOC_SIZE % stride;
+       u16 offset = VNIC_ALLOC_SIZE - stride - res + align;
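+       /* Example (values assumed purely for illustration): with a 16KB
+        * allocation, stride 2048 and align 0, res is 0 and the last usable
+        * offset is 16384 - 2048 = 14336; anything beyond that cannot hold
+        * a full fragment.
+        */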
+
+       vnic_dbg_data(ring->port->name, "calculated last offset for stride:%d align:%d "
+                     "res:%d offset:%d\n", stride, align, res, offset);
+       return offset;
+}
+
+static int vnic_init_allocator(struct vnic_rx_ring *ring)
+{
+       struct vnic_rx_alloc *page_alloc;
+       int i;
+
+       if (vnic_rx_linear)
+               return 0;
+
+       for (i = 0; i < ring->num_frags; i++) {
+               page_alloc = &ring->page_alloc[i];
+               page_alloc->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, VNIC_ALLOC_ORDER);
+               if (!page_alloc->page)
+                       goto out;
+
+               page_alloc->offset = ring->frag_info[i].frag_align;
+               vnic_dbg_data(ring->port->name, "Initialized allocator:%d with page:%p\n",
+                             i, page_alloc->page);
+       }
+       return 0;
+
+out:
+       while (i--) {
+               page_alloc = &ring->page_alloc[i];
+               if (page_alloc->page) {
+                       put_page(page_alloc->page);
+                       page_alloc->page = NULL;
+               }
+       }
+       return -ENOMEM;
+}
+
+static void vnic_destroy_allocator(struct vnic_rx_ring *ring)
+{
+       struct vnic_rx_alloc *page_alloc;
+       int i;
+
+       if (vnic_rx_linear)
+               return;
+
+       for (i = 0; i < ring->num_frags; i++) {
+               page_alloc = &ring->page_alloc[i];
+               vnic_dbg_data(ring->port->name, "Freeing allocator:%d count:%d\n",
+                             i, page_count(page_alloc->page));
+               if (page_alloc->page) {
+                       put_page(page_alloc->page);
+                       page_alloc->page = NULL;
+               }
+       }
+}
+
+/*
+ * allocate a single fragment on a single ring entry and map it
+ * to HW address.
+ */
+static int vnic_alloc_frag(struct vnic_rx_ring *ring,
+                          struct vnic_frag_data *frags_data, int i)
+{
+       struct vnic_frag_info *frag_info = &ring->frag_info[i];
+       struct vnic_rx_alloc *page_alloc = &ring->page_alloc[i];
+       struct skb_frag_struct *skb_frags = &frags_data->frags[i];
+       struct skb_frag_struct skbf = *skb_frags;
+       struct page *page;      
+       struct ib_device *ib_device = ring->port->dev->ca;
+       u64 dma;
+       int decision;
+
+       if (vnic_rx_linear)
+               return 0;
+
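+       /* decision == 0: the allocator page has reached its last usable
+        * slot; hand that slot out and allocate a fresh page which, once
+        * the DMA mapping succeeds, becomes the new allocator page (offset
+        * reset to frag_align). decision == 1: carve the fragment out of
+        * the current page (taking an extra reference) and advance the
+        * offset by one stride.
+        */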
+       if (page_alloc->offset >= frag_info->last_offset) {
+               decision = 0;
+               /* Allocate new page */
+               page = alloc_pages(GFP_ATOMIC | __GFP_COMP, VNIC_ALLOC_ORDER);
+               if (!page) {
+                       /*frags_data->dma_addr[i] = NULL;
+                          ring->rx_info[wr_id].info = VNIC_FRAG_ALLOC_FAIL;
+                          ring->need_refill = 1; */
+                       return -ENOMEM;
+               }
+               skbf.page.p = page_alloc->page;
+               skbf.page_offset = page_alloc->offset;
+       } else {
+               decision = 1;
+               page = page_alloc->page;
+               get_page(page);
+               skbf.page.p = page;
+               skbf.page_offset = page_alloc->offset;
+       }
+
+       skbf.size = frag_info->frag_size;
+       dma = ib_dma_map_single(ib_device, page_address(skbf.page.p) +
+                            skbf.page_offset, frag_info->frag_size,
+                            PCI_DMA_FROMDEVICE);
+       if (unlikely(ib_dma_mapping_error(ib_device, dma))) {
+               vnic_dbg_data(ring->port->name,
+                             "ib_dma_map_single len %d failed\n",
+                             frag_info->frag_size);
+               put_page(page);
+               return -ENOMEM;
+       }
+
+       if (!decision) {
+               page_alloc->page = page;
+               page_alloc->offset = frag_info->frag_align;
+       } else
+               page_alloc->offset += frag_info->frag_stride;
+
+       *skb_frags = skbf;
+       frags_data->dma_addr[i] = dma;
+
+       return 0;
+}
+
+void vnic_calc_rx_buf(struct vnic_rx_ring *ring)
+{
+       int eff_mtu = VNIC_BUF_SIZE(ring->port), buf_size = 0, i = 0;
+
+       if (vnic_rx_linear) {
+               ring->num_frags = 1;
+               return;
+       }
+
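+       /* Split the effective MTU across up to four fragments using the
+        * fixed FRAG_SZ0..FRAG_SZ3 sizes, trimming the last fragment to
+        * whatever remains. Purely as an illustration, if eff_mtu were 4096
+        * and FRAG_SZ0/FRAG_SZ1 were 2048 each (assumed values), the ring
+        * would end up with two 2048-byte fragments.
+        */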
+       while (buf_size < eff_mtu) {
+               ring->frag_info[i].frag_size =
+                       (eff_mtu > buf_size + frag_sizes[i]) ?
+                               frag_sizes[i] : eff_mtu - buf_size;
+               ring->frag_info[i].frag_prefix_size = buf_size;
+               if (!i) {
+                       ring->frag_info[i].frag_align = NET_IP_ALIGN;
+                       ring->frag_info[i].frag_stride =
+                               ALIGN(frag_sizes[i] + NET_IP_ALIGN, SMP_CACHE_BYTES);
+               } else {
+                       ring->frag_info[i].frag_align = 0;
+                       ring->frag_info[i].frag_stride =
+                               ALIGN(frag_sizes[i], SMP_CACHE_BYTES);
+               }
+               ring->frag_info[i].last_offset =
+                       vnic_last_alloc_offset(ring,
+                                              ring->frag_info[i].frag_stride,
+                                              ring->frag_info[i].frag_align);
+               buf_size += ring->frag_info[i].frag_size;
+               i++;
+       }
+
+       ring->num_frags = i;
+       ring->rx_skb_size = eff_mtu;
+       ring->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct skb_frag_struct));
+
+       vnic_dbg(ring->port->name, "Rx buffer scatter-list (ring %d effective-mtu:%d "
+                 "num_frags:%d):\n", ring->index ,eff_mtu, ring->num_frags);
+       for (i = 0; i < ring->num_frags; i++) {
+               vnic_dbg(ring->port->name, "frag:%d - size:%d prefix:%d align:%d "
+                        "stride:%d last_offset:%d\n", i,
+                        ring->frag_info[i].frag_size,
+                        ring->frag_info[i].frag_prefix_size,
+                        ring->frag_info[i].frag_align,
+                        ring->frag_info[i].frag_stride,
+                        ring->frag_info[i].last_offset);
+       }
+}
+
+static void vnic_empty_rx_entry(struct vnic_rx_ring *ring, int i)
+{
+       int frag_num, buf_size = VNIC_BUF_SIZE(ring->port);
+       struct ib_device *ca = ring->port->dev->ca;
+       struct sk_buff *skb;
+       u64 mapping;
+
+       if (vnic_rx_linear) {
+               for (frag_num = 0; frag_num < ring->num_frags; frag_num++) {
+                       mapping = ring->rx_info[i].dma_addr[0];
+                       skb = ring->rx_info[i].skb;
+                       if (mapping)
+                               ib_dma_unmap_single(ca, mapping, buf_size, DMA_FROM_DEVICE);
+                       if (skb)
+                               dev_kfree_skb_any(skb);
+               }
+
+               return;
+       }
+
+       /* non linear buffers */
+       for (frag_num = 0; frag_num < ring->num_frags; frag_num++)
+               free_single_frag(ring, i, frag_num);
+}
+
+static int vnic_fill_rx_buffer(struct vnic_rx_ring *ring)
+{
+       struct vnic_frag_data *frags_data = &ring->rx_info[0];
+       struct sk_buff *skb;
+       struct ib_device *ca = ring->port->dev->ca;
+       int buf_ind, frag_num, buf_size = VNIC_BUF_SIZE(ring->port);
+       u64 mapping;
+
+       if (vnic_rx_linear) {
+               for (buf_ind = 0; buf_ind < ring->size; buf_ind++) {
+                       skb = vnic_alloc_rx_skb(ring, buf_ind, GFP_KERNEL);
+                       if (!skb)
+                               goto err_linear;
+               }
+
+               return 0;
+       }
+
+       /* non linear buffers */
+       for (buf_ind = 0; buf_ind < ring->size; buf_ind++, frags_data++) {
+               for (frag_num = 0; frag_num < ring->num_frags; frag_num++) {
+                       if (vnic_alloc_frag(ring, frags_data, frag_num))
+                               goto err_frags;
+               }
+       }
+
+       return 0;
+
+err_linear:
+       for (buf_ind = 0; buf_ind < ring->size; buf_ind++) {
+               mapping = ring->rx_info[buf_ind].dma_addr[0];
+               skb = ring->rx_info[buf_ind].skb;
+               if (mapping)
+                       ib_dma_unmap_single(ca, mapping, buf_size, DMA_FROM_DEVICE);
+               if (skb)
+                       dev_kfree_skb_any(skb);
+       }
+
+       return -ENOMEM;
+
+err_frags:
+       for (--frag_num; frag_num >= 0; frag_num--)
+               free_single_frag(ring, buf_ind, frag_num);
+
+       for (--buf_ind; buf_ind >= 0; buf_ind--)
+               vnic_empty_rx_entry(ring, buf_ind);
+
+       return -ENOMEM;
+}
+
+/*
+ * Free an entire ring full of fragments.
+ */
+static void vnic_empty_rx_buffer(struct vnic_rx_ring *ring)
+{
+       int buf_ind;
+
+       for (buf_ind = 0; buf_ind < ring->size; buf_ind++)
+               vnic_empty_rx_entry(ring, buf_ind);
+
+       ring->size = 0;
+}
+
+void vnic_destroy_rx_ring(struct vnic_rx_ring *ring)
+{
+       if (!ring)
+               return;
+       vnic_empty_rx_buffer(ring);
+       vnic_destroy_allocator(ring);
+       vfree(ring->rx_info);
+       vnic_ib_free_ring(ring);
+       kfree(ring);
+}
+
+int vnic_unmap_and_replace_rx(struct vnic_rx_ring *ring, struct ib_device *dev,
+                             struct skb_frag_struct *skb_frags_rx,
+                             u64 wr_id, int length)
+{
+       struct vnic_frag_info *frag_info;
+       struct vnic_frag_data *rx_info = &ring->rx_info[wr_id];
+
+       int nr;
+       dma_addr_t dma;
+
+       /* Collect used fragments while replacing them in the HW descriptors */
+       for (nr = 0; nr < ring->num_frags; nr++) {
+               frag_info = &ring->frag_info[nr];
+               if (length <= frag_info->frag_prefix_size)
+                       break;
+
+               /* Save page reference in skb */
+               skb_frags_rx[nr].page = rx_info->frags[nr].page;
+               skb_frags_rx[nr].size = rx_info->frags[nr].size;
+               skb_frags_rx[nr].page_offset = rx_info->frags[nr].page_offset;
+               dma = rx_info->dma_addr[nr];
+
+               /* Allocate a replacement page */
+               if (vnic_alloc_frag(ring, rx_info, nr))
+                       goto fail;
+
+               /* Unmap buffer */
+               ib_dma_unmap_single(dev, dma, skb_frags_rx[nr].size,
+                                PCI_DMA_FROMDEVICE);
+       }
+
+       /* Adjust size of last fragment to match actual length */
+       if (nr > 0)
+               skb_frags_rx[nr - 1].size = length -
+                       ring->frag_info[nr - 1].frag_prefix_size;
+       return nr;
+
+fail:
+       /* Drop all accumulated fragments (which have already been replaced in
+        * the descriptor) of this packet; remaining fragments are reused... */
+       while (nr > 0) {
+               nr--;
+               put_page(skb_frags_rx[nr].page.p);
+       }
+
+       return 0;
+}
+
+int vnic_rx_skb(struct vnic_login *login, struct vnic_rx_ring *ring,
+               struct ib_wc *wc, int ip_summed, char *eth_hdr_va)
+{
+       u64 wr_id = (unsigned int)wc->wr_id;
+       struct sk_buff *skb;
+       int used_frags;
+       char *va = eth_hdr_va;
+       int length = wc->byte_len - VNIC_EOIB_HDR_SIZE - VNIC_VLAN_OFFSET(login),
+           linear_length = (length <= SMALL_PACKET_SIZE) ?
+           length : SMALL_PACKET_SIZE, hdr_len = min(length, HEADER_COPY_SIZE),
+           offest = NET_IP_ALIGN + 16;
+       struct ib_device *ib_dev = login->port->dev->ca;
+
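+       /* Small frames are copied whole into the linear part of a freshly
+        * allocated skb; larger frames keep their payload in the receive
+        * fragments and only the first HEADER_COPY_SIZE bytes are pulled
+        * into the linear area for the stack's header parsing.
+        */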
+       /* alloc a small linear SKB */
+       skb = alloc_skb(linear_length + offest, GFP_ATOMIC);
+       if (unlikely(!skb))
+               return -ENOMEM;
+
+       skb_record_rx_queue(skb, ring->index);
+       skb_reserve(skb, offest);
+
+       if (vnic_linear_small_pkt && length <= SMALL_PACKET_SIZE) {
+               u64 dma;
+
+               /* We are copying all relevant data to the skb - temporarily
+                * synch buffers for the copy
+                */
+               dma = ring->rx_info[wr_id].dma_addr[0] + VNIC_EOIB_HDR_SIZE +
+                       VNIC_VLAN_OFFSET(login);
+               ib_dma_sync_single_for_cpu(ib_dev, dma, length,
+                                          DMA_FROM_DEVICE);
+               skb_copy_to_linear_data(skb, va, length);
+               ib_dma_sync_single_for_device(ib_dev, dma, length,
+                                             DMA_FROM_DEVICE);
+               skb->tail += length;
+       } else {
+               /* unmap the needed fragments and reallocate them. Fragments
+                * that were not used will not be reused as-is. */
+               used_frags = vnic_unmap_and_replace_rx(ring, ib_dev,
+                                                      skb_shinfo(skb)->frags,
+                                                      wr_id, wc->byte_len);
+               if (!used_frags)
+                       goto free_and_repost;
+
+               skb_shinfo(skb)->nr_frags = used_frags;
+
+               /* Copy headers into the skb linear buffer */
+               memcpy(skb->data, va, hdr_len);
+               skb->tail += hdr_len;
+               /* Skip headers in first fragment */
+               skb_shinfo(skb)->frags[0].page_offset +=
+                   (VNIC_EOIB_HDR_SIZE + VNIC_VLAN_OFFSET(login) +
+                    hdr_len);
+
+               /* Adjust size of first fragment */
+               skb_shinfo(skb)->frags[0].size -=
+                   (VNIC_EOIB_HDR_SIZE + VNIC_VLAN_OFFSET(login) +
+                    hdr_len);
+               skb->data_len = length - hdr_len;
+       }
+
+       /* update skb fields */
+       skb->len = length;
+       skb->truesize = length + sizeof(struct sk_buff);
+       skb->ip_summed = ip_summed;
+       skb->dev = login->dev;
+       skb->protocol = eth_type_trans(skb, skb->dev);
+
+       return vnic_rx(login, skb, wc);
+
+free_and_repost:
+       dev_kfree_skb(skb);
+       return -ENODEV;
+
+}
+
+static void vnic_set_rx_sge(struct vnic_rx_ring *ring)
+{
+       int i;
+
+       ring->wr.num_sge = ring->num_frags;
+       ring->wr.next = NULL;
+       ring->wr.sg_list = ring->sge;
+       for (i = 0; i < ring->num_frags; ++i) {
+               ring->sge[i].lkey = ring->port->mr->lkey;
+               ring->sge[i].length = ring->frag_info[i].frag_size;
+       }
+}
+
+struct vnic_rx_ring *vnic_create_rx_ring(struct vnic_port *port, int index)
+{
+       int rc, rx_info, size = vnic_rx_rings_len;
+       struct vnic_rx_ring *ring;
+
+       ring = kzalloc(sizeof *ring, GFP_KERNEL);
+       if (!ring)
+               return ERR_PTR(-ENOMEM);
+
+       /* init attributes */
+       ring->port = port;
+       ring->size = size;
+       ring->index = index;
+       spin_lock_init(&ring->lock);
+
+       /* init rx ring IB resources */
+       if (vnic_ib_init_ring(ring)) {
+               vnic_err(port->name, "vnic_ib_init_ring failed\n");
+               goto free_ring;
+       }
+
+       rx_info = size * roundup_pow_of_two(sizeof(struct vnic_frag_data));
+       ring->rx_info = vmalloc(rx_info);
+       if (!ring->rx_info) {
+               vnic_err(port->name, "Failed allocating rx_info ring"
+                        " (%d bytes)\n", rx_info);
+               goto free_ib;
+       }
+       memset(ring->rx_info, 0, rx_info);
+
+       /* determine the sizes of the fragments as result of mtu */
+       vnic_calc_rx_buf(ring);
+
+       rc = vnic_init_allocator(ring);
+       if (rc) {
+               vnic_err(port->name, "Failed initializing ring"
+                        " allocator %d\n", rc);
+               goto free_rxinfo;
+       }
+
+       rc = vnic_fill_rx_buffer(ring);
+       if (rc) {
+               vnic_err(port->name, "vnic_fill_rx_buffer failed %d\n", rc);
+               goto free_allocator;
+       }
+
+       /* set rx WQEs drafts */
+       vnic_set_rx_sge(ring);
+
+       /* Initialize all descriptors and post them to the SRQ */
+       rc = vnic_post_recvs(ring);
+       if (rc) {
+               vnic_err(port->name, "vnic_post_recvs failed %d\n", rc);
+               goto free_rx_buffer;
+       }
+
+       return ring;
+
+free_rx_buffer:
+       /* TODO: we are freeing posted packets need to move SRQ
+        * to error and free them first
+        */
+       vnic_empty_rx_buffer(ring);
+free_allocator:
+       vnic_destroy_allocator(ring);
+free_rxinfo:
+       vfree(ring->rx_info);
+free_ib:
+       vnic_ib_free_ring(ring);
+free_ring:
+       kfree(ring);
+
+       return ERR_PTR(-EINVAL);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_tx.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_data_tx.c
new file mode 100644 (file)
index 0000000..0233d4f
--- /dev/null
@@ -0,0 +1,622 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+static int vnic_ucast_send(struct vnic_login *login, struct sk_buff *skb,
+                           struct neighbour *neighbour, int tx_res_index);
+/* Push VLAN & EoIB headers and calculate the RSS hash value.
+ * The RSS hash is computed in the same pass because the encapsulation
+ * code below already inspects the IP|TCP|UDP headers for the EoIB
+ * fields, so that work is reused for RSS as well.
+ */
+static struct eoibhdr eoib_h_draft = {
+       .encap_data = ((VNIC_EOIB_HDR_VER << 4) | (VNIC_EOIB_HDR_SIG << 6)),
+       .seg_off = 0,
+       .seg_id = 0
+};
+
+void vnic_learn_mac(struct net_device *dev, u8 *mac, int remove)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+
+       vnic_dbg_func(login->name);
+
+       /* skip invalid address */
+       if (unlikely(!is_valid_ether_addr(mac)))
+               return;
+
+       /* skip parent vNic address (original dev_addr) */
+       if (!(memcmp(login->dev_addr, mac, ETH_ALEN)))
+               return;
+
+       vnic_dbg_mac(login->name, "learn mac "MAC_6_PRINT_FMT"\n",
+                    MAC_6_PRINT_ARG(mac));
+
+       /* update child vNic list, ignore returned code */
+       read_lock_bh(&login->mac_rwlock);
+       vnic_child_update(login, mac, remove);
+       read_unlock_bh(&login->mac_rwlock);
+}
+
+u32 vnic_hash(struct net_device *dev, struct sk_buff *skb)
+{
+       struct tcphdr *tr_h = tcp_hdr(skb);
+       struct iphdr *ip_h = ip_hdr(skb);
+       struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h;
+       u32 hash = 0, addrlen, i;
+
+       /* All mcast traffic is sent and received on 1st queue
+        * because only the 1st QP is attached to the MGIDs
+        * TODO: consider distributing tx/rx mcast traffic as well
+        */
+       if (is_multicast_ether_addr(skb_mac_header(skb)))
+               goto out;
+
+       switch (ntohs(skb->protocol)) {
+       case ETH_P_IP:
+               /* In IPv4, access TCP/UDP header only when IP packet is not
+                * fragmented: flags == DF == 0x02.
+                */
+               if (ntohs(ip_h->frag_off) >> 13 == 0x2 &&
+                   (ip_h->protocol == IPPROTO_TCP ||
+                    ip_h->protocol == IPPROTO_UDP)) {
+                       hash ^= (u32)ntohl(ip_h->saddr);
+                       hash ^= (u32)ntohl(ip_h->daddr);
+                       hash ^= (u32)ntohs(tr_h->source);
+                       hash ^= (u32)ntohs(tr_h->dest);
+               }
+               break;
+       case ETH_P_IPV6:
+               /* In IPv6, access TCP/UDP header only when IP packet is not
+                * fragmented: main header nexthdr field points to TCP/UDP
+                */
+               if (ip_h6->nexthdr == IPPROTO_TCP ||
+                   ip_h6->nexthdr == IPPROTO_UDP) {
+                       addrlen = ARRAY_LEN(ip_h6->saddr.in6_u.u6_addr32);
+                       for (i = 0; i < addrlen; ++i) {
+                               hash ^= (u32)ntohl(ip_h6->saddr.in6_u.u6_addr32[i]);
+                               hash ^= (u32)ntohl(ip_h6->daddr.in6_u.u6_addr32[i]);
+                       }
+                       tr_h = (struct tcphdr *)((void *)ip_h6 + sizeof *ip_h6);
+                       hash ^= (u32)ntohs(tr_h->source);
+                       hash ^= (u32)ntohs(tr_h->dest);
+               }
+       }
+out:
+       VNIC_SKB_SET_HASH(skb, hash);
+       return hash;
+}
+
+u8 vnic_lag_hash(struct sk_buff *skb, u16 hash_mask, u16 vid)
+{
+       struct tcphdr *tr_h = tcp_hdr(skb);
+       struct iphdr *ip_h = ip_hdr(skb);
+       struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h;
+       u32 hash = 0, addrlen, i;
+       struct ethhdr *eth = (struct ethhdr *)skb_mac_header(skb);
+       u32 hash_dmac, hash_smac, hash_prot, hash_vid;
+       u32 hash_sip = 0, hash_dip = 0, hash_sp = 0, hash_dp = 0;
+       u8 res_hash;
+       u8 *tmp;
+
+       hash_dmac = *(u32 *)(&eth->h_dest[ETH_ALEN - sizeof hash_smac]);
+       hash_smac = *(u32 *)(&eth->h_source[ETH_ALEN - sizeof hash_smac]);
+       hash_prot = (u32)ntohs(skb->protocol);
+       hash_vid  = (u32)vid;
+
+       if (hash_mask & GW_LAG_LAYER_2_3) {
+               switch (hash_prot) {
+               case ETH_P_IP:
+                       /* In IPv4, access TCP/UDP header only when IP packet is not
+                        * fragmented: flags == DF == 0x02.
+                        */
+                       if (ntohs(ip_h->frag_off) >> 13 == 0x2 &&
+                           (ip_h->protocol == IPPROTO_TCP ||
+                            ip_h->protocol == IPPROTO_UDP)) {
+                               hash_sip = (u32)(ip_h->saddr);
+                               hash_dip = (u32)(ip_h->daddr);
+                               hash_sp  = (u32)(tr_h->source);
+                               hash_dp  = (u32)(tr_h->dest);
+                       }
+                       break;
+               case ETH_P_IPV6:
+                       /* In IPv6, access TCP/UDP header only when IP packet is not
+                        * fragmented: main header nexthdr field points to TCP/UDP
+                        */
+                       if (ip_h6->nexthdr == IPPROTO_TCP ||
+                           ip_h6->nexthdr == IPPROTO_UDP) {
+                               addrlen = ARRAY_LEN(ip_h6->saddr.in6_u.u6_addr32);
+                               for (i = 0; i < addrlen; ++i) {
+                                       hash_sip ^= (u32)(ip_h6->saddr.in6_u.u6_addr32[i]);
+                                       hash_dip ^= (u32)(ip_h6->daddr.in6_u.u6_addr32[i]);
+                               }
+                               tr_h = (struct tcphdr *)((void *)ip_h6 + sizeof *ip_h6);
+                               hash_sp = (u32)(tr_h->source);
+                               hash_dp = (u32)(tr_h->dest);
+                       }
+               }
+       }
+
+       hash ^= (hash_mask & GW_LAG_HASH_DMAC) ? hash_dmac : 0;
+       hash ^= (hash_mask & GW_LAG_HASH_SMAC) ? hash_smac : 0;
+       hash ^= (hash_mask & GW_LAG_HASH_TPID) ? hash_prot : 0;
+       hash ^= (hash_mask & GW_LAG_HASH_VID)  ? hash_vid  : 0;
+       hash ^= (hash_mask & GW_LAG_HASH_SIP)  ? hash_sip  : 0;
+       hash ^= (hash_mask & GW_LAG_HASH_DIP)  ? hash_dip  : 0;
+       hash ^= (hash_mask & GW_LAG_HASH_SPORT)  ? hash_sp  : 0;
+       hash ^= (hash_mask & GW_LAG_HASH_DPORT)  ? hash_dp  : 0;
+
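+       /* Fold the 32-bit hash down to a single byte by XORing its four
+        * bytes.
+        */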
+       tmp  = (u8 *)&hash;
+       res_hash = tmp[0] ^ tmp[1] ^ tmp[2] ^ tmp[3];
+
+       return res_hash;
+}
+
+static inline int vnic_header_encap(struct sk_buff *skb)
+{
+       struct vnic_login *login = vnic_netdev_priv(skb->dev);
+       struct eoibhdr *eoib_h;
+       struct iphdr *ip_h = ip_hdr(skb);
+       struct ipv6hdr *ip_h6 = (struct ipv6hdr *)ip_h;
+
+       /* push VLAN header
+        * TODO: when VID is zero, push the header only when a priority exists, i.e.:
+        * if (VNIC_VLAN_ENABLED(login) && (login->vid || login->user_prio))
+        */
+       if (VNIC_VLAN_ENABLED(login) && login->vid) {
+               struct vlan_ethhdr *veth =
+                       (struct vlan_ethhdr *)skb_push(skb, VLAN_HLEN);
+               ASSERT(veth);
+               vnic_dbg_data_v(login->name, "push vlan tag with ID %u\n",
+                               be16_to_cpu(login->vid));
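+               /* skb_push() opened VLAN_HLEN bytes of room at the head;
+                * move the destination and source MACs back to the new head
+                * so the 802.1Q tag lands right after them.
+                */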
+               memmove(skb->data, skb->data + VLAN_HLEN, 2 * ETH_ALEN);
+               veth->h_vlan_proto = htons(ETH_P_8021Q);
+               veth->h_vlan_TCI = login->vid;
+       }
+
+       /* push EoIB header */
+       if (vnic_encap_headroom)
+               skb_push(skb, VNIC_ENCAP_LEN);
+
+       /* reset MAC header here, it can be changed for the following reasons:
+        * - vnic_encap_headroom is set, thus EoIB header is pushed
+        * - VLAN is enabled, thus VLAN header is pushed
+        * - some kernels (e.g., 2.6.18-194.el5) call dev_hard_start_xmit()
+        *   without setting the mac header pointer
+        */
+       skb_set_mac_header(skb, VNIC_SKB_GET_ENCAP_OFFSET);
+
+       /* enforce source mac*/
+       if (vnic_src_mac_enforce)
+               memcpy(skb_mac_header(skb) + ETH_ALEN,
+                      login->dev->dev_addr, ETH_ALEN);
+
+       /* set EoIB header VER/SIG, others set to zero */
+       eoib_h = VNIC_SKB_GET_ENCAP(skb);
+       *eoib_h = eoib_h_draft;
+
+       /* set EoIB header IP_CHK */
+       switch (ntohs(skb->protocol)) {
+       case ETH_P_IP:
+               VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_h);
+               if (ip_h->protocol == IPPROTO_TCP)
+                       VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_h);
+               else if (ip_h->protocol == IPPROTO_UDP)
+                       VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_h);
+               break;
+       case ETH_P_IPV6:
+               VNIC_EOIB_HDR_SET_IP_CHK_OK(eoib_h);
+               if (ip_h6->nexthdr == IPPROTO_TCP)
+                       VNIC_EOIB_HDR_SET_TCP_CHK_OK(eoib_h);
+               else if (ip_h6->nexthdr == IPPROTO_UDP)
+                       VNIC_EOIB_HDR_SET_UDP_CHK_OK(eoib_h);
+       }
+
+#ifdef _BP_NETDEV_NO_TMQ
+       /* if TSS is enabled, use the hash value calculated by
+        * vnic_select_queue() otherwise call vnic_hash()
+        */
+       vnic_hash(skb->dev, skb);
+#endif
+
+       return 0;
+}
+
+static void vnic_neigh_path_query_complete(int status,
+                                                                                  struct ib_sa_path_rec *pathrec,
+                                                                                  void *context)
+{
+       struct vnic_neigh *neigh = context;
+       struct ib_ah *old_ah, *new_ah;
+       struct net_device *dev = neigh->login->dev;
+       struct sk_buff_head skqueue;
+       struct vnic_login *login = neigh->login;
+
+       if (status) {
+               vnic_dbg_data(neigh->login->name, "neigh %d "MAC_6_PRINT_FMT" path query complete FAILED\n",
+                                               neigh->lid, MAC_6_PRINT_ARG(neigh->mac));
+               goto drop_pkts;
+       } else {
+               struct ib_ah_attr av;
+               struct sk_buff *skb;
+               vnic_dbg_data(login->name, "neigh %d "MAC_6_PRINT_FMT" path query complete success SL=%d\n",
+                             neigh->lid, MAC_6_PRINT_ARG(neigh->mac), pathrec->sl);
+               if(ib_init_ah_from_path(login->port->dev->ca, login->port->num, pathrec, &av)){
+                       vnic_warn(login->name, "ib_init_ah_from_path %d "MAC_6_PRINT_FMT" failed!\n",
+                                               neigh->lid, MAC_6_PRINT_ARG(neigh->mac));
+                       goto drop_pkts;
+               }
+
+               old_ah = neigh->ah;
+               new_ah = ib_create_ah(login->port->pd, &av);
+               if (IS_ERR(new_ah) || !new_ah) {
+                       vnic_warn(login->name, "ib_create_ah %d "MAC_6_PRINT_FMT" failed!\n",
+                                               neigh->lid, MAC_6_PRINT_ARG(neigh->mac));
+
+                       goto drop_pkts;
+               }
+
+               neigh->sl = pathrec->sl;
+               skb_queue_head_init(&skqueue);
+               netif_tx_lock_bh(login->dev);
+               neigh->ah = new_ah;
+               neigh->valid = 1;
+               neigh->query_id = -1;
+               while ((skb = __skb_dequeue(&neigh->pkt_queue)))
+                       __skb_queue_tail(&skqueue, skb);
+               netif_tx_unlock_bh(login->dev);
+
+               /* retransmit all pending packets */
+               while ((skb = __skb_dequeue(&skqueue))) {
+                       /* reset skb headers */
+                       /* TODO ALL VLAN ?? */
+                       if (VNIC_VLAN_ENABLED(login) && login->vid)
+                               skb_pull(skb, VLAN_HLEN);
+                       if (vnic_encap_headroom)
+                               skb_pull(skb, VNIC_ENCAP_LEN);
+
+                       skb->dev = dev;
+                       dev_queue_xmit(skb);
+               }
+
+               if (old_ah && !IS_ERR(old_ah))
+                       ib_destroy_ah(old_ah);
+       }
+       complete(&neigh->query_comp);
+       return;
+
+drop_pkts:
+       netif_tx_lock_bh(dev);
+       neigh->query_id = -1; /* this will cause a retry */
+       while (!skb_queue_empty(&neigh->pkt_queue)) {
+               struct sk_buff *skb = skb_dequeue(&neigh->pkt_queue);
+               int tx_res_index;
+               struct vnic_tx_res *tx_res;
+               skb->dev = dev;
+               tx_res_index = VNIC_TXQ_GET_HASH(skb, login->real_tx_rings_num);
+               ASSERT(tx_res_index <= login->tx_rings_num);
+               tx_res = &login->tx_res[tx_res_index];
+               VNIC_STATS_DO_INC(tx_res->stats.tx_dropped);
+               dev_kfree_skb_any(skb);
+       }
+       netif_tx_unlock_bh(dev);
+       complete(&neigh->query_comp);
+}
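+
+/*
+ * Note on the swap above: the new AH is installed under netif_tx_lock_bh(),
+ * presumably so that concurrent senders either still see the old/NULL AH and
+ * queue their skb, or see a fully initialised AH together with valid=1. The
+ * queued skbs are re-submitted through dev_queue_xmit() after the EoIB/VLAN
+ * headers pushed by vnic_header_encap() are stripped, since the normal TX
+ * path will push them again.
+ */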
+
+int vnic_neighe_path_query(struct vnic_neigh *neighe)
+{
+       ib_sa_comp_mask comp_mask;
+       struct ib_sa_path_rec p_rec;
+       u16 slid = neighe->login->port->attr.lid;
+       vnic_dbg_data(neighe->login->vnic_name,"neighe SL Query slid %d dlid %d dmac:"MAC_6_PRINT_FMT"\n",
+                                 slid, neighe->lid, MAC_6_PRINT_ARG(neighe->mac));
+
+       comp_mask = IB_SA_PATH_REC_SERVICE_ID |
+                   IB_SA_PATH_REC_DLID       |
+                   IB_SA_PATH_REC_SLID       |
+                   IB_SA_PATH_REC_PKEY;
+
+       if (IS_NEIGH_QUERY_RUNNING(neighe))
+               ib_sa_cancel_query(neighe->query_id, neighe->pquery);
+
+       init_completion(&neighe->query_comp);
+       neighe->query_id = -1;
+       neighe->pquery = NULL;
+
+       p_rec.dlid = cpu_to_be16(neighe->lid);
+       p_rec.slid = cpu_to_be16(slid);
+       p_rec.service_id = cpu_to_be64(EOIB_SERVICE_ID);
+       p_rec.pkey = cpu_to_be16(neighe->login->pkey);
+
+       neighe->query_id = ib_sa_path_rec_get(&vnic_sa_client,
+                                          neighe->login->port->dev->ca,
+                                          neighe->login->port->num,
+                                          &p_rec,
+                                          comp_mask,
+                                          1000/*TOUT*/,
+                                          GFP_ATOMIC,
+                                          vnic_neigh_path_query_complete,
+                                          neighe,
+                                          &neighe->pquery);
+       if (neighe->query_id < 0) {
+               vnic_dbg_data(neighe->login->vnic_name, "FAILED neigh SL Query slid %d dlid %d dmac:"MAC_6_PRINT_FMT"\n",
+                         slid, neighe->lid, MAC_6_PRINT_ARG(neighe->mac));
+               complete(&neighe->query_comp);
+       }
+       return neighe->query_id;
+}
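+
+/*
+ * The SA path query above only asks for a path keyed by SLID/DLID, the EoIB
+ * service id and the partition key; its main purpose here appears to be
+ * learning the SL and building an address handle for the neighbour. A
+ * negative return from ib_sa_path_rec_get() means the query was never posted,
+ * in which case the completion is signalled immediately so waiters do not
+ * block.
+ */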
+
+static int vnic_ucast_send(struct vnic_login *login, struct sk_buff *skb,
+                           struct neighbour *neighbour, int tx_res_index)
+{
+       struct vnic_neigh *neighe;
+       int hash;
+
+       neighe = vnic_neighe_search(login, skb_mac_header(skb));
+       if (IS_ERR(neighe)) {
+               vnic_dbg_data(login->name, "no dst_neigh and no vnic_neigh - "
+                             "gw unicast packet\n");
+
+               /* for egress unicast traffic of a shared vnic,
+                * replace src mac by shared mac
+                */
+               if (login->shared_vnic)
+                       memcpy(skb_mac_header(skb) + ETH_ALEN,
+                              login->shared_mac, ETH_ALEN);
+
+               if (!login->is_lag)
+                       neighe = login->gw_neigh;
+               else {
+                       if (unlikely(!login->lag_member_active_count))
+                               return -ENOENT;
+
+                       /* use the hash value and the precomputed LAG mapping to find the GW member to send to */
+                       hash = vnic_lag_hash(skb, login->lag_prop.hash_mask, login->vid);
+                       hash = hash % LAG_MAP_TABLE_SIZE;
+                       neighe = &login->lag_gw_neigh[login->lag_gw_map[hash]].neigh;
+               }
+
+               /* update GW statistics */
+               VNIC_STATS_ADD(login->port_stats.gw_tx_bytes, skb->len);
+               VNIC_STATS_INC(login->port_stats.gw_tx_packets);
+       } else {
+               vnic_dbg_data(login->name,
+                             "no dst_neigh but vnic_neigh exists - "
+                             "local unicast packet\n");
+       }
+
+       /* TODO: in VNIC_NEIGH_GET_DQPN use neigh qps_num instead of login */
+       vnic_dbg_data(login->name, "vnic_send to (base qpn 0x%06x) dqpn 0x%06x"
+                     " dlid 0x%08x %s\n", neighe->qpn,
+                     VNIC_NEIGH_GET_DQPN(skb, neighe), neighe->lid,
+                     neighe == login->gw_neigh ? "[GW]" : "");
+
+       if (unlikely(vnic_sa_query && !neighe->valid)) {
+               /* query the neigh AH */
+               vnic_dbg_data(login->name, "AH is not %s, running path query: LID=%d mac="MAC_6_PRINT_FMT"\n",
+                                 !IS_ERR(neighe->ah) && neighe->ah ? "valid":"found",
+                                 neighe->lid, MAC_6_PRINT_ARG(neighe->mac));
+
+               if (!IS_NEIGH_QUERY_RUNNING(neighe))
+                       vnic_neighe_path_query(neighe);
+
+               if (IS_ERR(neighe->ah) || !neighe->ah) {
+                       /* AH is not ready yet, queue the pkt */
+                       if (skb_queue_len(&neighe->pkt_queue) > VNIC_SKB_QUEUE_LEN || !IS_NEIGH_QUERY_RUNNING(neighe))
+                               return 1; /* drop if the queue is full or no query is currently running */
+                       __skb_queue_tail(&neighe->pkt_queue, skb);
+                       return 0;
+               }
+               /* if ah is initialized send anyway */
+       }
+       vnic_send(login, skb, neighe->ah, VNIC_NEIGH_GET_DQPN(skb, neighe), tx_res_index);
+       return 0;
+}
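+
+/*
+ * LAG note (sketch based on the code above): for a LAG gateway the 8-bit
+ * vnic_lag_hash() result is reduced modulo LAG_MAP_TABLE_SIZE and used to
+ * index login->lag_gw_map[], which presumably maps hash buckets to active
+ * LAG members; the selected member's neigh entry is then used exactly like
+ * the single gw_neigh in the non-LAG case.
+ */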
+
+void vnic_mcast_send(struct vnic_login *login, struct sk_buff *skb, int tx_res_index)
+{
+       struct vnic_mcast *mcaste;
+       union vhub_mgid mgid;
+       struct ethhdr *eth;
+       struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+       struct ib_ah_attr *av = &tx_res->mcast_av;
+       struct ib_ah *ah;
+       u16 gw_id;
+       int hash;
+
+       eth = (struct ethhdr *)skb_mac_header(skb);
+
+       /* for LAG GW, perform hashing on mcast address */
+       if (login->is_lag && login->lag_member_active_count) {
+               hash = vnic_lag_hash(skb, login->lag_prop.hash_mask, login->vid);
+               hash = hash % LAG_MAP_TABLE_SIZE;
+               gw_id = login->lag_gw_neigh[login->lag_gw_map[hash]].gw_id;
+       }
+       else
+               gw_id = login->gw_port_id;
+
+       /* retrieve the mlid */
+       vhub_mgid_create(login->mgid_prefix, ETH_ZERO_MAC, login->n_mac_mcgid,
+                        CREATE_VHUB_ID(login->vid, gw_id),
+                        VHUB_MGID_DATA, 0, &mgid);
+
+       spin_lock(&login->mcast_tree.mcast_rb_lock);
+       mcaste = vnic_mcast_search(&login->mcast_tree, &mgid.ib_gid);
+       if (unlikely(IS_ERR(mcaste) || !mcaste->ah)) {
+               vnic_dbg_data(login->name, "couldn't find mcaste for "
+                             MAC_6_PRINT_FMT"\n",
+                             MAC_6_PRINT_ARG(eth->h_dest));
+               spin_unlock(&login->mcast_tree.mcast_rb_lock);
+               goto drop;
+       }
+
+       spin_lock(&mcaste->lock);
+       vhub_mgid_create(login->mgid_prefix, eth->h_dest, login->n_mac_mcgid,
+                        CREATE_VHUB_ID(login->vid, gw_id),
+                        vnic_mgid_data_type, 0, &mgid);
+       vnic_dbg_mcast_v(login->name, "sending to ETH "MAC_6_PRINT_FMT"-> "
+                        "GID "VNIC_GID_FMT" (mask %d bit)\n",
+                        MAC_6_PRINT_ARG(eth->h_dest),
+                        VNIC_GID_ARG(mgid.ib_gid),
+                        login->n_mac_mcgid);
+
+       av->dlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid);
+       av->static_rate = mcaste->port_mcaste->rec.rate;
+       av->sl = mcaste->port_mcaste->rec.sl;
+       memcpy(&av->grh.dgid, mgid.ib_gid.raw, GID_LEN);
+
+       ah = ib_create_ah(login->port->pd, av);
+       spin_unlock(&mcaste->lock);
+       spin_unlock(&login->mcast_tree.mcast_rb_lock);
+
+       if (!ah || IS_ERR(ah))
+               goto drop;
+
+       vnic_send(login, skb, ah, IB_MULTICAST_QPN, tx_res_index);
+       ib_destroy_ah(ah);
+       /* used as a counter for multicast TX packets (not RX) */
+       VNIC_STATS_DO_INC(tx_res->stats.multicast);
+
+       return;
+
+drop:
+       VNIC_STATS_DO_INC(tx_res->stats.tx_dropped);
+       dev_kfree_skb_any(skb);
+}
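+
+/*
+ * MGID note: the vhub MGID is built twice above. The first call (with
+ * ETH_ZERO_MAC) only locates the already-joined data mcast entry so that its
+ * mlid, rate and SL can be copied into the AV; the second call folds the real
+ * destination MAC (masked down by n_mac_mcgid bits) into the GID the packet
+ * is actually addressed to. A per-packet AH is created and destroyed around
+ * vnic_send().
+ */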
+
+int vnic_tx(struct sk_buff *skb, struct net_device *dev)
+{
+       struct vnic_login *login = vnic_netdev_priv(dev);
+       int tx_res_index = 0, headroom = dev->hard_header_len - ETH_HLEN;
+       struct vnic_tx_res *tx_res = &login->tx_res[tx_res_index];
+
+       ASSERT(dev);
+       ASSERT(skb);
+#ifdef VNIC_PROFILLNG
+       login->prof_arr[login->prof_arr_it].cnt++;
+       /* copy only fields for reporting, data buffer is invalid */
+       login->prof_arr[login->prof_arr_it].skb = *skb;
+       login->prof_arr[login->prof_arr_it].skb.data = NULL;
+       login->prof_arr[login->prof_arr_it].tstamp = current_kernel_time();
+       login->prof_arr[login->prof_arr_it].jiffies = jiffies;
+       login->prof_arr[login->prof_arr_it].nr_frags = skb_shinfo(skb)->nr_frags;
+       login->prof_arr_it = (login->prof_arr_it + 1) % VNIC_PROFILLNG_SKB_MAX;
+
+#endif
+
+       /* drop zero length skbs */
+       if (unlikely(!skb->len))
+               goto drop;
+
+       /* sometimes, vnic_tx is called before carrier is up FM #100882 */
+       if (unlikely(!test_bit(VNIC_STATE_NETDEV_CARRIER_ON, &login->netdev_state)))
+               goto drop;
+
+       /* check headroom and reallocate skb if needed:
+        * If VLAN used: need VLAN_HLEN (4) Bytes
+        * If vnic_encap_headroom set: need VNIC_ENCAP_LEN (4) Bytes
+        * when vnic_encap_headroom is clear, we do not encap EoIB header
+        * into the headroom, but rather use additional SG entry to hold it
+        */
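+       /* note: headroom above is dev->hard_header_len - ETH_HLEN; the netdev
+        * setup code presumably sizes hard_header_len so this covers
+        * VNIC_ENCAP_LEN and, for tagged vNics, VLAN_HLEN (i.e. 4 or 8 bytes)
+        */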
+
+       if (unlikely(skb_headroom(skb) < headroom)) {
+               struct sk_buff *skb_new;
+
+               skb_new = skb_realloc_headroom(skb, headroom);
+               if (!skb_new)
+                       goto drop;
+
+               dev_kfree_skb(skb);
+               skb = skb_new;
+               VNIC_STATS_INC(login->port_stats.realloc_packets);
+       }
+       /* don't use dev->header_ops, use vnic_header_encap() inline
+        * function instead, because when raw socket is used or BR_CTL mode
+        * then header_ops are not called as expected, and we'll end up sending
+        * the packet without EoIB header
+        */
+       if (unlikely(vnic_header_encap(skb)))
+               goto drop;
+
+       /* in promiscuous mode, learn the source mac */
+       if (is_ucast_promisc(login) && vnic_learn_mac_enabled)
+               vnic_learn_mac(dev, skb_mac_header(skb) + ETH_ALEN, 0);
+
+       /* get TX resource for this SKB, keep it after vnic_header_encap()
+        * so if we don't have kernel multiple queue support we use the
+        * RSS hash result for TSS
+        */
+       tx_res_index = VNIC_TXQ_GET_HASH(skb, login->real_tx_rings_num);
+       ASSERT(tx_res_index <= login->tx_rings_num);
+       tx_res = &login->tx_res[tx_res_index];
+
+
+       /* send ucast/mcast packet */
+       vnic_dbg_skb("TX", skb, (unsigned long)(vnic_encap_headroom ? 0 : -1),
+                    (unsigned long)(vnic_encap_headroom ? VNIC_ENCAP_LEN : 0));
+#if 0 /* neighbour caching disabled */
+       if (likely(skb->dst && skb->dst->neighbour)) {
+               if (is_multicast_ether_addr(skb_mac_header(skb))) {
+                       vnic_dbg_data(login->name,
+                                     "dst_neigh exists but no vnic_neigh - "
+                                     "multicast packet\n");
+                       vnic_mcast_send(login, skb, tx_res_index);
+               } else {
+                       vnic_dbg_data(login->name,
+                                     "dst_neigh exists but no vnic_neigh - "
+                                     "unicast packet\n");
+                       vnic_ucast_send(login, skb, skb->dst->neighbour, tx_res_index);
+               }
+       } else 
+#endif
+       {
+               if (is_multicast_ether_addr(skb_mac_header(skb))) {
+                       vnic_dbg_data(login->name,
+                                     "no dst_neigh - multicast packet\n");
+                       vnic_mcast_send(login, skb, tx_res_index);
+               } else {
+                       vnic_dbg_data(login->name,
+                                     "no dst_neigh - unicast packet\n");
+                       if (unlikely(vnic_ucast_send(login, skb, NULL, tx_res_index)))
+                               goto drop;
+               }
+       }
+
+       return NETDEV_TX_OK;
+
+drop:
+       dev_kfree_skb(skb);
+       return NETDEV_TX_OK;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip.h
new file mode 100644 (file)
index 0000000..0f77c1a
--- /dev/null
@@ -0,0 +1,1025 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _VNIC_FIP_H
+#define _VNIC_FIP_H
+
+#include "vnic.h"
+
+
+#define FIP_TYPE(FIPT) FIP_TYPE_##FIPT
+#define FIP_TYPE_IDX(FIPT) FIP_TYPE_IDX_##FIPT
+
+#define FIP_CASE(FIPT) case FIP_TYPE(FIPT): return FIP_TYPE_IDX(FIPT)
+
+#define FIP_CASE_STR(FIPT) case FIP_TYPE(FIPT): return # FIPT
+#define FIP_SUBCODE_CASE_STR(SUBCODE) case (SUBCODE): return # SUBCODE
+
+#define FIP_MASK(FIPT) (((u64)1) << FIP_TYPE_IDX(FIPT))
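+
+/*
+ * Illustration (not part of the protocol): FIP_MASK(LOGIN) expands to
+ * ((u64)1) << FIP_TYPE_IDX_LOGIN, so the parser can presumably record which
+ * TLVs were seen in a single u64 bitmap (see struct fip_content.mask below)
+ * and callers can test e.g. (mask & FIP_MASK(LOGIN)) for a LOGIN descriptor,
+ * while FIP_CASE(LOGIN) maps the on-wire type value (242) back to that index.
+ */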
+
+#define ADV_EXT_TYPE(FIPT) ADV_EXT_TYPE_##FIPT
+#define ADV_EXT_IDX(FIPT) ADV_EXT_IDX_##FIPT
+
+#define GUID_FORMAT "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x"
+#define MGID_PREFIX_FMT "%02x:%02x:%02x:%02x:%02x"
+#define GUID_ARG(g) (g)[0], (g)[1], (g)[2], (g)[3], (g)[4], (g)[5], (g)[6], (g)[7]
+#define MGID_PRE_ARG(g) (g)[0], (g)[1], (g)[2], (g)[3], (g)[4]
+
+enum {
+       FIP_TYPE(VENDOR_ID)     = 13,
+       FIP_TYPE(ADDRESS)       = 240,
+       FIP_TYPE(GW_INFORMATION)= 241,
+       FIP_TYPE(LOGIN)         = 242,
+       FIP_TYPE(VHUB_UPDATE)   = 243,
+       FIP_TYPE(VHUB_TABLE)    = 244,
+       FIP_TYPE(VNIC_IDENTITY) = 245,
+       FIP_TYPE(PARTITION)     = 246,
+       FIP_TYPE(GW_IDENTIFIER) = 248,
+       FIP_TYPE(KA_PARAMS)     = 249,
+       FIP_TYPE(EXT_DESC)      = 254,
+};
+
+enum {
+       FIP_TYPE_IDX(VENDOR_ID),
+       FIP_TYPE_IDX(ADDRESS),
+       FIP_TYPE_IDX(GW_INFORMATION),
+       FIP_TYPE_IDX(LOGIN),
+       FIP_TYPE_IDX(VHUB_UPDATE),
+       FIP_TYPE_IDX(VHUB_TABLE),
+       FIP_TYPE_IDX(VNIC_IDENTITY),
+       FIP_TYPE_IDX(PARTITION),
+       FIP_TYPE_IDX(GW_IDENTIFIER),
+       FIP_TYPE_IDX(KA_PARAMS),
+       FIP_TYPE_IDX(EXT_DESC),
+};
+
+enum {
+       ADV_EXT_TYPE(CAP)        = 40,
+       ADV_EXT_TYPE(BOOT)       = 18,
+       ADV_EXT_TYPE(LAG)        = 41,
+       ADV_EXT_TYPE(MEMBER)     = 42,
+       ADV_EXT_TYPE(PC_ID)      = 43, /* Power Cycle ID */
+       ADV_EXT_TYPE(CTRL_IPORT) = 240,
+};
+
+enum {
+       ADV_EXT_IDX(CAP),
+       ADV_EXT_IDX(BOOT),
+       ADV_EXT_IDX(LAG),
+       ADV_EXT_IDX(PC_ID),
+       ADV_EXT_IDX(CTRL_IPORT),
+};
+
+
+enum {
+       EPORT_STATE_DOWN = 0,
+       EPORT_STATE_UP = 1,
+};
+
+enum fip_packet_type {
+       FIP_DISCOVER_UCAST = 0,
+       FIP_DISCOVER_MCAST = 1
+};
+
+enum {
+       FIP_TABLE_HDR_MIDDLE = 0,
+       FIP_TABLE_HDR_FIRST = 1,
+       FIP_TABLE_HDR_LAST = 2,
+       FIP_TABLE_HDR_ONLY = 3
+};
+
+enum {
+       FIP_EXT_LAG_W_POLICY_HOST  = 1,
+       FIP_EXT_LAG_W_POLICY_UCAST = 1 << 2
+};
+
+/* string "mellanox" */
+#define FIP_VENDOR_MELLANOX { 0x6d, 0x65, 0x6c, 0x6c, 0x61, 0x6e, 0x6f, 0x78 }
+
+
+#define FIP_TEST_PKT_LENGTH(port, length, type)                                   \
+        if ((length) != sizeof(type) + IB_GRH_BYTES) {                    \
+                vnic_dbg_fip(port->name, "Dump packet:"                   \
+                        "at %d unexpected size. length %d expected %d\n", \
+                         __LINE__, (int)length,                           \
+                         (int)(sizeof(type) + IB_GRH_BYTES));             \
+                return -EINVAL;                                           \
+        }
+
+/*
+ * copy string b to string a and NUL-terminate it.
+ * sizeof(a) must be >= sizeof(b) + 1.
+ */
+#define TERMINATED_MEMCPY(a,b)                 \
+       do {                                    \
+               ASSERT(sizeof(a)>=sizeof(b)+1); \
+               memcpy((a), (b), sizeof(b));    \
+               (a)[sizeof(b)] = '\0';          \
+       } while (0);
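+
+/*
+ * Usage sketch (illustrative only): with struct fip_login_tlv *fl, whose
+ * vnic_name field is a fixed u8[16], a caller can build a C string via
+ *     char name[17];
+ *     TERMINATED_MEMCPY(name, fl->vnic_name);
+ * The ASSERT catches destinations that are too small.
+ */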
+
+
+enum {
+       FIP_MAX_ADDR_TLVS = 6,
+       FIP_MAX_TLVS = 32,
+       FIP_MAX_EXT_DESC = 32,
+};
+
+struct fip_fip_type {
+       u8      type;
+       u8      length;
+       u16     reserved;
+};
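+
+/*
+ * The length field above appears to be counted in 4-byte words and to cover
+ * the whole descriptor including this header: e.g. struct fip_login_tlv is
+ * 52 bytes = FIP_LOGIN_LENGTH_1 (13) dwords and struct fip_ka_params_tlv is
+ * 24 bytes = FIP_ADVERTISE_KA_LENGTH (6) dwords.
+ */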
+
+struct fip_header_simple {
+       __be16 opcode;
+       u8 reserved;
+       u8 subcode;
+       __be16 list_length;
+       __be16 flags;
+};
+
+struct fip_vendor_id_tlv {
+       struct fip_fip_type ft;
+       u8      vendor_id[8];
+};
+
+struct fip_address_tlv {
+       struct fip_fip_type ft;
+       u8                  vendor_id[8];
+       __be32              gwtype_qpn;
+       __be16              sl_gwportid;
+       __be16              lid;
+       u8                  guid[8];
+};
+
+struct fip_gw_information_tlv {
+       struct fip_fip_type ft;
+       u8                  vendor_id[8];
+       u8                  h_nmac_mgid;
+       u8                  n_rss_mgid_tss_qpn;
+       __be16              n_rss_qpn_vnics;
+};
+
+struct fip_login_tlv {
+       struct fip_fip_type ft;
+       u8                  vendor_id[8];
+       __be16              mtu;
+       __be16              vnic_id;
+       __be16              flags_vlan;
+       u8                  mac[6];
+       u8                  eth_gid_prefix[5];
+       u8                  antispoofing;
+       __be16              vfields;
+       __be32              syndrom_ctrl_qpn;
+       u8                  vnic_name[16];
+};
+
+struct context_table_entry {
+       u8      v_rss_type;
+       u8      reserved;
+       u8      mac[ETH_ALEN];
+       __be32  qpn;
+       u8      reserved1;
+       u8      sl;
+       __be16  lid;
+};
+
+struct fip_vhub_update_tlv {
+       struct fip_fip_type ft;
+       u8                  vendor_id[8];
+       __be32              state_vhub_id;
+       __be32              tusn;
+};
+
+struct fip_vhub_table_tlv {
+       struct fip_fip_type ft;
+       u8                  vendor_id[8];
+       __be32              vp_vhub_id;
+       __be32              tusn;
+       __be16              hdr;
+       __be16              table_size;
+};
+
+struct fip_vnic_identity_tlv {
+       struct fip_fip_type ft;
+       u8                  vendor_id[8];
+       __be32              flags_vhub_id;
+       __be32              tusn;
+       __be16              vnic_id;
+       u8                  mac[6];
+       u8                  port_guid[8];
+       u8                  vnic_name[16];
+};
+
+struct fip_partition_tlv {
+       struct fip_fip_type ft;
+       u8                  vendor_id[8];
+       __be16              reserved;
+       __be16              pkey;
+};
+
+struct fip_gw_identifier_tlv {
+       struct fip_fip_type ft;
+       u8                  vendor_id[8];
+       u8                  sys_guid[8];
+       u8                  sys_name[32];
+       u8                  gw_port_name[8];
+};
+
+struct fip_ka_params_tlv {
+       struct fip_fip_type ft;
+       u8                  vendor_id[8];
+       __be32              adv_period;
+       __be32              ka_period;
+       __be32              vnic_ka_period;
+};
+
+struct fip_ext_desc_tlv {
+       struct fip_fip_type ft;
+       u8                  vendor_id[8];
+};
+
+struct fip_extended_type {
+       u8      ext_type;
+       u8      len;
+       u8      reserved;
+       u8      mandatory;
+};
+
+struct fip_ext_type_cap {
+       struct fip_extended_type et;
+       u32                      reserved[4];
+};
+
+struct fip_ext_type_boot {
+       struct fip_extended_type et;
+       u8                       boot_prio;
+       u8                       reserved;
+       __be16                   discovery_timeout;
+};
+
+struct fip_ext_type_lag_props {
+       struct fip_extended_type et;
+       u8                       gw_type;
+       u8                       reserved;
+       __be16                   lag_hash;
+       u8                       weight_policy_flags;
+       u8                       ca_threshold;
+       __be16                   link_down_pol_thresh;
+       u32                      reserved2[2];
+};
+
+struct fip_ext_type_power_cycle_id {
+       struct fip_extended_type et;
+       __be64                   power_cycle_id;
+       u32                      reserved;
+} __attribute__((packed));
+
+struct fip_ext_type_hostname {
+       struct fip_extended_type et;
+       u8                       hostname[32];
+};
+
+struct fip_ext_type_ctrl_iport {
+       struct fip_extended_type et;
+       u8                  vendor_id[8];
+       __be32              gwtype_qpn;
+       __be16              sl_gwportid;
+       __be16              lid;
+       u8                  guid[8];
+};
+
+struct fip_ext_type_lag_member {
+       __be32                   qpn;
+       __be16                   sl_gw_portid;
+       __be16                   lid;
+       u8                       guid[8];
+       u8                       eport_state;
+       u8                       reserved1;
+       u8                       weight;
+       u8                       link_utilization;
+       u32                      reserved2;
+};
+
+struct fip_ext_type_lag_members {
+       struct fip_extended_type et;
+       struct fip_ext_type_lag_member lagm[0];
+};
+
+struct fip_ext_group {
+       struct fip_ext_desc_tlv *fed[FIP_MAX_EXT_DESC];
+       int                      num;
+};
+
+struct fip_address_group {
+       struct fip_address_tlv *fa[FIP_MAX_ADDR_TLVS];
+       int                     num;
+};
+
+struct fip_context_group {
+       struct context_table_entry *cte;
+       int                         num;
+};
+
+struct fip_content {
+       struct fip_eoib_ver *eoib_ver;
+       struct fip_header_simple *fh;
+       struct fip_vendor_id_tlv *fvend;
+       struct fip_address_group fa;
+       struct fip_gw_information_tlv *fgwi;
+       struct fip_login_tlv *fl;
+       struct fip_vhub_update_tlv *fvu;
+       struct fip_vhub_table_tlv *fvt;
+       struct fip_vnic_identity_tlv *fvi;
+       struct fip_partition_tlv *fp;
+       struct fip_gw_identifier_tlv *fgid;
+       struct fip_ka_params_tlv *fka;
+        struct fip_ext_group fed;
+       struct fip_context_group cte;
+       u64     mask;
+       u16     offsets[FIP_MAX_TLVS];
+       int     num;
+};
+
+/**************************************************************************/
+/*                           packet format structs                        */
+/**************************************************************************/
+#define VENDOR_ID_LENGTH 8
+
+struct fip_eoib_ver {
+       u8 version;
+       u8 reserved[3];
+};
+
+struct fip_fip_header {
+       __be16 opcode;
+       u8 reserved;
+       u8 subcode;
+       __be16 list_length;
+       __be16 flags;
+       struct fip_fip_type type;
+       u8 vendor_id[VNIC_VENDOR_LEN];
+};
+
+struct fip_discover_base {
+       struct fip_fip_type type;
+       u8 vendor_id[VNIC_VENDOR_LEN];
+       u32 qpn;
+       u16 sl_port_id;
+       u16 lid;
+       u8 guid[GUID_LEN];
+};
+
+struct eoib_adv_gw_info { /* Gabi */
+       struct fip_fip_type type; 
+       u8 vendor_id[VNIC_VENDOR_LEN];
+       u8 system_guid[GUID_LEN];
+       u8 system_name[VNIC_SYSTEM_NAME_LEN];
+       u8 gw_port_name[VNIC_GW_PORT_NAME_LEN];
+};
+
+/* keep alive information */
+struct eoib_adv_ka_info { /* Gabi */
+       struct fip_fip_type type; 
+       u8 vendor_id[VNIC_VENDOR_LEN];
+       u32 gw_adv_period;
+       u32 gw_period;
+       u32 vnic_ka_period;
+};
+
+struct eoib_advertise {
+       struct fip_eoib_ver version;
+       struct fip_fip_header fip;
+       struct fip_discover_base base;
+       struct fip_fip_type type_1;
+       u8 vendor_id[VNIC_VENDOR_LEN];
+       u8 flags;
+       u8 reserved;
+       u16 num_net_vnics;
+       struct eoib_adv_gw_info gw_info; /* Gabi */
+       struct eoib_adv_ka_info ka_info; /* Gabi */
+};
+
+struct syndrom_dword {
+       u8 syndrom;
+       u8 reserved[3];
+};
+
+union syn_qp_ctrl {
+       struct syndrom_dword syn;
+       u32 ctl_qpn;
+};
+
+struct eoib_login {
+       struct fip_eoib_ver             eoib_ver;
+       struct fip_header_simple        fh;
+       struct fip_vendor_id_tlv        fvend;
+       struct fip_address_tlv          fa;
+       struct fip_login_tlv            fl;
+};
+
+struct fip_solicit_legacy {
+       struct fip_eoib_ver version;
+       struct fip_header_simple fh;
+       struct fip_vendor_id_tlv fvend;
+       struct fip_address_tlv addr;
+};
+
+struct fip_solicit_new {
+       struct fip_eoib_ver version;
+       struct fip_header_simple fh;
+       struct fip_vendor_id_tlv fvend;
+       struct fip_address_tlv addr;
+       struct fip_ext_desc_tlv ext;
+       struct fip_ext_type_cap ext_cap;
+        struct fip_ext_type_hostname ext_hostname;
+};
+
+union fip_vhub_id {
+       struct {
+               u8 flags;
+               u8 reserved[3];
+       } flags;
+       u32 vhub_id;
+};
+
+struct eoib_context_table {
+       struct fip_eoib_ver version;
+       struct fip_fip_header fip;
+       struct fip_fip_type type_1;
+       u8 vendor_id[VNIC_VENDOR_LEN];
+       union fip_vhub_id vhub_id;
+       u32 tusn;
+       u8 flags;
+       u8 reserved;
+       u16 table_size;
+       /* here come the context entries */
+};
+
+/* this is the number of DWORDS to subtract from type_1->length
+ * to get the size of the entries / 4 (size in dwords from the start
+ * of the vendor_id field until the first context entry, + 1 for checksum).
+ */
+#define FIP_TABLE_SUB_LENGTH 6
+
+/*
+ * eoib_host_update will be used for vHub context requests,
+ * keep alives and logouts
+ */
+struct eoib_host_update {
+       struct fip_eoib_ver version;
+       struct fip_fip_header fip;
+       struct fip_fip_type type_1;
+       u8 vendor_id[VNIC_VENDOR_LEN];
+       union fip_vhub_id vhub_id;
+       u32 tusn;
+       u16 vnic_id;
+       u8 mac[ETH_ALEN];
+       u8 port_guid[GUID_LEN];
+       u8 vnic_name[VNIC_NAME_LEN];
+};
+
+enum fip_packet_fields {
+       EOIB_FIP_OPCODE = 0xFFF9,
+       FIP_FIP_HDR_LENGTH = 3,
+       FIP_FIP_HDR_TYPE = 13,
+
+       /* keep all subcodes here */
+       FIP_HOST_SOL_SUB_OPCODE = 0x1,
+       FIP_GW_ADV_SUB_OPCODE = 0x2,
+       FIP_HOST_LOGIN_SUB_OPCODE = 0x3,
+       FIP_GW_LOGIN_SUB_OPCODE = 0x4,
+       FIP_HOST_LOGOUT_SUB_OPCODE = 0x5,
+       FIP_GW_UPDATE_SUB_OPCODE = 0x6,
+       FIP_GW_TABLE_SUB_OPCODE = 0x7,
+       FIP_HOST_ALIVE_SUB_OPCODE = 0x8,
+       FIP_MAX_SUBCODES,
+       /* end subcodes section */
+
+       FIP_FIP_FCF_FLAG = 0x1,
+       FIP_FIP_SOLICITED_FLAG = 0x2,
+       FIP_FIP_ADVRTS_FLAG = 0x4,
+       FIP_FIP_FP_FLAG = 0x80,
+       FIP_FIP_SP_FLAG = 0x40,
+
+       FIP_BASIC_LENGTH = 7,
+       FIP_BASIC_TYPE = 240,
+
+       FIP_ADVERTISE_LENGTH_1 = 4,
+       FIP_ADVERTISE_TYPE_1 = 241,
+       FIP_ADVERTISE_HOST_VLANS = 0x80,
+       FIP_ADVERTISE_NUM_VNICS_MASK = 0x0FFF,
+       FIP_ADVERTISE_N_RSS_SHIFT = 12,
+       FIP_ADVERTISE_HOST_EN_MASK = 0x80,
+       FIP_ADVERTISE_ALL_VLAN_GW_MASK = 0x60,
+       FIP_ADVERTISE_GW_PORT_ID_MASK = 0x0FFF,
+       FIP_ADVERTISE_SL_SHIFT = 12,
+
+       FIP_ADVERTISE_GW_LENGTH = 15,
+       FIP_ADVERTISE_GW_TYPE = 248,
+
+       FIP_ADVERTISE_KA_LENGTH = 6,
+       FIP_ADVERTISE_KA_TYPE = 249,
+
+       FIP_LOGIN_LENGTH_1 = 13,
+       FIP_LOGIN_TYPE_1 = 242,
+       FIP_LOGIN_LENGTH_2 = 4,
+       FIP_LOGIN_TYPE_2 = 246,
+
+       FIP_LOGIN_V_FLAG = 0x8000,
+       FIP_LOGIN_M_FLAG = 0x4000,
+       FIP_LOGIN_VP_FLAG = 0x2000,
+       FIP_LOGIN_H_FLAG = 0x1000,
+       FIP_LOGIN_VLAN_MASK = 0x0FFF,
+       FIP_LOGIN_DMAC_MGID_MASK = 0x3F,
+       FIP_LOGIN_RSS_MGID_MASK = 0x0F,
+       FIP_LOGIN_RSS_MASK = 0x10,
+       FIP_LOGIN_RSS_SHIFT = 4,
+       FIP_LOGIN_CTRL_QPN_MASK = 0xFFFFFF,
+       FIP_LOGIN_VNIC_ID_BITS = 16,
+       FIP_LOGIN_ALL_VLAN_GW_FLAG = 0x0040,
+
+       FIP_LOGOUT_LENGTH_1 = 13,
+       FIP_LOGOUT_TYPE_1 = 245,
+
+       FIP_HOST_UPDATE_LENGTH = 13,
+       FIP_HOST_UPDATE_TYPE = 245,
+       FIP_HOST_VP_FLAG = 0x01,
+       FIP_HOST_U_FLAG = 0x80,
+       FIP_HOST_R_FLAG = 0x40,
+
+       FIP_CONTEXT_UP_LENGTH = 9,
+       FIP_CONTEXT_UP_TYPE = 243,
+       FIP_CONTEXT_UP_EPORT_MASK = 0x30,
+       FIP_CONTEXT_UP_EPORT_SHIFT = 4,
+       FIP_CONTEXT_V_FLAG = 0x80,
+       FIP_CONTEXT_RSS_FLAG = 0x40,
+       FIP_CONTEXT_TYPE_MASK = 0x0F,
+
+       FIP_CONTEXT_TBL_TYPE = 244,
+       FIP_CONTEXT_TBL_SEQ_MASK = 0xC0,
+       FIP_CONTEXT_TBL_SEQ_FIRST = 0x40,
+       FIP_CONTEXT_TBL_SEQ_LAST = 0x80,
+
+       FKA_ADV_PERIOD = 8000,  /* in mSecs */
+       FKA_ADV_MISSES = 3
+};
+
+enum fip_login_syndroms {
+       FIP_SYNDROM_SUCCESS = 0,
+       FIP_SYNDROM_HADMIN_REJECT = 1,
+       FIP_SYNDROM_GW_RESRC = 2,
+       FIP_SYNDROM_NO_NADMIN = 3,
+       FIP_SYNDROM_UNRECOGNISED_HOST = 4,
+       FIP_SYNDROM_UNSUPPORTED_PARAM = 5,
+       FIP_SYNDROM_GW_IS_LAG_MEMBER = 6,
+       FIP_SYNDROM_DUPLICATE_ADDRESS = 7,
+};
+
+/*
+ * Send a multicast or unicast solicit packet. The multicast packet is sent
+ * to the discover mcast group. Unicast packets are sent to the dqpn + dlid
+ * supplied. The dlid, dqpn, sl are ignored for multicast packets.
+ * The function returns 0 on success and an error code on failure.
+*/
+int fip_solicit_send(struct fip_discover *discover,
+                    enum fip_packet_type multicast, u32 dqpn,
+                    u16 dlid, u8 sl, int new_prot);
+
+/*
+ * Send a unicast login packet. This function supports both host and
+ * network admined logins. The function returns 0 on success and
+ * an error code on failure.
+*/
+int fip_login_send(struct fip_vnic_data *vnic);
+
+int fip_logout_send(struct fip_vnic_data *vnic);
+
+/*
+ * This function creates and sends a few types of packets (all ucast):
+ *   vHub context request - new=1, logout=0
+ *   vHub context update / vnic keep alive - new=0, logout=0
+ *   vnic logout - new=0, logout=1
+*/
+int fip_update_send(struct fip_vnic_data *vnic, int request_new, int logout);
+
+/*
+ * Check if a received packet is a FIP packet, and if so return its subtype.
+ * The FIP type is also returned in fip_type and can be either EOIB_FIP_OPCODE
+ * or FCOIB_FIP_OPCODE. If the packet is not a FIP packet -EINVAL is returned.
+*/
+int fip_pkt_parse(char *buffer, int length, int *fip_type);
+
+/*
+ * The caller already knows that this is a FIP packet; return its subtype.
+*/
+int fip_pkt_get_subtype_bh(char *buffer);
+
+/*
+ * parse a packet that is suspected of being an advertise packet. The function
+ * returns 0 for a valid advertise packet and an error code otherwise. The
+ * packet's "interesting" details are returned in data.
+*/
+int fip_advertise_parse_bh(struct fip_discover *discover, struct fip_content *fc,
+                          struct fip_gw_data *data);
+
+/*
+ * parse a packet that is suspected of being a login ack packet. The function
+ * returns 0 for a valid login ack packet and an error code otherwise. The
+ * packet's "interesting" details are returned in data.
+*/
+int fip_login_parse(struct fip_discover *discover, struct fip_content *fc,
+                   struct fip_login_data *data);
+
+static inline int _map_generic_pkt(struct vnic_port *port,
+                                  struct fip_ring_entry *tx_ring_entry,
+                                  void *mem, int pkt_size)
+{
+       /* alloc packet to be sent */
+       tx_ring_entry->mem = mem;
+
+       /* map packet to bus */
+       tx_ring_entry->bus_addr =
+           ib_dma_map_single(port->dev->ca,
+                             tx_ring_entry->mem, pkt_size, DMA_TO_DEVICE);
+
+       if (unlikely(ib_dma_mapping_error(port->dev->ca,
+                                         tx_ring_entry->bus_addr))) {
+               vnic_warn(port->name,
+                         "send_generic_pkt failed to map to pci\n");
+               return -ENOMEM;
+       }
+       tx_ring_entry->length = pkt_size;
+
+       return 0;
+}
+
+static inline int alloc_map_fip_buffer(struct ib_device *ca,
+                                      struct fip_ring_entry *me,
+                                      int size, gfp_t mask)
+{
+       me->mem = kmalloc(size, mask);
+       if (!me->mem) {
+               vnic_warn(ca->name, "failed to alloc memory (%d)\n", size);
+               return -ENOMEM;
+       }
+
+       me->bus_addr = ib_dma_map_single(ca, me->mem, size, DMA_FROM_DEVICE);
+       if (unlikely(ib_dma_mapping_error(ca, me->bus_addr))) {
+               kfree(me->mem);
+               vnic_warn(ca->name, "ib_dma_mapping_error failed\n");
+               return -ENOMEM;
+       }
+       me->length = size;
+       me->entry_posted = 0;
+
+       return 0;
+}
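+
+/*
+ * The buffer allocated above is an RX buffer (mapped DMA_FROM_DEVICE); it is
+ * presumably unmapped and freed later by the ring teardown helpers
+ * (fip_flush_rings()/fip_free_rings()) or recycled after a completion, so
+ * callers must not kfree() me->mem themselves while the entry is posted.
+ */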
+
+#define DELAYED_WORK_CLEANUP_JIFFS     2
+#define FIP_MAX_PKT_PRINT_LENGTH       120
+#define        FIP_OP_RECV                     (1ul << 31)
+
+static const char fip_discover_mgid[GID_LEN] = {
+       0xFF, 0x12, 0xE0, 0x1B,
+       0x00, 0x06, 0x00, 0x00,
+       0x00, 0x00, 0x00, 0x00,
+       0x00, 0x00, 0x00, 0x00};
+static const char fip_solicit_mgid[GID_LEN] = {
+       0xFF, 0x12, 0xE0, 0x1B,
+       0x00, 0x07, 0x00, 0x00,
+       0x00, 0x00, 0x00, 0x00,
+       0x00, 0x00, 0x00, 0x00};
+
+
+/* TODO - remove this: for initial debug only */
+void fip_dbg_dump_raw_pkt(int level, void *buff,
+                         int length, int is_tx, char *name);
+enum {
+       FIP_ETH_HEADER_LEN = 14,
+       FIP_ENCAP_LEN = 4,
+       FIP_PROTOCOL_RX_SIZE = 16,      /* must be power of 2 */
+       FIP_PROTOCOL_TX_SIZE = 64,      /* must be power of 2 */
+       FIP_LOGIN_RX_SIZE = 64, /* must be power of 2 */
+       FIP_LOGIN_TX_SIZE = 64,         /* must be power of 2 */
+
+       /* timeout in seconds between LOGIN and ACK */
+       FIP_LOGIN_TIMEOUT = 8,
+       FIP_RESOLICIT_TIME = 8,
+
+       IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + FIP_ENCAP_LEN,
+};
+
+struct fip_rcv_pkt {
+       struct list_head list;
+       struct fip_content *fc;
+       int length;
+       void *mem;
+};
+
+/*
+ * Alloc the discover CQ, QP. Configure the QP to RTS.
+ * alloc the RX + TX rings and queue work for discover
+ * finite state machine code. If complete it set, it clears
+ * possible previous GW / VNIC data structs on init.
+ */
+int fip_discover_init(struct vnic_port *port, struct fip_discover *discover,
+                     u16 pkey, int complete);
+
+/*
+ * free the discover TX and RX rings, QP and CQ. If complete 
+ * is set, it clears possible previous GW / VNIC data structs
+ * by using a "complete" flush; otherwise vnic data is preserved.
+*/
+int fip_discover_cleanup(struct vnic_port *port, struct fip_discover *discover, int complete);
+
+/*
+ * send a single multicast packet.
+ * return 0 on success, other on failure.
+*/
+int fip_mcast_send(struct vnic_port *port, struct ib_qp *qp,
+                  unsigned int wr_id, u64 mapping,
+                  int size, u16 pkey_index, struct vnic_mcast *mcast);
+/*
+ * send a single unicast packet.
+ * return 0 on success, other on failure.
+*/
+int fip_ucast_send(struct vnic_port *port, struct ib_ah *ah,
+                  struct ib_qp *qp,
+                  unsigned int wr_id, u64 mapping,
+                  int size, u16 pkey_index, u32 dest_qpn, u16 dlid,
+                  u32 qkey, u8 sl);
+/*
+ * Configure a newly allocated QP and move it
+ * from reset->init->RTR->RTS
+ */
+int fip_init_qp(struct vnic_port *port, struct ib_qp *qp,
+               u16 pkey_index, char *name);
+
+/*
+ * allocs a single rx buffer (of size size), maps it to the pci bus
+ * and posts it to the qp for receive. The id parameter is used
+ * to keep track of work request when completion is received.
+ * kernel and bus address are returned in mem_entry.
+ * returns 0 on success else failure.
+ * id used to identify entry in receive queue.
+ */
+int fip_post_receive(struct vnic_port *port, struct ib_qp *qp, int size,
+                    int _id, struct fip_ring_entry *mem_entry, char *name);
+
+/* triggered by a core event */
+void fip_qp_to_reset(struct ib_qp *qp, char *name);
+void fip_flush_rings(struct vnic_port *port,
+                    struct ib_cq *cq,
+                    struct ib_qp *qp,
+                    struct fip_ring *rx_ring,
+                    struct fip_ring *tx_ring,
+                    char *name);
+void fip_free_rings(struct vnic_port *port,
+                   struct fip_ring *rx_ring,
+                   struct fip_ring *tx_ring,
+                   char *name);
+
+/*
+ * This function allocates the tx buffers and initializes the head and
+ * tail indexes.
+ */
+int fip_init_tx(int size, struct fip_ring *tx_ring, char *name);
+
+/*
+ * Configure the discover QP. This includes configuring rx+tx,
+ * moving the discover QP to RTS and creating the tx and rx rings
+ */
+int fip_init_rx(struct vnic_port *port, int ring_size, struct ib_qp *qp,
+               struct fip_ring *rx_ring, char *name);
+
+/*
+ * This is a general purpose CQ completion function that handles
+ * completions on RX and TX rings. It can serve all users that are
+ * using RX and TX rings.
+ * RX completions are distinguished from TX completions by the MSB that is set
+ * for RX and clear for TX. For RX, the memory is unmapped from the PCI,
+ * The head is incremented. For TX the memory is unmapped and then freed.
+ * The function returns the number of packets received.
+*/
+int fip_comp(struct vnic_port *port,
+            struct ib_cq *cq,
+            struct fip_ring *rx_ring,
+            struct fip_ring *tx_ring,
+            char *name);
+
+/*
+ * This function is the driving engine of the vnic logic. It manages the
+ * vnics state machines.
+ * Some of the states in the state machine could have been removed because
+ * they contain "actions" and not states. Still it is easier to maintain
+ * the code this way and it gives an easy mechanism for exception handling
+ * and retries.
+ * Only call this function from fip_wq context.
+*/
+void fip_vnic_fsm(struct work_struct *work);
+
+/*
+ * Mark the vnic for deletion and trigger a delayed call to the cleanup
+ * function. In the past the vnic was moved to another list but this
+ * might cause vnic duplication if new vnics are added to the GW. Even
+ * if the vnic is being flushed we need to know it is there.
+ *
+ * Note: This deletion method ensures that all pending vnic work requests
+ * are cleared without dependency of the calling context.
+*/
+void fip_vnic_close(struct fip_vnic_data *vnic, enum fip_flush flush);
+
+/*
+ * Free vnic resources. This includes closing the data vnic (data QPs etc)
+ * and the discovery resources. If the vnic can be totally destroyed (no
+ * pending work) the vnic will be removed from the GW list and its memory
+ * freed. If not, the vnic will not be freed and the function will return an
+ * error. The caller needs to call this function again to complete the operation.
+*/
+int fip_vnic_destroy(struct fip_vnic_data *vnic);
+
+struct fip_vnic_data *fip_vnic_alloc(struct vnic_port *port,
+                                    struct fip_gw_data *gw,
+                                    int hadmin,
+                                    u16 vnic_id);
+
+/*
+ * Look for a vnic in the GW vnic list. The search key used is either the vnic_id
+ * that is unique, or the mac+vlan pair. A match on either key will result in the
+ * return of the vnic. Both keys are necessary because the host assigned delete
+ * flow might not have access to the vnic_id. The search disregards vnics that
+ * are undergoing full flush (they will be removed soon).
+*/
+struct fip_vnic_data *fip_vnic_find_in_list(struct fip_gw_data *gw,
+                                           u16 vnic_id, u8 *mac,
+                                           u16 vlan, u8 vlan_used);
+
+/*
+ * process an incoming login ack packet. The packet was already parsed and
+ * its data was placed in *data. The function creates RX and TX rings for the
+ * vnic and starts the multicast join procedure.
+ * This function should not be called for packets other than login ack packets.
+*/
+void fip_vnic_login_ack_recv(struct fip_vnic_data *vnic,
+                            struct fip_login_data *data);
+
+/*
+ * This function should be called when the building of a vhub context
+ * table is done and the vnic state should transition to CONNECTED.
+*/
+int fip_vnic_tbl_done(struct fip_vnic_data *vnic);
+int fip_vnic_mcast_recnct(struct fip_vnic_data *vnic);
+
+/*
+ * Init the vnic's vHub table data structures, before using them
+ */
+void vhub_ctx_init(struct fip_vnic_data *vnic);
+void vhub_table_free(struct vhub_elist *elist);
+
+/*
+ * Clear and free the vnic's vHub context table data structures.
+ */
+void vhub_ctx_free(struct fip_vnic_data *vnic);
+
+/*
+ * This function handles a vhub context table packet. The table will
+ * be processed only if we do not have an up-to-date local copy of
+ * our own. The table update supports multi-packet tables so care
+ * must be taken in building the complete table.
+*/
+int vhub_handle_tbl(struct fip_vnic_data *vnic, struct fip_content *fc,
+                   u32 vhub_id, u32 tusn);
+
+/*
+ * This function handles a vhub context update packets. There are three flows
+ * in handling update packets. The first is before the main table is up
+ * to date, the second is after the table is up to date but before it was
+ * passed to the ownership of the data vnic (login struct) and the local
+ * lists are freed, and the last is when the table maintenance is done
+ * by the data vnic. This function handles all cases.
+*/
+int vhub_handle_update(struct fip_vnic_data *vnic,
+                      u32 vhub_id, u32 tusn,
+                      struct vnic_table_entry *data);
+
+/*
+ * This function writes the main vhub table to the data (login) vnic.
+ * You should call it when the data vnic is ready for it and after the
+ * table is up to date (and the update list was applied to the main list)
+ */
+int fip_vnic_write_tbl(struct fip_vnic_data *vnic);
+
+/* sysfs entries for hadmin vNics*/
+int vnic_create_hadmin_dentry(struct fip_vnic_data *vnic);
+void vnic_delete_hadmin_dentry(struct fip_vnic_data *vnic);
+void extract_memb_extended(struct fip_ext_type_lag_members *ext_lag_membs,
+                          int ext_length,                        
+                          struct lag_members *lagm,
+                          char *name);
+int handle_member_update(struct fip_vnic_data *vnic, struct lag_members *lm);
+int extract_vhub_extended(struct fip_ext_desc_tlv *fed,
+                          struct fip_vnic_data *vnic);
+static inline int send_generic_ucast_pkt(struct vnic_port *port,
+                                        struct ib_ah *ah,
+                                        struct fip_ring *tx_ring,
+                                        void *mem, int pkt_size,
+                                        struct ib_qp *qp,
+                                        int pkey_index,
+                                        u32 dst_qpn, u16 dst_lid,
+                                        u32 qkey, u8 sl)
+{
+       int index, rc;
+       unsigned long flags;
+       unsigned long tail;
+
+       /*
+        * the ring head is only updated here at task level; take the ring lock
+        * to serialize concurrent senders and to sample the tail consistently
+        */
+       spin_lock_irqsave(&tx_ring->ring_lock, flags);
+       index = tx_ring->head & (tx_ring->size - 1);
+
+       vnic_dbg_fip(port->name, "send ucast packet\n");
+
+       spin_lock(&tx_ring->head_tail_lock);
+       tail = tx_ring->tail;
+       spin_unlock(&tx_ring->head_tail_lock);
+
+       /* ring full try again */
+       if (tx_ring->head - tail >=  tx_ring->size) {
+               vnic_warn(port->name, "send_generic_pkt ring full: QPN 0x%x: tail=%ld head=%ld diff=%ld\n",
+                         qp->qp_num, tx_ring->tail, tx_ring->head, tx_ring->head - tx_ring->tail);
+               rc = -EAGAIN;
+               goto err;
+       }
+
+
+       rc = _map_generic_pkt(port, &tx_ring->ring[index], mem, pkt_size);
+       if (rc)
+               goto err;
+
+       rc = fip_ucast_send(port, ah, qp, index,
+                           tx_ring->ring[index].bus_addr,
+                           pkt_size, pkey_index, dst_qpn, dst_lid,
+                           qkey, sl);
+
+       if (rc) {
+               vnic_warn(port->name, "fip_ucast_send() failed (%d)\n", rc);
+               rc = -ENODEV;
+               goto error_unmap_dma;
+       }
+
+       tx_ring->head++;
+
+       spin_unlock_irqrestore(&tx_ring->ring_lock, flags);
+       return 0;
+
+error_unmap_dma:
+       ib_dma_unmap_single(port->dev->ca,
+                           tx_ring->ring[index].bus_addr,
+                           pkt_size, DMA_TO_DEVICE);
+err:
+       spin_unlock_irqrestore(&tx_ring->ring_lock, flags);
+       return rc;
+}
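+
+/*
+ * Ownership note (as the code above is written): on success the caller's
+ * buffer is handed to the TX ring and is later unmapped and freed by the
+ * completion path (see fip_comp()); on any failure the DMA mapping (if it was
+ * established) is undone here, but the buffer itself is still owned by the
+ * caller, who must free it.
+ */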
+
+static inline const char *eport_state_str(int state)
+{
+       switch (state) {
+       case EPORT_STATE_DOWN: return "Down";
+       case EPORT_STATE_UP: return "Up";
+       default: return "Invalid";
+       }
+}
+
+#endif /* _VNIC_FIP_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.c
new file mode 100644 (file)
index 0000000..71829aa
--- /dev/null
@@ -0,0 +1,2183 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+#include "vnic_fip_pkt.h"
+
+#define FIP_MAX_PKT_PRINT_LENGTH 120
+
+static void fip_purge_gws(struct work_struct *work);
+static void fip_discover_gw_fsm(struct work_struct *work);
+static void fip_discover_hadmin_update(struct work_struct *work);
+static void fip_discover_fsm(struct work_struct *work);
+void fip_close_gw(struct fip_gw_data *gw, enum fip_flush flush);
+
+/* TODO - remove this: for initial debug only */
+void fip_dbg_dump_raw_pkt(int level, void *buff,
+                         int length, int is_tx, char *name)
+{
+       int i;
+       int tmp_len;
+       u32 *data_ptr;
+       unsigned char *tmp_data_ptr;
+
+       if (!(vnic_msglvl & VNIC_DEBUG_PKT_DUMP))
+               return;
+
+       printk(KERN_DEBUG "%s %s: packet length is %d\n",
+              is_tx ? "TX" : "RX", name, length);
+
+       length = (length > FIP_MAX_PKT_PRINT_LENGTH) ?
+               FIP_MAX_PKT_PRINT_LENGTH : length;
+
+       tmp_len = (length >> 2) + 1;
+       data_ptr = (u32 *)buff;
+       for (i = 0; i < tmp_len; i++) {
+               if (!is_tx && i == IB_GRH_BYTES >> 2)
+                       printk(KERN_DEBUG "========================\n");
+               tmp_data_ptr = (unsigned char *)&data_ptr[i];
+               printk(KERN_DEBUG "%02x %02x %02x %02x \n",
+                          tmp_data_ptr[0], tmp_data_ptr[1],
+                          tmp_data_ptr[2], tmp_data_ptr[3]);
+       }
+}
+
+/*
+ * Configure the discover QP. This includes configuring rx+tx,
+ * moving the discover QP to RTS and creating the tx and rx rings
+ */
+int fip_discover_start_rings(struct fip_discover *discover,
+                            struct fip_ring *rx_ring,
+                            struct fip_ring *tx_ring,
+                            struct ib_cq *cq,
+                            struct ib_qp *qp)
+{
+       int rc;
+
+       rc = fip_init_tx(tx_ring->size, tx_ring, discover->name);
+       if (rc) {
+               vnic_warn(discover->name, "fip_init_tx failed rc %d\n", rc);
+               /* set the RX ring size to 0 as an indication of the failure
+                  so RX rings won't be freed; no need to set tx_ring->size
+                  since the fip_init_tx() error flow will handle it */
+               rx_ring->size = 0;
+               return rc;
+       }
+
+       rc = fip_init_rx(discover->port, rx_ring->size, qp, rx_ring, discover->name);
+       if (rc) {
+               vnic_warn(discover->name, "fip_init_rx returned %d\n", rc);
+               goto release_queues;
+       }
+
+       return 0;
+
+release_queues:
+       fip_flush_rings(discover->port, cq, qp, rx_ring, tx_ring, discover->name);
+       fip_free_rings(discover->port, rx_ring, tx_ring, discover->name);
+
+       return rc;
+}
+
+int fip_discover_init_rings(struct vnic_port *port,
+                           struct fip_discover *discover,
+                           struct fip_ring *rx_ring,
+                           struct fip_ring *tx_ring,
+                           struct ib_cq **cq,
+                           struct ib_qp **qp,
+                           ib_comp_handler comp_handler)
+{
+       struct ib_qp_init_attr qp_init_attr;
+       struct ib_device *ca = port->dev->ca;
+
+
+       *cq = ib_create_cq(ca, comp_handler, NULL, discover,
+                          rx_ring->size + tx_ring->size, 0);
+       if (IS_ERR(*cq)) {
+               vnic_warn(discover->name, "failed to create CQ\n");
+               goto out;
+       }
+
+       memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+       qp_init_attr.cap.max_send_wr = tx_ring->size;
+       qp_init_attr.cap.max_recv_wr = rx_ring->size;
+       qp_init_attr.cap.max_send_sge = 1;
+       qp_init_attr.cap.max_recv_sge = 1;
+       qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+       qp_init_attr.qp_type = IB_QPT_UD;
+       qp_init_attr.send_cq = *cq;
+       qp_init_attr.recv_cq = *cq;
+
+       *qp = ib_create_qp(port->pd, &qp_init_attr);
+       if (IS_ERR(*qp)) {
+               vnic_warn(discover->name, "failed to create QP\n");
+               goto error_free_cq;
+       }
+
+       /* move QP to RTS */
+       if (fip_init_qp(discover->port, *qp, discover->pkey_index, discover->name)) {
+               vnic_warn(discover->name, "fip_init_qp failed for  qp\n");
+               goto error_free_qp;
+       }
+
+       /* init RX + TX rings */
+       if (fip_discover_start_rings(discover, rx_ring, tx_ring, *cq, *qp)) {
+               vnic_warn(discover->name, "failed to start rings\n");
+               goto error_free_qp;
+       }
+
+       /* enable receiving CQ comps, triggers fip_discover_comp()  */
+       if (ib_req_notify_cq(*cq, IB_CQ_NEXT_COMP)) {
+               vnic_warn(discover->name, "ib_req_notify_cq failed for cq\n");
+               goto error_release_rings;
+       }
+
+       return 0;
+
+error_release_rings:
+       fip_flush_rings(discover->port, *cq, *qp, rx_ring, tx_ring, discover->name);
+       fip_free_rings(discover->port, rx_ring, tx_ring, discover->name);
+error_free_qp:
+       ib_destroy_qp(*qp);
+error_free_cq:
+       ib_destroy_cq(*cq);
+out:
+       *qp = NULL;
+       *cq = NULL;
+       return -ENODEV;
+}
+
+/*
+ * This function handles completions of both TX and RX
+ * packets. RX packets are unmapped, lightly parsed, moved to a list
+ * and passed to thread processing. TX packets are unmapped and freed.
+ * Note: this function is called from interrupt context
+ */
+static void fip_discover_comp(struct ib_cq *cq, void *discover_ptr)
+{
+       struct fip_discover *discover = discover_ptr;
+
+       /* handle completions. On RX packets this will call discover_process_rx
+        * from thread context to continue processing */
+       if (fip_comp(discover->port, discover->cq,
+                    &discover->rx_ring, &discover->tx_ring,
+                    discover->name))
+               fip_discover_process_rx(discover);
+}
+
+/*
+ * Alloc the discover CQ, QP. Configure the QP to RTS.
+ * alloc the RX + TX rings and queue work for discover
+ * finite state machine code.
+ */
+int fip_discover_init(struct vnic_port *port, struct fip_discover *discover,
+                     u16 pkey, int complete)
+{
+       int rc;
+
+       discover->port = port;
+       discover->flush = FIP_NO_FLUSH;
+       discover->state = FIP_DISCOVER_INIT;
+       discover->rx_ring.size = FIP_PROTOCOL_RX_SIZE;
+       discover->tx_ring.size = FIP_PROTOCOL_TX_SIZE;
+       discover->new_prot_gws = 0;
+       discover->old_prot_gws = 0;
+
+       /* This is in preparation for pkey discovery */
+
+       init_completion(&discover->flush_complete);
+
+       INIT_DELAYED_WORK(&discover->fsm_task, fip_discover_fsm);
+       INIT_DELAYED_WORK(&discover->cleanup_task, fip_purge_gws);
+       INIT_DELAYED_WORK(&discover->hadmin_update_task, fip_discover_hadmin_update);
+       INIT_WORK(&discover->pkt_rcv_task_bh, fip_discover_process_rx_bh);
+       spin_lock_init(&discover->rcv_list.lock);
+       INIT_LIST_HEAD(&discover->rcv_list.list);
+       spin_lock_init(&discover->lock);
+
+
+       if (complete) {
+               discover->pkey = pkey;
+               INIT_LIST_HEAD(&discover->gw_list);
+               init_rwsem(&discover->l_rwsem);
+               sprintf(discover->name, "%s_P%x", port->name, discover->pkey);
+       }
+       INIT_LIST_HEAD(&discover->hadmin_cache);
+       vnic_mcast_root_init(&discover->mcast_tree);
+
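+       /* bring up the rings and start the discover FSM only if the
+        * configured pkey exists on this port */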
+       if (!ib_find_pkey(port->dev->ca, port->num, discover->pkey, &discover->pkey_index)) {
+               rc = fip_discover_init_rings(port, discover, &discover->rx_ring,
+                                            &discover->tx_ring, &discover->cq,
+                                            &discover->qp, fip_discover_comp);
+               if (rc) {
+                       vnic_warn(discover->name, "discover init failed rc=%d\n", rc);
+                       return rc;
+               }
+
+               /* start discover FSM code */
+               /* calls fip_discover_fsm() */
+               queue_delayed_work(fip_wq, &discover->fsm_task, 0);
+       } else {
+               vnic_warn(discover->name, "Configured PKEY 0x%X is not supported on port\n", discover->pkey);
+               discover->pkey_index = ILLEGAL_PKEY_INDEX;
+       }
+
+
+       return 0;
+}
+
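+/*
+ * Drop any packets still queued on the discover RX list: detach the list
+ * under the lock and free the queued entries outside of it.
+ */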
+void fip_recv_list_flush(struct fip_discover *discover)
+{
+       struct list_head discov_recv_local;
+       struct fip_rcv_pkt *rcv, *rcv1;
+       unsigned long flags;
+
+       INIT_LIST_HEAD(&discov_recv_local);
+
+       spin_lock_irqsave(&discover->rcv_list.lock, flags);
+       list_replace_init(&discover->rcv_list.list, &discov_recv_local);
+       spin_unlock_irqrestore(&discover->rcv_list.lock, flags);
+
+       list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) {
+               list_del(&rcv->list);
+               kfree(rcv);
+       }
+       return;
+}
+
+/*
+ * free the discover TX and RX rings, QP and CQ.
+ * Must not be called from fip wq context.
+ */
+int fip_discover_cleanup(struct vnic_port *port, struct fip_discover *discover, int complt)
+{
+       if (discover->state == FIP_DISCOVER_OFF)
+               return -EINVAL;
+
+       /* move FSM to flush state and wait for the FSM
+        * to finish whatever it is doing before we continue
+        */
+       vnic_dbg_mark();
+       init_completion(&discover->flush_complete);
+       discover->flush = complt ? FIP_FULL_FLUSH : FIP_PARTIAL_FLUSH;
+       cancel_delayed_work(&discover->fsm_task);
+#ifndef _BP_WORK_SYNC
+       cancel_delayed_work_sync(&discover->hadmin_update_task);
+#else
+       cancel_delayed_work(&discover->hadmin_update_task);
+       flush_workqueue(fip_wq);
+#endif
+       /* flush any hadmin entries leftovers */
+       {
+               struct fip_hadmin_cache *hadmin, *hadmin_t;
+
+               spin_lock_irq(&discover->lock);
+               list_for_each_entry_safe(hadmin, hadmin_t,
+                                        &discover->hadmin_cache, next) {
+                       list_del(&hadmin->next);
+                       kfree(hadmin);
+               }
+               spin_unlock_irq(&discover->lock);
+       }
+
+       /* calls fip_discover_fsm() */
+       queue_delayed_work(fip_wq, &discover->fsm_task, 0);
+       vnic_dbg_mark();
+       /* calls fip_discover_fsm() */
+       wait_for_completion(&discover->flush_complete);
+       vnic_dbg_mark();
+
+       /* make sure that discover FSM is idle */
+#ifndef _BP_WORK_SYNC
+       cancel_delayed_work_sync(&discover->fsm_task);
+#else
+       cancel_delayed_work(&discover->fsm_task);
+       flush_workqueue(fip_wq);
+#endif
+
+       if (discover->pkey_index != ILLEGAL_PKEY_INDEX) {
+               fip_flush_rings(port, discover->cq, discover->qp,
+                               &discover->rx_ring, &discover->tx_ring,
+                               discover->name);
+               fip_free_rings(port, &discover->rx_ring, &discover->tx_ring,
+                              discover->name);
+
+               fip_recv_list_flush(discover);
+               if (discover->qp)
+                       ib_destroy_qp(discover->qp);
+               discover->qp = NULL;
+
+               if (discover->cq)
+                       ib_destroy_cq(discover->cq);
+               discover->cq = NULL;
+       }
+
+       return 0;
+}
+
+/*
+ * This function runs in interrupt context.
+ * It does sanity checking of the packet, moves it to a list and passes
+ * handling to a thread.
+ */
+void fip_discover_process_rx(struct fip_discover *discover)
+{
+       struct vnic_port *port = discover->port;
+       int mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum);
+       int rc;
+       int queue_packet, one_or_more_queued = 0;
+       struct fip_rcv_pkt *rcv, *rcv1;
+       struct list_head discov_recv_local;
+       int index;
+       struct fip_content *fc;
+       int err;
+       struct fip_ring_entry *ring;
+
+       INIT_LIST_HEAD(&discov_recv_local);
+
+       if (discover->flush != FIP_NO_FLUSH)
+               return;
+
+       while (discover->rx_ring.head != discover->rx_ring.tail) {
+               fc = NULL;
+               queue_packet = 0;
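+               /* rx_ring.size is assumed to be a power of two, so masking
+                * the tail yields the ring index */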
+               index = discover->rx_ring.tail & (discover->rx_ring.size - 1);
+               ring = &discover->rx_ring.ring[index];
+
+               if (ring->entry_posted == 1 &&
+                   discover->state == FIP_DISCOVER_SOLICIT) {
+                       fc = kzalloc(sizeof *fc, GFP_ATOMIC);
+                       if (likely(fc)) {
+                               /* login is the first state we RX packets in */
+                               rc = fip_packet_parse(port, ring->mem + IB_GRH_BYTES,
+                                                     ring->length - IB_GRH_BYTES, fc);
+                               if (!rc)
+                                       fip_discover_rx_packet(&queue_packet, fc);
+                       } else
+                               vnic_warn(discover->name, "allocation failed\n");
+               }
+               if (queue_packet) {
+                       int length;
+
+                       length = ring->length - IB_GRH_BYTES;
+                       rcv = kmalloc(sizeof *rcv, GFP_ATOMIC);
+                       if (!rcv) {
+                               vnic_dbg_fip(discover->name, "failed kmalloc\n");
+                               kfree(fc);
+                       } else {
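+                               /* allocate and map a fresh RX buffer so the
+                                * filled one can be handed to the bottom half */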
+                               struct fip_ring_entry me;
+
+                               err = alloc_map_fip_buffer(port->dev->ca, &me,
+                                                          mtu_size, GFP_ATOMIC);
+                               if (err) {
+                                       kfree(fc);
+                                       kfree(rcv);
+                               } else {
+                                       rcv->length = length;
+                                       rcv->fc = fc;
+                                       rcv->mem = ring->mem;
+                                       list_add_tail(&rcv->list, &discov_recv_local);
+                                       one_or_more_queued++;
+                                       ib_dma_unmap_single(port->dev->ca,
+                                                           ring->bus_addr,
+                                                           mtu_size, DMA_FROM_DEVICE);
+                                       *ring = me;
+                               }
+                       }
+               } else
+                        kfree(fc);
+
+               rc = fip_post_receive(port, discover->qp,
+                                     FIP_UD_BUF_SIZE(discover->port->max_mtu_enum),
+                                     index, ring, discover->name);
+               if (rc)
+                       vnic_warn(discover->name, "fip_post_receive rc %d\n", rc);
+
+               discover->rx_ring.tail++;
+       }
+
+       if (one_or_more_queued) {
+               spin_lock(&discover->lock);
+               if (likely(discover->flush == FIP_NO_FLUSH)) {
+                       spin_lock(&discover->rcv_list.lock);
+                       list_splice_init(&discov_recv_local, discover->rcv_list.list.prev);
+                       spin_unlock(&discover->rcv_list.lock);
+                       /* calls fip_discover_process_rx_bh */
+                       queue_work(fip_wq, &discover->pkt_rcv_task_bh);
+                       spin_unlock(&discover->lock);
+               } else {
+                       spin_unlock(&discover->lock);
+                       list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) {
+                               list_del(&rcv->list);
+                               kfree(rcv->fc);
+                               kfree(rcv->mem);
+                               kfree(rcv);
+                       }
+               }
+       }
+
+       return;
+}
+
+/*
+ * This function is the RX packet handler bottom half. It runs on the fip wq.
+ */
+void fip_discover_process_rx_bh(struct work_struct *work)
+{
+       struct fip_discover *discover =
+               container_of(work, struct fip_discover, pkt_rcv_task_bh);
+       int rc;
+       struct list_head discov_recv_local;
+       struct fip_rcv_pkt *rcv, *rcv1;
+       unsigned long flags;
+
+       INIT_LIST_HEAD(&discov_recv_local);
+
+       /* the irqsave is needed because debug kernels above 2.6.27 complain about
+        * a hard-irq-safe -> hard-irq-unsafe lock order on discover.lock */
+       spin_lock_irqsave(&discover->rcv_list.lock, flags);
+       list_replace_init(&discover->rcv_list.list, &discov_recv_local);
+       spin_unlock_irqrestore(&discover->rcv_list.lock, flags);
+
+       if (discover->flush != FIP_NO_FLUSH) {
+               list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) {
+                       list_del(&rcv->list);
+                       kfree(rcv->fc);
+                       kfree(rcv->mem);
+                       kfree(rcv);
+               }
+               return;
+       }
+
+       list_for_each_entry_safe(rcv, rcv1, &discov_recv_local, list) {
+               rc = fip_discover_rx_packet_bh(discover, rcv->fc);
+               if (rc)
+                       vnic_warn(discover->name, "discover_rx_packet rc %d\n", rc);
+
+               list_del(&rcv->list);
+               kfree(rcv->fc);
+               kfree(rcv->mem);
+               kfree(rcv);
+       }
+       return;
+}
+
+static inline int fip_close_all_vnics(struct fip_gw_data *gw, enum fip_flush flush)
+{
+       struct fip_vnic_data *vnic;
+       int open_vnics = 0;
+
+       vnic_dbg_func(gw->discover->name);
+
+       list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+               open_vnics++;
+               fip_vnic_close(vnic, flush);
+       }
+       return open_vnics;
+}
+
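+/*
+ * Bring the GW's vNics in line with the advertised policy: refresh or close
+ * host-admin vNics according to hadmined_en, then allocate network-admin
+ * vNics up to gw_num_vnics.
+ */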
+static int fip_gw_create_vnics(struct fip_gw_data *gw)
+{
+       struct fip_vnic_data *vnic;
+       unsigned long first_free_vnic;
+       struct fip_vnic_send_info gw_address;
+       int i;
+
+       gw->info.gw_num_vnics = (gw->info.gw_num_vnics > FIP_MAX_VNICS_PER_GW) ?
+               FIP_MAX_VNICS_PER_GW : gw->info.gw_num_vnics;
+
+
+       gw->info.gw_num_vnics = vnic_net_admin ? gw->info.gw_num_vnics : 0;
+       fip_vnic_create_gw_param(&gw_address, gw->info.gw_qpn, VNIC_FIP_QKEY,
+                                gw->info.gw_lid,  vnic_gw_ctrl_sl(gw));
+       /* for host admined  */
+       list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+               if (vnic->hadmined) {
+                       if (gw->info.hadmined_en)
+                               fip_hadmin_vnic_refresh(vnic, &gw_address);
+                       else {
+                               vnic_dbg_fip(gw->discover->name,
+                                            "fip_gw_create_vnics hadmin disabled, "
+                                            "close open hadmin vnics\n");
+                               fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+                       }
+               }
+       }
+
+       /* for network admined  */
+       for (i = gw->vnic_count; i < gw->info.gw_num_vnics; i++) {
+               vnic_dbg_fip(gw->discover->name, "fip_gw_create_vnics available"
+                            " vnics %d needed %d\n",
+                            gw->vnic_count, gw->info.gw_num_vnics);
+
+               /* start network assigned at half array. leave first half to host admin */
+               first_free_vnic = find_first_zero_bit(gw->n_bitmask,
+                                                     FIP_MAX_VNICS_PER_GW);
+               if (first_free_vnic >= FIP_MAX_VNICS_PER_GW)
+                       return -ENOMEM;
+
+               vnic = fip_vnic_alloc(gw->discover->port, gw, 0 /* hadmin */, first_free_vnic);
+               if (!vnic)
+                       return -ENOMEM;
+
+               fip_vnic_set_gw_param(vnic, &gw_address);
+               set_bit(first_free_vnic, gw->n_bitmask);
+               list_add_tail(&vnic->gw_vnics, &gw->vnic_list);
+               gw->vnic_count++;
+
+               /* calls fip_vnic_fsm() */
+               cancel_delayed_work(&vnic->vnic_task);
+               fip_vnic_fsm(&vnic->vnic_task.work);
+       }
+
+       return 0;
+}
+
+/*
+ * This function goes over the GW's vNics and closes network administered
+ * vNics that are not open and do not receive neighbor table info (there
+ * is no way for the BXM to tell the vNics to close before the
+ * vNic is listening to the neighbor tables).
+ */
+static int fip_gw_close_nonopen_vnics(struct fip_gw_data *gw)
+{
+       struct fip_vnic_data *vnic;
+       int closed_vnics = 0;
+
+       vnic_dbg_fip(gw->discover->name, "Try to close non open vnics\n");
+
+       list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+               vnic_dbg_fip(gw->discover->name, "check vnic %s, hadmin %d state %d\n",
+                            vnic->name, vnic->hadmined, vnic->state);
+               if (!vnic->hadmined && vnic->state < FIP_VNIC_VHUB_DONE) {
+                       vnic_dbg_fip(gw->discover->name, "closing vnic %s\n", vnic->name);
+                       fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+                       closed_vnics++;
+               }
+       }
+
+       return closed_vnics;
+}
+
+/* Permanently delete all vnics pending delete. The function goes over
+ * the list of vnics awaiting deletion and tries to delete them. If the
+ * vnic destructor returns an error value (currently busy) the function
+ * will requeue itself for another try. The function will also test if
+ * new vnics need to be added as a result of vnic removal.
+ */
+static void fip_purge_vnics(struct work_struct *work)
+{
+       struct fip_gw_data *curr_gw =
+               container_of(work,struct fip_gw_data, vnic_cleanup_task.work);
+       struct fip_vnic_data *vnic, *tmp_vnic;
+       int vnic_id, rc, del_cnt = 0, retry = 0;
+       unsigned long *bitmask;
+
+       vnic_dbg_fip(curr_gw->discover->name, "fip_purge_vnics\n");
+
+       list_for_each_entry_safe(vnic, tmp_vnic, &curr_gw->vnic_list, gw_vnics) {
+               enum fip_flush f;
+               vnic_id = vnic->vnic_id;
+               bitmask = vnic->hadmined ? NULL : curr_gw->n_bitmask;
+
+               /* If successful vnic is removed from list and destroyed */
+               f = vnic->flush;
+               if (f != FIP_NO_FLUSH) {
+                       rc = fip_vnic_destroy(vnic);
+                       if (!rc) {
+                               del_cnt++;
+                               if (f == FIP_FULL_FLUSH && bitmask)
+                                       clear_bit(vnic_id, bitmask);
+                       } else {
+                               retry |= rc;
+                       }
+               }
+
+               /* limit the number of vnics to purge in each loop to let other
+                * tasks on same wq to run (i.e., avoid starvation).
+                */
+               if (del_cnt > 2) {
+                       retry = 1;
+                       break;
+               }
+       }
+
+       /* This means we still have vnics that refuse to close retry later */
+       if (retry) {
+               vnic_dbg_mark();
+               /* calls fip_purge_vnics() */
+               queue_delayed_work(fip_wq, &curr_gw->vnic_cleanup_task, HZ / 10);
+       } else {
+               vnic_dbg_fip(curr_gw->discover->name, "fip_purge_vnics, all GW"
+                            " vnics closed\n");
+
+               if (curr_gw->hadmin_gw && curr_gw->state == FIP_GW_HOST_ADMIN && list_empty(&curr_gw->vnic_list)) {
+                       vnic_warn(curr_gw->discover->name,
+                                         "Removing Host admin GW %s with no vnics\n",
+                                         (char*)curr_gw->info.vol_info.gw_port_name);
+                       fip_close_gw(curr_gw, FIP_FULL_FLUSH);
+               }
+               /* test and open new vnics if vnics are missing */
+               /* ALITODO: after GW timeout, a vnic is re-created! why is that?
+               if (fip_gw_create_vnics(curr_gw)) {
+                       vnic_dbg_mark();
+                       queue_delayed_work(fip_wq,
+                                          &curr_gw->vnic_cleanup_task, HZ);
+               }
+               */
+       }
+}
+
+/*
+ * This function adds or removes a single host admined vNic on a GW.
+ * First the function searches for the vNic. The search
+ * disregards vNics that are undergoing a complete flush.
+ */
+int fip_gw_update_hadmin_gw(struct fip_gw_data *gw,
+                           struct fip_hadmin_cache *hadmin_entry)
+{
+       struct fip_vnic_data *vnic;
+       int vnic_id = hadmin_entry->vnic_id, rc = 0;
+
+       /* set bit 16 for hadmin vNics (by spec) */
+       vnic_id |= (1 << (VNIC_ID_LEN - 1));
+
+       vnic = fip_vnic_find_in_list(gw, vnic_id, hadmin_entry->mac,
+                                    hadmin_entry->vlan,
+                                    hadmin_entry->vlan_used);
+
+       /* remove: if vNic found - remove it and exit */
+       if (hadmin_entry->remove) {
+               if (vnic)
+                       fip_vnic_close(vnic, FIP_FULL_FLUSH);
+               else
+                       vnic_dbg_fip(gw->discover->name, "vNic to remove is"
+                                    " not found (name:%s mac:"MAC_6_PRINT_FMT
+                                    " vlan:%d id:%d)\n",
+                         hadmin_entry->interface_name,
+                         MAC_6_PRINT_ARG(hadmin_entry->mac),
+                         hadmin_entry->vlan, vnic_id);
+               goto out;
+       }
+
+       /* add: if vNic found - report error, otherwise add new vNic */
+       if (vnic) {
+               /* skip error reporting between child vNics conflict,
+                * as vnic_learn_mac() may learn same child while it's still
+                * pending. TODO: improve this to avoid such cases.
+                */
+               if (hadmin_entry->parent_used && vnic->parent_used)
+                       goto out;
+               vnic_warn(gw->discover->name, "vNic creation failed, duplicate"
+                         " vNic detected (name:%s mac:"MAC_6_PRINT_FMT
+                         " vlan:%d id:%d & existing name:%s mac:"
+                         MAC_6_PRINT_FMT" vlan:%d id:%d)\n",
+                         hadmin_entry->interface_name,
+                         MAC_6_PRINT_ARG(hadmin_entry->mac),
+                         hadmin_entry->vlan, vnic_id, vnic->interface_name,
+                         MAC_6_PRINT_ARG(vnic->login_data.mac),
+                         vnic->login_data.vlan, vnic->login_data.vnic_id);
+               goto out;
+       }
+
+#if 0
+       /* if the GW is in all_vlan mode,
+        * the host can only create vlans in this mode.
+        * However if it is not in all_vlan mode, the host must not create
+        * vlans in this mode */
+       if ((gw->info.all_vlan_gw && !hadmin_entry->all_vlan_gw
+            && hadmin_entry->vlan_used) ||
+            (!gw->info.all_vlan_gw && hadmin_entry->all_vlan_gw)) {
+               vnic_warn(gw->discover->name, "vnic creation failed, all_vlan"
+                         " gateway policy must be enforced between the gateway"
+                         "  and the host\n");
+               rc = -EINVAL;
+               goto out;
+       }
+#endif
+
+       vnic = fip_vnic_alloc(gw->discover->port, gw, 1 /* hadmin */, vnic_id);
+       if (!vnic) {
+               rc = -ENOMEM;
+               goto out;
+       }
+
+       /* hand over info from hadmin to vnic struct */
+       memcpy(vnic->login_data.mac, hadmin_entry->mac, sizeof(vnic->login_data.mac));
+       memcpy(vnic->interface_name, hadmin_entry->interface_name,
+              sizeof(vnic->interface_name));
+       vnic->login_data.vlan = hadmin_entry->vlan;
+       vnic->login_data.vp = hadmin_entry->vlan_used;
+       vnic->login_data.all_vlan_gw = hadmin_entry->all_vlan_gw;
+       memcpy(vnic->shared_vnic.ip, hadmin_entry->shared_vnic_ip,
+              sizeof(vnic->shared_vnic.ip));
+       memcpy(vnic->shared_vnic.emac, hadmin_entry->shared_vnic_mac,
+              sizeof(vnic->shared_vnic.emac));
+       vnic->shared_vnic.enabled = is_valid_ipv4(hadmin_entry->shared_vnic_ip);
+       vnic->vnic_id = vnic_id; /* will be overwritten later */
+       vnic->vlan_used = hadmin_entry->vlan_used;
+       vnic->parent_used = hadmin_entry->parent_used;
+       memcpy(vnic->parent_name, hadmin_entry->parent_name,
+              sizeof(vnic->parent_name));
+       vnic->qp_base_num = hadmin_entry->qp_base_num;
+       vnic->vlan = hadmin_entry->vlan;
+       vnic->cmd = hadmin_entry->cmd;
+       vnic->all_vlan_gw = hadmin_entry->all_vlan_gw;
+
+       /* create dentry */
+       rc = vnic_create_hadmin_dentry(vnic);
+       if (rc)
+               goto init_failed;
+
+       rc = fip_vnic_hadmin_init(gw->discover->port, vnic);
+       if (rc)
+               goto init_failed;
+
+       list_add_tail(&vnic->gw_vnics, &gw->vnic_list);
+
+       /* calls fip_vnic_fsm() */
+       fip_vnic_fsm(&vnic->vnic_task.work);
+
+       return 0;
+
+init_failed:
+       vnic_delete_hadmin_dentry(vnic);
+       kfree(vnic);
+out:
+       return rc;
+}
+
+/*
+ * Queue the GW for deletion and trigger a delayed call to the cleanup
+ * function.
+ * Note: this deletion method ensures that all pending GW work requests
+ * are cleared without depending on the calling context.
+ */
+void fip_close_gw(struct fip_gw_data *gw, enum fip_flush flush)
+{
+       enum fip_flush tmp_flush = gw->hadmin_gw ? flush : FIP_FULL_FLUSH;
+
+       if (tmp_flush == FIP_PARTIAL_FLUSH && gw->state < FIP_GW_HOST_ADMIN)
+               return;
+
+       /* close already in progress, disregard */
+       if (gw->flush >= tmp_flush)
+               return;
+
+       gw->flush = tmp_flush;
+       gw->info.gw_num_vnics = 0;
+       cancel_delayed_work(&gw->gw_task);
+
+       /* This is not mandatory but will save us time because there is a
+        * better chance that all vnics would be destroyed before trying to
+        * destroy the GW */
+       fip_close_all_vnics(gw, tmp_flush);
+
+       /* calls fip_purge_gws() */
+       queue_delayed_work(fip_wq, &gw->discover->cleanup_task, DELAYED_WORK_CLEANUP_JIFFS);
+}
+
+/*
+ * Free GW resources. This includes destroying the vnics. If the GW can be
+ * totally destroyed (no pending work for the GW and all the vnics have been
+ * destroyed) the GW will be removed from the GW list and its memory
+ * freed. If the GW cannot be closed at this time it will not be freed
+ * and the function will return an error.
+ * In this case the caller needs to call the function again to complete the
+ * operation.
+ * Do not call this function directly; use fip_close_gw instead.
+ */
+static int fip_free_gw(struct fip_discover *discover, struct fip_gw_data *gw)
+{
+       struct fip_vnic_data *vnic;
+       int vnic_close_fail = 0;
+
+       gw->info.gw_num_vnics = 0;
+
+       if (delayed_work_pending(&gw->gw_task))
+               return -EBUSY;
+
+       list_for_each_entry(vnic, &gw->vnic_list, gw_vnics)
+               vnic_close_fail |= (vnic->flush != FIP_NO_FLUSH);
+
+       /* true if vnics need to be closed */
+       /* if some of the vnics are still open return and retry later */
+       if (vnic_close_fail)
+               return -EBUSY;
+
+       if (delayed_work_pending(&gw->vnic_cleanup_task))
+               return -EBUSY;
+
+       /*
+        * it is possible that during gw removal we added the GW again. Test GW
+        * list to ensure it is not in the list already before adding it again.
+        */
+       if (gw->state > FIP_GW_HOST_ADMIN) {
+               if (gw->info.gw_prot_new)
+                       discover->new_prot_gws--;
+               else
+                       discover->old_prot_gws--;
+       }
+       if (gw->flush == FIP_PARTIAL_FLUSH) {
+               gw->state = FIP_GW_HOST_ADMIN;
+               gw->flush = FIP_NO_FLUSH;
+       } else {
+               list_del(&gw->list);
+               if (!IS_ERR(gw->pquery) && gw->query_id >= 0)
+                       ib_sa_cancel_query(gw->query_id, gw->pquery);
+               wait_for_completion(&gw->query_comp);
+               kfree(gw);
+       }
+       return 0;
+}
+
+/*
+ * Permanently delete all GWs pending delete. The function goes over
+ * the list of GWs awaiting deletion and tries to delete them. If the
+ * GW destructor returns an error value (currently busy) the function
+ * will requeue itself for another try.
+ */
+static void fip_purge_gws(struct work_struct *work)
+{
+       struct fip_discover *discover =
+               container_of(work, struct fip_discover, cleanup_task.work);
+       struct fip_gw_data *gw, *tmp_gw;
+       int gw_close_fail = 0;
+
+       down_write(&discover->l_rwsem);
+       list_for_each_entry_safe(gw, tmp_gw, &discover->gw_list, list) {
+               if (gw->flush != FIP_NO_FLUSH) {
+                       gw_close_fail |= fip_free_gw(discover, gw);
+               }
+       }
+       up_write(&discover->l_rwsem);
+
+       /* This means we still have GWs that refuse to close; retry later */
+       if (gw_close_fail) {
+               vnic_dbg_fip(discover->name, "still have open GWs\n");
+               /* calls fip_purge_gws() */
+               queue_delayed_work(fip_wq, &discover->cleanup_task,
+                                  DELAYED_WORK_CLEANUP_JIFFS);
+       } else {
+               vnic_dbg_fip(discover->name, "fip_purge_gws all gws"
+                            " closed and freed\n");
+       }
+}
+
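+/*
+ * Return non-zero once all GWs relevant to this flush level are gone:
+ * for a full flush the GW list must be empty, otherwise no GW may still
+ * be marked for flush.
+ */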
+static int fip_free_gw_done(struct fip_discover *discover, enum fip_flush flush)
+{
+       struct fip_gw_data *curr_gw;
+       int rc;
+
+       down_read(&discover->l_rwsem);
+       if (flush == FIP_FULL_FLUSH) {
+               rc = list_empty(&discover->gw_list);
+               up_read(&discover->l_rwsem);
+               return rc;
+       }
+
+       list_for_each_entry(curr_gw, &discover->gw_list, list) {
+               if (curr_gw->flush  != FIP_NO_FLUSH) {
+                       up_read(&discover->l_rwsem);
+                       return 0;
+               }
+       }
+
+       up_read(&discover->l_rwsem);
+       return 1;
+}
+
+/*
+ * Go over the GW list and try to close the GWs. It is possible that some
+ * of the GWs have pending work and therefore cannot be closed. We cannot
+ * sleep on this because we might be running in the same context as the one
+ * we are waiting for. The caller should call this function once and then
+ * poll fip_free_gw_done (after releasing the wq context) to test whether
+ * the free has completed.
+ */
+static int fip_free_gw_list(struct fip_discover *discover, enum fip_flush flush)
+{
+       struct fip_gw_data *curr_gw;
+
+       down_read(&discover->l_rwsem);
+       list_for_each_entry(curr_gw, &discover->gw_list, list)
+               fip_close_gw(curr_gw, flush);
+       up_read(&discover->l_rwsem);
+
+       vnic_dbg_fip(discover->name, "fip_free_gw_list not done\n");
+       return 0;
+}
+
+static inline void update_gw_address(struct fip_gw_data *gw,
+                                    struct fip_gw_data_info *new_gw_data)
+{
+       gw->info.gw_qpn = new_gw_data->gw_qpn;
+       gw->info.gw_lid = new_gw_data->gw_lid;
+       gw->info.gw_port_id = new_gw_data->gw_port_id;
+       gw->info.gw_sl = new_gw_data->gw_sl;
+       memcpy(gw->info.gw_guid, new_gw_data->gw_guid, sizeof gw->info.gw_guid);
+
+       vnic_dbg_fip(gw->discover->name, "GW address was modified. "
+                    "QPN: 0x%x, LID: 0x%x, guid: " GUID_FORMAT
+                    ", port id: %d, SL: %d\n", gw->info.gw_qpn,
+                    gw->info.gw_lid, GUID_ARG(gw->info.gw_guid),
+                    gw->info.gw_port_id, gw->info.gw_sl);
+       /* restart fsm to path query */
+       if (vnic_sa_query)
+               fip_discover_gw_fsm_move(gw, FIP_GW_CTRL_PATH_QUERY);
+}
+
+int fip_gw_modified(struct fip_gw_data *gw,
+                   struct fip_gw_data_info *new_gw_data)
+{
+       char *name = gw->discover->name;
+       ASSERT(new_gw_data);
+
+       vnic_dbg_fip(name, "fip_gw_modified called, gw_num_vnics %d -> %d\n",
+                    gw->info.gw_num_vnics, new_gw_data->gw_num_vnics);
+
+       if (memcmp(gw->info.gw_guid, new_gw_data->gw_guid,
+                  sizeof(gw->info.gw_guid)) ||
+           gw->info.gw_lid != new_gw_data->gw_lid ||
+           gw->info.gw_port_id != new_gw_data->gw_port_id ||
+           gw->info.gw_qpn != new_gw_data->gw_qpn ||
+           (!vnic_sa_query && gw->info.gw_sl != new_gw_data->gw_sl)) {
+               /* TODO: Make sure that the GW doesn't change the sl sent in solicitation */
+               /* In this case the GW address might be modified even
+                  in 'good flow' */
+               if (gw->info.gw_type == GW_TYPE_LAG &&
+                   gw->info.ext_lag.ucast)
+                       update_gw_address(gw, new_gw_data);
+               else {
+                       vnic_dbg_fip(name, "fip_gw_modified changing "
+                                    "unsupported parameter closing GW\n");
+                       fip_close_gw(gw, FIP_PARTIAL_FLUSH);
+               }
+       } else if (gw->info.gw_num_vnics < new_gw_data->gw_num_vnics) {
+               vnic_dbg_fip(name, "fip_gw_modified changing num "
+                            "vnics from %d to %d\n", gw->info.gw_num_vnics,
+                            new_gw_data->gw_num_vnics);
+               gw->info.gw_num_vnics = new_gw_data->gw_num_vnics;
+               if (fip_gw_create_vnics(gw))
+                       vnic_err(name, "fip_gw_create_vnics failed\n");
+
+       }  else if (gw->info.gw_num_vnics > new_gw_data->gw_num_vnics) {
+               gw->info.gw_num_vnics = new_gw_data->gw_num_vnics;
+               fip_gw_close_nonopen_vnics(gw);
+               if (gw->vnic_count < gw->info.gw_num_vnics)
+                       fip_gw_create_vnics(gw);
+               vnic_dbg_fip(name, "fip_gw_modified changing num "
+                            "vnics from %d to %d\n", gw->info.gw_num_vnics,
+                            new_gw_data->gw_num_vnics);
+       } else if (gw->info.n_rss_qpn != new_gw_data->n_rss_qpn) {
+               gw->info.n_rss_qpn = new_gw_data->n_rss_qpn;
+               vnic_dbg_fip(name, "fip_gw_modified changing n_rss_qpn "
+                            "from %d to %d\n", gw->info.n_rss_qpn,
+                            new_gw_data->n_rss_qpn);
+       } else if (gw->info.hadmined_en != new_gw_data->hadmined_en) {
+               if (fip_gw_create_vnics(gw))
+                       vnic_err(name, "fip_gw_create_vnics failed\n");
+       }
+
+       return 0;
+}
+
+static inline int is_none_zero_guid(u8 *guid)
+{
+       int i;
+       u8 ored = 0;
+
+       if (!guid)
+               return 0;
+
+       for (i = 0; i < 8; ++i)
+               ored |= guid[i];
+
+       return !!ored;
+}
+
+/*
+ * Look for a GW in the GW list.
+ * The search needs one identifier to identify the Box (either GUID or system name)
+ * and one identifier for the external port (port_id or eport_name).
+ * This function uses whatever data is available for the search since
+ * various callers do not have access to a single pair of ids.
+ * Use NULL for unknown strings and GW_PORT_ID_UNKNOWN for unknown port_id.
+ * GWs that are undergoing a complete flush are disregarded by the search.
+ */
+struct fip_gw_data *fip_find_gw_in_list(
+                               struct fip_discover *discover,
+                               int     port_id,
+                               u8      *eport_name,
+                               u8      *gw_guid,
+                               u8      *system_guid,
+                               u8      *system_name,
+                               int     is_login)
+{
+       struct fip_gw_data *curr_gw;
+       int use_guid = is_none_zero_guid(gw_guid);
+       int use_system_name = system_name && strlen(system_name) > 0;
+       int use_system_guid = is_none_zero_guid(system_guid);
+       int use_eport = eport_name && strlen(eport_name) > 0;
+       int use_port_id = port_id >= 0;
+       int port_id_pass;
+       int eport_match;
+
+       if (!((use_eport || use_port_id) &&
+            (use_guid || use_system_name || use_system_guid))) {
+               vnic_dbg_fip_v(discover->name,
+                              "fip_find_gw_in_list not enough param for search\n");
+               return NULL;
+       }
+
+       if (use_system_name)
+               vnic_dbg_fip_v(discover->name, "system name %s\n", system_name);
+
+       if (use_guid)
+               vnic_dbg_fip_v(discover->name, "gw guid "VNIC_GUID_FMT"\n",
+                              VNIC_GUID_RAW_ARG(gw_guid));
+
+       if (use_system_guid)
+               vnic_dbg_fip_v(discover->name, "system guid "VNIC_GUID_FMT"\n",
+                              VNIC_GUID_RAW_ARG(system_guid));
+
+       if (use_eport)
+               vnic_dbg_fip_v(discover->name, "eport %s\n", eport_name);
+
+       if (use_port_id)
+               vnic_dbg_fip_v(discover->name, "port_id 0x%x\n", port_id);
+
+       down_read(&discover->l_rwsem);
+       list_for_each_entry(curr_gw, &discover->gw_list, list) {
+               vnic_dbg_fip_v(discover->name, "check gw on eport %s, gw_guid "VNIC_GUID_FMT" "
+                              "system_guid "VNIC_GUID_FMT", flush %d\n",
+                              curr_gw->info.vol_info.gw_port_name,
+                              VNIC_GUID_RAW_ARG(curr_gw->info.gw_guid),
+                              VNIC_GUID_RAW_ARG(curr_gw->info.vol_info.system_guid),
+                              curr_gw->flush);
+
+               if (curr_gw->flush == FIP_FULL_FLUSH)
+                       continue;
+
+               /* for login ack, skip non connected GWs */
+               if (is_login && use_port_id && curr_gw->state == FIP_GW_HOST_ADMIN) /* skip dangling hadmined GWs */
+                       continue;
+
+               /* use the eport names only if you don't have port_id indexes.
+                * This is in order to enable port_id changes.
+                * In case of a host admin GW, ignore gw_port_id since the old GW
+                * will never be flushed and the new GW id can change */
+               port_id_pass = use_port_id && (curr_gw->info.gw_port_id != (u16)-1) && !(curr_gw->hadmin_gw && use_eport);
+               eport_match = (use_eport && !port_id_pass &&
+                        !strncmp(curr_gw->info.vol_info.gw_port_name,
+                                 eport_name,VNIC_GW_PORT_NAME_LEN)) ||
+                       (port_id_pass && (port_id == curr_gw->info.gw_port_id));
+               if (!eport_match)
+                       continue;
+
+               if (use_guid && !memcmp(curr_gw->info.gw_guid, gw_guid, GUID_LEN))
+                       goto found;
+
+               if (use_system_guid &&
+                   !memcmp(curr_gw->info.vol_info.system_guid,
+                           system_guid, GUID_LEN))
+                       goto found;
+
+               if (use_system_name &&
+                  !strncmp(curr_gw->info.vol_info.system_name, system_name,
+                           VNIC_SYSTEM_NAME_LEN))
+                       goto found;
+       }
+
+       up_read(&discover->l_rwsem);
+       vnic_dbg_fip(discover->name, "gw not found!\n");
+       return NULL;
+found:
+       if (curr_gw->hadmin_gw && use_eport && use_port_id &&
+               !strncmp(curr_gw->info.vol_info.gw_port_name,eport_name,VNIC_GW_PORT_NAME_LEN) &&
+               curr_gw->info.gw_port_id != port_id) {
+               vnic_info("%s:["VNIC_GUID_FMT"] %s eport ID changed from %d to %d\n",
+                                 curr_gw->info.vol_info.system_name,
+                                 VNIC_GUID_RAW_ARG(curr_gw->info.vol_info.system_guid),
+                                 curr_gw->info.vol_info.gw_port_name,
+                                 curr_gw->info.gw_port_id, port_id);
+       }
+
+       up_read(&discover->l_rwsem);
+       return curr_gw;
+}
+
+/*
+ * Alloc and init a new GW struct
+ */
+static struct fip_gw_data *fip_discover_create_gw(struct fip_discover *discover)
+{
+       struct fip_gw_data *gw_data;
+
+       gw_data = kzalloc(sizeof(struct fip_gw_data), GFP_KERNEL);
+       if (!gw_data)
+               goto out;
+
+       INIT_DELAYED_WORK(&gw_data->gw_task, fip_discover_gw_fsm);
+       INIT_DELAYED_WORK(&gw_data->vnic_cleanup_task, fip_purge_vnics);
+       INIT_LIST_HEAD(&gw_data->vnic_list);
+       gw_data->discover = discover;
+       gw_data->pquery = ERR_PTR(-ENODATA);
+       gw_data->query_id = -1;
+       init_completion(&gw_data->query_comp);
+       complete(&gw_data->query_comp);
+       mutex_init(&gw_data->mlock);
+
+out:
+       return gw_data;
+}
+
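+/*
+ * Worker for host-admin updates: drain the hadmin_cache list and add or
+ * remove host-admin vNics, creating a host-admin GW entry when no matching
+ * GW exists yet.
+ */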
+static void fip_discover_hadmin_update(struct work_struct *work)
+{
+       struct fip_discover *discover =
+               container_of(work, struct fip_discover,
+                            hadmin_update_task.work);
+       struct fip_hadmin_cache *hadmin_entry;
+       struct fip_hadmin_cache *hadmin_tmp;
+       struct fip_gw_data *curr_gw;
+       struct list_head hadmin_head;
+       char *name;
+       int flush, used_guid, rc;
+
+       /* move list from hadmin_cache to a temporary list */
+       spin_lock_irq(&discover->lock);
+       list_replace(&discover->hadmin_cache, &hadmin_head);
+       INIT_LIST_HEAD(&discover->hadmin_cache);
+       flush = discover->flush;
+       spin_unlock_irq(&discover->lock);
+
+       if (flush != FIP_NO_FLUSH)
+               goto out;
+
+       /* process hadmin list */
+       list_for_each_entry_safe(hadmin_entry, hadmin_tmp, &hadmin_head, next) {
+               name = (char *)(hadmin_entry->interface_name);
+               vnic_dbg_mac(name, "parent_used %d, remove %d\n",
+                            hadmin_entry->parent_used,
+                            hadmin_entry->remove);
+               if (hadmin_entry->parent_used) {
+                       rc = vnic_parent_update(discover->port, hadmin_entry->interface_name,
+                                               hadmin_entry->vnic_id, hadmin_entry->mac,
+                                               &(hadmin_entry->qp_base_num),
+                                               hadmin_entry->parent_name,
+                                               hadmin_entry->remove);
+                       if (rc)
+                               continue;
+               }
+
+               used_guid = is_valid_guid(hadmin_entry->system_guid);
+               curr_gw = fip_find_gw_in_list(discover, NOT_AVAILABLE_NUM,
+                                             hadmin_entry->eport_name,
+                                             NULL,
+                                             used_guid ? hadmin_entry->system_guid : NULL,
+                                             used_guid ? NULL : hadmin_entry->system_name, 0/* is_login */);
+               if (!hadmin_entry->remove) {
+                       /* in case no GW or GW is being removed create a new one */
+                       if (!curr_gw || curr_gw->flush == FIP_FULL_FLUSH) {
+                               curr_gw = fip_discover_create_gw(discover);
+                               if (!curr_gw) {
+                                       vnic_warn(discover->name, "failed to create hadmin GW\n");
+                                       continue;
+                               } else {
+                                       down_write(&discover->l_rwsem);
+                                       list_add_tail(&curr_gw->list, &discover->gw_list);
+                                       up_write(&discover->l_rwsem);
+                               }
+
+                               memcpy(curr_gw->info.vol_info.system_guid,
+                                      hadmin_entry->system_guid, GUID_LEN);
+                               memcpy(curr_gw->info.vol_info.gw_port_name,
+                                      hadmin_entry->eport_name,
+                                      VNIC_GW_PORT_NAME_LEN);
+                               if (used_guid)
+                                       strcpy(curr_gw->info.vol_info.system_name,
+                                              NOT_AVAILABLE_STRING);
+                               else
+                                       memcpy(curr_gw->info.vol_info.system_name,
+                                              hadmin_entry->system_name,
+                                              VNIC_SYSTEM_NAME_LEN);
+
+                               curr_gw->info.gw_port_id = hadmin_entry->gw_port_id;
+                               curr_gw->state = FIP_GW_HOST_ADMIN;
+                       }
+
+                       curr_gw->hadmin_gw = 1;
+                       fip_gw_update_hadmin_gw(curr_gw, hadmin_entry);
+               } else if (curr_gw)
+                       fip_gw_update_hadmin_gw(curr_gw, hadmin_entry);
+
+               list_del(&hadmin_entry->next);
+               kfree(hadmin_entry);
+       }
+
+out:
+       /* flush hadmin_tmp list and exit */
+       list_for_each_entry_safe(hadmin_entry, hadmin_tmp, &hadmin_head, next)
+               kfree(hadmin_entry);
+}
+
+static const char *gw_state_to_str(enum fip_gw_state state)
+{
+       switch (state) {
+       case FIP_GW_CONNECTED:
+               return "FIP_GW_CONNECTED";
+       case FIP_GW_CTRL_PATH_QUERY:
+               return "FIP_GW_CTRL_PATH_QUERY";
+       case FIP_GW_DATA_PATH_QUERY:
+               return "FIP_GW_DATA_PATH_QUERY";
+       case FIP_GW_HOST_ADMIN:
+               return "FIP_GW_HOST_ADMIN";
+       case FIP_GW_SEND_SOLICIT:
+               return "FIP_GW_SEND_SOLICIT";
+       default:
+               return "UNKNOWN";
+       }
+}
+
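+/*
+ * Format per-GW state for sysfs. Holds the port start/stop lock and the
+ * discover rwsem so the lists stay stable while printing.
+ */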
+int fip_gw_sysfs_show(struct vnic_port *port, char *buf)
+{
+       struct fip_gw_data *gw;
+       char *p = buf;
+       struct fip_discover *discover;
+
+       mutex_lock(&port->start_stop_lock);
+       list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+
+               down_read(&discover->l_rwsem);
+
+               list_for_each_entry(gw, &discover->gw_list, list) {
+                       p += _sprintf(p, buf, "IOA_PORT      %s:%d\n",
+                                     gw->discover->port->dev->ca->name,
+                                     gw->discover->port->num);
+                       p += _sprintf(p, buf, "BX_NAME       %s\n",
+                                     gw->info.vol_info.system_name);
+                       if (!(*(u64 *)(gw->info.vol_info.system_guid)))
+                               p += _sprintf(p, buf, "BX_GUID       %s\n", NOT_AVAILABLE_STRING);
+                       else
+                               p += _sprintf(p, buf, "BX_GUID       "VNIC_GUID_FMT"\n",
+                                             VNIC_GUID_RAW_ARG(gw->info.vol_info.system_guid));
+                       p += _sprintf(p, buf, "EPORT_NAME    %s\n", gw->info.vol_info.gw_port_name);
+                       p += _sprintf(p, buf, "EPORT_ID      %u\n", gw->info.gw_port_id);
+                       p += _sprintf(p, buf, "STATE         %s\n", gw_state_to_str(gw->state));
+                       p += _sprintf(p, buf, "GW_TYPE       %s\n", gw->info.gw_type == GW_TYPE_LAG ?
+                                     "AGGREGATED" : "LEGACY");
+                       p += _sprintf(p, buf, "PKEY          0x%x\n", discover->pkey);
+                       p += _sprintf(p, buf, "ALL_VLAN      %s\n",
+                                     gw->state == FIP_GW_CONNECTED ?
+                                     (gw->info.all_vlan_gw ? "yes" : "no") : NOT_AVAILABLE_STRING);
+                       p += _sprintf(p, buf, "CTRL_SL       %d\n", gw->ctrl_prec.sl);
+                       p += _sprintf(p, buf, "DATA_SL       %d\n", gw->data_prec.sl);
+                       p += _sprintf(p, buf, "\n");
+               }
+
+               up_read(&discover->l_rwsem);
+       }
+
+       mutex_unlock(&port->start_stop_lock);
+       return (p - buf);
+}
+
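+/*
+ * Handle a GW advertisement in thread context: find or create the GW entry,
+ * refresh its info, and drive the per-GW FSM based on whether the packet is
+ * a multicast advertisement or a unicast ACK.
+ */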
+static int fip_discover_rx_advertise_bh(struct fip_discover *discover,
+                                       struct fip_gw_data *advertise_data)
+{
+       struct fip_gw_data *gw_data;
+       int update_entry = 0;
+
+       /* see if we received advertise packets from this GW before */
+       gw_data = fip_find_gw_in_list(discover,
+                                     advertise_data->info.gw_port_id,
+                                     advertise_data->info.vol_info.gw_port_name,
+                                     advertise_data->info.gw_guid,
+                                     advertise_data->info.vol_info.system_guid,
+                                     advertise_data->info.vol_info.system_name, 0/* is_login */);
+
+       /*
+        * GW not found in GW list. Create a new GW structure
+        * and add it to the GW list. 
+        */
+       if (!gw_data) {
+               gw_data = fip_discover_create_gw(discover);
+               if (!gw_data) {
+                       vnic_dbg_fip(discover->name, "Could not create gw\n");
+                       return -ENOMEM;
+               }
+               gw_data->keep_alive_jiffies = jiffies;
+               
+               down_write(&discover->l_rwsem);
+               list_add_tail(&gw_data->list, &discover->gw_list);
+               up_write(&discover->l_rwsem);
+               update_entry = 1;
+       } else {
+               gw_data->keep_alive_jiffies = jiffies;
+               vnic_dbg_fip(discover->name, "gw_data->flush %d\n", gw_data->flush);
+               if (gw_data->flush != FIP_NO_FLUSH)
+                       return 0;
+
+               if (gw_data->state <= FIP_GW_SEND_SOLICIT)
+                       update_entry = 1;
+       }
+
+       /* If GW is in multicast state (based on received mcast packet),
+        * replace it with the newer up-to-date packet info.
+        */
+       if (update_entry) {
+               if (gw_data->state < FIP_GW_CTRL_PATH_QUERY) {
+                       down_write(&discover->l_rwsem);
+                       if (advertise_data->info.gw_prot_new)
+                               discover->new_prot_gws++;
+                       else
+                               discover->old_prot_gws++;
+                       up_write(&discover->l_rwsem);
+               }
+               memcpy(&gw_data->info, &advertise_data->info,
+                      sizeof(struct fip_gw_data_info));
+               if (gw_data->state < FIP_GW_SEND_SOLICIT)
+                       gw_data->state = vnic_sa_query? FIP_GW_CTRL_PATH_QUERY : FIP_GW_SEND_SOLICIT;
+       } else {
+               /* If the pc_id in the adv doesn't match the one
+                  saved - there was a power cycle, so we want to close
+                  the GW */
+               if (advertise_data->info.ext_pc_id.valid &&
+                   (advertise_data->info.ext_pc_id.power_cycle_id !=
+                    gw_data->info.ext_pc_id.power_cycle_id)) {
+                       vnic_dbg_fip_p0(discover->name, "received advertisement with "
+                                       "pc_id %llu when expecting %llu. closing the GW",
+                                        advertise_data->info.ext_pc_id.power_cycle_id,
+                                        gw_data->info.ext_pc_id.power_cycle_id);
+                       fip_close_gw(gw_data, FIP_PARTIAL_FLUSH);
+                       goto no_repost;
+               }
+
+               /* TBD: enforce discard ?? */
+               if (gw_data->info.gw_type != advertise_data->info.gw_type)
+                       vnic_dbg_fip_p0(discover->name, "gateway type must not change\n");
+
+               /* update GW descriptors that do not require additional processing.
+                  These will be updated as part of GW_MODIFY flow */
+               mutex_lock(&gw_data->mlock);
+               if (advertise_data->info.ext_pc_id.valid)
+                       memcpy(&gw_data->info.ext_pc_id, &advertise_data->info.ext_pc_id,
+                              sizeof(gw_data->info.ext_pc_id));
+
+               memcpy(&gw_data->info.vol_info, &advertise_data->info.vol_info,
+                      sizeof(gw_data->info.vol_info));
+               if (gw_data->info.ext_lag.valid) {
+                       gw_data->info.ext_lag.hash = advertise_data->info.ext_lag.hash;
+                       gw_data->info.ext_lag.ca = advertise_data->info.ext_lag.ca;
+                       gw_data->info.ext_lag.ca_thresh = advertise_data->info.ext_lag.ca_thresh;
+                       gw_data->info.ext_lag.weights_policy = advertise_data->info.ext_lag.weights_policy;
+               }
+               mutex_unlock(&gw_data->mlock);
+       }
+
+       /* if multicast advertisement received */
+       if (advertise_data->info.flags & FIP_RCV_MULTICAST) {
+               vnic_dbg_fip(discover->name, "FIP_RCV_MULTICAST ADVERTISE, state %d\n",
+                            gw_data->state);
+               /* we are beyond accepting mcast advertisement */
+               if (gw_data->state > FIP_GW_SEND_SOLICIT)
+                       goto out;
+
+               vnic_dbg_fip(discover->name, "received mcast advertise sending"
+                            " ucast solicit to GW qpn %d lid %d flags 0x%x\n",
+                            gw_data->info.gw_qpn, gw_data->info.gw_lid,
+                            gw_data->info.flags);
+       } else { /* unicast advertisement received */
+               int ack_received = advertise_data->info.flags & FIP_GW_AVAILABLE;
+
+               vnic_dbg_fip(discover->name, "received ucast advertise from GW "
+                            "qpn %d lid %d flags 0x%x, ack_received %s "
+                            "gw_num_vnics %d gw->state=%d, "
+                            VNIC_GUID_FMT"\n",
+                            gw_data->info.gw_qpn, gw_data->info.gw_lid,
+                            gw_data->info.flags, ack_received ? "yes" : "no",
+                            gw_data->info.gw_num_vnics, gw_data->state,
+                            VNIC_GUID_RAW_ARG(gw_data->info.gw_guid));
+
+               if (ack_received) {
+                       /* if this is first ACK received */
+                       switch (gw_data->state) {
+                       case FIP_GW_CTRL_PATH_QUERY:
+                               /*
+                               * in case we are in FIP_GW_CTRL_PATH_QUERY we wait until it completes
+                               * to move us to FIP_GW_SEND_SOLICIT
+                               */
+                               break;
+                       case FIP_GW_SEND_SOLICIT:
+                               /* in case we received an ack in this state we move to DATA_PATH_QUERY */
+                               gw_data->state = vnic_sa_query ? FIP_GW_DATA_PATH_QUERY : FIP_GW_CONNECTED;
+                               break;
+                       case FIP_GW_CONNECTED:
+                                /*
+                               * received an ACK and we are connected. we need to
+                               * check for changes in GW and apply them if needed
+                               */
+                               if (!fip_gw_modified(gw_data, &advertise_data->info))
+                                       gw_data->state = FIP_GW_CONNECTED;
+                               goto no_repost;
+                       default:
+                               break;
+                       }
+               } else  /* !ack_received */ {
+                       fip_close_gw(gw_data, FIP_PARTIAL_FLUSH);
+                       goto no_repost;
+               }
+               /*
+                * we don't accept ACKs in transient states.
+                * This should not be a problem since bursts of multiple ACKs
+                * are not an expected flow, and if the packets are similar
+                * (no updates) it doesn't matter anyway.
+                */
+       }
+
+out:
+       vnic_dbg_fip(discover->name, "out gw->state=%d\n", gw_data->state);
+       /*
+        * we call the GW FSM directly to handle the updated GW state
+        */
+       cancel_delayed_work(&gw_data->gw_task);
+       fip_discover_gw_fsm(&gw_data->gw_task.work);
+no_repost:
+       return 0;
+}
+
+/*
+ * This function handles a single received packet, which is expected to be a
+ * GW advertisement or a login ACK packet. The function first parses the
+ * packet, decides what the packet type is and then validates the packet
+ * according to its type. This function runs in ka_wq task context.
+ */
+void fip_discover_rx_packet(int *queue, struct fip_content *fc)
+{
+       *queue = 0;
+       switch (fc->fh->subcode) {
+       case FIP_GW_ADV_SUB_OPCODE:
+       case FIP_GW_LOGIN_SUB_OPCODE:
+               *queue = 1;
+               break;
+       default:
+               break;
+       }
+}
+
+/*
+ * Print FIP syndrome number and string
+ */
+static void fip_print_syndrome(struct fip_vnic_data *vnic, int synd) {
+       char *syndstr;
+
+       switch (synd) {
+       case FIP_SYNDROM_HADMIN_REJECT:
+               syndstr = "FIP_SYNDROM_HADMIN_REJECT";
+               break;
+       case FIP_SYNDROM_GW_RESRC:
+               syndstr = "FIP_SYNDROM_GW_RESRC";
+               break;
+       case FIP_SYNDROM_NO_NADMIN:
+               syndstr = "FIP_SYNDROM_NO_NADMIN";
+               break;
+       case FIP_SYNDROM_UNRECOGNISED_HOST:
+               syndstr = "FIP_SYNDROM_UNRECOGNISED_HOST";
+               break;
+       case FIP_SYNDROM_UNSUPPORTED_PARAM:
+               syndstr = "FIP_SYNDROM_UNSUPPORTED_PARAM";
+               break;
+       case FIP_SYNDROM_GW_IS_LAG_MEMBER:
+               syndstr = "FIP_SYNDROM_GW_IS_LAG_MEMBER";
+               break;
+       case FIP_SYNDROM_DUPLICATE_ADDRESS:
+               syndstr = "FIP_SYNDROM_DUPLICATE_ADDRESS";
+               break;
+       default:
+               syndstr = "FIP_OTHER";
+       }
+
+       vnic_warn(vnic->name, "SYNDROME 0x%x: %s\n",
+                 synd, syndstr);
+}
+
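+/*
+ * Handle a login ACK: locate the GW and vNic it belongs to, and for
+ * host-admin vNics validate the returned mac/vlan against the request
+ * before processing the login response.
+ */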
+static void handle_login_packet(struct fip_discover *discover,
+                               struct fip_login_data *login_data)
+{
+       struct fip_gw_data *gw;
+       struct fip_vnic_data *vnic;
+       int mac_vlan_refused = 0;
+       int synd;
+
+       /* find the GW that this login belongs to */
+       gw = fip_find_gw_in_list(discover,
+                                login_data->port_id,
+                                NULL,
+                                login_data->guid,
+                                NULL, NULL, 1/* is_login */);
+
+       if (!gw) {
+               vnic_warn(discover->name, "dropping login ack with vnic_id:%d mac:"MAC_6_PRINT_FMT
+                                 "  BX port_id:%d GUID: "VNIC_GUID_FMT", GW not found!\n",
+                                 login_data->vnic_id,
+                                 MAC_6_PRINT_ARG(login_data->mac),
+                                 login_data->port_id,
+                                 VNIC_GUID_RAW_ARG(login_data->guid));
+               return;
+       }
+       vnic = fip_vnic_find_in_list(gw, login_data->vnic_id,
+                                    login_data->mac,
+                                    login_data->vlan,
+                                    login_data->vp);
+       if (!vnic) {
+               vnic_warn(discover->name, "dropping login ack with vnic_id:%d mac:"MAC_6_PRINT_FMT
+                                 "  BX port_id:%d GUID: "VNIC_GUID_FMT", vnic not found!\n",
+                                 login_data->vnic_id,
+                                 MAC_6_PRINT_ARG(login_data->mac),
+                                 login_data->port_id,
+                                 VNIC_GUID_RAW_ARG(login_data->guid));
+               return;
+       }
+
+       /*
+        * For host administered vNICs the login and the login ack macs must
+        * be equal and different from all zeros. The login and the login
+        * ack must agree on vlan presence, and if a vlan is present the
+        * vlans must be identical. Otherwise, the request is rejected.
+        */
+       if (vnic->hadmined) {
+               if (!IS_ZERO_MAC(vnic->login_data.mac) &&
+                   memcmp(vnic->login_data.mac, login_data->mac, ETH_ALEN)) {
+                       vnic_dbg_fip(discover->name, "fip_discover_rx_packet"
+                                    " host admined mac refused\n");
+                       mac_vlan_refused = 1;
+               } else if (vnic->login_data.all_vlan_gw != login_data->all_vlan_gw)
+                       vnic_dbg_fip(discover->name,
+                                    "fip_discover_rx_packet"
+                                    " host and GW disagree on all_vlan mode\n");
+               /* If the host is not working in all_vlan_gw policy -
+                  check the requested vlan against the accepted */
+               else if (!gw->info.all_vlan_gw &&
+                          (vnic->login_data.vp != login_data->vp ||
+                           (login_data->vp == 1 &&
+                            vnic->login_data.vlan != login_data->vlan))) {
+                       vnic_dbg_fip(discover->name,
+                                    "fip_discover_rx_packet host"
+                                    " admined vlan refused\n");
+                       mac_vlan_refused = 1;
+               }
+       }
+
+       /* process a login packet for the specific vnic */
+       synd = (int)login_data->syndrome;
+       if (synd || mac_vlan_refused) {
+               char *vnic_name = vnic->hadmined ?
+                         (char *)vnic->interface_name : (char *)vnic->name;
+               /* print syndrome as long as backlog limit is not exceeded */
+               if (vnic->synd_backlog++ >= vnic_synd_backlog)
+                       return;
+
+               vnic_warn(discover->name, "%s login failed "
+                         "(mac "MAC_6_PRINT_FMT" vlan %d) "
+                         "backlog %d/%d\n",
+                         vnic_name,
+                         MAC_6_PRINT_ARG(vnic->mac_cache),
+                         (vnic->vlan_used ? vnic->vlan : -1),
+                         vnic->synd_backlog, vnic_synd_backlog);
+
+               if (mac_vlan_refused)
+                       vnic_warn(vnic->name, "MAC/VLAN refused\n");
+
+               fip_print_syndrome(vnic, synd);
+
+               if (synd == FIP_SYNDROM_UNRECOGNISED_HOST) {
+                       vnic_info("%s %s sending ucast solicit to Gateway\n",
+                                         discover->name, vnic_name);
+                       if (fip_solicit_send(gw->discover,
+                                    FIP_DISCOVER_UCAST,
+                                    gw->info.gw_qpn,
+                                    gw->info.gw_lid,
+                                    vnic_gw_ctrl_sl(gw),
+                                    gw->info.gw_prot_new))
+                               vnic_warn(discover->name, "%s Failed to send ucast solicit\n", vnic_name);
+               }
+       } else {
+               vnic->all_vlan_gw = !!((!vnic->hadmined && vnic->gw->info.all_vlan_gw) ||
+                                      (vnic->hadmined && vnic->login_data.all_vlan_gw));
+               fip_vnic_login_ack_recv(vnic, login_data);
+       }
+}
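+
+/*
+ * For reference, the host-admin MAC/VLAN check above can be read as the
+ * following predicate (a sketch that mirrors the code; macs_differ,
+ * vlan_presence_differs and vlans_differ are shorthand, not real symbols):
+ *
+ *     refused = (!IS_ZERO_MAC(login.mac) && macs_differ) ||
+ *               (!gw->info.all_vlan_gw &&
+ *                (vlan_presence_differs ||
+ *                 (vlan_used && vlans_differ)));
+ */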
+
+/*
+ * This function handles a single received packet that is expected to be
+ * a GW advertisement or a login ACK packet. The function first parses the
+ * packet, decides what the packet type is, and then processes the packet
+ * according to its type. This function runs in task context.
+ */
+int fip_discover_rx_packet_bh(struct fip_discover *discover, struct fip_content *fc)
+{
+       struct fip_gw_data *advertise_data = NULL;
+       struct fip_login_data *login_data = NULL;
+       int rc;
+       int ret = 0;
+
+       switch (fc->fh->subcode) {
+       case FIP_GW_ADV_SUB_OPCODE:
+               advertise_data = kzalloc(sizeof *advertise_data, GFP_KERNEL);
+               if (!advertise_data) {
+                       vnic_warn(discover->name,
+                                 "Failed to allocate %Zu bytes",
+                                 sizeof *advertise_data);
+                       return -ENOMEM;
+               }
+
+               rc = fip_advertise_parse_bh(discover, fc, advertise_data);
+               if (!rc)
+                       ret = fip_discover_rx_advertise_bh(discover,
+                                                          advertise_data);
+               kfree(advertise_data);
+               break;
+
+       case FIP_GW_LOGIN_SUB_OPCODE:
+               login_data = kzalloc(sizeof *login_data, GFP_KERNEL);
+               if (!login_data) {
+                       vnic_warn(discover->name,
+                                 "Failed to allocate %Zu bytes",
+                                 sizeof *login_data);
+                       return -ENOMEM;
+               }
+
+               rc = fip_login_parse(discover, fc, login_data);
+               if (!rc)
+                       handle_login_packet(discover, login_data);
+
+               kfree(login_data);
+               break;
+       default:
+               break;
+       }
+
+       return ret;
+}
+
+/*
+ * This function is a callback called upon successful join to a
+ * multicast group. The function checks if we have joined + attached
+ * to all required mcast groups and if so moves the discovery FSM to solicit.
+ */
+static void fip_discover_mcast_connect_cb(struct vnic_mcast *mcaste, void *ctx)
+{
+       struct fip_discover *discover = mcaste->priv_data;
+
+       if (mcaste->cur_attached && mcaste->req_attach) {
+               vnic_dbg_parse(discover->name, "attached mask = 0x%lx, req mask = 0x%lx\n",
+                              *mcaste->cur_attached, *mcaste->req_attach);
+               if ((*mcaste->cur_attached & *mcaste->req_attach) !=
+                   *mcaste->req_attach) {
+                       return;
+               }
+       }
+
+       discover->discover_mcast_attached_jiffies = jiffies;
+       set_bit(MCAST_ATTACHED, &discover->discover_mcast_state);
+       /* in the case of a reconnect don't change state or send a solicit
+        * packet
+        */
+       if (discover->state < FIP_DISCOVER_SOLICIT) {
+               vnic_dbg_fip(discover->name, "fip_multicast_connected moved"
+                            " state to solicit\n");
+               spin_lock_irq(&discover->lock);
+               if (discover->flush == FIP_NO_FLUSH) {
+                       /* delay sending solicit packet by 0-100 mSec */
+                       int rand_delay = jiffies % 100; /*get_random_int()*/
+                       discover->state = FIP_DISCOVER_SOLICIT;
+                       cancel_delayed_work(&discover->fsm_task);
+                       /* This is really (rand_delay / 1000) * HZ*/
+                       /* calls fip_discover_fsm() */
+                       queue_delayed_work(fip_wq, &discover->fsm_task,
+                                          (rand_delay * HZ) / 1000);
+               }
+               spin_unlock_irq(&discover->lock);
+       }
+       vnic_dbg_fip(discover->name, "discover_mcast_connect_cb done\n");
+}
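+
+/*
+ * Worked example for the solicit delay above (illustrative): with
+ * rand_delay = 37 the FSM work is queued after (37 * HZ) / 1000 jiffies,
+ * i.e. roughly 37 msec independent of the HZ configuration (subject to
+ * integer truncation for small HZ values).
+ */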
+
+/*
+ * This function is a callback called upon a mcast detach event.
+ * This event can be triggered due to discovery teardown or due to an async
+ * event. Currently this code does not participate in the discovery's FSM.
+ */
+void fip_discover_mcast_deattach_cb(struct vnic_mcast *mcast, void *ctx)
+{
+//     struct vnic_mcast *mcast_other = ctx;
+       struct fip_discover *discover = mcast->priv_data;
+
+       discover->discover_mcast_detached_jiffies = jiffies;
+       clear_bit(MCAST_ATTACHED, &discover->discover_mcast_state);
+
+       vnic_dbg_fip(NULL, "fip_discover_mcast_deattach_cb\n");
+}
+
+/*
+ * Try to connect to the relevant mcast groups. If one of the mcast joins
+ * fails, the function should be called again to complete the join process
+ * (for the mcast groups whose join was not performed).
+ * Note: A successful return of vnic_mcast_join means that the mcast join
+ * started, not that the join completed. Completion of the connection process
+ * is asynchronous and uses a supplied callback.
+ */
+static int fip_discover_mcast_connect(struct fip_discover *discover)
+{
+       struct vnic_mcast *mcaste_disc, *mcaste_sol, *mcaste;
+       int rc;
+
+       mcaste_disc = vnic_mcast_alloc(discover->port, &discover->req_attach, &discover->cur_attached);
+       if (IS_ERR(mcaste_disc))
+               return -EINVAL;
+
+       mcaste_sol = vnic_mcast_alloc(discover->port, &discover->req_attach, &discover->cur_attached);
+       if (IS_ERR(mcaste_sol)) {
+               vnic_mcast_dealloc(mcaste_disc);
+               return -EINVAL;
+       }
+
+       set_bit(FIP_MCAST_DISCOVER, &discover->req_attach);
+       set_bit(FIP_MCAST_SOLICIT, &discover->req_attach);
+
+       mcaste = mcaste_disc;
+       mcaste->priv_data = discover;
+       mcaste->attach_bit_nr = FIP_MCAST_DISCOVER;
+       memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN);
+       memcpy(&mcaste->gid, fip_discover_mgid, GID_LEN);
+       if (discover->pkey != 0xffff)
+               *(u16 *)&mcaste->gid.raw[6] = htons(discover->pkey | 0x8000);
+       memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN);
+       mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+       mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+       mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+       mcaste->attach_cb = fip_discover_mcast_connect_cb;
+       mcaste->detach_cb = fip_discover_mcast_deattach_cb;
+       mcaste->attach_cb_ctx = mcaste_sol;
+       mcaste->detach_cb_ctx = mcaste_sol;
+       mcaste->pkey = discover->pkey;
+       mcaste->qkey = VNIC_FIP_QKEY;
+       mcaste->qp = discover->qp;
+       mcaste->blocking = 0;
+       mcaste->join_state = 1;
+       rc = vnic_mcast_add(&discover->mcast_tree, mcaste);
+       ASSERT(!rc);
+       rc = vnic_mcast_attach(&discover->mcast_tree, mcaste); /* MCAST_RECEIVE_ONLY */
+       ASSERT(!rc);
+
+       mcaste = mcaste_sol;
+       mcaste->priv_data = discover;
+       mcaste->attach_bit_nr = FIP_MCAST_SOLICIT;
+       memcpy(mcaste->mac, ETH_BCAST_MAC, ETH_ALEN);
+       memcpy(&mcaste->gid, fip_solicit_mgid, GID_LEN);
+       if (discover->pkey != 0xffff)
+               *(u16 *)&mcaste->gid.raw[6] = htons(discover->pkey | 0x8000);
+       memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN);
+       mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+       mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+       mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+       mcaste->attach_cb = fip_discover_mcast_connect_cb;
+       mcaste->detach_cb = fip_discover_mcast_deattach_cb;
+       mcaste->attach_cb_ctx = mcaste_disc;
+       mcaste->detach_cb_ctx = mcaste_disc;
+       mcaste->pkey = discover->pkey;
+       mcaste->qkey = VNIC_FIP_QKEY;
+       mcaste->qp = discover->qp;
+       mcaste->blocking = 0;
+       mcaste->join_state = 1;
+       mcaste->sender_only = 1;
+       rc = vnic_mcast_add(&discover->mcast_tree, mcaste);
+       ASSERT(!rc);
+       rc = vnic_mcast_attach(&discover->mcast_tree, mcaste); /* MCAST_SEND_ONLY */
+       ASSERT(!rc);
+
+       return 0;
+}
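+
+/*
+ * Example of the pkey embedding above (illustrative): for a non-default
+ * pkey such as 0x0002, bytes 6-7 of both MGIDs are overwritten with
+ * htons(0x0002 | 0x8000), i.e. 0x80 0x02, making the discover and solicit
+ * groups pkey-specific; the default pkey 0xffff leaves the well-known
+ * MGIDs untouched.
+ */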
+
+int fip_discover_mcast_reattach(struct fip_discover *discover,
+                               struct vnic_port *port)
+{
+       int flush;
+
+       spin_lock_irq(&discover->lock);
+       flush = discover->flush;
+       spin_unlock_irq(&discover->lock);
+
+       if (flush == FIP_NO_FLUSH &&
+           discover->state > FIP_DISCOVER_INIT) {
+               vnic_tree_mcast_detach(&discover->mcast_tree);
+               vnic_tree_mcast_attach(&discover->mcast_tree);
+       }
+       return 0;
+}
+
+static void fip_discover_ctrl_path_query_complete(
+                                       int status,
+                                       struct ib_sa_path_rec *pathrec,
+                                       void *context)
+{
+       struct fip_gw_data *gw = context;
+       vnic_dbg_fip_p0(gw->discover->name, "fip ctrl path query complete status=%d\n", status);
+       if (!status) {
+               vnic_dbg_fip_p0(gw->discover->name, "fip ctrl path query success srcgid:"VNIC_GUID_FMT" dgid:"VNIC_GUID_FMT"\n",
+                                               VNIC_GUID_RAW_ARG(pathrec->sgid.raw+8),
+                                               VNIC_GUID_RAW_ARG(pathrec->dgid.raw+8));
+               gw->ctrl_prec = *pathrec;
+               fip_discover_gw_fsm_move(gw, FIP_GW_SEND_SOLICIT);
+       } else {
+               vnic_dbg_fip_p0(gw->discover->name, "fip ctrl path query FAILED ret=%d\n", status);
+               gw->query_id = -1; /* this will cause a retry */
+       }
+       complete(&gw->query_comp);
+}
+
+static void fip_discover_data_path_query_complete(
+                                               int status,
+                                               struct ib_sa_path_rec *pathrec,
+                                               void *context)
+{
+       struct fip_gw_data *gw = context;
+       vnic_dbg_fip_p0(gw->discover->name, "fip data path query complete status=%d\n", status);
+       if (!status) {
+               struct ib_sa_path_rec old_pathrec;
+               struct fip_vnic_data *vnic;
+               vnic_dbg_fip_p0(gw->discover->name, "fip data path query success srcgid:"VNIC_GUID_FMT" dgid:"VNIC_GUID_FMT"\n",
+                                               VNIC_GUID_RAW_ARG(pathrec->sgid.raw+8),
+                                               VNIC_GUID_RAW_ARG(pathrec->dgid.raw+8));
+               old_pathrec = gw->data_prec;
+               gw->data_prec = *pathrec;
+               if (old_pathrec.sl != gw->data_prec.sl) {
+                       /* in case of SL change close the vnic to relogin with the new SL */
+                       vnic_info("[%s] %s %s Data SL changed from %d to %d\n",
+                                         gw->info.vol_info.system_name,
+                                         gw->discover->port->name,
+                                         gw->info.vol_info.gw_port_name,
+                                         old_pathrec.sl, gw->data_prec.sl);
+                       list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+                               if (vnic->flush != FIP_FULL_FLUSH &&
+                                   vnic->state >= FIP_VNIC_LOGIN)
+                                       fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+                       }
+               }
+               fip_discover_gw_fsm_move(gw, FIP_GW_CONNECTED);
+       } else {
+               vnic_dbg_fip_p0(gw->discover->name, "fip data path query FAILED ret=%d\n", status);
+               gw->query_id = -1; /* this will cause a retry */
+       }
+       complete(&gw->query_comp);
+}
+
+static int fip_discover_path_query(struct fip_gw_data *gw, int is_data_sl)
+{
+       ib_sa_comp_mask comp_mask;
+       struct ib_sa_path_rec p_rec;
+       void(*callback)(int status, struct ib_sa_path_rec *resp, void *context);
+
+       vnic_dbg_fip_p0(gw->discover->name, "fip path query %d of GW lid:%d sl=%d GID:"VNIC_GUID_FMT" SID=%llx data_path=%d!\n",
+                                gw->query_path_cnt,
+                                gw->info.gw_lid,
+                                gw->info.gw_sl,
+                                VNIC_GUID_RAW_ARG(gw->info.gw_guid),
+                                is_data_sl ? EOIB_SERVICE_ID : EOIB_CTRL_SERVICE_ID,
+                                is_data_sl);
+
+       comp_mask =      IB_SA_PATH_REC_SERVICE_ID  |
+                                        IB_SA_PATH_REC_DGID         |
+                                        IB_SA_PATH_REC_SGID         |
+                                        IB_SA_PATH_REC_REVERSIBLE  |
+                                        IB_SA_PATH_REC_PKEY;
+
+       callback = is_data_sl ? fip_discover_data_path_query_complete : fip_discover_ctrl_path_query_complete;
+       memset(&p_rec, 0, sizeof(p_rec));
+
+       p_rec.service_id = is_data_sl ? cpu_to_be64(EOIB_SERVICE_ID) : cpu_to_be64(EOIB_CTRL_SERVICE_ID);
+       p_rec.sgid = gw->discover->port->gid;
+       /* copy the subnet prefix from source gid */
+       memcpy(p_rec.dgid.raw, p_rec.sgid.raw, 8);
+       /* copy gw dgid */
+       memcpy(p_rec.dgid.raw+8, gw->info.gw_guid,8);
+       p_rec.pkey = cpu_to_be16(gw->discover->pkey);
+       p_rec.reversible = cpu_to_be32(1);
+
+       if (gw->query_id >= 0 && !IS_ERR(gw->pquery) && gw->pquery) {
+               ib_sa_cancel_query(gw->query_id, gw->pquery);
+               return -1; /* retry later */
+       }
+
+       init_completion(&gw->query_comp);
+       gw->query_path_cnt++;
+       gw->query_id = -1;
+       gw->pquery = ERR_PTR(-ENODATA);
+
+       gw->query_id =
+               ib_sa_path_rec_get(&vnic_sa_client,
+                                                  gw->discover->port->dev->ca,
+                                                  gw->discover->port->num,
+                                                  &p_rec,
+                                                  comp_mask,
+                                                  2000 /*TOUT*/,
+                                                  GFP_KERNEL,
+                                                  callback,
+                                                  gw,
+                                                  &gw->pquery);
+       if (gw->query_id < 0) {
+               complete(&gw->query_comp);
+               vnic_dbg_fip_p0(gw->discover->name, "ib_sa_path_rec_get failed, error %d\n", gw->query_id);
+               gw->pquery = ERR_PTR(-ENODATA);
+       }
+       return gw->query_id;
+}
+
+void fip_discover_gw_fsm_move(struct fip_gw_data *gw, enum fip_gw_state state)
+{
+       cancel_delayed_work(&gw->gw_task);
+       if (gw->pquery && !IS_ERR(gw->pquery) && gw->query_id >= 0)
+               ib_sa_cancel_query(gw->query_id, gw->pquery);
+
+       gw->state = state;
+       gw->query_id = -1;
+       gw->query_path_cnt = 0;
+       queue_delayed_work(fip_wq, &gw->gw_task, 0);
+}
+
+
+static void fip_discover_gw_fsm(struct work_struct *work)
+{
+       struct fip_gw_data *curr_gw =
+               container_of(work, struct fip_gw_data, gw_task.work);
+       unsigned long next_wakeup = curr_gw->info.gw_adv_period;
+       unsigned long rand = jiffies % 100 + 1;
+       int ret;
+
+       if (curr_gw->flush != FIP_NO_FLUSH)
+               return;
+
+       if (test_bit(MCAST_ATTACHED,
+                    &curr_gw->discover->discover_mcast_state)) {
+               if (time_after(jiffies, curr_gw->keep_alive_jiffies + next_wakeup)) {
+                       if (time_after(jiffies,
+                                      curr_gw->discover->discover_mcast_attached_jiffies
+                                       + next_wakeup)) {
+                               fip_close_gw(curr_gw, FIP_PARTIAL_FLUSH);
+                               return;
+                       }
+               }
+       } else {
+               /* close gw if 1 minute has elapsed since mcast detach */
+               if (time_after(jiffies,
+                              curr_gw->discover->discover_mcast_detached_jiffies
+                               + 60*HZ)) {
+                       fip_close_gw(curr_gw, FIP_PARTIAL_FLUSH);
+                       return;
+               }
+       }
+
+       switch (curr_gw->state) {
+       case FIP_GW_HOST_ADMIN:
+               break;
+       case FIP_GW_CTRL_PATH_QUERY:
+               if (curr_gw->query_path_cnt && curr_gw->query_id >= 0) {
+                       /* PATH query is running */
+                       next_wakeup = msecs_to_jiffies(100);
+                       break;
+               }
+               ret = fip_discover_path_query(curr_gw, 0/*ctrl SL*/);
+               if (ret < 0)
+                       vnic_dbg_fip_p0(curr_gw->discover->name, "Query ctrl path Failed : retry num %d ...\n", curr_gw->query_path_cnt);
+               next_wakeup = msecs_to_jiffies(100);
+               break;
+
+       case FIP_GW_SEND_SOLICIT:
+               curr_gw->query_path_cnt = 0;
+               curr_gw->query_id = -1;
+               curr_gw->pquery = ERR_PTR(-ENODATA);
+               vnic_dbg_fip(curr_gw->discover->name, "DISCOVER_LOGIN FIP_GW_SEND_SOLICIT\n");
+               vnic_dbg_parse(curr_gw->discover->name, "new protocol %d\n", curr_gw->info.gw_prot_new);
+               ret = fip_solicit_send(curr_gw->discover, FIP_DISCOVER_UCAST,
+                                                          curr_gw->info.gw_qpn,
+                                                          curr_gw->info.gw_lid,
+                                                          vnic_gw_ctrl_sl(curr_gw),
+                                                          curr_gw->info.gw_prot_new);
+               if (ret)
+                       next_wakeup = (100 + rand * HZ) / 200;
+               else
+                       next_wakeup = (100 + rand * HZ) / 25;
+               break;
+
+       case FIP_GW_DATA_PATH_QUERY:
+               if (curr_gw->query_path_cnt && curr_gw->query_id >= 0) {
+                       /* PATH query is running */
+                       next_wakeup = msecs_to_jiffies(100);
+                       break;
+               }
+               ret = fip_discover_path_query(curr_gw, 1/*data SL*/);
+               if (ret < 0)
+                       vnic_dbg_fip_p0(curr_gw->discover->name, "Query data path Failed : retry num %d ...\n", curr_gw->query_path_cnt);
+               next_wakeup = msecs_to_jiffies(100);
+               break;
+
+       case FIP_GW_CONNECTED:
+               vnic_dbg_fip(curr_gw->discover->name, "DISCOVER_LOGIN: GW_CONNECTED!!!\n");
+               /* test vnic status */
+               fip_gw_create_vnics(curr_gw);
+               break;
+       default:
+               ASSERT(0);
+               break;
+       }
+
+       /* go to sleep until timeout. We expect to be awakened by
+        * RX packets and never get to wake up due to the timeout
+        */
+       cancel_delayed_work(&curr_gw->gw_task);
+       queue_delayed_work(fip_wq, &curr_gw->gw_task, next_wakeup);
+}
+
+static int is_new_solicit_prot(struct fip_discover *discover)
+{
+       vnic_dbg_parse(discover->name, "new gw %d, old gw %d\n",
+                      discover->new_prot_gws, discover->old_prot_gws);
+
+       if (!discover->old_prot_gws) {
+               if (!discover->new_prot_gws) {
+                       /* mcast solicit sent before any
+                        * advertise packets arrive. Use old format.
+                        */
+                       return 0;
+               } else
+                       return 1;
+       }
+       return 0;
+}
+
+/*
+ * This is the discover finite state machine that runs the
+ * advertise and solicit packet exchange of the discovery
+ * process.
+ * It is assumed that this function is only called from work queue
+ * task context (for locking)
+ */
+static void fip_discover_fsm(struct work_struct *work)
+{
+       struct fip_discover *discover =
+               container_of(work, struct fip_discover, fsm_task.work);
+       struct vnic_port *port = discover->port;
+       int recall_time = -1, flush = discover->flush;
+
+       /* we got a flush request and we have not performed it yet */
+       if ((flush != FIP_NO_FLUSH) &&
+            discover->state != FIP_DISCOVER_OFF) {
+               vnic_dbg_fip(discover->name, "discover_fsm switching to OFF\n");
+
+               recall_time = DELAYED_WORK_CLEANUP_JIFFS * 2;
+
+
+               if (discover->state != FIP_DISCOVER_CLEAR) {
+                       fip_free_gw_list(discover, flush);
+                       discover->state = FIP_DISCOVER_CLEAR;
+               }
+
+               /* if there are still open GWs we will test again later */
+               if (!fip_free_gw_done(discover, flush)) {
+                       vnic_dbg_fip(discover->name, "fip_free_gw_list not done, recalling \n");
+                       goto recall_fsm;
+               }
+
+               if (delayed_work_pending(&discover->cleanup_task))
+                       goto recall_fsm;
+
+               vnic_dbg_fip(discover->name, "fip_free_gw_list done \n");
+               vnic_dbg_mark();
+               vnic_mcast_del_all(&discover->mcast_tree);
+               vnic_dbg_mark();
+               discover->state = FIP_DISCOVER_OFF;
+
+               /* signal the unload to continue */
+               complete(&discover->flush_complete);
+               return;
+       }
+
+       if (discover->state == FIP_DISCOVER_OFF)
+               return;
+
+       if (!port->attr.lid) {
+               recall_time = 1 * HZ;
+               goto recall_fsm;
+       }
+
+       switch (discover->state) {
+        int new_prot;
+
+       case FIP_DISCOVER_INIT:
+               vnic_dbg_fip(discover->name, "FIP_DISCOVER_INIT\n");
+               /* in init, try to join the discover multicast group.
+                * This is a prerequisite for all other progress and
+                * will eventually call fip_discover_mcast_connect_cb()
+                */
+               if (fip_discover_mcast_connect(discover)) {
+                       vnic_warn(discover->name, "fip_discover_mcast_connect() "
+                                 "failed\n");
+                       recall_time = 1 * HZ;
+               }
+               break;
+
+       case FIP_DISCOVER_SOLICIT:
+               new_prot = is_new_solicit_prot(discover);
+               vnic_dbg_fip(discover->name, "DISCOVER_SOLICIT\n");
+
+               /* send a multicast solicit of type fip. If the send is
+                * successful, move to the login state and await advertise
+                * packets. If TX fails, retry.
+                */
+               fip_solicit_send(discover, FIP_DISCOVER_MCAST, 0, 0, 0, new_prot);
+               recall_time = FIP_RESOLICIT_TIME * HZ;
+
+               break;
+
+       case FIP_DISCOVER_OFF:
+       default:
+               ASSERT(0);
+               break;
+
+       }
+
+recall_fsm:
+       if (recall_time >= 0)
+               queue_delayed_work(fip_wq, &discover->fsm_task, recall_time);
+
+       return;
+}
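+
+/*
+ * Summary of the flow implemented above (descriptive only):
+ * FIP_DISCOVER_INIT requests the mcast joins; fip_discover_mcast_connect_cb()
+ * then moves the FSM to FIP_DISCOVER_SOLICIT, where a multicast solicit is
+ * re-sent every FIP_RESOLICIT_TIME seconds until received advertisements
+ * hand control over to the per-GW state machines.
+ */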
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_discover.h
new file mode 100644 (file)
index 0000000..52e11d3
--- /dev/null
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _FIP_DISCOVER_H
+#define _FIP_DISCOVER_H
+
+#include "vnic.h"
+#include "vnic_fip.h"
+
+/* TODO - rethink this */
+#define FIP_UD_MTU(ib_mtu) (ib_mtu - FIP_ENCAP_LEN - FIP_ETH_HEADER_LEN)
+#define FIP_UD_BUF_SIZE(ib_mtu)        (ib_mtu + IB_GRH_BYTES)
+
+#define FIP_MAX_BACKOFF_SECONDS        16
+#define FIP_MAX_VNICS_PER_GW   (1 << 9)
+
+#define FIP_TIMEOUT_FACTOR(a) ((a)*5/2)
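+/* e.g. FIP_TIMEOUT_FACTOR(4 * HZ) expands to 10 * HZ, i.e. an advertised
+ * period is given a 2.5x margin before it is treated as expired (a reading
+ * based on the macro only; see the call sites for exact usage).
+ */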
+
+enum fip_gw_state {
+       FIP_GW_HOST_ADMIN,
+       FIP_GW_CTRL_PATH_QUERY,
+       FIP_GW_SEND_SOLICIT,    /* got mcast advertise & ctrl path query. sending solicit */
+       FIP_GW_DATA_PATH_QUERY,
+       FIP_GW_CONNECTED        /* we are already connected. do nothing */
+};
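+
+/*
+ * Typical progression through the states above, as driven by
+ * fip_discover_gw_fsm() and the SA path query callbacks (sketch):
+ *
+ *   mcast advertise -> FIP_GW_CTRL_PATH_QUERY -> FIP_GW_SEND_SOLICIT ->
+ *   ucast advertise (ack) -> FIP_GW_DATA_PATH_QUERY -> FIP_GW_CONNECTED
+ *
+ * When vnic_sa_query is disabled the two *_PATH_QUERY states are skipped
+ * and the SLs advertised by the GW are used directly.
+ */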
+
+
+enum {
+       GW_TYPE_SINGLE_EPORT = 0,
+       GW_TYPE_LAG = 1,
+};
+
+struct gw_ext_boot {
+       int valid;
+       int boot_prio;
+       int timeout;
+};
+
+struct gw_ext_lag {
+       int valid;
+       int hash;       /* enum gw_ext_lag_hash_policy */
+       int weights_policy;
+       int member_ka;
+       int ca;         /* congestion aware */
+       int ca_thresh;
+       int ucast;      /* gw supports unicast keep alives */
+};
+
+
+struct gw_ext_pc_id {
+       int valid;
+       u64 power_cycle_id;
+};
+
+struct fip_gw_data_info {
+       struct fip_gw_volatile_info vol_info;
+       long gw_adv_period;  /* timeout in jiffies */
+       long gw_period;      /* timeout in jiffies */
+       long vnic_ka_period; /* in jiffies */
+       int flags;
+       u32 gw_qpn;
+       u16 gw_lid;
+       u16 gw_port_id;
+       u16 gw_num_vnics;
+       u16 n_rss_qpn;
+       u8 gw_sl; /* GW ctrl SL */
+       u8 hadmined_en;
+       u8 all_vlan_gw;
+       u8 gw_vendor_id[VNIC_VENDOR_LEN+1];
+       u8 gw_guid[GUID_LEN];
+       int gw_type;
+       int gw_prot_new;
+       int ext_mask;
+       struct gw_ext_boot   ext_boot;
+       struct gw_ext_lag    ext_lag;
+       struct gw_ext_pc_id  ext_pc_id;
+};
+
+struct fip_gw_data {
+       enum fip_flush flush;
+       int hadmin_gw;
+       struct mutex mlock;
+       struct fip_discover *discover;
+       struct list_head list;
+       unsigned long keep_alive_jiffies;
+       enum fip_gw_state state;
+       int vnic_count;
+       struct list_head vnic_list;
+       struct delayed_work gw_task;
+       struct delayed_work vnic_cleanup_task;
+       struct fip_gw_data_info info;
+       unsigned long n_bitmask[(FIP_MAX_VNICS_PER_GW >> 3) /
+                             sizeof(unsigned long)];
+
+       struct ib_sa_path_rec ctrl_prec;
+       struct ib_sa_path_rec data_prec;
+       struct ib_sa_query *pquery;
+       int query_path_cnt;
+       int query_id;
+       struct completion query_comp;
+};
+
+enum fip_gw_data_flags {
+       FIP_IS_FIP = 1 << 0,    /* protocol type */
+       FIP_RCV_MULTICAST = 1 << 1,     /* received mcast packet */
+       FIP_GW_AVAILABLE = 1 << 2,      /* GW available bit set in pkt */
+       FIP_HADMINED_VLAN = 1 << 3,     /* H bit set in advertise pkt */
+};
+
+static inline u8 vnic_gw_ctrl_sl(struct fip_gw_data *gw)
+{
+       return vnic_sa_query? gw->ctrl_prec.sl : gw->info.gw_sl;
+}
+
+/*
+ * TODO - we can do a nicer job here. stage 2
+ * allocates memory and post receives
+ */
+int fip_post_discovery_rcv(struct vnic_port *port,
+                          int ring_size, struct ib_qp *qp,
+                          struct fip_ring *rx_ring);
+
+int fip_discover_mcast_reattach(struct fip_discover *discover,
+                               struct vnic_port *port);
+
+/*
+ * This function handles a single received packet that is expected to be
+ * a GW advertisement or a login ACK packet. The function first parses the
+ * packet, decides what the packet type is, and then handles the packet
+ * according to its type. This function runs in task context.
+ */
+void fip_discover_rx_packet(int *queue, struct fip_content *fc);
+int fip_discover_rx_packet_bh(struct fip_discover *discover, struct fip_content *fc);
+
+/*
+ * This function is the RX packet handler entry point at the thread level
+ * (unlike the completion handler that runs from interrupt context).
+ * The function calls a handler function and then reallocates the ring
+ * entry for the next receive.
+ */
+void fip_discover_process_rx(struct fip_discover *discover);
+void fip_discover_process_rx_bh(struct work_struct *work);
+void fip_discover_gw_fsm_move(struct fip_gw_data *gw, enum fip_gw_state state);
+
+/* This function creates an info string from GW attributes published
+ * by the GW in advertisement pkts */
+int fip_get_short_gw_info(struct fip_gw_data *gw, char *buff);
+
+
+int fip_packet_parse(struct vnic_port *port, void *packet, int size,
+                    struct fip_content *fc);
+
+#endif /* _FIP_DISCOVER_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_ib.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_ib.c
new file mode 100644 (file)
index 0000000..ba63067
--- /dev/null
@@ -0,0 +1,437 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+
+#define        FIP_OP_RECV   (1ul << 31)
+/* TODO - rethink this */
+#define FIP_UD_MTU(ib_mtu)     (ib_mtu - FIP_ENCAP_LEN - FIP_ETH_HEADER_LEN)
+#define FIP_UD_BUF_SIZE(ib_mtu)        (ib_mtu + IB_GRH_BYTES)
+
+static inline void fip_wr_prepare(struct vnic_port *port,
+                                struct ib_send_wr *tx_wr,
+                                struct ib_sge *tx_sge,
+                                unsigned int wr_id, u64 mapping,
+                                int size, u16 pkey_index)
+{
+       /* This is a fixed part */
+       memset(tx_wr, 0, sizeof(struct ib_send_wr));
+       tx_wr->num_sge = 1;
+       tx_wr->sg_list = tx_sge;
+       tx_wr->opcode = IB_WR_SEND;
+       tx_wr->send_flags = IB_SEND_SIGNALED; 
+       tx_wr->wr.ud.pkey_index = pkey_index;
+       tx_wr->wr_id = wr_id;
+
+       memset(tx_sge, 0, sizeof(struct ib_sge));
+       tx_sge->lkey = port->mr->lkey;
+       tx_sge->addr = mapping;
+       tx_sge->length = size;
+}
+
+/*
+ * send a single multicast packet.
+ * return 0 on success, other on failure.
+ */
+int fip_mcast_send(struct vnic_port *port,
+                  struct ib_qp *qp,
+                  unsigned int wr_id,
+                  u64 mapping,
+                  int size,
+                  u16 pkey_index,
+                  struct vnic_mcast *mcast)
+{
+       struct ib_send_wr *bad_wr;
+       struct ib_sge tx_sge;
+       struct ib_send_wr tx_wr;
+       int ret;
+
+       fip_wr_prepare(port, &tx_wr, &tx_sge, wr_id, mapping, size, pkey_index);
+
+       tx_wr.wr.ud.ah = mcast->ah;
+       tx_wr.wr.ud.remote_qpn = 0xFFFFFFFF;    /*dest_qpn; */
+       tx_wr.wr.ud.remote_qkey = mcast->qkey;
+
+       ret = ib_post_send(qp, &tx_wr, &bad_wr);
+
+       return ret;
+}
+
+/*
+ * send a single unicast packet.
+ * return 0 on success, other on failure.
+ */
+int fip_ucast_send(struct vnic_port *port,
+                  struct ib_ah *ah,
+                  struct ib_qp *qp,
+                  unsigned int wr_id,
+                  u64 mapping,
+                  int size,
+                  u16 pkey_index, u32 dest_qpn, u16 dlid,
+                  u32 qkey, u8 sl)
+{
+       struct ib_send_wr *bad_wr;
+       struct ib_ah *new_ah = NULL;
+       struct ib_sge tx_sge;
+       struct ib_send_wr tx_wr;
+       int ret;
+
+       fip_wr_prepare(port, &tx_wr, &tx_sge, wr_id, mapping, size, pkey_index);
+
+       if (!ah) {
+               struct ib_ah_attr ah_attr = {
+                       .dlid = dlid,
+                       .port_num = port->num,
+                       .sl = sl & 0xf,
+               };
+
+               new_ah = ib_create_ah(port->pd, &ah_attr);
+               if (IS_ERR(new_ah))
+                       return -1;
+
+               tx_wr.wr.ud.ah = new_ah;
+       } else
+               tx_wr.wr.ud.ah = ah;
+
+       tx_wr.wr.ud.remote_qpn = dest_qpn;
+       tx_wr.wr.ud.remote_qkey = qkey;
+
+       ret = ib_post_send(qp, &tx_wr, &bad_wr);
+
+       if (new_ah)
+               ib_destroy_ah(new_ah);
+
+       return ret;
+}
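+
+/*
+ * Rough usage sketch (the values below are placeholders, not taken from
+ * this patch): sending a unicast FIP frame to a gateway whose QPN and LID
+ * were learned from its advertisement, letting the function create a
+ * temporary AH:
+ *
+ *     ret = fip_ucast_send(port, NULL, qp, wr_id, mapping, size,
+ *                          pkey_index, gw_qpn, gw_lid, VNIC_FIP_QKEY, sl);
+ */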
+
+/*
+ * This is a general purpose CQ completion function that handles
+ * completions on RX and TX rings. It can serve all users that are
+ * using RX and TX rings.
+ * RX completions are distinguished from TX completions by the MSB, which is
+ * set for RX and clear for TX. For RX, the memory is unmapped from the PCI
+ * and the head is incremented. For TX, the memory is unmapped and then freed.
+ * The function returns the number of packets received.
+ */
+int fip_comp(struct vnic_port *port,
+            struct ib_cq *cq,
+            struct fip_ring *rx_ring,
+            struct fip_ring *tx_ring,
+            char *name)
+{
+#define FIP_DISCOVER_WC_COUNT 4
+       struct ib_wc ibwc[FIP_DISCOVER_WC_COUNT];
+       int wrid, n, i;
+       int mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum);
+       int rx_count = 0;
+       struct ib_device *dev = port->dev->ca;
+
+       do {
+               /*
+                * poll for up to FIP_DISCOVER_WC_COUNT in one request.
+                * returns the number of WC actually polled
+                */
+               n = ib_poll_cq(cq, FIP_DISCOVER_WC_COUNT, ibwc);
+               for (i = 0; i < n; ++i) {
+                       /*
+                        * use a mask on the id to decide if this is a receive
+                        * or transmit WC
+                        */
+                       if (ibwc[i].wr_id & FIP_OP_RECV) {
+                               wrid = ibwc[i].wr_id & ~FIP_OP_RECV;
+
+                               ib_dma_sync_single_for_cpu(dev,
+                                                          rx_ring->ring[wrid].bus_addr,
+                                                          mtu_size,
+                                                          DMA_FROM_DEVICE);
+
+                               if (likely(ibwc[i].status == IB_WC_SUCCESS)) {
+                                       rx_ring->ring[wrid].length =
+                                           ibwc[i].byte_len;
+                                       rx_count++;
+                               } else
+                                       rx_ring->ring[wrid].entry_posted = 0;
+
+                               rx_ring->head++;
+                       } else {        /* TX completion */
+                               unsigned long flags;
+                               wrid = ibwc[i].wr_id;
+
+                               /* unmap and free transmitted packet */
+                               ib_dma_unmap_single(dev,
+                                                   tx_ring->ring[wrid].
+                                                   bus_addr, tx_ring->ring[wrid].length,
+                                                   DMA_TO_DEVICE);
+
+                               kfree(tx_ring->ring[wrid].mem);
+                               tx_ring->ring[wrid].mem = NULL;
+                               tx_ring->ring[wrid].length = 0;
+                               spin_lock_irqsave(&tx_ring->head_tail_lock, flags);
+                               tx_ring->tail++;
+                               spin_unlock_irqrestore(&tx_ring->head_tail_lock, flags);
+                       }
+               }
+       } while (n == FIP_DISCOVER_WC_COUNT);
+
+       ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+       return rx_count;
+}
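+
+/*
+ * Illustrative note on the wr_id encoding handled above: a receive posted
+ * at ring index 5 carries wr_id (5 | FIP_OP_RECV); on completion the MSB
+ * is tested and masked off to recover the index, while TX completions use
+ * the plain ring index with the MSB clear.
+ */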
+
+/* configure a newly allocated QP and move it
+ * from RESET->INIT->RTR->RTS
+ */
+int fip_init_qp(struct vnic_port *port, struct ib_qp *qp, u16 pkey_index, char *name)
+{
+       struct ib_qp_attr qp_attr;
+       int attr_mask;
+
+       qp_attr.qp_state = IB_QPS_INIT;
+       qp_attr.qkey = VNIC_FIP_QKEY;
+       qp_attr.port_num = port->num;
+       qp_attr.pkey_index = pkey_index;
+       attr_mask = IB_QP_QKEY | IB_QP_PORT | IB_QP_PKEY_INDEX | IB_QP_STATE;
+
+       if (ib_modify_qp(qp, &qp_attr, attr_mask))
+               goto out_fail;
+
+       qp_attr.qp_state = IB_QPS_RTR;
+       attr_mask &= ~IB_QP_PORT;
+       if (ib_modify_qp(qp, &qp_attr, attr_mask))
+               goto out_fail;
+
+       qp_attr.qp_state = IB_QPS_RTS;
+       qp_attr.sq_psn = 0;
+       attr_mask |= IB_QP_SQ_PSN;
+       attr_mask &= ~IB_QP_PKEY_INDEX;
+       if (ib_modify_qp(qp, &qp_attr, attr_mask))
+               goto out_fail;
+
+       return 0;
+
+out_fail:
+       qp_attr.qp_state = IB_QPS_RESET;
+       if (ib_modify_qp(qp, &qp_attr, IB_QP_STATE))
+               vnic_warn(name, "failed to modify QP to RESET state\n");
+
+       return -EINVAL;
+}
+
+void fip_qp_to_reset(struct ib_qp *qp, char *name)
+{
+       struct ib_qp_attr qp_attr;
+
+       qp_attr.qp_state = IB_QPS_RESET;
+       if (ib_modify_qp(qp, &qp_attr, IB_QP_STATE))
+               vnic_warn(name, "Failed to modify QP to RESET state\n");
+       return;
+}
+
+/*
+ * post a single pre-allocated and mapped buffer to the qp.
+ * id is used to identify the entry in the receive queue.
+ */
+int fip_post_receive(struct vnic_port *port, struct ib_qp *qp, int size,
+                    int _id, struct fip_ring_entry *mem_entry, char *name)
+{
+       struct ib_recv_wr rx_wr, *bad_wr;
+       struct ib_sge rx_sge;
+       int rc;
+
+       rx_wr.wr_id = _id | FIP_OP_RECV;
+       rx_wr.next = NULL;
+       rx_wr.sg_list = &rx_sge;
+       rx_wr.num_sge = 1;
+       rx_sge.addr = mem_entry->bus_addr;
+       rx_sge.length = size;
+       rx_sge.lkey = port->mr->lkey;
+
+       ib_dma_sync_single_for_device(port->dev->ca, rx_sge.addr,
+                                     FIP_UD_BUF_SIZE(port->max_mtu_enum),
+                                     DMA_FROM_DEVICE);
+
+       rc = ib_post_recv(qp, &rx_wr, &bad_wr);
+       if (unlikely(rc)) {
+               vnic_warn(name, "post receive failed for buf %d (rc %d)\n", _id, rc);
+               goto post_recv_failed;
+       }
+       mem_entry->entry_posted = 1;
+       return 0;
+
+post_recv_failed:
+       mem_entry->entry_posted = 0;
+       return -EIO;
+}
+
+void fip_flush_rings(struct vnic_port *port,
+                    struct ib_cq *cq,
+                    struct ib_qp *qp,
+                    struct fip_ring *rx_ring,
+                    struct fip_ring *tx_ring,
+                    char *name)
+{
+       vnic_dbg_fip(name, "fip_flush_rings called\n");
+       if (qp) {
+               fip_qp_to_reset(qp, name);
+               fip_comp(port, cq, rx_ring, tx_ring, name);
+       }
+}
+
+void fip_free_rings(struct vnic_port *port,
+                   struct fip_ring *rx_ring,
+                   struct fip_ring *tx_ring,
+                   char *name)
+{
+       struct ib_device *dev = port->dev->ca;
+       int i;
+
+       for (i = rx_ring->size - 1; i >= 0; --i) {
+               if (rx_ring->ring[i].mem) {
+                       ib_dma_unmap_single(dev,
+                                           rx_ring->ring[i].bus_addr,
+                                           FIP_UD_BUF_SIZE(port->max_mtu_enum),
+                                           DMA_FROM_DEVICE);
+                       kfree(rx_ring->ring[i].mem);
+               }
+       }
+       rx_ring->size = 0;
+
+       for (i = tx_ring->size - 1; i >= 0; --i)
+               if (tx_ring->ring[i].length != 0) {
+                       ib_dma_unmap_single(dev,
+                                           tx_ring->ring[i].bus_addr,
+                                           tx_ring->ring[i].length,
+                                           DMA_TO_DEVICE);
+                       kfree(tx_ring->ring[i].mem);
+               }
+       tx_ring->size = 0;
+
+       vnic_dbg_fip(name, "Done cleaning RX and TX queues\n");
+
+       kfree(rx_ring->ring);
+       rx_ring->ring = NULL;
+       kfree(tx_ring->ring);
+       tx_ring->ring = NULL;
+}
+
+/*
+ * TODO - we can do a nicer job here. stage 2
+ *  allocates memory and post receives
+ * TODO2: need to handle the bad flow to free all existing entries in the ring
+ */
+int fip_init_rx(struct vnic_port *port,
+               int ring_size,
+               struct ib_qp *qp,
+               struct fip_ring *rx_ring,
+               char *name)
+{
+       struct ib_device *dev = port->dev->ca;
+       int i, rc = 0, mtu_size = FIP_UD_BUF_SIZE(port->max_mtu_enum);
+
+       rx_ring->size = ring_size;
+       rx_ring->ring = kzalloc(rx_ring->size *
+                               sizeof(struct fip_ring_entry),
+                               GFP_KERNEL);
+       if (!rx_ring->ring) {
+               vnic_warn(name, "failed to alloc fip RX ring, size %d\n", rx_ring->size);
+               rx_ring->size = 0;
+               return -ENOMEM;
+       }
+
+       /* allocate the ring entries */
+       for (i = 0; i < rx_ring->size; i++) {
+               rx_ring->ring[i].mem = kmalloc(mtu_size, GFP_KERNEL);
+               if (unlikely(!rx_ring->ring[i].mem)) {
+                       rc = -ENOMEM;
+                       goto error;
+               }
+
+               rx_ring->ring[i].entry_posted = 0;
+               rx_ring->ring[i].length = mtu_size;
+               rx_ring->ring[i].bus_addr = ib_dma_map_single(dev,
+                                                             rx_ring->ring[i].mem,
+                                                             mtu_size, DMA_FROM_DEVICE);
+               if (unlikely(ib_dma_mapping_error(dev, rx_ring->ring[i].bus_addr))) {
+                       rc = -ENODEV;
+                       goto dma_error;
+               }
+
+               if (fip_post_receive(port, qp, FIP_UD_BUF_SIZE(port->max_mtu_enum),
+                                    i, rx_ring->ring + i, name)) {
+                       rc = -EIO;
+                       goto post_recv_failed;
+               }
+       }
+
+       rx_ring->head = 0;
+       rx_ring->tail = 0;
+       spin_lock_init(&rx_ring->head_tail_lock);
+       spin_lock_init(&rx_ring->ring_lock);
+       return 0;
+
+post_recv_failed:
+       ib_dma_unmap_single(dev, rx_ring->ring[i].bus_addr,
+                           mtu_size, DMA_FROM_DEVICE);
+dma_error:
+       kfree(rx_ring->ring[i].mem);
+       rx_ring->ring[i].mem = NULL;
+error:
+       /* previous entries need to be freed after flushing the QP */
+       return rc;
+}
+
+/*
+ * This function allocates the tx ring entries and initializes the head and
+ * tail indexes.
+ */
+int fip_init_tx(int size, struct fip_ring *tx_ring, char *name)
+{
+       tx_ring->size = size;
+       tx_ring->ring = kzalloc(tx_ring->size *
+                               sizeof(struct fip_ring_entry),
+                               GFP_KERNEL);
+
+       if (!tx_ring->ring) {
+               vnic_warn(name, "failed to alloc fip TX ring, size %d\n",
+                         tx_ring->size);
+               tx_ring->size = 0;
+               return -ENOMEM;
+       }
+
+       tx_ring->head = 0;
+       tx_ring->tail = 0;
+       spin_lock_init(&tx_ring->head_tail_lock);
+       spin_lock_init(&tx_ring->ring_lock);
+       return 0;
+}
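+
+/*
+ * Rough bring-up sketch for a FIP UD QP using the helpers in this file
+ * (illustrative only; error unwinding and CQ setup omitted):
+ *
+ *     if (fip_init_qp(port, qp, pkey_index, name))
+ *             goto err;
+ *     if (fip_init_rx(port, ring_size, qp, &rx_ring, name))
+ *             goto err;
+ *     if (fip_init_tx(ring_size, &tx_ring, name))
+ *             goto err;
+ *     ...
+ *     fip_flush_rings(port, cq, qp, &rx_ring, &tx_ring, name);
+ *     fip_free_rings(port, &rx_ring, &tx_ring, name);
+ */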
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_login.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_login.c
new file mode 100644 (file)
index 0000000..55729f2
--- /dev/null
@@ -0,0 +1,1752 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+#include "vnic_fip_pkt.h"
+
+#ifndef work_pending /* back-port */
+#define work_pending(_work) test_bit(0, &(_work)->pending)
+#endif
+
+enum {
+       VNIC_LOGIN_REG_NETDEV_PENDING,
+       VNIC_LOGIN_REG_NETDEV_DONE,
+       VNIC_LOGIN_DESTROY_PENDING,
+       VNIC_LOGIN_DESTROY_DONE,
+       VNIC_LOGIN_DESTROY_FULL
+};
+
+static int fip_vnic_rings_create(struct vnic_port *port,
+                                struct fip_vnic_data *vnic);
+static void fip_vnic_rings_destroy(struct fip_vnic_data *vnic);
+static void fip_vnic_recv(struct fip_vnic_data *vnic);
+
+#ifdef _BP_HR_TIMER
+int fip_vnic_keepalive(struct hrtimer * timer);
+#else
+enum hrtimer_restart fip_vnic_keepalive(struct hrtimer * timer);
+#endif
+int fip_vnic_keepalive_send(struct fip_vnic_data *vnic, int source);
+
+
+#define QUEUE_VNIC_DWORK(vnic, task, time)                     \
+do {                                                           \
+       unsigned long flags;                                    \
+       spin_lock_irqsave(&vnic->lock, flags);                  \
+       if (likely(vnic->flush == FIP_NO_FLUSH))                \
+               queue_delayed_work(fip_wq, task, time);  \
+       spin_unlock_irqrestore(&vnic->lock, flags);             \
+} while(0)
+
+#define REQUEUE_VNIC_DWORK(vnic, task, time)                   \
+do {                                                           \
+       cancel_delayed_work(task);                              \
+       QUEUE_VNIC_DWORK(vnic, task, time);                     \
+} while(0)
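+
+/*
+ * Illustrative usage of the helpers above (the work item and period below
+ * are placeholders, not fields taken from this patch):
+ *
+ *     REQUEUE_VNIC_DWORK(vnic, &vnic->some_dwork, msecs_to_jiffies(ka_msec));
+ */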
+
+
+/*
+ * Look for a vnic in the GW vnic list. The search key used is either the vnic_id,
+ * which is unique, or the mac+vlan pair. A match on either key will result in the
+ * return of the vnic. Both keys are necessary because the host assigned delete
+ * flow might not have access to the vnic_id. The search disregards vnics that
+ * are undergoing full flush (they will be removed soon).
+ */
+struct fip_vnic_data *fip_vnic_find_in_list(struct fip_gw_data *gw, u16 vnic_id,
+                                           u8 *mac, u16 vlan, u8 vlan_used)
+{
+       struct fip_vnic_data *vnic;
+       int use_mac = mac ? 1 : 0;
+       int vlan_match;
+
+       ASSERT(gw);
+
+       if (list_empty(&gw->vnic_list))
+               return NULL;
+
+       /* do not use MAC 0:..:0 for vnic matches */
+       if (use_mac)
+               use_mac = !IS_ZERO_MAC(mac);
+
+       list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+               if (vnic->flush == FIP_FULL_FLUSH)
+                       continue;
+
+               if (vnic->vnic_id == vnic_id)
+                       return vnic;
+
+               if (vlan_used != vnic->login_data.vp)
+                       continue;
+
+               vlan_match = !vlan_used ||
+                       (vlan_used && (vlan == vnic->login_data.vlan));
+
+               if ((use_mac && !memcmp(vnic->login_data.mac, mac, ETH_ALEN)) &&
+                   vlan_match)
+                       return vnic;
+       }
+       return NULL;
+}
+
+/*
+ * This function handles completions of both TX and RX
+ * packets of vnics. RX packets are unmapped, lightly parsed, moved to a list
+ * and passed to thread processing. TX packets are unmapped and freed.
+ * Note: this function is called from interrupt context
+ */
+static void fip_vnic_comp(struct ib_cq *cq, void *vnic_ptr)
+{
+       struct fip_vnic_data *vnic = vnic_ptr;
+
+       /* handle completions. On RX packets this will call vnic_recv
+        * from thread context to continue processing */
+       if (fip_comp(vnic->port, vnic->cq, &vnic->rx_ring,
+                    &vnic->tx_ring, vnic->name))
+               fip_vnic_recv(vnic);
+
+       fip_vnic_keepalive_send(vnic, 0);
+}
+
+/*
+ * read the state of the gw eport. This can be done from any context and therefore
+ * requires protection.
+ */
+int fip_vnic_get_eport_state(struct fip_vnic_data *vnic)
+{
+       int i;
+
+       if (no_bxm)
+               return 1;
+
+       if (vnic->gw->info.gw_type == GW_TYPE_LAG) {
+               for (i = 0; i < MAX_LAG_MEMBERS; i++) {
+                       if (!(vnic->lm.used_bitmask & 1 << i))
+                               continue;
+
+                       if (vnic->lm.memb[i].eport_state)
+                               return 1;
+               }
+               return 0;
+       } else {
+               return atomic_read(&vnic->eport_state);
+       }
+}
+
+/*
+ * get GW info funcs.
+ */
+int fip_vnic_get_bx_name(struct fip_vnic_data *vnic, char *buff)
+{
+       struct fip_gw_data *gw = vnic->gw;
+       struct fip_gw_volatile_info tmp_info;
+       int rc;
+
+       if (!gw)
+               return -EINVAL;
+
+       mutex_lock(&gw->mlock);
+       memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info));
+       mutex_unlock(&gw->mlock);
+
+       rc = sprintf(buff, "%s", tmp_info.system_name);
+
+       return rc < 0 ? rc : 0;
+}
+
+int fip_vnic_get_bx_guid(struct fip_vnic_data *vnic, char *buff)
+{
+       struct fip_gw_data *gw = vnic->gw;
+       struct fip_gw_volatile_info tmp_info;
+       void *rc;
+
+       memset(buff, 0, GUID_LEN);
+
+       if (!gw)
+               return -EINVAL;
+
+       mutex_lock(&gw->mlock);
+       memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info));
+       mutex_unlock(&gw->mlock);
+
+       rc = memcpy(buff, tmp_info.system_guid, GUID_LEN);
+
+       return rc ? 0 : -EINVAL;
+}
+
+int fip_vnic_get_all_vlan_mode(struct fip_vnic_data *vnic, char *buff)
+{
+       struct fip_gw_data *gw = vnic->gw;
+       int rc;
+
+       if (!gw)
+               return -EINVAL;
+
+       rc = sprintf(buff, "%s", gw->info.all_vlan_gw ? "yes" : "no");
+
+       return rc < 0 ? rc : 0;
+}
+
+int fip_vnic_get_eport_name(struct fip_vnic_data *vnic, char *buff)
+{
+
+       struct fip_gw_data *gw = vnic->gw;
+       struct fip_gw_volatile_info tmp_info;
+       int rc;
+
+       if (!gw)
+               return -EINVAL;
+
+       mutex_lock(&gw->mlock);
+       memcpy(&tmp_info, &gw->info.vol_info, sizeof(tmp_info));
+       mutex_unlock(&gw->mlock);
+
+       rc = sprintf(buff, "%s", tmp_info.gw_port_name);
+
+       return rc < 0 ? rc : 0;
+}
+
+u8 fip_vnic_get_bx_sl(struct fip_vnic_data *vnic)
+{
+       return vnic->gw->info.gw_sl;
+}
+
+/*
+ * Return 1 if the GW is a LAG GW, 0 if it is not, or -EINVAL if the vnic has
+ * no GW.
+*/
+int fip_vnic_get_gw_type(struct fip_vnic_data *vnic)
+{
+       struct fip_gw_data *gw = vnic->gw;
+       int lag = 0;
+
+       if (!gw)
+               return -EINVAL;
+
+       lag = gw->info.gw_type == GW_TYPE_LAG;
+
+       return lag;
+}
+
+/*
+ * Format the LAG eport members table into buf; returns the number of bytes
+ * written, or -EINVAL if the GW is not a LAG GW.
+*/
+int fip_vnic_get_lag_eports(struct fip_vnic_data *vnic, char *buf)
+{
+       struct fip_gw_data *gw = vnic->gw;
+       int i;
+       struct lag_member *member;
+       char *p = buf;
+
+       if (!gw)
+               return -EINVAL;
+
+       if (gw->info.gw_type != GW_TYPE_LAG)
+               return -EINVAL;
+
+       p += _sprintf(p, buf, "LAG_MEMBER_INFORMATION:\n");
+       for (i=0; i<MAX_LAG_MEMBERS; i++) {
+               if (!(vnic->lm.used_bitmask & 1 << i))
+                       continue;
+
+               member = &vnic->lm.memb[i];
+               p += _sprintf(p, buf, "  %.2d ID=%.3X LID=%4X QPN=%8X STATE=%s\n",
+                             i, member->gw_port_id, member->lid, member->qpn,
+                             member->eport_state ? "UP" : "DOWN");
+       }
+
+       return p - buf;
+}
+
+/*
+ * Process an incoming login ack packet. The packet was already parsed and
+ * its data was placed in *data. The function stores the login data, moves
+ * the vnic to the RINGS_INIT state and kicks the vnic FSM (which creates the
+ * RX/TX rings and starts the multicast join procedure).
+ * This function must not be called for packets other than login ack packets.
+ */
+void fip_vnic_login_ack_recv(struct fip_vnic_data *vnic,
+                            struct fip_login_data *data)
+{
+       /* we only process login acks in the WAIT_4_ACK state;
+        * in other states we ignore them */
+       if (vnic->state != FIP_VNIC_WAIT_4_ACK) {
+               vnic_dbg_fip_v(vnic->name,
+                              "vnic_login_ack_recv in state other"
+                              " than FIP_VNIC_WAIT_4_ACK state %d\n",
+                              vnic->state);
+               return;
+       }
+
+       /* For LAG vnics, process login ack member data */
+       if (vnic->gw->info.gw_type == GW_TYPE_LAG)
+               handle_member_update(vnic, &data->lagm);
+
+       memcpy(&vnic->login_data, data, sizeof(vnic->login_data));
+
+       vnic->state = FIP_VNIC_RINGS_INIT;
+
+       /* calls fip_vnic_fsm() */
+       cancel_delayed_work(&vnic->vnic_task);
+       fip_vnic_fsm(&vnic->vnic_task.work);
+       // REQUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, 0);
+       return;
+}
+
+/*
+ * This is a helper function we use in order to move the login create to
+ * another context so we don't block the fip thread for too long. The call
+ * stack triggered by this function calls register_netdev, which might block
+ * for some time when netdevs are removed in parallel. That would stall the
+ * fip_wq and cause keep-alives (KA) not to be sent.
+*/
+void fip_vnic_login_create(struct work_struct *work)
+{
+       struct fip_vnic_data *vnic =
+               container_of(work, struct fip_vnic_data, vnic_login_create_task);
+       char *name = NULL;
+       int rc;
+
+       if (vnic->hadmined)
+               name = vnic->interface_name;
+
+       rc = vnic_login_register_netdev(vnic, vnic->mac_cache, name);
+
+       spin_lock_irq(&vnic->lock);
+       clear_bit(VNIC_LOGIN_REG_NETDEV_PENDING, &vnic->login_status);
+       if (!rc)
+               set_bit(VNIC_LOGIN_REG_NETDEV_DONE, &vnic->login_status);
+       spin_unlock_irq(&vnic->lock);
+}
+
+/*
+ * Test whether the create request posted earlier has terminated.
+ * Returns 0 if it completed successfully, -EAGAIN if it is still pending,
+ * and -EINVAL if it failed. If retry is set, a new create attempt is queued
+ * and the function returns -EAGAIN.
+*/
+static int fip_vnic_test_login(struct fip_vnic_data *vnic, int retry)
+{
+       int ret = 0;
+
+       spin_lock_irq(&vnic->lock);
+
+       if (!test_bit(VNIC_LOGIN_REG_NETDEV_DONE, &vnic->login_status)) {
+               /* queue retry login create request */
+               if (retry) {
+                       if (!test_and_set_bit(VNIC_LOGIN_REG_NETDEV_PENDING,
+                                             &vnic->login_status)) {
+                               memcpy(vnic->mac_cache, vnic->login_data.mac, ETH_ALEN);
+                               vnic->vlan_used = vnic->login_data.vp;
+                               vnic->vlan = vnic->login_data.vlan;
+                               vnic->all_vlan_gw = vnic->login_data.all_vlan_gw;
+
+                               /* calls fip_vnic_login_create() */
+                               if (vnic->flush == FIP_NO_FLUSH)
+                                       queue_work(login_wq, &vnic->vnic_login_create_task);
+                       }
+                       ret = -EAGAIN;
+               } else {
+                       if (test_bit(VNIC_LOGIN_REG_NETDEV_PENDING,
+                                    &vnic->login_status))
+                                ret = -EAGAIN;
+                       else
+                               ret = -EINVAL;
+               }
+       } 
+       spin_unlock_irq(&vnic->lock);
+
+       return ret;
+}
+
+
+/*
+ * This function should be called when the building of a vhub context
+ * table is done and the vnic state should transition to CONNECTED.
+ */
+int fip_vnic_tbl_done(struct fip_vnic_data *vnic)
+{
+       vnic->vhub_table.state = VHUB_TBL_UP2DATE;
+       vnic->vhub_table.tusn = vnic->vhub_table.main_list.tusn;
+
+       if (vnic->state <= FIP_VNIC_VHUB_DONE)
+               vnic->state = FIP_VNIC_VHUB_DONE;
+       else 
+               vnic->state = FIP_VNIC_VHUB_WRITE;
+
+       cancel_delayed_work(&vnic->vnic_task);
+       fip_vnic_fsm(&vnic->vnic_task.work);
+       return 0;
+}
+
+/*
+ * This function runs in interrupt context.
+ * It does sanity checking of the packet, moves it to a list and passes
+ * handling to a thread.
+ */
+static void fip_vnic_recv(struct fip_vnic_data *vnic)
+{
+       struct fip_ring *rx_ring = &vnic->rx_ring;
+       int ret, length;
+       u32 vhub_id;
+       void *mem;
+       int queue_packet = 0;
+       int one_or_more_queued = 0;
+       int index;
+       int err;
+
+       while (rx_ring->head != rx_ring->tail) {
+               struct fip_content *fc;
+
+               queue_packet = 0;
+               index = rx_ring->tail & (vnic->rx_ring.size - 1);
+
+               if (rx_ring->ring[index].entry_posted == 0)
+                       goto repost;
+
+               mem = rx_ring->ring[index].mem;
+               length = rx_ring->ring[index].length;
+
+
+               fc = kzalloc(sizeof *fc, GFP_ATOMIC);
+               if (!fc) {
+                       vnic_warn(vnic->name, "kzalloc failed\n");
+                       goto repost;
+               }
+
+               err = fip_packet_parse(vnic->port, mem + IB_GRH_BYTES, length - IB_GRH_BYTES, fc);
+               if (err) {
+                       vnic_warn(vnic->name, "packet parse failed\n");
+                       kfree(fc);
+                       goto repost;
+               }
+
+               switch (fc->fh->subcode) {
+               case FIP_GW_UPDATE_SUB_OPCODE:
+                       if (fc->fvu) {
+                               vhub_id = be32_to_cpu(fc->fvu->state_vhub_id) & 0xffffff;
+                               if (vnic->login_data.vhub_id == vhub_id)
+                                       queue_packet = 1;
+                       }
+
+                       break;
+               case FIP_GW_TABLE_SUB_OPCODE:
+                       if (vnic->state >= FIP_VNIC_VHUB_INIT &&
+                           vnic->vhub_table.state == VHUB_TBL_INIT) {
+                               /* handle vhub context table packets */
+                               if (fc->fvt) {
+                                       vhub_id = be32_to_cpu(fc->fvt->vp_vhub_id) & 0xffffff;
+                                       if (vnic->login_data.vhub_id == vhub_id)
+                                               queue_packet = 1;
+                               }
+                       }
+                       break;
+               default:
+                       vnic_dbg_fip_v(vnic->name,
+                                      "received unexpected format packet\n");
+                       break;
+               }
+
+               if (queue_packet && (likely(vnic->flush == FIP_NO_FLUSH))) {
+                       struct fip_rcv_pkt *rcv;
+                       struct fip_ring_entry me;
+
+                       /* record packet time for heart beat */
+                       vnic->keep_alive_jiffs = jiffies;
+                       length -= IB_GRH_BYTES;
+                       rcv = kzalloc(sizeof *rcv, GFP_ATOMIC);
+                       if (!rcv) {
+                               vnic_warn(vnic->name, "failed kmalloc\n");
+                               kfree(fc);
+                               goto repost;
+                       }
+
+                       /* replace it with new entry, and queue old one */
+                       err = alloc_map_fip_buffer(vnic->port->dev->ca, &me,
+                                                  FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum),
+                                                  GFP_ATOMIC);
+                       if (err) {
+                               vnic_warn(vnic->name, "alloc_map_fip_buffer failed\n");
+                               kfree(fc);
+                               kfree(rcv);
+                               goto repost;
+                       }
+
+                       /* unmap old entry */
+                       ib_dma_unmap_single(vnic->port->dev->ca,
+                                           rx_ring->ring[index].bus_addr,
+                                           FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum),
+                                           DMA_FROM_DEVICE);
+
+                       rx_ring->ring[index] = me;
+                       rcv->fc = fc;
+                       rcv->length = length;
+                       rcv->mem = mem;
+                       spin_lock(&vnic->vnic_rcv_list.lock);
+                       list_add_tail(&rcv->list, &vnic->vnic_rcv_list.list);
+                       spin_unlock(&vnic->vnic_rcv_list.lock);
+                       one_or_more_queued++;
+               } else
+                       kfree(fc);
+repost:
+               ret = fip_post_receive(vnic->port, vnic->qp,
+                                      FIP_UD_BUF_SIZE(vnic->port->max_mtu_enum),
+                                      index, rx_ring->ring + index, vnic->name);
+               if (ret)
+                       vnic_warn(vnic->name, "fip_post_receive ret %d\n", ret);
+
+               rx_ring->tail++;
+       }
+
+       if (one_or_more_queued && (likely(vnic->flush == FIP_NO_FLUSH))) {
+               /* calls fip_vnic_recv_bh() */
+               queue_work(fip_wq, &vnic->vnic_pkt_rcv_task_bh);
+       }
+
+       return;
+}
+
+void fip_vnic_recv_list_flush(struct fip_vnic_data *vnic)
+{
+       struct list_head vnic_recv_local;
+       struct fip_rcv_pkt *rcv, *rcv1;
+       unsigned long flags;
+
+       INIT_LIST_HEAD(&vnic_recv_local);
+
+       spin_lock_irqsave(&vnic->vnic_rcv_list.lock, flags);
+       list_replace_init(&vnic->vnic_rcv_list.list, &vnic_recv_local);
+       spin_unlock_irqrestore(&vnic->vnic_rcv_list.lock, flags);
+
+       list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) {
+               list_del(&rcv->list);
+               kfree(rcv);
+       }
+       return;
+}
+
+void lag_ctx_clear(struct fip_vnic_data *vnic)
+{
+       memset(&vnic->lm, 0, sizeof (vnic->lm));
+}
+
+/*
+ * Handle the GW eport member info for a LAG GW. The function compares the
+ * member information to the previous membership information stored in the
+ * vnic. The data path info is updated only after the login ack info was
+ * updated, to prevent race conditions.
+ * The vnic contains a local cache of the member info. The cache is updated
+ * in all cases other than when the write to the data path failed. If the
+ * write failed we do not update the cache and rely on periodic update
+ * packets for the retry.
+ * There are 4 possible flows per member entry:
+ * 1. the entry is cached in the vnic but not in the packet - remove from vnic
+ * 2. the entry is not cached in the vnic but is in the packet - add to vnic
+ * 3. the entry is in the vnic and in the packet but with different params -
+ *    modify the vnic
+ * 4. the entry is in the vnic and in the packet with similar params - do
+ *    nothing
+*/
+int handle_member_update(struct fip_vnic_data *vnic, struct lag_members *lm)
+{
+       int i, j;
+       char packet_used[MAX_LAG_MEMBERS];
+       char vnic_used[MAX_LAG_MEMBERS];
+       struct lag_member *vnic_mem, *pkt_mem;
+       int last_bit = 0;
+       #define EMPTY_ENTRY (char)0xff
+       /* we only update the data path with new info after a certain stage */
+       int write_through = !!(vnic->state >= FIP_VNIC_VHUB_WRITE);
+       int skip;
+       struct lag_properties lag_prop;
+       struct vnic_login *login = vnic->login;
+
+       memset(packet_used, EMPTY_ENTRY, sizeof(packet_used));
+       memset(vnic_used, EMPTY_ENTRY, sizeof(vnic_used));
+
+        /* if LAG is not enabled, or it's a child vNic, abort */
+       if (!vnic->gw->info.ext_lag.valid || vnic->parent_used)
+               return -EINVAL;
+
+       mutex_lock(&vnic->gw->mlock);
+       lag_prop.ca = vnic->gw->info.ext_lag.ca;
+       lag_prop.ca_thresh = vnic->gw->info.ext_lag.ca_thresh;
+       lag_prop.hash_mask = vnic->gw->info.ext_lag.hash;
+       lag_prop.weights_policy = vnic->gw->info.ext_lag.weights_policy;
+       mutex_unlock(&vnic->gw->mlock);
+       if (write_through)
+               vnic_member_prop(login, &lag_prop);
+
+       /* go over all known members, for each one search for a match in the
+        * packet member struct */
+       for (i=0; i<MAX_LAG_MEMBERS; i++) {
+               if (!(vnic->lm.used_bitmask & 1 << i))
+                       continue;
+
+               vnic_mem = &vnic->lm.memb[i];
+               for (j=0; j<lm->num; j++) {
+
+                       pkt_mem = &lm->memb[j];
+                       /* find match for member in vnic data structure */
+                       if (packet_used[j] == EMPTY_ENTRY &&
+                           !memcmp(vnic_mem->guid, pkt_mem->guid, GUID_LEN) &&
+                           vnic_mem->gw_port_id == pkt_mem->gw_port_id) {
+                               /* found a match, check for change in parameters */
+                               if (vnic->login) {
+                                       /* check for change in member parameters */
+                                       if (vnic_mem->lid != pkt_mem->lid ||
+                                           vnic_mem->qpn != pkt_mem->qpn ||
+                                           vnic_mem->eport_state != pkt_mem->eport_state ||
+                                           vnic_mem->sl != pkt_mem->sl ||
+                                           vnic_mem->link_utilization != pkt_mem->link_utilization) {
+
+                                               vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d modifying lid %d qpn %d state %d\n",
+                                                            i, lm->memb[j].lid, lm->memb[j].qpn, lm->memb[j].eport_state);
+                                               /* update the data path if required and store the update info locally */
+                                               if (!write_through ||
+                                                   (write_through && !vnic_member_modify(login, i, &lm->memb[j])))
+                                                       *vnic_mem = lm->memb[j];
+                                       }
+                               }
+                               packet_used[j] = i;
+                               vnic_used[i] = j;
+                               break;
+                       }
+               }
+               /* member cached in the vnic was not found in the packet - remove it */
+               if (vnic_used[i] == EMPTY_ENTRY) {
+                       if (!write_through ||
+                           (write_through && !vnic_member_remove(login, i))) {
+                               vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d removing lid %d qpn %d state %d\n",
+                                            i, vnic_mem->lid, vnic_mem->qpn, vnic_mem->eport_state);
+                               vnic->lm.used_bitmask &= ~(1 << i);
+                       }
+               }
+       }
+
+       /* go over packet and look for any new members */
+       for (j=0; j<lm->num; j++) {
+               /* if entry was matched up already */
+               if (packet_used[j]!= EMPTY_ENTRY)
+                       continue;
+
+               skip = 0;
+               /* verify that the same GW_ID is not in use by another port */
+               for (i=0; i<MAX_LAG_MEMBERS; i++) {
+                       if (!(vnic->lm.used_bitmask & 1 << i))
+                               continue;
+                       if (vnic->lm.memb[i].gw_port_id == lm->memb[j].gw_port_id)
+                               skip = 1;
+               }
+               if (skip)
+                       continue;
+
+               /* look for an empty member id and add the member to it */
+               for (i=last_bit; i<MAX_LAG_MEMBERS; i++) {
+                       if (vnic->lm.used_bitmask & 1 << i)
+                               continue;
+
+                       vnic_dbg_lag_v(vnic->name, "handle_member_update entry %d adding lid %d qpn %d state %d\n",
+                                    i, lm->memb[j].lid, lm->memb[j].qpn, lm->memb[j].eport_state);
+                       if (!write_through ||
+                           (write_through && !vnic_member_add(login, i, &lm->memb[j]))) {
+                               vnic->lm.used_bitmask |= (1 << i);
+                               vnic->lm.memb[i] = lm->memb[j];
+                       }
+
+                       break;
+               }
+               last_bit = i;
+       }
+
+       return 0;
+}
+
+/* Write the initial member table to the data path. If the write fails we
+ * delete the entry from the local cache and rely on periodic update packets
+ * for the retry. */
+int fip_vnic_write_members(struct fip_vnic_data *vnic)
+{
+       int i;
+       struct lag_properties lag_prop;
+       struct vnic_login *login = vnic->login;
+
+        /* if LAG is not enabled, or it's a child vNic, abort */
+       if (!vnic->gw->info.ext_lag.valid || vnic->parent_used)
+               return -EINVAL;
+
+       lag_prop.ca = vnic->gw->info.ext_lag.ca;
+       lag_prop.ca_thresh = vnic->gw->info.ext_lag.ca_thresh;
+       lag_prop.hash_mask = vnic->gw->info.ext_lag.hash;
+       lag_prop.weights_policy = vnic->gw->info.ext_lag.weights_policy;
+       vnic_member_prop(login, &lag_prop);
+
+       /* go over all members; for each one in use, write it to the data path */
+       for (i=0; i<MAX_LAG_MEMBERS; i++) {
+               if (!(vnic->lm.used_bitmask & 1 << i))
+                       continue;
+
+               /* if the update failed, delete the local entry; we will use
+                * the update packet flow for retries.
+                */
+               if (vnic_member_add(login, i, &vnic->lm.memb[i]))
+                       vnic->lm.used_bitmask &= ~(1 << i);
+       }
+
+       return 0;
+}
+
+/* runs in the context of vnic->vnic_pkt_rcv_task_bh */
+void fip_vnic_recv_bh(struct work_struct *work)
+{
+       struct fip_vnic_data *vnic =
+               container_of(work, struct fip_vnic_data, vnic_pkt_rcv_task_bh);
+       int length;
+       u32 vhub_id, tusn;
+       int eport_state;
+       struct vnic_table_entry *vhub_entries;
+       struct list_head vnic_recv_local;
+       struct fip_rcv_pkt *rcv, *rcv1;
+       unsigned long flags;
+       int i, __eport_state;
+       
+       INIT_LIST_HEAD(&vnic_recv_local);
+
+       spin_lock_irqsave(&vnic->vnic_rcv_list.lock, flags);
+       list_replace_init(&vnic->vnic_rcv_list.list, &vnic_recv_local);
+       spin_unlock_irqrestore(&vnic->vnic_rcv_list.lock, flags);
+
+       /* We are not interested in packets prior to FIP_VNIC_VHUB_INIT */
+       if (vnic->state < FIP_VNIC_VHUB_INIT ||
+           vnic->flush != FIP_NO_FLUSH) {
+               list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) {
+                       kfree(rcv->fc);
+                       kfree(rcv->mem);
+                       list_del(&rcv->list);
+                       kfree(rcv);
+               }
+       } else {
+               int err;
+
+               list_for_each_entry_safe(rcv, rcv1, &vnic_recv_local, list) {
+                       length = rcv->length;
+
+                       switch (rcv->fc->fh->subcode) {
+                       case FIP_GW_UPDATE_SUB_OPCODE:
+                               /* validate vhub id before processing packet */
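+                               /* as used here, bits 0-23 of state_vhub_id
+                                * carry the vhub_id and bits 27-28 carry the
+                                * eport state (0 means down) */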
+                               vhub_id = be32_to_cpu(rcv->fc->fvu->state_vhub_id) & 0xffffff;
+                               if(unlikely(vnic->login_data.vhub_id != vhub_id))
+                                       break;
+
+                               eport_state = be32_to_cpu(rcv->fc->fvu->state_vhub_id) >> 27 & 3;
+                               __eport_state = (eport_state == 0) ? EPORT_STATE_DOWN : EPORT_STATE_UP;
+                               atomic_set(&vnic->eport_state, __eport_state);
+
+                               /* handle vhub context update packets */
+                               if (rcv->fc->fed.num) {
+                                       err = extract_vhub_extended(rcv->fc->fed.fed[0], vnic);
+                                       if (err)
+                                               vnic_warn(vnic->name, "extract_vhub_extended() failed\n");
+                               }
+                               if (rcv->fc->cte.num) {
+                                       vhub_entries = kmalloc(rcv->fc->cte.num * sizeof *vhub_entries, GFP_KERNEL);
+                                       if (!vhub_entries) {
+                                               vnic_warn(vnic->port->name, "failed to allocate memory for update CTEs\n");
+                                               goto free_entry;
+                                       }
+
+                                       tusn = be32_to_cpu(rcv->fc->fvu->tusn);
+                                       for (i = 0; i < rcv->fc->cte.num; ++i) {
+                                               vhub_entries[i].lid = be16_to_cpu(rcv->fc->cte.cte[i].lid);
+                                               vhub_entries[i].qpn = be32_to_cpu(rcv->fc->cte.cte[i].qpn) & 0xffffff;
+                                               vhub_entries[i].sl = rcv->fc->cte.cte[i].sl & 0xf;
+                                               vhub_entries[i].rss = rcv->fc->cte.cte[i].v_rss_type & FIP_CONTEXT_RSS_FLAG ? 1 : 0;
+                                               vhub_entries[i].valid = rcv->fc->cte.cte[i].v_rss_type & FIP_CONTEXT_V_FLAG ? 1 : 0;
+                                               memcpy(vhub_entries[i].mac, rcv->fc->cte.cte[i].mac, sizeof(vhub_entries[i].mac));
+                                               vhub_handle_update(vnic, vhub_id, tusn - rcv->fc->cte.num + i + 1, &vhub_entries[i]);
+                                       }
+                                       kfree(vhub_entries);
+                               }
+
+                               /* update vnic carrier only when the vnic is ready:
+                                * not closing (flush is zero), and the login has
+                                * been pre-created
+                                */
+                               if (!vnic->flush && vnic->login &&
+                                   test_bit(VNIC_STATE_LOGIN_CREATE_1, &vnic->login_state)) {
+                                               vnic_carrier_update(vnic->login);
+                               }
+                               break;
+                       case FIP_GW_TABLE_SUB_OPCODE:
+                               /* handle vhub context table packets */
+                               tusn = be32_to_cpu(rcv->fc->fvt->tusn);
+                               vhub_id = be32_to_cpu(rcv->fc->fvt->vp_vhub_id) & 0xffffff;
+                               vhub_handle_tbl(vnic, rcv->fc, vhub_id, tusn);
+                               break;
+
+                       default:
+                               break;
+                       }
+free_entry:
+                       list_del(&rcv->list);
+                       kfree(rcv->fc);
+                       kfree(rcv->mem);
+                       kfree(rcv);
+               }
+       }
+       return;
+}
+
+/*
+ * Mark the vnic for deletion and trigger a delayed call to the cleanup
+ * function. In the past the vnic was moved to another list, but this
+ * might cause vnic duplication if new vnics are added to the GW. Even
+ * if the vnic is being flushed we need to know it is there.
+ *
+ * Note: this deletion method ensures that all pending vnic work requests
+ * are cleared without depending on the calling context.
+ */
+void fip_vnic_close(struct fip_vnic_data *vnic, enum fip_flush flush)
+{
+       int tmp_flush;
+
+       /* net admin -> full flush */
+       tmp_flush = vnic->hadmined ? flush : FIP_FULL_FLUSH;
+
+       /* child vNic -> full flush */
+       tmp_flush = (!vnic->parent_used) ? tmp_flush : FIP_FULL_FLUSH;
+
+       /* no need for partial cleanup in host admin idle */
+       if (tmp_flush == FIP_PARTIAL_FLUSH &&
+           vnic->state < FIP_VNIC_HADMIN_IDLE)
+               return;
+
+       /* close already in progress, disregard */
+       spin_lock_irq(&vnic->lock);
+       if (vnic->flush >= tmp_flush){
+               spin_unlock_irq(&vnic->lock);
+               return;
+       }
+       if (vnic->flush == FIP_NO_FLUSH && vnic->state > FIP_VNIC_WAIT_4_ACK)
+               fip_update_send(vnic, 0, 1 /* logout */);
+
+       vnic->flush = tmp_flush;
+       cancel_delayed_work(&vnic->vnic_gw_alive_task);
+       cancel_delayed_work(&vnic->vnic_task);
+       spin_unlock_irq(&vnic->lock);
+       /* after this point we should have no work that is not already pending
+        * for execution, and no new work will be added
+        */
+
+       if (vnic->hadmined && tmp_flush == FIP_FULL_FLUSH)
+               vnic_delete_hadmin_dentry(vnic);
+       else if (!vnic->hadmined)
+               /* vnic_count is relevant for net admin only */
+               vnic->gw->vnic_count--;
+
+       vnic_dbg_mark();
+
+       /* calls fip_purge_vnics() */
+       queue_delayed_work(fip_wq, &vnic->gw->vnic_cleanup_task,
+                          DELAYED_WORK_CLEANUP_JIFFS);
+}
+
+/*
+ * This is a helper function we use in order to move the login destroy
+ * to another context so we don't block the fip thread for too long.
+*/
+void fip_vnic_login_destroy(struct work_struct *work)
+{
+       struct fip_vnic_data *vnic =
+               container_of(work, struct fip_vnic_data,
+                            vnic_login_destroy_task);
+       int flush = vnic->flush;
+
+       vnic_login_destroy_wq_stopped(vnic, flush);
+
+       /* we don't want to use a lock here so we will verify that the
+        * flush level did not change between the request and now */
+       if (flush == FIP_FULL_FLUSH)
+               set_bit(VNIC_LOGIN_DESTROY_FULL, &vnic->login_status);
+
+       set_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status);
+}
+
+/*
+ * Free vnic resources. This includes closing the data vnic (data QPs etc.)
+ * and the discovery resources. If the vnic can be totally destroyed (no
+ * pending work) the vnic is removed from the GW and its memory freed.
+ * Otherwise the vnic is not freed and the function returns an error;
+ * the caller needs to call this function again to complete the operation.
+ * Note: do not call this function to remove a vnic, use fip_vnic_close.
+*/
+int fip_vnic_destroy(struct fip_vnic_data *vnic)
+{
+       int pending;
+
+       vnic_dbg_func(vnic->name);
+       vnic_dbg_fip_p0(vnic->name, "fip_vnic_destroy called flow=%d state=%d mac" MAC_6_PRINT_FMT "\n",
+                    vnic->flush, vnic->state, MAC_6_PRINT_ARG(vnic->login_data.mac));
+
+       pending = work_pending(&vnic->vnic_pkt_rcv_task_bh) ||
+               delayed_work_pending(&vnic->vnic_gw_alive_task) ||
+               delayed_work_pending(&vnic->vnic_task);
+
+       /* verify no pending packets before we start tearing down the rings */
+       if (pending || fip_vnic_test_login(vnic, 0) == -EAGAIN)
+               goto retry_later;
+
+       if (!test_and_set_bit(VNIC_LOGIN_DESTROY_PENDING,
+                             &vnic->login_status)) {
+               vnic_login_destroy_stop_wq(vnic, vnic->flush);
+               /* calls fip_vnic_login_destroy() */
+               queue_work(login_wq, &vnic->vnic_login_destroy_task);
+       }
+
+       if (!test_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status))
+               goto retry_later;
+
+       clear_bit(VNIC_LOGIN_DESTROY_DONE, &vnic->login_status);
+       clear_bit(VNIC_LOGIN_DESTROY_PENDING, &vnic->login_status);
+
+       /* We need to test whether, when we queued the destroy request, it was
+        * a partial flush that has since changed to a full flush.
+        * If so we need to try again */
+       if (vnic->flush == FIP_FULL_FLUSH &&
+           !test_bit(VNIC_LOGIN_DESTROY_FULL, &vnic->login_status))
+               goto retry_later;
+
+       hrtimer_cancel(&vnic->keepalive_timer);
+
+       if (vnic->state >= FIP_VNIC_VHUB_INIT) {
+               lag_ctx_clear(vnic);
+               vhub_ctx_free(vnic);
+       }
+
+       /* disconnect from mcast groups */
+       if (vnic->state >= FIP_VNIC_MCAST_INIT) {
+               vnic_mcast_del_all(&vnic->mcast_tree);
+               fip_vnic_rings_destroy(vnic);
+       }
+
+       if (vnic->state > FIP_VNIC_LOGIN)
+               ib_destroy_ah(vnic->ah);
+
+       if (vnic->flush == FIP_PARTIAL_FLUSH) {
+               if (vnic->hadmined) /* we close Host admin vnics so they won't do any login from fip_vnic_fsm */
+                       vnic->state = FIP_VNIC_CLOSED;
+               else
+                       vnic->state = FIP_VNIC_HADMIN_IDLE;
+
+               vnic->flush = FIP_NO_FLUSH;
+               vnic->last_send_jiffs = 0;
+
+               vnic_dbg_fip_v(vnic->name, "fip_vnic_remove partial done vnic->retry_count=%d\n", vnic->retry_count);
+               if (!VNIC_MAX_RETRIES || ++vnic->retry_count < VNIC_MAX_RETRIES)
+                       QUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, FIP_LOGIN_TIMEOUT * HZ);
+
+       } else {
+               list_del(&vnic->gw_vnics);
+               vnic_dbg_fip_v(vnic->name, "fip_vnic_remove full done\n");
+               kfree(vnic);
+       }
+
+       return 0;
+
+retry_later:
+       return -EBUSY;
+}
+
+int fip_vnic_keepalive_send(struct fip_vnic_data *vnic, int source_timer)
+{
+       int update;
+       unsigned long flags;
+       int ret = 0;
+
+       if (vnic->flush != FIP_NO_FLUSH)
+               return ret;
+
+       if (vnic->last_send_jiffs > 1 && jiffies - vnic->last_send_jiffs > vnic->gw->info.vnic_ka_period * 3 / 2)
+               vnic_dbg_fip_p0(vnic->name, "Delaying in sending KA should be %ld actual time=%ld source=%d\n",
+                       vnic->gw->info.vnic_ka_period, jiffies - vnic->last_send_jiffs, source_timer);
+
+       spin_lock_irqsave(&vnic->ka_lock, flags);
+       if (source_timer ||
+           (vnic->last_send_jiffs && jiffies - vnic->last_send_jiffs >
+            vnic->gw->info.vnic_ka_period * 6 / 5)) {
+
+               /* we need to have mcast attached before we ask for a table */
+               if (vnic->state >= FIP_VNIC_VHUB_INIT &&
+                   vnic->vhub_table.state == VHUB_TBL_INIT)
+                       update = 1;
+               else
+                       update = 0;
+
+               /* send vnic keep alive to GW */
+               ret = fip_update_send(vnic, update, 0 /*not logout */);
+               if (!ret)
+                       vnic->last_send_jiffs = jiffies;
+       }
+       spin_unlock_irqrestore(&vnic->ka_lock, flags);
+
+       return ret;
+
+}
+
+//void fip_vnic_keepalive(unsigned long data)
+#ifdef _BP_HR_TIMER
+int fip_vnic_keepalive(struct hrtimer * timer)
+#else
+enum hrtimer_restart fip_vnic_keepalive(struct hrtimer *timer)
+#endif
+{
+//     struct fip_vnic_data *vnic = (struct fip_vnic_data *)data;
+       struct fip_vnic_data *vnic = (struct fip_vnic_data *)
+                                       container_of(timer, struct fip_vnic_data, keepalive_timer);
+       unsigned long flags;
+       ktime_t ktime;   
+       enum hrtimer_restart ret = HRTIMER_NORESTART;
+       int flush;
+
+       spin_lock_irqsave(&vnic->lock, flags);
+       flush = vnic->flush;
+       spin_unlock_irqrestore(&vnic->lock, flags);
+
+       if (flush != FIP_NO_FLUSH)
+               return ret;
+
+       fip_vnic_keepalive_send(vnic, 1);
+
+       /*mod_timer(&vnic->keepalive, jiffies + time);*/
+       ret = HRTIMER_RESTART;
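+       /* vnic_ka_period is in jiffies; convert it to ns for the hrtimer
+        * (1000000000 / HZ ns per jiffy) */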
+       ktime = ktime_set(0, vnic->gw->info.vnic_ka_period * (1000000000 / HZ));
+       hrtimer_forward(&vnic->keepalive_timer, vnic->keepalive_timer.base->get_time(), ktime);
+
+
+       return ret;
+
+}
+
+void fip_vnic_gw_alive(struct work_struct *work)
+{
+       struct fip_vnic_data *vnic =
+               container_of(work, struct fip_vnic_data,
+                            vnic_gw_alive_task.work);
+       long time_to_timeout;
+
+       if (vnic->flush != FIP_NO_FLUSH)
+               return;
+
+       if (!test_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state)) {
+               if (time_after(jiffies, vnic->detached_ka_jiffs + 60*HZ)) {
+                       vnic_dbg_fip_p0(vnic->name, "GW keep alive timeout while mcast is unattached "
+                                    "QPN 0x%06x, LID 0x%04x\n", vnic->qp->qp_num,
+                                    vnic->port->attr.lid);
+                       fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+                       return;
+               } else {
+                       vnic_dbg_fip_p0(vnic->name, "Got ka poll when bcast not "
+                                    "attached QPN 0x%06x, LID 0x%04x, ka=%u\n",
+                                    vnic->qp->qp_num, vnic->port->attr.lid,
+                                    jiffies_to_msecs(jiffies - vnic->detached_ka_jiffs));
+                       time_to_timeout = vnic->gw->info.gw_period;
+               }
+       } else {
+               long jiffs_from_last;
+               jiffs_from_last = (jiffies - vnic->keep_alive_jiffs);
+               time_to_timeout = vnic->gw->info.gw_period - jiffs_from_last;
+       }
+
+       /* TODO: change the receive of an update to re-arm the work timer so
+        * an expiration indicates a true timeout */
+       if (time_to_timeout <= 0) {
+               vnic_dbg_fip_p0(vnic->name, "GW keep alives timed out for "
+                         "QPN 0x%06x, LID 0x%04x timeout=%ld\n", vnic->qp->qp_num,
+                         vnic->port->attr.lid, time_to_timeout);
+               fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+       } else
+               QUEUE_VNIC_DWORK(vnic, &vnic->vnic_gw_alive_task,
+                                time_to_timeout + 1);
+}
+
+struct fip_vnic_data *fip_vnic_alloc(struct vnic_port *port,
+                                    struct fip_gw_data *gw,
+                                    int hadmin, u16 vnic_id)
+{
+       struct fip_vnic_data *vnic;
+
+       vnic = kzalloc(sizeof(struct fip_vnic_data), GFP_KERNEL);
+       if (!vnic) {
+               vnic_err(port->name, "failed to alloc vnic\n");
+               return NULL;
+       }
+
+       vnic->state = hadmin ? FIP_VNIC_HADMIN_IDLE : FIP_VNIC_LOGIN;
+       vnic->vnic_id = vnic_id;
+       vnic->gw = gw;
+       vnic->gw_info = gw->info.vol_info;
+       vnic->port = port;
+       vnic->hadmined = hadmin;
+       vnic->flush = FIP_NO_FLUSH;
+
+       sprintf(vnic->name, "vnic-%d", vnic_id); /* will be overwritten */
+
+       spin_lock_init(&vnic->lock);
+       spin_lock_init(&vnic->ka_lock);
+       INIT_DELAYED_WORK(&vnic->vnic_task, fip_vnic_fsm);
+       INIT_DELAYED_WORK(&vnic->vnic_gw_alive_task, fip_vnic_gw_alive);
+       INIT_WORK(&vnic->vnic_login_destroy_task, fip_vnic_login_destroy);
+       INIT_WORK(&vnic->vnic_login_create_task, fip_vnic_login_create);
+
+
+#ifdef _BP_HR_TIMER
+       hrtimer_init(&vnic->keepalive_timer, CLOCK_MONOTONIC, HRTIMER_REL);
+#else
+       hrtimer_init(&vnic->keepalive_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL );
+#endif
+       vnic->keepalive_timer.function = fip_vnic_keepalive;
+
+       vnic_mcast_root_init(&vnic->mcast_tree);
+       atomic_set(&vnic->eport_state,EPORT_STATE_DOWN);
+
+       return vnic;
+}
+
+int fip_vnic_hadmin_init(struct vnic_port *port, struct fip_vnic_data *vnic)
+{
+       int rc;
+
+       vnic_dbg_func(port->name);
+
+       rc = vnic_login_pre_create_1(port, vnic);
+       if (rc) {
+               vnic_warn(port->name, "vnic_login_pre_create_1 failed, rc %d\n", rc);
+               goto pre_create_failed;
+       }
+
+       strncpy(vnic->login_data.vnic_name, vnic->interface_name,
+               sizeof(vnic->interface_name));
+
+       /* queue login create request */
+       fip_vnic_test_login(vnic, 1);
+
+       return 0;
+
+pre_create_failed:
+       return -ENODEV;
+}
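+
+/*
+ * Illustrative sketch only (the discovery-side callers are not shown here):
+ * a host admin vnic would typically be set up along the lines of
+ *
+ *     vnic = fip_vnic_alloc(port, gw, 1, vnic_id);    (hadmin == 1)
+ *     if (!vnic || fip_vnic_hadmin_init(port, vnic))
+ *             handle the error;
+ *
+ * while net admin vnics are allocated with hadmin == 0 and driven directly
+ * by fip_vnic_fsm().
+ */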
+
+void fip_vnic_create_gw_param(struct fip_vnic_send_info *gw_address, u32 gw_qpn,
+                             u32 qkey, u16 gw_lid, u8 gw_sl)
+{
+       gw_address->gw_qpn = gw_qpn;
+       gw_address->qkey = qkey;
+       gw_address->gw_lid = gw_lid;
+       gw_address->gw_sl = gw_sl;
+}
+
+void fip_vnic_set_gw_param(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address)
+{
+       memcpy(&vnic->gw_address, gw_address, sizeof(vnic->gw_address));
+}
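+
+/*
+ * Typical pairing of the two helpers above (sketch; it mirrors the
+ * FIP_VNIC_HADMIN_IDLE handling in fip_vnic_fsm() below):
+ *
+ *     struct fip_vnic_send_info gw_address;
+ *
+ *     fip_vnic_create_gw_param(&gw_address, gw->info.gw_qpn, VNIC_FIP_QKEY,
+ *                              gw->info.gw_lid, vnic_gw_ctrl_sl(gw));
+ *     fip_vnic_set_gw_param(vnic, &gw_address);
+ */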
+
+int fip_hadmin_vnic_refresh(struct fip_vnic_data *vnic, struct fip_vnic_send_info *gw_address)
+{
+       vnic_dbg_fip(vnic->name, "fip_vnic_to_login host admin flow flush=%d"
+                    " state=%d\n", vnic->flush, vnic->state);
+       if (likely(vnic->flush == FIP_NO_FLUSH) &&
+           vnic->state <= FIP_VNIC_HADMIN_IDLE &&
+           (!VNIC_MAX_RETRIES || vnic->retry_count < VNIC_MAX_RETRIES)) {
+               fip_vnic_set_gw_param(vnic, gw_address);
+               cancel_delayed_work(&vnic->vnic_task);
+               vnic->state = FIP_VNIC_LOGIN;
+               fip_vnic_fsm(&vnic->vnic_task.work);
+       }
+       return 0;
+}
+
+/*
+ * Call the data vnic pre-create 1 + 2 in order to alloc and init the data vnic.
+ * This function updates the QP numbers that the data vnic will use. These QP
+ * numbers are needed for the login.
+ * This function does not clean up on failures. It assumes that the caller will
+ * call the login destroy.
+*/
+static int fip_vnic_login_init(struct vnic_port *port, struct fip_vnic_data *vnic)
+{
+       int qps_num;
+       int rc;
+
+       struct ib_ah_attr ah_attr = {
+               .dlid = vnic->gw_address.gw_lid,
+               .port_num = port->num,
+               .sl = vnic_gw_ctrl_sl(vnic->gw) & 0xf,
+       };
+
+       vnic_dbg_func(vnic->name);
+
+       /* If the driver wants to enable RSS (vnic_rss == 1) then the
+        * number of QPs is what the GW advertises: 1 << n_rss_qpn
+         */
+       qps_num = (port->rx_rings_num > 1) ? (1 << vnic->gw->info.n_rss_qpn) : 1;
+       qps_num = (qps_num == 0) ? 1 : qps_num;
+
+       /* However, we don't support any qps_num, if the GW asks for more than
+        * VNIC_MAX_NUM_CPUS QPs, then we're not going to enable RSS
+        * -- qps_num == 1 means RSS is disabled, otherwise it's enabled
+        */
+       qps_num = qps_num <= VNIC_MAX_NUM_CPUS ? qps_num : 1;
+
+       /* set in vnic, so it can be reported back to the BXM */
+       vnic->qps_num = qps_num;
+
+       /* in host admin vnic->login should be non NULL */
+       if (!vnic->hadmined) {
+               rc = vnic_login_pre_create_1(port, vnic);
+               if (rc) {
+                       vnic_warn(vnic->name,
+                                 "vnic_login_pre_create_1 failed, "
+                                 "rc %d\n", rc);
+                       goto failed;
+               }
+       }
+
+       /* in host admin vnic->login should be non NULL */
+       rc = vnic_login_pre_create_2(vnic, qps_num,
+                                    vnic->gw->info.gw_type == GW_TYPE_LAG);
+       if (rc) {
+               vnic_warn(port->name, "vnic_login_pre_create_2 failed\n");
+               goto failed;
+       }
+
+       /* if parent_used, you must already have the base QPN */
+       ASSERT(!vnic->parent_used || vnic->qp_base_num);
+
+       vnic->ah = ib_create_ah(port->pd, &ah_attr);
+       if (IS_ERR(vnic->ah)) {
+               vnic_warn(vnic->name, "fip_vnic_login_init failed to create ah\n");
+               vnic->ah = NULL;
+               goto failed;
+       }
+
+       vhub_ctx_init(vnic);
+
+       return 0;
+
+failed:
+       return -ENODEV;
+}
+
+/*
+ * create a CQ and QP for the new vNic. Create RX and TX rings for this
+ * QP. Move QP to RTS and connect it to the CQ.
+*/
+static int fip_vnic_rings_create(struct vnic_port *port,
+                                struct fip_vnic_data *vnic)
+{
+       struct ib_qp_init_attr qp_init_attr;
+       int ret;
+
+       vnic->rx_ring.size = FIP_LOGIN_RX_SIZE;
+       vnic->tx_ring.size = FIP_LOGIN_TX_SIZE;
+
+       INIT_WORK(&vnic->vnic_pkt_rcv_task_bh, fip_vnic_recv_bh);
+       spin_lock_init(&vnic->vnic_rcv_list.lock);
+       INIT_LIST_HEAD(&vnic->vnic_rcv_list.list);
+
+       if (ib_find_pkey(port->dev->ca, port->num, vnic->login_data.pkey,
+                        &vnic->login_data.pkey_index)) {
+               vnic_warn(vnic->name,
+                            "fip_vnic_rings_create PKey 0x%04x not found."
+                            " Check configuration in SM/BX\n", vnic->login_data.pkey);
+               goto out_w_err;
+       }
+
+       vnic->pkey = vnic->login_data.pkey;
+       vnic->pkey_index = vnic->login_data.pkey_index;
+
+       vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create pkey id %d "
+                      "for pkey 0x%x\n", (int)vnic->pkey_index,
+                      (int)vnic->pkey);
+
+       vnic->cq = ib_create_cq(port->dev->ca, fip_vnic_comp, NULL, vnic,
+                               vnic->rx_ring.size + vnic->tx_ring.size, 0);
+       if (IS_ERR(vnic->cq)) {
+               vnic_dbg_fip(vnic->name, "failed to create receive CQ\n");
+               goto out_w_err;
+       }
+
+       memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+       qp_init_attr.cap.max_send_wr = vnic->tx_ring.size;
+       qp_init_attr.cap.max_recv_wr = vnic->rx_ring.size;
+       qp_init_attr.cap.max_send_sge = 1;
+       qp_init_attr.cap.max_recv_sge = 1;
+       qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+       qp_init_attr.qp_type = IB_QPT_UD;
+       qp_init_attr.send_cq = vnic->cq;
+       qp_init_attr.recv_cq = vnic->cq;
+
+       vnic->qp = ib_create_qp(port->pd, &qp_init_attr);
+       if (IS_ERR(vnic->qp)) {
+               vnic_dbg_fip(vnic->name, "failed to create QP\n");
+               goto error_free_cq;
+       }
+
+       vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create QPN %d,"
+                      " LID %d\n", (int)vnic->qp->qp_num, (int)port->attr.lid);
+
+       /* move QP from reset to RTS */
+       if (fip_init_qp(vnic->port, vnic->qp, vnic->pkey_index, vnic->name)) {
+               vnic_dbg_fip(vnic->name, "fip_init_qp returned with error\n");
+               goto error_free_qp;
+       }
+
+       ret = fip_init_tx(vnic->tx_ring.size, &vnic->tx_ring, vnic->name);
+       if (ret) {
+               vnic_dbg_fip(vnic->name, "fip_init_tx failed ret %d\n", ret);
+               goto error_free_qp;
+       }
+
+       ret = fip_init_rx(port, vnic->rx_ring.size, vnic->qp,
+                         &vnic->rx_ring, vnic->name);
+       if (ret) {
+               vnic_dbg_fip(vnic->name, "fip_init_rx returned %d\n", ret);
+               goto error_release_rings;
+       }
+
+       /* enable receiving CQ completions */
+       if (ib_req_notify_cq(vnic->cq, IB_CQ_NEXT_COMP))
+               goto error_release_rings;
+       vnic_dbg_fip_v(vnic->name, "fip_vnic_rings_create done OK\n");
+
+       return 0;
+
+error_release_rings:
+       fip_flush_rings(port, vnic->cq, vnic->qp, &vnic->rx_ring,
+                       &vnic->tx_ring, vnic->name);
+       fip_free_rings(port, &vnic->rx_ring, &vnic->tx_ring, vnic->name);
+error_free_qp:
+       ib_destroy_qp(vnic->qp);
+error_free_cq:
+       ib_destroy_cq(vnic->cq);
+out_w_err:
+       vnic->qp = NULL;
+       vnic->cq = NULL;
+       vnic->rx_ring.size = 0;
+       vnic->tx_ring.size = 0;
+       return -ENODEV;
+}
+
+static void fip_vnic_rings_destroy(struct fip_vnic_data *vnic)
+{
+       fip_flush_rings(vnic->port, vnic->cq, vnic->qp, &vnic->rx_ring,
+                       &vnic->tx_ring, vnic->name);
+       fip_free_rings(vnic->port, &vnic->rx_ring, &vnic->tx_ring, vnic->name);
+       fip_vnic_recv_list_flush(vnic);
+       ib_destroy_qp(vnic->qp);
+       ib_destroy_cq(vnic->cq);
+       vnic->qp = NULL;
+       vnic->cq = NULL;
+}
+
+/*
+ * This function is a callback called upon a successful join to a
+ * multicast group. The function checks if we have joined + attached
+ * to all required mcast groups and if so advances the vnic FSM to the
+ * VHUB_INIT state.
+*/
+void fip_vnic_mcast_cnct_cb(struct vnic_mcast *mcast, void *ctx)
+{
+       struct fip_vnic_data *vnic = mcast->priv_data;
+
+       vnic_dbg_fip(vnic->name, "fip_vnic_mcast_cnct_cb\n");
+       vnic_dbg_parse(vnic->name, "attached mask = 0x%lx, req mask = 0x%lx\n",
+                      *mcast->cur_attached, *mcast->req_attach);
+
+       if ((*mcast->cur_attached & *mcast->req_attach) != *mcast->req_attach)
+               return;
+
+       vnic->keep_alive_jiffs = jiffies;
+       set_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state);
+       /* in case of a new mcast connection switch to VHUB_INIT, for a
+        * reconnection stay in the current state */
+       if (vnic->state < FIP_VNIC_VHUB_INIT) {
+               vnic_dbg_fip(vnic->name,
+                       "fip_vnic_mcast_cnct_cb done joining mcasts\n");
+               vnic->state = FIP_VNIC_VHUB_INIT;
+               cancel_delayed_work(&vnic->vnic_task);
+               REQUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, 0);
+       }
+}
+
+/*
+ * This function is a callback called upon a mcast detach event.
+ * This event can be triggered by a vnic request or by an async
+ * event. Currently this code does not participate in the vnic's FSM.
+*/
+void fip_vnic_mcast_deattach_cb(struct vnic_mcast *mcast, void *ctx)
+{
+       struct fip_vnic_data *vnic = mcast->priv_data;
+
+       vnic->detached_ka_jiffs = jiffies;
+       clear_bit(MCAST_ATTACHED, &vnic->vnic_mcaste_state);
+
+       vnic_dbg_fip(vnic->name, "fip_vnic_mcast_deattach_cb\n");
+}
+
+/*
+ * Try to connect to the relevant mcast groups. If one of the mcast joins
+ * fails, the function should be called again to complete the join process
+ * (for the mcast groups whose join was not performed).
+ * Note: a successful return of vnic_mcast_join means that the mcast join
+ * started, not that it completed. Completion of the connection process
+ * is asynchronous and uses a supplied callback.
+ */
+int fip_vnic_mcast_cnct(struct fip_vnic_data *vnic)
+{
+       struct vnic_port *port = vnic->port;
+       union vhub_mgid mgid;
+       struct vnic_mcast *mcaste, *mcaste_upd, *mcaste_tbl;
+       struct vnic_mcast *uninitialized_var(mcaste_ka);
+       int rc;
+
+       vnic_dbg_fip(port->name, "fip_vnic_mcast_cnct called\n");
+
+       mcaste_upd = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached);
+       if (IS_ERR(mcaste_upd))
+               return -EINVAL;
+
+       mcaste_tbl = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached);
+       if (IS_ERR(mcaste_tbl)) {
+               rc = -EINVAL;
+               goto free_upd;
+       }
+
+       set_bit(FIP_MCAST_VHUB_UPDATE, &vnic->req_attach);
+       set_bit(FIP_MCAST_TABLE, &vnic->req_attach);
+
+       vnic_dbg_fip(port->name, "gw type is %d\n", vnic->gw->info.gw_type);
+       if (vnic->gw->info.gw_type == GW_TYPE_LAG) {
+               mcaste_ka = vnic_mcast_alloc(port, &vnic->req_attach, &vnic->cur_attached);
+               if (IS_ERR(mcaste_ka)) {
+                       rc = -EINVAL;
+                       goto free_tbl;
+               }
+               set_bit(FIP_MCAST_VHUB_KA, &vnic->req_attach);
+       }
+
+       mcaste = mcaste_upd;
+       mcaste->priv_data = vnic;
+       mcaste->attach_bit_nr = FIP_MCAST_VHUB_UPDATE;
+       memset(mcaste->mac, 0, ETH_ALEN);
+       vhub_mgid_create(vnic->login_data.mgid_prefix,
+                        mcaste->mac,
+                        vnic->login_data.n_mac_mcgid,
+                        vnic->login_data.vhub_id, VHUB_MGID_UPDATE,
+                        0, &mgid);
+       mcaste->gid = mgid.ib_gid;
+       mcaste->port_gid = mcaste->gid;
+       mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+       mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+       mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+       mcaste->attach_cb = fip_vnic_mcast_cnct_cb;
+       mcaste->detach_cb = fip_vnic_mcast_deattach_cb;
+       mcaste->attach_cb_ctx = NULL;
+       mcaste->detach_cb_ctx = NULL;
+       mcaste->blocking = 0;
+       mcaste->qkey = VNIC_FIP_QKEY;
+       mcaste->pkey = vnic->pkey;
+       mcaste->qp = vnic->qp;
+       mcaste->create = vnic_mcast_create;
+       mcaste->blocking = 0;
+       mcaste->join_state = 1;
+       rc = vnic_mcast_add(&vnic->mcast_tree, mcaste);
+       ASSERT(!rc);
+       rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste);      /* MCAST_RECEIVE_ONLY */
+       ASSERT(!rc);
+
+       mcaste = mcaste_tbl;
+       mcaste->priv_data = vnic;
+       mcaste->attach_bit_nr = FIP_MCAST_TABLE;
+       memset(mcaste->mac, 0, ETH_ALEN);
+       vhub_mgid_create(vnic->login_data.mgid_prefix,
+                        mcaste->mac,
+                        vnic->login_data.n_mac_mcgid,
+                        vnic->login_data.vhub_id, VHUB_MGID_TABLE,
+                        0, &mgid);
+       mcaste->gid = mgid.ib_gid;
+       mcaste->port_gid = mcaste->gid;
+       mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+       mcaste->backoff_factor = VNIC_MCAST_BACKOF_FAC;
+       mcaste->retry = VNIC_MCAST_ULIMIT_RETRY;
+       mcaste->attach_cb = fip_vnic_mcast_cnct_cb;
+       mcaste->detach_cb = fip_vnic_mcast_deattach_cb;
+       mcaste->attach_cb_ctx = NULL;
+       mcaste->detach_cb_ctx = NULL;
+       mcaste->blocking = 0;
+       mcaste->qkey = VNIC_FIP_QKEY;
+       mcaste->pkey = vnic->pkey;
+       mcaste->qp = vnic->qp;
+       mcaste->create = vnic_mcast_create;
+       mcaste->blocking = 0;
+       mcaste->join_state = 1;
+       rc = vnic_mcast_add(&vnic->mcast_tree, mcaste);
+       ASSERT(!rc);
+       rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste);      /* MCAST_RECEIVE_ONLY */
+       ASSERT(!rc);
+
+       if (vnic->gw->info.gw_type != GW_TYPE_LAG)
+               return 0;
+
+       mcaste = mcaste_ka;
+       mcaste->priv_data = vnic;
+       mcaste->attach_bit_nr = FIP_MCAST_VHUB_KA;
+       memset(mcaste->mac, 0, ETH_ALEN);
+       vhub_mgid_create(vnic->login_data.mgid_prefix,
+                        mcaste->mac,
+                        vnic->login_data.n_mac_mcgid,
+                        vnic->login_data.vhub_id, VHUB_MGID_KA,
+                        0, &mgid);
+       mcaste->gid = mgid.ib_gid;
+       mcaste->port_gid = mcaste->gid;
+       mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+       mcaste->backoff_factor = 1;
+       mcaste->retry = VNIC_MCAST_MAX_RETRY;
+       mcaste->attach_cb = fip_vnic_mcast_cnct_cb;
+       mcaste->detach_cb = fip_vnic_mcast_deattach_cb;
+       mcaste->attach_cb_ctx = NULL;
+       mcaste->detach_cb_ctx = NULL;
+       mcaste->blocking = 0;
+       mcaste->qkey = VNIC_FIP_QKEY;
+       mcaste->pkey = vnic->pkey;
+       mcaste->qp = vnic->qp;
+       mcaste->create = vnic_mcast_create;
+       mcaste->blocking = 0;
+       mcaste->join_state = 1;
+       mcaste->sender_only = 1;
+       vnic->ka_mcast_gid = mcaste->gid;
+       rc = vnic_mcast_add(&vnic->mcast_tree, mcaste);
+       ASSERT(!rc);
+       rc = vnic_mcast_attach(&vnic->mcast_tree, mcaste);
+       ASSERT(!rc);
+
+        return 0;
+
+free_tbl:
+       vnic_mcast_dealloc(mcaste_tbl);
+
+free_upd:
+       vnic_mcast_dealloc(mcaste_upd);
+
+       return rc;
+}
+
+/*
+ * This function is the driving engine of the vnic logic. It manages the
+ * vnic's state machine.
+ * Some of the states in the state machine could have been removed because
+ * they contain "actions" and not states. Still, it is easier to maintain
+ * the code this way and it gives an easy mechanism for exception handling
+ * and retries.
+ * Only call this function from fip_wq context.
+*/
+void fip_vnic_fsm(struct work_struct *work)
+{
+       struct fip_vnic_data *vnic =
+               container_of(work, struct fip_vnic_data, vnic_task.work);
+       struct vnic_port *port = vnic->port;
+       int rc, recall_time = 0;
+       const long int msec_in_sec = 1000;
+       struct fip_vnic_send_info gw_address;
+       ktime_t ktime;
+
+       vnic_dbg_fip(port->name, "fip_vnic_fsm called vnic %d\n",
+                    vnic->vnic_id);
+
+       if (vnic->flush != FIP_NO_FLUSH)
+               return;
+
+       switch (vnic->state) {
+       case FIP_VNIC_CLOSED:
+               break;
+       case FIP_VNIC_HADMIN_IDLE:
+               if (vnic->gw->state < FIP_GW_CONNECTED)
+                       break;
+               fip_vnic_create_gw_param(&gw_address, vnic->gw->info.gw_qpn, VNIC_FIP_QKEY,
+                                         vnic->gw->info.gw_lid, vnic_gw_ctrl_sl(vnic->gw));
+               fip_vnic_set_gw_param(vnic, &gw_address);
+               /* fall through */
+
+       case FIP_VNIC_LOGIN:
+               vnic_dbg_fip(port->name, "FIP_VNIC_LOGIN vnic %d\n",
+                            vnic->vnic_id);
+               /* get data QP numbers needed for login request packet. If we fail
+                * we will close the vnic entirely */
+               rc = fip_vnic_login_init(vnic->port, vnic);
+               if (rc) {
+                       fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+                       vnic_warn(vnic->name, "fip_vnic_login_init failed, "
+                                "closing vnic rc %d\n", rc);
+                       break;
+               }
+               vnic->state = FIP_VNIC_WAIT_4_ACK;
+               /* fall through */
+
+       case FIP_VNIC_WAIT_4_ACK:
+               vnic_dbg_fip(port->name, "FIP_VNIC_WAIT_4_ACK vnic %d\n",
+                            vnic->vnic_id);
+               /* resend login request every timeout */
+               vnic_dbg_fip(port->name, "fip_login_send vnic %d\n",vnic->vnic_id);
+               rc = fip_login_send(vnic);
+               if (!rc)
+                       recall_time = FIP_LOGIN_TIMEOUT * msec_in_sec;
+               else
+                       recall_time = 1 * msec_in_sec;
+
+               goto queue_vnic_work;
+
+       case FIP_VNIC_RINGS_INIT:
+               /* create QP and rings */
+               rc = fip_vnic_rings_create(vnic->port, vnic);
+               if (rc) {
+                       fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+                       vnic_warn(vnic->name, "fip_vnic_rings_create failed, "
+                                 "closing vnic rc=%d\n", rc);
+                       break;
+               }
+
+       vnic->last_send_jiffs = 1; /* use a non-zero value to start transmission */
+               {
+                       /* start vnic UCAST KA packets; this will also cause the
+                        * BXM to send us the neighbor table */
+                       if (vnic->gw->info.gw_type != GW_TYPE_LAG) {
+                               ktime = ktime_set(0, 0);
+#ifdef _BP_HR_TIMER
+                               hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_REL );
+#else
+                               hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_MODE_REL );
+#endif
+                       }
+               }
+
+               vnic->state = FIP_VNIC_MCAST_INIT;
+               /* fall through */
+
+       case FIP_VNIC_MCAST_INIT:
+               rc = fip_vnic_mcast_cnct(vnic);
+               if (rc) {
+                       vnic_warn(vnic->name,
+                                    "fip_vnic_mcast_cnct failed, rc %d\n", rc);
+                       /* try again later */
+                       recall_time = 1 * msec_in_sec;
+                       goto queue_vnic_work;
+               }
+               vnic->state = FIP_VNIC_MCAST_INIT_DONE;
+               /* fall through */
+
+       case FIP_VNIC_MCAST_INIT_DONE:
+               /* wait for mcast attach CB before continuing */
+               break;
+
+       case FIP_VNIC_VHUB_INIT:
+
+               /* the previous KA, if sent, did not request a table because MCASTs
+                * were not available. Send an extra KA packet that should trigger a
+                * table request in order to hasten things up */
+               fip_vnic_keepalive_send(vnic, 1);
+
+               if (vnic->gw->info.gw_type == GW_TYPE_LAG) {
+                       /* start vnic MCAST KA packets; this will also cause the
+                        * BXM to send us the neighbor table */
+                       ktime = ktime_set(0, 0);
+#ifdef _BP_HR_TIMER
+                       hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_REL );
+#else
+                       hrtimer_start(&vnic->keepalive_timer, ktime, HRTIMER_MODE_REL );
+#endif
+               }
+
+               /* start tracking GW keep alives; calls fip_vnic_gw_alive() */
+               QUEUE_VNIC_DWORK(vnic, &vnic->vnic_gw_alive_task,
+                                vnic->gw->info.gw_period);
+
+               vnic->state = FIP_VNIC_VHUB_INIT_DONE;
+               /* fall through */
+
+       case FIP_VNIC_VHUB_INIT_DONE:
+               /* we are waiting to receive a full vhub table. The KA will handle
+                * retries if we do not get the table we are expecting */
+
+               /* queue login create request */
+               if (fip_vnic_test_login(vnic, 1)) {
+                       recall_time = 1 * msec_in_sec;
+                       goto queue_vnic_work;
+               }
+
+               break;
+
+       case FIP_VNIC_VHUB_DONE:
+               if (fip_vnic_test_login(vnic, 1)) {
+                       recall_time = 1 * msec_in_sec;
+                       goto queue_vnic_work;
+               }
+
+               if (vnic_login_complete_ack(vnic, &vnic->login_data, &vnic->shared_vnic)) {
+                       vnic_warn(vnic->name,
+                                    "vnic_login_complete_ack failed\n");
+                       recall_time = 1 * msec_in_sec;
+                       goto queue_vnic_work;
+               }
+
+               /* for LAG write member info */
+               fip_vnic_write_members(vnic);
+
+               vnic->state = FIP_VNIC_VHUB_WRITE;
+               /* fall through */
+
+       case FIP_VNIC_VHUB_WRITE:
+               /* write the vhub table to login */
+               fip_vnic_write_tbl(vnic);
+               vnic->state = FIP_VNIC_CONNECTED;
+               /* fall through */
+
+       case FIP_VNIC_CONNECTED:
+               vnic->retry_count = 0;
+               break;
+       default:
+               ASSERT(0);
+               break;
+       }
+
+       vnic_dbg_fip(port->name, "state %d gw_lid %d gw_qpn %d\n",
+                    vnic->state, vnic->gw_address.gw_lid, vnic->gw_address.gw_qpn);
+       return;
+
+queue_vnic_work:
+       QUEUE_VNIC_DWORK(vnic, &vnic->vnic_task, recall_time * HZ / msec_in_sec);
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_main.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_main.c
new file mode 100644 (file)
index 0000000..07a6f2e
--- /dev/null
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+
+struct workqueue_struct *fip_wq;
+
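+/*
+ * Refresh all FIP-related multicast state: re-attach the discovery mcast
+ * groups, then detach and re-attach the mcast tree of every vnic that is
+ * past MCAST_INIT and not being fully flushed, and restart the control
+ * path query for GWs that already reached the path-query state when SA
+ * path queries are enabled.
+ */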
+void fip_refresh_mcasts(struct fip_discover *discover)
+{
+       struct fip_gw_data *gw;
+       struct fip_vnic_data *vnic;
+
+       fip_discover_mcast_reattach(discover, discover->port);
+
+       down_read(&discover->l_rwsem);
+       list_for_each_entry(gw, &discover->gw_list, list)
+               list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+                       if (vnic->flush != FIP_FULL_FLUSH && vnic->state > FIP_VNIC_MCAST_INIT)
+                               vnic_tree_mcast_detach(&vnic->mcast_tree);
+               }
+
+       list_for_each_entry(gw, &discover->gw_list, list) {
+               list_for_each_entry(vnic, &gw->vnic_list, gw_vnics) {
+                       if (vnic->flush != FIP_FULL_FLUSH && vnic->state > FIP_VNIC_MCAST_INIT)
+                               vnic_tree_mcast_attach(&vnic->mcast_tree);
+               }
+               /* restart path query */
+               if (vnic_sa_query && gw->state >= FIP_GW_CTRL_PATH_QUERY && gw->flush == FIP_NO_FLUSH)
+                       fip_discover_gw_fsm_move(gw, FIP_GW_CTRL_PATH_QUERY);
+       }
+       up_read(&discover->l_rwsem);
+}
+
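+/*
+ * Restart FIP discovery on a port: re-query the port, drop all vnic login
+ * mcasts, clean up every discovery instance and re-initialize it (re-init
+ * is skipped if a cleanup reports it was flushed). Runs from the
+ * discover_restart_task work item under start_stop_lock.
+ */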
+void port_fip_discover_restart(struct work_struct *work)
+{
+       struct vnic_port *port =
+           container_of(work, struct vnic_port, discover_restart_task.work);
+       struct fip_discover *discover;
+       struct vnic_login *login;
+
+       vnic_dbg_mark();
+       mutex_lock(&port->start_stop_lock);
+       vnic_dbg_mark();
+       mutex_lock(&port->mlock);
+       if (vnic_port_query(port))
+               vnic_warn(port->name, "vnic_port_query failed\n");
+
+       /* bring vnics links down */
+       list_for_each_entry(login, &port->login_list, list)
+               vnic_mcast_del_all(&login->mcast_tree);
+
+       mutex_unlock(&port->mlock);
+       list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+               if (fip_discover_cleanup(port, discover, 0)) {
+                       vnic_dbg(port->name, "fip_discover_cleanup flushed\n");
+                       goto out;
+               }
+       }
+
+       list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+               if (fip_discover_init(port, discover, discover->pkey, 0)) {
+                       vnic_warn(port->name, "failed to alloc discover resources\n");
+               }
+       }
+out:
+       mutex_unlock(&port->start_stop_lock);
+       return;
+}
+
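+/*
+ * Tear down all FIP discovery instances on a port: remove each entry from
+ * the discover list, clean it up and free it. 'lock' selects whether
+ * start_stop_lock is taken here or is already held by the caller.
+ */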
+void vnic_port_fip_cleanup(struct vnic_port *port, int lock)
+{
+       struct fip_discover *discover, *tmp_discover;
+
+       if (lock)
+               mutex_lock(&port->start_stop_lock);
+
+       list_for_each_entry_safe(discover, tmp_discover, &port->fip.discover_list, discover_list) {
+               vnic_dbg_fip_p0(port->name, "Discovery cleanup of PKEY=0x%x\n", discover->pkey);
+
+               list_del(&discover->discover_list);
+               vnic_info("Removed fip discovery %s port %d pkey 0x%x\n",
+                         port->dev->ca->name, port->num, discover->pkey);
+               fip_discover_cleanup(port, discover, 1);
+               kfree(discover);
+       }
+
+       if (lock)
+               mutex_unlock(&port->start_stop_lock);
+}
+
+
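+/*
+ * Create one FIP discovery instance per configured discovery PKEY. The PKEY
+ * count is clamped to MAX_NUM_PKEYS_DISCOVERY, a default full-membership
+ * PKEY 0xffff is used when none was configured, and the full-membership bit
+ * (0x8000) is forced on every PKEY before fip_discover_init() is called.
+ */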
+int vnic_port_fip_init(struct vnic_port *port)
+{
+       int rc;
+       struct fip_discover *discover;
+       int i;
+
+       if (no_bxm)
+               return 0;
+
+       vnic_discovery_pkeys_count = vnic_discovery_pkeys_count > MAX_NUM_PKEYS_DISCOVERY ?
+               MAX_NUM_PKEYS_DISCOVERY : vnic_discovery_pkeys_count;
+
+       if (vnic_discovery_pkeys_count == 0 ||
+           (vnic_discovery_pkeys_count == MAX_NUM_PKEYS_DISCOVERY &&
+            vnic_discovery_pkeys[0] == 0)) {
+               vnic_discovery_pkeys[0] = 0xffff;
+               vnic_discovery_pkeys_count = 1;
+               vnic_dbg_fip_p0(port->name, "Creating default PKEY for Discovery\n");
+       }
+
+       mutex_lock(&port->start_stop_lock);
+
+       for (i = 0; i < vnic_discovery_pkeys_count; i++) {
+               vnic_discovery_pkeys[i] &= 0xffff;
+               vnic_discovery_pkeys[i] |= 0x8000;
+
+               vnic_dbg_fip_p0(port->name, "Init Discovery=%d on PKEY=0x%x\n", i, vnic_discovery_pkeys[i]);
+
+               discover = kzalloc(sizeof(struct fip_discover), GFP_KERNEL);
+               if (!discover) {
+                       vnic_warn(port->name, "discover alloc failed\n");
+                       rc = -ENOMEM;
+                       goto fail;
+               }
+
+               INIT_LIST_HEAD(&discover->discover_list);
+
+               vnic_info("Added fip discovery %s port %d PKEY 0x%x\n",
+                         port->dev->ca->name, port->num,
+                         vnic_discovery_pkeys[i]);
+
+               list_add_tail(&discover->discover_list, &port->fip.discover_list);
+               rc = fip_discover_init(port, discover, vnic_discovery_pkeys[i], 1);
+               if (rc) {
+                       vnic_warn(port->name, "fip_discover_init pkey=0x%x "
+                                 "failed\n", discover->pkey);
+                       list_del(&discover->discover_list);
+                       kfree(discover);
+                       goto fail;
+               }
+       }
+       mutex_unlock(&port->start_stop_lock);
+       return 0;
+
+fail:
+       mutex_unlock(&port->start_stop_lock);
+       vnic_port_fip_cleanup(port, 1);
+       return rc;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.c
new file mode 100644 (file)
index 0000000..078d4aa
--- /dev/null
@@ -0,0 +1,863 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/uts.h>
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+#include "vnic_fip_pkt.h"
+
+const struct eoib_host_update base_update_pkt = {
+       .fip.subcode = FIP_HOST_ALIVE_SUB_OPCODE,
+       .fip.type.type = FIP_FIP_HDR_TYPE,
+       .fip.type.length = FIP_FIP_HDR_LENGTH,
+       .fip.vendor_id = FIP_VENDOR_MELLANOX,
+
+       .type_1.type = FIP_HOST_UPDATE_TYPE,
+       .type_1.length = FIP_HOST_UPDATE_LENGTH,
+       .vendor_id = FIP_VENDOR_MELLANOX,
+};
+
+const struct eoib_host_update base_logout_pkt = {
+       .fip.subcode = FIP_HOST_LOGOUT_SUB_OPCODE,
+       .fip.type.type = FIP_FIP_HDR_TYPE,
+       .fip.type.length = FIP_FIP_HDR_LENGTH,
+       .fip.vendor_id = FIP_VENDOR_MELLANOX,
+
+       .type_1.type = FIP_LOGOUT_TYPE_1,
+       .type_1.length = FIP_LOGOUT_LENGTH_1,
+       .vendor_id = FIP_VENDOR_MELLANOX,
+};
+
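+/*
+ * Walk the vendor-extended sub-TLVs of a GW advertisement descriptor and
+ * fill the matching fields of fip_gw_data_info: capabilities (currently
+ * ignored), LAG properties, boot priority/timeout and power cycle ID.
+ * Unknown mandatory sub-TLVs fail the parse; unknown non-mandatory ones
+ * are skipped.
+ */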
+static int extract_adv_extended(struct fip_ext_desc_tlv *fed,
+                               struct fip_gw_data_info *info)
+{
+       struct fip_ext_type_cap *extended_cap;
+       struct fip_ext_type_boot *extended_boot;
+       struct fip_ext_type_power_cycle_id *extended_pc_id;
+       struct fip_ext_type_lag_props *extended_lag = NULL;
+       struct fip_extended_type *ext_hdr;
+       int length_to_go, ext_length;
+       
+       vnic_dbg_parse("", "extracting extended descriptor\n");
+
+       length_to_go = (((int)fed->ft.length) << 2) - sizeof(*fed);
+       ext_hdr = (struct fip_extended_type *)(fed + 1);
+
+       while (length_to_go > 0) {
+               ext_length = ((int)ext_hdr->len) << 2;
+
+               vnic_dbg_parse(NULL, "Advertise parse, sub-tlv "
+                              "type  %d length %d address=%p\n",
+                              ext_hdr->ext_type, ext_length, ext_hdr);
+
+               if (ext_length < sizeof(*ext_hdr) ||
+                   ext_length > length_to_go) {
+                       vnic_dbg_parse(NULL, "Extended length error. "
+                                      "Length=%d\n", ext_length);
+                       return -EINVAL;
+               }
+
+               if (ext_hdr->ext_type == ADV_EXT_TYPE(CAP) &&
+                   ext_length == sizeof(*extended_cap)) {              /* capabilities*/
+                       /* do nothing */
+               } else if (ext_hdr->ext_type == ADV_EXT_TYPE(LAG) &&    /* LAG */
+                          ext_length == sizeof(*extended_lag)) {
+                       extended_lag = (struct fip_ext_type_lag_props *)ext_hdr;
+                       info->gw_type = extended_lag->gw_type;
+                       info->ext_lag.hash =  be16_to_cpu(extended_lag->lag_hash);
+                       info->ext_lag.weights_policy = extended_lag->weight_policy_flags >> 4;
+                       info->ext_lag.member_ka = (extended_lag->weight_policy_flags & 0x8) >> 3;
+                       info->ext_lag.ca = !!(extended_lag->weight_policy_flags &
+                                               FIP_EXT_LAG_W_POLICY_HOST);
+                       info->ext_lag.ca_thresh = extended_lag->ca_threshold;
+                       info->ext_lag.ucast = !!(extended_lag->weight_policy_flags &
+                                                FIP_EXT_LAG_W_POLICY_UCAST);
+                       info->ext_lag.valid = 1;
+               } else if (ext_hdr->ext_type == ADV_EXT_TYPE(BOOT) &&
+                          ext_length == sizeof(*extended_boot)) {      /* boot */
+                       extended_boot = (struct fip_ext_type_boot *)ext_hdr;
+                       info->ext_boot.boot_prio = extended_boot->boot_prio;
+                       info->ext_boot.timeout = extended_boot->discovery_timeout;
+                       info->ext_boot.valid = 1;
+               } else if (ext_hdr->ext_type == ADV_EXT_TYPE(PC_ID) && 
+                          ext_length == sizeof(*extended_pc_id)) { /* Power Cycle ID */
+                       extended_pc_id = (struct fip_ext_type_power_cycle_id *)ext_hdr;
+                       info->ext_pc_id.power_cycle_id =
+                               be64_to_cpu(extended_pc_id->power_cycle_id);
+                       info->ext_pc_id.valid = 1;
+               } else if (ext_hdr->mandatory & 0x01) {
+                       vnic_dbg_parse(NULL, "Advertise parse, unknown"
+                                      " mandatory extended type %d length %d\n",
+                                      ext_hdr->ext_type, ext_length);
+                       return -EINVAL;
+               } else
+                       vnic_dbg_parse(NULL, "Advertise parse, unknown "
+                                      "non-mandatory extended. Skipping, type"
+                                      " %d length %d\n",
+                                      ext_hdr->ext_type, ext_length);
+
+               ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length);
+               length_to_go -= ext_length;
+       }
+
+       return 0;
+}
+
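+/*
+ * Parse a GW advertisement (bottom half): require the mandatory address,
+ * GW-information, GID and keep-alive descriptors, extract the GW address,
+ * vnic limits and keep-alive periods (the advertise period is clamped to at
+ * least 2 seconds), and, when an extended descriptor is present, mark the
+ * GW as new-protocol and parse it.
+ */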
+int fip_advertise_parse_bh(struct fip_discover *discover, struct fip_content *fc,
+                          struct fip_gw_data *data)
+{
+       long ka_time;
+       int err = 0;
+
+       /* make sure we have at least a single address descriptor */
+       if (fc->fa.num < 1 || !fc->fgwi || !fc->fgid || !fc->fka)
+               return -EINVAL;
+
+       data->info.flags = be16_to_cpu(fc->fh->flags) & FIP_FIP_ADVRTS_FLAG ? FIP_GW_AVAILABLE : 0;
+
+       data->info.flags |=
+           (be16_to_cpu(fc->fh->flags) & FIP_FIP_SOLICITED_FLAG) ? 0 :
+           FIP_RCV_MULTICAST;
+
+       data->info.flags |= FIP_IS_FIP;
+       data->info.flags |= (fc->fh->flags & FIP_ADVERTISE_HOST_VLANS) ?
+           FIP_HADMINED_VLAN : 0;
+
+       data->info.gw_qpn = be32_to_cpu(fc->fa.fa[0]->gwtype_qpn) & 0xffffff;
+       data->info.gw_lid = be16_to_cpu(fc->fa.fa[0]->lid);
+       data->info.gw_port_id = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) &
+               FIP_ADVERTISE_GW_PORT_ID_MASK;
+       data->info.gw_sl = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT; /*ignore this value.*/
+       memcpy(data->info.gw_guid, fc->fa.fa[0]->guid, sizeof(data->info.gw_guid));
+       data->info.gw_num_vnics = be16_to_cpu(fc->fgwi->n_rss_qpn_vnics) &
+               FIP_ADVERTISE_NUM_VNICS_MASK;
+
+       data->info.n_rss_qpn = be16_to_cpu(fc->fgwi->n_rss_qpn_vnics) >>
+               FIP_ADVERTISE_N_RSS_SHIFT;
+       data->info.hadmined_en = (fc->fgwi->h_nmac_mgid & FIP_ADVERTISE_HOST_EN_MASK);
+       data->info.all_vlan_gw = !!(fc->fgwi->h_nmac_mgid & FIP_ADVERTISE_ALL_VLAN_GW_MASK);
+
+       TERMINATED_MEMCPY(data->info.gw_vendor_id, fc->fgwi->vendor_id);
+       memcpy(data->info.vol_info.system_guid, fc->fgid->sys_guid,
+              sizeof(data->info.vol_info.system_guid));
+       TERMINATED_MEMCPY(data->info.vol_info.system_name,
+                         fc->fgid->sys_name);
+       TERMINATED_MEMCPY(data->info.vol_info.gw_port_name, fc->fgid->gw_port_name);
+
+       ka_time = be32_to_cpu(fc->fka->adv_period);
+       ka_time = ka_time ? ka_time : FKA_ADV_PERIOD;
+       /* do not let KA go under 2 secs */
+       ka_time = (ka_time < 2000) ? 2000 : ka_time;
+       data->info.gw_adv_period = FIP_TIMEOUT_FACTOR(msecs_to_jiffies(ka_time));
+
+       ka_time = be32_to_cpu(fc->fka->ka_period);
+       ka_time = ka_time ? ka_time : FKA_ADV_PERIOD;
+       data->info.gw_period = FIP_TIMEOUT_FACTOR(msecs_to_jiffies(ka_time));
+
+       ka_time = be32_to_cpu(fc->fka->vnic_ka_period);
+       ka_time = ka_time ? ka_time : FKA_ADV_PERIOD;
+       data->info.vnic_ka_period = msecs_to_jiffies(ka_time);
+
+       data->info.gw_type = GW_TYPE_SINGLE_EPORT;
+       if (fc->fed.num > 0) {
+               if (fc->fed.num == 1) {
+                       /* new version bxm mode */
+                       data->info.gw_prot_new = 1;
+                       err = extract_adv_extended(fc->fed.fed[0], &data->info);
+                       if (err)
+                               vnic_dbg_parse(discover->name, "invalid extended descripotr\n");
+               } else {
+                       vnic_dbg_parse(discover->name, "too many extended descripotrs\n");
+                       return -EINVAL;
+               }
+       }
+
+       return err;
+}
+
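+/*
+ * Map a pre-built FIP packet and post it to a multicast address handle.
+ * The TX ring head/tail are checked under the ring lock; -EAGAIN is
+ * returned when the ring is full, and the DMA mapping is undone if the
+ * send itself fails.
+ */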
+static int send_generic_mcast_pkt(struct vnic_port *port,
+                                 struct fip_ring *tx_ring,
+                                 void *mem, int pkt_size,
+                                 struct ib_qp *qp,
+                                 int pkey_index,
+                                 struct vnic_mcast *mcast)
+{
+       int index, rc;
+       unsigned long flags;
+       unsigned long tail;
+
+       /*
+        * we are only allowed to update the head at task level so no need to
+        * perform any locks here
+        */
+       spin_lock_irqsave(&tx_ring->ring_lock, flags);
+       index = tx_ring->head & (tx_ring->size - 1);
+       vnic_dbg_fip(port->name, "mcast packet\n");
+
+       spin_lock(&tx_ring->head_tail_lock);
+       tail = tx_ring->tail;
+       spin_unlock(&tx_ring->head_tail_lock);
+
+       /* ring full try again */
+       if (tx_ring->head - tail >=  tx_ring->size) {
+               vnic_warn(port->name, "send_generic_mcast_pkt ring full: QPN 0x%x: tail=%ld head=%ld diff=%ld\n",
+                         qp->qp_num, tx_ring->tail, tx_ring->head, tx_ring->head - tx_ring->tail);
+               rc = -EAGAIN;
+               goto err;
+       }
+
+       rc = _map_generic_pkt(port, &tx_ring->ring[index], mem, pkt_size);
+       if (rc)
+               goto err;
+
+       rc = fip_mcast_send(port, qp, index,
+                           tx_ring->ring[index].bus_addr,
+                           pkt_size, pkey_index, mcast);
+
+       if (rc) {
+               vnic_warn(port->name,
+                         "send_generic_mcast_pkt: fip_mcast_send ret %d\n",
+                         rc);
+               rc = -ENODEV;
+               goto error_unmap_dma;
+       }
+
+       tx_ring->head++;
+
+       spin_unlock_irqrestore(&tx_ring->ring_lock, flags);
+       return 0;
+
+error_unmap_dma:
+       ib_dma_unmap_single(port->dev->ca,
+                           tx_ring->ring[index].bus_addr,
+                           pkt_size, DMA_TO_DEVICE);
+
+err:
+       spin_unlock_irqrestore(&tx_ring->ring_lock, flags);
+       return rc;
+}
+
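+/*
+ * Allocate and pre-fill a solicit packet. The legacy layout carries only
+ * the vendor and address descriptors; the new-protocol layout additionally
+ * carries the extended capability and hostname sub-TLVs.
+ */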
+static void *alloc_solicit_pkt(int new_prot, char *node_desc)
+{
+       void *ptr;
+       struct fip_solicit_new *nptr;
+       struct fip_solicit_legacy *optr;
+       int size = new_prot ? sizeof *nptr : sizeof *optr;
+
+       ptr = kzalloc(size, GFP_KERNEL);
+       if (!ptr)
+               return ERR_PTR(-ENOMEM);
+       optr = ptr;
+       optr->version.version = 1;
+       optr->fh.opcode = cpu_to_be16(EOIB_FIP_OPCODE);
+       optr->fh.subcode = FIP_HOST_SOL_SUB_OPCODE;
+       optr->fh.list_length = cpu_to_be16((size - offsetof(typeof(*optr), fvend)) / 4);
+       optr->fvend.ft.type = FIP_TYPE(VENDOR_ID);
+       optr->fvend.ft.length = sizeof optr->fvend / 4; 
+       strncpy(optr->fvend.vendor_id, "mellanox", sizeof optr->fvend.vendor_id);
+       optr->addr.ft.type = FIP_TYPE(ADDRESS);
+       optr->addr.ft.length = sizeof optr->addr / 4;
+       strncpy(optr->addr.vendor_id, "mellanox", sizeof optr->addr.vendor_id);
+       if (new_prot) {
+               nptr = ptr;
+               nptr->ext.ft.type = 254;
+               nptr->ext.ft.length = sizeof nptr->ext / 4;
+               strncpy(nptr->ext.vendor_id, "mellanox", sizeof nptr->ext.vendor_id);
+               nptr->ext_cap.et.ext_type = 40;
+               nptr->ext_cap.et.len = sizeof nptr->ext_cap / 4;
+               nptr->ext_cap.et.mandatory = 1;
+               nptr->ext_hostname.et.ext_type = 39;
+               nptr->ext_hostname.et.len = sizeof nptr->ext_hostname / 4;
+               strncpy(nptr->ext_hostname.hostname, node_desc, sizeof nptr->ext_hostname.hostname);
+       }
+
+       return ptr;
+}
+
+int fip_solicit_send(struct fip_discover *discover,
+                    enum fip_packet_type multicast,
+                    u32 dqpn, u16 dlid, u8 sl, int new_prot)
+{
+       int rc = 0;
+       unsigned long flags, flags1;
+       struct fip_solicit_legacy *optr;
+       int size = new_prot ? sizeof(struct fip_solicit_new) : sizeof *optr;
+
+       ASSERT(discover);
+
+       /* alloc packet to be sent */
+       optr = alloc_solicit_pkt(new_prot, discover->port->dev->ca->node_desc);
+       if (IS_ERR(optr))
+               return PTR_ERR(optr);
+
+       /* we set bit 24 to signify that we're a new host */
+       optr->addr.gwtype_qpn = cpu_to_be32(discover->qp->qp_num | 0x1000000);
+       optr->addr.lid = cpu_to_be16(discover->port->attr.lid);
+       /* send the SL to the GW*/
+       optr->addr.sl_gwportid = cpu_to_be16(sl << FIP_ADVERTISE_SL_SHIFT);
+
+       memcpy(optr->addr.guid, &discover->port->gid.global.interface_id, sizeof(optr->addr.guid));
+       vnic_dbg_fip(discover->name, "fip_solicit_send creating multicast %d"
+                    " solicit packet\n", multicast);
+
+       fip_dbg_dump_raw_pkt(0, optr, size, 1, "sending solicit packet");
+
+       if (multicast) {
+               struct vnic_mcast *mcaste;
+               union ib_gid gid;
+
+               memcpy(&gid, fip_solicit_mgid, GID_LEN);
+               spin_lock_irqsave(&discover->mcast_tree.mcast_rb_lock, flags);
+               mcaste = vnic_mcast_search(&discover->mcast_tree, &gid);
+               /* it is possible for the MCAST entry or AH to be missing in
+                * transient states (after events). This is a valid condition
+                * but we can't send packet
+                */
+               if (!IS_ERR(mcaste) && mcaste->ah) {
+                       spin_lock_irqsave(&mcaste->lock, flags1);
+                       rc = send_generic_mcast_pkt(discover->port, &discover->tx_ring,
+                                           optr, size, discover->qp,
+                                           discover->pkey_index,
+                                           mcaste);
+                       spin_unlock_irqrestore(&mcaste->lock, flags1);
+               } else
+                       kfree(optr);
+
+               spin_unlock_irqrestore(&discover->mcast_tree.mcast_rb_lock, flags);
+       } else {
+               rc = send_generic_ucast_pkt(discover->port, NULL, &discover->tx_ring,
+                                           optr, size, discover->qp,
+                                           discover->pkey_index,
+                                           dqpn, dlid, VNIC_FIP_QKEY, sl);
+       }
+       if (rc)
+               goto error_free_mem;
+
+       return 0;
+
+error_free_mem:
+       vnic_warn(discover->name, "discover_send error ret %d\n", rc);
+       kfree(optr);
+       return -ENOMEM;
+}
+
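+/*
+ * Allocate and fill a unicast login request from the vnic's login data:
+ * address descriptor (QPN/LID/GUID/SL), login descriptor (vnic id, MAC,
+ * VLAN, name), host-admin flags when relevant, the all-VLAN GW flag and
+ * the implicit-logout bits used for child vNics.
+ */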
+static void *alloc_login_pkt(struct fip_vnic_data *vnic)
+{
+       struct eoib_login *ptr;
+       int size = sizeof *ptr;
+
+       ptr = kzalloc(size, GFP_KERNEL);
+       if (!ptr)
+               return ERR_PTR(-ENOMEM);
+
+       ptr->eoib_ver.version = 1;
+       ptr->fh.opcode = cpu_to_be16(EOIB_FIP_OPCODE);
+       ptr->fh.subcode = FIP_HOST_LOGIN_SUB_OPCODE;
+       ptr->fh.list_length = cpu_to_be16((size - offsetof(typeof(*ptr), fvend)) / 4);
+       ptr->fvend.ft.type = FIP_TYPE(VENDOR_ID);
+       ptr->fvend.ft.length = sizeof ptr->fvend / 4; 
+       strncpy(ptr->fvend.vendor_id, "mellanox", sizeof ptr->fvend.vendor_id);
+       ptr->fa.ft.type = FIP_TYPE(ADDRESS);
+       ptr->fa.ft.length = sizeof ptr->fa / 4;
+       strncpy(ptr->fa.vendor_id, "mellanox", sizeof ptr->fa.vendor_id);
+       ptr->fa.gwtype_qpn = cpu_to_be32(vnic->qp_base_num);
+       ptr->fa.sl_gwportid = cpu_to_be16(vnic->gw->info.gw_port_id);
+       /* sl will be taken from the data path record query */
+       ptr->fa.sl_gwportid |= cpu_to_be16(vnic->gw->data_prec.sl << FIP_ADVERTISE_SL_SHIFT);
+       ptr->fa.lid = cpu_to_be16(vnic->port->attr.lid);
+       memcpy(ptr->fa.guid, &vnic->port->gid.global.interface_id, sizeof ptr->fa.guid);
+       ptr->fl.ft.type = FIP_TYPE(LOGIN);
+       ptr->fl.ft.length = sizeof ptr->fl / 4;
+       strncpy(ptr->fl.vendor_id, "mellanox", sizeof ptr->fl.vendor_id);
+       ptr->fl.vnic_id = cpu_to_be16(vnic->vnic_id);
+
+       if (vnic->hadmined) {
+               int mac_valid = !IS_ZERO_MAC(vnic->login_data.mac);
+               u16 flags = (mac_valid ? FIP_LOGIN_M_FLAG : 0) |
+                           FIP_LOGIN_H_FLAG |
+                           (vnic->login_data.vp ? FIP_LOGIN_VP_FLAG  | FIP_LOGIN_V_FLAG : 0);
+               ptr->fl.flags_vlan = cpu_to_be16(vnic->login_data.vlan | flags );
+               memcpy(ptr->fl.mac, vnic->login_data.mac, sizeof ptr->fl.mac);
+               memcpy(ptr->fl.vnic_name, vnic->login_data.vnic_name, sizeof ptr->fl.vnic_name);
+
+               /* TODO: remove this when BXM handles 0 addresses */
+               if (!mac_valid)
+                       ptr->fl.mac[ETH_ALEN-1] = 1;
+       }
+
+       /* all_vlan mode must be enforced between the host and the GW side.
+        * For a host admin vnic with a VLAN we let the host choose the work mode.
+        * If the GW isn't working in that same mode, the login will fail and the
+        * host will enter a login-retry loop.
+        * For a net admin vnic or a host admin vnic without a VLAN, we work in
+        * the mode published by the GW. */
+       if (vnic->gw->info.all_vlan_gw &&
+           (!vnic->hadmined ||
+            (vnic->hadmined && !vnic->login_data.vp)))
+               ptr->fl.vfields |= cpu_to_be16(FIP_LOGIN_ALL_VLAN_GW_FLAG);
+
+       ptr->fl.syndrom_ctrl_qpn = cpu_to_be32(vnic->gw->discover->qp->qp_num);
+       ptr->fl.vfields |= cpu_to_be16((vnic->qps_num > 1) << 12);
+
+       /* for child vNics, allow implicit logout */
+       if (vnic->parent_used) {
+               ptr->fl.vfields |= cpu_to_be16(1 << 14);
+               ptr->fl.vfields |= cpu_to_be16(1 << 13);
+       }
+
+       return ptr;
+}
+
+/*
+ * Send a unicast login packet. This function supports both host- and
+ * network-admined logins. The function returns 0 on success and an
+ * error code on failure.
+ */
+int fip_login_send(struct fip_vnic_data *vnic)
+{
+       int ret;
+       struct eoib_login *ptr;
+
+       ASSERT(vnic);
+       ASSERT(vnic->port);
+
+       /* don't send packet because GW does not support this */
+       if (vnic->hadmined && !vnic->gw->hadmin_gw)
+               return 0;
+
+       /* alloc packet to be sent */
+       ptr = alloc_login_pkt(vnic);
+       if (IS_ERR(ptr))
+               return PTR_ERR(ptr);
+
+       fip_dbg_dump_raw_pkt(0, ptr, sizeof *ptr, 1, "sending login packet");
+
+       ret = send_generic_ucast_pkt(vnic->port, NULL/*ah : create a new ah inside*/,
+                                    &vnic->gw->discover->tx_ring,
+                                    ptr, sizeof *ptr, vnic->gw->discover->qp,
+                                    vnic->gw->discover->pkey_index,
+                                    vnic->gw_address.gw_qpn,
+                                    vnic->gw_address.gw_lid,
+                                    vnic->gw_address.qkey,
+                                    vnic_gw_ctrl_sl(vnic->gw));
+       if (ret) {
+               vnic_warn(vnic->port->name,
+                         "fip_login_send: fip_ucast_send ret %d\n", ret);
+               goto error_free_mem;
+       }
+
+       return 0;
+
+error_free_mem:
+       kfree(ptr);
+       return -ENOMEM;
+}
+
+/*
+ * This function creates and sends a few types of packets (all ucast):
+ *   vHub context request - request_new=1, logout=0
+ *   vHub context update / vnic keep alive - request_new=0, logout=0
+ *   vnic logout - request_new=0, logout=1
+ */
+int fip_update_send(struct fip_vnic_data *vnic, int request_new, int logout)
+{
+       struct eoib_host_update *pkt;
+       struct ib_qp *send_qp;
+       struct fip_ring *tx_ring;
+       int pkey_index;
+       int ret = 0;
+
+       ASSERT(vnic);
+       ASSERT(vnic->port);
+
+       /* alloc packet to be sent */
+       pkt = kmalloc(sizeof *pkt, GFP_ATOMIC);
+       if (!pkt) {
+               vnic_warn(vnic->port->name, "fip_update_send malloc failed\n");
+               return -EAGAIN;
+       }
+
+       /* copy keep alive packet template */
+       if (logout)
+               memcpy(pkt, &base_logout_pkt, sizeof(struct eoib_host_update));
+       else
+               memcpy(pkt, &base_update_pkt, sizeof(struct eoib_host_update));
+
+       pkt->fip.opcode = cpu_to_be16(EOIB_FIP_OPCODE);
+       pkt->fip.list_length =
+           cpu_to_be16((sizeof(struct eoib_host_update) >> 2) - 3);
+       pkt->vnic_id = cpu_to_be16(vnic->vnic_id);
+       memcpy(pkt->mac, vnic->login_data.mac, sizeof(pkt->mac));
+       memcpy(pkt->vnic_name, vnic->login_data.vnic_name,
+              sizeof(pkt->vnic_name));
+       memcpy(pkt->port_guid, &vnic->port->gid.global.interface_id,
+              sizeof(pkt->port_guid));
+
+       pkt->vhub_id.vhub_id = cpu_to_be32(vnic->login_data.vhub_id);
+
+       if (!logout) {
+               pkt->tusn = cpu_to_be32(vnic->vhub_table.main_list.tusn);
+               send_qp = vnic->qp;
+               tx_ring = &vnic->tx_ring;
+               pkey_index = vnic->pkey_index;
+
+               if (vnic->login_data.vp)
+                       pkt->vhub_id.flags.flags |= FIP_HOST_VP_FLAG;
+
+               if (request_new)
+                       pkt->vhub_id.flags.flags |= FIP_HOST_R_FLAG;
+               else
+                       pkt->vhub_id.flags.flags |= FIP_HOST_U_FLAG;
+       } else {
+               send_qp = vnic->gw->discover->qp;
+               tx_ring = &vnic->gw->discover->tx_ring;
+               pkey_index = vnic->gw->discover->pkey_index;
+       }
+
+       if (vnic->gw->info.gw_type == GW_TYPE_LAG && 
+           !vnic->gw->info.ext_lag.ucast && !logout) {
+               struct vnic_mcast *mcaste;
+               unsigned long flags;
+
+               spin_lock_irqsave(&vnic->mcast_tree.mcast_rb_lock, flags);
+               mcaste = vnic_mcast_search(&vnic->mcast_tree, &vnic->ka_mcast_gid);
+               if (!IS_ERR(mcaste)) {
+                       if (mcaste->ah) {
+                               ret = send_generic_mcast_pkt(vnic->port, &vnic->tx_ring,
+                                                            pkt, sizeof *pkt, vnic->qp,
+                                                            vnic->pkey_index, mcaste);
+                               vnic_dbg_parse(vnic->name, "sent multicast keep alive\n");
+                       } else {
+                               vnic_dbg_parse(vnic->name, "mcaste %p: ah is null\n", mcaste);
+                               kfree(pkt);
+                       }
+               } else {
+                       vnic_dbg_parse(vnic->name, "ka mcast not found\n");
+                       ret = -ENOMEM;
+               }
+               spin_unlock_irqrestore(&vnic->mcast_tree.mcast_rb_lock, flags);
+
+       } else
+               /* For LAG gateway the ah is not up to date and therefore
+                  should not be used */
+               ret = send_generic_ucast_pkt(vnic->port, NULL/*ah : create a new ah inside*/,
+                                            tx_ring, pkt, sizeof *pkt,
+                                            send_qp,
+                                            pkey_index,
+                                            vnic->gw_address.gw_qpn,
+                                            vnic->gw_address.gw_lid,
+                                            vnic->gw_address.qkey,
+                                            vnic_gw_ctrl_sl(vnic->gw));
+       if (ret) {
+               vnic_warn(vnic->port->name,
+                         "fip_update_send: ret %d\n", ret);
+               goto error_free_mem;
+       }
+
+       return 0;
+
+error_free_mem:
+       kfree(pkt);
+       return -ENOMEM;
+}
+
+static void dump_lag_member(struct lag_member *m)
+{
+       vnic_dbg_lag("", "QPN 0x%x, SL %d, gw_portid 0x%x, LID 0x%x, guid " GUID_FORMAT
+                      ", eport_state %s, weight %d, link_utilization %d\n",
+                      m->qpn, m->sl, m->gw_port_id, m->lid, GUID_ARG(m->guid),
+                      eport_state_str(m->eport_state), m->weight, m->link_utilization);
+}
+
+static inline int handle_lag_member(struct fip_vnic_data *vnic,
+                            struct fip_ext_type_lag_members *ext_lag_membs,
+                            int ext_length)
+{
+       struct lag_members lag_members;
+
+       extract_memb_extended(ext_lag_membs, ext_length, &lag_members, vnic->name);
+
+       /* propagate change in member state as needed */
+       return handle_member_update(vnic, &lag_members);
+}
+
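+/*
+ * Walk the extended descriptor of a vHub table/update packet: LAG member
+ * sub-TLVs are forwarded to handle_lag_member(), and a control-iport
+ * sub-TLV updates the vnic's GW address (QPN/LID/SL). Unknown mandatory
+ * sub-TLVs fail the parse.
+ */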
+int extract_vhub_extended(struct fip_ext_desc_tlv *fed,
+                         struct fip_vnic_data *vnic)
+{
+       struct fip_ext_type_ctrl_iport *ext_ctrl_iport;
+       struct fip_ext_type_lag_members *ext_lag_memb;
+       struct fip_extended_type *ext_hdr;
+       struct fip_vnic_send_info *gw_addr;
+       int length_to_go, ext_length;
+
+       if (fed->ft.type != 254)
+               return -EINVAL;
+
+       length_to_go = ((int)(fed->ft.length) << 2) - sizeof(*fed);
+       ext_hdr = (struct fip_extended_type *)(fed + 1);
+
+       while (length_to_go > 0) {
+               ext_length = ((int)ext_hdr->len) << 2;
+
+               vnic_dbg_parse(vnic->name, "Table Update parse, sub-tlv "
+                              "type  %d length %d address=%p\n",
+                              ext_hdr->ext_type, ext_length, ext_hdr);
+
+               if (ext_length < sizeof(*ext_hdr) ||
+                   ext_length > length_to_go) {
+                       vnic_dbg_parse(vnic->name, "Extended length error."
+                                      " Length=%d\n", ext_length);
+                       return -EINVAL;
+               }
+
+               switch (ext_hdr->ext_type) {
+               case ADV_EXT_TYPE(MEMBER):
+                       ext_lag_memb = (struct fip_ext_type_lag_members *)ext_hdr;
+
+                       if (handle_lag_member(vnic, ext_lag_memb, ext_length))
+                               vnic_dbg_parse(vnic->name, "handle_lag_member() failed");
+                       break;
+               case ADV_EXT_TYPE(CTRL_IPORT):
+                       if (ext_length != sizeof(*ext_ctrl_iport)) {
+                               vnic_dbg_parse(vnic->name, "Extended length %d is"
+                                              " different than expected\n", 
+                                              ext_length);
+                               return -EINVAL;
+                       }
+
+                       gw_addr = &vnic->gw_address;
+                       ext_ctrl_iport  = (struct fip_ext_type_ctrl_iport *)ext_hdr;
+                       gw_addr->gw_qpn = be32_to_cpu(ext_ctrl_iport->gwtype_qpn);
+                       gw_addr->gw_lid = be16_to_cpu(ext_ctrl_iport->lid);
+                       gw_addr->gw_sl  = be16_to_cpu(ext_ctrl_iport->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT;
+                       break;
+               default:
+                       if (ext_hdr->mandatory & 0x01) {
+                               vnic_dbg_parse(vnic->name, "Unknown mandatory extended type %d length %d\n",
+                                              ext_hdr->ext_type, ext_length);
+                               return -EINVAL;
+                       } else {
+                               vnic_dbg_parse(vnic->name, "Unknown non-mandatory extended. Skipping, type %d length %d\n",
+                                              ext_hdr->ext_type, ext_length);
+                               ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length);
+                               length_to_go -= ext_length;
+                               continue;
+                       }
+               }
+
+               ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length);
+               length_to_go -= ext_length;
+       }
+
+       return 0;
+}
+
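+/*
+ * Same walk as extract_vhub_extended() but for a login ack: only the LAG
+ * member sub-TLV is consumed, into the supplied lag_members structure.
+ */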
+static int extract_login_extended(struct fip_ext_desc_tlv *fed,
+                                 struct lag_members *lagm,
+                                 char *name)
+{
+       struct fip_ext_type_lag_members *ext_lag_membs;
+       struct fip_extended_type *ext_hdr;
+       int length_to_go, ext_length;
+
+       if (fed->ft.type != 254)
+               return -EINVAL;
+
+       length_to_go = ((int)(fed->ft.length) << 2) - sizeof(*fed);
+       ext_hdr = (struct fip_extended_type *)(fed + 1);
+
+       while (length_to_go > 0) {
+               ext_length = ((int)ext_hdr->len) << 2;
+
+               vnic_dbg_parse(name, "Table Update parse, sub-tlv "
+                              "type  %d length %d address=%p\n",
+                              ext_hdr->ext_type, ext_length, ext_hdr);
+
+               if (ext_length < sizeof(*ext_hdr) ||
+                   ext_length > length_to_go) {
+                       vnic_dbg_parse(name, "Extended length error."
+                                      " Length=%d\n", ext_length);
+                       return -EINVAL;
+               }
+
+               switch (ext_hdr->ext_type) {
+               case ADV_EXT_TYPE(MEMBER):
+                       ext_lag_membs = (struct fip_ext_type_lag_members *)ext_hdr;
+
+                       extract_memb_extended(ext_lag_membs, ext_length, lagm, name);
+                       
+                       break;
+               default:
+                       if (ext_hdr->mandatory & 0x01) {
+                               vnic_dbg_parse(name, "Unknown mandatory extended type %d length %d\n",
+                                              ext_hdr->ext_type, ext_length);
+                               return -EINVAL;
+                       } else {
+                               vnic_dbg_parse(name, "Unknown non-mandatory extended. Skipping, type %d length %d\n",
+                                              ext_hdr->ext_type, ext_length);
+                               ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length);
+                               length_to_go -= ext_length;
+                               continue;
+                       }
+               }
+               ext_hdr = (struct fip_extended_type *)((char *)ext_hdr + ext_length);
+               length_to_go -= ext_length;
+       }
+
+       return 0;
+}
+
+void extract_memb_extended(struct fip_ext_type_lag_members *ext_lag_membs,
+                          int ext_length,                        
+                          struct lag_members *lagm,
+                          char *name)
+{      
+       struct lag_member *m;
+       struct fip_ext_type_lag_member *lm;
+       int nmemb = 0;
+       int i;  
+
+       nmemb = (ext_length - sizeof ext_lag_membs->et) / sizeof *lm;
+       if (nmemb > MAX_LAG_MEMBERS) {
+               vnic_dbg_parse(name, "recieved %d members but max supported is %d. "
+                              "Using only %d\n", nmemb, MAX_LAG_MEMBERS,
+                              MAX_LAG_MEMBERS);
+               nmemb = MAX_LAG_MEMBERS;
+       }
+
+       m = lagm->memb;
+       lm = ext_lag_membs->lagm;
+
+       for (i = 0; i < nmemb; ++i, ++lm, ++m) {
+               m->qpn = be32_to_cpu(lm->qpn) & 0xffffff;
+               m->sl = be16_to_cpu(lm->sl_gw_portid) >> 12;
+               m->gw_port_id = be16_to_cpu(lm->sl_gw_portid) & 0xfff;
+               m->lid = be16_to_cpu(lm->lid);
+               memcpy(m->guid, lm->guid, sizeof m->guid);
+               m->eport_state = lm->eport_state >> 6;
+               m->weight = lm->weight;
+               m->link_utilization = lm->link_utilization;
+               dump_lag_member(m);
+       }
+       lagm->num = nmemb;
+
+       vnic_dbg_parse(name, "Table Update extended parse finished OK. Num members=%d\n",
+                      lagm->num);
+       return;
+}
+
+/*
+ * Parse a packet that is suspected of being a login ack packet. The function
+ * returns 0 for a valid login ack packet and an error code otherwise. The
+ * packet's "interesting" details are returned in data.
+ */
+int fip_login_parse(struct fip_discover *discover, struct fip_content *fc,
+                   struct fip_login_data *data)
+{
+       u32 vfields;
+       int err = 0;
+
+       data->syndrome = be32_to_cpu(fc->fl->syndrom_ctrl_qpn) >> 24;
+       data->vnic_id = be16_to_cpu(fc->fl->vnic_id);
+       data->lid = be16_to_cpu(fc->fa.fa[0]->lid);
+       data->port_id = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) & 0xfff;
+       data->sl = be16_to_cpu(fc->fa.fa[0]->sl_gwportid) >> FIP_ADVERTISE_SL_SHIFT;
+       data->qpn = be32_to_cpu(fc->fa.fa[0]->gwtype_qpn) & 0xffffff;
+       memcpy(data->guid, fc->fa.fa[0]->guid, sizeof(data->guid));
+
+       if (be16_to_cpu(fc->fl->flags_vlan) & FIP_LOGIN_VP_FLAG) {
+               data->vp = 1;
+               data->vlan = be16_to_cpu(fc->fl->flags_vlan) & 0xfff;
+       }
+       data->all_vlan_gw = !!(be16_to_cpu(fc->fl->vfields) & FIP_LOGIN_ALL_VLAN_GW_FLAG);
+
+       data->vhub_id = CREATE_VHUB_ID(cpu_to_be16(data->vlan), data->port_id);
+
+       data->ctl_qpn = be32_to_cpu(fc->fl->syndrom_ctrl_qpn) & FIP_LOGIN_CTRL_QPN_MASK;
+       vfields = be16_to_cpu(fc->fl->vfields);
+       data->n_mac_mcgid = vfields & FIP_LOGIN_DMAC_MGID_MASK;
+       data->n_rss_mgid = vfields >> 8 & 0xf;
+       /* data->rss = pkt->rss & FIP_LOGIN_RSS_MASK; it's redundant in login ack */
+       data->pkey = be16_to_cpu(fc->fp->pkey);
+       data->mtu = be16_to_cpu(fc->fl->mtu);
+
+       memcpy(data->mac, fc->fl->mac, sizeof(data->mac));
+       memcpy(data->mgid_prefix, fc->fl->eth_gid_prefix, sizeof(data->mgid_prefix));
+       memcpy(data->vnic_name, fc->fl->vnic_name, sizeof(data->vnic_name));
+       memcpy(data->vendor_id, fc->fl->vendor_id, sizeof(data->vendor_id));
+
+       if (fc->fed.num)
+               err = extract_login_extended(fc->fed.fed[0], &data->lagm, discover->name);
+
+       return err;
+}
+
+/*
+ * Check if a received packet is a FIP packet, and if so return its subtype.
+ * The FIP type is also returned in fip_type and can be either EOIB_FIP_OPCODE
+ * or FCOIB_FIP_OPCODE. If the packet is not a FIP packet, -EINVAL is returned.
+ */
+int fip_pkt_parse(char *buffer, int length, int *fip_type)
+{
+       struct fip_fip_header *fip_header;
+       u16 fip_opcode;
+
+       fip_header = (struct fip_fip_header *)
+           (buffer + IB_GRH_BYTES + sizeof(struct fip_eoib_ver));
+
+       fip_opcode = be16_to_cpu(fip_header->opcode);
+
+       if (fip_opcode != EOIB_FIP_OPCODE) {
+               *fip_type = 0;
+               return -EINVAL;
+       }
+
+       *fip_type = fip_opcode;
+
+       return fip_header->subcode;
+}
+
+/*
+ * The caller already knows that this is a FIP packet; return its subtype.
+ */
+int fip_pkt_get_subtype_bh(char *buffer)
+{
+       struct fip_fip_header *fip_header;
+
+       fip_header = (struct fip_fip_header *)
+           (buffer + sizeof(struct fip_eoib_ver));
+
+       return fip_header->subcode;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_pkt.h
new file mode 100644 (file)
index 0000000..32e34fc
--- /dev/null
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _FIP_DISCOVER_PKT_H
+#define _FIP_DISCOVER_PKT_H
+
+#include <linux/kref.h>
+
+
+
+#endif /* _FIP_DISCOVER_PKT_H */
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_vhub.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_fip_vhub.c
new file mode 100644 (file)
index 0000000..8bcd6d0
--- /dev/null
@@ -0,0 +1,635 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+#include "vnic_fip_discover.h"
+#include "vnic_fip_pkt.h"
+
+/*
+ * construct an mgid address based on vnic login information and the type
+ * variable (data mcast / vhub update / vhub table). The resulting mgid
+ * is returned in *mgid.
+ */
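+/*
+ * Resulting MGID layout (as filled below): mgid_prefix, a type byte, the
+ * destination MAC masked by n_mac, the rss_hash byte and the lower three
+ * bytes of the vhub_id.
+ */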
+void vhub_mgid_create(const char *mgid_prefix,
+                     const char *mmac, /* mcast mac for bcast 0xFF.. */
+                     u64 n_mac,        /* bits to take from mmac */
+                     u32 vhub_id,
+                     enum vhub_mgid_type type,
+                     u8 rss_hash,
+                     union vhub_mgid *mgid)
+{
+       u32 vhub_id_be;
+       u64 mac_mask;
+       u64 *mac_ptr;
+       u64 one = 1; /* must do that for shift bitwise operation */
+
+       memcpy(mgid->mgid.mgid_prefix, mgid_prefix,
+              sizeof(mgid->mgid.mgid_prefix));
+       mgid->mgid.type = (u8)type;
+       memcpy(mgid->mgid.dmac, mmac, sizeof(mgid->mgid.dmac));
+       mac_mask = cpu_to_le64(((one << n_mac) - one) | 0xFFFF000000000000ULL);
+       mac_ptr = (u64*)(mgid->mgid.dmac);
+       *mac_ptr &= mac_mask;
+       mgid->mgid.rss_hash = rss_hash;
+       vhub_id_be = cpu_to_be32(vhub_id);
+       memcpy(mgid->mgid.vhub_id, ((u8 *) &vhub_id_be) + 1,
+              sizeof(mgid->mgid.vhub_id));
+}
+
+/*
+ * Init the vnic's vHub table data structures, before using them
+ */
+void vhub_ctx_init(struct fip_vnic_data *vnic)
+{
+       INIT_LIST_HEAD(&vnic->vhub_table.main_list.vnic_list);
+       vnic->vhub_table.main_list.tusn = 0;
+       vnic->vhub_table.main_list.count = 0;
+       vnic->vhub_table.main_list.total_count = 0;
+
+       INIT_LIST_HEAD(&vnic->vhub_table.update_list.vnic_list);
+       vnic->vhub_table.update_list.tusn = 0;
+       vnic->vhub_table.update_list.count = 0;
+       vnic->vhub_table.update_list.total_count = 0;
+
+       vnic->vhub_table.checksum = 0;
+       vnic->vhub_table.tusn = 0;
+       vnic->vhub_table.state = VHUB_TBL_INIT;
+}
+
+/* print vhub context table */
+static void vhub_ctx_prnt(struct fip_vnic_data *vnic,
+                         struct vhub_elist *vhub_list, int level)
+{
+       struct vnic_table_entry *vnic_entry;
+
+       if (!(vnic_msglvl & VNIC_DEBUG_VHUB_V))
+               return;
+
+       vnic_dbg_vhub_v(vnic->name, "Dumping context table. Count %d tusn %d\n",
+                       vhub_list->count, vhub_list->tusn);
+
+       list_for_each_entry(vnic_entry, &vhub_list->vnic_list, list) {
+               vnic_dbg_vhub_v(vnic->name, "lid 0x%04x qpn 0x%06x, mac "
+                               MAC_6_PRINT_FMT"\n", vnic_entry->lid,
+                               vnic_entry->qpn,
+                               MAC_6_PRINT_ARG(vnic_entry->mac));
+       }
+}
+
+void vhub_table_free(struct vhub_elist *elist)
+{
+       struct vnic_table_entry *del_vnic, *tmp_vnic;
+
+       list_for_each_entry_safe(del_vnic, tmp_vnic, &elist->vnic_list, list) {
+               list_del(&del_vnic->list);
+               kfree(del_vnic);
+       }
+}
+
+/*
+ * Clear and free the vnic's vHub context table data structures.
+ */
+void vhub_ctx_free(struct fip_vnic_data *vnic)
+{
+       vnic_dbg_fip_v(vnic->name, "vhub_ctx_free called\n");
+
+       vhub_table_free(&vnic->vhub_table.main_list);
+       vhub_table_free(&vnic->vhub_table.update_list);
+
+       vhub_ctx_init(vnic);
+}
+
+static struct vnic_table_entry *vhub_find_entry(struct vhub_elist *vnic_list,
+                                              u16 lid, u32 qpn)
+{
+       struct vnic_table_entry *tmp_vnic;
+
+       list_for_each_entry(tmp_vnic, &vnic_list->vnic_list, list) {
+               if (tmp_vnic->lid == lid && tmp_vnic->qpn == qpn)
+                       return tmp_vnic;
+       }
+       return NULL;
+}
+
+/*
+ * Move vHub context entries from the update list to the main list. The update
+ * list is used during the wait for the main table to be updated. Once
+ * the table update is completed the entries need to be moved from the update
+ * table to the main table. This function does this.
+ */
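+/*
+ * Updates whose TUSN is already covered by the main list (extra_tusn) are
+ * dropped; the rest either add an entry to the main list or, when the
+ * valid bit is clear, remove the matching lid/qpn entry from it.
+ */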
+static int vhub_update_main(struct fip_vnic_data *vnic,
+                           struct vhub_elist *main_list,
+                           struct vhub_elist *update_list)
+{
+       struct vnic_table_entry *new_entry, *tmp_vnic, *del_vnic;
+       int first_tusn = (u32) update_list->tusn - (update_list->count - 1);
+       int extra_tusn;
+
+       /* update list is usually empty */
+       if (likely(update_list->count == 0))
+               return 0;
+
+       if (first_tusn > main_list->tusn + 1) {
+               vnic_warn(vnic->name, "Info, vhub_to_main_tbl sync main to"
+                         " update list failed. update tusn %d update "
+                         "first %d main %d\n",
+                         update_list->tusn, first_tusn, main_list->tusn);
+               return -1;
+       }
+
+       extra_tusn = main_list->tusn + 1 - first_tusn;
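+       /*
+        * Example of the reconciliation above: if the main list is at tusn 10
+        * and the update list holds 3 entries with tusn 9..11 (tusn 11,
+        * count 3, first_tusn 9), then extra_tusn = 10 + 1 - 9 = 2, so the
+        * first two queued entries (whose changes the received table already
+        * covers) are dropped and only the tusn 11 entry is applied below.
+        */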
+
+       /* go over update list and move / remove entries in it */
+       list_for_each_entry_safe(new_entry, tmp_vnic,
+                                &update_list->vnic_list, list) {
+               if (extra_tusn > 0) {
+                       list_del(&new_entry->list);
+                       kfree(new_entry);
+                       extra_tusn--;
+               } else {
+                       /* remove from update list and apply to main list */
+                       list_del(&new_entry->list);
+                       main_list->tusn++;
+
+                       /* Check valid bit, if set add to main list */
+                       if (new_entry->valid) {
+                               list_add_tail(&new_entry->list,
+                                             &main_list->vnic_list);
+                               main_list->count++;
+                       } else {        /* remove from main list */
+                               del_vnic = vhub_find_entry(main_list,
+                                                          new_entry->lid,
+                                                          new_entry->qpn);
+                               if (del_vnic) {
+                                       list_del(&del_vnic->list);
+                                       kfree(del_vnic);
+
+                                       main_list->count--;
+                               }
+                               vnic_dbg_fip_v(vnic->name,
+                                              "vhub_to_main_tbl removed "
+                                              "vnic lid %d qpn 0x%x found %d\n",
+                                              (int)new_entry->lid,
+                                              (int)new_entry->qpn,
+                                              (del_vnic != 0));
+                               kfree(new_entry);
+                       }
+               }
+               update_list->count--;
+       }
+       return 0;
+}
+
+int fip_vnic_search_mac(struct fip_vnic_data *vnic, struct vhub_elist *elist)
+{
+       struct vnic_table_entry *vlist_entry;
+
+       list_for_each_entry(vlist_entry, &elist->vnic_list, list)
+               /* find matching entry based on mac */
+               if (!memcmp(vnic->login_data.mac, vlist_entry->mac, ETH_ALEN)) {
+                       /* verify lid/qpn match */
+                       if (vnic->port->attr.lid == vlist_entry->lid &&
+                           vnic->qp_base_num == vlist_entry->qpn)
+                               return 1;
+                       else {
+                               vnic_dbg_vhub(vnic->name,
+                                             "vnic LID=0x%x or QPN=0x%x "
+                                             "in vhub tbl is different than "
+                                             "expected LID=0x%x, QPN=0x%x\n",
+                                             vlist_entry->lid,
+                                             vlist_entry->qpn,
+                                             vnic->port->attr.lid, 
+                                             vnic->qp_base_num);
+                               break;
+                       }
+               }
+
+       return 0;
+}
+
+/*
+ * This function handles a vhub context table packet. The table is
+ * processed only if we do not already have an up-to-date local copy of
+ * it. The table update supports multi-packet tables, so care must be
+ * taken when assembling the complete table.
+ */
+int vhub_handle_tbl(struct fip_vnic_data *vnic, struct fip_content *fc,
+                   u32 vhub_id, u32 tusn)
+{
+       struct context_table_entry *entry;
+       struct vnic_table_entry *new_entry;
+       struct vhub_elist *table;
+       int i, j, count_in_pkt;
+       int reason = 0;
+       int hdr_type;
+
+       /* we already have a table. disregard this one */
+       if (vnic->vhub_table.state != VHUB_TBL_INIT) {
+               vnic_dbg_vhub_v(vnic->name,
+                              "vhub_handle_tbl context not in init\n");
+               return 0;
+       }
+
+       /* compute the number of vnic entries in the packet.
+        * don't forget the checksum
+        */
+       count_in_pkt = fc->cte.num;
+       table = &vnic->vhub_table.main_list;
+       hdr_type = be16_to_cpu(fc->fvt->hdr) >> 14;
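+       /* the top two bits of the FIP vhub-table header encode this packet's
+        * place in the table sequence (first / last / only; anything else is
+        * treated as a middle packet)
+        */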
+
+       /* first or only packet in sequence */
+       if (hdr_type == FIP_TABLE_HDR_FIRST || hdr_type == FIP_TABLE_HDR_ONLY) {
+               table->total_count = be16_to_cpu(fc->fvt->table_size);
+               table->tusn = tusn;
+       }
+       if (table->tusn != tusn) {
+               vnic_warn(vnic->name, "Info, vhub_handle_tbl got unexpected "
+                         "tusn. Expect=%d received=%d\n", table->tusn, tusn);
+               if (!table->tusn)
+                       goto drop_silently;
+               reason = 1;
+               goto reset_table;
+       }
+
+       if ((table->count + count_in_pkt > table->total_count) ||
+           ((table->count + count_in_pkt < table->total_count) &&
+            (hdr_type == FIP_TABLE_HDR_LAST || hdr_type == FIP_TABLE_HDR_ONLY))) {
+               vnic_dbg_vhub(vnic->name,
+                             "vhub_handle_tbl got unexpected entry count. "
+                             "count %d, in packet %d total expected %d\n",
+                             table->count, count_in_pkt, table->total_count);
+               reason = 2;
+               goto reset_table;
+       }
+
+       entry = fc->cte.cte;
+       for (i = 0; i < count_in_pkt; ++i, ++entry) {
+               new_entry = kzalloc(sizeof *new_entry, GFP_KERNEL);
+               if (!new_entry)
+                       goto reset_table;
+
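+               /* running checksum: accumulate every 32-bit word of each
+                * received entry; it is compared against the trailing 32-bit
+                * checksum that follows the last entry (see the last-packet
+                * handling below)
+                */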
+               for (j = 0; j < (sizeof *entry) >> 2; ++j)
+                       vnic->vhub_table.checksum += ((u32 *) entry)[j];
+
+               new_entry->lid = be16_to_cpu(entry->lid);
+               new_entry->qpn = be32_to_cpu(entry->qpn) & 0xffffff;
+               new_entry->sl = entry->sl & 0xf;
+               new_entry->rss = !!(entry->v_rss_type & FIP_CONTEXT_RSS_FLAG);
+               new_entry->valid = !!(entry->v_rss_type & FIP_CONTEXT_V_FLAG);
+               memcpy(new_entry->mac, entry->mac, sizeof(new_entry->mac));
+
+               list_add_tail(&new_entry->list, &table->vnic_list);
+               table->count++;
+       }
+
+       /* last packet */
+       if (hdr_type == FIP_TABLE_HDR_LAST || hdr_type == FIP_TABLE_HDR_ONLY) {
+               ASSERT(table->count == table->total_count);
+               if (vnic->vhub_table.checksum != be32_to_cpu(*(u32 *) entry)) {
+                       vnic_dbg_fip_v(vnic->name,
+                                      "vhub_handle_tbl checksum mismatch. "
+                                      "expected 0x%x, in packet 0x%x\n",
+                                      vnic->vhub_table.checksum,
+                                      be32_to_cpu(*(u32 *) entry));
+                       /* TODO: request checksum match in final code */
+                       /* goto reset_table; */
+               }
+
+               if (vhub_update_main(vnic, &vnic->vhub_table.main_list,
+                                    &vnic->vhub_table.update_list)) {
+                       vnic_dbg_fip_v(vnic->name,
+                                      "vhub_handle_tbl moving update list to main "
+                                      "list failed\n");
+                       reason = 3;
+                       goto reset_table;
+               }
+
+               /* we are done receiving the context table */
+               vnic_dbg_fip_v(vnic->name,
+                              "vhub_handle_tbl updated with %d entries\n",
+                              vnic->vhub_table.main_list.count);
+               vhub_ctx_prnt(vnic, &vnic->vhub_table.main_list, 0);
+
+               /* we are not in the main vHub list, close ourselves */
+               if (!fip_vnic_search_mac(vnic, &vnic->vhub_table.main_list)) {
+                       vnic_dbg_fip_p0(vnic->name, "We are not in the main table, closing ourselves\n");
+                       fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+                       reason = 4;
+                       goto reset_table;
+               }
+
+               if (fip_vnic_tbl_done(vnic)) {
+                       vnic_warn(vnic->name, "vhub_handle_tbl done failed, resetting table\n");
+                       reason = 5;
+                       goto reset_table;
+               }
+       }
+
+drop_silently:
+       return 0;
+
+reset_table:
+       vnic_dbg_fip_p0(vnic->name, "resetting vhub context table, reason=%d\n", reason);
+       vhub_ctx_free(vnic);
+       /* TODO: re-enable tx of update request, fip_update_send() */
+       return -EINVAL;
+}
+
+/*
+ * This function writes the main vhub table to the data (login) vnic.
+ * You should call it when the data vnic is ready for it and after the
+ * table is up to date (and the update list was applied to the main list)
+ */
+int fip_vnic_write_tbl(struct fip_vnic_data *vnic)
+{
+       struct vnic_table_entry *vlist_entry;
+       int rc;
+
+       if (vnic->login)
+               sprintf(vnic->name, "%s", vnic->login->name);
+
+       /* update table in neigh tree */
+       list_for_each_entry(vlist_entry,
+                           &vnic->vhub_table.main_list.vnic_list, list) {
+               rc = vnic_vhube_add(vnic, vlist_entry);
+               if (rc) {
+                       vnic_warn(vnic->name, "vnic_vhube_add failed for mac "
+                                 MAC_6_PRINT_FMT" (rc %d)\n",
+                                 MAC_6_PRINT_ARG(vlist_entry->mac), rc);
+                       vhub_ctx_free(vnic);
+                       vnic_vhube_flush(vnic);
+                       return -1;
+               }
+       }
+
+       vnic_dbg_fip(vnic->name, "fip_vnic_tbl_done: creation of vnic done\n");
+
+       vnic->vhub_table.tusn = vnic->vhub_table.main_list.tusn;
+       vnic->vhub_table.state = VHUB_TBL_UPDATED;
+
+       /* free table memory */
+       vhub_table_free(&vnic->vhub_table.main_list);
+       return 0;
+}
+
+/*
+ * This function handles vhub context update packets received AFTER
+ * we have a valid vhub table. For update additions the code adds an
+ * entry to the neighbour tree. For update removals we either remove
+ * the entry from the neighbour list or, if the removed entry is "this vnic",
+ * we remove the vnic itself.
+ */
+static int vhub_update_updated(struct fip_vnic_data *vnic,
+                              u32 vhub_id, u32 pkt_tusn,
+                              struct vnic_table_entry *data)
+{
+       int curr_tusn;
+
+       curr_tusn = vnic->vhub_table.tusn;
+
+       /* if vnic is being flushed, return */
+       if (vnic->flush)
+               return 0;
+
+       /* we got a GW keep alive packet */
+       if (pkt_tusn == curr_tusn)
+               return 0;
+
+       /* if we got an out of order update clear list and request new table */
+       if (pkt_tusn != curr_tusn + 1) {
+               vnic_warn(vnic->name, "Info, vhub_update_updated received out"
+                         " of order update. Recvd=%d Expect=%d\n",
+                         pkt_tusn, curr_tusn);
+               goto error_in_update;
+       }
+
+       /* new entry added */
+       if (data->valid) {
+               if (vnic_vhube_add(vnic, data)) {
+                       vnic_dbg_fip(vnic->name, "vnic_vhube_add "
+                                    "failed to update vnic neigh tree\n");
+                       goto error_in_update;
+               }
+       } else {                /* remove entry */
+               /* the remove request is for this vnic :-o */
+               if (!memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) {
+                       vnic_dbg_fip_p0(vnic->name, "remove this vnic "MAC_6_PRINT_FMT"\n",
+                                    MAC_6_PRINT_ARG(vnic->login_data.mac));
+                       fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+               } else {
+                       vnic_dbg_fip(vnic->name, "remove neigh vnic\n");
+                       vnic_vhube_del(vnic, data->mac);
+               }
+       }
+
+       vnic->vhub_table.tusn = pkt_tusn;
+
+       return 0;
+
+error_in_update:
+       vhub_ctx_free(vnic);
+       vnic_vhube_flush(vnic);
+       fip_update_send(vnic, 1 /* new */, 0 /* logout */);
+       return -1;
+}
+
+/*
+ * This function handles vhub context update packets received BEFORE
+ * we have a valid vhub table. The function adds the update request
+ * to an update list to be processed after the entire vhub table is received
+ * and processed.
+ */
+static int vhub_update_init(struct fip_vnic_data *vnic,
+                            u32 vhub_id, u32 pkt_tusn,
+                            struct vnic_table_entry *data)
+{
+       struct vnic_table_entry *new_vnic;
+       struct vhub_elist *vnic_list;
+       int curr_tusn;
+
+       vnic_list = &vnic->vhub_table.update_list;
+       curr_tusn = vnic_list->tusn;
+
+       /* if we got an out of order update clear list and request new table */
+       if ((pkt_tusn < curr_tusn || pkt_tusn > curr_tusn + 1)
+           && curr_tusn != 0) {
+               vnic_warn(vnic->name, "Info, vhub_update_init received out of"
+                         " order update. got %d my %d\n", pkt_tusn, curr_tusn);
+               goto error_in_update;
+       }
+
+       /* we got a GW keep alive packet */
+       if (pkt_tusn == curr_tusn) {
+               vnic_dbg_fip_v(vnic->name, "Received GW keep alive update."
+                              " tusn %d\n", curr_tusn);
+               return 0;
+       }
+
+       /* got remove request for this vnic don't wait */
+       if (!(data->valid) &&
+           !memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) {
+               vhub_ctx_free(vnic);
+               vnic_dbg_fip_p0(vnic->name, "got request to close vNic vhub_update_init\n");
+               fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+               goto err;
+       }
+
+       new_vnic = kzalloc(sizeof *new_vnic, GFP_KERNEL);
+       if (!new_vnic)
+               goto error_in_update;
+
+       memcpy(new_vnic, data, sizeof *data);
+       list_add_tail(&new_vnic->list, &vnic_list->vnic_list);
+       vnic_list->count++;
+       vnic_list->tusn = pkt_tusn;
+       vhub_ctx_prnt(vnic, vnic_list, 0);
+       return 0;
+
+error_in_update:
+       vhub_ctx_free(vnic);
+       fip_update_send(vnic, 1 /* new */, 0 /* logout */);
+err:
+       return -1;
+}
+
+/*
+ * This function handles vhub context update packets received after
+ * we have a valid vhub table but before it has been passed to the data rbtree.
+ * The function applies the update request to the main vhub table.
+ */
+static int vhub_update_inter(struct fip_vnic_data *vnic,
+                            u32 vhub_id, u32 pkt_tusn,
+                            struct vnic_table_entry *data)
+{
+       struct vnic_table_entry *new_vnic, *del_vnic;
+       struct vhub_elist *vnic_list;
+       int curr_tusn;
+
+       vnic_list = &vnic->vhub_table.main_list;
+       curr_tusn = vnic_list->tusn;
+
+       /* if we got an out of order update clear list and request new table */
+       if ((pkt_tusn < curr_tusn || pkt_tusn > curr_tusn + 1)
+           && curr_tusn != 0) {
+               vnic_warn(vnic->name, "Info, vhub_update_inter received out"
+                         " of order update. got %d my %d\n", pkt_tusn, curr_tusn);
+               goto error_in_update;
+       }
+
+       /* we got a GW keep alive packet */
+       if (pkt_tusn == curr_tusn) {
+               vnic_dbg_fip_v(vnic->name, "Received GW keep alive update."
+                              " tusn %d\n", curr_tusn);
+               return 0;
+       }
+
+       /* we got an add request */
+       if (data->valid) {
+               new_vnic = kzalloc(sizeof *new_vnic, GFP_KERNEL);
+               if (!new_vnic)
+                       goto error_in_update;
+
+               memcpy(new_vnic, data, sizeof *data);
+               list_add_tail(&new_vnic->list, &vnic_list->vnic_list);
+               vnic_list->count++;
+               vnic_list->tusn = pkt_tusn;
+       } else { /* we got a remove request */
+               /* remove is for this vnic */
+               if (!memcmp(vnic->login_data.mac, data->mac, ETH_ALEN)) {
+                       vhub_ctx_free(vnic);
+                       vnic_dbg_fip_p0(vnic->name, "got request to close vNic vhub_update_inter\n");
+                       fip_vnic_close(vnic, FIP_PARTIAL_FLUSH);
+                       goto err;
+               }
+
+               /* search and delete the vnic */
+               del_vnic = vhub_find_entry(vnic_list,
+                                          data->lid,
+                                          data->qpn);
+               if (del_vnic) {
+                       list_del(&del_vnic->list);
+                       kfree(del_vnic);
+                       vnic_list->count--;
+               }
+               vnic_dbg_fip_v(vnic->name,
+                              "vhub_update_inter removed "
+                              "vnic lid %d qpn 0x%x found %d\n",
+                              (int)data->lid, (int)data->qpn,
+                              (del_vnic != 0));
+       }
+
+       vhub_ctx_prnt(vnic, vnic_list, 0);
+       return 0;
+
+error_in_update:
+       vhub_ctx_free(vnic);
+       fip_update_send(vnic, 1 /* new */, 0 /* logout */);
+err:
+       return -1;
+}
+
+/*
+ * This function handles vhub context update packets. There are three flows
+ * for handling update packets: the first is before the main table is up
+ * to date, the second is after the table is up to date but before it has
+ * been passed to the ownership of the data vnic (login struct) and the
+ * local lists freed, and the last is when table maintenance is done by
+ * the data vnic. This function handles all three cases.
+ */
+int vhub_handle_update(struct fip_vnic_data *vnic,
+                      u32 vhub_id, u32 tusn,
+                      struct vnic_table_entry *data)
+{
+       int ret = 0;
+
+       /*
+        * If we do not have an up-to-date table yet, queue the update on the
+        * update list. If we do have an up-to-date table, apply the update
+        * to the main table list.
+        */
+       switch (vnic->vhub_table.state) {
+       case VHUB_TBL_INIT:     /* No full table yet, keep updates for later */
+               ret = vhub_update_init(vnic, vhub_id, tusn, data);
+               break;
+       case VHUB_TBL_UP2DATE:  /* full table available, not yet written to data half */
+               ret = vhub_update_inter(vnic, vhub_id, tusn, data);
+               break;
+       case VHUB_TBL_UPDATED:  /* full table available and written to data half */
+               ret = vhub_update_updated(vnic, vhub_id, tusn, data);
+               break;
+       default:
+               break;
+       }
+
+        return ret;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_main.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_main.c
new file mode 100644 (file)
index 0000000..f07ee4e
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+#include "vnic_fip.h"
+
+MODULE_AUTHOR(DRV_AUTH);
+MODULE_DESCRIPTION(DRV_DESC);
+MODULE_LICENSE(DRV_LIC);
+MODULE_VERSION(DRV_VER);
+
+static int __init mlx4_ib_init(void)
+{
+       vnic_dbg_func("module_init");
+
+       if (vnic_param_check())
+               goto err;
+       if (vnic_mcast_init())
+               goto err;
+       if (vnic_ports_init())
+               goto free_mcast;
+
+       return 0;
+
+free_mcast:
+       vnic_mcast_cleanup();
+err:
+       return -EINVAL;
+}
+
+static void __exit mlx4_ib_cleanup(void)
+{
+       vnic_dbg_func("module_exit");
+       vnic_ports_cleanup();
+       vnic_dbg_mark();
+       vnic_mcast_cleanup();
+}
+
+module_init(mlx4_ib_init);
+module_exit(mlx4_ib_cleanup);
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.c
new file mode 100644 (file)
index 0000000..c82190c
--- /dev/null
@@ -0,0 +1,1098 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+
+struct workqueue_struct *mcast_wq;
+struct ib_sa_client vnic_sa_client;
+
+//static void vnic_mcast_detach_task(struct work_struct *work);
+static void vnic_mcast_attach_task(struct work_struct *work);
+static void vnic_port_mcast_leave_task(struct work_struct *work);
+static void vnic_port_mcast_join_task(struct work_struct *work);
+
+static void vnic_port_mcast_release(struct vnic_port_mcast *mcaste);
+static struct vnic_port_mcast *vnic_port_mcast_update(struct vnic_mcast
+                                                     *_mcaste);
+
+/*
+ * A helper function to prevent code duplication. Fills vnic_mcast struct with
+ * common values.
+ *
+ * in: mcaste - mcaste to fill
+ * in: gw_id - used when creating the MGID address
+ * in: mac - used when creating the MGID address
+ * in: create - value of create field in mcaste
+ */
+void __vnic_mcaste_fill(struct vnic_login *login, struct vnic_mcast *mcaste,
+                       u16 gw_id, const u8 *mac, u8 rss_hash, int create)
+{
+       union vhub_mgid mgid;
+
+       memcpy(mcaste->mac, mac, ETH_ALEN);
+       vhub_mgid_create(login->mgid_prefix, mcaste->mac,
+                        login->n_mac_mcgid,
+                        CREATE_VHUB_ID(login->vid, gw_id),
+                        VHUB_MGID_DATA, rss_hash, &mgid);
+       memcpy(&mcaste->gid, mgid.ib_gid.raw, GID_LEN);
+       memcpy(&mcaste->port_gid, &mcaste->gid, GID_LEN);
+       mcaste->backoff = msecs_to_jiffies(VNIC_MCAST_BACKOFF_MSEC);
+       mcaste->backoff_factor = 1;
+       mcaste->retry = VNIC_MCAST_MAX_RETRY;
+       mcaste->blocking = 0;
+       mcaste->qkey = login->qkey;
+       mcaste->pkey = login->pkey;
+       mcaste->create = create;
+       mcaste->qp = login->qp_res[0].qp; /* mcast/bcast is only on first QP */
+       mcaste->join_state = 1;
+}
+
+/*
+ * A helper function to prevent code duplication. Receives a multicast mac
+ * and a gw_id and attaches it (join + attach). The function also receives
+ * a default_mcaste (used for the MGID over default MLID hack) and a user list.
+ * Returns 0 on success and non-zero on failure.
+ *
+ * in: mmac - used when creating the MGID address
+ * in: default_mcaste - mcaste entry of the default MGID. Can be NULL
+ * in: user_list - A user list to hang the new mcaste on. Can be NULL
+ * in: gw_id - used when creating the MGID address
+ */
+int _vnic_mcast_attach_mgid(struct vnic_login *login,
+                          char *mmac,
+                          struct vnic_mcast *default_mcaste,
+                          void *private_data,
+                          u16 gw_id)
+{
+       struct vnic_mcast *mcaste;
+       int rc = 0;
+       int rss_hash;
+
+       mcaste = vnic_mcast_alloc(login->port, NULL, NULL);
+       if (IS_ERR(mcaste)) {
+               vnic_warn(login->name, "vnic_mcast_alloc for "MAC_6_PRINT_FMT" failed\n",
+                         MAC_6_PRINT_ARG(mmac));
+               vnic_dbg_mark();
+               return -ENOMEM;
+       }
+       memcpy(mcaste->mac, mmac, ETH_ALEN);
+
+       /* if the mcast mac has a multicast IP embedded in it: */
+       rss_hash = 0;
+       if ((mcaste->mac[0] & 0xf0) == 0xe0 &&
+            mcaste->mac[4] == 0x00 &&
+            mcaste->mac[5] == 0x00) {
+               /* calculate the mcast rss_hash from the IP octets */
+               rss_hash = mcaste->mac[0] ^ mcaste->mac[1] ^
+                          mcaste->mac[2] ^ mcaste->mac[3];
+               /* and build the corresponding mcast MAC using the IEEE
+                * multicast OUI 01:00:5e
+                */
+               mcaste->mac[5] = mcaste->mac[3];
+               mcaste->mac[4] = mcaste->mac[2];
+               mcaste->mac[3] = mcaste->mac[1] & 0x7f;
+               mcaste->mac[2] = 0x5e;
+               mcaste->mac[1] = 0x00;
+               mcaste->mac[0] = 0x01;
+       }
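+       /* e.g. a "MAC" carrying the multicast IP 224.1.2.3 (e0:01:02:03:00:00)
+        * is rewritten above to the standard IPv4 multicast MAC
+        * 01:00:5e:01:02:03, with rss_hash = 0xe0 ^ 0x01 ^ 0x02 ^ 0x03 = 0xe0
+        */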
+
+       __vnic_mcaste_fill(login, mcaste, gw_id, mcaste->mac, rss_hash, 0);
+       mcaste->priv_data = private_data;
+
+       if (default_mcaste)
+               memcpy(&mcaste->port_gid, &default_mcaste->gid, GID_LEN);
+
+       rc = vnic_mcast_add(&login->mcast_tree, mcaste); /* add holds mcast_rb_lock */
+       if (!rc) {
+               rc = vnic_mcast_attach(&login->mcast_tree, mcaste);
+               ASSERT(!rc);
+       } else if (rc == -EEXIST) {
+               /* MGID may already be in the tree when n_mac_mcgid > 0 (ok) */
+               vnic_dbg_mcast(login->name, "vnic_mcast_add for "
+                              MAC_6_PRINT_FMT" already exist, rc %d\n",
+                              MAC_6_PRINT_ARG(mcaste->mac), rc);
+               vnic_mcast_dealloc(mcaste);
+               rc = 0;
+       } else {
+               vnic_warn(login->name, "vnic_mcast_add for "
+                         MAC_6_PRINT_FMT" failed, rc %d\n",
+                         MAC_6_PRINT_ARG(mcaste->mac), rc);
+               vnic_mcast_dealloc(mcaste);
+       }
+       return rc;
+}
+
+struct vnic_mcast *vnic_mcast_alloc(struct vnic_port *port,
+                                   unsigned long *req_attach,
+                                   unsigned long *cur_attached)
+{
+       struct vnic_mcast *mcaste;
+
+       mcaste = kzalloc(sizeof *mcaste, GFP_ATOMIC);
+       if (!mcaste)
+               return ERR_PTR(-ENOMEM);
+       /* set mcaste fields */
+       init_completion(&mcaste->attach_complete);
+       INIT_DELAYED_WORK(&mcaste->attach_task, vnic_mcast_attach_task);
+       spin_lock_init(&mcaste->lock);
+       mcaste->port = port;
+       mcaste->req_attach = req_attach;
+       mcaste->cur_attached = cur_attached;
+
+       return mcaste;
+}
+
+void vnic_mcast_dealloc(struct vnic_mcast *mcaste)
+{
+       struct vnic_port *port;
+
+       ASSERT(mcaste);
+       port = mcaste->port;
+       vnic_dbg_mcast_vv(port->name, "dealloc vnic_mcast: MAC "MAC_6_PRINT_FMT
+                        " GID "VNIC_GID_FMT"\n",
+                        MAC_6_PRINT_ARG(mcaste->mac),
+                        VNIC_GID_ARG(mcaste->gid));
+       kfree(mcaste);
+}
+
+/*
+ * This function grabs the mcast_tree->mcast_rb_lock
+*/
+int vnic_mcast_add(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste)
+{
+       struct rb_node **n = &mcast_tree->mcast_tree.rb_node, *pn = NULL;
+       struct vnic_mcast *mcaste_t;
+       unsigned long flags;
+       int rc;
+
+       spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags);
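+       /* the tree is keyed by the 16-byte MGID in memcmp() order; inserting
+        * a GID that is already present returns -EEXIST
+        */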
+       while (*n) {
+               pn = *n;
+               mcaste_t = rb_entry(pn, struct vnic_mcast, rb_node);
+               rc = memcmp(mcaste->gid.raw, mcaste_t->gid.raw, GID_LEN);
+               if (rc < 0)
+                       n = &pn->rb_left;
+               else if (rc > 0)
+                       n = &pn->rb_right;
+               else {
+                       rc = -EEXIST;
+                       goto out;
+               }
+       }
+
+       rb_link_node(&mcaste->rb_node, pn, n);
+       rb_insert_color(&mcaste->rb_node, &mcast_tree->mcast_tree);
+
+       rc = 0;
+
+out:
+       vnic_dbg_mcast_v(mcaste->port->name,
+                        "added (rc %d) vnic_mcast: MAC "MAC_6_PRINT_FMT
+                        " GID "VNIC_GID_FMT"\n", rc,
+                        MAC_6_PRINT_ARG(mcaste->mac),
+                        VNIC_GID_ARG(mcaste->gid));
+
+       spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags);
+       return rc;
+}
+
+/*
+ * The caller must hold the mcast_tree->mcast_rb_lock lock before calling
+ */
+void vnic_mcast_del(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste)
+{
+       rb_erase(&mcaste->rb_node, &mcast_tree->mcast_tree);
+}
+
+/*
+ * The caller must hold the mcast_tree->mcast_rb_lock lock before calling
+*/
+struct vnic_mcast *vnic_mcast_search(struct mcast_root *mcast_tree,
+                                    union ib_gid *gid)
+{
+       struct rb_node *n = mcast_tree->mcast_tree.rb_node;
+       struct vnic_mcast *mcaste_t;
+       int rc;
+
+       while (n) {
+               mcaste_t = rb_entry(n, struct vnic_mcast, rb_node);
+               rc = memcmp(gid->raw, mcaste_t->gid.raw, GID_LEN);
+               if (rc < 0)
+                       n = n->rb_left;
+               else if (rc > 0)
+                       n = n->rb_right;
+               else {
+                       vnic_dbg_mcast_v(mcaste_t->port->name,
+                                        "found: MAC "MAC_6_PRINT_FMT" GID "
+                                        VNIC_GID_FMT"\n",
+                                        MAC_6_PRINT_ARG(mcaste_t->mac),
+                                        VNIC_GID_ARG(mcaste_t->gid));
+                       goto out;
+               }
+       }
+       mcaste_t = ERR_PTR(-ENODATA);
+
+out:
+       return mcaste_t;
+}
+
+static void vnic_mcast_detach_ll(struct vnic_mcast *mcaste, struct mcast_root *mcast_tree)
+{
+       struct vnic_port *port = mcaste->port;
+       struct ib_ah *tmp_ih;
+       unsigned long flags;
+       int rc;
+
+       vnic_dbg_mcast_v(port->name,
+                        "mcaste->attached %d for mac "MAC_6_PRINT_FMT"\n",
+                        test_bit(MCAST_ATTACHED, &mcaste->state),
+                        MAC_6_PRINT_ARG(mcaste->mac));
+
+       spin_lock_irqsave(&mcaste->lock, flags);
+       if (!test_and_clear_bit(MCAST_ATTACHED, &mcaste->state)) {
+               spin_unlock_irqrestore(&mcaste->lock, flags);
+               return;
+       }
+
+       tmp_ih = mcaste->ah;
+       mcaste->ah = NULL;
+       spin_unlock_irqrestore(&mcaste->lock, flags);
+
+       /* callback */
+       if (mcaste->detach_cb) {
+               vnic_dbg_mcast(port->name, "calling detach_cb\n");
+               mcaste->detach_cb(mcaste, mcaste->detach_cb_ctx);
+       }
+
+       if (!mcaste->sender_only)
+               rc = ib_detach_mcast(mcaste->qp, &mcaste->gid, port->attr.lid);
+       else
+               rc = 0;
+
+       ASSERT(tmp_ih);
+       rc = ib_destroy_ah(tmp_ih);
+       if (rc)
+               vnic_warn(port->name,
+                         "ib_destroy_ah failed (rc %d) for mcaste mac "
+                         MAC_6_PRINT_FMT"\n", rc,
+                         MAC_6_PRINT_ARG(mcaste->mac));
+       vnic_dbg_mcast(port->name, "GID "VNIC_GID_FMT" detached!\n",
+                      VNIC_GID_ARG(mcaste->gid));
+}
+
+int vnic_mcast_detach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste)
+{
+       struct vnic_port *port = mcaste->port;
+       unsigned long flags;
+
+       /* must be a task, to make sure no attach task is pending */
+       vnic_dbg_mcast_v(port->name, "queue delayed task (%lu) "
+                        "vnic_mcast_detach_task\n", mcaste->backoff);
+
+       /* cancel any pending/queued tasks. We can not use sync
+        * under the spinlock because it might hang. we need the
+        * spinlock here to ensure the requeueing is atomic
+        */
+       vnic_dbg_mcast_v(port->name, "cancel attach_task\n");
+       spin_lock_irqsave(&mcaste->lock, flags);
+       clear_bit(MCAST_ATTACH_RUNNING, &mcaste->state);
+       spin_unlock_irqrestore(&mcaste->lock, flags);
+#ifndef _BP_WORK_SYNC
+       cancel_delayed_work_sync(&mcaste->attach_task);
+#else
+       cancel_delayed_work(&mcaste->attach_task);
+       flush_workqueue(mcast_wq);
+#endif
+       vnic_mcast_detach_ll(mcaste, mcast_tree);
+
+       if (mcaste->port_mcaste)
+               vnic_port_mcast_release(mcaste->port_mcaste);
+
+       return 0;
+}
+
+static void vnic_mcast_attach_task(struct work_struct *work)
+{
+       struct ib_ah_attr av;
+       struct vnic_mcast *mcaste =
+           container_of(work, struct vnic_mcast, attach_task.work);
+       struct vnic_port *port = mcaste->port;
+       unsigned long flags;
+       int rc;
+       u16 mlid;
+
+       if ((++mcaste->attach_task_cnt > mcaste->retry && mcaste->retry) ||
+               !test_bit(MCAST_ATTACH_RUNNING, &mcaste->state)) {
+               vnic_dbg_mcast_v(port->name,
+                                "attach_task stopped, tried %ld times\n",
+                                mcaste->retry);
+               goto out;
+       }
+
+       /* update backoff time */
+       mcaste->backoff = min(mcaste->backoff * mcaste->backoff_factor,
+                             msecs_to_jiffies(VNIC_MCAST_BACKOFF_MAX_MSEC));
+
+       if (!test_bit(MCAST_JOINED, &mcaste->port_mcaste->state)) {
+               vnic_dbg_mcast_v(port->name, "joined %d, retry %ld from %ld\n",
+                                test_bit(MCAST_JOINED, &mcaste->port_mcaste->state),
+                                mcaste->attach_task_cnt, mcaste->retry);
+               goto retry;
+       }
+
+       /* attach QP */
+       ASSERT(mcaste);
+       ASSERT(mcaste->port_mcaste);
+       ASSERT(mcaste->port_mcaste->sa_mcast);
+       mlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid);
+       vnic_dbg_mcast(port->name, "QPN 0x%06x attaching MGID "VNIC_GID_FMT
+                      " LID 0x%04x\n", mcaste->qp->qp_num,
+                      VNIC_GID_ARG(mcaste->gid), mlid);
+       if (!mcaste->sender_only)
+               rc = ib_attach_mcast(mcaste->qp, &mcaste->gid, mlid);
+       else
+               rc = 0;
+
+       if (rc) {
+               int attach_count = atomic_read(&mcaste->port_mcaste->ref_cnt);
+
+               vnic_err(port->name, "failed to attach (rc %d) to multicast "
+                        "group, MGID "VNIC_GID_FMT"\n",
+                        rc, VNIC_GID_ARG(mcaste->gid));
+
+               if (port->dev->attr.max_mcast_qp_attach <= attach_count) {
+                       vnic_err(port->name, "Attach failed. Too many vnics are on the same"
+                                " vhub on this port. vnics count=%d, max=%d\n", 
+                                attach_count,
+                                port->dev->attr.max_mcast_qp_attach);
+               }
+
+               goto retry;
+       } else {
+               /* create mcast ah */
+               memset(&av, 0, sizeof(av));
+               av.dlid = be16_to_cpu(mcaste->port_mcaste->rec.mlid);
+               av.port_num = mcaste->port->num;
+               av.ah_flags = IB_AH_GRH;
+               av.static_rate = mcaste->port_mcaste->rec.rate;
+               av.sl = mcaste->port_mcaste->rec.sl;
+               memcpy(&av.grh.dgid, mcaste->gid.raw, GID_LEN);
+               spin_lock_irqsave(&mcaste->lock, flags);
+               mcaste->ah = ib_create_ah(port->pd, &av);
+               if (IS_ERR(mcaste->ah)) {
+                       rc = (int)PTR_ERR(mcaste->ah);
+                       mcaste->ah = NULL;
+                       vnic_err(port->name,
+                                "ib_create_ah failed (rc %d)\n", rc);
+                       spin_unlock_irqrestore(&mcaste->lock, flags);
+                       /* for such a failure, no need to retry */
+                       goto out;
+               }
+               vnic_dbg_mcast(mcaste->port->name, "created mcast ah for %p\n", mcaste);
+
+               /* callback */
+               set_bit(MCAST_ATTACHED, &mcaste->state);
+               spin_unlock_irqrestore(&mcaste->lock, flags);
+
+               if (mcaste->cur_attached)
+                       set_bit(mcaste->attach_bit_nr, mcaste->cur_attached);
+               vnic_dbg_mcast(mcaste->port->name,
+                              "attached GID "VNIC_GID_FMT"\n",
+                              VNIC_GID_ARG(mcaste->gid));
+               if (mcaste->attach_cb) {
+                       vnic_dbg_mcast(mcaste->port->name,
+                                      "calling attach_cb\n");
+                       mcaste->attach_cb(mcaste, mcaste->attach_cb_ctx);
+               }
+       }
+
+out:
+       mcaste->attach_task_cnt = 0; /* for next time */
+       mcaste->backoff = mcaste->backoff_init;
+       clear_bit(MCAST_ATTACH_RUNNING, &mcaste->state);
+       complete(&mcaste->attach_complete);
+       return;
+
+retry:
+       spin_lock_irqsave(&mcaste->lock, flags);
+       if (test_bit(MCAST_ATTACH_RUNNING, &mcaste->state)) {
+               /* calls vnic_mcast_attach_task() */
+               queue_delayed_work(mcast_wq, &mcaste->attach_task, mcaste->backoff);
+       }
+       spin_unlock_irqrestore(&mcaste->lock, flags);
+}
+
+int vnic_mcast_attach(struct mcast_root *mcast_tree, struct vnic_mcast *mcaste)
+{
+       struct vnic_port_mcast *pmcaste;
+       struct vnic_port *port = mcaste->port;
+       int rc = 0;
+       ASSERT(mcaste);
+
+       mcaste->backoff_init = mcaste->backoff;
+
+       pmcaste = vnic_port_mcast_update(mcaste);
+       if (IS_ERR(pmcaste)) {
+               vnic_err(port->name, "vnic_port_mcast_update failed GID "
+                        VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid));
+               rc = PTR_ERR(pmcaste);
+               goto out;
+       }
+
+       mcaste->port_mcaste = pmcaste;
+
+       set_bit(MCAST_ATTACH_RUNNING, &mcaste->state);
+
+       /* must be a task, to sample the joined flag */
+       vnic_dbg_mcast_v(port->name, "queue delayed task (%lu) "
+                        "vnic_mcast_join_task\n", mcaste->backoff);
+       init_completion(&mcaste->attach_complete);
+       /* calls vnic_mcast_attach_task() */
+       queue_delayed_work(mcast_wq, &mcaste->attach_task, 0);
+       if (mcaste->blocking) {
+               wait_for_completion(&mcaste->attach_complete);
+               if (test_bit(MCAST_ATTACHED, &mcaste->state))
+                       goto out;
+               vnic_mcast_detach(mcast_tree, mcaste);
+               rc = 1;
+       }
+
+out:
+       return rc;
+}
+
+#if 0
+static int vnic_mcast_attach_all(struct mcast_root *mcast_tree)
+{
+       int fails = 0;
+       struct vnic_mcast *mcaste;
+       struct rb_node *n;
+
+       n = rb_first(&mcast_tree->mcast_tree);
+       while (n) {
+               mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+               n = rb_next(n);
+               /* async call */
+               if (vnic_mcast_attach(mcast_tree, mcaste))
+                       fails++;
+       }
+
+       return fails;
+}
+#endif
+
+int vnic_mcast_del_all(struct mcast_root *mcast_tree)
+{
+       struct rb_node *n;
+       struct vnic_mcast *mcaste, *mcaste_t;
+       unsigned long flags;
+       int fails = 0;
+       LIST_HEAD(local_list);
+
+       spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags);
+       n = rb_first(&mcast_tree->mcast_tree);
+       while (n) {
+               mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+               vnic_mcast_del(mcast_tree, mcaste);
+               list_add_tail(&mcaste->list, &local_list);
+               n = rb_first(&mcast_tree->mcast_tree);
+       }
+       spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags);
+
+       list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) {
+               list_del(&mcaste->list);
+               vnic_mcast_detach(mcast_tree, mcaste);
+               vnic_mcast_dealloc(mcaste);
+       }
+
+       return fails;
+}
+
+int vnic_mcast_del_user(struct mcast_root *mcast_tree, void *owner)
+{
+       struct rb_node *n;
+       struct vnic_mcast *mcaste, *mcaste_t;
+       unsigned long flags;
+       int fails = 0;
+       LIST_HEAD(local_list);
+
+       spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags);
+       n = rb_first(&mcast_tree->mcast_tree);
+       while (n) {
+               mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+               n = rb_next(&mcaste->rb_node);
+               if (mcaste->priv_data == owner) {
+                       list_add_tail(&mcaste->list, &local_list);
+                       vnic_mcast_del(mcast_tree, mcaste);
+               }
+       }
+       spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags);
+
+       list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) {
+               list_del(&mcaste->list);
+               vnic_mcast_detach(mcast_tree, mcaste);
+               vnic_mcast_dealloc(mcaste);
+       }
+
+       return fails;
+}
+
+/* PORT MCAST FUNCTIONS */
+static struct vnic_port_mcast *vnic_port_mcast_alloc(struct vnic_port *port,
+                                                    union ib_gid *gid)
+{
+       struct vnic_port_mcast *mcaste;
+
+       mcaste = kzalloc(sizeof *mcaste, GFP_ATOMIC);
+       if (!mcaste)
+               return ERR_PTR(-ENOMEM);
+
+       mcaste->gid = *gid;
+       mcaste->port = port;
+       init_completion(&mcaste->leave_complete);
+       atomic_set(&mcaste->ref_cnt, 1);
+       INIT_DELAYED_WORK(&mcaste->join_task, vnic_port_mcast_join_task);
+       INIT_WORK(&mcaste->leave_task, vnic_port_mcast_leave_task);
+       mcaste->sa_mcast = ERR_PTR(-EINVAL);
+       memset(&mcaste->rec,0,sizeof(mcaste->rec));
+       vnic_dbg_mcast_v(mcaste->port->name, "allocated port_mcast GID "
+                        VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid));
+       spin_lock_init(&mcaste->lock);
+       set_bit(MCAST_JOIN_RUNNING, &mcaste->state);
+
+       return mcaste;
+}
+
+static void vnic_port_mcast_dealloc(struct vnic_port_mcast *mcaste)
+{
+       ASSERT(mcaste);
+       vnic_dbg_mcast_v(NULL, "dealloc port_mcast GID "
+                        VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid));
+       kfree(mcaste);
+}
+
+/*
+ * This function accesses the port mcast tree. Please make sure
+ * to call it only while holding the port mcast_rb_lock
+*/
+static int vnic_port_mcast_add(struct vnic_port_mcast *mcaste)
+{
+       struct rb_node **n = &mcaste->port->mcast_tree.mcast_tree.rb_node;
+       struct rb_node *pn = NULL;
+       struct vnic_port_mcast *mcaste_t;
+       int rc;
+
+       while (*n) {
+               pn = *n;
+               mcaste_t = rb_entry(pn, struct vnic_port_mcast, rb_node);
+               rc = memcmp(mcaste->gid.raw, mcaste_t->gid.raw, GID_LEN);
+               if (rc < 0)
+                       n = &pn->rb_left;
+               else if (rc > 0)
+                       n = &pn->rb_right;
+               else {
+                       rc = -EEXIST;
+                       goto out;
+               }
+       }
+
+       rb_link_node(&mcaste->rb_node, pn, n);
+       rb_insert_color(&mcaste->rb_node, &mcaste->port->mcast_tree.mcast_tree);
+       rc = 0;
+
+out:
+       vnic_dbg_mcast_v(mcaste->port->name, "added (rc %d) port_mcast GID "
+                        VNIC_GID_FMT"\n", rc, VNIC_GID_ARG(mcaste->gid));
+       return rc;
+}
+
+/*
+ * This function accesses the port mcast tree. Please make sure
+ * to call it only while holding the port mcast_rb_lock
+*/
+static void vnic_port_mcast_del(struct vnic_port_mcast *mcaste)
+{
+       ASSERT(mcaste);
+       vnic_dbg_mcast_v(mcaste->port->name, "del port_mcast GID "
+                        VNIC_GID_FMT"\n", VNIC_GID_ARG(mcaste->gid));
+       rb_erase(&mcaste->rb_node, &mcaste->port->mcast_tree.mcast_tree);
+}
+
+/*
+ * This function accesses the port mcast tree. Please make sure
+ * to call it only while holding the port mcast_rb_lock
+*/
+struct vnic_port_mcast *vnic_port_mcast_search(struct vnic_port *port,
+                                              union ib_gid *gid)
+{
+       struct rb_node *n = port->mcast_tree.mcast_tree.rb_node;
+       struct vnic_port_mcast *mcaste_t;
+       int rc;
+
+       while (n) {
+               mcaste_t = rb_entry(n, struct vnic_port_mcast, rb_node);
+               rc = memcmp(gid->raw, mcaste_t->gid.raw, GID_LEN);
+               if (rc < 0)
+                       n = n->rb_left;
+               else if (rc > 0)
+                       n = n->rb_right;
+               else {
+                       vnic_dbg_mcast_v(mcaste_t->port->name,
+                                        "found: GID "VNIC_GID_FMT"\n",
+                                        VNIC_GID_ARG(mcaste_t->gid));
+                       goto out;
+               }
+       }
+       mcaste_t = ERR_PTR(-ENODATA);
+
+out:
+       return mcaste_t;
+}
+/*
+static void vnic_port_mcast_leave_task(struct work_struct *work)
+{
+       struct vnic_port_mcast *mcaste =
+               container_of(work, struct vnic_port_mcast, leave_task.work);
+
+       vnic_dbg_mcast_v(mcaste->port->name, "leave GID "VNIC_GID_FMT"\n",
+                        VNIC_GID_ARG(mcaste->gid));
+
+       if (!IS_ERR(mcaste->sa_mcast) && test_bit(MCAST_JOINED, &mcaste->port_mcaste->state))
+               vnic_dbg_mcast(mcaste->port->name,
+                              "mcast left: GID "VNIC_GID_FMT"\n",
+                              VNIC_GID_ARG(mcaste->gid));
+       if (!IS_ERR(mcaste->sa_mcast))
+               ib_sa_free_multicast(mcaste->sa_mcast);
+       mcaste->sa_mcast = ERR_PTR(-EINVAL);
+       clear_bit(MCAST_JOINED, &mcaste->port_mcaste->state);
+}
+*/
+
+static int vnic_port_mcast_leave(struct vnic_port_mcast *mcaste,
+                                unsigned long backoff)
+{
+       unsigned long flags;
+
+       ASSERT(mcaste);
+       vnic_dbg_mcast(NULL, "queue delayed task (%lu) "
+                      "vnic_mcast_leave_task\n", backoff);
+
+       /* cancel any pending/queued tasks. We can not use sync
+        * under the spinlock because it might hang. we need the
+        * spinlock here to ensure the requeueing is atomic
+        */
+       spin_lock_irqsave(&mcaste->lock, flags);
+       clear_bit(MCAST_JOIN_RUNNING, &mcaste->state);
+       spin_unlock_irqrestore(&mcaste->lock, flags);
+#ifndef _BP_WORK_SYNC
+       cancel_delayed_work_sync(&mcaste->join_task);
+#else
+       cancel_delayed_work(&mcaste->join_task);
+       if (delayed_work_pending(&mcaste->join_task)) {
+               return -EBUSY;
+       }
+#endif
+
+       if (test_and_clear_bit(MCAST_JOIN_STARTED, &mcaste->state)
+           && !IS_ERR(mcaste->sa_mcast)) {
+               ib_sa_free_multicast(mcaste->sa_mcast);
+               mcaste->sa_mcast = ERR_PTR(-EINVAL);
+       }
+
+       return 0;
+}
+
+static int vnic_port_mcast_join_comp(int status, struct ib_sa_multicast *sa_mcast)
+{
+       struct vnic_port_mcast *mcaste = sa_mcast->context;
+       unsigned long flags;
+
+       vnic_dbg_mcast(mcaste->port->name, "join completion for GID "
+                      VNIC_GID_FMT" (status %d)\n",
+                      VNIC_GID_ARG(mcaste->gid), status);
+
+       if (status == -ENETRESET)
+               return 0;
+
+       if (status)
+               goto retry;
+
+       /* same as mcaste->rec = mcaste->sa_mcast->rec; */
+       mcaste->rec = sa_mcast->rec;
+
+       set_bit(MCAST_JOINED, &mcaste->state);
+       vnic_dbg_mcast(mcaste->port->name, "joined GID "VNIC_GID_FMT"\n",
+                      VNIC_GID_ARG(mcaste->gid));
+#if 0
+       vnic_dbg_mcast_v(mcaste->port->name, "mcast record dump:\n");
+       vnic_dbg_mcast_v(mcaste->port->name, "mgid      "VNIC_GID_FMT"\n",
+                        VNIC_GID_ARG(rec->mgid));
+       vnic_dbg_mcast_v(mcaste->port->name, "port_gid  "VNIC_GID_FMT"\n",
+                        VNIC_GID_ARG(rec->port_gid));
+       vnic_dbg_mcast_v(mcaste->port->name, "pkey       0x%x\n", rec->pkey);
+       vnic_dbg_mcast_v(mcaste->port->name, "qkey       0x%x\n", rec->qkey);
+       vnic_dbg_mcast_v(mcaste->port->name, "mtu_slct   0x%x\n",
+                        rec->mtu_selector);
+       vnic_dbg_mcast_v(mcaste->port->name, "mtu        0x%x\n", rec->mtu);
+       vnic_dbg_mcast_v(mcaste->port->name, "rate_slct  0x%x\n",
+                        rec->rate_selector);
+       vnic_dbg_mcast_v(mcaste->port->name, "rate       0x%x\n", rec->rate);
+       vnic_dbg_mcast_v(mcaste->port->name, "sl         0x%x\n", rec->sl);
+       vnic_dbg_mcast_v(mcaste->port->name, "flow_label 0x%x\n",
+                        rec->flow_label);
+       vnic_dbg_mcast_v(mcaste->port->name, "hop_limit  0x%x\n",
+                        rec->hop_limit);
+#endif
+
+       goto out;
+retry:
+       /* calls vnic_port_mcast_join_task() */
+       spin_lock_irqsave(&mcaste->lock, flags);
+       if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state))
+               queue_delayed_work(mcast_wq, &mcaste->join_task, mcaste->backoff);
+       spin_unlock_irqrestore(&mcaste->lock, flags);
+
+out:
+       /* rc is always zero so we handle ib_sa_free_multicast ourselves */
+       return 0;
+}
+
+static void vnic_port_mcast_join_task(struct work_struct *work)
+{
+       struct vnic_port_mcast *mcaste =
+           container_of(work, struct vnic_port_mcast, join_task.work);
+       struct ib_sa_mcmember_rec rec = {
+               .join_state = mcaste->join_state
+       };
+       int rc;
+       ib_sa_comp_mask comp_mask;
+       unsigned long flags;
+
+       if (++mcaste->join_task_cnt > mcaste->retry && mcaste->retry) {
+               vnic_dbg_mcast(mcaste->port->name,
+                              "join_task stopped, tried %ld times\n",
+                              mcaste->retry);
+               goto out;
+       }
+
+       /* update backoff time */
+       mcaste->backoff = min(mcaste->backoff * mcaste->backoff_factor,
+                             msecs_to_jiffies(VNIC_MCAST_BACKOFF_MAX_MSEC));
+
+       rec.mgid.global = mcaste->gid.global;
+       rec.port_gid.global = mcaste->port->gid.global;
+       rec.pkey = cpu_to_be16(mcaste->pkey);
+
+       comp_mask =
+           IB_SA_MCMEMBER_REC_MGID |
+           IB_SA_MCMEMBER_REC_PORT_GID |
+           /*IB_SA_MCMEMBER_REC_PKEY | */
+           IB_SA_MCMEMBER_REC_JOIN_STATE;
+
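+       /* only the group creator supplies the full member record (qkey, mtu,
+        * rate, sl, flow label, hop limit, pkey); plain joiners match on
+        * MGID, port GID and join state alone
+        */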
+       if (mcaste->create) {
+               comp_mask |=
+                   IB_SA_MCMEMBER_REC_QKEY |
+                   IB_SA_MCMEMBER_REC_MTU_SELECTOR |
+                   IB_SA_MCMEMBER_REC_MTU |
+                   IB_SA_MCMEMBER_REC_TRAFFIC_CLASS |
+                   IB_SA_MCMEMBER_REC_RATE_SELECTOR |
+                   IB_SA_MCMEMBER_REC_RATE |
+                   IB_SA_MCMEMBER_REC_SL |
+                   IB_SA_MCMEMBER_REC_FLOW_LABEL |
+                   IB_SA_MCMEMBER_REC_HOP_LIMIT |
+                   IB_SA_MCMEMBER_REC_PKEY;
+
+               rec.qkey = cpu_to_be32(mcaste->qkey);
+               rec.mtu_selector = IB_SA_EQ;
+               rec.rate_selector = IB_SA_EQ;
+               /* when no_bxm is set, use min values to let everybody in */
+               rec.mtu = no_bxm ? IB_MTU_2048 : mcaste->port->attr.max_mtu;
+               rec.rate = no_bxm ? IB_RATE_10_GBPS : mcaste->port->rate_enum;
+               rec.sl = 0;
+               rec.flow_label = 0;
+               rec.hop_limit = 0;
+       }
+
+       vnic_dbg_mcast(mcaste->port->name, "joining MGID "VNIC_GID_FMT
+                      " create %d, comp_mask %lu\n",
+                      VNIC_GID_ARG(mcaste->gid), mcaste->create, (unsigned long)comp_mask);
+
+       if (!IS_ERR(mcaste->sa_mcast))
+               ib_sa_free_multicast(mcaste->sa_mcast);
+
+       mcaste->sa_mcast =
+           ib_sa_join_multicast(&vnic_sa_client, mcaste->port->dev->ca,
+                                mcaste->port->num, &rec, comp_mask,
+                                GFP_KERNEL, vnic_port_mcast_join_comp, mcaste);
+       set_bit(MCAST_JOIN_STARTED, &mcaste->state);
+
+       if (IS_ERR(mcaste->sa_mcast)) {
+               rc = PTR_ERR(mcaste->sa_mcast);
+               vnic_warn(mcaste->port->name,
+                         "ib_sa_join_multicast failed, status %d\n", rc);
+               /* calls vnic_port_mcast_join_task() */
+               spin_lock_irqsave(&mcaste->lock, flags);
+               if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state))
+                       queue_delayed_work(mcast_wq, &mcaste->join_task, mcaste->backoff);
+               spin_unlock_irqrestore(&mcaste->lock, flags);
+       }
+
+       return;
+
+out:
+       mcaste->join_task_cnt = 0; /* for next time */
+       mcaste->backoff = mcaste->backoff_init;
+       return;
+}
+
+static int vnic_port_mcast_join(struct vnic_port_mcast *mcaste)
+{
+       unsigned long flags;
+
+       ASSERT(mcaste);
+       vnic_dbg_mcast_v(mcaste->port->name, "queue delayed task (%lu) "
+                        "vnic_port_mcast_join_task\n", mcaste->backoff);
+
+       /* calls vnic_port_mcast_join_task() */
+       spin_lock_irqsave(&mcaste->lock, flags);
+       if (test_bit(MCAST_JOIN_RUNNING, &mcaste->state))
+               queue_delayed_work(mcast_wq, &mcaste->join_task, 0);
+       spin_unlock_irqrestore(&mcaste->lock, flags);
+
+       return 0;
+}
+
+#if 0
+static int vnic_port_mcast_join_all(struct vnic_port *port)
+{
+       int fails = 0;
+       struct vnic_port_mcast *mcaste;
+       struct rb_node *n;
+
+       n = rb_first(&port->mcast_tree.mcast_tree);
+       while (n) {
+               mcaste = rb_entry(n, struct vnic_port_mcast, rb_node);
+               n = rb_next(n);
+               if (vnic_port_mcast_join(mcaste))
+                       fails++;
+       }
+
+       return fails;
+}
+#endif
+
+static void vnic_port_mcast_leave_task(struct work_struct *work)
+{
+       struct vnic_port_mcast *mcaste =
+           container_of(work, struct vnic_port_mcast, leave_task);
+
+#ifndef _BP_WORK_SYNC
+       vnic_port_mcast_leave(mcaste, 0);
+#else
+       if (vnic_port_mcast_leave(mcaste, 0)) {
+               queue_work(mcast_wq, &mcaste->leave_task);
+               return;
+       }
+#endif
+       vnic_port_mcast_dealloc(mcaste);
+}
+
+static void vnic_port_mcast_release(struct vnic_port_mcast *mcaste)
+{
+       unsigned long flags;
+
+       struct vnic_port *port = mcaste->port;
+
+       vnic_dbg_mcast(port->name, "update mcaste->ref_cnt %d -> %d\n",
+                      atomic_read(&mcaste->ref_cnt),
+                      atomic_read(&mcaste->ref_cnt) - 1);
+
+       spin_lock_irqsave(&port->mcast_tree.mcast_rb_lock, flags);
+       if (atomic_dec_and_test(&mcaste->ref_cnt)) {
+               vnic_port_mcast_del(mcaste);
+               spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags);
+
+               /* do not wait for the leave to complete, just queue it and
+                * move on;
+                * calls vnic_port_mcast_leave_task()
+                */
+               queue_work(mcast_wq, &mcaste->leave_task);
+       } else
+               spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags);
+}
+
+static struct vnic_port_mcast *vnic_port_mcast_update(struct vnic_mcast *_mcaste)
+{
+       union ib_gid *gid = &_mcaste->port_gid;
+       u32 qkey = _mcaste->qkey;
+       u16 pkey = _mcaste->pkey;
+       struct vnic_port *port = _mcaste->port;
+       struct vnic_port_mcast *mcaste;
+       unsigned long flags;
+
+       spin_lock_irqsave(&port->mcast_tree.mcast_rb_lock, flags);
+       mcaste = vnic_port_mcast_search(port, gid);
+       /* entry found */
+       if (PTR_ERR(mcaste) != -ENODATA) {
+               ASSERT(!IS_ERR(mcaste));
+               atomic_inc(&mcaste->ref_cnt);
+               spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags);
+               vnic_dbg_mcast(mcaste->port->name,
+                              "found, add GID "VNIC_GID_FMT" \n",
+                              VNIC_GID_ARG(*gid));
+               vnic_dbg_mcast(mcaste->port->name,
+                              "update mcaste->ref_cnt %d -> %d\n",
+                              atomic_read(&mcaste->ref_cnt),
+                              atomic_read(&mcaste->ref_cnt) + 1);
+       } else { /* not found, add it */
+               mcaste = vnic_port_mcast_alloc(port, gid);
+               if (IS_ERR(mcaste)) {
+                       spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags);
+                       return mcaste;
+               }
+               vnic_dbg_mcast(mcaste->port->name,
+                              "not found, add GID "VNIC_GID_FMT" \n",
+                              VNIC_GID_ARG(*gid));
+               vnic_dbg_mcast(mcaste->port->name,
+                              "update mcaste->ref_cnt %d -> %d\n",
+                              atomic_read(&mcaste->ref_cnt),
+                              atomic_read(&mcaste->ref_cnt) + 1);
+               mcaste->qkey = qkey;
+               mcaste->pkey = pkey;
+               mcaste->backoff_init = _mcaste->backoff_init;
+               mcaste->backoff = _mcaste->backoff;
+               mcaste->backoff_factor = _mcaste->backoff_factor;
+               mcaste->retry = _mcaste->retry;
+               mcaste->create = _mcaste->create;
+               mcaste->join_state = _mcaste->join_state;
+               vnic_port_mcast_add(mcaste);
+               spin_unlock_irqrestore(&port->mcast_tree.mcast_rb_lock, flags);
+
+               vnic_port_mcast_join(mcaste);
+               vnic_dbg_mcast(mcaste->port->name, "added\n");
+       }
+
+       return mcaste;
+}
+
+#if 0
+void vnic_port_mcast_del_all(struct vnic_port *port)
+{
+
+       struct rb_node *n;
+       struct vnic_port_mcast *mcaste, *mcaste_t;
+       LIST_HEAD(local_list);
+
+       ASSERT(port);
+
+       n = rb_first(&port->mcast_tree.mcast_tree);
+       while (n) {
+               mcaste = rb_entry(n, struct vnic_port_mcast, rb_node);
+               list_add_tail(&mcaste->list, &local_list);
+               n = rb_next(&mcaste->rb_node);
+       }
+
+       list_for_each_entry_safe(mcaste, mcaste_t, &local_list, list) {
+               list_del(&mcaste->list);
+               vnic_warn(port->name, "shouldn't find gid "VNIC_GID_FMT"\n",
+                         VNIC_GID_ARG(mcaste->gid));
+               vnic_port_mcast_release(mcaste);
+       }
+
+       return;
+}
+#endif
+
+void vnic_tree_mcast_detach(struct mcast_root *mcast_tree)
+{
+       struct vnic_mcast *mcaste, *mcaste_t;
+       struct rb_node *n;
+       unsigned long flags;
+       INIT_LIST_HEAD(&mcast_tree->reattach_list);
+
+       spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags);
+       n = rb_first(&mcast_tree->mcast_tree);
+       while (n) {
+               mcaste = rb_entry(n, struct vnic_mcast, rb_node);
+               list_add_tail(&mcaste->list, &mcast_tree->reattach_list);
+               n = rb_next(&mcaste->rb_node);
+               vnic_mcast_del(mcast_tree, mcaste);
+               mcaste->attach_task_cnt = 0;
+       }
+       spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags);
+
+       list_for_each_entry_safe(mcaste, mcaste_t, &mcast_tree->reattach_list, list) {
+               vnic_mcast_detach(mcast_tree, mcaste);
+       }
+
+       return;
+}
+
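+/*
+ * Re-add and re-attach every entry that vnic_tree_mcast_detach() parked on
+ * reattach_list (e.g. when multicast groups must be rejoined after a port
+ * event).
+ */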
+void vnic_tree_mcast_attach(struct mcast_root *mcast_tree)
+{
+       struct vnic_mcast *mcaste, *mcaste_t;
+       int rc;
+
+       /* the add function grabs mcast_rb_lock, so there is no need to take it here */
+       list_for_each_entry_safe(mcaste, mcaste_t, &mcast_tree->reattach_list, list) {
+               rc = vnic_mcast_add(mcast_tree, mcaste);
+               ASSERT(!rc);
+               rc = vnic_mcast_attach(mcast_tree, mcaste);
+               ASSERT(!rc);
+               list_del(&mcaste->list);
+       }
+
+       return;
+}
+
+int vnic_mcast_init(void)
+{
+       ib_sa_register_client(&vnic_sa_client);
+
+       mcast_wq = create_singlethread_workqueue("mcast_wq");
+       if (!mcast_wq)
+               return -ENOMEM;
+
+       return 0;
+}
+
+void vnic_mcast_cleanup(void)
+{
+       ASSERT(mcast_wq);
+       vnic_dbg_mark();
+       flush_workqueue(mcast_wq);
+       vnic_dbg_mark();
+       destroy_workqueue(mcast_wq);
+       vnic_dbg_mark();
+       ib_sa_unregister_client(&vnic_sa_client);
+
+       return;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_mcast.h
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_param.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_param.c
new file mode 100644 (file)
index 0000000..56751aa
--- /dev/null
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_fip.h"
+
+u32 vnic_lro_num = VNIC_MAX_LRO_DESCS;
+u32 vnic_net_admin = 1;
+u32 vnic_child_max = VNIC_CHILD_MAX;
+u32 vnic_tx_rings_num = 0;
+u32 vnic_rx_rings_num = 0;
+u32 vnic_tx_rings_len = VNIC_TX_QUEUE_LEN;
+u32 vnic_rx_rings_len = VNIC_RX_QUEUE_LEN;
+u32 vnic_mgid_data_type = 0;
+u32 vnic_encap_headroom = 1;
+u32 vnic_tx_polling = 1;
+u32 vnic_rx_linear = 0;
+u32 vnic_change_mac = 0;
+u32 vnic_learn_mac_enabled = 1;
+u32 vnic_synd_backlog = 4;
+u32 vnic_eport_state_enforce = 0;
+u32 vnic_src_mac_enforce = 0;
+u32 vnic_inline_tshold = 0;
+u32 vnic_discovery_pkeys[MAX_NUM_PKEYS_DISCOVERY];
+u32 vnic_discovery_pkeys_count = MAX_NUM_PKEYS_DISCOVERY;
+u32 vnic_sa_query = 0;
+
+/* these params are enabled only in debug mode */
+u32 no_bxm = 0;
+u32 vnic_msglvl = 0x80000000;
+u32 vnic_max_tx_outs = VNIC_MAX_TX_OUTS;
+u32 vnic_linear_small_pkt = 1;
+u32 vnic_mcast_create = 0;
+u32 vnic_napi_weight = VNIC_MAX_RX_CQE;
+
+module_param_named(tx_rings_num, vnic_tx_rings_num, int, 0444);
+MODULE_PARM_DESC(tx_rings_num, "Number of TX rings, use 0 for #cpus [default 0, max 32]");
+
+module_param_named(tx_rings_len, vnic_tx_rings_len, int, 0444);
+MODULE_PARM_DESC(tx_rings_len, "Length of TX rings, must be power of two [default 1024, max 8K]");
+
+module_param_named(rx_rings_num, vnic_rx_rings_num, int, 0444);
+MODULE_PARM_DESC(rx_rings_num, "Number of RX rings, use 0 for #cpus [default 0, max 32]");
+
+module_param_named(rx_rings_len, vnic_rx_rings_len, int, 0444);
+MODULE_PARM_DESC(rx_rings_len, "Length of RX rings, must be power of two [default 2048, max 8K]");
+
+module_param_named(eport_state_enforce, vnic_eport_state_enforce, int, 0644);
+MODULE_PARM_DESC(eport_state_enforce, "Bring interface up only when corresponding EPort is up [default 0]");
+
+module_param_named(src_mac_enforce, vnic_src_mac_enforce, int, 0644);
+MODULE_PARM_DESC(src_mac_enforce, "Enforce source MAC address [default 0]");
+
+module_param_named(vnic_net_admin, vnic_net_admin, int, 0644);
+MODULE_PARM_DESC(vnic_net_admin, "Enable Network Administration mode [default 1]");
+
+module_param_named(vnic_child_max, vnic_child_max, int, 0644);
+MODULE_PARM_DESC(vnic_child_max, "Max child vNics (per interface), use 0 to disable [default 128]");
+
+module_param_named(mgid_data_type, vnic_mgid_data_type, int, 0444);
+MODULE_PARM_DESC(mgid_data_type, "Set MGID data type for multicast traffic [default 0, max 1]");
+
+module_param_named(encap_headroom, vnic_encap_headroom, int, 0444);
+MODULE_PARM_DESC(encap_headroom, "Use SKB headroom for protocol encapsulation [default 1]");
+
+module_param_named(inline_tshold, vnic_inline_tshold, int, 0444);
+MODULE_PARM_DESC(inline_tshold, "Packets smaller than this threshold (in bytes) use inline & blue flame [default 0, max 512]");
+
+module_param_named(tx_polling, vnic_tx_polling, int, 0444);
+MODULE_PARM_DESC(tx_polling, "Enable TX polling mode [default 1]");
+
+module_param_named(rx_linear, vnic_rx_linear, int, 0444);
+MODULE_PARM_DESC(rx_linear, "Enable linear RX buffers [default 0]");
+
+module_param_named(change_mac, vnic_change_mac, int, 0444);
+MODULE_PARM_DESC(change_mac, "Enable MAC change using child vNics [default 0]");
+
+module_param_named(learn_tx_mac, vnic_learn_mac_enabled, int, 0644);
+MODULE_PARM_DESC(learn_tx_mac, "Enable TX MAC learning in promisc mode [default 1]");
+
+module_param_named(synd_backlog, vnic_synd_backlog, int, 0644);
+MODULE_PARM_DESC(synd_backlog, "Syndrome error reporting backlog limit [default 4]");
+
+module_param_array_named(discovery_pkeys, vnic_discovery_pkeys, int, &vnic_discovery_pkeys_count, 0444);
+MODULE_PARM_DESC(discovery_pkeys, "Vector of PKeys to be used for discovery [default 0xffff, max vector length 24]");
+
+module_param_named(sa_query, vnic_sa_query, int, 0644);
+MODULE_PARM_DESC(sa_query, "Query SA for each IB address and ignore gateway assigned SLs [default 0]");
+
+
+#if !(defined(NETIF_F_GRO) && !defined(_BP_NO_GRO))
+module_param_named(lro_num, vnic_lro_num, int, 0444);
+MODULE_PARM_DESC(lro_num, "Number of LRO sessions per ring, use 0 to disable [default 32, max 32]");
+#endif
+
+#ifdef CONFIG_MLX4_VNIC_DEBUG
+module_param_named(no_bxm, no_bxm, int, 0444);
+MODULE_PARM_DESC(no_bxm, "Enable NO BXM mode [default 0]");
+
+module_param_named(msglvl, vnic_msglvl, uint, 0644);
+MODULE_PARM_DESC(msglvl, "Debug message level [default 0]");
+
+module_param_named(max_tx_outs, vnic_max_tx_outs, int, 0644);
+MODULE_PARM_DESC(max_tx_outs, "Max outstanding TX packets [default 16]");
+
+module_param_named(linear_small_pkt, vnic_linear_small_pkt, int, 0644);
+MODULE_PARM_DESC(linear_small_pkt, "Use linear buffer for small packets [default 1]");
+
+module_param_named(mcast_create, vnic_mcast_create, int, 0444);
+MODULE_PARM_DESC(mcast_create, "Create multicast group during join request [default 0]");
+
+module_param_named(napi_weight, vnic_napi_weight, int, 0444);
+MODULE_PARM_DESC(napi_weight, "NAPI weight [default 32]");
+#endif /* CONFIG_MLX4_VNIC_DEBUG */
+
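+/*
+ * Example (module name assumed to be mlx4_vnic):
+ *   modprobe mlx4_vnic tx_rings_num=4 rx_rings_len=4096 eport_state_enforce=1
+ * vnic_param_check() below clamps out-of-range values and rounds the ring
+ * lengths down to a power of two.
+ */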
+int vnic_param_check(void)
+{
+#ifdef CONFIG_MLX4_VNIC_DEBUG
+       vnic_info("VNIC_DEBUG flag is set\n");
+#endif
+
+       vnic_mcast_create = vnic_mcast_create ? 1 : 0;
+       vnic_mcast_create = no_bxm ? 1 : vnic_mcast_create;
+       no_bxm            = no_bxm ? 1 : 0;
+       vnic_sa_query     = vnic_sa_query ? 1 : 0;
+
+       vnic_mgid_data_type = max_t(u32, vnic_mgid_data_type, 0);
+       vnic_mgid_data_type = min_t(u32, vnic_mgid_data_type, 1);
+
+       vnic_rx_rings_num = max_t(u32, vnic_rx_rings_num, 0);
+       vnic_rx_rings_num = min_t(u32, vnic_rx_rings_num, VNIC_MAX_NUM_CPUS);
+
+       vnic_tx_rings_num = max_t(u32, vnic_tx_rings_num, 0);
+       vnic_tx_rings_num = min_t(u32, vnic_tx_rings_num, VNIC_MAX_NUM_CPUS);
+
+       vnic_tx_rings_len = rounddown_pow_of_two(vnic_tx_rings_len);
+       vnic_tx_rings_len = max_t(u32, vnic_tx_rings_len, VNIC_TX_QUEUE_LEN_MIN);
+       vnic_tx_rings_len = min_t(u32, vnic_tx_rings_len, VNIC_TX_QUEUE_LEN_MAX);
+
+       vnic_rx_rings_len = rounddown_pow_of_two(vnic_rx_rings_len);
+       vnic_rx_rings_len = max_t(u32, vnic_rx_rings_len, VNIC_RX_QUEUE_LEN_MIN);
+       vnic_rx_rings_len = min_t(u32, vnic_rx_rings_len, VNIC_RX_QUEUE_LEN_MAX);
+
+       vnic_max_tx_outs  = min_t(u32, vnic_tx_rings_len, vnic_max_tx_outs);
+
+       vnic_napi_weight  = min_t(u32, vnic_napi_weight, VNIC_MAX_NUM_CPUS);
+
+       vnic_lro_num      = max_t(u32, vnic_lro_num, 0);
+       vnic_lro_num      = min_t(u32, vnic_lro_num, VNIC_MAX_LRO_DESCS);
+
+       vnic_inline_tshold = max_t(u32, vnic_inline_tshold, 0);
+       vnic_inline_tshold = min_t(u32, vnic_inline_tshold, VNIC_MAX_INLINE_TSHOLD);
+
+       return 0;
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_port.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_port.c
new file mode 100644 (file)
index 0000000..a973deb
--- /dev/null
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "vnic.h"
+#include "vnic_data.h"
+
+/* globals */
+struct workqueue_struct *port_wq;
+struct workqueue_struct *login_wq;
+
+/* functions */
+static void vnic_port_event(struct ib_event_handler *handler,
+                           struct ib_event *record)
+{
+       struct vnic_port *port =
+               container_of(handler, struct vnic_port, event_handler);
+
+       if (record->element.port_num != port->num)
+               return;
+
+       vnic_info("Received event 0x%x (device %s port %d)\n",
+                 record->event, record->device->name,
+                 record->element.port_num);
+
+       switch (record->event) {
+       case IB_EVENT_SM_CHANGE:
+       case IB_EVENT_CLIENT_REREGISTER:
+               /* calls vnic_port_event_task_light() */
+               queue_delayed_work(fip_wq, &port->event_task_light, msecs_to_jiffies(VNIC_SM_HEADSTART));
+               break;
+       case IB_EVENT_PORT_ERR:
+       case IB_EVENT_PORT_ACTIVE:
+               /* calls vnic_port_event_task() */
+               queue_delayed_work(fip_wq, &port->event_task, msecs_to_jiffies(VNIC_SM_HEADSTART));
+               break;
+       case IB_EVENT_PKEY_CHANGE:
+       case IB_EVENT_LID_CHANGE:
+               /* calls port_fip_discover_restart() */
+               if (no_bxm)
+                       queue_delayed_work(fip_wq, &port->event_task, 0);
+               else
+                       queue_delayed_work(port_wq, &port->discover_restart_task, msecs_to_jiffies(VNIC_SM_HEADSTART));
+               break;
+       case IB_EVENT_SRQ_ERR:
+       case IB_EVENT_SRQ_LIMIT_REACHED:
+       case IB_EVENT_QP_LAST_WQE_REACHED:
+       case IB_EVENT_DEVICE_FATAL:
+       default:
+               vnic_warn(port->name, "event 0x%x unhandled\n", record->event);
+               break;
+       }
+
+}
+
+static inline u8 vnic_mcast_rate_enum(struct vnic_port *port, int rate)
+{
+       u8 ret;
+
+       switch (rate) {
+       case 10:
+               ret = IB_RATE_10_GBPS;
+               break;
+       case 20:
+               ret = IB_RATE_20_GBPS;
+               break;
+       case 40:
+               ret = IB_RATE_40_GBPS;
+               break;
+       case 80:
+               ret = IB_RATE_80_GBPS;
+               break;
+       default:
+               ret = IB_RATE_10_GBPS;
+       }
+       return ret;
+}
+
+int vnic_port_query(struct vnic_port *port)
+{
+       if (ib_query_gid(port->dev->ca, port->num, 0, &port->gid)) {
+               vnic_err(port->name, "ib_query_gid failed\n");
+               return -EINVAL;
+       }
+
+       if (ib_query_port(port->dev->ca, port->num, &port->attr)) {
+               vnic_err(port->name, "ib_query_port failed\n");
+               return -EINVAL;
+       }
+
+       port->max_mtu_enum = ib_mtu_enum_to_int(port->attr.max_mtu);
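+       /* link rate in Gb/s: speed multiplier * lane width * 2.5 Gb/s per SDR lane */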
+       port->rate = ((int)port->attr.active_speed *
+                     ib_width_enum_to_int(port->attr.active_width) * 25) / 10;
+       port->rate_enum = vnic_mcast_rate_enum(port, port->rate);
+
+       if (ib_query_pkey(port->dev->ca, port->num, port->pkey_index,
+                         &port->pkey)) {
+               vnic_err(port->name, "ib_query_pkey failed for index %d\n",
+                        port->pkey_index);
+               return -EINVAL;
+       }
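+       /* force the full-membership bit in the PKey */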
+       port->pkey |= 0x8000;
+
+       return 0;
+}
+
+void vnic_port_event_task(struct work_struct *work)
+{
+       struct vnic_port *port =
+               container_of(work, struct vnic_port, event_task.work);
+       struct fip_discover *discover;
+
+       /* refresh port attributes; TODO: check what else needs to be refreshed */
+       vnic_dbg_mark();
+       mutex_lock(&port->mlock);
+       if (vnic_port_query(port))
+               vnic_warn(port->name, "vnic_port_query failed\n");
+       mutex_unlock(&port->mlock);
+
+       /* refresh login mcasts */
+       vnic_login_refresh_mcasts(port);
+
+       list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+               /* refresh FIP mcasts */
+               if ((!no_bxm) && (discover->state != FIP_DISCOVER_OFF))
+                       fip_refresh_mcasts(discover);
+       }
+}
+
+void vnic_port_event_task_light(struct work_struct *work)
+{
+       struct vnic_port *port =
+               container_of(work, struct vnic_port, event_task_light.work);
+       unsigned long flags, mc_flags;
+       struct fip_discover *discover;
+       struct rb_node *node;
+       struct vnic_port_mcast *mcaste;
+       struct mcast_root *mcast_tree = &port->mcast_tree;
+       struct vnic_login *login;
+
+       vnic_dbg_mark();
+       mutex_lock(&port->mlock);
+
+       if (vnic_port_query(port))
+               vnic_warn(port->name, "vnic_port_query failed\n");
+
+       spin_lock_irqsave(&mcast_tree->mcast_rb_lock, flags);
+       for (node = rb_first(&mcast_tree->mcast_tree); node; node = rb_next(node)) {
+               mcaste = rb_entry(node, struct vnic_port_mcast, rb_node);
+               clear_bit(MCAST_JOINED, &mcaste->state);
+               set_bit(MCAST_JOIN_RUNNING, &mcaste->state);
+               vnic_dbg_mcast(mcaste->port->name, "Rejoin GID="VNIC_GID_FMT"\n",
+                              VNIC_GID_ARG(mcaste->gid));
+               spin_lock_irqsave(&mcaste->lock, mc_flags);
+               queue_delayed_work(mcast_wq, &mcaste->join_task, 0);
+               spin_unlock_irqrestore(&mcaste->lock, mc_flags);
+       }
+
+       spin_unlock_irqrestore(&mcast_tree->mcast_rb_lock, flags);
+
+       vnic_dbg_mark();
+       if (vnic_sa_query)
+               list_for_each_entry(login, &port->login_list, list) {
+                       /* take the tx lock so no delete runs at the same time */
+                       netif_tx_lock_bh(login->dev);
+                       vnic_neigh_invalidate(login);
+                       netif_tx_unlock_bh(login->dev);
+               }
+
+       mutex_unlock(&port->mlock);
+
+       list_for_each_entry(discover, &port->fip.discover_list, discover_list) {
+               if ((!no_bxm) && (discover->state != FIP_DISCOVER_OFF))
+                       fip_refresh_mcasts(discover);
+       }
+}
+
+struct vnic_port *vnic_port_alloc(struct vnic_ib_dev *vnic_dev, u8 num)
+{
+       struct vnic_port *port;
+       int def_rings_num;
+       int max_num_cpus;
+
+       port = kzalloc(sizeof *port, GFP_KERNEL);
+       if (!port)
+               return ERR_PTR(-ENOMEM);
+
+       /* pre-init fields */
+       port->num = num;
+       port->dev = vnic_dev;
+
+       max_num_cpus = min((int)num_online_cpus(), VNIC_MAX_NUM_CPUS);
+       def_rings_num = min(vnic_dev->ca->num_comp_vectors, max_num_cpus);
+       port->rx_rings_num = vnic_rx_rings_num ? vnic_rx_rings_num : def_rings_num;
+       port->tx_rings_num = vnic_tx_rings_num ? vnic_tx_rings_num : def_rings_num;
+
+       sprintf(port->name, "%s:%d", port->dev->ca->name, port->num);
+       INIT_LIST_HEAD(&port->login_list);
+       INIT_LIST_HEAD(&port->fip.discover_list);
+       INIT_DELAYED_WORK(&port->event_task, vnic_port_event_task);
+       INIT_DELAYED_WORK(&port->event_task_light, vnic_port_event_task_light);
+       INIT_DELAYED_WORK(&port->discover_restart_task, port_fip_discover_restart);
+       INIT_IB_EVENT_HANDLER(&port->event_handler, vnic_dev->ca,
+                             vnic_port_event);
+       mutex_init(&port->mlock);
+       mutex_init(&port->start_stop_lock);
+       vnic_mcast_root_init(&port->mcast_tree);
+       atomic_set(&port->vnic_child_ids, 0);
+
+       port->pkey_index = 0;   /* used by fip qps, TBD */
+
+       if (ib_register_event_handler(&port->event_handler)) {
+               vnic_err(port->name, "ib_register_event_handler failed\n");
+               goto err;
+       }
+
+       vnic_dbg_mark();
+       mutex_lock(&port->mlock);
+       if (vnic_port_query(port)) {
+               vnic_err(port->name, "vnic_port_query failed\n");
+               mutex_unlock(&port->mlock);
+               if (ib_unregister_event_handler(&port->event_handler))
+                       vnic_err(port->name, "ib_unregister_event_handler failed!\n");
+               goto err;
+       }
+       mutex_unlock(&port->mlock);
+
+       return port;
+err:
+       kfree(port);
+       return ERR_PTR(-EINVAL);
+}
+
+int vnic_port_init(struct vnic_port *port)
+{
+       return vnic_port_ib_init(port);
+}
+
+void vnic_port_cleanup(struct vnic_port *port)
+{
+       /* should be empty list */
+       vnic_port_ib_cleanup(port);
+       return;
+}
+
+static void vnic_ib_dev_add_one(struct ib_device *device);
+static void vnic_ib_dev_remove_one(struct ib_device *device);
+static struct ib_client vnic_init_client = {
+       .name = DRV_NAME,
+       .add = vnic_ib_dev_add_one,
+       .remove = vnic_ib_dev_remove_one,
+};
+
+static void vnic_ib_dev_add_one(struct ib_device *device)
+{
+       struct vnic_port *ib_port;
+       struct vnic_ib_dev *ib_dev;
+       int s, e, p, rc;
+
+       vnic_dbg(NULL, "ib_dev %s\n", device->name);
+
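+       /* handle only mlx4 HCAs that use the InfiniBand transport */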
+       if (memcmp(device->name, "mlx4", 4))
+               return;
+
+       if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+               return;
+
+       s = 1;
+       e = device->phys_port_cnt;
+
+       /* alloc ib device */
+       ib_dev = kzalloc(sizeof *ib_dev, GFP_KERNEL);
+       if (!ib_dev)
+               return;
+
+       /* init ib dev */
+       mutex_init(&ib_dev->mlock);
+       ib_dev->ca = device;
+       mutex_lock(&ib_dev->mlock);
+       /* TODO: remove mdev once all mlx4 caps are standard */
+       ib_dev->mdev = to_mdev(device);
+       ASSERT(ib_dev->ca);
+       sprintf(ib_dev->name, "%s", device->name);
+       if (ib_query_device(device, &ib_dev->attr)) {
+               vnic_err(ib_dev->name, "ib_query_device failed on %s\n",
+                        device->name);
+               goto abort;
+       }
+
+       VNIC_FW_STR(ib_dev->attr.fw_ver, ib_dev->fw_ver_str);
+       INIT_LIST_HEAD(&ib_dev->port_list);
+       vnic_dbg_mark();
+       for (p = s; p <= e; ++p) {
+               /* skip non IB link layers */
+                if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
+                        continue;
+
+               /* alloc IB port */
+               ib_port = vnic_port_alloc(ib_dev, p);
+               if (IS_ERR(ib_port)) {
+                       vnic_err(ib_dev->name,
+                                "vnic_port_alloc failed %d from %d\n", p, e);
+                       continue;
+               }
+               /* init IB port */
+               rc = vnic_port_init(ib_port);
+               if (rc) {
+                       vnic_err(ib_port->name,
+                                "vnic_port_init failed, rc %d\n", rc);
+                       if (ib_unregister_event_handler(&ib_port->event_handler))
+                               vnic_err(ib_port->name,
+                                        "ib_unregister_event_handler failed!\n");
+                       kfree(ib_port);
+                       continue;
+               }
+               if (no_bxm) {
+                       rc = vnic_port_data_init(ib_port);
+                       if (rc)
+                                vnic_err(ib_port->name,
+                                         "vnic_port_data_init failed, rc %d\n", rc);
+               } else {
+                       rc = vnic_port_fip_init(ib_port);
+                       if (rc)
+                               vnic_err(ib_port->name,
+                                        "vnic_port_fip_init failed, rc %d\n", rc);
+                       else {
+                               rc = port_fs_init(ib_port);
+                               if (rc)
+                                       vnic_warn(ib_port->name, "port_fs_init sysfs "
+                                                 "entry creation failed, %d\n", rc);
+                       }
+               }
+               if (rc) {
+                       if (ib_unregister_event_handler(&ib_port->event_handler))
+                               vnic_err(ib_port->name,
+                                        "ib_unregister_event_handler failed!\n");
+                       vnic_port_cleanup(ib_port);
+                       kfree(ib_port);
+                       continue;
+
+               }
+               vnic_dbg_mark();
+               mutex_lock(&ib_port->start_stop_lock);
+               list_add_tail(&ib_port->list, &ib_dev->port_list);
+               mutex_unlock(&ib_port->start_stop_lock);
+       }
+
+       /* set device ctx */
+       ib_set_client_data(device, &vnic_init_client, ib_dev);
+       mutex_unlock(&ib_dev->mlock);
+       return;
+
+abort:
+       mutex_unlock(&ib_dev->mlock);
+       kfree(ib_dev);
+}
+
+static void vnic_ib_dev_remove_one(struct ib_device *device)
+{
+       struct vnic_port *port, *port_t;
+       struct vnic_ib_dev *ib_dev =
+               ib_get_client_data(device, &vnic_init_client);
+
+       vnic_dbg(NULL, "ib_dev %s\n", device->name);
+
+       if (!ib_dev)
+               return;
+
+       vnic_dbg_mark();
+       mutex_lock(&ib_dev->mlock);
+       list_for_each_entry_safe(port, port_t, &ib_dev->port_list, list) {
+               vnic_dbg(port->name, "port %d\n", port->num);
+               if (ib_unregister_event_handler(&port->event_handler))
+                       vnic_err(port->name, "ib_unregister_event_handler failed!\n");
+               /* make sure we don't have any more pending events */
+#ifndef _BP_WORK_SYNC
+               cancel_delayed_work_sync(&port->event_task_light);
+               cancel_delayed_work_sync(&port->event_task);
+               cancel_delayed_work_sync(&port->discover_restart_task);
+#else
+               cancel_delayed_work(&port->event_task_light);
+               cancel_delayed_work(&port->event_task);
+               cancel_delayed_work(&port->discover_restart_task);
+               flush_workqueue(port_wq);
+               flush_workqueue(fip_wq);
+#endif
+               /* remove sysfs entries related to FIP;
+                * we want to do this outside the lock
+                */
+               port_fs_exit(port);
+
+               /* cleanup any pending vnics */
+               vnic_dbg_mark();
+               mutex_lock(&port->start_stop_lock);
+               list_del(&port->list);
+               if (no_bxm)
+                       vnic_port_data_cleanup(port);
+               else {
+                       vnic_port_fip_cleanup(port, 0);
+               }
+               mutex_unlock(&port->start_stop_lock);
+               vnic_port_cleanup(port);
+               kfree(port);
+       }
+       mutex_unlock(&ib_dev->mlock);
+
+       kfree(ib_dev);
+}
+
+int vnic_ports_init(void)
+{
+       int rc;
+
+       /* create global wq */
+       port_wq = create_singlethread_workqueue("port_wq");
+       if (!port_wq) {
+               vnic_err(NULL, "create_singlethread_workqueue failed for %s\n",
+                        "port_wq");
+               return -EINVAL;
+       }
+
+       login_wq = create_singlethread_workqueue("login_wq");
+       if (!login_wq) {
+               vnic_err(NULL, "create_singlethread_workqueue failed for %s\n",
+                        "login_wq");
+               goto free_wq0;
+       }
+
+       fip_wq = create_singlethread_workqueue("fip");
+       if (!fip_wq) {
+               vnic_err(NULL, "create_singlethread_workqueue failed for %s\n",
+                        "fip");
+               goto free_wq1;
+       }
+
+       /* calls vnic_ib_dev_add_one() */
+       rc = ib_register_client(&vnic_init_client);
+       if (rc) {
+               vnic_err(NULL, "ib_register_client failed %d\n", rc);
+               goto free_wq2;
+       }
+
+       return 0;
+
+free_wq2:
+       destroy_workqueue(fip_wq);
+free_wq1:
+       destroy_workqueue(login_wq);
+free_wq0:
+       destroy_workqueue(port_wq);
+
+       return -EINVAL;
+}
+
+void vnic_ports_cleanup(void)
+{
+       vnic_dbg(NULL, "calling ib_unregister_client\n");
+       /* calls vnic_ib_dev_remove_one() */
+       ib_unregister_client(&vnic_init_client);
+       vnic_dbg(NULL, "calling destroy_workqueue\n");
+       destroy_workqueue(fip_wq);
+       destroy_workqueue(login_wq);
+       destroy_workqueue(port_wq);
+       vnic_dbg(NULL, "vnic_data_cleanup done\n");
+}
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_qp.c b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_qp.c
new file mode 100644 (file)
index 0000000..c8fb317
--- /dev/null
@@ -0,0 +1,1636 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/log2.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_addr.h>
+
+#include <linux/mlx4/qp.h>
+#include <linux/io.h>
+
+#include "vnic.h"
+
+/* compare with drivers/infiniband/hw/mlx4/qp.c */
+#define mlx4_ib_dbg(format, arg...) vnic_dbg(NULL, format, ## arg)
+
+enum {
+       MLX4_IB_ACK_REQ_FREQ    = 8,
+};
+
+enum {
+       MLX4_IB_DEFAULT_SCHED_QUEUE     = 0x83,
+       MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f,
+       MLX4_IB_LINK_TYPE_IB            = 0,
+       MLX4_IB_LINK_TYPE_ETH           = 1,
+};
+
+enum {
+       /*
+        * Largest possible UD header: send with GRH and immediate data.
+        * 4 bytes added to accommodate for eth header instead of lrh
+        */
+       MLX4_IB_UD_HEADER_SIZE          = 76,
+       MLX4_IB_MAX_RAW_ETY_HDR_SIZE    = 12
+};
+
+enum {
+       MLX4_IBOE_ETHERTYPE = 0x8915
+};
+
+struct mlx4_ib_sqp {
+       struct mlx4_ib_qp       qp;
+       int                     pkey_index;
+       u32                     qkey;
+       u32                     send_psn;
+       struct ib_ud_header     ud_header;
+       u8                      header_buf[MLX4_IB_UD_HEADER_SIZE];
+};
+
+enum {
+       MLX4_IB_MIN_SQ_STRIDE = 6
+};
+
+static const __be32 mlx4_ib_opcode[] = {
+       [IB_WR_SEND]                    = cpu_to_be32(MLX4_OPCODE_SEND),
+       [IB_WR_LSO]                     = cpu_to_be32(MLX4_OPCODE_LSO),
+       [IB_WR_SEND_WITH_IMM]           = cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+       [IB_WR_RDMA_WRITE]              = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+       [IB_WR_RDMA_WRITE_WITH_IMM]     = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+       [IB_WR_RDMA_READ]               = cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+       [IB_WR_ATOMIC_CMP_AND_SWP]      = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+       [IB_WR_ATOMIC_FETCH_AND_ADD]    = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+       [IB_WR_SEND_WITH_INV]           = cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
+       [IB_WR_LOCAL_INV]               = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
+       [IB_WR_FAST_REG_MR]             = cpu_to_be32(MLX4_OPCODE_FMR),
+       [IB_WR_MASKED_ATOMIC_CMP_AND_SWP]       = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
+       [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]     = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
+};
+
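+/*
+ * wc_wmb(): flush CPU write-combining buffers; typically used after copying
+ * a WQE to the BlueFlame page so the copy is not reordered past later writes.
+ */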
+#ifndef wc_wmb
+       #if defined(__i386__)
+               #define wc_wmb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
+       #elif defined(__x86_64__)
+               #define wc_wmb() asm volatile("sfence" ::: "memory")
+       #elif defined(__ia64__)
+               #define wc_wmb() asm volatile("fwb" ::: "memory")
+       #else
+               #define wc_wmb() wmb()
+       #endif
+#endif
+
+#if 0
+static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
+{
+       return container_of(mqp, struct mlx4_ib_sqp, qp);
+}
+#endif
+
+static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
+{
+       return mlx4_buf_offset(&qp->buf, offset);
+}
+
+static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
+{
+       return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
+}
+
+static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
+{
+       return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
+}
+
+/*
+ * Stamp a SQ WQE so that it is invalid if prefetched by marking the
+ * first four bytes of every 64 byte chunk with
+ *     0x7FFFFFF | (invalid_ownership_value << 31).
+ *
+ * When the max work request size is less than or equal to the WQE
+ * basic block size, as an optimization, we can stamp all WQEs with
+ * 0xffffffff, and skip the very first chunk of each WQE.
+ */
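+/*
+ * For example, with 64-byte basic blocks a 192-byte WR spans three chunks;
+ * stamping overwrites the first dword of each chunk with a value whose top
+ * bit is the inverse of the current ownership bit, so a stale prefetched
+ * copy of the WQE is never treated as valid.
+ */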
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+       __be32 *wqe;
+       int i;
+       int s;
+       int ind;
+       void *buf;
+       __be32 stamp;
+       struct mlx4_wqe_ctrl_seg *ctrl;
+
+       if (qp->sq_max_wqes_per_wr > 1) {
+               s = roundup(size, 1U << qp->sq.wqe_shift);
+               for (i = 0; i < s; i += 64) {
+                       ind = (i >> qp->sq.wqe_shift) + n;
+                       stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
+                                                      cpu_to_be32(0xffffffff);
+                       buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+                       wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
+                       *wqe = stamp;
+               }
+       } else {
+               ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+               s = (ctrl->fence_size & 0x3f) << 4;
+               for (i = 64; i < s; i += 64) {
+                       wqe = buf + i;
+                       *wqe = cpu_to_be32(0xffffffff);
+               }
+       }
+}
+
+static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+       struct mlx4_wqe_ctrl_seg *ctrl;
+       struct mlx4_wqe_inline_seg *inl;
+       void *wqe;
+       int s;
+
+       ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+       s = sizeof(struct mlx4_wqe_ctrl_seg);
+
+       if (qp->ibqp.qp_type == IB_QPT_UD) {
+               struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
+               struct mlx4_av *av = (struct mlx4_av *)dgram->av;
+               memset(dgram, 0, sizeof *dgram);
+               av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
+               s += sizeof(struct mlx4_wqe_datagram_seg);
+       }
+
+       /* Pad the remainder of the WQE with an inline data segment. */
+       if (size > s) {
+               inl = wqe + s;
+               inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+       }
+       ctrl->srcrb_flags = 0;
+       ctrl->fence_size = size / 16;
+       /*
+        * Make sure descriptor is fully written before setting ownership bit
+        * (because HW can start executing as soon as we do).
+        */
+       wmb();
+
+       ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
+               (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+       stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
+}
+
+/* Post NOP WQE to prevent wrap-around in the middle of WR */
+static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
+{
+       unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+       if (unlikely(s < qp->sq_max_wqes_per_wr)) {
+               post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
+               ind += s;
+       }
+       return ind;
+}
+
+static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
+{
+       struct ib_event event;
+       struct mlx4_ib_qp *mqp = to_mibqp(qp);
+       struct ib_qp *ibqp = &mqp->ibqp;
+
+       if (type == MLX4_EVENT_TYPE_PATH_MIG)
+               to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+
+       if (ibqp->event_handler) {
+               event.device     = ibqp->device;
+               switch (type) {
+               case MLX4_EVENT_TYPE_PATH_MIG:
+                       event.event = IB_EVENT_PATH_MIG;
+                       break;
+               case MLX4_EVENT_TYPE_COMM_EST:
+                       event.event = IB_EVENT_COMM_EST;
+                       break;
+               case MLX4_EVENT_TYPE_SQ_DRAINED:
+                       event.event = IB_EVENT_SQ_DRAINED;
+                       break;
+               case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+                       event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+                       break;
+               case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+                       event.event = IB_EVENT_QP_FATAL;
+                       break;
+               case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+                       event.event = IB_EVENT_PATH_MIG_ERR;
+                       break;
+               case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+                       event.event = IB_EVENT_QP_REQ_ERR;
+                       break;
+               case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+                       event.event = IB_EVENT_QP_ACCESS_ERR;
+                       break;
+               default:
+                       printk(KERN_WARNING "mlx4_ib: Unexpected event type %d "
+                              "on QP %06x\n", type, qp->qpn);
+                       return;
+               }
+
+               event.element.qp = ibqp;
+               ibqp->event_handler(&event, ibqp->qp_context);
+       }
+}
+
+static int send_wqe_overhead(enum ib_qp_type type, u32 flags)
+{
+       /*
+        * UD WQEs must have a datagram segment.
+        * RC and UC WQEs might have a remote address segment.
+        * MLX WQEs need two extra inline data segments (for the UD
+        * header and space for the ICRC).
+        */
+       switch (type) {
+       case IB_QPT_UD:
+               return sizeof (struct mlx4_wqe_ctrl_seg) +
+                       sizeof (struct mlx4_wqe_datagram_seg) +
+                       ((flags & MLX4_IB_QP_LSO) ? 128 : 0);
+       case IB_QPT_UC:
+               return sizeof (struct mlx4_wqe_ctrl_seg) +
+                       sizeof (struct mlx4_wqe_raddr_seg);
+       case IB_QPT_XRC_TGT:
+       case IB_QPT_RC:
+               return sizeof (struct mlx4_wqe_ctrl_seg) +
+                       sizeof (struct mlx4_wqe_atomic_seg) +
+                       sizeof (struct mlx4_wqe_raddr_seg);
+       case IB_QPT_SMI:
+       case IB_QPT_GSI:
+               return sizeof (struct mlx4_wqe_ctrl_seg) +
+                       ALIGN(MLX4_IB_UD_HEADER_SIZE +
+                             DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
+                                          MLX4_INLINE_ALIGN) *
+                             sizeof (struct mlx4_wqe_inline_seg),
+                             sizeof (struct mlx4_wqe_data_seg)) +
+                       ALIGN(4 +
+                             sizeof (struct mlx4_wqe_inline_seg),
+                             sizeof (struct mlx4_wqe_data_seg));
+       case IB_QPT_RAW_ETHERTYPE:
+               return sizeof(struct mlx4_wqe_ctrl_seg) +
+                       ALIGN(MLX4_IB_MAX_RAW_ETY_HDR_SIZE +
+                             sizeof(struct mlx4_wqe_inline_seg),
+                             sizeof(struct mlx4_wqe_data_seg));
+
+       default:
+               return sizeof (struct mlx4_wqe_ctrl_seg);
+       }
+}
+
+static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+                      int is_user, int has_rq, struct mlx4_ib_qp *qp)
+{
+       /* Sanity check RQ size before proceeding */
+       if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE ||
+               cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg))
+               return -EINVAL;
+
+       if (!has_rq) {
+               if (cap->max_recv_wr)
+                       return -EINVAL;
+
+               qp->rq.wqe_cnt = qp->rq.max_gs = 0;
+       } else {
+               /* HW requires >= 1 RQ entry with >= 1 gather entry */
+               if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge))
+                       return -EINVAL;
+
+               qp->rq.wqe_cnt   = roundup_pow_of_two(max(1U, cap->max_recv_wr));
+               qp->rq.max_gs    = roundup_pow_of_two(max(1U, cap->max_recv_sge));
+               qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg));
+       }
+
+       /* leave userspace return values as they were, so as not to break ABI */
+       if (is_user) {
+               cap->max_recv_wr  = qp->rq.max_post = qp->rq.wqe_cnt;
+               cap->max_recv_sge = qp->rq.max_gs;
+       } else {
+               cap->max_recv_wr  = qp->rq.max_post =
+                       min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt);
+               cap->max_recv_sge = min(qp->rq.max_gs,
+                                       min(dev->dev->caps.max_sq_sg,
+                                           dev->dev->caps.max_rq_sg));
+       }
+
+       return 0;
+}
+
+static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+                             enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+       int s;
+
+       /* Sanity check SQ size before proceeding */
+       if (cap->max_send_wr  > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) ||
+           cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) ||
+           cap->max_inline_data + send_wqe_overhead(type, qp->flags) +
+           sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
+               return -EINVAL;
+
+       /*
+        * For MLX transport we need 2 extra S/G entries:
+        * one for the header and one for the checksum at the end
+        */
+       if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI ||
+            type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) &&
+           cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
+               return -EINVAL;
+
+       s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
+               cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
+               send_wqe_overhead(type, qp->flags);
+
+       if (s > dev->dev->caps.max_sq_desc_sz)
+               return -EINVAL;
+
+       /*
+        * Hermon supports shrinking WQEs, such that a single work
+        * request can include multiple units of 1 << wqe_shift.  This
+        * way, work requests can differ in size, and do not have to
+        * be a power of 2 in size, saving memory and speeding up send
+        * WR posting.  Unfortunately, if we do this then the
+        * wqe_index field in CQEs can't be used to look up the WR ID
+        * anymore, so we do this only if selective signaling is off.
+        *
+        * Further, on 32-bit platforms, we can't use vmap() to make
+        * the QP buffer virtually contiguous.  Thus we have to use
+        * constant-sized WRs to make sure a WR is always fully within
+        * a single page-sized chunk.
+        *
+        * Finally, we use NOP work requests to pad the end of the
+        * work queue, to avoid wrap-around in the middle of WR.  We
+        * set NEC bit to avoid getting completions with error for
+        * these NOP WRs, but since NEC is only supported starting
+        * with firmware 2.2.232, we use constant-sized WRs for older
+        * firmware.
+        *
+        * And, since MLX QPs only support SEND, we use constant-sized
+        * WRs in this case.
+        *
+        * We look for the smallest value of wqe_shift such that the
+        * resulting number of wqes does not exceed device
+        * capabilities.
+        *
+        * We set WQE size to at least 64 bytes, this way stamping
+        * invalidates each WQE.
+        */
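+       /*
+        * For example, with s = 200 bytes and wqe_shift = 6 (64-byte basic
+        * blocks), sq_max_wqes_per_wr = DIV_ROUND_UP(200, 64) = 4, so each WR
+        * consumes up to four basic blocks; if the resulting wqe_cnt exceeds
+        * the device limit, wqe_shift is bumped and the loop below retries.
+        */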
+       if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+           qp->sq_signal_bits && BITS_PER_LONG == 64 &&
+           type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI &&
+           !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI |
+                     MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER)))
+               qp->sq.wqe_shift = ilog2(64);
+       else
+               qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+       for (;;) {
+               qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
+
+               /*
+                * We need to leave 2 KB + 1 WR of headroom in the SQ to
+                * allow HW to prefetch.
+                */
+               qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
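+               /* e.g. with wqe_shift = 6 this reserves 2048 >> 6 = 32 blocks plus one WR */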
+               qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
+                                                   qp->sq_max_wqes_per_wr +
+                                                   qp->sq_spare_wqes);
+
+               if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
+                       break;
+
+               if (qp->sq_max_wqes_per_wr <= 1)
+                       return -EINVAL;
+
+               ++qp->sq.wqe_shift;
+       }
+
+       qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz,
+                            (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) -
+                        send_wqe_overhead(type, qp->flags)) /
+               sizeof (struct mlx4_wqe_data_seg);
+
+       qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+               (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+       if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+               qp->rq.offset = 0;
+               qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+       } else {
+               qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+               qp->sq.offset = 0;
+       }
+
+       cap->max_send_wr  = qp->sq.max_post =
+               (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
+       cap->max_send_sge = min(qp->sq.max_gs,
+                               min(dev->dev->caps.max_sq_sg,
+                                   dev->dev->caps.max_rq_sg));
+       qp->max_inline_data = cap->max_inline_data;
+
+       return 0;
+}
+
+
+
+static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
+{
+       switch (state) {
+       case IB_QPS_RESET:      return MLX4_QP_STATE_RST;
+       case IB_QPS_INIT:       return MLX4_QP_STATE_INIT;
+       case IB_QPS_RTR:        return MLX4_QP_STATE_RTR;
+       case IB_QPS_RTS:        return MLX4_QP_STATE_RTS;
+       case IB_QPS_SQD:        return MLX4_QP_STATE_SQD;
+       case IB_QPS_SQE:        return MLX4_QP_STATE_SQER;
+       case IB_QPS_ERR:        return MLX4_QP_STATE_ERR;
+       default:                return -1;
+       }
+}
+
+static void del_gid_entries(struct mlx4_ib_qp *qp)
+{
+       struct mlx4_ib_gid_entry *ge, *tmp;
+
+       list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+               list_del(&ge->list);
+               kfree(ge);
+       }
+}
+
+static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+                             struct ib_qp_init_attr *init_attr)
+{
+       if (qp->state != IB_QPS_RESET)
+               if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
+                                  MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
+                       printk(KERN_WARNING "mlx4_ib: modify QP %06x to RESET failed.\n",
+                              qp->mqp.qpn);
+
+       mlx4_qp_remove(dev->dev, &qp->mqp);
+       mlx4_qp_free(dev->dev, &qp->mqp);
+       mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+       mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+       kfree(qp->sq.wrid);
+       kfree(qp->rq.wrid);
+       mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+       if (qp->max_inline_data)
+               mlx4_bf_free(dev->dev, &qp->bf);
+       if (!init_attr->srq)
+               mlx4_db_free(dev->dev, &qp->db);
+
+       del_gid_entries(qp);
+}
+
+static int qp_has_rq(struct ib_qp_init_attr *attr)
+{
+       if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
+               return 0;
+
+       return !attr->srq;
+}
+
+
+static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
+                           struct ib_qp_init_attr *init_attr,
+                           struct ib_udata *udata, int sqpn, struct mlx4_ib_qp *qp)
+{
+       int qpn;
+       int err;
+       enum mlx4_ib_qp_type qp_type =
+                       (enum mlx4_ib_qp_type) init_attr->qp_type;
+       qp->mlx4_ib_qp_type = qp_type;
+       qp->pri.vid = qp->alt.vid = 0xFFFF;
+       mutex_init(&qp->mutex);
+       spin_lock_init(&qp->sq.lock);
+       spin_lock_init(&qp->rq.lock);
+       INIT_LIST_HEAD(&qp->gid_list);
+       INIT_LIST_HEAD(&qp->steering_rules);
+
+       qp->state        = IB_QPS_RESET;
+       if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+               qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+       err = set_rq_size(dev, &init_attr->cap, !!pd->uobject,
+                                         qp_has_rq(init_attr), qp);
+       if (err)
+               goto err;
+
+       if (pd->uobject) {
+       } else {
+               qp->sq_no_prefetch = 0;
+
+               if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+                       qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
+               if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
+                       qp->flags |= MLX4_IB_QP_LSO;
+
+               if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP &&
+                   dev->dev->caps.steering_mode ==
+                   MLX4_STEERING_MODE_DEVICE_MANAGED &&
+                   !mlx4_is_mfunc(dev->dev))
+                       qp->flags |= MLX4_IB_QP_NETIF;
+
+               err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
+               if (err)
+                       goto err;
+
+               if (qp_has_rq(init_attr)) {
+                       err = mlx4_db_alloc(dev->dev, &qp->db, 0, GFP_KERNEL);
+                       if (err)
+                               goto err;
+
+                       *qp->db.db = 0;
+               }
+
+               if (qp->max_inline_data) {
+                       err = mlx4_bf_alloc(dev->dev, &qp->bf, 0);
+                       if (err) {
+                               mlx4_ib_dbg("failed to allocate blue flame register (%d)", err);
+                               qp->bf.uar = &dev->priv_uar;
+                       }
+               } else
+                       qp->bf.uar = &dev->priv_uar;
+
+               if (mlx4_buf_alloc(dev->dev, qp->buf_size,
+                                          PAGE_SIZE * 2, &qp->buf, GFP_KERNEL)) {
+                       err = -ENOMEM;
+                       goto err_db;
+               }
+
+               err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
+                                   &qp->mtt);
+               if (err) {
+                       mlx4_ib_dbg("kernel qp mlx4_mtt_init error (%d)", err);
+                       goto err_buf;
+               }
+
+               err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, GFP_KERNEL);
+               if (err) {
+                       mlx4_ib_dbg("mlx4_buf_write_mtt error (%d)", err);
+                       goto err_mtt;
+               }
+
+               /* these are large allocations that may fail, so use __GFP_NOWARN */
+               qp->sq.wrid  = kmalloc(qp->sq.wqe_cnt * sizeof (u64),
+                                      GFP_KERNEL | __GFP_NOWARN);
+               qp->rq.wrid  = kmalloc(qp->rq.wqe_cnt * sizeof (u64),
+                                      GFP_KERNEL | __GFP_NOWARN);
+
+               if (!qp->sq.wrid || !qp->rq.wrid) {
+                       printk(KERN_WARNING "%s:%d: not enough memory\n",
+                              __func__, __LINE__);
+                       err = -ENOMEM;
+                       goto err_wrid;
+               }
+       }
+
+       qpn = sqpn;
+
+       err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, GFP_KERNEL);
+       if (err)
+               goto err_qpn;
+
+       if (init_attr->qp_type == IB_QPT_XRC_TGT)
+               qp->mqp.qpn |= (1 << 23);
+
+       /*
+        * Hardware wants QPN written in big-endian order (after
+        * shifting) for send doorbell.  Precompute this value to save
+        * a little bit when posting sends.
+        */
+       qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+       qp->mqp.event = mlx4_ib_qp_event;
+
+       return 0;
+
+err_qpn:
+err_wrid:
+       if (pd->uobject) {
+       } else {
+               kfree(qp->sq.wrid);
+               kfree(qp->rq.wrid);
+       }
+
+err_mtt:
+       mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+err_buf:
+       if (pd->uobject)
+               ib_umem_release(qp->umem);
+       else
+               mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+
+err_db:
+       if (!pd->uobject && !init_attr->srq
+               && init_attr->qp_type != IB_QPT_XRC_TGT)
+               mlx4_db_free(dev->dev, &qp->db);
+
+       if (qp->max_inline_data)
+               mlx4_bf_free(dev->dev, &qp->bf);
+
+err:
+       return err;
+}
+
+#if 0
+static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+                           void *wqe, unsigned *mlx_seg_len)
+{
+       struct ib_device *ib_dev = &to_mdev(sqp->qp.ibqp.device)->ib_dev;
+       struct mlx4_wqe_mlx_seg *mlx = wqe;
+       struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+       struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+       u16 pkey;
+       int send_size;
+       int header_size;
+       int spc;
+       int i;
+       union ib_gid sgid;
+       int is_eth;
+       int is_grh;
+       int is_vlan = 0;
+       int err;
+       u16 vlan;
+
+       send_size = 0;
+       for (i = 0; i < wr->num_sge; ++i)
+               send_size += wr->sg_list[i].length;
+
+       is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
+       is_grh = mlx4_ib_ah_grh_present(ah);
+       err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
+                               ah->av.ib.gid_index, &sgid);
+       if (err)
+               return err;
+
+       if (is_eth) {
+               is_vlan = rdma_get_vlan_id(&sgid) < 0x1000;
+               vlan = rdma_get_vlan_id(&sgid);
+       }
+
+       ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
+       if (!is_eth) {
+               sqp->ud_header.lrh.service_level =
+                       be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+               sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid;
+               sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+       }
+
+       if (is_grh) {
+               sqp->ud_header.grh.traffic_class =
+                       (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+               sqp->ud_header.grh.flow_label    =
+                       ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+               sqp->ud_header.grh.hop_limit     = ah->av.ib.hop_limit;
+               ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
+                                 ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid);
+               memcpy(sqp->ud_header.grh.destination_gid.raw,
+                      ah->av.ib.dgid, 16);
+       }
+
+       mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+       if (!is_eth) {
+               mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
+                                         (sqp->ud_header.lrh.destination_lid ==
+                                          IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
+                                         (sqp->ud_header.lrh.service_level << 8));
+               mlx->rlid = sqp->ud_header.lrh.destination_lid;
+       }
+
+       switch (wr->opcode) {
+       case IB_WR_SEND:
+               sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY;
+               sqp->ud_header.immediate_present = 0;
+               break;
+       case IB_WR_SEND_WITH_IMM:
+               sqp->ud_header.bth.opcode        = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+               sqp->ud_header.immediate_present = 1;
+               sqp->ud_header.immediate_data    = wr->ex.imm_data;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (is_eth) {
+               u8 *smac;
+
+               memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
+               smac = to_mdev(sqp->qp.ibqp.device)->iboe.netdevs[sqp->qp.port - 1]->dev_addr; /* fixme: cache this value */
+               memcpy(sqp->ud_header.eth.smac_h, smac, 6);
+               if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
+                       mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
+               if (!is_vlan)
+                       sqp->ud_header.eth.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE);
+               else {
+                       u16 pcp;
+
+                       sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IBOE_ETHERTYPE);
+                       pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 27 & 3) << 13;
+                       sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
+               }
+       } else {
+               sqp->ud_header.lrh.virtual_lane    = !sqp->qp.ibqp.qp_num ? 15 : 0;
+               if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+                       sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+       }
+       sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+       if (!sqp->qp.ibqp.qp_num)
+               ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
+       else
+               ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
+       sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+       sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+       sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+       sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+                                              sqp->qkey : wr->wr.ud.remote_qkey);
+       sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+       header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+       if (0) {
+               printk(KERN_ERR "built UD header of size %d:\n", header_size);
+               for (i = 0; i < header_size / 4; ++i) {
+                       if (i % 8 == 0)
+                               printk("  [%02x] ", i * 4);
+                       printk(" %08x",
+                              be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
+                       if ((i + 1) % 8 == 0)
+                               printk("\n");
+               }
+               printk("\n");
+       }
+
+       /*
+        * Inline data segments may not cross a 64 byte boundary.  If
+        * our UD header is bigger than the space available up to the
+        * next 64 byte boundary in the WQE, use two inline data
+        * segments to hold the UD header.
+        */
+       spc = MLX4_INLINE_ALIGN -
+             ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+       if (header_size <= spc) {
+               inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+               memcpy(inl + 1, sqp->header_buf, header_size);
+               i = 1;
+       } else {
+               inl->byte_count = cpu_to_be32(1 << 31 | spc);
+               memcpy(inl + 1, sqp->header_buf, spc);
+
+               inl = (void *) (inl + 1) + spc;
+               memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+               /*
+                * Need a barrier here to make sure all the data is
+                * visible before the byte_count field is set.
+                * Otherwise the HCA prefetcher could grab the 64-byte
+                * chunk with this inline segment and get a valid (!=
+                * 0xffffffff) byte count but stale data, and end up
+                * generating a packet with bad headers.
+                *
+                * The first inline segment's byte_count field doesn't
+                * need a barrier, because it comes after a
+                * control/MLX segment and therefore is at an offset
+                * of 16 mod 64.
+                */
+               wmb();
+               inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+               i = 2;
+       }
+
+       *mlx_seg_len =
+       ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+       return 0;
+}
+#endif
+
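+/*
+ * Check whether posting nreq more WQEs would overflow the work queue.  The
+ * first check runs lockless and may see a stale tail; if the queue looks
+ * full, re-read the tail under the CQ lock, since completion processing is
+ * what advances it.
+ */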
+static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+       unsigned cur;
+       struct mlx4_ib_cq *cq;
+
+       cur = wq->head - wq->tail;
+       if (likely(cur + nreq < wq->max_post))
+               return 0;
+
+       cq = to_mcq(ib_cq);
+       spin_lock(&cq->lock);
+       cur = wq->head - wq->tail;
+       spin_unlock(&cq->lock);
+
+       return cur + nreq >= wq->max_post;
+}
+
+#if 0
+static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
+{
+       iseg->flags     = 0;
+       iseg->mem_key   = cpu_to_be32(rkey);
+       iseg->guest_id  = 0;
+       iseg->pa        = 0;
+}
+#endif
+
+static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
+                                         u64 remote_addr, u32 rkey)
+{
+       rseg->raddr    = cpu_to_be64(remote_addr);
+       rseg->rkey     = cpu_to_be32(rkey);
+       rseg->reserved = 0;
+}
+
+#if 0
+static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr)
+{
+       if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+               aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+               aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add);
+       } else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) {
+               aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+               aseg->compare  = cpu_to_be64(wr->wr.atomic.compare_add_mask);
+       } else {
+               aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+               aseg->compare  = 0;
+       }
+
+}
+#endif
+
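+/*
+ * Copy the UD address vector from the AH into the datagram segment and
+ * report the VLAN tag back to the caller.
+ */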
+static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
+                            struct ib_send_wr *wr, __be16 *vlan)
+{
+       memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
+       dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+       dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+       dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan;
+       memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6);
+       *vlan = dseg->vlan;
+}
+
+#if 0
+static void set_mlx_icrc_seg(void *dseg)
+{
+       u32 *t = dseg;
+       struct mlx4_wqe_inline_seg *iseg = dseg;
+
+       t[1] = 0;
+
+       /*
+        * Need a barrier here before writing the byte_count field to
+        * make sure that all the data is visible before the
+        * byte_count field is set.  Otherwise, if the segment begins
+        * a new cacheline, the HCA prefetcher could grab the 64-byte
+        * chunk and get a valid (!= 0xffffffff) byte count but
+        * stale data, and end up sending the wrong data.
+        */
+       wmb();
+
+       iseg->byte_count = cpu_to_be32((1 << 31) | 4);
+}
+#endif
+
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+       dseg->lkey       = cpu_to_be32(sg->lkey);
+       dseg->addr       = cpu_to_be64(sg->addr);
+
+       /*
+        * Need a barrier here before writing the byte_count field to
+        * make sure that all the data is visible before the
+        * byte_count field is set.  Otherwise, if the segment begins
+        * a new cacheline, the HCA prefetcher could grab the 64-byte
+        * chunk and get a valid (!= 0xffffffff) byte count but
+        * stale data, and end up sending the wrong data.
+        */
+       wmb();
+
+       dseg->byte_count = cpu_to_be32(sg->length);
+}
+
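+/*
+ * Unordered variant used for receive WQEs: no barrier is needed before
+ * byte_count, because receive descriptors only become visible to the HCA
+ * when the doorbell record is updated.
+ */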
+static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+       dseg->byte_count = cpu_to_be32(sg->length);
+       dseg->lkey       = cpu_to_be32(sg->lkey);
+       dseg->addr       = cpu_to_be64(sg->addr);
+}
+
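+/*
+ * Build the LSO segment: copy the packet headers inline and encode the MSS
+ * and header length.  *blh is set when the segment exceeds 64 bytes, which
+ * requires the BIG-LSO opcode in the owner/opcode field.
+ */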
+static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
+                        struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
+                        __be32 *lso_hdr_sz, int *blh)
+{
+       unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
+
+       *blh = unlikely(halign > 64) ? 1 : 0;
+
+       if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
+                    wr->num_sge > qp->sq.max_gs - (halign >> 4)))
+               return -EINVAL;
+
+       memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
+
+       *lso_hdr_sz  = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
+                                  wr->wr.ud.hlen);
+       *lso_seg_len = halign;
+       return 0;
+}
+
+static __be32 send_ieth(struct ib_send_wr *wr)
+{
+       switch (wr->opcode) {
+       case IB_WR_SEND_WITH_IMM:
+       case IB_WR_RDMA_WRITE_WITH_IMM:
+               return wr->ex.imm_data;
+
+       case IB_WR_SEND_WITH_INV:
+               return cpu_to_be32(wr->ex.invalidate_rkey);
+
+       default:
+               return 0;
+       }
+}
+
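+/*
+ * Copy the scatter list inline into the WQE, splitting the data into
+ * multiple inline segments so none crosses a 64-byte boundary.  Returns -1
+ * if the total length exceeds the QP's max_inline_data; otherwise *sz is
+ * set to the space consumed, in 16-byte units.
+ */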
+static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr,
+                          void *wqe, int *sz)
+{
+       struct mlx4_wqe_inline_seg *seg;
+       void *addr;
+       int len, seg_len;
+       int num_seg;
+       int off, to_copy;
+       int i;
+       int inl = 0;
+
+       seg = wqe;              /* current inline segment header */
+       wqe += sizeof *seg;     /* data is laid out right after the header */
+       off = ((unsigned long)wqe) & (unsigned long)(MLX4_INLINE_ALIGN - 1);
+       num_seg = 0;
+       seg_len = 0;
+
+       for (i = 0; i < wr->num_sge; ++i) {
+               addr = (void *) (unsigned long)(wr->sg_list[i].addr);
+               len  = wr->sg_list[i].length;
+               inl += len;
+
+               if (inl > qp->max_inline_data) {
+                       inl = 0;
+                       return -1;
+               }
+
+               while (len >= MLX4_INLINE_ALIGN - off) {
+                       to_copy = MLX4_INLINE_ALIGN - off;
+                       memcpy(wqe, addr, to_copy);
+                       len -= to_copy;
+                       wqe += to_copy;
+                       addr += to_copy;
+                       seg_len += to_copy;
+                       wmb(); /* see comment below */
+                       seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+                       seg_len = 0;
+                       seg = wqe;
+                       wqe += sizeof *seg;
+                       off = sizeof *seg;
+                       ++num_seg;
+               }
+
+               memcpy(wqe, addr, len);
+               wqe += len;
+               seg_len += len;
+               off += len;
+       }
+
+       if (seg_len) {
+               ++num_seg;
+               /*
+                * Need a barrier here to make sure
+                * all the data is visible before the
+                * byte_count field is set.  Otherwise
+                * the HCA prefetcher could grab the
+                * 64-byte chunk with this inline
+                * segment and get a valid (!=
+                * 0xffffffff) byte count but stale
+                * data, and end up sending the wrong
+                * data.
+                */
+               wmb();
+               seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
+       }
+
+       *sz = (inl + num_seg * sizeof *seg + 15) / 16;
+
+       return 0;
+}
+
+/*
+ * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
+ * implementations may use move-string-buffer assembler instructions,
+ * which do not guarantee order of copying.
+ */
+static void mlx4_bf_copy(unsigned long *dst, unsigned long *src, unsigned bytecnt)
+{
+       __iowrite64_copy(dst, src, bytecnt / 8);
+}
+
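+/*
+ * Post a chain of receive work requests: write the scatter entries for each
+ * WR, terminate short scatter lists with an invalid-lkey sentinel, and ring
+ * the RQ doorbell record once after the last descriptor is written.
+ */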
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+                     struct ib_recv_wr **bad_wr)
+{
+       struct mlx4_ib_qp *qp = to_mqp(ibqp);
+       struct mlx4_wqe_data_seg *scat;
+       unsigned long flags;
+       int err = 0;
+       int nreq;
+       int ind;
+       int i;
+
+       spin_lock_irqsave(&qp->rq.lock, flags);
+
+       ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+       for (nreq = 0; wr; ++nreq, wr = wr->next) {
+               if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+                       mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num);
+                       err = -ENOMEM;
+                       *bad_wr = wr;
+                       goto out;
+               }
+
+               if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+                       mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)",
+                                   ibqp->qp_num, wr->num_sge);
+                       err = -EINVAL;
+                       *bad_wr = wr;
+                       goto out;
+               }
+
+               scat = get_recv_wqe(qp, ind);
+
+               for (i = 0; i < wr->num_sge; ++i)
+                       __set_data_seg(scat + i, wr->sg_list + i);
+
+               if (i < qp->rq.max_gs) {
+                       scat[i].byte_count = 0;
+                       scat[i].lkey       = cpu_to_be32(MLX4_INVALID_LKEY);
+                       scat[i].addr       = 0;
+               }
+
+               qp->rq.wrid[ind] = wr->wr_id;
+
+               ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+       }
+
+out:
+       if (likely(nreq)) {
+               qp->rq.head += nreq;
+
+               /*
+                * Make sure that descriptors are written before
+                * doorbell record.
+                */
+               wmb();
+
+               *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+       }
+
+       spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+       return err;
+}
+
+static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
+{
+       switch (mlx4_state) {
+       case MLX4_QP_STATE_RST:      return IB_QPS_RESET;
+       case MLX4_QP_STATE_INIT:     return IB_QPS_INIT;
+       case MLX4_QP_STATE_RTR:      return IB_QPS_RTR;
+       case MLX4_QP_STATE_RTS:      return IB_QPS_RTS;
+       case MLX4_QP_STATE_SQ_DRAINING:
+       case MLX4_QP_STATE_SQD:      return IB_QPS_SQD;
+       case MLX4_QP_STATE_SQER:     return IB_QPS_SQE;
+       case MLX4_QP_STATE_ERR:      return IB_QPS_ERR;
+       default:                     return -1;
+       }
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
+{
+       switch (mlx4_mig_state) {
+       case MLX4_QP_PM_ARMED:          return IB_MIG_ARMED;
+       case MLX4_QP_PM_REARM:          return IB_MIG_REARM;
+       case MLX4_QP_PM_MIGRATED:       return IB_MIG_MIGRATED;
+       default: return -1;
+       }
+}
+
+static int to_ib_qp_access_flags(int mlx4_flags)
+{
+       int ib_flags = 0;
+
+       if (mlx4_flags & MLX4_QP_BIT_RRE)
+               ib_flags |= IB_ACCESS_REMOTE_READ;
+       if (mlx4_flags & MLX4_QP_BIT_RWE)
+               ib_flags |= IB_ACCESS_REMOTE_WRITE;
+       if (mlx4_flags & MLX4_QP_BIT_RAE)
+               ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+       return ib_flags;
+}
+
+static void to_ib_ah_attr(struct mlx4_ib_dev *ib_dev, struct ib_ah_attr *ib_ah_attr,
+                         struct mlx4_qp_path *path)
+{
+       struct mlx4_dev *dev = ib_dev->dev;
+       int is_eth;
+
+       memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
+       ib_ah_attr->port_num      = path->sched_queue & 0x40 ? 2 : 1;
+
+       if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
+               return;
+
+       is_eth = rdma_port_get_link_layer(&ib_dev->ib_dev, ib_ah_attr->port_num) ==
+               IB_LINK_LAYER_ETHERNET;
+       if (is_eth)
+               ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) |
+               ((path->sched_queue & 4) << 1);
+       else
+               ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf;
+
+       ib_ah_attr->dlid          = be16_to_cpu(path->rlid);
+
+       ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f;
+       ib_ah_attr->static_rate   = path->static_rate ? path->static_rate - 5 : 0;
+       ib_ah_attr->ah_flags      = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
+       if (ib_ah_attr->ah_flags) {
+               ib_ah_attr->grh.sgid_index = path->mgid_index;
+               ib_ah_attr->grh.hop_limit  = path->hop_limit;
+               ib_ah_attr->grh.traffic_class =
+                       (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
+               ib_ah_attr->grh.flow_label =
+                       be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
+               memcpy(ib_ah_attr->grh.dgid.raw,
+                       path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
+       }
+}
+
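+/*
+ * Query the QP: read the hardware QP context and translate it to IB
+ * attributes.  A QP in the RESET state is reported from software state
+ * without touching the hardware.
+ */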
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+                    struct ib_qp_init_attr *qp_init_attr)
+{
+       struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+       struct mlx4_ib_qp *qp = to_mqp(ibqp);
+       struct mlx4_qp_context context;
+       int mlx4_state;
+       int err = 0;
+
+       mutex_lock(&qp->mutex);
+
+       if (qp->state == IB_QPS_RESET) {
+               qp_attr->qp_state = IB_QPS_RESET;
+               goto done;
+       }
+
+       err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
+       if (err) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       mlx4_state = be32_to_cpu(context.flags) >> 28;
+
+       qp->state                    = to_ib_qp_state(mlx4_state);
+       qp_attr->qp_state            = qp->state;
+       qp_attr->path_mtu            = context.mtu_msgmax >> 5;
+       qp_attr->path_mig_state      =
+               to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
+       qp_attr->qkey                = be32_to_cpu(context.qkey);
+       qp_attr->rq_psn              = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
+       qp_attr->sq_psn              = be32_to_cpu(context.next_send_psn) & 0xffffff;
+       qp_attr->dest_qp_num         = be32_to_cpu(context.remote_qpn) & 0xffffff;
+       qp_attr->qp_access_flags     =
+               to_ib_qp_access_flags(be32_to_cpu(context.params2));
+
+       if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC ||
+           qp->ibqp.qp_type == IB_QPT_XRC_TGT) {
+               to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
+               to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path);
+               qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
+               qp_attr->alt_port_num   = qp_attr->alt_ah_attr.port_num;
+       }
+
+       qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
+       if (qp_attr->qp_state == IB_QPS_INIT)
+               qp_attr->port_num = qp->port;
+       else
+               qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
+
+       /* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+       qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
+
+       qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
+
+       qp_attr->max_dest_rd_atomic =
+               1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
+       qp_attr->min_rnr_timer      =
+               (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
+       qp_attr->timeout            = context.pri_path.ackto >> 3;
+       qp_attr->retry_cnt          = (be32_to_cpu(context.params1) >> 16) & 0x7;
+       qp_attr->rnr_retry          = (be32_to_cpu(context.params1) >> 13) & 0x7;
+       qp_attr->alt_timeout        = context.alt_path.ackto >> 3;
+
+done:
+       qp_attr->cur_qp_state        = qp_attr->qp_state;
+       qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
+       qp_attr->cap.max_recv_sge    = qp->rq.max_gs;
+
+       if (!ibqp->uobject) {
+               qp_attr->cap.max_send_wr  = qp->sq.wqe_cnt;
+               qp_attr->cap.max_send_sge = qp->sq.max_gs;
+       } else {
+               qp_attr->cap.max_send_wr  = 0;
+               qp_attr->cap.max_send_sge = 0;
+       }
+
+       /*
+        * We don't support inline sends for kernel QPs (yet), and we
+        * don't know what userspace's value should be.
+        */
+       qp_attr->cap.max_inline_data = 0;
+
+       qp_init_attr->cap            = qp_attr->cap;
+
+       qp_init_attr->create_flags = 0;
+       if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+               qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+       if (qp->flags & MLX4_IB_QP_LSO)
+               qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+out:
+       mutex_unlock(&qp->mutex);
+       return err;
+}
+
+
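+/* XRC receive-only QP verbs are not supported in this port. */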
+int mlx4_ib_create_xrc_rcv_qp(struct ib_qp_init_attr *init_attr,
+                             u32 *qp_num)
+{
+       return -ENOSYS;
+}
+
+int mlx4_ib_modify_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
+                             struct ib_qp_attr *attr, int attr_mask)
+{
+       return -ENOSYS;
+}
+
+int mlx4_ib_query_xrc_rcv_qp(struct ib_xrcd *ibxrcd, u32 qp_num,
+                            struct ib_qp_attr *qp_attr, int qp_attr_mask,
+                            struct ib_qp_init_attr *qp_init_attr)
+{
+       return -ENOSYS;
+}
+
+int mlx4_ib_reg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
+{
+       return -ENOSYS;
+}
+
+int mlx4_ib_unreg_xrc_rcv_qp(struct ib_xrcd *xrcd, void *context, u32 qp_num)
+{
+       return -ENOSYS;
+}
+
+/**** VNIC IB VERBS ****/
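+/*
+ * Post a single send WR (no wr->next chaining).  The ip/ip6/tcp/udp offsets
+ * select the checksum-offload (SWP) bits in the descriptor, and small
+ * inline sends are written through the BlueFlame register instead of
+ * ringing the send doorbell.
+ */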
+int vnic_ib_post_send(struct ib_qp *ibqp,
+                     struct ib_send_wr *wr,
+                     struct ib_send_wr **bad_wr,
+                     u8 ip_off, u8 ip6_off,
+                     u8 tcp_off, u8 udp_off)
+{
+       struct mlx4_ib_qp *qp = to_mqp(ibqp);
+       void *wqe;
+       struct mlx4_wqe_ctrl_seg *ctrl;
+       struct mlx4_wqe_data_seg *dseg;
+       __be32 owner_opcode = 0;
+       int nreq;
+       int err = 0;
+       unsigned ind;
+       int uninitialized_var(stamp);
+       int uninitialized_var(size);
+       unsigned uninitialized_var(seglen);
+       __be32 dummy;
+       __be32 *lso_wqe;
+       __be32 uninitialized_var(lso_hdr_sz);
+       int i;
+       int blh = 0;
+       __be16 vlan = 0;
+       int inl = 0;
+
+       ind = qp->sq_next_wqe;
+
+       nreq = 0;
+       lso_wqe = &dummy;
+
+       if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+               mlx4_ib_dbg("QP 0x%x: WQE overflow", ibqp->qp_num);
+               err = -ENOMEM;
+               *bad_wr = wr;
+               goto out;
+       }
+
+       if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+               mlx4_ib_dbg("QP 0x%x: too many sg entries (%d)",
+                           ibqp->qp_num, wr->num_sge);
+               err = -EINVAL;
+               *bad_wr = wr;
+               goto out;
+       }
+
+       ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+       *((u32 *) (&ctrl->vlan_tag)) = 0;
+       qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+       ctrl->srcrb_flags =
+               (wr->send_flags & IB_SEND_SIGNALED ?
+                cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+               (wr->send_flags & IB_SEND_SOLICITED ?
+                cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
+               qp->sq_signal_bits;
+
+       ctrl->imm = send_ieth(wr);
+
+       wqe += sizeof *ctrl;
+       size = sizeof *ctrl / 16;
+
+       set_datagram_seg(wqe, wr, &vlan);
+       wqe  += sizeof (struct mlx4_wqe_datagram_seg);
+       size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+       if (wr->opcode == IB_WR_LSO) {
+               err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh);
+               if (unlikely(err)) {
+                       *bad_wr = wr;
+                       goto out;
+               }
+               lso_wqe = (__be32 *) wqe;
+               wqe  += seglen;
+               size += seglen / 16;
+       }
+       dseg = wqe;
+       dseg += wr->num_sge - 1;
+
+       if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
+               int sz;
+
+               err = lay_inline_data(qp, wr, wqe, &sz);
+               if (!err) {
+                       inl = 1;
+                       size += sz;
+               }
+       } else {
+               size += wr->num_sge * (sizeof(struct mlx4_wqe_data_seg) / 16);
+               for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+                       set_data_seg(dseg, wr->sg_list + i);
+       }
+
+       wmb();
+       *lso_wqe = lso_hdr_sz;
+
+       ctrl->fence_size = size;
+
+       /* set SWP bits based on ip/ip6/tcp/udp offsets */
+       if (wr->send_flags & IB_SEND_IP_CSUM) {
+                /* SWP bit */
+               owner_opcode |= cpu_to_be32(1 << 24);
+
+               /* IP offset starts from the beginning of the IB packet
+                * (not the ETH packet), counted in 2-byte units.
+                * In control segment, we use c & d:
+                * (a) tcp=0, ip=0 => calc TCP/UDP csum over IPv4
+                * (b) tcp=0, ip=1 => calc IP csum only over IPv4
+                * (c) tcp=1, ip=0 => calc TCP/UDP csum over IPv6
+                * (d) tcp=1, ip=1 => calc TCP/UDP and IP csum over IPv4
+                */
+               if (ip_off) {
+                       ip_off += (IB_LRH_BYTES + IB_BTH_BYTES +
+                                  IB_DETH_BYTES) >> 1;
+                       ip_off += (to_mah(wr->wr.ud.ah)->av.ib.g_slid
+                                  & 0x80) ? (IB_GRH_BYTES >> 1) : 0;
+                       owner_opcode |= cpu_to_be32((ip_off) << 8);
+                       ctrl->srcrb_flags |=
+                               cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM);
+               } else if (ip6_off) {
+                       ip6_off += (IB_LRH_BYTES + IB_BTH_BYTES +
+                                  IB_DETH_BYTES) >> 1;
+                       ip6_off += (to_mah(wr->wr.ud.ah)->av.ib.g_slid
+                                  & 0x80) ? (IB_GRH_BYTES >> 1) : 0;
+                       owner_opcode |= cpu_to_be32((ip6_off) << 8);
+               }
+
+               if (udp_off) { /* UDP offset and bit */
+                       owner_opcode |= cpu_to_be32(udp_off << 16);
+                       owner_opcode |= cpu_to_be32(1 << 25);
+                       ctrl->srcrb_flags |=
+                               cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
+               } else if (tcp_off) { /* TCP offset */
+                       owner_opcode |= cpu_to_be32(tcp_off << 16);
+                       ctrl->srcrb_flags |=
+                               cpu_to_be32(MLX4_WQE_CTRL_TCP_UDP_CSUM);
+               }
+       }
+
+       /* set opcode, use 0x4e for BIG_LSO */
+       if (!blh)
+               owner_opcode |= mlx4_ib_opcode[wr->opcode];
+       else
+               owner_opcode |= cpu_to_be32(0x4e);
+
+       /* set ownership bit */
+       owner_opcode |= (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+       /* Make sure descriptor is fully written */
+       wmb();
+       ctrl->owner_opcode = owner_opcode;
+
+       stamp = ind + qp->sq_spare_wqes;
+       ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
+
+       /* only one WR is posted here; bump nreq as the post_send loop would */
+       nreq++;
+
+out:
+       if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) {
+               ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8);
+               *(u32 *) (&ctrl->vlan_tag) |= qp->doorbell_qpn;
+               /*
+                * Make sure that descriptor is written to memory
+                * before writing to BlueFlame page.
+                */
+               wmb();
+
+               ++qp->sq.head;
+
+               mlx4_bf_copy(qp->bf.reg + qp->bf.offset, (unsigned long *) ctrl,
+                            ALIGN(size * 16, 64));
+               wc_wmb();
+
+               qp->bf.offset ^= qp->bf.buf_size;
+
+       } else if (nreq) {
+               qp->sq.head += nreq;
+
+               /*
+                * Make sure that descriptors are written before
+                * doorbell record.
+                */
+               wmb();
+
+               writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL);
+
+               /*
+                * Make sure doorbells don't leak out of SQ spinlock
+                * and reach the HCA out of order.
+                */
+               mmiowb();
+
+       }
+
+       /* stamp only if a WQE was actually posted (stamp/size are valid) */
+       if (likely(nreq))
+               stamp_send_wqe(qp, stamp, size * 16);
+
+       ind = pad_wraparound(qp, ind);
+       qp->sq_next_wqe = ind;
+       return err;
+}
+
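+/*
+ * Create nqps kernel QPs over a contiguous, aligned block of QP numbers
+ * reserved with mlx4_qp_reserve_range().  On failure, QPs already created
+ * are destroyed and the whole range is released.
+ */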
+int __vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+                             struct ib_udata *udata, int nqps,
+                             int align, struct ib_qp *list[])
+{
+       struct mlx4_ib_dev *dev = to_mdev(pd->device);
+       struct mlx4_ib_qp *qp;
+       int err;
+       int base_qpn, qpn;
+       int i;
+
+       for (i = 0; i < nqps; ++i) {
+               if (init_attr[i].create_flags & ~(IB_QP_CREATE_IPOIB_UD_LSO |
+                                                 IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
+                       return -EINVAL;
+               if (init_attr[i].create_flags & (IB_QP_CREATE_IPOIB_UD_LSO |
+                                                IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) &&
+                   (pd->uobject || init_attr[i].qp_type != IB_QPT_UD))
+                       return -EINVAL;
+
+               /* Userspace is not allowed to create special QPs: */
+               if (pd->uobject && (init_attr[i].qp_type == IB_QPT_SMI ||
+                                   init_attr[i].qp_type == IB_QPT_GSI))
+                       return -EINVAL;
+               if (nqps > 1 && (init_attr[i].qp_type == IB_QPT_SMI ||
+                                   init_attr[i].qp_type == IB_QPT_GSI))
+                       return -EINVAL;
+       }
+       err = mlx4_qp_reserve_range(dev->dev, nqps, align, &base_qpn, 0);
+       if (err)
+               return err;
+
+       for (i = 0, qpn = base_qpn; i < nqps; ++i, ++qpn) {
+               qp = kzalloc(sizeof *qp, GFP_KERNEL);
+               if (!qp) {
+                       err = -ENOMEM;
+                       goto exit_fail;
+               }
+
+               err = create_qp_common(dev, pd, init_attr + i, udata, qpn, qp);
+               if (err) {
+                       kfree(qp);
+                       goto exit_fail;
+               }
+               qp->xrcdn = 0;
+               qp->ibqp.qp_num = qp->mqp.qpn;
+               list[i] = &qp->ibqp;
+       }
+       return 0;
+
+exit_fail:
+       for (--i; i >= 0; --i) {
+               destroy_qp_common(dev, to_mqp(list[i]), init_attr + i);
+               kfree(to_mqp(list[i]));
+       }
+       mlx4_qp_release_range(dev->dev, base_qpn, nqps);
+       return err;
+}
+
+/* compare with ib_create_qp() in infiniband/core/verbs.c */
+int vnic_ib_create_qp_range(struct ib_pd *pd, struct ib_qp_init_attr *init_attr,
+                           struct ib_udata *udata, int nqps,
+                           int align, struct ib_qp *list[])
+{
+       struct ib_qp *qp;
+       struct ib_qp_init_attr *qp_init_attr;
+       int rc, i;
+
+       rc = __vnic_ib_create_qp_range(pd, init_attr, udata, nqps, align, list);
+
+       if (rc)
+               return rc;
+
+       for (i = 0; i < nqps; ++i) {
+               qp = list[i];
+               qp_init_attr      = &init_attr[i];
+               qp->device        = pd->device;
+               qp->real_qp       = qp;
+               qp->pd            = pd;
+               qp->send_cq       = qp_init_attr->send_cq;
+               qp->recv_cq       = qp_init_attr->recv_cq;
+               qp->srq           = qp_init_attr->srq;
+               qp->uobject       = NULL;
+               qp->event_handler = qp_init_attr->event_handler;
+               qp->qp_context    = qp_init_attr->qp_context;
+               qp->qp_type       = qp_init_attr->qp_type;
+               qp->xrcd          = qp->qp_type == IB_QPT_XRC_TGT ?
+                       qp_init_attr->xrcd : NULL;
+               atomic_inc(&pd->usecnt);
+               atomic_inc(&qp_init_attr->send_cq->usecnt);
+               atomic_inc(&qp_init_attr->recv_cq->usecnt);
+               if (qp_init_attr->srq)
+                       atomic_inc(&qp_init_attr->srq->usecnt);
+               if (qp->qp_type == IB_QPT_XRC_TGT)
+                       atomic_inc(&qp->xrcd->usecnt);
+       }
+       return 0;
+}
+
diff --git a/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_utils.h b/drivers/net/ethernet/mellanox/mlx4_vnic/vnic_utils.h
new file mode 100644
index 0000000..56ee8cf
--- /dev/null
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2009 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ *     Redistribution and use in source and binary forms, with or
+ *     without modification, are permitted provided that the following
+ *     conditions are met:
+ *
+ *      - Redistributions of source code must retain the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer.
+ *
+ *      - Redistributions in binary form must reproduce the above
+ *        copyright notice, this list of conditions and the following
+ *        disclaimer in the documentation and/or other materials
+ *        provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _VNIC_UTILS_H
+#define _VNIC_UTILS_H
+
+/*#define CONFIG_MLX4_VNIC_DEBUG  */     /* comment out in RELEASE and PERFORMANCE modes */
+/* #define VNIC_PROFILLNG */           /* comment out in RELEASE and PERFORMANCE modes */
+#define VNIC_EXTRA_STATS               /* comment out in PERFORMANCE mode */
+
+enum {
+       VNIC_DEBUG_GENERAL      = 1 << 0,  /* 0x1    */
+       VNIC_DEBUG_MCAST        = 1 << 1,  /* 0x2    */
+       VNIC_DEBUG_MCAST_V      = 1 << 2,  /* 0x4    */
+       VNIC_DEBUG_DATA         = 1 << 3,  /* 0x8    */
+       VNIC_DEBUG_DATA_V       = 1 << 4,  /* 0x10   */
+       VNIC_DEBUG_FIP          = 1 << 5,  /* 0x20   */
+       VNIC_DEBUG_FIP_V        = 1 << 6,  /* 0x40   */
+       VNIC_DEBUG_SKB          = 1 << 7,  /* 0x80   */
+       VNIC_DEBUG_SKB_V        = 1 << 8,  /* 0x100  */
+       VNIC_DEBUG_VHUB         = 1 << 9,  /* 0x200  */
+       VNIC_DEBUG_VHUB_V       = 1 << 10, /* 0x400  */
+       VNIC_DEBUG_ETHTOOL      = 1 << 11, /* 0x800  */
+       VNIC_DEBUG_ETHTOOL_V    = 1 << 12, /* 0x1000 */
+       VNIC_DEBUG_FUNC         = 1 << 13, /* 0x2000 */
+       VNIC_DEBUG_MARK         = 1 << 14, /* 0x4000 */
+       VNIC_DEBUG_MODER        = 1 << 15, /* 0x8000 */
+       VNIC_DEBUG_MODER_V      = 1 << 16, /* 0x10000 */
+       VNIC_DEBUG_PKT_DUMP     = 1 << 17, /* 0x20000 */
+       VNIC_DEBUG_FIP_P0       = 1 << 18, /* 0x40000 */
+       VNIC_DEBUG_SYSFS        = 1 << 19, /* 0x80000 */
+       VNIC_DEBUG_MAC          = 1 << 20, /* 0x100000 */
+       VNIC_DEBUG_TSTAMP       = 1 << 21, /* 0x200000 */
+       VNIC_DEBUG_PARSER       = 1 << 22, /* 0x400000 */
+       VNIC_DEBUG_LAG          = 1 << 23, /* 0x800000 */
+       VNIC_DEBUG_LAG_V        = 1 << 24, /* 0x1000000 */
+       VNIC_DEBUG_MCAST_VV     = 1 << 25, /* 0x2000000 */
+       VNIC_DEBUG_DEBUG        = 1 << 31, /* 0x80000000 */
+};
+
+/* always defined */
+#define vnic_printk(level, prefix, format, arg...)                     \
+       do {   printk(level "T%.4ld [%s] %s:%s:%d: " format,            \
+               jiffies * 1000 / HZ,                                    \
+              DRV_NAME, prefix ? prefix : "", __func__, __LINE__ ,     \
+              ## arg);                                                 \
+} while(0)
+
+#define vnic_info(format, arg...)                                      \
+do {   printk(KERN_INFO "[%s] " format, DRV_NAME, ## arg); }           \
+while (0)
+
+#define vnic_warn(prefix, format, arg...)                              \
+do { vnic_printk(KERN_WARNING, prefix, format, ## arg); }              \
+while (0)
+
+#define vnic_err(prefix, format, arg...)                               \
+do { vnic_printk(KERN_ERR, prefix, format, ## arg); }                  \
+while (0)
+
+#define _sprintf(p, buf, format, arg...)                               \
+       (PAGE_SIZE - (int)(p - buf)) <= 0 ? 0 :                         \
+       scnprintf(p, PAGE_SIZE - (int)(p - buf), format, ## arg)
+
+/* debug functions */
+#ifndef CONFIG_MLX4_VNIC_DEBUG
+#define ASSERT(x)                               do { (void)(x);      } while (0)
+#define vnic_dbg_mark(void)                     do {                 } while (0)
+#define vnic_dbg_func(prefix)                   do {                 } while (0)
+#define vnic_dbg(prefix, format, arg...)         do { (void)(prefix); } while (0)
+#define vnic_dbg_mcast(prefix, format, arg...)   do { (void)(prefix); } while (0)
+#define vnic_dbg_mcast_v(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_mcast_vv(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_debug(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_ethtool(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_dbg_ethtool_v(prefix, format, arg...) \
+                                                do { (void)(prefix); } while (0)
+#define vnic_dbg_data(prefix, format, arg...)    do { (void)(prefix); } while (0)
+#define vnic_dbg_data_v(prefix, format, arg...)  do { (void)(prefix); } while (0)
+#define vnic_dbg_fip(prefix, format, arg...)     do { (void)(prefix); } while (0)
+#define vnic_dbg_parse(prefix, format, arg...)     do { (void)(prefix); } while (0)
+#define vnic_dbg_lag(prefix, format, arg...)     do { (void)(prefix); } while (0)
+#define vnic_dbg_lag_v(prefix, format, arg...)     do { (void)(prefix); } while (0)
+#define vnic_dbg_fip_p0(prefix, format, arg...)  do { (void)(prefix); } while (0)
+#define vnic_dbg_sysfs(prefix, format, arg...)   do { (void)(prefix); } while (0)
+#define vnic_dbg_mac(prefix, format, arg...)     do { (void)(prefix); } while (0)
+#define vnic_dbg_fip_v(prefix, format, arg...)   do { (void)(prefix); } while (0)
+#define vnic_dbg_vhub(prefix, format, arg...)    do { (void)(prefix); } while (0)
+#define vnic_dbg_vhub_v(prefix, format, arg...)  do { (void)(prefix); } while (0)
+#define vnic_dbg_moder(prefix, format, arg...)   do { (void)(prefix); } while (0)
+#define vnic_dbg_moder_v(prefix, format, arg...) do { (void)(prefix); } while (0)
+#define vnic_printk_skb(prefix, skb, o1, o2)     do { (void)(prefix); } while (0)
+#define vnic_dbg_skb(prefix, skb, o1, o2)        do { (void)(prefix); } while (0)
+#else
+#define ASSERT(x)                                                      \
+do {   if (x) break;                                                   \
+       printk(KERN_EMERG "### ASSERTION FAILED %s: %s: %d: %s\n",      \
+              __FILE__, __func__, __LINE__, #x); dump_stack(); BUG();  \
+} while (0)
+
+#define vnic_dbg(prefix, format, arg...)                               \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_GENERAL)) break;                 \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_mcast(prefix, format, arg...)                         \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_MCAST)) break;                   \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_mcast_v(prefix, format, arg...)                       \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_MCAST_V)) break;                 \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_mcast_vv(prefix, format, arg...)                      \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_MCAST_VV)) break;                \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_debug(prefix, format, arg...)                 \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_DEBUG)) break;                   \
+       vnic_printk(KERN_WARNING, prefix, format, ## arg);              \
+} while (0)
+
+
+#define vnic_dbg_data(prefix, format, arg...)                          \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_DATA)) break;                    \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_data_v(prefix, format, arg...)                                \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_DATA_V)) break;                  \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_fip_p0(prefix, format, arg...)                                \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_FIP_P0)) break;                  \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_sysfs(prefix, format, arg...)                         \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_SYSFS)) break;                   \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_mac(prefix, format, arg...)                           \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_MAC)) break;                     \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_fip(prefix, format, arg...)                           \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_FIP)) break;                     \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_parse(prefix, format, arg...)                         \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_PARSER)) break;                  \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_lag(prefix, format, arg...)                           \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_LAG)) break;                     \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_lag_v(prefix, format, arg...)                         \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_LAG_V)) break;                   \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_fip_v(prefix, format, arg...)                         \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_FIP_V)) break;                   \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_vhub(prefix, format, arg...)                          \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_VHUB)) break;                    \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_vhub_v(prefix, format, arg...)                                \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_VHUB_V)) break;                  \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_moder(prefix, format, arg...)                         \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_MODER)) break;                   \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_moder_v(prefix, format, arg...)                       \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_MODER_V)) break;                 \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_ethtool(prefix, format, arg...)                       \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_ETHTOOL)) break;                 \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_ethtool_v(prefix, format, arg...)                     \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_ETHTOOL_V)) break;               \
+       vnic_printk(KERN_DEBUG, prefix, format, ## arg);                \
+} while (0)
+
+#define vnic_dbg_mark(void)                                            \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_MARK)) break;                    \
+       vnic_printk(KERN_DEBUG, NULL, "###\n");                         \
+} while (0)
+
+#define vnic_dbg_func(prefix)                                          \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_FUNC)) break;                    \
+       vnic_printk(KERN_DEBUG, prefix, "function called\n");           \
+} while (0)
+
+#define ethp2str(p, str)                                               \
+do {                                                                   \
+       switch (ntohs(p)) {                                             \
+       case ETH_P_RARP: sprintf(str, "%s", "ETH_P_RARP"); break;       \
+       case ETH_P_ARP:  sprintf(str, "%s", "ETH_P_ARP");  break;       \
+       case ETH_P_IP:   sprintf(str, "%s", "ETH_P_IP");   break;       \
+       case ETH_P_IPV6: sprintf(str, "%s", "ETH_P_IPV6"); break;       \
+       case ETH_P_8021Q: sprintf(str, "%s", "ETH_P_8021Q"); break;     \
+       default:         sprintf(str, "0x%x", p);          break;       \
+       }                                                               \
+} while (0)
+
+#define skb_printk(prefix, format, arg...)                             \
+       printk(KERN_DEBUG "[%s] " format, prefix, ## arg)
+
+#define vnic_dbg_skb(_prefix, skb, eoib_off, eth_off)                  \
+do {   if (!(vnic_msglvl & VNIC_DEBUG_SKB)) break;                     \
+       vnic_printk_skb(_prefix, skb, eoib_off, eth_off);               \
+} while (0)
+
+#define VNIC_SYSLOG_LLEN 64
+#define vnic_printk_skb(_prefix, skb, eoib_off, eth_off)               \
+do {                                                                   \
+       char pr[VNIC_SYSLOG_LLEN];                                      \
+       char h_proto_str[VNIC_SYSLOG_LLEN];                             \
+       struct eoibhdr *eoib_hdr = (struct eoibhdr *)                   \
+                       (skb->data + eoib_off);                         \
+       struct ethhdr *ethh = (struct ethhdr *)                         \
+                       (skb->data + eth_off);                          \
+       struct net_device *dev = skb->dev;                              \
+       ASSERT(dev);                                                    \
+       snprintf(pr, VNIC_SYSLOG_LLEN, "%s:skb-%s", dev->name, _prefix);\
+       skb_printk(pr, "\n");                                           \
+       skb_printk(pr, "--- skb dump ---\n");                           \
+       skb_printk(pr, "len          : %d\n", skb->len);                \
+       skb_printk(pr, "data_len     : %d\n", skb->data_len);           \
+       skb_printk(pr, "frags        : %d\n",                           \
+               skb_shinfo(skb)->nr_frags);                             \
+       skb_printk(pr, "gso          : %d\n", skb_is_gso(skb));         \
+       skb_printk(pr, "head_len     : %d\n", (int)skb_headlen(skb));   \
+       skb_printk(pr, "data         : %p\n", skb->data);               \
+       skb_printk(pr, "head         : %p\n", skb->head);               \
+       skb_printk(pr, "tail         : %lu\n",                          \
+                  (unsigned long)(skb->tail));                         \
+       skb_printk(pr, "end          : %lu\n",                          \
+                  (unsigned long)(skb->end));                          \
+       skb_printk(pr, "eoib_off     : %lu\n", eoib_off);               \
+       skb_printk(pr, "eth_off      : %lu\n", eth_off);                \
+       if (eth_off < 0 || !skb_headlen(skb))                           \
+               break;                                                  \
+       ethp2str(ethh->h_proto, h_proto_str);                           \
+       skb_printk(pr, "eth_proto    : %s\n", h_proto_str);             \
+       skb_printk(pr, "eth_dest     : "MAC_6_PRINT_FMT"\n",            \
+                  MAC_6_PRINT_ARG(ethh->h_dest));                      \
+       skb_printk(pr, "eth_source   : "MAC_6_PRINT_FMT"\n",            \
+                  MAC_6_PRINT_ARG(ethh->h_source));                    \
+       if (eoib_off < 0)                                               \
+               break;                                                  \
+       skb_printk(pr, "eoib_seg_id  : 0x%04x\n", eoib_hdr->seg_id);    \
+       skb_printk(pr, "eoib_seg_off : 0x%02x\n", eoib_hdr->seg_off);   \
+       skb_printk(pr, "eoib_ip_chk  : 0x%02x\n",                       \
+                  VNIC_EOIB_HDR_GET_IP_CHK(eoib_hdr));                 \
+       skb_printk(pr, "eoib_tcp_chk : 0x%02x\n",                       \
+                  VNIC_EOIB_HDR_GET_TCP_UDP_CHK(eoib_hdr));            \
+       skb_printk(pr, "eoib_ver     : 0x%02x\n",                       \
+                  VNIC_EOIB_HDR_GET_VER(eoib_hdr));                    \
+       skb_printk(pr, "eoib_sig     : 0x%02x\n",                       \
+                  VNIC_EOIB_HDR_GET_SIG(eoib_hdr));                    \
+} while (0)
+
+#endif /* CONFIG_MLX4_VNIC_DEBUG */
+#endif /* _VNIC_UTILS_H */