ice-y := ice_main.o    \
         ice_controlq.o \
         ice_common.o   \
-        ice_nvm.o
+        ice_nvm.o      \
+        ice_switch.o   \
+        ice_sched.o
 
 #include <linux/bitmap.h>
 #include "ice_devids.h"
 #include "ice_type.h"
+#include "ice_switch.h"
 #include "ice_common.h"
+#include "ice_sched.h"
 
 #define ICE_BAR0               0
 #define ICE_AQ_LEN             64
 
  * descriptor format.  It is shared between Firmware and Software.
  */
 
+#define ICE_AQC_TOPO_MAX_LEVEL_NUM     0x9
+
 struct ice_aqc_generic {
        __le32 param0;
        __le32 param1;
        u8 reserved[2];
 };
 
+/* Get function capabilities (indirect 0x000A)
+ * Get device capabilities (indirect 0x000B)
+ */
+struct ice_aqc_list_caps {
+       u8 cmd_flags;
+       u8 pf_index;
+       u8 reserved[2];
+       __le32 count;
+       __le32 addr_high;
+       __le32 addr_low;
+};
+
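+/* Note: as with the other indirect commands in this file, addr_high and
+ * addr_low are filled with the DMA address of the attached buffer by the
+ * control queue send path; ice_aq_discover_caps() only reads 'count' back
+ * from the response.
+ */
+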
+/* Device/Function buffer entry, repeated per reported capability */
+struct ice_aqc_list_caps_elem {
+       __le16 cap;
+#define ICE_AQC_CAPS_VSI                               0x0017
+#define ICE_AQC_CAPS_RSS                               0x0040
+#define ICE_AQC_CAPS_RXQS                              0x0041
+#define ICE_AQC_CAPS_TXQS                              0x0042
+#define ICE_AQC_CAPS_MSIX                              0x0043
+#define ICE_AQC_CAPS_MAX_MTU                           0x0047
+
+       u8 major_ver;
+       u8 minor_ver;
+       /* Number of resources described by this capability */
+       __le32 number;
+       /* Only meaningful for some types of resources */
+       __le32 logical_id;
+       /* Only meaningful for some types of resources */
+       __le32 phys_id;
+       __le64 rsvd1;
+       __le64 rsvd2;
+};
+
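+/* Illustrative sketch (not part of the interface): a capability response
+ * buffer is a packed array of these elements, so a consumer walks it as
+ *
+ *	struct ice_aqc_list_caps_elem *elem = buf;
+ *	u32 i;
+ *
+ *	for (i = 0; i < cap_count; i++, elem++) {
+ *		u16 cap = le16_to_cpu(elem->cap);
+ *		u32 number = le32_to_cpu(elem->number);
+ *
+ *		switch (cap) { ... }
+ *	}
+ *
+ * ice_parse_caps() in ice_common.c implements this walk.
+ */
+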
 /* Clear PXE Command and response (direct 0x0110) */
 struct ice_aqc_clear_pxe {
        u8 rx_cnt;
        u8 reserved[15];
 };
 
+/* Get switch configuration (0x0200) */
+struct ice_aqc_get_sw_cfg {
+       /* Reserved for command and copy of request flags for response */
+       __le16 flags;
+       /* First desc in case of command and next_elem in case of response.
+        * In a response, a non-zero value means that not all of the
+        * configuration was returned and that a new command shall be sent
+        * with this value in the 'first desc' field.
+        */
+       __le16 element;
+       /* Reserved for command, only used for response */
+       __le16 num_elems;
+       __le16 rsvd;
+       __le32 addr_high;
+       __le32 addr_low;
+};
+
+/* Each entry in the response buffer is of the following type: */
+struct ice_aqc_get_sw_cfg_resp_elem {
+       /* VSI/Port Number */
+       __le16 vsi_port_num;
+#define ICE_AQC_GET_SW_CONF_RESP_VSI_PORT_NUM_S        0
+#define ICE_AQC_GET_SW_CONF_RESP_VSI_PORT_NUM_M        \
+                       (0x3FF << ICE_AQC_GET_SW_CONF_RESP_VSI_PORT_NUM_S)
+#define ICE_AQC_GET_SW_CONF_RESP_TYPE_S        14
+#define ICE_AQC_GET_SW_CONF_RESP_TYPE_M        (0x3 << ICE_AQC_GET_SW_CONF_RESP_TYPE_S)
+#define ICE_AQC_GET_SW_CONF_RESP_PHYS_PORT     0
+#define ICE_AQC_GET_SW_CONF_RESP_VIRT_PORT     1
+#define ICE_AQC_GET_SW_CONF_RESP_VSI           2
+
+       /* SWID VSI/Port belongs to */
+       __le16 swid;
+
+       /* Bit 14..0 : PF/VF number VSI belongs to
+        * Bit 15 : VF indication bit
+        */
+       __le16 pf_vf_num;
+#define ICE_AQC_GET_SW_CONF_RESP_FUNC_NUM_S    0
+#define ICE_AQC_GET_SW_CONF_RESP_FUNC_NUM_M    \
+                               (0x7FFF << ICE_AQC_GET_SW_CONF_RESP_FUNC_NUM_S)
+#define ICE_AQC_GET_SW_CONF_RESP_IS_VF         BIT(15)
+};
+
+/* The response buffer is as follows. Note that the length of the
+ * elements array varies with the length of the command response.
+ */
+struct ice_aqc_get_sw_cfg_resp {
+       struct ice_aqc_get_sw_cfg_resp_elem elements[1];
+};
+
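+/* Example (sketch): decoding one response element with the masks above,
+ * where 'ele' points into the response buffer:
+ *
+ *	u16 raw = le16_to_cpu(ele->vsi_port_num);
+ *	u16 num = raw & ICE_AQC_GET_SW_CONF_RESP_VSI_PORT_NUM_M;
+ *	u8 type = (raw & ICE_AQC_GET_SW_CONF_RESP_TYPE_M) >>
+ *		  ICE_AQC_GET_SW_CONF_RESP_TYPE_S;
+ *
+ * ice_get_initial_sw_cfg() uses this decoding to pick out the physical
+ * port entry.
+ */
+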
+/* Add TSE (indirect 0x0401)
+ * Move TSE (indirect 0x0408)
+ * Delete TSE (indirect 0x040F)
+ */
+struct ice_aqc_add_move_delete_elem {
+       __le16 num_grps_req;
+       __le16 num_grps_updated;
+       __le32 reserved;
+       __le32 addr_high;
+       __le32 addr_low;
+};
+
+struct ice_aqc_elem_info_bw {
+       __le16 bw_profile_idx;
+       __le16 bw_alloc;
+};
+
+struct ice_aqc_txsched_elem {
+       u8 elem_type; /* Special field, reserved for some aq calls */
+#define ICE_AQC_ELEM_TYPE_UNDEFINED            0x0
+#define ICE_AQC_ELEM_TYPE_ROOT_PORT            0x1
+#define ICE_AQC_ELEM_TYPE_TC                   0x2
+#define ICE_AQC_ELEM_TYPE_SE_GENERIC           0x3
+#define ICE_AQC_ELEM_TYPE_ENTRY_POINT          0x4
+#define ICE_AQC_ELEM_TYPE_LEAF                 0x5
+#define ICE_AQC_ELEM_TYPE_SE_PADDED            0x6
+       u8 valid_sections;
+#define ICE_AQC_ELEM_VALID_GENERIC             BIT(0)
+#define ICE_AQC_ELEM_VALID_CIR                 BIT(1)
+#define ICE_AQC_ELEM_VALID_EIR                 BIT(2)
+#define ICE_AQC_ELEM_VALID_SHARED              BIT(3)
+       u8 generic;
+#define ICE_AQC_ELEM_GENERIC_MODE_M            0x1
+#define ICE_AQC_ELEM_GENERIC_PRIO_S            0x1
+#define ICE_AQC_ELEM_GENERIC_PRIO_M    (0x7 << ICE_AQC_ELEM_GENERIC_PRIO_S)
+#define ICE_AQC_ELEM_GENERIC_SP_S              0x4
+#define ICE_AQC_ELEM_GENERIC_SP_M      (0x1 << ICE_AQC_ELEM_GENERIC_SP_S)
+#define ICE_AQC_ELEM_GENERIC_ADJUST_VAL_S      0x5
+#define ICE_AQC_ELEM_GENERIC_ADJUST_VAL_M      \
+       (0x3 << ICE_AQC_ELEM_GENERIC_ADJUST_VAL_S)
+       u8 flags; /* Special field, reserved for some aq calls */
+#define ICE_AQC_ELEM_FLAG_SUSPEND_M            0x1
+       struct ice_aqc_elem_info_bw cir_bw;
+       struct ice_aqc_elem_info_bw eir_bw;
+       __le16 srl_id;
+       __le16 reserved2;
+};
+
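+/* Illustrative use (hypothetical profile_idx/bw_alloc values, not from this
+ * patch): to give a node only a committed rate, mark just the CIR section
+ * valid and fill cir_bw:
+ *
+ *	elem->valid_sections = ICE_AQC_ELEM_VALID_CIR;
+ *	elem->cir_bw.bw_profile_idx = cpu_to_le16(profile_idx);
+ *	elem->cir_bw.bw_alloc = cpu_to_le16(bw_alloc);
+ */
+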
+struct ice_aqc_txsched_elem_data {
+       __le32 parent_teid;
+       __le32 node_teid;
+       struct ice_aqc_txsched_elem data;
+};
+
+struct ice_aqc_txsched_topo_grp_info_hdr {
+       __le32 parent_teid;
+       __le16 num_elems;
+       __le16 reserved2;
+};
+
+struct ice_aqc_delete_elem {
+       struct ice_aqc_txsched_topo_grp_info_hdr hdr;
+       __le32 teid[1];
+};
+
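+/* teid[] is a variable-length trailer; with the one-element-array idiom the
+ * buffer for n TEIDs is sized as
+ *
+ *	buf_size = sizeof(struct ice_aqc_delete_elem) +
+ *		   (n - 1) * sizeof(u32);
+ *
+ * (see ice_sched_remove_elems() in ice_sched.c)
+ */
+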
+/* Query Scheduler Resource Allocation (indirect 0x0412)
+ * This indirect command retrieves the scheduler resources allocated by
+ * EMP Firmware to the given PF.
+ */
+struct ice_aqc_query_txsched_res {
+       u8 reserved[8];
+       __le32 addr_high;
+       __le32 addr_low;
+};
+
+struct ice_aqc_generic_sched_props {
+       __le16 phys_levels;
+       __le16 logical_levels;
+       u8 flattening_bitmap;
+       u8 max_device_cgds;
+       u8 max_pf_cgds;
+       u8 rsvd0;
+       __le16 rdma_qsets;
+       u8 rsvd1[22];
+};
+
+struct ice_aqc_layer_props {
+       u8 logical_layer;
+       u8 chunk_size;
+       __le16 max_device_nodes;
+       __le16 max_pf_nodes;
+       u8 rsvd0[2];
+       __le16 max_shared_rate_lmtr;
+       __le16 max_children;
+       __le16 max_cir_rl_profiles;
+       __le16 max_eir_rl_profiles;
+       __le16 max_srl_profiles;
+       u8 rsvd1[14];
+};
+
+struct ice_aqc_query_txsched_res_resp {
+       struct ice_aqc_generic_sched_props sched_props;
+       struct ice_aqc_layer_props layer_props[ICE_AQC_TOPO_MAX_LEVEL_NUM];
+};
+
 /* NVM Read command (indirect 0x0701)
  * NVM Erase commands (direct 0x0702)
  * NVM Update commands (indirect 0x0703)
                struct ice_aqc_q_shutdown q_shutdown;
                struct ice_aqc_req_res res_owner;
                struct ice_aqc_clear_pxe clear_pxe;
+               struct ice_aqc_list_caps get_cap;
+               struct ice_aqc_get_sw_cfg get_sw_conf;
+               struct ice_aqc_query_txsched_res query_sched_res;
+               struct ice_aqc_add_move_delete_elem add_move_delete_elem;
                struct ice_aqc_nvm nvm;
        } params;
 };
 #define ICE_AQ_LG_BUF  512
 
 #define ICE_AQ_FLAG_LB_S       9
+#define ICE_AQ_FLAG_RD_S       10
 #define ICE_AQ_FLAG_BUF_S      12
 #define ICE_AQ_FLAG_SI_S       13
 
 #define ICE_AQ_FLAG_LB         BIT(ICE_AQ_FLAG_LB_S)  /* 0x200  */
+#define ICE_AQ_FLAG_RD         BIT(ICE_AQ_FLAG_RD_S)  /* 0x400  */
 #define ICE_AQ_FLAG_BUF                BIT(ICE_AQ_FLAG_BUF_S) /* 0x1000 */
 #define ICE_AQ_FLAG_SI         BIT(ICE_AQ_FLAG_SI_S)  /* 0x2000 */
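+
+/* Example: an indirect command whose buffer carries data for firmware to
+ * read sets ICE_AQ_FLAG_RD explicitly, as ice_aq_delete_sched_elems() does:
+ *
+ *	desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+ *
+ * ICE_AQ_FLAG_BUF and the descriptor datalen are filled in by the control
+ * queue send path whenever a buffer is attached, with ICE_AQ_FLAG_LB added
+ * for buffers larger than ICE_AQ_LG_BUF.
+ */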
 
 /* error codes */
 enum ice_aq_err {
        ICE_AQ_RC_OK            = 0,  /* success */
+       ICE_AQ_RC_ENOMEM        = 9,  /* Out of memory */
        ICE_AQ_RC_EBUSY         = 12, /* Device or resource busy */
        ICE_AQ_RC_EEXIST        = 13, /* object already exists */
 };
        ice_aqc_opc_req_res                             = 0x0008,
        ice_aqc_opc_release_res                         = 0x0009,
 
+       /* device/function capabilities */
+       ice_aqc_opc_list_func_caps                      = 0x000A,
+       ice_aqc_opc_list_dev_caps                       = 0x000B,
+
        /* PXE */
        ice_aqc_opc_clear_pxe_mode                      = 0x0110,
 
+       /* internal switch commands */
+       ice_aqc_opc_get_sw_cfg                          = 0x0200,
+
        ice_aqc_opc_clear_pf_cfg                        = 0x02A4,
 
+       /* transmit scheduler commands */
+       ice_aqc_opc_delete_sched_elems                  = 0x040F,
+       ice_aqc_opc_query_sched_res                     = 0x0412,
+
        /* NVM commands */
        ice_aqc_opc_nvm_read                            = 0x0701,
 
 
 /* Copyright (c) 2018, Intel Corporation. */
 
 #include "ice_common.h"
+#include "ice_sched.h"
 #include "ice_adminq_cmd.h"
 
 #define ICE_PF_RESET_WAIT_COUNT        200
        if (status)
                goto err_unroll_cqinit;
 
+       status = ice_get_caps(hw);
+       if (status)
+               goto err_unroll_cqinit;
+
+       hw->port_info = devm_kzalloc(ice_hw_to_dev(hw),
+                                    sizeof(*hw->port_info), GFP_KERNEL);
+       if (!hw->port_info) {
+               status = ICE_ERR_NO_MEMORY;
+               goto err_unroll_cqinit;
+       }
+
+       /* set the back pointer to hw */
+       hw->port_info->hw = hw;
+
+       /* Initialize port_info struct with switch configuration data */
+       status = ice_get_initial_sw_cfg(hw);
+       if (status)
+               goto err_unroll_alloc;
+
+       /* Query the allocated resources for tx scheduler */
+       status = ice_sched_query_res_alloc(hw);
+       if (status) {
+               ice_debug(hw, ICE_DBG_SCHED,
+                         "Failed to get scheduler allocated resources\n");
+               goto err_unroll_alloc;
+       }
+
        return 0;
 
+err_unroll_alloc:
+       devm_kfree(ice_hw_to_dev(hw), hw->port_info);
 err_unroll_cqinit:
        ice_shutdown_all_ctrlq(hw);
        return status;
  */
 void ice_deinit_hw(struct ice_hw *hw)
 {
+       ice_sched_cleanup_all(hw);
        ice_shutdown_all_ctrlq(hw);
+       if (hw->port_info) {
+               devm_kfree(ice_hw_to_dev(hw), hw->port_info);
+               hw->port_info = NULL;
+       }
 }
 
 /**
        }
 }
 
+/**
+ * ice_parse_caps - parse function/device capabilities
+ * @hw: pointer to the hw struct
+ * @buf: pointer to a buffer containing function/device capability records
+ * @cap_count: number of capability records in the list
+ * @opc: type of capabilities list to parse
+ *
+ * Helper function to parse function(0x000a)/device(0x000b) capabilities list.
+ */
+static void
+ice_parse_caps(struct ice_hw *hw, void *buf, u32 cap_count,
+              enum ice_adminq_opc opc)
+{
+       struct ice_aqc_list_caps_elem *cap_resp;
+       struct ice_hw_func_caps *func_p = NULL;
+       struct ice_hw_dev_caps *dev_p = NULL;
+       struct ice_hw_common_caps *caps;
+       u32 i;
+
+       if (!buf)
+               return;
+
+       cap_resp = (struct ice_aqc_list_caps_elem *)buf;
+
+       if (opc == ice_aqc_opc_list_dev_caps) {
+               dev_p = &hw->dev_caps;
+               caps = &dev_p->common_cap;
+       } else if (opc == ice_aqc_opc_list_func_caps) {
+               func_p = &hw->func_caps;
+               caps = &func_p->common_cap;
+       } else {
+               ice_debug(hw, ICE_DBG_INIT, "wrong opcode\n");
+               return;
+       }
+
+       for (i = 0; caps && i < cap_count; i++, cap_resp++) {
+               u32 logical_id = le32_to_cpu(cap_resp->logical_id);
+               u32 phys_id = le32_to_cpu(cap_resp->phys_id);
+               u32 number = le32_to_cpu(cap_resp->number);
+               u16 cap = le16_to_cpu(cap_resp->cap);
+
+               switch (cap) {
+               case ICE_AQC_CAPS_VSI:
+                       if (dev_p) {
+                               dev_p->num_vsi_allocd_to_host = number;
+                               ice_debug(hw, ICE_DBG_INIT,
+                                         "HW caps: Dev.VSI cnt = %d\n",
+                                         dev_p->num_vsi_allocd_to_host);
+                       } else if (func_p) {
+                               func_p->guaranteed_num_vsi = number;
+                               ice_debug(hw, ICE_DBG_INIT,
+                                         "HW caps: Func.VSI cnt = %d\n",
+                                         func_p->guaranteed_num_vsi);
+                       }
+                       break;
+               case ICE_AQC_CAPS_RSS:
+                       caps->rss_table_size = number;
+                       caps->rss_table_entry_width = logical_id;
+                       ice_debug(hw, ICE_DBG_INIT,
+                                 "HW caps: RSS table size = %d\n",
+                                 caps->rss_table_size);
+                       ice_debug(hw, ICE_DBG_INIT,
+                                 "HW caps: RSS table width = %d\n",
+                                 caps->rss_table_entry_width);
+                       break;
+               case ICE_AQC_CAPS_RXQS:
+                       caps->num_rxq = number;
+                       caps->rxq_first_id = phys_id;
+                       ice_debug(hw, ICE_DBG_INIT,
+                                 "HW caps: Num Rx Qs = %d\n", caps->num_rxq);
+                       ice_debug(hw, ICE_DBG_INIT,
+                                 "HW caps: Rx first queue ID = %d\n",
+                                 caps->rxq_first_id);
+                       break;
+               case ICE_AQC_CAPS_TXQS:
+                       caps->num_txq = number;
+                       caps->txq_first_id = phys_id;
+                       ice_debug(hw, ICE_DBG_INIT,
+                                 "HW caps: Num Tx Qs = %d\n", caps->num_txq);
+                       ice_debug(hw, ICE_DBG_INIT,
+                                 "HW caps: Tx first queue ID = %d\n",
+                                 caps->txq_first_id);
+                       break;
+               case ICE_AQC_CAPS_MSIX:
+                       caps->num_msix_vectors = number;
+                       caps->msix_vector_first_id = phys_id;
+                       ice_debug(hw, ICE_DBG_INIT,
+                                 "HW caps: MSIX vector count = %d\n",
+                                 caps->num_msix_vectors);
+                       ice_debug(hw, ICE_DBG_INIT,
+                                 "HW caps: MSIX first vector index = %d\n",
+                                 caps->msix_vector_first_id);
+                       break;
+               case ICE_AQC_CAPS_MAX_MTU:
+                       caps->max_mtu = number;
+                       if (dev_p)
+                               ice_debug(hw, ICE_DBG_INIT,
+                                         "HW caps: Dev.MaxMTU = %d\n",
+                                         caps->max_mtu);
+                       else if (func_p)
+                               ice_debug(hw, ICE_DBG_INIT,
+                                         "HW caps: func.MaxMTU = %d\n",
+                                         caps->max_mtu);
+                       break;
+               default:
+                       ice_debug(hw, ICE_DBG_INIT,
+                                 "HW caps: Unknown capability[%d]: 0x%x\n", i,
+                                 cap);
+                       break;
+               }
+       }
+}
+
+/**
+ * ice_aq_discover_caps - query function/device capabilities
+ * @hw: pointer to the hw struct
+ * @buf: a virtual buffer to hold the capabilities
+ * @buf_size: Size of the virtual buffer
+ * @data_size: size of the returned data, or the buffer size needed if the
+ *             AQ returns ICE_AQ_RC_ENOMEM
+ * @opc: capabilities type to discover - pass in the command opcode
+ * @cd: pointer to command details structure or NULL
+ *
+ * Get the function(0x000a)/device(0x000b) capabilities description from
+ * the firmware.
+ */
+static enum ice_status
+ice_aq_discover_caps(struct ice_hw *hw, void *buf, u16 buf_size, u16 *data_size,
+                    enum ice_adminq_opc opc, struct ice_sq_cd *cd)
+{
+       struct ice_aqc_list_caps *cmd;
+       struct ice_aq_desc desc;
+       enum ice_status status;
+
+       cmd = &desc.params.get_cap;
+
+       if (opc != ice_aqc_opc_list_func_caps &&
+           opc != ice_aqc_opc_list_dev_caps)
+               return ICE_ERR_PARAM;
+
+       ice_fill_dflt_direct_cmd_desc(&desc, opc);
+
+       status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
+       if (!status)
+               ice_parse_caps(hw, buf, le32_to_cpu(cmd->count), opc);
+       *data_size = le16_to_cpu(desc.datalen);
+
+       return status;
+}
+
+/**
+ * ice_get_caps - get info about the HW
+ * @hw: pointer to the hardware structure
+ */
+enum ice_status ice_get_caps(struct ice_hw *hw)
+{
+       enum ice_status status;
+       u16 data_size = 0;
+       u16 cbuf_len;
+       u8 retries;
+
+       /* The driver doesn't know how many capabilities the device will return
+        * so the buffer size required isn't known ahead of time. The driver
+        * starts with cbuf_len and if this turns out to be insufficient, the
+        * device returns ICE_AQ_RC_ENOMEM and also the buffer size it needs.
+        * The driver then allocates the buffer of this size and retries the
+        * operation. So it follows that the retry count is 2.
+        */
+#define ICE_GET_CAP_BUF_COUNT  40
+#define ICE_GET_CAP_RETRY_COUNT        2
+
+       cbuf_len = ICE_GET_CAP_BUF_COUNT *
+               sizeof(struct ice_aqc_list_caps_elem);
+
+       retries = ICE_GET_CAP_RETRY_COUNT;
+
+       do {
+               void *cbuf;
+
+               cbuf = devm_kzalloc(ice_hw_to_dev(hw), cbuf_len, GFP_KERNEL);
+               if (!cbuf)
+                       return ICE_ERR_NO_MEMORY;
+
+               status = ice_aq_discover_caps(hw, cbuf, cbuf_len, &data_size,
+                                             ice_aqc_opc_list_func_caps, NULL);
+               devm_kfree(ice_hw_to_dev(hw), cbuf);
+
+               if (!status || hw->adminq.sq_last_status != ICE_AQ_RC_ENOMEM)
+                       break;
+
+               /* If ENOMEM is returned, try again with bigger buffer */
+               cbuf_len = data_size;
+       } while (--retries);
+
+       return status;
+}
+
 /**
  * ice_aq_clear_pxe_mode
  * @hw: pointer to the hw struct
 
 
 #include "ice.h"
 #include "ice_type.h"
+#include "ice_switch.h"
 
 void ice_debug_cq(struct ice_hw *hw, u32 mask, void *desc, void *buf,
                  u16 buf_len);
                struct ice_aq_desc *desc, void *buf, u16 buf_size,
                struct ice_sq_cd *cd);
 void ice_clear_pxe_mode(struct ice_hw *hw);
+enum ice_status ice_get_caps(struct ice_hw *hw);
 bool ice_check_sq_alive(struct ice_hw *hw, struct ice_ctl_q_info *cq);
 enum ice_status ice_aq_q_shutdown(struct ice_hw *hw, bool unloading);
 void ice_fill_dflt_direct_cmd_desc(struct ice_aq_desc *desc, u16 opcode);
 
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+#include "ice_sched.h"
+
+/**
+ * ice_aq_delete_sched_elems - delete scheduler elements
+ * @hw: pointer to the hw struct
+ * @grps_req: number of groups to delete
+ * @buf: pointer to buffer
+ * @buf_size: buffer size in bytes
+ * @grps_del: returns total number of groups deleted
+ * @cd: pointer to command details structure or NULL
+ *
+ * Delete scheduling elements (0x040F)
+ */
+static enum ice_status
+ice_aq_delete_sched_elems(struct ice_hw *hw, u16 grps_req,
+                         struct ice_aqc_delete_elem *buf, u16 buf_size,
+                         u16 *grps_del, struct ice_sq_cd *cd)
+{
+       struct ice_aqc_add_move_delete_elem *cmd;
+       struct ice_aq_desc desc;
+       enum ice_status status;
+
+       cmd = &desc.params.add_move_delete_elem;
+       ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_delete_sched_elems);
+       desc.flags |= cpu_to_le16(ICE_AQ_FLAG_RD);
+       cmd->num_grps_req = cpu_to_le16(grps_req);
+
+       status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
+       if (!status && grps_del)
+               *grps_del = le16_to_cpu(cmd->num_grps_updated);
+
+       return status;
+}
+
+/**
+ * ice_sched_remove_elems - remove nodes from hw
+ * @hw: pointer to the hw struct
+ * @parent: pointer to the parent node
+ * @num_nodes: number of nodes
+ * @node_teids: array of node teids to be deleted
+ *
+ * This function removes nodes from hw
+ */
+static enum ice_status
+ice_sched_remove_elems(struct ice_hw *hw, struct ice_sched_node *parent,
+                      u16 num_nodes, u32 *node_teids)
+{
+       struct ice_aqc_delete_elem *buf;
+       u16 i, num_groups_removed = 0;
+       enum ice_status status;
+       u16 buf_size;
+
+       buf_size = sizeof(*buf) + sizeof(u32) * (num_nodes - 1);
+       buf = devm_kzalloc(ice_hw_to_dev(hw), buf_size, GFP_KERNEL);
+       if (!buf)
+               return ICE_ERR_NO_MEMORY;
+       buf->hdr.parent_teid = parent->info.node_teid;
+       buf->hdr.num_elems = cpu_to_le16(num_nodes);
+       for (i = 0; i < num_nodes; i++)
+               buf->teid[i] = cpu_to_le32(node_teids[i]);
+       status = ice_aq_delete_sched_elems(hw, 1, buf, buf_size,
+                                          &num_groups_removed, NULL);
+       if (status || num_groups_removed != 1)
+               ice_debug(hw, ICE_DBG_SCHED, "remove elements failed\n");
+       devm_kfree(ice_hw_to_dev(hw), buf);
+       return status;
+}
+
+/**
+ * ice_sched_get_first_node - get the first node of the given layer
+ * @hw: pointer to the hw struct
+ * @parent: pointer to the base node of the subtree
+ * @layer: layer number
+ *
+ * This function retrieves the first node of the given layer from the subtree
+ */
+static struct ice_sched_node *
+ice_sched_get_first_node(struct ice_hw *hw, struct ice_sched_node *parent,
+                        u8 layer)
+{
+       u8 i;
+
+       if (layer < hw->sw_entry_point_layer)
+               return NULL;
+       for (i = 0; i < parent->num_children; i++) {
+               struct ice_sched_node *node = parent->children[i];
+
+               if (node) {
+                       if (node->tx_sched_layer == layer)
+                               return node;
+                       /* this recursion is intentional, and won't go
+                        * deeper than the 9 supported tree levels
+                        */
+                       return ice_sched_get_first_node(hw, node, layer);
+               }
+       }
+       return NULL;
+}
+
+/**
+ * ice_sched_get_tc_node - get pointer to TC node
+ * @pi: port information structure
+ * @tc: TC number
+ *
+ * This function returns the TC node pointer
+ */
+struct ice_sched_node *ice_sched_get_tc_node(struct ice_port_info *pi, u8 tc)
+{
+       u8 i;
+
+       if (!pi)
+               return NULL;
+       for (i = 0; i < pi->root->num_children; i++)
+               if (pi->root->children[i]->tc_num == tc)
+                       return pi->root->children[i];
+       return NULL;
+}
+
+/**
+ * ice_free_sched_node - Free a Tx scheduler node from SW DB
+ * @pi: port information structure
+ * @node: pointer to the ice_sched_node struct
+ *
+ * This function frees up a node from SW DB as well as from HW
+ *
+ * This function needs to be called with the port_info->sched_lock held
+ */
+void ice_free_sched_node(struct ice_port_info *pi, struct ice_sched_node *node)
+{
+       struct ice_sched_node *parent;
+       struct ice_hw *hw = pi->hw;
+       u8 i, j;
+
+       /* Free the children before freeing up the parent node
+        * The parent array is updated below and that shifts the nodes
+        * in the array. So always pick the first child if num children > 0
+        */
+       while (node->num_children)
+               ice_free_sched_node(pi, node->children[0]);
+
+       /* Leaf, TC and root nodes can't be deleted by SW */
+       if (node->tx_sched_layer >= hw->sw_entry_point_layer &&
+           node->info.data.elem_type != ICE_AQC_ELEM_TYPE_TC &&
+           node->info.data.elem_type != ICE_AQC_ELEM_TYPE_ROOT_PORT &&
+           node->info.data.elem_type != ICE_AQC_ELEM_TYPE_LEAF) {
+               u32 teid = le32_to_cpu(node->info.node_teid);
+               enum ice_status status;
+
+               status = ice_sched_remove_elems(hw, node->parent, 1, &teid);
+               if (status)
+                       ice_debug(hw, ICE_DBG_SCHED,
+                                 "remove element failed %d\n", status);
+       }
+       parent = node->parent;
+       /* root has no parent */
+       if (parent) {
+               struct ice_sched_node *p, *tc_node;
+
+               /* update the parent */
+               for (i = 0; i < parent->num_children; i++)
+                       if (parent->children[i] == node) {
+                               for (j = i + 1; j < parent->num_children; j++)
+                                       parent->children[j - 1] =
+                                               parent->children[j];
+                               parent->num_children--;
+                               break;
+                       }
+
+               /* search for previous sibling that points to this node and
+                * remove the reference
+                */
+               tc_node = ice_sched_get_tc_node(pi, node->tc_num);
+               if (!tc_node) {
+                       ice_debug(hw, ICE_DBG_SCHED,
+                                 "Invalid TC number %d\n", node->tc_num);
+                       goto err_exit;
+               }
+               p = ice_sched_get_first_node(hw, tc_node, node->tx_sched_layer);
+               while (p) {
+                       if (p->sibling == node) {
+                               p->sibling = node->sibling;
+                               break;
+                       }
+                       p = p->sibling;
+               }
+       }
+err_exit:
+       /* leaf nodes have no children */
+       if (node->children)
+               devm_kfree(ice_hw_to_dev(hw), node->children);
+       devm_kfree(ice_hw_to_dev(hw), node);
+}
+
+/**
+ * ice_aq_query_sched_res - query scheduler resource
+ * @hw: pointer to the hw struct
+ * @buf_size: buffer size in bytes
+ * @buf: pointer to buffer
+ * @cd: pointer to command details structure or NULL
+ *
+ * Query scheduler resource allocation (0x0412)
+ */
+static enum ice_status
+ice_aq_query_sched_res(struct ice_hw *hw, u16 buf_size,
+                      struct ice_aqc_query_txsched_res_resp *buf,
+                      struct ice_sq_cd *cd)
+{
+       struct ice_aq_desc desc;
+
+       ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_query_sched_res);
+       return ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
+}
+
+/**
+ * ice_sched_clear_tx_topo - clears the scheduler tree nodes
+ * @pi: port information structure
+ *
+ * This function removes all the nodes from HW as well as from SW DB.
+ */
+static void ice_sched_clear_tx_topo(struct ice_port_info *pi)
+{
+       struct ice_sched_agg_info *agg_info;
+       struct ice_sched_vsi_info *vsi_elem;
+       struct ice_sched_agg_info *atmp;
+       struct ice_sched_vsi_info *tmp;
+       struct ice_hw *hw;
+
+       if (!pi)
+               return;
+
+       hw = pi->hw;
+
+       list_for_each_entry_safe(agg_info, atmp, &pi->agg_list, list_entry) {
+               struct ice_sched_agg_vsi_info *agg_vsi_info;
+               struct ice_sched_agg_vsi_info *vtmp;
+
+               list_for_each_entry_safe(agg_vsi_info, vtmp,
+                                        &agg_info->agg_vsi_list, list_entry) {
+                       list_del(&agg_vsi_info->list_entry);
+                       devm_kfree(ice_hw_to_dev(hw), agg_vsi_info);
+               }
+       }
+
+       /* remove the vsi list */
+       list_for_each_entry_safe(vsi_elem, tmp, &pi->vsi_info_list,
+                                list_entry) {
+               list_del(&vsi_elem->list_entry);
+               devm_kfree(ice_hw_to_dev(hw), vsi_elem);
+       }
+
+       if (pi->root) {
+               ice_free_sched_node(pi, pi->root);
+               pi->root = NULL;
+       }
+}
+
+/**
+ * ice_sched_clear_port - clear the scheduler elements from SW DB for a port
+ * @pi: port information structure
+ *
+ * Cleanup scheduling elements from SW DB
+ */
+static void ice_sched_clear_port(struct ice_port_info *pi)
+{
+       if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY)
+               return;
+
+       pi->port_state = ICE_SCHED_PORT_STATE_INIT;
+       mutex_lock(&pi->sched_lock);
+       ice_sched_clear_tx_topo(pi);
+       mutex_unlock(&pi->sched_lock);
+       mutex_destroy(&pi->sched_lock);
+}
+
+/**
+ * ice_sched_cleanup_all - cleanup scheduler elements from SW DB for all ports
+ * @hw: pointer to the hw struct
+ *
+ * Cleanup scheduling elements from SW DB for all the ports
+ */
+void ice_sched_cleanup_all(struct ice_hw *hw)
+{
+       if (!hw || !hw->port_info)
+               return;
+
+       if (hw->layer_info) {
+               devm_kfree(ice_hw_to_dev(hw), hw->layer_info);
+               hw->layer_info = NULL;
+       }
+
+       ice_sched_clear_port(hw->port_info);
+
+       hw->num_tx_sched_layers = 0;
+       hw->num_tx_sched_phys_layers = 0;
+       hw->flattened_layers = 0;
+       hw->max_cgds = 0;
+}
+
+/**
+ * ice_sched_query_res_alloc - query the FW for num of logical sched layers
+ * @hw: pointer to the HW struct
+ *
+ * query FW for allocated scheduler resources and store in HW struct
+ */
+enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw)
+{
+       struct ice_aqc_query_txsched_res_resp *buf;
+       enum ice_status status = 0;
+
+       if (hw->layer_info)
+               return status;
+
+       buf = devm_kzalloc(ice_hw_to_dev(hw), sizeof(*buf), GFP_KERNEL);
+       if (!buf)
+               return ICE_ERR_NO_MEMORY;
+
+       status = ice_aq_query_sched_res(hw, sizeof(*buf), buf, NULL);
+       if (status)
+               goto sched_query_out;
+
+       hw->num_tx_sched_layers = le16_to_cpu(buf->sched_props.logical_levels);
+       hw->num_tx_sched_phys_layers =
+               le16_to_cpu(buf->sched_props.phys_levels);
+       hw->flattened_layers = buf->sched_props.flattening_bitmap;
+       hw->max_cgds = buf->sched_props.max_pf_cgds;
+
+       hw->layer_info = devm_kmemdup(ice_hw_to_dev(hw), buf->layer_props,
+                                     (hw->num_tx_sched_layers *
+                                      sizeof(*hw->layer_info)),
+                                     GFP_KERNEL);
+       if (!hw->layer_info) {
+               status = ICE_ERR_NO_MEMORY;
+               goto sched_query_out;
+       }
+
+sched_query_out:
+       devm_kfree(ice_hw_to_dev(hw), buf);
+       return status;
+}
 
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, Intel Corporation. */
+
+#ifndef _ICE_SCHED_H_
+#define _ICE_SCHED_H_
+
+#include "ice_common.h"
+
+struct ice_sched_agg_vsi_info {
+       struct list_head list_entry;
+       DECLARE_BITMAP(tc_bitmap, ICE_MAX_TRAFFIC_CLASS);
+       u16 vsi_id;
+};
+
+struct ice_sched_agg_info {
+       struct list_head agg_vsi_list;
+       struct list_head list_entry;
+       DECLARE_BITMAP(tc_bitmap, ICE_MAX_TRAFFIC_CLASS);
+       u32 agg_id;
+       enum ice_agg_type agg_type;
+};
+
+/* FW AQ command calls */
+enum ice_status ice_sched_query_res_alloc(struct ice_hw *hw);
+void ice_sched_cleanup_all(struct ice_hw *hw);
+void ice_free_sched_node(struct ice_port_info *pi, struct ice_sched_node *node);
+struct ice_sched_node *ice_sched_get_tc_node(struct ice_port_info *pi, u8 tc);
+#endif /* _ICE_SCHED_H_ */
 
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2018, Intel Corporation. */
+
+#include "ice_switch.h"
+
+/**
+ * ice_aq_get_sw_cfg - get switch configuration
+ * @hw: pointer to the hardware structure
+ * @buf: pointer to the result buffer
+ * @buf_size: length of the buffer available for response
+ * @req_desc: pointer to requested descriptor
+ * @num_elems: pointer to number of elements
+ * @cd: pointer to command details structure or NULL
+ *
+ * Get switch configuration (0x0200) to be placed in 'buf'.
+ * This admin command returns information such as initial VSI/port number
+ * and the switch ID it belongs to.
+ *
+ * NOTE: *req_desc is both an input/output parameter.
+ * The caller of this function first calls this function with *req_desc set
+ * to 0.  If the response from f/w has *req_desc set to 0, all the switch
+ * configuration information has been returned; if non-zero (meaning not all
+ * the information was returned), the caller should call this function again
+ * with *req_desc set to the previous value returned by f/w to get the
+ * next block of switch configuration information.
+ *
+ * *num_elems is an output-only parameter.  It reflects the number of
+ * elements in the response buffer; the caller should use it while parsing
+ * the response buffer.
+ */
+static enum ice_status
+ice_aq_get_sw_cfg(struct ice_hw *hw, struct ice_aqc_get_sw_cfg_resp *buf,
+                 u16 buf_size, u16 *req_desc, u16 *num_elems,
+                 struct ice_sq_cd *cd)
+{
+       struct ice_aqc_get_sw_cfg *cmd;
+       struct ice_aq_desc desc;
+       enum ice_status status;
+
+       ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_get_sw_cfg);
+       cmd = &desc.params.get_sw_conf;
+       cmd->element = cpu_to_le16(*req_desc);
+
+       status = ice_aq_send_cmd(hw, &desc, buf, buf_size, cd);
+       if (!status) {
+               *req_desc = le16_to_cpu(cmd->element);
+               *num_elems = le16_to_cpu(cmd->num_elems);
+       }
+
+       return status;
+}
+
+/**
+ * ice_init_port_info - Initialize port_info with switch configuration data
+ * @pi: pointer to port_info
+ * @vsi_port_num: VSI number or port number
+ * @type: Type of switch element (port or VSI)
+ * @swid: switch ID of the switch the element is attached to
+ * @pf_vf_num: PF or VF number
+ * @is_vf: true if the element is a VF, false otherwise
+ */
+static void
+ice_init_port_info(struct ice_port_info *pi, u16 vsi_port_num, u8 type,
+                  u16 swid, u16 pf_vf_num, bool is_vf)
+{
+       switch (type) {
+       case ICE_AQC_GET_SW_CONF_RESP_PHYS_PORT:
+               pi->lport = (u8)(vsi_port_num & ICE_LPORT_MASK);
+               pi->sw_id = swid;
+               pi->pf_vf_num = pf_vf_num;
+               pi->is_vf = is_vf;
+               pi->dflt_tx_vsi_num = ICE_DFLT_VSI_INVAL;
+               pi->dflt_rx_vsi_num = ICE_DFLT_VSI_INVAL;
+               break;
+       default:
+               ice_debug(pi->hw, ICE_DBG_SW,
+                         "incorrect VSI/port type received\n");
+               break;
+       }
+}
+
+/**
+ * ice_get_initial_sw_cfg - Get initial port and default VSI data
+ * @hw: pointer to the hardware structure
+ */
+enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw)
+{
+       struct ice_aqc_get_sw_cfg_resp *rbuf;
+       enum ice_status status;
+       u16 req_desc = 0;
+       u16 num_elems;
+       u16 i;
+
+       rbuf = devm_kzalloc(ice_hw_to_dev(hw), ICE_SW_CFG_MAX_BUF_LEN,
+                           GFP_KERNEL);
+       if (!rbuf)
+               return ICE_ERR_NO_MEMORY;
+
+       /* Multiple calls to ice_aq_get_sw_cfg may be required
+        * to get all the switch configuration information. The need
+        * for additional calls is indicated by ice_aq_get_sw_cfg
+        * writing a non-zero value in req_desc.
+        */
+       do {
+               status = ice_aq_get_sw_cfg(hw, rbuf, ICE_SW_CFG_MAX_BUF_LEN,
+                                          &req_desc, &num_elems, NULL);
+
+               if (status)
+                       break;
+
+               for (i = 0; i < num_elems; i++) {
+                       struct ice_aqc_get_sw_cfg_resp_elem *ele;
+                       u16 pf_vf_num, swid, vsi_port_num;
+                       bool is_vf = false;
+                       u8 type;
+
+                       ele = rbuf[i].elements;
+                       vsi_port_num = le16_to_cpu(ele->vsi_port_num) &
+                               ICE_AQC_GET_SW_CONF_RESP_VSI_PORT_NUM_M;
+
+                       pf_vf_num = le16_to_cpu(ele->pf_vf_num) &
+                               ICE_AQC_GET_SW_CONF_RESP_FUNC_NUM_M;
+
+                       swid = le16_to_cpu(ele->swid);
+
+                       if (le16_to_cpu(ele->pf_vf_num) &
+                           ICE_AQC_GET_SW_CONF_RESP_IS_VF)
+                               is_vf = true;
+
+                       type = le16_to_cpu(ele->vsi_port_num) >>
+                               ICE_AQC_GET_SW_CONF_RESP_TYPE_S;
+
+                       if (type == ICE_AQC_GET_SW_CONF_RESP_VSI) {
+                               /* FW VSI is not needed. Just continue. */
+                               continue;
+                       }
+
+                       ice_init_port_info(hw->port_info, vsi_port_num,
+                                          type, swid, pf_vf_num, is_vf);
+               }
+       } while (req_desc && !status);
+
+       devm_kfree(ice_hw_to_dev(hw), rbuf);
+       return status;
+}
 
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2018, Intel Corporation. */
+
+#ifndef _ICE_SWITCH_H_
+#define _ICE_SWITCH_H_
+
+#include "ice_common.h"
+
+#define ICE_SW_CFG_MAX_BUF_LEN 2048
+#define ICE_DFLT_VSI_INVAL 0xff
+
+enum ice_status ice_get_initial_sw_cfg(struct ice_hw *hw);
+
+#endif /* _ICE_SWITCH_H_ */
 
 /* debug masks - set these bits in hw->debug_mask to control output */
 #define ICE_DBG_INIT           BIT_ULL(1)
 #define ICE_DBG_NVM            BIT_ULL(7)
+#define ICE_DBG_SW             BIT_ULL(13)
+#define ICE_DBG_SCHED          BIT_ULL(14)
 #define ICE_DBG_RES            BIT_ULL(17)
 #define ICE_DBG_AQ_MSG         BIT_ULL(24)
 #define ICE_DBG_AQ_CMD         BIT_ULL(27)
        ICE_MAC_GENERIC,
 };
 
+/* Common HW capabilities for SW use */
+struct ice_hw_common_caps {
+       /* TX/RX queues */
+       u16 num_rxq;            /* Number/Total RX queues */
+       u16 rxq_first_id;       /* First queue ID for RX queues */
+       u16 num_txq;            /* Number/Total TX queues */
+       u16 txq_first_id;       /* First queue ID for TX queues */
+
+       /* MSI-X vectors */
+       u16 num_msix_vectors;
+       u16 msix_vector_first_id;
+
+       /* Max MTU for function or device */
+       u16 max_mtu;
+
+       /* RSS related capabilities */
+       u16 rss_table_size;             /* 512 for PFs and 64 for VFs */
+       u8 rss_table_entry_width;       /* RSS Entry width in bits */
+};
+
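+/* These fields are filled from the 0x000A/0x000B capability elements by
+ * ice_parse_caps(): 'number' supplies the counts and max_mtu, 'phys_id' the
+ * first-ID fields and 'logical_id' the RSS entry width.
+ */
+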
+/* Function specific capabilities */
+struct ice_hw_func_caps {
+       struct ice_hw_common_caps common_cap;
+       u32 guaranteed_num_vsi;
+};
+
+/* Device wide capabilities */
+struct ice_hw_dev_caps {
+       struct ice_hw_common_caps common_cap;
+       u32 num_vsi_allocd_to_host;     /* Excluding EMP VSI */
+};
+
 /* Various RESET request, These are not tied with HW reset types */
 enum ice_reset_req {
        ICE_RESET_PFR   = 0,
        bool blank_nvm_mode;      /* is NVM empty (no FW present) */
 };
 
+/* Max number of port-to-queue branches w.r.t. topology */
+#define ICE_MAX_TRAFFIC_CLASS 8
+
+struct ice_sched_node {
+       struct ice_sched_node *parent;
+       struct ice_sched_node *sibling; /* next sibling in the same layer */
+       struct ice_sched_node **children;
+       struct ice_aqc_txsched_elem_data info;
+       u32 agg_id;                     /* aggregator group id */
+       u16 vsi_id;
+       bool in_use;                    /* suspended or in use */
+       u8 tx_sched_layer;              /* Logical Layer (1-9) */
+       u8 num_children;
+       u8 tc_num;
+       u8 owner;
+#define ICE_SCHED_NODE_OWNER_LAN       0
+};
+
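+/* Note on linkage: children[] holds a node's direct children, while sibling
+ * chains nodes of the same layer together across the tree;
+ * ice_sched_get_first_node() and ice_free_sched_node() rely on both.
+ */
+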
+/* The aggregator type determines if the identifier is for a VSI group,
+ * aggregator group, aggregator of queues, or queue group.
+ */
+enum ice_agg_type {
+       ICE_AGG_TYPE_UNKNOWN = 0,
+       ICE_AGG_TYPE_VSI,
+       ICE_AGG_TYPE_AGG, /* aggregator */
+       ICE_AGG_TYPE_Q,
+       ICE_AGG_TYPE_QG
+};
+
+/* VSI type list entry to locate corresponding VSI/aggregator nodes */
+struct ice_sched_vsi_info {
+       struct ice_sched_node *vsi_node[ICE_MAX_TRAFFIC_CLASS];
+       struct ice_sched_node *ag_node[ICE_MAX_TRAFFIC_CLASS];
+       struct list_head list_entry;
+       u16 max_lanq[ICE_MAX_TRAFFIC_CLASS];
+       u16 vsi_id;
+};
+
+/* driver defines the policy */
+struct ice_sched_tx_policy {
+       u16 max_num_vsis;
+       u8 max_num_lan_qs_per_tc[ICE_MAX_TRAFFIC_CLASS];
+       bool rdma_ena;
+};
+
+struct ice_port_info {
+       struct ice_sched_node *root;    /* Root Node per Port */
+       struct ice_hw *hw;              /* back pointer to hw instance */
+       u16 sw_id;                      /* Initial switch ID the port belongs to */
+       u16 pf_vf_num;
+       u8 port_state;
+#define ICE_SCHED_PORT_STATE_INIT      0x0
+#define ICE_SCHED_PORT_STATE_READY     0x1
+       u16 dflt_tx_vsi_num;
+       u16 dflt_rx_vsi_num;
+       struct mutex sched_lock;        /* protect access to TXSched tree */
+       struct ice_sched_tx_policy sched_policy;
+       struct list_head vsi_info_list;
+       struct list_head agg_list;      /* lists all aggregators */
+       u8 lport;
+#define ICE_LPORT_MASK         0xff
+       bool is_vf;
+};
+
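+/* sched_lock must be held across any mutation of the tree rooted at 'root';
+ * ice_sched_clear_port(), for example, takes it before tearing down the
+ * topology.
+ */
+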
 /* Port hardware description */
 struct ice_hw {
        u8 __iomem *hw_addr;
        void *back;
+       struct ice_aqc_layer_props *layer_info;
+       struct ice_port_info *port_info;
        u64 debug_mask;         /* bitmap for debug mask */
        enum ice_mac_type mac_type;
 
 
        u8 pf_id;               /* device profile info */
 
+       /* TX Scheduler values */
+       u16 num_tx_sched_layers;
+       u16 num_tx_sched_phys_layers;
+       u8 flattened_layers;
+       u8 max_cgds;
+       u8 sw_entry_point_layer;
+
        struct ice_bus_info bus;
        struct ice_nvm_info nvm;
+       struct ice_hw_dev_caps dev_caps;        /* device capabilities */
+       struct ice_hw_func_caps func_caps;      /* function capabilities */
 
        /* Control Queue info */
        struct ice_ctl_q_info adminq;