From f30490e9695ef7da3d0899c6a0293cc7cd373567 Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Wed, 16 Oct 2024 11:30:11 +0200 Subject: [PATCH 01/16] i40e: fix race condition by adding filter's intermediate sync state Fix a race condition in the i40e driver that leads to MAC/VLAN filters becoming corrupted and leaking. Address the issue that occurs under heavy load when multiple threads are concurrently modifying MAC/VLAN filters by setting mac and port VLAN. 1. Thread T0 allocates a filter in i40e_add_filter() within i40e_ndo_set_vf_port_vlan(). 2. Thread T1 concurrently frees the filter in __i40e_del_filter() within i40e_ndo_set_vf_mac(). 3. Subsequently, i40e_service_task() calls i40e_sync_vsi_filters(), which refers to the already freed filter memory, causing corruption. Reproduction steps: 1. Spawn multiple VFs. 2. Apply a concurrent heavy load by running parallel operations to change MAC addresses on the VFs and change port VLANs on the host. 3. Observe errors in dmesg: "Error I40E_AQ_RC_ENOSPC adding RX filters on VF XX, please set promiscuous on manually for VF XX". Exact code for stable reproduction Intel can't open-source now. The fix involves implementing a new intermediate filter state, I40E_FILTER_NEW_SYNC, for the time when a filter is on a tmp_add_list. These filters cannot be deleted from the hash list directly but must be removed using the full process. Fixes: 278e7d0b9d68 ("i40e: store MAC/VLAN filters in a hash with the MAC Address as key") Signed-off-by: Aleksandr Loktionov Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Reviewed-by: Michal Schmidt Tested-by: Michal Schmidt Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e.h | 1 + drivers/net/ethernet/intel/i40e/i40e_debugfs.c | 1 + drivers/net/ethernet/intel/i40e/i40e_main.c | 12 ++++++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e.h b/drivers/net/ethernet/intel/i40e/i40e.h index 2089a0e172bf..d4255c2706fa 100644 --- a/drivers/net/ethernet/intel/i40e/i40e.h +++ b/drivers/net/ethernet/intel/i40e/i40e.h @@ -755,6 +755,7 @@ enum i40e_filter_state { I40E_FILTER_ACTIVE, /* Added to switch by FW */ I40E_FILTER_FAILED, /* Rejected by FW */ I40E_FILTER_REMOVE, /* To be removed */ + I40E_FILTER_NEW_SYNC, /* New, not sent yet, is in i40e_sync_vsi_filters() */ /* There is no 'removed' state; the filter struct is freed */ }; struct i40e_mac_filter { diff --git a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c index abf624d770e6..208c2f0857b6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_debugfs.c +++ b/drivers/net/ethernet/intel/i40e/i40e_debugfs.c @@ -89,6 +89,7 @@ static char *i40e_filter_state_string[] = { "ACTIVE", "FAILED", "REMOVE", + "NEW_SYNC", }; /** diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 25295ae370b2..55fb362eb508 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1255,6 +1255,7 @@ int i40e_count_filters(struct i40e_vsi *vsi) hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { if (f->state == I40E_FILTER_NEW || + f->state == I40E_FILTER_NEW_SYNC || f->state == I40E_FILTER_ACTIVE) ++cnt; } @@ -1441,6 +1442,8 @@ static int i40e_correct_mac_vlan_filters(struct i40e_vsi *vsi, new->f = add_head; new->state = add_head->state; + if (add_head->state == I40E_FILTER_NEW) + add_head->state = I40E_FILTER_NEW_SYNC; /* Add the new filter to the tmp list */ hlist_add_head(&new->hlist, tmp_add_list); @@ -1550,6 +1553,8 @@ static int i40e_correct_vf_mac_vlan_filters(struct i40e_vsi *vsi, return -ENOMEM; new_mac->f = add_head; new_mac->state = add_head->state; + if (add_head->state == I40E_FILTER_NEW) + add_head->state = I40E_FILTER_NEW_SYNC; /* Add the new filter to the tmp list */ hlist_add_head(&new_mac->hlist, tmp_add_list); @@ -2437,7 +2442,8 @@ static int i40e_aqc_broadcast_filter(struct i40e_vsi *vsi, const char *vsi_name, struct i40e_mac_filter *f) { - bool enable = f->state == I40E_FILTER_NEW; + bool enable = f->state == I40E_FILTER_NEW || + f->state == I40E_FILTER_NEW_SYNC; struct i40e_hw *hw = &vsi->back->hw; int aq_ret; @@ -2611,6 +2617,7 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi) /* Add it to the hash list */ hlist_add_head(&new->hlist, &tmp_add_list); + f->state = I40E_FILTER_NEW_SYNC; } /* Count the number of active (current and new) VLAN @@ -2762,7 +2769,8 @@ int i40e_sync_vsi_filters(struct i40e_vsi *vsi) spin_lock_bh(&vsi->mac_filter_hash_lock); hlist_for_each_entry_safe(new, h, &tmp_add_list, hlist) { /* Only update the state if we're still NEW */ - if (new->f->state == I40E_FILTER_NEW) + if (new->f->state == I40E_FILTER_NEW || + new->f->state == I40E_FILTER_NEW_SYNC) new->f->state = new->state; hlist_del(&new->hlist); netdev_hw_addr_refcnt(new->f, vsi->netdev, -1); -- 2.51.0 From b8473723272e346e22aa487b9046fd324b73a0a5 Mon Sep 17 00:00:00 2001 From: Vitaly Lifshits Date: Tue, 1 Oct 2024 20:08:48 +0300 Subject: [PATCH 02/16] e1000e: Remove Meteor Lake SMBUS workarounds This is a partial revert to commit 76a0a3f9cc2f ("e1000e: fix force smbus during suspend flow"). That commit fixed a sporadic PHY access issue but introduced a regression in runtime suspend flows. The original issue on Meteor Lake systems was rare in terms of the reproduction rate and the number of the systems affected. After the integration of commit 0a6ad4d9e169 ("e1000e: avoid failing the system during pm_suspend"), PHY access loss can no longer cause a system-level suspend failure. As it only occurs when the LAN cable is disconnected, and is recovered during system resume flow. Therefore, its functional impact is low, and the priority is given to stabilizing runtime suspend. Fixes: 76a0a3f9cc2f ("e1000e: fix force smbus during suspend flow") Signed-off-by: Vitaly Lifshits Tested-by: Avigail Dahan Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/e1000e/ich8lan.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/intel/e1000e/ich8lan.c b/drivers/net/ethernet/intel/e1000e/ich8lan.c index ce227b56cf72..2f9655cf5dd9 100644 --- a/drivers/net/ethernet/intel/e1000e/ich8lan.c +++ b/drivers/net/ethernet/intel/e1000e/ich8lan.c @@ -1205,12 +1205,10 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) if (ret_val) goto out; - if (hw->mac.type != e1000_pch_mtp) { - ret_val = e1000e_force_smbus(hw); - if (ret_val) { - e_dbg("Failed to force SMBUS: %d\n", ret_val); - goto release; - } + ret_val = e1000e_force_smbus(hw); + if (ret_val) { + e_dbg("Failed to force SMBUS: %d\n", ret_val); + goto release; } /* Si workaround for ULP entry flow on i127/rev6 h/w. Enable @@ -1273,13 +1271,6 @@ s32 e1000_enable_ulp_lpt_lp(struct e1000_hw *hw, bool to_sx) } release: - if (hw->mac.type == e1000_pch_mtp) { - ret_val = e1000e_force_smbus(hw); - if (ret_val) - e_dbg("Failed to force SMBUS over MTL system: %d\n", - ret_val); - } - hw->phy.ops.release(hw); out: if (ret_val) -- 2.51.0 From 249cfa318fb1b77eb726c2ff4f74c9685f04e568 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 4 Nov 2024 18:03:52 -0800 Subject: [PATCH 03/16] Revert "Merge branch 'there-are-some-bugfix-for-the-hns3-ethernet-driver'" This reverts commit d80a3091308491455b6501b1c4b68698c4a7cd24, reversing changes made to 637f41476384c76d3cd7dcf5947caf2c8b8d7a9b: 2cf246143519 ("net: hns3: fix kernel crash when 1588 is sent on HIP08 devices") 3e22b7de34cb ("net: hns3: fixed hclge_fetch_pf_reg accesses bar space out of bounds issue") d1c2e2961ab4 ("net: hns3: initialize reset_timer before hclgevf_misc_irq_init()") 5f62009ff108 ("net: hns3: don't auto enable misc vector") 2758f18a83ef ("net: hns3: Resolved the issue that the debugfs query result is inconsistent.") 662ecfc46690 ("net: hns3: fix missing features due to dev->features configuration too early") 3e0f7cc887b7 ("net: hns3: fixed reset failure issues caused by the incorrect reset type") f2c14899caba ("net: hns3: add sync command to sync io-pgtable") e6ab19443b36 ("net: hns3: default enable tx bounce buffer when smmu enabled") The series is making the driver poke into IOMMU internals instead of implementing appropriate IOMMU workarounds. Link: https://lore.kernel.org/069c9838-b781-4012-934a-d2626fa78212@arm.com Signed-off-by: Jakub Kicinski --- .../ethernet/hisilicon/hns3/hns3_debugfs.c | 4 +- .../net/ethernet/hisilicon/hns3/hns3_enet.c | 59 +------------------ .../net/ethernet/hisilicon/hns3/hns3_enet.h | 2 - .../ethernet/hisilicon/hns3/hns3_ethtool.c | 33 ----------- .../hisilicon/hns3/hns3pf/hclge_main.c | 45 +++----------- .../hisilicon/hns3/hns3pf/hclge_ptp.c | 3 - .../hisilicon/hns3/hns3pf/hclge_regs.c | 9 ++- .../hisilicon/hns3/hns3vf/hclgevf_main.c | 40 +++---------- .../hisilicon/hns3/hns3vf/hclgevf_regs.c | 9 ++- 9 files changed, 26 insertions(+), 178 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c index 841e5af7b2be..807eb3bbb11c 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_debugfs.c @@ -1293,10 +1293,8 @@ static ssize_t hns3_dbg_read(struct file *filp, char __user *buffer, /* save the buffer addr until the last read operation */ *save_buf = read_buf; - } - /* get data ready for the first time to read */ - if (!*ppos) { + /* get data ready for the first time to read */ ret = hns3_dbg_read_cmd(dbg_data, hns3_dbg_cmd[index].cmd, read_buf, hns3_dbg_cmd[index].buf_len); if (ret) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index b09f0cca34dc..4cbc4d069a1f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -381,24 +380,6 @@ static const struct hns3_rx_ptype hns3_rx_ptype_tbl[] = { #define HNS3_INVALID_PTYPE \ ARRAY_SIZE(hns3_rx_ptype_tbl) -static void hns3_dma_map_sync(struct device *dev, unsigned long iova) -{ - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct iommu_iotlb_gather iotlb_gather; - size_t granule; - - if (!domain || !iommu_is_dma_domain(domain)) - return; - - granule = 1 << __ffs(domain->pgsize_bitmap); - iova = ALIGN_DOWN(iova, granule); - iotlb_gather.start = iova; - iotlb_gather.end = iova + granule - 1; - iotlb_gather.pgsize = granule; - - iommu_iotlb_sync(domain, &iotlb_gather); -} - static irqreturn_t hns3_irq_handle(int irq, void *vector) { struct hns3_enet_tqp_vector *tqp_vector = vector; @@ -1051,8 +1032,6 @@ static bool hns3_can_use_tx_sgl(struct hns3_enet_ring *ring, static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) { u32 alloc_size = ring->tqp->handle->kinfo.tx_spare_buf_size; - struct net_device *netdev = ring_to_netdev(ring); - struct hns3_nic_priv *priv = netdev_priv(netdev); struct hns3_tx_spare *tx_spare; struct page *page; dma_addr_t dma; @@ -1094,7 +1073,6 @@ static void hns3_init_tx_spare_buffer(struct hns3_enet_ring *ring) tx_spare->buf = page_address(page); tx_spare->len = PAGE_SIZE << order; ring->tx_spare = tx_spare; - ring->tx_copybreak = priv->tx_copybreak; return; dma_mapping_error: @@ -1746,9 +1724,7 @@ static int hns3_map_and_fill_desc(struct hns3_enet_ring *ring, void *priv, unsigned int type) { struct hns3_desc_cb *desc_cb = &ring->desc_cb[ring->next_to_use]; - struct hnae3_handle *handle = ring->tqp->handle; struct device *dev = ring_to_dev(ring); - struct hnae3_ae_dev *ae_dev; unsigned int size; dma_addr_t dma; @@ -1780,13 +1756,6 @@ static int hns3_map_and_fill_desc(struct hns3_enet_ring *ring, void *priv, return -ENOMEM; } - /* Add a SYNC command to sync io-pgtale to avoid errors in pgtable - * prefetch - */ - ae_dev = hns3_get_ae_dev(handle); - if (ae_dev->dev_version >= HNAE3_DEVICE_VERSION_V3) - hns3_dma_map_sync(dev, dma); - desc_cb->priv = priv; desc_cb->length = size; desc_cb->dma = dma; @@ -2483,6 +2452,7 @@ static int hns3_nic_set_features(struct net_device *netdev, return ret; } + netdev->features = features; return 0; } @@ -4898,30 +4868,6 @@ static void hns3_nic_dealloc_vector_data(struct hns3_nic_priv *priv) devm_kfree(&pdev->dev, priv->tqp_vector); } -static void hns3_update_tx_spare_buf_config(struct hns3_nic_priv *priv) -{ -#define HNS3_MIN_SPARE_BUF_SIZE (2 * 1024 * 1024) -#define HNS3_MAX_PACKET_SIZE (64 * 1024) - - struct iommu_domain *domain = iommu_get_domain_for_dev(priv->dev); - struct hnae3_ae_dev *ae_dev = hns3_get_ae_dev(priv->ae_handle); - struct hnae3_handle *handle = priv->ae_handle; - - if (ae_dev->dev_version < HNAE3_DEVICE_VERSION_V3) - return; - - if (!(domain && iommu_is_dma_domain(domain))) - return; - - priv->min_tx_copybreak = HNS3_MAX_PACKET_SIZE; - priv->min_tx_spare_buf_size = HNS3_MIN_SPARE_BUF_SIZE; - - if (priv->tx_copybreak < priv->min_tx_copybreak) - priv->tx_copybreak = priv->min_tx_copybreak; - if (handle->kinfo.tx_spare_buf_size < priv->min_tx_spare_buf_size) - handle->kinfo.tx_spare_buf_size = priv->min_tx_spare_buf_size; -} - static void hns3_ring_get_cfg(struct hnae3_queue *q, struct hns3_nic_priv *priv, unsigned int ring_type) { @@ -5155,7 +5101,6 @@ int hns3_init_all_ring(struct hns3_nic_priv *priv) int i, j; int ret; - hns3_update_tx_spare_buf_config(priv); for (i = 0; i < ring_num; i++) { ret = hns3_alloc_ring_memory(&priv->ring[i]); if (ret) { @@ -5360,8 +5305,6 @@ static int hns3_client_init(struct hnae3_handle *handle) priv->ae_handle = handle; priv->tx_timeout_count = 0; priv->max_non_tso_bd_num = ae_dev->dev_specs.max_non_tso_bd_num; - priv->min_tx_copybreak = 0; - priv->min_tx_spare_buf_size = 0; set_bit(HNS3_NIC_STATE_DOWN, &priv->state); handle->msg_enable = netif_msg_init(debug, DEFAULT_MSG_LEVEL); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index caf7a4df8585..d36c4ed16d8d 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -596,8 +596,6 @@ struct hns3_nic_priv { struct hns3_enet_coalesce rx_coal; u32 tx_copybreak; u32 rx_copybreak; - u32 min_tx_copybreak; - u32 min_tx_spare_buf_size; }; union l3_hdr_info { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 97eaeec1952b..b1e988347347 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -1933,31 +1933,6 @@ static int hns3_set_tx_spare_buf_size(struct net_device *netdev, return ret; } -static int hns3_check_tx_copybreak(struct net_device *netdev, u32 copybreak) -{ - struct hns3_nic_priv *priv = netdev_priv(netdev); - - if (copybreak < priv->min_tx_copybreak) { - netdev_err(netdev, "tx copybreak %u should be no less than %u!\n", - copybreak, priv->min_tx_copybreak); - return -EINVAL; - } - return 0; -} - -static int hns3_check_tx_spare_buf_size(struct net_device *netdev, u32 buf_size) -{ - struct hns3_nic_priv *priv = netdev_priv(netdev); - - if (buf_size < priv->min_tx_spare_buf_size) { - netdev_err(netdev, - "tx spare buf size %u should be no less than %u!\n", - buf_size, priv->min_tx_spare_buf_size); - return -EINVAL; - } - return 0; -} - static int hns3_set_tunable(struct net_device *netdev, const struct ethtool_tunable *tuna, const void *data) @@ -1974,10 +1949,6 @@ static int hns3_set_tunable(struct net_device *netdev, switch (tuna->id) { case ETHTOOL_TX_COPYBREAK: - ret = hns3_check_tx_copybreak(netdev, *(u32 *)data); - if (ret) - return ret; - priv->tx_copybreak = *(u32 *)data; for (i = 0; i < h->kinfo.num_tqps; i++) @@ -1992,10 +1963,6 @@ static int hns3_set_tunable(struct net_device *netdev, break; case ETHTOOL_TX_COPYBREAK_BUF_SIZE: - ret = hns3_check_tx_spare_buf_size(netdev, *(u32 *)data); - if (ret) - return ret; - old_tx_spare_buf_size = h->kinfo.tx_spare_buf_size; new_tx_spare_buf_size = *(u32 *)data; netdev_info(netdev, "request to set tx spare buf size from %u to %u\n", diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 728f4777e51f..bd86efd92a5a 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -3585,17 +3584,6 @@ static int hclge_set_vf_link_state(struct hnae3_handle *handle, int vf, return ret; } -static void hclge_set_reset_pending(struct hclge_dev *hdev, - enum hnae3_reset_type reset_type) -{ - /* When an incorrect reset type is executed, the get_reset_level - * function generates the HNAE3_NONE_RESET flag. As a result, this - * type do not need to pending. - */ - if (reset_type != HNAE3_NONE_RESET) - set_bit(reset_type, &hdev->reset_pending); -} - static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) { u32 cmdq_src_reg, msix_src_reg, hw_err_src_reg; @@ -3616,7 +3604,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) */ if (BIT(HCLGE_VECTOR0_IMPRESET_INT_B) & msix_src_reg) { dev_info(&hdev->pdev->dev, "IMP reset interrupt\n"); - hclge_set_reset_pending(hdev, HNAE3_IMP_RESET); + set_bit(HNAE3_IMP_RESET, &hdev->reset_pending); set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state); *clearval = BIT(HCLGE_VECTOR0_IMPRESET_INT_B); hdev->rst_stats.imp_rst_cnt++; @@ -3626,7 +3614,7 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) if (BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B) & msix_src_reg) { dev_info(&hdev->pdev->dev, "global reset interrupt\n"); set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state); - hclge_set_reset_pending(hdev, HNAE3_GLOBAL_RESET); + set_bit(HNAE3_GLOBAL_RESET, &hdev->reset_pending); *clearval = BIT(HCLGE_VECTOR0_GLOBALRESET_INT_B); hdev->rst_stats.global_rst_cnt++; return HCLGE_VECTOR0_EVENT_RST; @@ -3781,7 +3769,7 @@ static int hclge_misc_irq_init(struct hclge_dev *hdev) snprintf(hdev->misc_vector.name, HNAE3_INT_NAME_LEN, "%s-misc-%s", HCLGE_NAME, pci_name(hdev->pdev)); ret = request_irq(hdev->misc_vector.vector_irq, hclge_misc_irq_handle, - IRQ_NOAUTOEN, hdev->misc_vector.name, hdev); + 0, hdev->misc_vector.name, hdev); if (ret) { hclge_free_vector(hdev, 0); dev_err(&hdev->pdev->dev, "request misc irq(%d) fail\n", @@ -4074,7 +4062,7 @@ static void hclge_do_reset(struct hclge_dev *hdev) case HNAE3_FUNC_RESET: dev_info(&pdev->dev, "PF reset requested\n"); /* schedule again to check later */ - hclge_set_reset_pending(hdev, HNAE3_FUNC_RESET); + set_bit(HNAE3_FUNC_RESET, &hdev->reset_pending); hclge_reset_task_schedule(hdev); break; default: @@ -4108,8 +4096,6 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hnae3_ae_dev *ae_dev, clear_bit(HNAE3_FLR_RESET, addr); } - clear_bit(HNAE3_NONE_RESET, addr); - if (hdev->reset_type != HNAE3_NONE_RESET && rst_level < hdev->reset_type) return HNAE3_NONE_RESET; @@ -4251,7 +4237,7 @@ static bool hclge_reset_err_handle(struct hclge_dev *hdev) return false; } else if (hdev->rst_stats.reset_fail_cnt < MAX_RESET_FAIL_CNT) { hdev->rst_stats.reset_fail_cnt++; - hclge_set_reset_pending(hdev, hdev->reset_type); + set_bit(hdev->reset_type, &hdev->reset_pending); dev_info(&hdev->pdev->dev, "re-schedule reset task(%u)\n", hdev->rst_stats.reset_fail_cnt); @@ -4494,20 +4480,8 @@ static void hclge_reset_event(struct pci_dev *pdev, struct hnae3_handle *handle) static void hclge_set_def_reset_request(struct hnae3_ae_dev *ae_dev, enum hnae3_reset_type rst_type) { -#define HCLGE_SUPPORT_RESET_TYPE \ - (BIT(HNAE3_FLR_RESET) | BIT(HNAE3_FUNC_RESET) | \ - BIT(HNAE3_GLOBAL_RESET) | BIT(HNAE3_IMP_RESET)) - struct hclge_dev *hdev = ae_dev->priv; - if (!(BIT(rst_type) & HCLGE_SUPPORT_RESET_TYPE)) { - /* To prevent reset triggered by hclge_reset_event */ - set_bit(HNAE3_NONE_RESET, &hdev->default_reset_request); - dev_warn(&hdev->pdev->dev, "unsupported reset type %d\n", - rst_type); - return; - } - set_bit(rst_type, &hdev->default_reset_request); } @@ -11917,6 +11891,9 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_init_rxd_adv_layout(hdev); + /* Enable MISC vector(vector0) */ + hclge_enable_vector(&hdev->misc_vector, true); + ret = hclge_init_wol(hdev); if (ret) dev_warn(&pdev->dev, @@ -11929,10 +11906,6 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_state_init(hdev); hdev->last_reset_time = jiffies; - /* Enable MISC vector(vector0) */ - enable_irq(hdev->misc_vector.vector_irq); - hclge_enable_vector(&hdev->misc_vector, true); - dev_info(&hdev->pdev->dev, "%s driver initialization finished.\n", HCLGE_DRIVER_NAME); @@ -12338,7 +12311,7 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev) /* Disable MISC vector(vector0) */ hclge_enable_vector(&hdev->misc_vector, false); - disable_irq(hdev->misc_vector.vector_irq); + synchronize_irq(hdev->misc_vector.vector_irq); /* Disable all hw interrupts */ hclge_config_mac_tnl_int(hdev, false); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c index bab16c2191b2..5505caea88e9 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_ptp.c @@ -58,9 +58,6 @@ bool hclge_ptp_set_tx_info(struct hnae3_handle *handle, struct sk_buff *skb) struct hclge_dev *hdev = vport->back; struct hclge_ptp *ptp = hdev->ptp; - if (!ptp) - return false; - if (!test_bit(HCLGE_PTP_FLAG_TX_EN, &ptp->flags) || test_and_set_bit(HCLGE_STATE_PTP_TX_HANDLING, &hdev->state)) { ptp->tx_skipped++; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c index 8c057192aae6..43c1c18fa81f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_regs.c @@ -510,9 +510,9 @@ out: static int hclge_fetch_pf_reg(struct hclge_dev *hdev, void *data, struct hnae3_knic_private_info *kinfo) { +#define HCLGE_RING_REG_OFFSET 0x200 #define HCLGE_RING_INT_REG_OFFSET 0x4 - struct hnae3_queue *tqp; int i, j, reg_num; int data_num_sum; u32 *reg = data; @@ -533,11 +533,10 @@ static int hclge_fetch_pf_reg(struct hclge_dev *hdev, void *data, reg_num = ARRAY_SIZE(ring_reg_addr_list); for (j = 0; j < kinfo->num_tqps; j++) { reg += hclge_reg_get_tlv(HCLGE_REG_TAG_RING, reg_num, reg); - tqp = kinfo->tqp[j]; for (i = 0; i < reg_num; i++) - *reg++ = readl_relaxed(tqp->io_base - - HCLGE_TQP_REG_OFFSET + - ring_reg_addr_list[i]); + *reg++ = hclge_read_dev(&hdev->hw, + ring_reg_addr_list[i] + + HCLGE_RING_REG_OFFSET * j); } data_num_sum += (reg_num + HCLGE_REG_TLV_SPACE) * kinfo->num_tqps; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index 896f1eb172d3..094a7c7b5592 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -1395,17 +1395,6 @@ static int hclgevf_notify_roce_client(struct hclgevf_dev *hdev, return ret; } -static void hclgevf_set_reset_pending(struct hclgevf_dev *hdev, - enum hnae3_reset_type reset_type) -{ - /* When an incorrect reset type is executed, the get_reset_level - * function generates the HNAE3_NONE_RESET flag. As a result, this - * type do not need to pending. - */ - if (reset_type != HNAE3_NONE_RESET) - set_bit(reset_type, &hdev->reset_pending); -} - static int hclgevf_reset_wait(struct hclgevf_dev *hdev) { #define HCLGEVF_RESET_WAIT_US 20000 @@ -1555,7 +1544,7 @@ static void hclgevf_reset_err_handle(struct hclgevf_dev *hdev) hdev->rst_stats.rst_fail_cnt); if (hdev->rst_stats.rst_fail_cnt < HCLGEVF_RESET_MAX_FAIL_CNT) - hclgevf_set_reset_pending(hdev, hdev->reset_type); + set_bit(hdev->reset_type, &hdev->reset_pending); if (hclgevf_is_reset_pending(hdev)) { set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); @@ -1675,8 +1664,6 @@ static enum hnae3_reset_type hclgevf_get_reset_level(unsigned long *addr) clear_bit(HNAE3_FLR_RESET, addr); } - clear_bit(HNAE3_NONE_RESET, addr); - return rst_level; } @@ -1686,15 +1673,14 @@ static void hclgevf_reset_event(struct pci_dev *pdev, struct hnae3_ae_dev *ae_dev = pci_get_drvdata(pdev); struct hclgevf_dev *hdev = ae_dev->priv; + dev_info(&hdev->pdev->dev, "received reset request from VF enet\n"); + if (hdev->default_reset_request) hdev->reset_level = hclgevf_get_reset_level(&hdev->default_reset_request); else hdev->reset_level = HNAE3_VF_FUNC_RESET; - dev_info(&hdev->pdev->dev, "received reset request from VF enet, reset level is %d\n", - hdev->reset_level); - /* reset of this VF requested */ set_bit(HCLGEVF_RESET_REQUESTED, &hdev->reset_state); hclgevf_reset_task_schedule(hdev); @@ -1705,20 +1691,8 @@ static void hclgevf_reset_event(struct pci_dev *pdev, static void hclgevf_set_def_reset_request(struct hnae3_ae_dev *ae_dev, enum hnae3_reset_type rst_type) { -#define HCLGEVF_SUPPORT_RESET_TYPE \ - (BIT(HNAE3_VF_RESET) | BIT(HNAE3_VF_FUNC_RESET) | \ - BIT(HNAE3_VF_PF_FUNC_RESET) | BIT(HNAE3_VF_FULL_RESET) | \ - BIT(HNAE3_FLR_RESET) | BIT(HNAE3_VF_EXP_RESET)) - struct hclgevf_dev *hdev = ae_dev->priv; - if (!(BIT(rst_type) & HCLGEVF_SUPPORT_RESET_TYPE)) { - /* To prevent reset triggered by hclge_reset_event */ - set_bit(HNAE3_NONE_RESET, &hdev->default_reset_request); - dev_info(&hdev->pdev->dev, "unsupported reset type %d\n", - rst_type); - return; - } set_bit(rst_type, &hdev->default_reset_request); } @@ -1875,14 +1849,14 @@ static void hclgevf_reset_service_task(struct hclgevf_dev *hdev) */ if (hdev->reset_attempts > HCLGEVF_MAX_RESET_ATTEMPTS_CNT) { /* prepare for full reset of stack + pcie interface */ - hclgevf_set_reset_pending(hdev, HNAE3_VF_FULL_RESET); + set_bit(HNAE3_VF_FULL_RESET, &hdev->reset_pending); /* "defer" schedule the reset task again */ set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); } else { hdev->reset_attempts++; - hclgevf_set_reset_pending(hdev, hdev->reset_level); + set_bit(hdev->reset_level, &hdev->reset_pending); set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); } hclgevf_reset_task_schedule(hdev); @@ -2005,7 +1979,7 @@ static enum hclgevf_evt_cause hclgevf_check_evt_cause(struct hclgevf_dev *hdev, rst_ing_reg = hclgevf_read_dev(&hdev->hw, HCLGEVF_RST_ING); dev_info(&hdev->pdev->dev, "receive reset interrupt 0x%x!\n", rst_ing_reg); - hclgevf_set_reset_pending(hdev, HNAE3_VF_RESET); + set_bit(HNAE3_VF_RESET, &hdev->reset_pending); set_bit(HCLGEVF_RESET_PENDING, &hdev->reset_state); set_bit(HCLGE_COMM_STATE_CMD_DISABLE, &hdev->hw.hw.comm_state); *clearval = ~(1U << HCLGEVF_VECTOR0_RST_INT_B); @@ -2315,7 +2289,6 @@ static void hclgevf_state_init(struct hclgevf_dev *hdev) clear_bit(HCLGEVF_STATE_RST_FAIL, &hdev->state); INIT_DELAYED_WORK(&hdev->service_task, hclgevf_service_task); - timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0); mutex_init(&hdev->mbx_resp.mbx_mutex); sema_init(&hdev->reset_sem, 1); @@ -3015,6 +2988,7 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) HCLGEVF_DRIVER_NAME); hclgevf_task_schedule(hdev, round_jiffies_relative(HZ)); + timer_setup(&hdev->reset_timer, hclgevf_reset_timer, 0); return 0; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c index 7d9d9dbc7560..6db415d8b917 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_regs.c @@ -123,10 +123,10 @@ int hclgevf_get_regs_len(struct hnae3_handle *handle) void hclgevf_get_regs(struct hnae3_handle *handle, u32 *version, void *data) { +#define HCLGEVF_RING_REG_OFFSET 0x200 #define HCLGEVF_RING_INT_REG_OFFSET 0x4 struct hclgevf_dev *hdev = hclgevf_ae_get_hdev(handle); - struct hnae3_queue *tqp; int i, j, reg_um; u32 *reg = data; @@ -147,11 +147,10 @@ void hclgevf_get_regs(struct hnae3_handle *handle, u32 *version, reg_um = ARRAY_SIZE(ring_reg_addr_list); for (j = 0; j < hdev->num_tqps; j++) { reg += hclgevf_reg_get_tlv(HCLGEVF_REG_TAG_RING, reg_um, reg); - tqp = &hdev->htqp[j].q; for (i = 0; i < reg_um; i++) - *reg++ = readl_relaxed(tqp->io_base - - HCLGEVF_TQP_REG_OFFSET + - ring_reg_addr_list[i]); + *reg++ = hclgevf_read_dev(&hdev->hw, + ring_reg_addr_list[i] + + HCLGEVF_RING_REG_OFFSET * j); } reg_um = ARRAY_SIZE(tqp_intr_reg_addr_list); -- 2.51.0 From df3dff8ab6d79edc942464999d06fbaedf8cdd18 Mon Sep 17 00:00:00 2001 From: Peiyang Wang Date: Fri, 1 Nov 2024 17:15:07 +0800 Subject: [PATCH 04/16] net: hns3: fix kernel crash when uninstalling driver When the driver is uninstalled and the VF is disabled concurrently, a kernel crash occurs. The reason is that the two actions call function pci_disable_sriov(). The num_VFs is checked to determine whether to release the corresponding resources. During the second calling, num_VFs is not 0 and the resource release function is called. However, the corresponding resource has been released during the first invoking. Therefore, the problem occurs: [15277.839633][T50670] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000020 ... [15278.131557][T50670] Call trace: [15278.134686][T50670] klist_put+0x28/0x12c [15278.138682][T50670] klist_del+0x14/0x20 [15278.142592][T50670] device_del+0xbc/0x3c0 [15278.146676][T50670] pci_remove_bus_device+0x84/0x120 [15278.151714][T50670] pci_stop_and_remove_bus_device+0x6c/0x80 [15278.157447][T50670] pci_iov_remove_virtfn+0xb4/0x12c [15278.162485][T50670] sriov_disable+0x50/0x11c [15278.166829][T50670] pci_disable_sriov+0x24/0x30 [15278.171433][T50670] hnae3_unregister_ae_algo_prepare+0x60/0x90 [hnae3] [15278.178039][T50670] hclge_exit+0x28/0xd0 [hclge] [15278.182730][T50670] __se_sys_delete_module.isra.0+0x164/0x230 [15278.188550][T50670] __arm64_sys_delete_module+0x1c/0x30 [15278.193848][T50670] invoke_syscall+0x50/0x11c [15278.198278][T50670] el0_svc_common.constprop.0+0x158/0x164 [15278.203837][T50670] do_el0_svc+0x34/0xcc [15278.207834][T50670] el0_svc+0x20/0x30 For details, see the following figure. rmmod hclge disable VFs ---------------------------------------------------- hclge_exit() sriov_numvfs_store() ... device_lock() pci_disable_sriov() hns3_pci_sriov_configure() pci_disable_sriov() sriov_disable() sriov_disable() if !num_VFs : if !num_VFs : return; return; sriov_del_vfs() sriov_del_vfs() ... ... klist_put() klist_put() ... ... num_VFs = 0; num_VFs = 0; device_unlock(); In this patch, when driver is removing, we get the device_lock() to protect num_VFs, just like sriov_numvfs_store(). Fixes: 0dd8a25f355b ("net: hns3: disable sriov before unload hclge layer") Signed-off-by: Peiyang Wang Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241101091507.3644584-1-shaojijie@huawei.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/hisilicon/hns3/hnae3.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.c b/drivers/net/ethernet/hisilicon/hns3/hnae3.c index 67b0bf310daa..9a63fbc69408 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.c +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.c @@ -25,8 +25,11 @@ void hnae3_unregister_ae_algo_prepare(struct hnae3_ae_algo *ae_algo) pci_id = pci_match_id(ae_algo->pdev_id_table, ae_dev->pdev); if (!pci_id) continue; - if (IS_ENABLED(CONFIG_PCI_IOV)) + if (IS_ENABLED(CONFIG_PCI_IOV)) { + device_lock(&ae_dev->pdev->dev); pci_disable_sriov(ae_dev->pdev); + device_unlock(&ae_dev->pdev->dev); + } } } EXPORT_SYMBOL(hnae3_unregister_ae_algo_prepare); -- 2.51.0 From de794169cf1711a98e1e4856c76388e6dadd73a1 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Fri, 1 Nov 2024 12:18:50 +0200 Subject: [PATCH 05/16] net: ethernet: ti: am65-cpsw: Fix multi queue Rx on J7 On J7 platforms, setting up multiple RX flows was failing as the RX free descriptor ring 0 is shared among all flows and we did not allocate enough elements in the RX free descriptor ring 0 to accommodate for all RX flows. This issue is not present on AM62 as separate pair of rings are used for free and completion rings for each flow. Fix this by allocating enough elements for RX free descriptor ring 0. However, we can no longer rely on desc_idx (descriptor based offsets) to identify the pages in the respective flows as free descriptor ring includes elements for all flows. To solve this, introduce a new swdata data structure to store flow_id and page. This can be used to identify which flow (page_pool) and page the descriptor belonged to when popped out of the RX rings. Fixes: da70d184a8c3 ("net: ethernet: ti: am65-cpsw: Introduce multi queue Rx") Signed-off-by: Roger Quadros Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 73 ++++++++++-------------- drivers/net/ethernet/ti/am65-cpsw-nuss.h | 6 +- 2 files changed, 35 insertions(+), 44 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 0520e9f4bea7..70aea654c79f 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -337,9 +337,9 @@ static int am65_cpsw_nuss_rx_push(struct am65_cpsw_common *common, struct am65_cpsw_rx_chn *rx_chn = &common->rx_chns; struct cppi5_host_desc_t *desc_rx; struct device *dev = common->dev; + struct am65_cpsw_swdata *swdata; dma_addr_t desc_dma; dma_addr_t buf_dma; - void *swdata; desc_rx = k3_cppi_desc_pool_alloc(rx_chn->desc_pool); if (!desc_rx) { @@ -363,7 +363,8 @@ static int am65_cpsw_nuss_rx_push(struct am65_cpsw_common *common, cppi5_hdesc_attach_buf(desc_rx, buf_dma, AM65_CPSW_MAX_PACKET_SIZE, buf_dma, AM65_CPSW_MAX_PACKET_SIZE); swdata = cppi5_hdesc_get_swdata(desc_rx); - *((void **)swdata) = page_address(page); + swdata->page = page; + swdata->flow_id = flow_idx; return k3_udma_glue_push_rx_chn(rx_chn->rx_chn, flow_idx, desc_rx, desc_dma); @@ -519,36 +520,31 @@ static enum am65_cpsw_tx_buf_type am65_cpsw_nuss_buf_type(struct am65_cpsw_tx_ch static inline void am65_cpsw_put_page(struct am65_cpsw_rx_flow *flow, struct page *page, - bool allow_direct, - int desc_idx) + bool allow_direct) { page_pool_put_full_page(flow->page_pool, page, allow_direct); - flow->pages[desc_idx] = NULL; } static void am65_cpsw_nuss_rx_cleanup(void *data, dma_addr_t desc_dma) { - struct am65_cpsw_rx_flow *flow = data; + struct am65_cpsw_rx_chn *rx_chn = data; struct cppi5_host_desc_t *desc_rx; - struct am65_cpsw_rx_chn *rx_chn; + struct am65_cpsw_swdata *swdata; dma_addr_t buf_dma; + struct page *page; u32 buf_dma_len; - void *page_addr; - void **swdata; - int desc_idx; + u32 flow_id; - rx_chn = &flow->common->rx_chns; desc_rx = k3_cppi_desc_pool_dma2virt(rx_chn->desc_pool, desc_dma); swdata = cppi5_hdesc_get_swdata(desc_rx); - page_addr = *swdata; + page = swdata->page; + flow_id = swdata->flow_id; cppi5_hdesc_get_obuf(desc_rx, &buf_dma, &buf_dma_len); k3_udma_glue_rx_cppi5_to_dma_addr(rx_chn->rx_chn, &buf_dma); dma_unmap_single(rx_chn->dma_dev, buf_dma, buf_dma_len, DMA_FROM_DEVICE); k3_cppi_desc_pool_free(rx_chn->desc_pool, desc_rx); - desc_idx = am65_cpsw_nuss_desc_idx(rx_chn->desc_pool, desc_rx, - rx_chn->dsize_log2); - am65_cpsw_put_page(flow, virt_to_page(page_addr), false, desc_idx); + am65_cpsw_put_page(&rx_chn->flows[flow_id], page, false); } static void am65_cpsw_nuss_xmit_free(struct am65_cpsw_tx_chn *tx_chn, @@ -703,14 +699,13 @@ static int am65_cpsw_nuss_common_open(struct am65_cpsw_common *common) ret = -ENOMEM; goto fail_rx; } - flow->pages[i] = page; ret = am65_cpsw_nuss_rx_push(common, page, flow_idx); if (ret < 0) { dev_err(common->dev, "cannot submit page to rx channel flow %d, error %d\n", flow_idx, ret); - am65_cpsw_put_page(flow, page, false, i); + am65_cpsw_put_page(flow, page, false); goto fail_rx; } } @@ -764,8 +759,8 @@ fail_tx: fail_rx: for (i = 0; i < common->rx_ch_num_flows; i++) - k3_udma_glue_reset_rx_chn(rx_chn->rx_chn, i, &rx_chn->flows[i], - am65_cpsw_nuss_rx_cleanup, 0); + k3_udma_glue_reset_rx_chn(rx_chn->rx_chn, i, rx_chn, + am65_cpsw_nuss_rx_cleanup, !!i); am65_cpsw_destroy_xdp_rxqs(common); @@ -817,11 +812,11 @@ static int am65_cpsw_nuss_common_stop(struct am65_cpsw_common *common) dev_err(common->dev, "rx teardown timeout\n"); } - for (i = 0; i < common->rx_ch_num_flows; i++) { + for (i = common->rx_ch_num_flows - 1; i >= 0; i--) { napi_disable(&rx_chn->flows[i].napi_rx); hrtimer_cancel(&rx_chn->flows[i].rx_hrtimer); - k3_udma_glue_reset_rx_chn(rx_chn->rx_chn, i, &rx_chn->flows[i], - am65_cpsw_nuss_rx_cleanup, 0); + k3_udma_glue_reset_rx_chn(rx_chn->rx_chn, i, rx_chn, + am65_cpsw_nuss_rx_cleanup, !!i); } k3_udma_glue_disable_rx_chn(rx_chn->rx_chn); @@ -1028,7 +1023,7 @@ pool_free: static int am65_cpsw_run_xdp(struct am65_cpsw_rx_flow *flow, struct am65_cpsw_port *port, struct xdp_buff *xdp, - int desc_idx, int cpu, int *len) + int cpu, int *len) { struct am65_cpsw_common *common = flow->common; struct am65_cpsw_ndev_priv *ndev_priv; @@ -1101,7 +1096,7 @@ drop: } page = virt_to_head_page(xdp->data); - am65_cpsw_put_page(flow, page, true, desc_idx); + am65_cpsw_put_page(flow, page, true); out: return ret; @@ -1150,16 +1145,16 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, struct am65_cpsw_ndev_stats *stats; struct cppi5_host_desc_t *desc_rx; struct device *dev = common->dev; + struct am65_cpsw_swdata *swdata; struct page *page, *new_page; dma_addr_t desc_dma, buf_dma; struct am65_cpsw_port *port; - int headroom, desc_idx, ret; struct net_device *ndev; u32 flow_idx = flow->id; struct sk_buff *skb; struct xdp_buff xdp; + int headroom, ret; void *page_addr; - void **swdata; u32 *psdata; *xdp_state = AM65_CPSW_XDP_PASS; @@ -1182,8 +1177,8 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, __func__, flow_idx, &desc_dma); swdata = cppi5_hdesc_get_swdata(desc_rx); - page_addr = *swdata; - page = virt_to_page(page_addr); + page = swdata->page; + page_addr = page_address(page); cppi5_hdesc_get_obuf(desc_rx, &buf_dma, &buf_dma_len); k3_udma_glue_rx_cppi5_to_dma_addr(rx_chn->rx_chn, &buf_dma); pkt_len = cppi5_hdesc_get_pktlen(desc_rx); @@ -1199,9 +1194,6 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, k3_cppi_desc_pool_free(rx_chn->desc_pool, desc_rx); - desc_idx = am65_cpsw_nuss_desc_idx(rx_chn->desc_pool, desc_rx, - rx_chn->dsize_log2); - skb = am65_cpsw_build_skb(page_addr, ndev, AM65_CPSW_MAX_PACKET_SIZE); if (unlikely(!skb)) { @@ -1213,7 +1205,7 @@ static int am65_cpsw_nuss_rx_packets(struct am65_cpsw_rx_flow *flow, xdp_init_buff(&xdp, PAGE_SIZE, &port->xdp_rxq[flow->id]); xdp_prepare_buff(&xdp, page_addr, AM65_CPSW_HEADROOM, pkt_len, false); - *xdp_state = am65_cpsw_run_xdp(flow, port, &xdp, desc_idx, + *xdp_state = am65_cpsw_run_xdp(flow, port, &xdp, cpu, &pkt_len); if (*xdp_state != AM65_CPSW_XDP_PASS) goto allocate; @@ -1247,10 +1239,8 @@ allocate: return -ENOMEM; } - flow->pages[desc_idx] = new_page; - if (netif_dormant(ndev)) { - am65_cpsw_put_page(flow, new_page, true, desc_idx); + am65_cpsw_put_page(flow, new_page, true); ndev->stats.rx_dropped++; return 0; } @@ -1258,7 +1248,7 @@ allocate: requeue: ret = am65_cpsw_nuss_rx_push(common, new_page, flow_idx); if (WARN_ON(ret < 0)) { - am65_cpsw_put_page(flow, new_page, true, desc_idx); + am65_cpsw_put_page(flow, new_page, true); ndev->stats.rx_errors++; ndev->stats.rx_dropped++; } @@ -2402,10 +2392,6 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common) for (i = 0; i < common->rx_ch_num_flows; i++) { flow = &rx_chn->flows[i]; flow->page_pool = NULL; - flow->pages = devm_kcalloc(dev, AM65_CPSW_MAX_RX_DESC, - sizeof(*flow->pages), GFP_KERNEL); - if (!flow->pages) - return -ENOMEM; } rx_chn->rx_chn = k3_udma_glue_request_rx_chn(dev, "rx", &rx_cfg); @@ -2458,7 +2444,8 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common) rx_flow_cfg.ring_rxfdq0_id = fdqring_id; rx_flow_cfg.rx_cfg.size = max_desc_num; - rx_flow_cfg.rxfdq_cfg.size = max_desc_num; + /* share same FDQ for all flows */ + rx_flow_cfg.rxfdq_cfg.size = max_desc_num * rx_cfg.flow_id_num; rx_flow_cfg.rxfdq_cfg.mode = common->pdata.fdqring_mode; ret = k3_udma_glue_rx_flow_init(rx_chn->rx_chn, @@ -3349,8 +3336,8 @@ static int am65_cpsw_nuss_register_ndevs(struct am65_cpsw_common *common) for (i = 0; i < common->rx_ch_num_flows; i++) k3_udma_glue_reset_rx_chn(rx_chan->rx_chn, i, - &rx_chan->flows[i], - am65_cpsw_nuss_rx_cleanup, 0); + rx_chan, + am65_cpsw_nuss_rx_cleanup, !!i); k3_udma_glue_disable_rx_chn(rx_chan->rx_chn); diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.h b/drivers/net/ethernet/ti/am65-cpsw-nuss.h index dc8d544230dc..92a27ba4c601 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.h +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.h @@ -101,10 +101,14 @@ struct am65_cpsw_rx_flow { struct hrtimer rx_hrtimer; unsigned long rx_pace_timeout; struct page_pool *page_pool; - struct page **pages; char name[32]; }; +struct am65_cpsw_swdata { + u32 flow_id; + struct page *page; +}; + struct am65_cpsw_rx_chn { struct device *dev; struct device *dma_dev; -- 2.51.0 From ba3b7ac4f7143568ed6480180a847dc752780ece Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Fri, 1 Nov 2024 12:18:51 +0200 Subject: [PATCH 06/16] net: ethernet: ti: am65-cpsw: fix warning in am65_cpsw_nuss_remove_rx_chns() flow->irq is initialized to 0 which is a valid IRQ. Set it to -EINVAL in error path of am65_cpsw_nuss_init_rx_chns() so we do not try to free an unallocated IRQ in am65_cpsw_nuss_remove_rx_chns(). If user tried to change number of RX queues and am65_cpsw_nuss_init_rx_chns() failed due to any reason, the warning will happen if user tries to change the number of RX queues after the error condition. root@am62xx-evm:~# ethtool -L eth0 rx 3 [ 40.385293] am65-cpsw-nuss 8000000.ethernet: set new flow-id-base 19 [ 40.393211] am65-cpsw-nuss 8000000.ethernet: Failed to init rx flow2 netlink error: Invalid argument root@am62xx-evm:~# ethtool -L eth0 rx 2 [ 82.306427] ------------[ cut here ]------------ [ 82.311075] WARNING: CPU: 0 PID: 378 at kernel/irq/devres.c:144 devm_free_irq+0x84/0x90 [ 82.469770] Call trace: [ 82.472208] devm_free_irq+0x84/0x90 [ 82.475777] am65_cpsw_nuss_remove_rx_chns+0x6c/0xac [ti_am65_cpsw_nuss] [ 82.482487] am65_cpsw_nuss_update_tx_rx_chns+0x2c/0x9c [ti_am65_cpsw_nuss] [ 82.489442] am65_cpsw_set_channels+0x30/0x4c [ti_am65_cpsw_nuss] [ 82.495531] ethnl_set_channels+0x224/0x2dc [ 82.499713] ethnl_default_set_doit+0xb8/0x1b8 [ 82.504149] genl_family_rcv_msg_doit+0xc0/0x124 [ 82.508757] genl_rcv_msg+0x1f0/0x284 [ 82.512409] netlink_rcv_skb+0x58/0x130 [ 82.516239] genl_rcv+0x38/0x50 [ 82.519374] netlink_unicast+0x1d0/0x2b0 [ 82.523289] netlink_sendmsg+0x180/0x3c4 [ 82.527205] __sys_sendto+0xe4/0x158 [ 82.530779] __arm64_sys_sendto+0x28/0x38 [ 82.534782] invoke_syscall+0x44/0x100 [ 82.538526] el0_svc_common.constprop.0+0xc0/0xe0 [ 82.543221] do_el0_svc+0x1c/0x28 [ 82.546528] el0_svc+0x28/0x98 [ 82.549578] el0t_64_sync_handler+0xc0/0xc4 [ 82.553752] el0t_64_sync+0x190/0x194 [ 82.557407] ---[ end trace 0000000000000000 ]--- Fixes: da70d184a8c3 ("net: ethernet: ti: am65-cpsw: Introduce multi queue Rx") Signed-off-by: Roger Quadros Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 70aea654c79f..ba6db61dd227 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2441,6 +2441,7 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common) flow = &rx_chn->flows[i]; flow->id = i; flow->common = common; + flow->irq = -EINVAL; rx_flow_cfg.ring_rxfdq0_id = fdqring_id; rx_flow_cfg.rx_cfg.size = max_desc_num; @@ -2483,6 +2484,7 @@ static int am65_cpsw_nuss_init_rx_chns(struct am65_cpsw_common *common) if (ret) { dev_err(dev, "failure requesting rx %d irq %u, %d\n", i, flow->irq, ret); + flow->irq = -EINVAL; goto err; } } -- 2.51.0 From 256748d5480bb3c4b731236c6d6fc86a8e2815d8 Mon Sep 17 00:00:00 2001 From: Diogo Silva Date: Sat, 2 Nov 2024 16:15:05 +0100 Subject: [PATCH 07/16] net: phy: ti: add PHY_RST_AFTER_CLK_EN flag DP83848 datasheet (section 4.7.2) indicates that the reset pin should be toggled after the clocks are running. Add the PHY_RST_AFTER_CLK_EN to make sure that this indication is respected. In my experience not having this flag enabled would lead to, on some boots, the wrong MII mode being selected if the PHY was initialized on the bootloader and was receiving data during Linux boot. Signed-off-by: Diogo Silva Reviewed-by: Andrew Lunn Fixes: 34e45ad9378c ("net: phy: dp83848: Add TI DP83848 Ethernet PHY") Link: https://patch.msgid.link/20241102151504.811306-1-paissilva@ld-100007.ds1.internal Signed-off-by: Jakub Kicinski --- drivers/net/phy/dp83848.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/dp83848.c b/drivers/net/phy/dp83848.c index 937061acfc61..351411f0aa6f 100644 --- a/drivers/net/phy/dp83848.c +++ b/drivers/net/phy/dp83848.c @@ -147,6 +147,8 @@ MODULE_DEVICE_TABLE(mdio, dp83848_tbl); /* IRQ related */ \ .config_intr = dp83848_config_intr, \ .handle_interrupt = dp83848_handle_interrupt, \ + \ + .flags = PHY_RST_AFTER_CLK_EN, \ } static struct phy_driver dp83848_driver[] = { -- 2.51.0 From cfbbd4859882a5469f6f4945937a074ee78c4b46 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Mon, 4 Nov 2024 13:31:41 +0100 Subject: [PATCH 08/16] mptcp: no admin perm to list endpoints During the switch to YNL, the command to list all endpoints has been accidentally restricted to users with admin permissions. It looks like there are no reasons to have this restriction which makes it harder for a user to quickly check if the endpoint list has been correctly populated by an automated tool. Best to go back to the previous behaviour then. mptcp_pm_gen.c has been modified using ynl-gen-c.py: $ ./tools/net/ynl/ynl-gen-c.py --mode kernel \ --spec Documentation/netlink/specs/mptcp_pm.yaml --source \ -o net/mptcp/mptcp_pm_gen.c The header file doesn't need to be regenerated. Fixes: 1d0507f46843 ("net: mptcp: convert netlink from small_ops to ops") Cc: stable@vger.kernel.org Reviewed-by: Davide Caratti Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241104-net-mptcp-misc-6-12-v1-1-c13f2ff1656f@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/mptcp_pm.yaml | 1 - net/mptcp/mptcp_pm_gen.c | 1 - 2 files changed, 2 deletions(-) diff --git a/Documentation/netlink/specs/mptcp_pm.yaml b/Documentation/netlink/specs/mptcp_pm.yaml index 30d8342cacc8..dc190bf838fe 100644 --- a/Documentation/netlink/specs/mptcp_pm.yaml +++ b/Documentation/netlink/specs/mptcp_pm.yaml @@ -293,7 +293,6 @@ operations: doc: Get endpoint information attribute-set: attr dont-validate: [ strict ] - flags: [ uns-admin-perm ] do: &get-addr-attrs request: attributes: diff --git a/net/mptcp/mptcp_pm_gen.c b/net/mptcp/mptcp_pm_gen.c index c30a2a90a192..bfb37c5a88c4 100644 --- a/net/mptcp/mptcp_pm_gen.c +++ b/net/mptcp/mptcp_pm_gen.c @@ -112,7 +112,6 @@ const struct genl_ops mptcp_pm_nl_ops[11] = { .dumpit = mptcp_pm_nl_get_addr_dumpit, .policy = mptcp_pm_get_addr_nl_policy, .maxattr = MPTCP_PM_ATTR_TOKEN, - .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_FLUSH_ADDRS, -- 2.51.0 From 99635c91fb8b860a6404b9bc8b769df7bdaa2ae3 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 4 Nov 2024 13:31:42 +0100 Subject: [PATCH 09/16] mptcp: use sock_kfree_s instead of kfree The local address entries on userspace_pm_local_addr_list are allocated by sock_kmalloc(). It's then required to use sock_kfree_s() instead of kfree() to free these entries in order to adjust the allocated size on the sk side. Fixes: 24430f8bf516 ("mptcp: add address into userspace pm list") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241104-net-mptcp-misc-6-12-v1-2-c13f2ff1656f@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 2cceded3a83a..56dfea9862b7 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -91,6 +91,7 @@ static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, struct mptcp_pm_addr_entry *addr) { struct mptcp_pm_addr_entry *entry, *tmp; + struct sock *sk = (struct sock *)msk; list_for_each_entry_safe(entry, tmp, &msk->pm.userspace_pm_local_addr_list, list) { if (mptcp_addresses_equal(&entry->addr, &addr->addr, false)) { @@ -98,7 +99,7 @@ static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, * be used multiple times (e.g. fullmesh mode). */ list_del_rcu(&entry->list); - kfree(entry); + sock_kfree_s(sk, entry, sizeof(*entry)); msk->pm.local_addr_used--; return 0; } -- 2.51.0 From 1f26339b2ed63d1e8e18a18674fb73a392f3660e Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Tue, 5 Nov 2024 17:31:01 +0100 Subject: [PATCH 10/16] net: vertexcom: mse102x: Fix possible double free of TX skb The scope of the TX skb is wider than just mse102x_tx_frame_spi(), so in case the TX skb room needs to be expanded, we should free the the temporary skb instead of the original skb. Otherwise the original TX skb pointer would be freed again in mse102x_tx_work(), which leads to crashes: Internal error: Oops: 0000000096000004 [#2] PREEMPT SMP CPU: 0 PID: 712 Comm: kworker/0:1 Tainted: G D 6.6.23 Hardware name: chargebyte Charge SOM DC-ONE (DT) Workqueue: events mse102x_tx_work [mse102x] pstate: 20400009 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : skb_release_data+0xb8/0x1d8 lr : skb_release_data+0x1ac/0x1d8 sp : ffff8000819a3cc0 x29: ffff8000819a3cc0 x28: ffff0000046daa60 x27: ffff0000057f2dc0 x26: ffff000005386c00 x25: 0000000000000002 x24: 00000000ffffffff x23: 0000000000000000 x22: 0000000000000001 x21: ffff0000057f2e50 x20: 0000000000000006 x19: 0000000000000000 x18: ffff00003fdacfcc x17: e69ad452d0c49def x16: 84a005feff870102 x15: 0000000000000000 x14: 000000000000024a x13: 0000000000000002 x12: 0000000000000000 x11: 0000000000000400 x10: 0000000000000930 x9 : ffff00003fd913e8 x8 : fffffc00001bc008 x7 : 0000000000000000 x6 : 0000000000000008 x5 : ffff00003fd91340 x4 : 0000000000000000 x3 : 0000000000000009 x2 : 00000000fffffffe x1 : 0000000000000000 x0 : 0000000000000000 Call trace: skb_release_data+0xb8/0x1d8 kfree_skb_reason+0x48/0xb0 mse102x_tx_work+0x164/0x35c [mse102x] process_one_work+0x138/0x260 worker_thread+0x32c/0x438 kthread+0x118/0x11c ret_from_fork+0x10/0x20 Code: aa1303e0 97fffab6 72001c1f 54000141 (f9400660) Cc: stable@vger.kernel.org Fixes: 2f207cbf0dd4 ("net: vertexcom: Add MSE102x SPI support") Signed-off-by: Stefan Wahren Link: https://patch.msgid.link/20241105163101.33216-1-wahrenst@gmx.net Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/vertexcom/mse102x.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/vertexcom/mse102x.c b/drivers/net/ethernet/vertexcom/mse102x.c index a04d4073def9..2c37957478fb 100644 --- a/drivers/net/ethernet/vertexcom/mse102x.c +++ b/drivers/net/ethernet/vertexcom/mse102x.c @@ -222,7 +222,7 @@ static int mse102x_tx_frame_spi(struct mse102x_net *mse, struct sk_buff *txp, struct mse102x_net_spi *mses = to_mse102x_spi(mse); struct spi_transfer *xfer = &mses->spi_xfer; struct spi_message *msg = &mses->spi_msg; - struct sk_buff *tskb; + struct sk_buff *tskb = NULL; int ret; netif_dbg(mse, tx_queued, mse->ndev, "%s: skb %p, %d@%p\n", @@ -235,7 +235,6 @@ static int mse102x_tx_frame_spi(struct mse102x_net *mse, struct sk_buff *txp, if (!tskb) return -ENOMEM; - dev_kfree_skb(txp); txp = tskb; } @@ -257,6 +256,8 @@ static int mse102x_tx_frame_spi(struct mse102x_net *mse, struct sk_buff *txp, mse->stats.xfer_err++; } + dev_kfree_skb(tskb); + return ret; } -- 2.51.0 From 25d70702142ac2115e75e01a0a985c6ea1d78033 Mon Sep 17 00:00:00 2001 From: =?utf8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Fri, 1 Nov 2024 17:17:29 -0400 Subject: [PATCH 11/16] net: stmmac: Fix unbalanced IRQ wake disable warning on single irq case MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Commit a23aa0404218 ("net: stmmac: ethtool: Fixed calltrace caused by unbalanced disable_irq_wake calls") introduced checks to prevent unbalanced enable and disable IRQ wake calls. However it only initialized the auxiliary variable on one of the paths, stmmac_request_irq_multi_msi(), missing the other, stmmac_request_irq_single(). Add the same initialization on stmmac_request_irq_single() to prevent "Unbalanced IRQ wake disable" warnings from being printed the first time disable_irq_wake() is called on platforms that run on that code path. Fixes: a23aa0404218 ("net: stmmac: ethtool: Fixed calltrace caused by unbalanced disable_irq_wake calls") Signed-off-by: Nícolas F. R. A. Prado Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241101-stmmac-unbalanced-wake-single-fix-v1-1-5952524c97f0@collabora.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 208dbc68aaf9..7bf275f127c9 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -3780,6 +3780,7 @@ static int stmmac_request_irq_single(struct net_device *dev) /* Request the Wake IRQ in case of another line * is used for WoL */ + priv->wol_irq_disabled = true; if (priv->wol_irq > 0 && priv->wol_irq != dev->irq) { ret = request_irq(priv->wol_irq, stmmac_interrupt, IRQF_SHARED, dev->name, dev); -- 2.51.0 From 86a48a00efdf61197b6658e52c6140463eb313dc Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Mon, 4 Nov 2024 16:57:03 +0800 Subject: [PATCH 12/16] virtio_net: Support dynamic rss indirection table size When reading/writing virtio_net_ctrl_rss, we get the indirection table size from vi->rss_indir_table_size, which is initialized in virtnet_probe(). However, the actual size of indirection_table was set as VIRTIO_NET_RSS_MAX_TABLE_LEN=128. This collision may cause issues if the vi->rss_indir_table_size exceeds 128. This patch instead uses dynamic indirection table, allocated with vi->rss after vi->rss_indir_table_size initialized. And free it in virtnet_remove(). In virtnet_commit_rss_command(), sgs for rss is initialized differently with hash_report. So indirection_table is not used if !vi->has_rss, and then we don't need to alloc indirection_table for hash_report only uses. Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.") Signed-off-by: Philo Lu Signed-off-by: Xuan Zhuo Acked-by: Joe Damato Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 792e9eadbfc3..4b507007d242 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -368,15 +368,16 @@ struct receive_queue { * because table sizes may be differ according to the device configuration. */ #define VIRTIO_NET_RSS_MAX_KEY_SIZE 40 -#define VIRTIO_NET_RSS_MAX_TABLE_LEN 128 struct virtio_net_ctrl_rss { u32 hash_types; u16 indirection_table_mask; u16 unclassified_queue; - u16 indirection_table[VIRTIO_NET_RSS_MAX_TABLE_LEN]; + u16 hash_cfg_reserved; /* for HASH_CONFIG (see virtio_net_hash_config for details) */ u16 max_tx_vq; u8 hash_key_length; u8 key[VIRTIO_NET_RSS_MAX_KEY_SIZE]; + + u16 *indirection_table; }; /* Control VQ buffers: protected by the rtnl lock */ @@ -512,6 +513,25 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb, struct page *page, void *buf, int len, int truesize); +static int rss_indirection_table_alloc(struct virtio_net_ctrl_rss *rss, u16 indir_table_size) +{ + if (!indir_table_size) { + rss->indirection_table = NULL; + return 0; + } + + rss->indirection_table = kmalloc_array(indir_table_size, sizeof(u16), GFP_KERNEL); + if (!rss->indirection_table) + return -ENOMEM; + + return 0; +} + +static void rss_indirection_table_free(struct virtio_net_ctrl_rss *rss) +{ + kfree(rss->indirection_table); +} + static bool is_xdp_frame(void *ptr) { return (unsigned long)ptr & VIRTIO_XDP_FLAG; @@ -3828,11 +3848,15 @@ static bool virtnet_commit_rss_command(struct virtnet_info *vi) /* prepare sgs */ sg_init_table(sgs, 4); - sg_buf_size = offsetof(struct virtio_net_ctrl_rss, indirection_table); + sg_buf_size = offsetof(struct virtio_net_ctrl_rss, hash_cfg_reserved); sg_set_buf(&sgs[0], &vi->rss, sg_buf_size); - sg_buf_size = sizeof(uint16_t) * (vi->rss.indirection_table_mask + 1); - sg_set_buf(&sgs[1], vi->rss.indirection_table, sg_buf_size); + if (vi->has_rss) { + sg_buf_size = sizeof(uint16_t) * vi->rss_indir_table_size; + sg_set_buf(&sgs[1], vi->rss.indirection_table, sg_buf_size); + } else { + sg_set_buf(&sgs[1], &vi->rss.hash_cfg_reserved, sizeof(uint16_t)); + } sg_buf_size = offsetof(struct virtio_net_ctrl_rss, key) - offsetof(struct virtio_net_ctrl_rss, max_tx_vq); @@ -6420,6 +6444,9 @@ static int virtnet_probe(struct virtio_device *vdev) virtio_cread16(vdev, offsetof(struct virtio_net_config, rss_max_indirection_table_length)); } + err = rss_indirection_table_alloc(&vi->rss, vi->rss_indir_table_size); + if (err) + goto free; if (vi->has_rss || vi->has_rss_hash_report) { vi->rss_key_size = @@ -6674,6 +6701,8 @@ static void virtnet_remove(struct virtio_device *vdev) remove_vq_common(vi); + rss_indirection_table_free(&vi->rss); + free_netdev(vi->dev); } -- 2.51.0 From 3f7d9c1964fcd16d02a8a9d4fd6f6cb60c4cc530 Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Mon, 4 Nov 2024 16:57:04 +0800 Subject: [PATCH 13/16] virtio_net: Add hash_key_length check Add hash_key_length check in virtnet_probe() to avoid possible out of bound errors when setting/reading the hash key. Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.") Signed-off-by: Philo Lu Signed-off-by: Xuan Zhuo Acked-by: Joe Damato Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 4b507007d242..545dda8ec077 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -6451,6 +6451,12 @@ static int virtnet_probe(struct virtio_device *vdev) if (vi->has_rss || vi->has_rss_hash_report) { vi->rss_key_size = virtio_cread8(vdev, offsetof(struct virtio_net_config, rss_max_key_size)); + if (vi->rss_key_size > VIRTIO_NET_RSS_MAX_KEY_SIZE) { + dev_err(&vdev->dev, "rss_max_key_size=%u exceeds the limit %u.\n", + vi->rss_key_size, VIRTIO_NET_RSS_MAX_KEY_SIZE); + err = -EINVAL; + goto free; + } vi->rss_hash_types_supported = virtio_cread32(vdev, offsetof(struct virtio_net_config, supported_hash_types)); -- 2.51.0 From dc749b7b06082ccaacc602e724445da19cd03e9f Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Mon, 4 Nov 2024 16:57:05 +0800 Subject: [PATCH 14/16] virtio_net: Sync rss config to device when virtnet_probe During virtnet_probe, default rss configuration is initialized, but was not committed to the device. This patch fix this by sending rss command after device ready in virtnet_probe. Otherwise, the actual rss configuration used by device can be different with that read by user from driver, which may confuse the user. If the command committing fails, driver rss will be disabled. Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.") Signed-off-by: Philo Lu Signed-off-by: Xuan Zhuo Acked-by: Joe Damato Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 545dda8ec077..b3232b8baa25 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -6584,6 +6584,15 @@ static int virtnet_probe(struct virtio_device *vdev) virtio_device_ready(vdev); + if (vi->has_rss || vi->has_rss_hash_report) { + if (!virtnet_commit_rss_command(vi)) { + dev_warn(&vdev->dev, "RSS disabled because committing failed.\n"); + dev->hw_features &= ~NETIF_F_RXHASH; + vi->has_rss_hash_report = false; + vi->has_rss = false; + } + } + virtnet_set_queues(vi, vi->curr_queue_pairs); /* a random MAC address has been assigned, notify the device. -- 2.51.0 From 50bfcaedd78e53135ec0504302269b3b65bf1eff Mon Sep 17 00:00:00 2001 From: Philo Lu Date: Mon, 4 Nov 2024 16:57:06 +0800 Subject: [PATCH 15/16] virtio_net: Update rss when set queue RSS configuration should be updated with queue number. In particular, it should be updated when (1) rss enabled and (2) default rss configuration is used without user modification. During rss command processing, device updates queue_pairs using rss.max_tx_vq. That is, the device updates queue_pairs together with rss, so we can skip the sperate queue_pairs update (VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET below) and return directly. Also remove the `vi->has_rss ?` check when setting vi->rss.max_tx_vq, because this is not used in the other hash_report case. Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.") Signed-off-by: Philo Lu Signed-off-by: Xuan Zhuo Acked-by: Michael S. Tsirkin Signed-off-by: Paolo Abeni --- drivers/net/virtio_net.c | 65 +++++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 14 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index b3232b8baa25..53a038fcbe99 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3394,15 +3394,59 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi) dev_warn(&vi->dev->dev, "Failed to ack link announce.\n"); } +static bool virtnet_commit_rss_command(struct virtnet_info *vi); + +static void virtnet_rss_update_by_qpairs(struct virtnet_info *vi, u16 queue_pairs) +{ + u32 indir_val = 0; + int i = 0; + + for (; i < vi->rss_indir_table_size; ++i) { + indir_val = ethtool_rxfh_indir_default(i, queue_pairs); + vi->rss.indirection_table[i] = indir_val; + } + vi->rss.max_tx_vq = queue_pairs; +} + static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) { struct virtio_net_ctrl_mq *mq __free(kfree) = NULL; - struct scatterlist sg; + struct virtio_net_ctrl_rss old_rss; struct net_device *dev = vi->dev; + struct scatterlist sg; if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ)) return 0; + /* Firstly check if we need update rss. Do updating if both (1) rss enabled and + * (2) no user configuration. + * + * During rss command processing, device updates queue_pairs using rss.max_tx_vq. That is, + * the device updates queue_pairs together with rss, so we can skip the sperate queue_pairs + * update (VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET below) and return directly. + */ + if (vi->has_rss && !netif_is_rxfh_configured(dev)) { + memcpy(&old_rss, &vi->rss, sizeof(old_rss)); + if (rss_indirection_table_alloc(&vi->rss, vi->rss_indir_table_size)) { + vi->rss.indirection_table = old_rss.indirection_table; + return -ENOMEM; + } + + virtnet_rss_update_by_qpairs(vi, queue_pairs); + + if (!virtnet_commit_rss_command(vi)) { + /* restore ctrl_rss if commit_rss_command failed */ + rss_indirection_table_free(&vi->rss); + memcpy(&vi->rss, &old_rss, sizeof(old_rss)); + + dev_warn(&dev->dev, "Fail to set num of queue pairs to %d, because committing RSS failed\n", + queue_pairs); + return -EINVAL; + } + rss_indirection_table_free(&old_rss); + goto succ; + } + mq = kzalloc(sizeof(*mq), GFP_KERNEL); if (!mq) return -ENOMEM; @@ -3415,12 +3459,12 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs) dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n", queue_pairs); return -EINVAL; - } else { - vi->curr_queue_pairs = queue_pairs; - /* virtnet_open() will refill when device is going to up. */ - if (dev->flags & IFF_UP) - schedule_delayed_work(&vi->refill, 0); } +succ: + vi->curr_queue_pairs = queue_pairs; + /* virtnet_open() will refill when device is going to up. */ + if (dev->flags & IFF_UP) + schedule_delayed_work(&vi->refill, 0); return 0; } @@ -3880,21 +3924,14 @@ err: static void virtnet_init_default_rss(struct virtnet_info *vi) { - u32 indir_val = 0; - int i = 0; - vi->rss.hash_types = vi->rss_hash_types_supported; vi->rss_hash_types_saved = vi->rss_hash_types_supported; vi->rss.indirection_table_mask = vi->rss_indir_table_size ? vi->rss_indir_table_size - 1 : 0; vi->rss.unclassified_queue = 0; - for (; i < vi->rss_indir_table_size; ++i) { - indir_val = ethtool_rxfh_indir_default(i, vi->curr_queue_pairs); - vi->rss.indirection_table[i] = indir_val; - } + virtnet_rss_update_by_qpairs(vi, vi->curr_queue_pairs); - vi->rss.max_tx_vq = vi->has_rss ? vi->curr_queue_pairs : 0; vi->rss.hash_key_length = vi->rss_key_size; netdev_rss_key_fill(vi->rss.key, vi->rss_key_size); -- 2.51.0 From 71803c1dfa29e0d13b99e48fda11107cc8caebc7 Mon Sep 17 00:00:00 2001 From: Johan Jonker Date: Mon, 4 Nov 2024 21:01:38 +0800 Subject: [PATCH 16/16] net: arc: fix the device for dma_map_single/dma_unmap_single The ndev->dev and pdev->dev aren't the same device, use ndev->dev.parent which has dma_mask, ndev->dev.parent is just pdev->dev. Or it would cause the following issue: [ 39.933526] ------------[ cut here ]------------ [ 39.938414] WARNING: CPU: 1 PID: 501 at kernel/dma/mapping.c:149 dma_map_page_attrs+0x90/0x1f8 Fixes: f959dcd6ddfd ("dma-direct: Fix potential NULL pointer dereference") Signed-off-by: David Wu Signed-off-by: Johan Jonker Signed-off-by: Andy Yan Signed-off-by: Paolo Abeni --- drivers/net/ethernet/arc/emac_main.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/arc/emac_main.c b/drivers/net/ethernet/arc/emac_main.c index 31ee477dd131..8283aeee35fb 100644 --- a/drivers/net/ethernet/arc/emac_main.c +++ b/drivers/net/ethernet/arc/emac_main.c @@ -111,6 +111,7 @@ static void arc_emac_tx_clean(struct net_device *ndev) { struct arc_emac_priv *priv = netdev_priv(ndev); struct net_device_stats *stats = &ndev->stats; + struct device *dev = ndev->dev.parent; unsigned int i; for (i = 0; i < TX_BD_NUM; i++) { @@ -140,7 +141,7 @@ static void arc_emac_tx_clean(struct net_device *ndev) stats->tx_bytes += skb->len; } - dma_unmap_single(&ndev->dev, dma_unmap_addr(tx_buff, addr), + dma_unmap_single(dev, dma_unmap_addr(tx_buff, addr), dma_unmap_len(tx_buff, len), DMA_TO_DEVICE); /* return the sk_buff to system */ @@ -174,6 +175,7 @@ static void arc_emac_tx_clean(struct net_device *ndev) static int arc_emac_rx(struct net_device *ndev, int budget) { struct arc_emac_priv *priv = netdev_priv(ndev); + struct device *dev = ndev->dev.parent; unsigned int work_done; for (work_done = 0; work_done < budget; work_done++) { @@ -223,9 +225,9 @@ static int arc_emac_rx(struct net_device *ndev, int budget) continue; } - addr = dma_map_single(&ndev->dev, (void *)skb->data, + addr = dma_map_single(dev, (void *)skb->data, EMAC_BUFFER_SIZE, DMA_FROM_DEVICE); - if (dma_mapping_error(&ndev->dev, addr)) { + if (dma_mapping_error(dev, addr)) { if (net_ratelimit()) netdev_err(ndev, "cannot map dma buffer\n"); dev_kfree_skb(skb); @@ -237,7 +239,7 @@ static int arc_emac_rx(struct net_device *ndev, int budget) } /* unmap previosly mapped skb */ - dma_unmap_single(&ndev->dev, dma_unmap_addr(rx_buff, addr), + dma_unmap_single(dev, dma_unmap_addr(rx_buff, addr), dma_unmap_len(rx_buff, len), DMA_FROM_DEVICE); pktlen = info & LEN_MASK; @@ -423,6 +425,7 @@ static int arc_emac_open(struct net_device *ndev) { struct arc_emac_priv *priv = netdev_priv(ndev); struct phy_device *phy_dev = ndev->phydev; + struct device *dev = ndev->dev.parent; int i; phy_dev->autoneg = AUTONEG_ENABLE; @@ -445,9 +448,9 @@ static int arc_emac_open(struct net_device *ndev) if (unlikely(!rx_buff->skb)) return -ENOMEM; - addr = dma_map_single(&ndev->dev, (void *)rx_buff->skb->data, + addr = dma_map_single(dev, (void *)rx_buff->skb->data, EMAC_BUFFER_SIZE, DMA_FROM_DEVICE); - if (dma_mapping_error(&ndev->dev, addr)) { + if (dma_mapping_error(dev, addr)) { netdev_err(ndev, "cannot dma map\n"); dev_kfree_skb(rx_buff->skb); return -ENOMEM; @@ -548,6 +551,7 @@ static void arc_emac_set_rx_mode(struct net_device *ndev) static void arc_free_tx_queue(struct net_device *ndev) { struct arc_emac_priv *priv = netdev_priv(ndev); + struct device *dev = ndev->dev.parent; unsigned int i; for (i = 0; i < TX_BD_NUM; i++) { @@ -555,7 +559,7 @@ static void arc_free_tx_queue(struct net_device *ndev) struct buffer_state *tx_buff = &priv->tx_buff[i]; if (tx_buff->skb) { - dma_unmap_single(&ndev->dev, + dma_unmap_single(dev, dma_unmap_addr(tx_buff, addr), dma_unmap_len(tx_buff, len), DMA_TO_DEVICE); @@ -579,6 +583,7 @@ static void arc_free_tx_queue(struct net_device *ndev) static void arc_free_rx_queue(struct net_device *ndev) { struct arc_emac_priv *priv = netdev_priv(ndev); + struct device *dev = ndev->dev.parent; unsigned int i; for (i = 0; i < RX_BD_NUM; i++) { @@ -586,7 +591,7 @@ static void arc_free_rx_queue(struct net_device *ndev) struct buffer_state *rx_buff = &priv->rx_buff[i]; if (rx_buff->skb) { - dma_unmap_single(&ndev->dev, + dma_unmap_single(dev, dma_unmap_addr(rx_buff, addr), dma_unmap_len(rx_buff, len), DMA_FROM_DEVICE); @@ -679,6 +684,7 @@ static netdev_tx_t arc_emac_tx(struct sk_buff *skb, struct net_device *ndev) unsigned int len, *txbd_curr = &priv->txbd_curr; struct net_device_stats *stats = &ndev->stats; __le32 *info = &priv->txbd[*txbd_curr].info; + struct device *dev = ndev->dev.parent; dma_addr_t addr; if (skb_padto(skb, ETH_ZLEN)) @@ -692,10 +698,9 @@ static netdev_tx_t arc_emac_tx(struct sk_buff *skb, struct net_device *ndev) return NETDEV_TX_BUSY; } - addr = dma_map_single(&ndev->dev, (void *)skb->data, len, - DMA_TO_DEVICE); + addr = dma_map_single(dev, (void *)skb->data, len, DMA_TO_DEVICE); - if (unlikely(dma_mapping_error(&ndev->dev, addr))) { + if (unlikely(dma_mapping_error(dev, addr))) { stats->tx_dropped++; stats->tx_errors++; dev_kfree_skb_any(skb); -- 2.51.0