From c69046c3f2dcef3fe65eb771544547286934a865 Mon Sep 17 00:00:00 2001 From: Jianbo Liu Date: Thu, 20 Feb 2025 23:39:58 +0200 Subject: [PATCH 01/16] net/mlx5e: Support RX xfrm state selector's UPSPEC for packet offload Previously, the upper layer matches are added for the decryption rule when xfrm selector's UPSPEC is specified in the command. However, it's impossible as packets are not decrypted, and there is no way to match on the upper protocol (TCP/UDP) with specific source/destination port. The result is that packets are not decrypted by hardware because of this mismatch. Instead, they are forwarded to kernel, and decryption is done by software. To resolve this issue, this patch adds a new table (sa_sel) after status table and before policy table. When UPSPEC's proto is specified in xfrm state's selector, a rule is added in status table to forward the decrypted packets to sa_sel table, where the corresponding rule for selector's UPSPEC is added, and packet's upper headers are checked there. If matched, they will be forwarded to policy table to do policy check. Otherwise, they are dropped immediately. Besides, add a global counter for this kind of packet drop. 
Signed-off-by: Jianbo Liu Reviewed-by: Leon Romanovsky Reviewed-by: Patrisious Haddad Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250220213959.504304-9-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../mellanox/mlx5/core/en_accel/ipsec.h | 5 + .../mellanox/mlx5/core/en_accel/ipsec_fs.c | 238 +++++++++++++++++- .../mellanox/mlx5/core/en_accel/ipsec_stats.c | 1 + 3 files changed, 242 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h index 7d943e93cf6d..ad8db9e1fd1d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.h @@ -128,6 +128,7 @@ struct mlx5e_ipsec_hw_stats { u64 ipsec_rx_bytes; u64 ipsec_rx_drop_pkts; u64 ipsec_rx_drop_bytes; + u64 ipsec_rx_drop_mismatch_sa_sel; u64 ipsec_tx_pkts; u64 ipsec_tx_bytes; u64 ipsec_tx_drop_pkts; @@ -184,6 +185,7 @@ struct mlx5e_ipsec_ft { struct mutex mutex; /* Protect changes to this struct */ struct mlx5_flow_table *pol; struct mlx5_flow_table *sa; + struct mlx5_flow_table *sa_sel; struct mlx5_flow_table *status; u32 refcnt; }; @@ -195,6 +197,8 @@ struct mlx5e_ipsec_drop { struct mlx5e_ipsec_rule { struct mlx5_flow_handle *rule; + struct mlx5_flow_handle *status_pass; + struct mlx5_flow_handle *sa_sel; struct mlx5_modify_hdr *modify_hdr; struct mlx5_pkt_reformat *pkt_reformat; struct mlx5_fc *fc; @@ -206,6 +210,7 @@ struct mlx5e_ipsec_rule { struct mlx5e_ipsec_miss { struct mlx5_flow_group *group; struct mlx5_flow_handle *rule; + struct mlx5_fc *fc; }; struct mlx5e_ipsec_tx_create_attr { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c index 840d9e0514d3..d51ace739637 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_fs.c @@ -16,6 +16,8 @@ #define 
MLX5_REFORMAT_TYPE_ADD_ESP_TRANSPORT_SIZE 16 #define IPSEC_TUNNEL_DEFAULT_TTL 0x40 +#define MLX5_IPSEC_FS_SA_SELECTOR_MAX_NUM_GROUPS 16 + enum { MLX5_IPSEC_ASO_OK, MLX5_IPSEC_ASO_BAD_REPLY, @@ -52,6 +54,7 @@ struct mlx5e_ipsec_rx { struct mlx5e_ipsec_ft ft; struct mlx5e_ipsec_miss pol; struct mlx5e_ipsec_miss sa; + struct mlx5e_ipsec_miss sa_sel; struct mlx5e_ipsec_status_checks status_checks; struct mlx5e_ipsec_fc *fc; struct mlx5_fs_chains *chains; @@ -689,6 +692,16 @@ static void ipsec_rx_policy_destroy(struct mlx5e_ipsec_rx *rx) } } +static void ipsec_rx_sa_selector_destroy(struct mlx5_core_dev *mdev, + struct mlx5e_ipsec_rx *rx) +{ + mlx5_del_flow_rules(rx->sa_sel.rule); + mlx5_fc_destroy(mdev, rx->sa_sel.fc); + rx->sa_sel.fc = NULL; + mlx5_destroy_flow_group(rx->sa_sel.group); + mlx5_destroy_flow_table(rx->ft.sa_sel); +} + static void rx_destroy(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, struct mlx5e_ipsec_rx *rx, u32 family) { @@ -704,6 +717,8 @@ static void rx_destroy(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, mlx5_ipsec_rx_status_destroy(ipsec, rx); mlx5_destroy_flow_table(rx->ft.status); + ipsec_rx_sa_selector_destroy(mdev, rx); + ipsec_rx_policy_destroy(rx); mlx5_ipsec_fs_roce_rx_destroy(ipsec->roce, family, mdev); @@ -892,6 +907,115 @@ err_out: return err; } +static int ipsec_rx_sa_selector_create(struct mlx5e_ipsec *ipsec, + struct mlx5e_ipsec_rx *rx, + struct mlx5e_ipsec_rx_create_attr *attr) +{ + int inlen = MLX5_ST_SZ_BYTES(create_flow_group_in); + struct mlx5_core_dev *mdev = ipsec->mdev; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_destination dest; + struct mlx5_flow_handle *rule; + struct mlx5_flow_table *ft; + struct mlx5_flow_group *fg; + u32 *flow_group_in; + struct mlx5_fc *fc; + int err; + + flow_group_in = kvzalloc(inlen, GFP_KERNEL); + if (!flow_group_in) + return -ENOMEM; + + ft = ipsec_ft_create(attr->ns, attr->status_level, attr->prio, 1, + MLX5_IPSEC_FS_SA_SELECTOR_MAX_NUM_GROUPS, 0); + if 
(IS_ERR(ft)) { + err = PTR_ERR(ft); + mlx5_core_err(mdev, "Failed to create RX SA selector flow table, err=%d\n", + err); + goto err_ft; + } + + MLX5_SET(create_flow_group_in, flow_group_in, start_flow_index, + ft->max_fte - 1); + MLX5_SET(create_flow_group_in, flow_group_in, end_flow_index, + ft->max_fte - 1); + fg = mlx5_create_flow_group(ft, flow_group_in); + if (IS_ERR(fg)) { + err = PTR_ERR(fg); + mlx5_core_err(mdev, "Failed to create RX SA selector miss group, err=%d\n", + err); + goto err_fg; + } + + fc = mlx5_fc_create(mdev, false); + if (IS_ERR(fc)) { + err = PTR_ERR(fc); + mlx5_core_err(mdev, + "Failed to create ipsec RX SA selector miss rule counter, err=%d\n", + err); + goto err_cnt; + } + + dest.type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest.counter = fc; + flow_act.action = + MLX5_FLOW_CONTEXT_ACTION_COUNT | MLX5_FLOW_CONTEXT_ACTION_DROP; + + rule = mlx5_add_flow_rules(ft, NULL, &flow_act, &dest, 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(mdev, "Failed to create RX SA selector miss drop rule, err=%d\n", + err); + goto err_rule; + } + + rx->ft.sa_sel = ft; + rx->sa_sel.group = fg; + rx->sa_sel.fc = fc; + rx->sa_sel.rule = rule; + + kvfree(flow_group_in); + + return 0; + +err_rule: + mlx5_fc_destroy(mdev, fc); +err_cnt: + mlx5_destroy_flow_group(fg); +err_fg: + mlx5_destroy_flow_table(ft); +err_ft: + kvfree(flow_group_in); + return err; +} + +/* The decryption processing is as follows: + * + * +----------+ +-------------+ + * | | | | + * | Kernel <--------------+----------+ policy miss <------------+ + * | | ^ | | ^ + * +----^-----+ | +-------------+ | + * | crypto | + * miss offload ok allow/default + * ^ ^ ^ + * | | packet | + * +----+---------+ +----+-------------+ offload ok +------+---+ + * | | | | (no UPSPEC) | | + * | SA (decrypt) +-----> status +--->------->----+ policy | + * | | | | | | + * +--------------+ ++---------+-------+ +-^----+---+ + * | | | | + * v packet +-->->---+ v + * | offload ok match | + * fails (with 
UPSPEC) | block + * | | +-------------+-+ | + * v v | | miss v + * drop +---> SA sel +--------->drop + * | | + * +---------------+ + */ + static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, struct mlx5e_ipsec_rx *rx, u32 family) { @@ -907,13 +1031,17 @@ static int rx_create(struct mlx5_core_dev *mdev, struct mlx5e_ipsec *ipsec, if (err) return err; - ft = ipsec_ft_create(attr.ns, attr.status_level, attr.prio, 3, 3, 0); + ft = ipsec_ft_create(attr.ns, attr.status_level, attr.prio, 3, 4, 0); if (IS_ERR(ft)) { err = PTR_ERR(ft); goto err_fs_ft_status; } rx->ft.status = ft; + err = ipsec_rx_sa_selector_create(ipsec, rx, &attr); + if (err) + goto err_fs_ft_sa_sel; + /* Create FT */ if (mlx5_ipsec_device_caps(mdev) & MLX5_IPSEC_CAP_TUNNEL) rx->allow_tunnel_mode = mlx5_eswitch_block_encap(mdev); @@ -956,6 +1084,8 @@ err_fs: if (rx->allow_tunnel_mode) mlx5_eswitch_unblock_encap(mdev); err_fs_ft: + ipsec_rx_sa_selector_destroy(mdev, rx); +err_fs_ft_sa_sel: mlx5_destroy_flow_table(rx->ft.status); err_fs_ft_status: mlx5_ipsec_fs_roce_rx_destroy(ipsec->roce, family, mdev); @@ -1781,6 +1911,85 @@ static int setup_pkt_reformat(struct mlx5e_ipsec *ipsec, return 0; } +static int rx_add_rule_sa_selector(struct mlx5e_ipsec_sa_entry *sa_entry, + struct mlx5e_ipsec_rx *rx, + struct upspec *upspec) +{ + struct mlx5e_ipsec *ipsec = sa_entry->ipsec; + struct mlx5_core_dev *mdev = ipsec->mdev; + struct mlx5_flow_destination dest[2]; + struct mlx5_flow_act flow_act = {}; + struct mlx5_flow_handle *rule; + struct mlx5_flow_spec *spec; + int err = 0; + + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); + if (!spec) + return -ENOMEM; + + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + misc_parameters_2.ipsec_syndrome); + MLX5_SET_TO_ONES(fte_match_param, spec->match_criteria, + misc_parameters_2.metadata_reg_c_4); + MLX5_SET(fte_match_param, spec->match_value, + misc_parameters_2.ipsec_syndrome, 0); + MLX5_SET(fte_match_param, spec->match_value, + 
misc_parameters_2.metadata_reg_c_4, 0); + spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS_2; + + ipsec_rx_rule_add_match_obj(sa_entry, rx, spec); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | + MLX5_FLOW_CONTEXT_ACTION_COUNT; + flow_act.flags = FLOW_ACT_IGNORE_FLOW_LEVEL; + dest[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[0].ft = rx->ft.sa_sel; + dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER; + dest[1].counter = rx->fc->cnt; + + rule = mlx5_add_flow_rules(rx->ft.status, spec, &flow_act, dest, 2); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(mdev, + "Failed to add ipsec rx pass rule, err=%d\n", + err); + goto err_add_status_pass_rule; + } + + sa_entry->ipsec_rule.status_pass = rule; + + MLX5_SET(fte_match_param, spec->match_criteria, + misc_parameters_2.ipsec_syndrome, 0); + MLX5_SET(fte_match_param, spec->match_criteria, + misc_parameters_2.metadata_reg_c_4, 0); + + setup_fte_upper_proto_match(spec, upspec); + + flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest[0].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dest[0].ft = rx->ft.pol; + + rule = mlx5_add_flow_rules(rx->ft.sa_sel, spec, &flow_act, &dest[0], 1); + if (IS_ERR(rule)) { + err = PTR_ERR(rule); + mlx5_core_err(mdev, + "Failed to add ipsec rx sa selector rule, err=%d\n", + err); + goto err_add_sa_sel_rule; + } + + sa_entry->ipsec_rule.sa_sel = rule; + + kvfree(spec); + return 0; + +err_add_sa_sel_rule: + mlx5_del_flow_rules(sa_entry->ipsec_rule.status_pass); +err_add_status_pass_rule: + kvfree(spec); + return err; +} + static int rx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry) { struct mlx5_accel_esp_xfrm_attrs *attrs = &sa_entry->attrs; @@ -1813,7 +2022,6 @@ static int rx_add_rule(struct mlx5e_ipsec_sa_entry *sa_entry) if (!attrs->encap) setup_fte_esp(spec); setup_fte_no_frags(spec); - setup_fte_upper_proto_match(spec, &attrs->upspec); if (!attrs->drop) { if (rx != ipsec->rx_esw) @@ -1861,6 +2069,13 @@ static int rx_add_rule(struct 
mlx5e_ipsec_sa_entry *sa_entry) mlx5_core_err(mdev, "fail to add RX ipsec rule err=%d\n", err); goto err_add_flow; } + + if (attrs->upspec.proto && attrs->type == XFRM_DEV_OFFLOAD_PACKET) { + err = rx_add_rule_sa_selector(sa_entry, rx, &attrs->upspec); + if (err) + goto err_add_sa_sel; + } + if (attrs->type == XFRM_DEV_OFFLOAD_PACKET) err = rx_add_rule_drop_replay(sa_entry, rx); if (err) @@ -1884,6 +2099,11 @@ err_drop_reason: mlx5_fc_destroy(mdev, sa_entry->ipsec_rule.replay.fc); } err_add_replay: + if (sa_entry->ipsec_rule.sa_sel) { + mlx5_del_flow_rules(sa_entry->ipsec_rule.sa_sel); + mlx5_del_flow_rules(sa_entry->ipsec_rule.status_pass); + } +err_add_sa_sel: mlx5_del_flow_rules(rule); err_add_flow: mlx5_fc_destroy(mdev, counter); @@ -2265,6 +2485,7 @@ void mlx5e_accel_ipsec_fs_read_stats(struct mlx5e_priv *priv, void *ipsec_stats) stats->ipsec_rx_bytes = 0; stats->ipsec_rx_drop_pkts = 0; stats->ipsec_rx_drop_bytes = 0; + stats->ipsec_rx_drop_mismatch_sa_sel = 0; stats->ipsec_tx_pkts = 0; stats->ipsec_tx_bytes = 0; stats->ipsec_tx_drop_pkts = 0; @@ -2274,6 +2495,9 @@ void mlx5e_accel_ipsec_fs_read_stats(struct mlx5e_priv *priv, void *ipsec_stats) mlx5_fc_query(mdev, fc->cnt, &stats->ipsec_rx_pkts, &stats->ipsec_rx_bytes); mlx5_fc_query(mdev, fc->drop, &stats->ipsec_rx_drop_pkts, &stats->ipsec_rx_drop_bytes); + if (ipsec->rx_ipv4->sa_sel.fc) + mlx5_fc_query(mdev, ipsec->rx_ipv4->sa_sel.fc, + &stats->ipsec_rx_drop_mismatch_sa_sel, &bytes); fc = ipsec->tx->fc; mlx5_fc_query(mdev, fc->cnt, &stats->ipsec_tx_pkts, &stats->ipsec_tx_bytes); @@ -2302,6 +2526,11 @@ void mlx5e_accel_ipsec_fs_read_stats(struct mlx5e_priv *priv, void *ipsec_stats) stats->ipsec_tx_drop_pkts += packets; stats->ipsec_tx_drop_bytes += bytes; } + + if (ipsec->rx_esw->sa_sel.fc && + !mlx5_fc_query(mdev, ipsec->rx_esw->sa_sel.fc, + &packets, &bytes)) + stats->ipsec_rx_drop_mismatch_sa_sel += packets; } } @@ -2399,6 +2628,11 @@ void mlx5e_accel_ipsec_fs_del_rule(struct mlx5e_ipsec_sa_entry 
*sa_entry) mlx5_del_flow_rules(ipsec_rule->auth.rule); mlx5_fc_destroy(mdev, ipsec_rule->auth.fc); + if (ipsec_rule->sa_sel) { + mlx5_del_flow_rules(ipsec_rule->sa_sel); + mlx5_del_flow_rules(ipsec_rule->status_pass); + } + if (ipsec_rule->replay.rule) { mlx5_del_flow_rules(ipsec_rule->replay.rule); mlx5_fc_destroy(mdev, ipsec_rule->replay.fc); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c index 92bf3fa44a3b..93be388068f8 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec_stats.c @@ -42,6 +42,7 @@ static const struct counter_desc mlx5e_ipsec_hw_stats_desc[] = { { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_rx_bytes) }, { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_rx_drop_pkts) }, { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_rx_drop_bytes) }, + { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_rx_drop_mismatch_sa_sel) }, { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_tx_pkts) }, { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_tx_bytes) }, { MLX5E_DECLARE_STAT(struct mlx5e_ipsec_hw_stats, ipsec_tx_drop_pkts) }, -- 2.51.0 From 8f3f4464ff08f70e959c026fad2f3790abe84be6 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 21 Feb 2025 09:53:22 +0100 Subject: [PATCH 02/16] net/mlx5: Use secs_to_jiffies() instead of msecs_to_jiffies() Use secs_to_jiffies() and simplify the code. 
Reviewed-by: Jacob Keller Reviewed-by: Saeed Mahameed Signed-off-by: Thorsten Blum Reviewed-by: Tariq Toukan Reviewed-by: Somnath Kotur Link: https://patch.msgid.link/20250221085350.198024-3-thorsten.blum@linux.dev Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c index 3dbd4efa21a2..19dce1ba512d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/hws/bwc.c @@ -220,7 +220,7 @@ static int hws_bwc_queue_poll(struct mlx5hws_context *ctx, bool drain) { unsigned long timeout = jiffies + - msecs_to_jiffies(MLX5HWS_BWC_POLLING_TIMEOUT * MSEC_PER_SEC); + secs_to_jiffies(MLX5HWS_BWC_POLLING_TIMEOUT); struct mlx5hws_flow_op_result comp[MLX5HWS_BWC_MATCHER_REHASH_BURST_TH]; u16 burst_th = hws_bwc_get_burst_th(ctx, queue_id); bool got_comp = *pending_rules >= burst_th; -- 2.51.0 From bc337e8c0e762b0c1eaca00aa6955cd0e7013ba1 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 21 Feb 2025 16:43:54 +0100 Subject: [PATCH 03/16] mptcp: pm: remove unused ret value to set flags The returned value is not used, it can then be dropped. 
Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250221-net-next-mptcp-pm-misc-cleanup-3-v1-1-2b70ab1cee79@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 99705a9c2238..ff1e5695dc1d 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1922,13 +1922,11 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, spin_unlock_bh(&msk->pm.lock); } -static int mptcp_nl_set_flags(struct net *net, - struct mptcp_addr_info *addr, - u8 bkup, u8 changed) +static void mptcp_nl_set_flags(struct net *net, struct mptcp_addr_info *addr, + u8 bkup, u8 changed) { long s_slot = 0, s_num = 0; struct mptcp_sock *msk; - int ret = -EINVAL; while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; @@ -1938,7 +1936,7 @@ static int mptcp_nl_set_flags(struct net *net, lock_sock(sk); if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) - ret = mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup); + mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup); if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) mptcp_pm_nl_fullmesh(msk, addr); release_sock(sk); @@ -1948,7 +1946,7 @@ next: cond_resched(); } - return ret; + return; } int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, -- 2.51.0 From 145dc6cc4abdb3b76eb01a0943a540db2a01ebe6 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 21 Feb 2025 16:43:55 +0100 Subject: [PATCH 04/16] mptcp: pm: change to fullmesh only for 'subflow' If an endpoint doesn't have the 'subflow' flag -- in fact, has no type, so not 'subflow', 'signal', nor 'implicit' -- there are then no subflows created from this local endpoint to at least the initial destination address. 
In this case, no need to call mptcp_pm_nl_fullmesh() which is there to recreate the subflows to reflect the new value of the fullmesh attribute. Similarly, there is then no need to iterate over all connections to do nothing, if only the 'fullmesh' flag has been changed, and the endpoint doesn't have the 'subflow' one. So stop early when dealing with this specific case. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250221-net-next-mptcp-pm-misc-cleanup-3-v1-2-2b70ab1cee79@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index ff1e5695dc1d..1a0695e087af 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1923,11 +1923,16 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, } static void mptcp_nl_set_flags(struct net *net, struct mptcp_addr_info *addr, - u8 bkup, u8 changed) + u8 flags, u8 changed) { + u8 is_subflow = !!(flags & MPTCP_PM_ADDR_FLAG_SUBFLOW); + u8 bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP); long s_slot = 0, s_num = 0; struct mptcp_sock *msk; + if (changed == MPTCP_PM_ADDR_FLAG_FULLMESH && !is_subflow) + return; + while ((msk = mptcp_token_iter_next(net, &s_slot, &s_num)) != NULL) { struct sock *sk = (struct sock *)msk; @@ -1937,7 +1942,8 @@ static void mptcp_nl_set_flags(struct net *net, struct mptcp_addr_info *addr, lock_sock(sk); if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) mptcp_pm_nl_mp_prio_send_ack(msk, addr, NULL, bkup); - if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) + /* Subflows will only be recreated if the SUBFLOW flag is set */ + if (is_subflow && (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)) mptcp_pm_nl_fullmesh(msk, addr); release_sock(sk); @@ -1959,7 +1965,6 @@ int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; u8 lookup_by_id = 0; - u8 bkup = 0; pernet 
= pm_nl_get_pernet(net); @@ -1972,9 +1977,6 @@ int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, } } - if (local->flags & MPTCP_PM_ADDR_FLAG_BACKUP) - bkup = 1; - spin_lock_bh(&pernet->lock); entry = lookup_by_id ? __lookup_addr_by_id(pernet, local->addr.id) : __lookup_addr(pernet, &local->addr); @@ -1996,7 +1998,7 @@ int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, *local = *entry; spin_unlock_bh(&pernet->lock); - mptcp_nl_set_flags(net, &local->addr, bkup, changed); + mptcp_nl_set_flags(net, &local->addr, entry->flags, changed); return 0; } -- 2.51.0 From 63132fb054744e58de61d45a6d4f2a707cdfcfb3 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 21 Feb 2025 16:43:56 +0100 Subject: [PATCH 05/16] mptcp: pm: add a build check for userspace_pm_dump_addr This patch adds a build check for mptcp_userspace_pm_dump_addr() to make sure there is enough space in 'cb->ctx' to store an address id bitmap. Just in case info stored in 'cb->ctx' are increased later. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250221-net-next-mptcp-pm-misc-cleanup-3-v1-3-2b70ab1cee79@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 277cf092a870..b69fb5b18130 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -643,6 +643,8 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, struct sock *sk; void *hdr; + BUILD_BUG_ON(sizeof(struct id_bitmap) > sizeof(cb->ctx)); + bitmap = (struct id_bitmap *)cb->ctx; msk = mptcp_userspace_pm_get_sock(info); -- 2.51.0 From f8fe8174657329609a80f66da1d3dd80a80de76b Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 21 Feb 2025 16:43:57 +0100 Subject: [PATCH 06/16] mptcp: pm: add mptcp_pm_genl_fill_addr helper To save some redundant code in dump_addr() interfaces of both the netlink PM and 
userspace PM, the code that calls netlink message helpers (genlmsg_put/cancel/end) and mptcp_nl_fill_addr() is wrapped into a new helper mptcp_pm_genl_fill_addr(). Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250221-net-next-mptcp-pm-misc-cleanup-3-v1-4-2b70ab1cee79@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 21 +++++++++++++++++++++ net/mptcp/pm_netlink.c | 12 +----------- net/mptcp/pm_userspace.c | 12 +----------- net/mptcp/protocol.h | 3 +++ 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index b1f36dc1a091..16cacce6c10f 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -489,6 +489,27 @@ fail: return ret; } +int mptcp_pm_genl_fill_addr(struct sk_buff *msg, + struct netlink_callback *cb, + struct mptcp_pm_addr_entry *entry) +{ + void *hdr; + + hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &mptcp_genl_family, + NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); + if (!hdr) + return -EINVAL; + + if (mptcp_nl_fill_addr(msg, entry) < 0) { + genlmsg_cancel(msg, hdr); + return -EINVAL; + } + + genlmsg_end(msg, hdr); + return 0; +} + static int mptcp_pm_dump_addr(struct sk_buff *msg, struct netlink_callback *cb) { const struct genl_info *info = genl_info_dump(cb); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 1a0695e087af..98fcbf8b1465 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1798,7 +1798,6 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg, struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; int id = cb->args[0]; - void *hdr; int i; pernet = pm_nl_get_pernet(net); @@ -1813,19 +1812,10 @@ int mptcp_pm_nl_dump_addr(struct sk_buff *msg, if (entry->addr.id <= id) continue; - hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, &mptcp_genl_family, - NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); - if (!hdr) + if 
(mptcp_pm_genl_fill_addr(msg, cb, entry) < 0) break; - if (mptcp_nl_fill_addr(msg, entry) < 0) { - genlmsg_cancel(msg, hdr); - break; - } - id = entry->addr.id; - genlmsg_end(msg, hdr); } } rcu_read_unlock(); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index b69fb5b18130..bedd6f9ebc8b 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -641,7 +641,6 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; - void *hdr; BUILD_BUG_ON(sizeof(struct id_bitmap) > sizeof(cb->ctx)); @@ -659,19 +658,10 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg, if (test_bit(entry->addr.id, bitmap->map)) continue; - hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, &mptcp_genl_family, - NLM_F_MULTI, MPTCP_PM_CMD_GET_ADDR); - if (!hdr) + if (mptcp_pm_genl_fill_addr(msg, cb, entry) < 0) break; - if (mptcp_nl_fill_addr(msg, entry) < 0) { - genlmsg_cancel(msg, hdr); - break; - } - __set_bit(entry->addr.id, bitmap->map); - genlmsg_end(msg, hdr); } spin_unlock_bh(&msk->pm.lock); release_sock(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ca65f8bff632..256677c43ca6 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1057,6 +1057,9 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf struct request_sock *req); int mptcp_nl_fill_addr(struct sk_buff *skb, struct mptcp_pm_addr_entry *entry); +int mptcp_pm_genl_fill_addr(struct sk_buff *msg, + struct netlink_callback *cb, + struct mptcp_pm_addr_entry *entry); static inline bool mptcp_pm_should_add_signal(struct mptcp_sock *msk) { -- 2.51.0 From 640e3d69d0bc70d7d3de34800a1640793262bd08 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 21 Feb 2025 16:43:58 +0100 Subject: [PATCH 07/16] mptcp: pm: drop match in userspace_pm_append_new_local_addr The variable 'match' in mptcp_userspace_pm_append_new_local_addr() is a redundant one, and this patch drops 
it. No need to define 'match' as 'struct mptcp_pm_addr_entry *' type. In this function, it's only used to check whether it's NULL. It can be defined as a Boolean one. Also other variables 'addr_match' and 'id_match' make 'match' a redundant one, which can be replaced by directly checking 'addr_match && id_match'. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250221-net-next-mptcp-pm-misc-cleanup-3-v1-5-2b70ab1cee79@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index bedd6f9ebc8b..a16e2fb45a6c 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -48,7 +48,6 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, bool needs_id) { DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); - struct mptcp_pm_addr_entry *match = NULL; struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *e; bool addr_match = false; @@ -63,16 +62,12 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, if (addr_match && entry->addr.id == 0 && needs_id) entry->addr.id = e->addr.id; id_match = (e->addr.id == entry->addr.id); - if (addr_match && id_match) { - match = e; + if (addr_match || id_match) break; - } else if (addr_match || id_match) { - break; - } __set_bit(e->addr.id, id_bitmap); } - if (!match && !addr_match && !id_match) { + if (!addr_match && !id_match) { /* Memory for the entry is allocated from the * sock option buffer. 
*/ @@ -90,7 +85,7 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, list_add_tail_rcu(&e->list, &msk->pm.userspace_pm_local_addr_list); msk->pm.local_addr_used++; ret = e->addr.id; - } else if (match) { + } else if (addr_match && id_match) { ret = entry->addr.id; } -- 2.51.0 From dc41695200a1f5db720f7988952a74833542831d Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 21 Feb 2025 16:43:59 +0100 Subject: [PATCH 08/16] mptcp: pm: drop inet6_sk after inet_sk In mptcp_event_add_subflow(), mptcp_event_pm_listener() and mptcp_nl_find_ssk(), 'issk' has already been got through inet_sk(). No need to use inet6_sk() to get 'ipv6_pinfo' again, just use issk->pinet6 instead. This patch also drops these 'ipv6_pinfo' variables. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250221-net-next-mptcp-pm-misc-cleanup-3-v1-6-2b70ab1cee79@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 8 ++------ net/mptcp/pm_userspace.c | 4 +--- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 98fcbf8b1465..f67b637c1fcf 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -2022,9 +2022,7 @@ static int mptcp_event_add_subflow(struct sk_buff *skb, const struct sock *ssk) break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { - const struct ipv6_pinfo *np = inet6_sk(ssk); - - if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) + if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &issk->pinet6->saddr)) return -EMSGSIZE; if (nla_put_in6_addr(skb, MPTCP_ATTR_DADDR6, &ssk->sk_v6_daddr)) return -EMSGSIZE; @@ -2251,9 +2249,7 @@ void mptcp_event_pm_listener(const struct sock *ssk, break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { - const struct ipv6_pinfo *np = inet6_sk(ssk); - - if (nla_put_in6_addr(skb, MPTCP_ATTR_SADDR6, &np->saddr)) + if (nla_put_in6_addr(skb, 
MPTCP_ATTR_SADDR6, &issk->pinet6->saddr)) goto nla_put_failure; break; } diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index a16e2fb45a6c..6bf6a20ef7f3 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -460,9 +460,7 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk, break; #if IS_ENABLED(CONFIG_MPTCP_IPV6) case AF_INET6: { - const struct ipv6_pinfo *pinfo = inet6_sk(ssk); - - if (!ipv6_addr_equal(&local->addr6, &pinfo->saddr) || + if (!ipv6_addr_equal(&local->addr6, &issk->pinet6->saddr) || !ipv6_addr_equal(&remote->addr6, &ssk->sk_v6_daddr)) continue; break; -- 2.51.0 From 7720790fd56b91259efe500a702ad4c0fd29b260 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 21 Feb 2025 16:44:00 +0100 Subject: [PATCH 09/16] mptcp: pm: use ipv6_addr_equal in addresses_equal Use ipv6_addr_equal() to check whether two IPv6 addresses are equal in mptcp_addresses_equal(). This is more appropriate than using !ipv6_addr_cmp(). Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250221-net-next-mptcp-pm-misc-cleanup-3-v1-7-2b70ab1cee79@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index f67b637c1fcf..ef85a60151ad 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -64,7 +64,7 @@ bool mptcp_addresses_equal(const struct mptcp_addr_info *a, addr_equals = a->addr.s_addr == b->addr.s_addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) else - addr_equals = !ipv6_addr_cmp(&a->addr6, &b->addr6); + addr_equals = ipv6_addr_equal(&a->addr6, &b->addr6); } else if (a->family == AF_INET) { if (ipv6_addr_v4mapped(&b->addr6)) addr_equals = a->addr.s_addr == b->addr6.s6_addr32[3]; -- 2.51.0 From 9771a96a7a35daa2220cc4f170b840b34af28b2c Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 21 Feb 2025 16:44:01 +0100 
Subject: [PATCH 10/16] mptcp: sched: split get_subflow interface into two The get_retrans() interface of the burst packet scheduler invokes a sleeping function mptcp_pm_subflow_chk_stale(), which calls __lock_sock_fast(). So the get_retrans() interface should be set with the BPF_F_SLEEPABLE flag in BPF. But the get_send() interface of this scheduler can't be set with the BPF_F_SLEEPABLE flag since it's invoked in ack_update_msk() under the mptcp data lock. So this patch has to split the get_subflow() interface of the packet scheduler into two interfaces: get_send() and get_retrans(). Then we can set the get_retrans() interface alone with the BPF_F_SLEEPABLE flag. Signed-off-by: Geliang Tang Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250221-net-next-mptcp-pm-misc-cleanup-3-v1-8-2b70ab1cee79@kernel.org Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 5 +++-- net/mptcp/sched.c | 35 ++++++++++++++++++++++++----------- 2 files changed, 27 insertions(+), 13 deletions(-) diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 814b5f2e3ed5..2c85ca92bb1c 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -103,13 +103,14 @@ struct mptcp_out_options { #define MPTCP_SUBFLOWS_MAX 8 struct mptcp_sched_data { - bool reinject; u8 subflows; struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX]; }; struct mptcp_sched_ops { - int (*get_subflow)(struct mptcp_sock *msk, + int (*get_send)(struct mptcp_sock *msk, + struct mptcp_sched_data *data); + int (*get_retrans)(struct mptcp_sock *msk, struct mptcp_sched_data *data); char name[MPTCP_SCHED_NAME_MAX]; diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index df7dbcfa3b71..94dc4b3ad82f 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -16,13 +16,25 @@ static DEFINE_SPINLOCK(mptcp_sched_list_lock); static LIST_HEAD(mptcp_sched_list); -static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, +static int mptcp_sched_default_get_send(struct mptcp_sock *msk, + struct 
mptcp_sched_data *data) +{ + struct sock *ssk; + + ssk = mptcp_subflow_get_send(msk); + if (!ssk) + return -EINVAL; + + mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true); + return 0; +} + +static int mptcp_sched_default_get_retrans(struct mptcp_sock *msk, struct mptcp_sched_data *data) { struct sock *ssk; - ssk = data->reinject ? mptcp_subflow_get_retrans(msk) : - mptcp_subflow_get_send(msk); + ssk = mptcp_subflow_get_retrans(msk); if (!ssk) return -EINVAL; @@ -31,7 +43,8 @@ static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk, } static struct mptcp_sched_ops mptcp_sched_default = { - .get_subflow = mptcp_sched_default_get_subflow, + .get_send = mptcp_sched_default_get_send, + .get_retrans = mptcp_sched_default_get_retrans, .name = "default", .owner = THIS_MODULE, }; @@ -73,7 +86,7 @@ void mptcp_get_available_schedulers(char *buf, size_t maxlen) int mptcp_register_scheduler(struct mptcp_sched_ops *sched) { - if (!sched->get_subflow) + if (!sched->get_send) return -EINVAL; spin_lock(&mptcp_sched_list_lock); @@ -164,10 +177,9 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) return 0; } - data.reinject = false; if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_subflow(msk, &data); - return msk->sched->get_subflow(msk, &data); + return mptcp_sched_default_get_send(msk, &data); + return msk->sched->get_send(msk, &data); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) @@ -186,8 +198,9 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) return 0; } - data.reinject = true; if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_subflow(msk, &data); - return msk->sched->get_subflow(msk, &data); + return mptcp_sched_default_get_retrans(msk, &data); + if (msk->sched->get_retrans) + return msk->sched->get_retrans(msk, &data); + return msk->sched->get_send(msk, &data); } -- 2.51.0 From b68b106b0f15424db6c78d8c1a0616f698080b9d Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts 
(NGI0)" Date: Fri, 21 Feb 2025 16:44:02 +0100 Subject: [PATCH 11/16] mptcp: sched: reduce size for unused data Thanks to the previous commit ("mptcp: sched: split get_subflow interface into two"), the mptcp_sched_data structure is now unused. This structure has been added to allow future extensions that are not ready yet. In the end, this structure will not even be used at all once mptcp_subflow bpf_iter is supported [1]. Here is a first step to save 64 bytes on the stack for each scheduling operation. The structure is not removed yet, so as not to break the WIP work on these extensions, but it will be removed once [1] is ready and applied. Link: https://lore.kernel.org/6645ad6e-8874-44c5-8730-854c30673218@linux.dev [1] Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250221-net-next-mptcp-pm-misc-cleanup-3-v1-9-2b70ab1cee79@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/sched.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/mptcp/sched.c b/net/mptcp/sched.c index 94dc4b3ad82f..c16c6fbd4ba2 100644 --- a/net/mptcp/sched.c +++ b/net/mptcp/sched.c @@ -157,7 +157,7 @@ void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow, int mptcp_sched_get_send(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data data; + struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -178,14 +178,14 @@ int mptcp_sched_get_send(struct mptcp_sock *msk) } if (msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_send(msk, &data); - return msk->sched->get_send(msk, &data); + return mptcp_sched_default_get_send(msk, data); + return msk->sched->get_send(msk, data); } int mptcp_sched_get_retrans(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - struct mptcp_sched_data data; + struct mptcp_sched_data *data = NULL; msk_owned_by_me(msk); @@ -199,8 +199,8 @@ int mptcp_sched_get_retrans(struct mptcp_sock *msk) } if
(msk->sched == &mptcp_sched_default || !msk->sched) - return mptcp_sched_default_get_retrans(msk, &data); + return mptcp_sched_default_get_retrans(msk, data); if (msk->sched->get_retrans) - return msk->sched->get_retrans(msk, &data); - return msk->sched->get_send(msk, &data); + return msk->sched->get_retrans(msk, data); + return msk->sched->get_send(msk, data); } -- 2.51.0 From 8275ac799ee15e972841eb77b694d63f5e888519 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 21 Feb 2025 16:44:03 +0100 Subject: [PATCH 12/16] mptcp: blackhole: avoid checking the state twice A small cleanup, reordering the conditions to avoid checking things twice. The code here is called in case of timeout on a TCP connection, before triggering a retransmission. But it only acts on SYN + MPC packets. So the conditions can be re-ordered to exit early in case of non-MPTCP SYN + MPC. This also reduces the indentation levels. Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250221-net-next-mptcp-pm-misc-cleanup-3-v1-10-2b70ab1cee79@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 2dd81e6c26bd..be6c0237e10b 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -401,26 +401,30 @@ void mptcp_active_enable(struct sock *sk) void mptcp_active_detect_blackhole(struct sock *ssk, bool expired) { struct mptcp_subflow_context *subflow; + u8 timeouts, to_max; + struct net *net; - if (!sk_is_mptcp(ssk)) + /* Only check MPTCP SYN ... */ + if (likely(!sk_is_mptcp(ssk) || ssk->sk_state != TCP_SYN_SENT)) return; subflow = mptcp_subflow_ctx(ssk); - if (subflow->request_mptcp && ssk->sk_state == TCP_SYN_SENT) { - struct net *net = sock_net(ssk); - u8 timeouts, to_max; + /* ...
+ MP_CAPABLE */ + if (!subflow->request_mptcp) { + /* Mark as blackhole iif the 1st non-MPTCP SYN is accepted */ + subflow->mpc_drop = 0; + return; + } - timeouts = inet_csk(ssk)->icsk_retransmits; - to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback; + net = sock_net(ssk); + timeouts = inet_csk(ssk)->icsk_retransmits; + to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback; - if (timeouts == to_max || (timeouts < to_max && expired)) { - MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP); - subflow->mpc_drop = 1; - mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow); - } - } else if (ssk->sk_state == TCP_SYN_SENT) { - subflow->mpc_drop = 0; + if (timeouts == to_max || (timeouts < to_max && expired)) { + MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP); + subflow->mpc_drop = 1; + mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow); } } -- 2.51.0 From a3e51d4711793be001220784bd7d8ce81517003e Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sat, 22 Feb 2025 09:36:10 +0100 Subject: [PATCH 13/16] net: phy: add phylib-internal.h This patch is a starting point for moving phylib-internal declarations to a private header file. 
Signed-off-by: Heiner Kallweit Link: https://patch.msgid.link/082eacd2-a888-4716-8797-b3491ce02820@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy-c45.c | 1 + drivers/net/phy/phy-core.c | 3 ++- drivers/net/phy/phy.c | 2 ++ drivers/net/phy/phy_device.c | 2 ++ drivers/net/phy/phy_led_triggers.c | 2 ++ drivers/net/phy/phylib-internal.h | 25 +++++++++++++++++++++++++ include/linux/phy.h | 13 ------------- 7 files changed, 34 insertions(+), 14 deletions(-) create mode 100644 drivers/net/phy/phylib-internal.h diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 37c9a344bf4a..0bcbdce38107 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -9,6 +9,7 @@ #include #include "mdio-open-alliance.h" +#include "phylib-internal.h" /** * genphy_c45_baset1_able - checks if the PMA has BASE-T1 extended abilities diff --git a/drivers/net/phy/phy-core.c b/drivers/net/phy/phy-core.c index 2fd1d153abc9..b1c1670de23b 100644 --- a/drivers/net/phy/phy-core.c +++ b/drivers/net/phy/phy-core.c @@ -6,6 +6,8 @@ #include #include +#include "phylib-internal.h" + /** * phy_speed_to_str - Return a string representing the PHY link speed * @@ -544,7 +546,6 @@ void phy_check_downshift(struct phy_device *phydev) phydev->downshifted_rate = 1; } -EXPORT_SYMBOL_GPL(phy_check_downshift); static int phy_resolve_min_speed(struct phy_device *phydev, bool fdx_only) { diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 831b36839627..16ffc00b419c 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -36,6 +36,8 @@ #include #include +#include "phylib-internal.h" + #define PHY_STATE_TIME HZ #define PHY_STATE_STR(_state) \ diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 7c4e1ad1864c..a38d399f244b 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -41,6 +41,8 @@ #include #include +#include "phylib-internal.h" + MODULE_DESCRIPTION("PHY library"); MODULE_AUTHOR("Andy Fleming"); 
MODULE_LICENSE("GPL"); diff --git a/drivers/net/phy/phy_led_triggers.c b/drivers/net/phy/phy_led_triggers.c index f550576eb9da..bd3c9554f6ac 100644 --- a/drivers/net/phy/phy_led_triggers.c +++ b/drivers/net/phy/phy_led_triggers.c @@ -5,6 +5,8 @@ #include #include +#include "phylib-internal.h" + static struct phy_led_trigger *phy_speed_to_led_trigger(struct phy_device *phy, unsigned int speed) { diff --git a/drivers/net/phy/phylib-internal.h b/drivers/net/phy/phylib-internal.h new file mode 100644 index 000000000000..dc9592c6bb8e --- /dev/null +++ b/drivers/net/phy/phylib-internal.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * phylib-internal header + */ + +#ifndef __PHYLIB_INTERNAL_H +#define __PHYLIB_INTERNAL_H + +struct phy_device; + +/* + * phy_supported_speeds - return all speeds currently supported by a PHY device + */ +unsigned int phy_supported_speeds(struct phy_device *phy, + unsigned int *speeds, + unsigned int size); +void of_set_phy_supported(struct phy_device *phydev); +void of_set_phy_eee_broken(struct phy_device *phydev); +void of_set_phy_timing_role(struct phy_device *phydev); +int phy_speed_down_core(struct phy_device *phydev); +void phy_check_downshift(struct phy_device *phydev); + +int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv); + +#endif /* __PHYLIB_INTERNAL_H */ diff --git a/include/linux/phy.h b/include/linux/phy.h index 13be48d3b8b3..7bfbae51070a 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -181,13 +181,6 @@ static inline void phy_interface_set_rgmii(unsigned long *intf) __set_bit(PHY_INTERFACE_MODE_RGMII_TXID, intf); } -/* - * phy_supported_speeds - return all speeds currently supported by a PHY device - */ -unsigned int phy_supported_speeds(struct phy_device *phy, - unsigned int *speeds, - unsigned int size); - /** * phy_modes - map phy_interface_t enum to device tree binding of phy-mode * @interface: enum phy_interface_t value @@ -1331,10 +1324,6 @@ 
phy_lookup_setting(int speed, int duplex, const unsigned long *mask, bool exact); size_t phy_speeds(unsigned int *speeds, size_t size, unsigned long *mask); -void of_set_phy_supported(struct phy_device *phydev); -void of_set_phy_eee_broken(struct phy_device *phydev); -void of_set_phy_timing_role(struct phy_device *phydev); -int phy_speed_down_core(struct phy_device *phydev); /** * phy_is_started - Convenience function to check whether PHY is started @@ -1360,7 +1349,6 @@ static inline void phy_disable_eee_mode(struct phy_device *phydev, u32 link_mode void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); -void phy_check_downshift(struct phy_device *phydev); /** * phy_read - Convenience function for reading a given PHY register @@ -2035,7 +2023,6 @@ int genphy_c45_ethtool_get_eee(struct phy_device *phydev, int genphy_c45_ethtool_set_eee(struct phy_device *phydev, struct ethtool_keee *data); int genphy_c45_an_config_eee_aneg(struct phy_device *phydev); -int genphy_c45_read_eee_adv(struct phy_device *phydev, unsigned long *adv); /* Generic C45 PHY driver */ extern struct phy_driver genphy_c45_driver; -- 2.51.0 From c6aa4e2cdff6351ec32404b63b83e5a4126a019b Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Fri, 21 Feb 2025 12:18:11 -0800 Subject: [PATCH 14/16] eth: fbnic: Add PCIe registers dump Provide coverage to PCIe registers in ethtool register dump Signed-off-by: Mohsin Bashir Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_csr.c | 1 + drivers/net/ethernet/meta/fbnic/fbnic_csr.h | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.c b/drivers/net/ethernet/meta/fbnic/fbnic_csr.c index aeb9f333f4c7..d9c0dc1c2af9 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.c @@ -30,6 +30,7 @@ static const struct fbnic_csr_bounds fbnic_csr_sects[] = { FBNIC_BOUNDS(RSFEC), FBNIC_BOUNDS(MAC_MAC), 
FBNIC_BOUNDS(SIG), + FBNIC_BOUNDS(PCIE_SS_COMPHY), FBNIC_BOUNDS(PUL_USER), FBNIC_BOUNDS(QUEUE), FBNIC_BOUNDS(RPC_RAM), diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h index 6f24c5f2e175..af6d33931c35 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h @@ -788,6 +788,11 @@ enum { #define FBNIC_MAC_STAT_TX_MULTICAST_H 0x11a4b /* 0x4692c */ #define FBNIC_MAC_STAT_TX_BROADCAST_L 0x11a4c /* 0x46930 */ #define FBNIC_MAC_STAT_TX_BROADCAST_H 0x11a4d /* 0x46934 */ + +/* PCIE Comphy Registers */ +#define FBNIC_CSR_START_PCIE_SS_COMPHY 0x2442e /* CSR section delimiter */ +#define FBNIC_CSR_END_PCIE_SS_COMPHY 0x279d7 /* CSR section delimiter */ + /* PUL User Registers */ #define FBNIC_CSR_START_PUL_USER 0x31000 /* CSR section delimiter */ #define FBNIC_PUL_OB_TLP_HDR_AW_CFG 0x3103d /* 0xc40f4 */ -- 2.51.0 From e4e7c9be21170bf60820d8db2ba9db29c7a7d9ac Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Fri, 21 Feb 2025 12:18:12 -0800 Subject: [PATCH 15/16] eth: fbnic: Consolidate PUL_USER CSR section Move PUL_USER CSRs in the relevant section, update the end boundary address, and remove the redundant definition of end boundary. 
Signed-off-by: Mohsin Bashir Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_csr.h | 73 ++++++++++----------- 1 file changed, 35 insertions(+), 38 deletions(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h index af6d33931c35..3b12a0ab5906 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_csr.h +++ b/drivers/net/ethernet/meta/fbnic/fbnic_csr.h @@ -799,7 +799,41 @@ enum { #define FBNIC_PUL_OB_TLP_HDR_AW_CFG_BME CSR_BIT(18) #define FBNIC_PUL_OB_TLP_HDR_AR_CFG 0x3103e /* 0xc40f8 */ #define FBNIC_PUL_OB_TLP_HDR_AR_CFG_BME CSR_BIT(18) -#define FBNIC_CSR_END_PUL_USER 0x31080 /* CSR section delimiter */ +#define FBNIC_PUL_USER_OB_RD_TLP_CNT_31_0 \ + 0x3106e /* 0xc41b8 */ +#define FBNIC_PUL_USER_OB_RD_DWORD_CNT_31_0 \ + 0x31070 /* 0xc41c0 */ +#define FBNIC_PUL_USER_OB_RD_DWORD_CNT_63_32 \ + 0x31071 /* 0xc41c4 */ +#define FBNIC_PUL_USER_OB_WR_TLP_CNT_31_0 \ + 0x31072 /* 0xc41c8 */ +#define FBNIC_PUL_USER_OB_WR_TLP_CNT_63_32 \ + 0x31073 /* 0xc41cc */ +#define FBNIC_PUL_USER_OB_WR_DWORD_CNT_31_0 \ + 0x31074 /* 0xc41d0 */ +#define FBNIC_PUL_USER_OB_WR_DWORD_CNT_63_32 \ + 0x31075 /* 0xc41d4 */ +#define FBNIC_PUL_USER_OB_CPL_TLP_CNT_31_0 \ + 0x31076 /* 0xc41d8 */ +#define FBNIC_PUL_USER_OB_CPL_TLP_CNT_63_32 \ + 0x31077 /* 0xc41dc */ +#define FBNIC_PUL_USER_OB_CPL_DWORD_CNT_31_0 \ + 0x31078 /* 0xc41e0 */ +#define FBNIC_PUL_USER_OB_CPL_DWORD_CNT_63_32 \ + 0x31079 /* 0xc41e4 */ +#define FBNIC_PUL_USER_OB_RD_DBG_CNT_CPL_CRED_31_0 \ + 0x3107a /* 0xc41e8 */ +#define FBNIC_PUL_USER_OB_RD_DBG_CNT_CPL_CRED_63_32 \ + 0x3107b /* 0xc41ec */ +#define FBNIC_PUL_USER_OB_RD_DBG_CNT_TAG_31_0 \ + 0x3107c /* 0xc41f0 */ +#define FBNIC_PUL_USER_OB_RD_DBG_CNT_TAG_63_32 \ + 0x3107d /* 0xc41f4 */ +#define FBNIC_PUL_USER_OB_RD_DBG_CNT_NP_CRED_31_0 \ + 0x3107e /* 0xc41f8 */ +#define FBNIC_PUL_USER_OB_RD_DBG_CNT_NP_CRED_63_32 \ + 0x3107f /* 0xc41fc */ +#define FBNIC_CSR_END_PUL_USER 0x310ea /* CSR section delimiter */ /* Queue 
Registers * @@ -939,43 +973,6 @@ enum { #define FBNIC_MAX_QUEUES 128 #define FBNIC_CSR_END_QUEUE (0x40000 + 0x400 * FBNIC_MAX_QUEUES - 1) -/* PUL User Registers*/ -#define FBNIC_PUL_USER_OB_RD_TLP_CNT_31_0 \ - 0x3106e /* 0xc41b8 */ -#define FBNIC_PUL_USER_OB_RD_DWORD_CNT_31_0 \ - 0x31070 /* 0xc41c0 */ -#define FBNIC_PUL_USER_OB_RD_DWORD_CNT_63_32 \ - 0x31071 /* 0xc41c4 */ -#define FBNIC_PUL_USER_OB_WR_TLP_CNT_31_0 \ - 0x31072 /* 0xc41c8 */ -#define FBNIC_PUL_USER_OB_WR_TLP_CNT_63_32 \ - 0x31073 /* 0xc41cc */ -#define FBNIC_PUL_USER_OB_WR_DWORD_CNT_31_0 \ - 0x31074 /* 0xc41d0 */ -#define FBNIC_PUL_USER_OB_WR_DWORD_CNT_63_32 \ - 0x31075 /* 0xc41d4 */ -#define FBNIC_PUL_USER_OB_CPL_TLP_CNT_31_0 \ - 0x31076 /* 0xc41d8 */ -#define FBNIC_PUL_USER_OB_CPL_TLP_CNT_63_32 \ - 0x31077 /* 0xc41dc */ -#define FBNIC_PUL_USER_OB_CPL_DWORD_CNT_31_0 \ - 0x31078 /* 0xc41e0 */ -#define FBNIC_PUL_USER_OB_CPL_DWORD_CNT_63_32 \ - 0x31079 /* 0xc41e4 */ -#define FBNIC_PUL_USER_OB_RD_DBG_CNT_CPL_CRED_31_0 \ - 0x3107a /* 0xc41e8 */ -#define FBNIC_PUL_USER_OB_RD_DBG_CNT_CPL_CRED_63_32 \ - 0x3107b /* 0xc41ec */ -#define FBNIC_PUL_USER_OB_RD_DBG_CNT_TAG_31_0 \ - 0x3107c /* 0xc41f0 */ -#define FBNIC_PUL_USER_OB_RD_DBG_CNT_TAG_63_32 \ - 0x3107d /* 0xc41f4 */ -#define FBNIC_PUL_USER_OB_RD_DBG_CNT_NP_CRED_31_0 \ - 0x3107e /* 0xc41f8 */ -#define FBNIC_PUL_USER_OB_RD_DBG_CNT_NP_CRED_63_32 \ - 0x3107f /* 0xc41fc */ -#define FBNIC_CSR_END_PUL_USER 0x31080 /* CSR section delimiter */ - /* BAR 4 CSRs */ /* The IPC mailbox consists of 32 mailboxes, with each mailbox consisting -- 2.51.0 From 26aa7992b456629882b74b2f3916dd2b94a87e7b Mon Sep 17 00:00:00 2001 From: Mohsin Bashir Date: Fri, 21 Feb 2025 12:18:13 -0800 Subject: [PATCH 16/16] eth: fbnic: Update return value in kdoc Fix return value in kdoc for fbnic_netdev_alloc() Signed-off-by: Mohsin Bashir Signed-off-by: Paolo Abeni --- drivers/net/ethernet/meta/fbnic/fbnic_netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c index cf8feb90b617..79a01fdd1dd1 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_netdev.c @@ -611,7 +611,7 @@ void fbnic_netdev_free(struct fbnic_dev *fbd) * Allocate and initialize the netdev and netdev private structure. Bind * together the hardware, netdev, and pci data structures. * - * Return: 0 on success, negative on failure + * Return: Pointer to net_device on success, NULL on failure **/ struct net_device *fbnic_netdev_alloc(struct fbnic_dev *fbd) { -- 2.51.0