From a12c76a03386e32413ae8eaaefa337e491880632 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 15 Jan 2025 09:27:54 -0500 Subject: [PATCH 01/16] net: sched: refine software bypass handling in tc_run MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This patch addresses issues with filter counting in block (tcf_block), particularly for software bypass scenarios, by introducing a more accurate mechanism using useswcnt. Previously, filtercnt and skipswcnt were introduced by: Commit 2081fd3445fe ("net: sched: cls_api: add filter counter") and Commit f631ef39d819 ("net: sched: cls_api: add skip_sw counter") filtercnt tracked all tp (tcf_proto) objects added to a block, and skipswcnt counted tp objects with the skipsw attribute set. The problem is: a single tp can contain multiple filters, some with skipsw and others without. The current implementation fails in the case: When the first filter in a tp has skipsw, both skipswcnt and filtercnt are incremented, then adding a second filter without skipsw to the same tp does not modify these counters because tp->counted is already set. This results in bypass software behavior based solely on skipswcnt equaling filtercnt, even when the block includes filters without skipsw. Consequently, filters without skipsw are inadvertently bypassed. To address this, the patch introduces useswcnt in block to explicitly count tp objects containing at least one filter without skipsw. Key changes include: Whenever a filter without skipsw is added, its tp is marked with usesw and counted in useswcnt. tc_run() now uses useswcnt to determine software bypass, eliminating reliance on filtercnt and skipswcnt. This refined approach prevents software bypass for blocks containing mixed filters, ensuring correct behavior in tc_run(). 
Additionally, as atomic operations on useswcnt ensure thread safety and tp->lock guards access to tp->usesw and tp->counted, the broader lock down_write(&block->cb_lock) is no longer required in tc_new_tfilter(), and this resolves a performance regression caused by the filter counting mechanism during parallel filter insertions. The improvement can be demonstrated using the following script: # cat insert_tc_rules.sh tc qdisc add dev ens1f0np0 ingress for i in $(seq 16); do taskset -c $i tc -b rules_$i.txt & done wait Each of rules_$i.txt files above includes 100000 tc filter rules to a mlx5 driver NIC ens1f0np0. Without this patch: # time sh insert_tc_rules.sh real 0m50.780s user 0m23.556s sys 4m13.032s With this patch: # time sh insert_tc_rules.sh real 0m17.718s user 0m7.807s sys 3m45.050s Fixes: 047f340b36fc ("net: sched: make skip_sw actually skip software") Reported-by: Shuang Li Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Reviewed-by: Asbjørn Sloth Tønnesen Tested-by: Asbjørn Sloth Tønnesen Signed-off-by: David S. 
Miller --- include/net/pkt_cls.h | 13 +++++++-- include/net/sch_generic.h | 5 ++-- net/core/dev.c | 15 ++++++----- net/sched/cls_api.c | 57 ++++++++++++++++----------------------- net/sched/cls_bpf.c | 2 ++ net/sched/cls_flower.c | 2 ++ net/sched/cls_matchall.c | 2 ++ net/sched/cls_u32.c | 4 +++ 8 files changed, 55 insertions(+), 45 deletions(-) diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index 22c5ab4269d7..c64fd896b1f9 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -75,11 +75,11 @@ static inline bool tcf_block_non_null_shared(struct tcf_block *block) } #ifdef CONFIG_NET_CLS_ACT -DECLARE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); +DECLARE_STATIC_KEY_FALSE(tcf_sw_enabled_key); static inline bool tcf_block_bypass_sw(struct tcf_block *block) { - return block && block->bypass_wanted; + return block && !atomic_read(&block->useswcnt); } #endif @@ -760,6 +760,15 @@ tc_cls_common_offload_init(struct flow_cls_common_offload *cls_common, cls_common->extack = extack; } +static inline void tcf_proto_update_usesw(struct tcf_proto *tp, u32 flags) +{ + if (tp->usesw) + return; + if (tc_skip_sw(flags) && tc_in_hw(flags)) + return; + tp->usesw = true; +} + #if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) static inline struct tc_skb_ext *tc_skb_ext_alloc(struct sk_buff *skb) { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 8074322dd636..d635c5b47eba 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -425,6 +425,7 @@ struct tcf_proto { spinlock_t lock; bool deleting; bool counted; + bool usesw; refcount_t refcnt; struct rcu_head rcu; struct hlist_node destroy_ht_node; @@ -474,9 +475,7 @@ struct tcf_block { struct flow_block flow_block; struct list_head owner_list; bool keep_dst; - bool bypass_wanted; - atomic_t filtercnt; /* Number of filters */ - atomic_t skipswcnt; /* Number of skip_sw filters */ + atomic_t useswcnt; atomic_t offloadcnt; /* Number of oddloaded filters */ unsigned int nooffloaddevcnt; /* 
Number of devs unable to do offload */ unsigned int lockeddevcnt; /* Number of devs that require rtnl lock. */ diff --git a/net/core/dev.c b/net/core/dev.c index d7cbe6ff5249..3dab6699b1c1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2248,8 +2248,8 @@ EXPORT_SYMBOL_GPL(net_dec_egress_queue); #endif #ifdef CONFIG_NET_CLS_ACT -DEFINE_STATIC_KEY_FALSE(tcf_bypass_check_needed_key); -EXPORT_SYMBOL(tcf_bypass_check_needed_key); +DEFINE_STATIC_KEY_FALSE(tcf_sw_enabled_key); +EXPORT_SYMBOL(tcf_sw_enabled_key); #endif DEFINE_STATIC_KEY_FALSE(netstamp_needed_key); @@ -4144,10 +4144,13 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, if (!miniq) return ret; - if (static_branch_unlikely(&tcf_bypass_check_needed_key)) { - if (tcf_block_bypass_sw(miniq->block)) - return ret; - } + /* Global bypass */ + if (!static_branch_likely(&tcf_sw_enabled_key)) + return ret; + + /* Block-wise bypass */ + if (tcf_block_bypass_sw(miniq->block)) + return ret; tc_skb_cb(skb)->mru = 0; tc_skb_cb(skb)->post_ct = false; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 7578e27260c9..8e47e5355be6 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -390,6 +390,7 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, tp->protocol = protocol; tp->prio = prio; tp->chain = chain; + tp->usesw = !tp->ops->reoffload; spin_lock_init(&tp->lock); refcount_set(&tp->refcnt, 1); @@ -410,39 +411,31 @@ static void tcf_proto_get(struct tcf_proto *tp) refcount_inc(&tp->refcnt); } -static void tcf_maintain_bypass(struct tcf_block *block) +static void tcf_proto_count_usesw(struct tcf_proto *tp, bool add) { - int filtercnt = atomic_read(&block->filtercnt); - int skipswcnt = atomic_read(&block->skipswcnt); - bool bypass_wanted = filtercnt > 0 && filtercnt == skipswcnt; - - if (bypass_wanted != block->bypass_wanted) { #ifdef CONFIG_NET_CLS_ACT - if (bypass_wanted) - static_branch_inc(&tcf_bypass_check_needed_key); - else - 
static_branch_dec(&tcf_bypass_check_needed_key); -#endif - block->bypass_wanted = bypass_wanted; + struct tcf_block *block = tp->chain->block; + bool counted = false; + + if (!add) { + if (tp->usesw && tp->counted) { + if (!atomic_dec_return(&block->useswcnt)) + static_branch_dec(&tcf_sw_enabled_key); + tp->counted = false; + } + return; } -} - -static void tcf_block_filter_cnt_update(struct tcf_block *block, bool *counted, bool add) -{ - lockdep_assert_not_held(&block->cb_lock); - down_write(&block->cb_lock); - if (*counted != add) { - if (add) { - atomic_inc(&block->filtercnt); - *counted = true; - } else { - atomic_dec(&block->filtercnt); - *counted = false; - } + spin_lock(&tp->lock); + if (tp->usesw && !tp->counted) { + counted = true; + tp->counted = true; } - tcf_maintain_bypass(block); - up_write(&block->cb_lock); + spin_unlock(&tp->lock); + + if (counted && atomic_inc_return(&block->useswcnt) == 1) + static_branch_inc(&tcf_sw_enabled_key); +#endif } static void tcf_chain_put(struct tcf_chain *chain); @@ -451,7 +444,7 @@ static void tcf_proto_destroy(struct tcf_proto *tp, bool rtnl_held, bool sig_destroy, struct netlink_ext_ack *extack) { tp->ops->destroy(tp, rtnl_held, extack); - tcf_block_filter_cnt_update(tp->chain->block, &tp->counted, false); + tcf_proto_count_usesw(tp, false); if (sig_destroy) tcf_proto_signal_destroyed(tp->chain, tp); tcf_chain_put(tp->chain); @@ -2409,7 +2402,7 @@ replay: tfilter_notify(net, skb, n, tp, block, q, parent, fh, RTM_NEWTFILTER, false, rtnl_held, extack); tfilter_put(tp, fh); - tcf_block_filter_cnt_update(block, &tp->counted, true); + tcf_proto_count_usesw(tp, true); /* q pointer is NULL for shared blocks */ if (q) q->flags &= ~TCQ_F_CAN_BYPASS; @@ -3532,8 +3525,6 @@ static void tcf_block_offload_inc(struct tcf_block *block, u32 *flags) if (*flags & TCA_CLS_FLAGS_IN_HW) return; *flags |= TCA_CLS_FLAGS_IN_HW; - if (tc_skip_sw(*flags)) - atomic_inc(&block->skipswcnt); atomic_inc(&block->offloadcnt); } @@ -3542,8 +3533,6 @@ 
static void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) if (!(*flags & TCA_CLS_FLAGS_IN_HW)) return; *flags &= ~TCA_CLS_FLAGS_IN_HW; - if (tc_skip_sw(*flags)) - atomic_dec(&block->skipswcnt); atomic_dec(&block->offloadcnt); } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 1941ebec23ff..7fbe42f0e5c2 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -509,6 +509,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(prog->gen_flags)) prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, prog->gen_flags); + if (oldprog) { idr_replace(&head->handle_idr, prog, handle); list_replace_rcu(&oldprog->link, &prog->link); diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1008ec8a464c..03505673d523 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2503,6 +2503,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(fnew->flags)) fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, fnew->flags); + spin_lock(&tp->lock); /* tp was deleted concurrently. 
-EAGAIN will cause caller to lookup diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 9f1e62ca508d..f03bf5da39ee 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -228,6 +228,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(new->flags)) new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, new->flags); + *arg = head; rcu_assign_pointer(tp->root, new); return 0; diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index d3a03c57545b..2a1c00048fd6 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -951,6 +951,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(new->flags)) new->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, new->flags); + u32_replace_knode(tp, tp_c, new); tcf_unbind_filter(tp, &n->res); tcf_exts_get_net(&n->exts); @@ -1164,6 +1166,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (!tc_in_hw(n->flags)) n->flags |= TCA_CLS_FLAGS_NOT_IN_HW; + tcf_proto_update_usesw(tp, n->flags); + ins = &ht->ht[TC_U32_HASH(handle)]; for (pins = rtnl_dereference(*ins); pins; ins = &pins->next, pins = rtnl_dereference(*ins)) -- 2.51.0 From 544c9394065fdbb733349bcd1464087d6e12d28e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 15 Jan 2025 20:47:03 +0100 Subject: [PATCH 02/16] dsa: Use str_enable_disable-like helpers Replace ternary (condition ? "enable" : "disable") syntax with helpers from string_choices.h because: 1. Simple function call with one argument is easier to read. Ternary operator has three arguments and with wrapping might lead to quite long code. 2. Is slightly shorter thus also easier to read. 3. It brings uniformity in the text - same string. 4. Allows deduping by the linker, which results in a smaller binary file. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Linus Walleij Reviewed-by: Vladimir Oltean Signed-off-by: David S. 
Miller --- drivers/net/dsa/mv88e6xxx/pcs-639x.c | 3 ++- drivers/net/dsa/mv88e6xxx/port.c | 3 ++- drivers/net/dsa/realtek/rtl8366rb.c | 7 ++++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/mv88e6xxx/pcs-639x.c b/drivers/net/dsa/mv88e6xxx/pcs-639x.c index 026b7bfb7ee5..59f63d6beec8 100644 --- a/drivers/net/dsa/mv88e6xxx/pcs-639x.c +++ b/drivers/net/dsa/mv88e6xxx/pcs-639x.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "chip.h" #include "global2.h" @@ -750,7 +751,7 @@ static int mv88e6393x_sgmii_apply_2500basex_an(struct mv88e639x_pcs *mpcs, if (err) dev_err(mpcs->mdio.dev.parent, "failed to %s 2500basex fix: %pe\n", - enable ? "enable" : "disable", ERR_PTR(err)); + str_enable_disable(enable), ERR_PTR(err)); return err; } diff --git a/drivers/net/dsa/mv88e6xxx/port.c b/drivers/net/dsa/mv88e6xxx/port.c index dc777ddce1f3..66b1b7277281 100644 --- a/drivers/net/dsa/mv88e6xxx/port.c +++ b/drivers/net/dsa/mv88e6xxx/port.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "chip.h" #include "global2.h" @@ -176,7 +177,7 @@ int mv88e6xxx_port_set_link(struct mv88e6xxx_chip *chip, int port, int link) dev_dbg(chip->dev, "p%d: %s link %s\n", port, reg & MV88E6XXX_PORT_MAC_CTL_FORCE_LINK ? "Force" : "Unforce", - reg & MV88E6XXX_PORT_MAC_CTL_LINK_UP ? "up" : "down"); + str_up_down(reg & MV88E6XXX_PORT_MAC_CTL_LINK_UP)); return 0; } diff --git a/drivers/net/dsa/realtek/rtl8366rb.c b/drivers/net/dsa/realtek/rtl8366rb.c index 23374178a176..4c4a95d4380c 100644 --- a/drivers/net/dsa/realtek/rtl8366rb.c +++ b/drivers/net/dsa/realtek/rtl8366rb.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "realtek.h" #include "realtek-smi.h" @@ -1522,7 +1523,7 @@ static int rtl8366rb_vlan_filtering(struct dsa_switch *ds, int port, rb = priv->chip_data; dev_dbg(priv->dev, "port %d: %s VLAN filtering\n", port, - vlan_filtering ? 
"enable" : "disable"); + str_enable_disable(vlan_filtering)); /* If the port is not in the member set, the frame will be dropped */ ret = regmap_update_bits(priv->map, RTL8366RB_VLAN_INGRESS_CTRL2_REG, @@ -1884,7 +1885,7 @@ static bool rtl8366rb_is_vlan_valid(struct realtek_priv *priv, unsigned int vlan static int rtl8366rb_enable_vlan(struct realtek_priv *priv, bool enable) { - dev_dbg(priv->dev, "%s VLAN\n", enable ? "enable" : "disable"); + dev_dbg(priv->dev, "%s VLAN\n", str_enable_disable(enable)); return regmap_update_bits(priv->map, RTL8366RB_SGCR, RTL8366RB_SGCR_EN_VLAN, enable ? RTL8366RB_SGCR_EN_VLAN : 0); @@ -1892,7 +1893,7 @@ static int rtl8366rb_enable_vlan(struct realtek_priv *priv, bool enable) static int rtl8366rb_enable_vlan4k(struct realtek_priv *priv, bool enable) { - dev_dbg(priv->dev, "%s VLAN 4k\n", enable ? "enable" : "disable"); + dev_dbg(priv->dev, "%s VLAN 4k\n", str_enable_disable(enable)); return regmap_update_bits(priv->map, RTL8366RB_SGCR, RTL8366RB_SGCR_EN_VLAN_4KTB, enable ? RTL8366RB_SGCR_EN_VLAN_4KTB : 0); -- 2.51.0 From 45bd1c5ba7580f612e46f3c6cb430c64adfd0294 Mon Sep 17 00:00:00 2001 From: =?utf8?q?=E8=B0=A2=E8=87=B4=E9=82=A6=20=28XIE=20Zhibang=29?= Date: Fri, 17 Jan 2025 01:41:40 +0000 Subject: [PATCH 03/16] net: appletalk: Drop aarp_send_probe_phase1() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit aarp_send_probe_phase1() used to work by calling ndo_do_ioctl of appletalk drivers ltpc or cops, but these two drivers have been removed since the following commits: commit 03dcb90dbf62 ("net: appletalk: remove Apple/Farallon LocalTalk PC support") commit 00f3696f7555 ("net: appletalk: remove cops support") Thus aarp_send_probe_phase1() no longer works, so drop it. (found by code inspection) Signed-off-by: 谢致邦 (XIE Zhibang) Signed-off-by: David S. 
Miller --- net/appletalk/aarp.c | 45 +++++++------------------------------------- 1 file changed, 7 insertions(+), 38 deletions(-) diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c index 9fa0b246902b..05cbb3c227c5 100644 --- a/net/appletalk/aarp.c +++ b/net/appletalk/aarp.c @@ -432,49 +432,18 @@ static struct atalk_addr *__aarp_proxy_find(struct net_device *dev, return a ? sa : NULL; } -/* - * Probe a Phase 1 device or a device that requires its Net:Node to - * be set via an ioctl. - */ -static void aarp_send_probe_phase1(struct atalk_iface *iface) -{ - struct ifreq atreq; - struct sockaddr_at *sa = (struct sockaddr_at *)&atreq.ifr_addr; - const struct net_device_ops *ops = iface->dev->netdev_ops; - - sa->sat_addr.s_node = iface->address.s_node; - sa->sat_addr.s_net = ntohs(iface->address.s_net); - - /* We pass the Net:Node to the drivers/cards by a Device ioctl. */ - if (!(ops->ndo_do_ioctl(iface->dev, &atreq, SIOCSIFADDR))) { - ops->ndo_do_ioctl(iface->dev, &atreq, SIOCGIFADDR); - if (iface->address.s_net != htons(sa->sat_addr.s_net) || - iface->address.s_node != sa->sat_addr.s_node) - iface->status |= ATIF_PROBE_FAIL; - - iface->address.s_net = htons(sa->sat_addr.s_net); - iface->address.s_node = sa->sat_addr.s_node; - } -} - - void aarp_probe_network(struct atalk_iface *atif) { - if (atif->dev->type == ARPHRD_LOCALTLK || - atif->dev->type == ARPHRD_PPP) - aarp_send_probe_phase1(atif); - else { - unsigned int count; + unsigned int count; - for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) { - aarp_send_probe(atif->dev, &atif->address); + for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) { + aarp_send_probe(atif->dev, &atif->address); - /* Defer 1/10th */ - msleep(100); + /* Defer 1/10th */ + msleep(100); - if (atif->status & ATIF_PROBE_FAIL) - break; - } + if (atif->status & ATIF_PROBE_FAIL) + break; } } -- 2.51.0 From af10e092b77aaa11f056765d979e9be7e8276a3a Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Fri, 17 Jan 2025 
08:44:25 +0000 Subject: [PATCH 04/16] net: phylink: always do a major config when attaching a SFP PHY Background: https://lore.kernel.org/r/20250107123615.161095-1-ericwouds@gmail.com Since adding negotiation of in-band capabilities, it is no longer sufficient to just look at the MLO_AN_xxx mode and PHY interface to decide whether to do a major configuration, since the result now depends on the capabilities of the attaching PHY. Always trigger a major configuration in this case. Testing log: https://lore.kernel.org/r/f20c9744-3953-40e7-a9c9-5534b25d2e2a@gmail.com Reported-by: Eric Woudstra Tested-by: Eric Woudstra Signed-off-by: Russell King (Oracle) Signed-off-by: David S. Miller --- drivers/net/phy/phylink.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 66eea3f963d3..d130634d3bc7 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -3541,12 +3541,11 @@ static phy_interface_t phylink_choose_sfp_interface(struct phylink *pl, return interface; } -static void phylink_sfp_set_config(struct phylink *pl, - unsigned long *supported, - struct phylink_link_state *state) +static void phylink_sfp_set_config(struct phylink *pl, unsigned long *supported, + struct phylink_link_state *state, + bool changed) { u8 mode = MLO_AN_INBAND; - bool changed = false; phylink_dbg(pl, "requesting link mode %s/%s with support %*pb\n", phylink_an_mode_str(mode), phy_modes(state->interface), @@ -3623,7 +3622,7 @@ static int phylink_sfp_config_phy(struct phylink *pl, struct phy_device *phy) pl->link_port = pl->sfp_port; - phylink_sfp_set_config(pl, support, &config); + phylink_sfp_set_config(pl, support, &config, true); return 0; } @@ -3698,7 +3697,7 @@ static int phylink_sfp_config_optical(struct phylink *pl) pl->link_port = pl->sfp_port; - phylink_sfp_set_config(pl, pl->sfp_support, &config); + phylink_sfp_set_config(pl, pl->sfp_support, &config, false); return 0; } -- 2.51.0 From 
5fe71fda89745fc3cd95f70d06e9162b595c3702 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 17 Jan 2025 12:36:14 +0300 Subject: [PATCH 05/16] tipc: re-order conditions in tipc_crypto_key_rcv() On a 32bit system the "keylen + sizeof(struct tipc_aead_key)" math could have an integer wrapping issue. It doesn't matter because the "keylen" is checked on the next line, but just to make life easier for static analysis tools, let's re-order these conditions and avoid the integer overflow. Signed-off-by: Dan Carpenter Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/tipc/crypto.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 43c3f1c971b8..c524421ec652 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -2293,8 +2293,8 @@ static bool tipc_crypto_key_rcv(struct tipc_crypto *rx, struct tipc_msg *hdr) keylen = ntohl(*((__be32 *)(data + TIPC_AEAD_ALG_NAME))); /* Verify the supplied size values */ - if (unlikely(size != keylen + sizeof(struct tipc_aead_key) || - keylen > TIPC_AEAD_KEY_SIZE_MAX)) { + if (unlikely(keylen > TIPC_AEAD_KEY_SIZE_MAX || + size != keylen + sizeof(struct tipc_aead_key))) { pr_debug("%s: invalid MSG_CRYPTO key size\n", rx->name); goto exit; } -- 2.51.0 From 457bb7970a0f10effd6b4ca9e1057727963c473a Mon Sep 17 00:00:00 2001 From: Ales Nezbeda Date: Fri, 17 Jan 2025 12:22:28 +0100 Subject: [PATCH 06/16] net: macsec: Add endianness annotations in salt struct This change resolves warnings produced by the sparse tool: currently there is a mismatch between the plain generic types in the salt struct and the endian-annotated types used in the macsec driver code. Endian-annotated types should be used here. 
Sparse output: warning: restricted ssci_t degrades to integer warning: incorrect type in assignment (different base types) expected restricted ssci_t [usertype] ssci got unsigned int warning: restricted __be64 degrades to integer warning: incorrect type in assignment (different base types) expected restricted __be64 [usertype] pn got unsigned long long Signed-off-by: Ales Nezbeda Reviewed-by: Simon Horman Reviewed-by: Sabrina Dubroca Signed-off-by: David S. Miller --- include/net/macsec.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/macsec.h b/include/net/macsec.h index de216cbc6b05..bc7de5b53e54 100644 --- a/include/net/macsec.h +++ b/include/net/macsec.h @@ -38,8 +38,8 @@ struct metadata_dst; typedef union salt { struct { - u32 ssci; - u64 pn; + ssci_t ssci; + __be64 pn; } __packed; u8 bytes[MACSEC_SALT_LEN]; } __packed salt_t; -- 2.51.0 From 64ff63aeefb03139ae27454bd4208244579ae88e Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Fri, 17 Jan 2025 23:24:21 +0100 Subject: [PATCH 07/16] net: phy: realtek: HWMON support for standalone versions of RTL8221B and RTL8251 HWMON support has been added for the RTL8221/8251 PHYs integrated together with the MAC inside the RTL8125/8126 chips. This patch extends temperature reading support for standalone variants of the mentioned PHYs. I don't know whether the earlier revisions of the RTL8226 also have a built-in temperature sensor, so they have been skipped for now. Tested on RTL8221B-VB-CG. Signed-off-by: Aleksander Jan Bajkowski Reviewed-by: Andrew Lunn Signed-off-by: David S. 
Miller --- drivers/net/phy/realtek/realtek_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/phy/realtek/realtek_main.c b/drivers/net/phy/realtek/realtek_main.c index 38149958d95b..11ff44c3be5b 100644 --- a/drivers/net/phy/realtek/realtek_main.c +++ b/drivers/net/phy/realtek/realtek_main.c @@ -1475,6 +1475,7 @@ static struct phy_driver realtek_drvs[] = { }, { .match_phy_device = rtl8221b_vb_cg_c22_match_phy_device, .name = "RTL8221B-VB-CG 2.5Gbps PHY (C22)", + .probe = rtl822x_probe, .get_features = rtl822x_get_features, .config_aneg = rtl822x_config_aneg, .config_init = rtl822xb_config_init, @@ -1487,6 +1488,7 @@ static struct phy_driver realtek_drvs[] = { }, { .match_phy_device = rtl8221b_vb_cg_c45_match_phy_device, .name = "RTL8221B-VB-CG 2.5Gbps PHY (C45)", + .probe = rtl822x_probe, .config_init = rtl822xb_config_init, .get_rate_matching = rtl822xb_get_rate_matching, .get_features = rtl822x_c45_get_features, @@ -1497,6 +1499,7 @@ static struct phy_driver realtek_drvs[] = { }, { .match_phy_device = rtl8221b_vn_cg_c22_match_phy_device, .name = "RTL8221B-VM-CG 2.5Gbps PHY (C22)", + .probe = rtl822x_probe, .get_features = rtl822x_get_features, .config_aneg = rtl822x_config_aneg, .config_init = rtl822xb_config_init, @@ -1509,6 +1512,7 @@ static struct phy_driver realtek_drvs[] = { }, { .match_phy_device = rtl8221b_vn_cg_c45_match_phy_device, .name = "RTL8221B-VN-CG 2.5Gbps PHY (C45)", + .probe = rtl822x_probe, .config_init = rtl822xb_config_init, .get_rate_matching = rtl822xb_get_rate_matching, .get_features = rtl822x_c45_get_features, @@ -1519,6 +1523,7 @@ static struct phy_driver realtek_drvs[] = { }, { .match_phy_device = rtl8251b_c45_match_phy_device, .name = "RTL8251B 5Gbps PHY", + .probe = rtl822x_probe, .get_features = rtl822x_get_features, .config_aneg = rtl822x_config_aneg, .read_status = rtl822x_read_status, -- 2.51.0 From 3a0b7fa095212b51ed63892540c4f249991a2d74 Mon Sep 17 00:00:00 2001 From: Liu Ye Date: Thu, 16 Jan 2025 09:30:37 
+0800 Subject: [PATCH 08/16] selftests/net/ipsec: Fix Null pointer dereference in rtattr_pack() Address Null pointer dereference / undefined behavior in rtattr_pack (note that size is 0 in the bad case). Flagged by cppcheck as: tools/testing/selftests/net/ipsec.c:230:25: warning: Possible null pointer dereference: payload [nullPointer] memcpy(RTA_DATA(attr), payload, size); ^ tools/testing/selftests/net/ipsec.c:1618:54: note: Calling function 'rtattr_pack', 4th argument 'NULL' value is 0 if (rtattr_pack(&req.nh, sizeof(req), XFRMA_IF_ID, NULL, 0)) { ^ tools/testing/selftests/net/ipsec.c:230:25: note: Null pointer dereference memcpy(RTA_DATA(attr), payload, size); ^ Signed-off-by: Liu Ye Link: https://patch.msgid.link/20250116013037.29470-1-liuye@kylinos.cn Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/ipsec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c index be4a30a0d02a..9b44a091802c 100644 --- a/tools/testing/selftests/net/ipsec.c +++ b/tools/testing/selftests/net/ipsec.c @@ -227,7 +227,8 @@ static int rtattr_pack(struct nlmsghdr *nh, size_t req_sz, attr->rta_len = RTA_LENGTH(size); attr->rta_type = rta_type; - memcpy(RTA_DATA(attr), payload, size); + if (payload) + memcpy(RTA_DATA(attr), payload, size); return 0; } -- 2.51.0 From 454d402481d45af79ee7eea7e64bce02bbbe9766 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:34 +0900 Subject: [PATCH 09/16] net: dropreason: Gather SOCKET_ drop reasons. The following patch adds a new drop reason starting with the SOCKET_ prefix. Let's gather the existing SOCKET_ reasons. Note that the order is not part of uAPI. 
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index ed864934e20b..f3714cbea50d 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -6,9 +6,10 @@ #define DEFINE_DROP_REASON(FN, FNe) \ FN(NOT_SPECIFIED) \ FN(NO_SOCKET) \ + FN(SOCKET_FILTER) \ + FN(SOCKET_RCVBUFF) \ FN(PKT_TOO_SMALL) \ FN(TCP_CSUM) \ - FN(SOCKET_FILTER) \ FN(UDP_CSUM) \ FN(NETFILTER_DROP) \ FN(OTHERHOST) \ @@ -18,7 +19,6 @@ FN(UNICAST_IN_L2_MULTICAST) \ FN(XFRM_POLICY) \ FN(IP_NOPROTO) \ - FN(SOCKET_RCVBUFF) \ FN(PROTO_MEM) \ FN(TCP_AUTH_HDR) \ FN(TCP_MD5NOTFOUND) \ @@ -138,12 +138,14 @@ enum skb_drop_reason { * 3) no valid child socket during 3WHS process */ SKB_DROP_REASON_NO_SOCKET, + /** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */ + SKB_DROP_REASON_SOCKET_FILTER, + /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ + SKB_DROP_REASON_SOCKET_RCVBUFF, /** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */ SKB_DROP_REASON_PKT_TOO_SMALL, /** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */ SKB_DROP_REASON_TCP_CSUM, - /** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */ - SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_UDP_CSUM: UDP checksum error */ SKB_DROP_REASON_UDP_CSUM, /** @SKB_DROP_REASON_NETFILTER_DROP: dropped by netfilter */ @@ -174,8 +176,6 @@ enum skb_drop_reason { SKB_DROP_REASON_XFRM_POLICY, /** @SKB_DROP_REASON_IP_NOPROTO: no support for IP protocol */ SKB_DROP_REASON_IP_NOPROTO, - /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ - SKB_DROP_REASON_SOCKET_RCVBUFF, /** * @SKB_DROP_REASON_PROTO_MEM: proto memory limitation, such as * udp packet drop out of udp_memory_allocated. 
-- 2.51.0 From c32f0bd7d4838982c6724fca0da92353f27c6f88 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:35 +0900 Subject: [PATCH 10/16] af_unix: Set drop reason in unix_release_sock(). unix_release_sock() is called when the last refcnt of struct file is released. Let's define a new drop reason SKB_DROP_REASON_SOCKET_CLOSE and set it for kfree_skb() in unix_release_sock(). # echo 1 > /sys/kernel/tracing/events/skb/kfree_skb/enable # python3 >>> from socket import * >>> s1, s2 = socketpair(AF_UNIX) >>> s1.send(b'hello world') >>> s2.close() # cat /sys/kernel/tracing/trace_pipe ... python3-280 ... kfree_skb: ... protocol=0 location=unix_release_sock+0x260/0x420 reason: SOCKET_CLOSE To be precise, unix_release_sock() is also called for a new child socket in unix_stream_connect() when something fails, but the new sk does not have skb in the recv queue then and no event is logged. Note that only tcp_inbound_ao_hash() uses a similar drop reason, SKB_DROP_REASON_TCP_CLOSE, and this can be generalised later. 
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 3 +++ net/unix/af_unix.c | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index f3714cbea50d..b9e7ff853ce3 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -6,6 +6,7 @@ #define DEFINE_DROP_REASON(FN, FNe) \ FN(NOT_SPECIFIED) \ FN(NO_SOCKET) \ + FN(SOCKET_CLOSE) \ FN(SOCKET_FILTER) \ FN(SOCKET_RCVBUFF) \ FN(PKT_TOO_SMALL) \ @@ -138,6 +139,8 @@ enum skb_drop_reason { * 3) no valid child socket during 3WHS process */ SKB_DROP_REASON_NO_SOCKET, + /** @SKB_DROP_REASON_SOCKET_CLOSE: socket is close()d */ + SKB_DROP_REASON_SOCKET_CLOSE, /** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */ SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8f2b605ce5b3..a05d25cc5545 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -715,8 +715,8 @@ static void unix_release_sock(struct sock *sk, int embrion) if (state == TCP_LISTEN) unix_release_sock(skb->sk, 1); - /* passed fds are erased in the kfree_skb hook */ - kfree_skb(skb); + /* passed fds are erased in the kfree_skb hook */ + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); } if (path.dentry) -- 2.51.0 From 4d0446b7a214e2aa28c0e914329610731f665ad2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:36 +0900 Subject: [PATCH 11/16] af_unix: Set drop reason in unix_sock_destructor(). unix_sock_destructor() is called as sk->sk_destruct() just before the socket is actually freed. Let's use SKB_DROP_REASON_SOCKET_CLOSE for skb_queue_purge(). 
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index a05d25cc5545..41b99984008a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -640,7 +640,7 @@ static void unix_sock_destructor(struct sock *sk) { struct unix_sock *u = unix_sk(sk); - skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE); DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); -- 2.51.0 From c49a157c33c45cf00a1881e8c1f65bed5ff0023e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:37 +0900 Subject: [PATCH 12/16] af_unix: Set drop reason in __unix_gc(). Inflight file descriptors by SCM_RIGHTS hold references to the struct file. AF_UNIX sockets could hold references to each other, forming reference cycles. Once such sockets are close()d without the fd recv()ed, they will be inaccessible from userspace but remain in the kernel. __unix_gc() garbage-collects skb with the dead file descriptors and frees them by __skb_queue_purge(). Let's set SKB_DROP_REASON_SOCKET_CLOSE there. # echo 1 > /sys/kernel/tracing/events/skb/kfree_skb/enable # python3 >>> from socket import * >>> from array import array >>> >>> # Create a reference cycle >>> s1 = socket(AF_UNIX, SOCK_DGRAM) >>> s1.bind('') >>> s1.sendmsg([b"nop"], [(SOL_SOCKET, SCM_RIGHTS, array("i", [s1.fileno()]))], 0, s1.getsockname()) >>> s1.close() >>> >>> # Trigger GC >>> s2 = socket(AF_UNIX) >>> s2.close() # cat /sys/kernel/tracing/trace_pipe ... kworker/u16:2-42 ... kfree_skb: ... 
location=__unix_gc+0x4ad/0x580 reason: SOCKET_CLOSE Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 0068e758be4d..9848b7b78701 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -573,7 +573,7 @@ static void __unix_gc(struct work_struct *work) UNIXCB(skb).fp->dead = true; } - __skb_queue_purge(&hitlist); + __skb_queue_purge_reason(&hitlist, SKB_DROP_REASON_SOCKET_CLOSE); skip_gc: WRITE_ONCE(gc_in_progress, false); } -- 2.51.0 From 533643b091dd6e246d57caf81e6892fa9cbb1cc9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:38 +0900 Subject: [PATCH 13/16] af_unix: Set drop reason in manage_oob(). AF_UNIX SOCK_STREAM socket supports MSG_OOB. When OOB data is sent to a socket, recv() will break at that point. If the next recv() does not have MSG_OOB, the normal data following the OOB data is returned. Then, the OOB skb is dropped. Let's define a new drop reason for that case in manage_oob(). # echo 1 > /sys/kernel/tracing/events/skb/kfree_skb/enable # python3 >>> from socket import * >>> s1, s2 = socketpair(AF_UNIX) >>> s1.send(b'a', MSG_OOB) >>> s1.send(b'b') >>> s2.recv(2) b'b' # cat /sys/kernel/tracing/trace_pipe ... python3-223 ... kfree_skb: ... 
location=unix_stream_read_generic+0x59e/0xc20 reason: UNIX_SKIP_OOB Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 6 ++++++ net/unix/af_unix.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index b9e7ff853ce3..d6c9d841eb11 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -9,6 +9,7 @@ FN(SOCKET_CLOSE) \ FN(SOCKET_FILTER) \ FN(SOCKET_RCVBUFF) \ + FN(UNIX_SKIP_OOB) \ FN(PKT_TOO_SMALL) \ FN(TCP_CSUM) \ FN(UDP_CSUM) \ @@ -145,6 +146,11 @@ enum skb_drop_reason { SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ SKB_DROP_REASON_SOCKET_RCVBUFF, + /** + * @SKB_DROP_REASON_UNIX_SKIP_OOB: Out-Of-Band data is skipped by + * recv() without MSG_OOB so dropped. + */ + SKB_DROP_REASON_UNIX_SKIP_OOB, /** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */ SKB_DROP_REASON_PKT_TOO_SMALL, /** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 41b99984008a..e31fda1d319f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2695,7 +2695,7 @@ unlock: spin_unlock(&sk->sk_receive_queue.lock); consume_skb(read_skb); - kfree_skb(unread_skb); + kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB); return skb; } -- 2.51.0 From bace4b468049a558295a0f59460fcb51e28f8fde Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:39 +0900 Subject: [PATCH 14/16] af_unix: Set drop reason in unix_stream_read_skb(). unix_stream_read_skb() is called when BPF SOCKMAP reads some data from a socket in the map. SOCKMAP does not support MSG_OOB, and reading OOB results in a drop. Let's set drop reasons respectively. 
* SOCKET_CLOSE : the socket in SOCKMAP was close()d * UNIX_SKIP_OOB : OOB was read from the socket in SOCKMAP Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e31fda1d319f..de4966e1b7ff 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2724,7 +2724,7 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (sock_flag(sk, SOCK_DEAD)) { unix_state_unlock(sk); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); return -ECONNRESET; } @@ -2738,7 +2738,7 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) unix_state_unlock(sk); if (drop) { - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); return -EAGAIN; } } -- 2.51.0 From b3e365bbf4f47b8f76b25b0fcf3f38916ca53e42 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:40 +0900 Subject: [PATCH 15/16] af_unix: Set drop reason in unix_dgram_disconnected(). unix_dgram_disconnected() is called from two places: 1. when a connect()ed socket dis-connect()s or re-connect()s to another socket 2. when sendmsg() fails because the peer socket that the client has connect()ed to has been close()d Then, the client's recv queue is purged to remove all messages from the old peer socket. Let's define a new drop reason for that case. # echo 1 > /sys/kernel/tracing/events/skb/kfree_skb/enable # python3 >>> from socket import * >>> >>> # s1 has a message from s2 >>> s1, s2 = socketpair(AF_UNIX, SOCK_DGRAM) >>> s2.send(b'hello world') >>> >>> # re-connect() drops the message from s2 >>> s3 = socket(AF_UNIX, SOCK_DGRAM) >>> s3.bind('') >>> s1.connect(s3.getsockname()) # cat /sys/kernel/tracing/trace_pipe python3-250 ... kfree_skb: ... 
location=skb_queue_purge_reason+0xdc/0x110 reason: UNIX_DISCONNECT Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 7 +++++++ net/unix/af_unix.c | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index d6c9d841eb11..32a34dfe8cc5 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -9,6 +9,7 @@ FN(SOCKET_CLOSE) \ FN(SOCKET_FILTER) \ FN(SOCKET_RCVBUFF) \ + FN(UNIX_DISCONNECT) \ FN(UNIX_SKIP_OOB) \ FN(PKT_TOO_SMALL) \ FN(TCP_CSUM) \ @@ -146,6 +147,12 @@ enum skb_drop_reason { SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ SKB_DROP_REASON_SOCKET_RCVBUFF, + /** + * @SKB_DROP_REASON_UNIX_DISCONNECT: recv queue is purged when SOCK_DGRAM + * or SOCK_SEQPACKET socket re-connect()s to another socket or notices + * during send() that the peer has been close()d. + */ + SKB_DROP_REASON_UNIX_DISCONNECT, /** * @SKB_DROP_REASON_UNIX_SKIP_OOB: Out-Of-Band data is skipped by * recv() without MSG_OOB so dropped. diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index de4966e1b7ff..5e1b408c19da 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -622,7 +622,9 @@ static void unix_write_space(struct sock *sk) static void unix_dgram_disconnected(struct sock *sk, struct sock *other) { if (!skb_queue_empty(&sk->sk_receive_queue)) { - skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge_reason(&sk->sk_receive_queue, + SKB_DROP_REASON_UNIX_DISCONNECT); + wake_up_interruptible_all(&unix_sk(sk)->peer_wait); /* If one link of bidirectional dgram pipe is disconnected, -- 2.51.0 From 3b2d40dc13c26a4efde438beb664576d20a9fb4a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:41 +0900 Subject: [PATCH 16/16] af_unix: Reuse out_pipe label in unix_stream_sendmsg(). 
This is a follow-up of commit d460b04bc452 ("af_unix: Clean up error paths in unix_stream_sendmsg()."). If we initialise skb with NULL in unix_stream_sendmsg(), we can reuse the existing out_pipe label for the SEND_SHUTDOWN check. Let's rename it and adjust the existing label as out_pipe_lock. While at it, size and data_len are moved to the while loop scope. Suggested-by: Paolo Abeni Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5e1b408c19da..43a45cf06f2e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2238,13 +2238,11 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; + struct sk_buff *skb = NULL; struct sock *other = NULL; - int err, size; - struct sk_buff *skb; - int sent = 0; struct scm_cookie scm; bool fds_sent = false; - int data_len; + int err, sent = 0; err = scm_send(sock, msg, &scm, false); if (err < 0) @@ -2273,16 +2271,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, } } - if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) { - if (!(msg->msg_flags & MSG_NOSIGNAL)) - send_sig(SIGPIPE, current, 0); - - err = -EPIPE; - goto out_err; - } + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) + goto out_pipe; while (sent < len) { - size = len - sent; + int size = len - sent; + int data_len; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb = sock_alloc_send_pskb(sk, 0, 0, @@ -2335,7 +2329,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) - goto out_pipe; + goto out_pipe_unlock; maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); @@ -2358,8 +2352,9 @@ static int unix_stream_sendmsg(struct socket *sock, 
struct msghdr *msg, return sent; -out_pipe: +out_pipe_unlock: unix_state_unlock(other); +out_pipe: if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; -- 2.51.0