From ce6cb8113c842b94e77364b247c4f85c7b34e0c2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Apr 2025 14:18:46 -0700 Subject: [PATCH 01/16] tools: ynl-gen: individually free previous values on double set When user calls request_attrA_set() multiple times (for the same attribute), and attrA is of type which allocates memory - we try to free the previously associated values. For array types (including multi-attr) we have only freed the array, but the array may have contained pointers. Refactor the code generation for free attr and reuse the generated lines in setters to flush out the previous state. Since setters are static inlines in the header we need to add forward declarations for the free helpers of pure nested structs. Track which types get used by arrays and include the right forwad declarations. At least ethtool string set and bit set would not be freed without this. Tho, admittedly, overriding already set attribute twice is likely a very very rare thing to do. Fixes: be5bea1cc0bf ("net: add basic C code generators for Netlink") Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250414211851.602096-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ynl_gen_c.py | 62 +++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 17 deletions(-) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 662a925bd9e1..2d856ccc88f4 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -162,9 +162,15 @@ class Type(SpecAttr): def free_needs_iter(self): return False - def free(self, ri, var, ref): + def _free_lines(self, ri, var, ref): if self.is_multi_val() or self.presence_type() == 'len': - ri.cw.p(f'free({var}->{ref}{self.c_name});') + return [f'free({var}->{ref}{self.c_name});'] + return [] + + def free(self, ri, var, ref): + lines = self._free_lines(ri, var, ref) + for line in lines: + ri.cw.p(line) def arg_member(self, ri): member = self._complex_member_type(ri) @@ -263,6 +269,10 @@ class Type(SpecAttr): var = "req" member = f"{var}->{'.'.join(ref)}" + local_vars = [] + if self.free_needs_iter(): + local_vars += ['unsigned int i;'] + code = [] presence = '' for i in range(0, len(ref)): @@ -272,6 +282,10 @@ class Type(SpecAttr): if i == len(ref) - 1 and self.presence_type() != 'bit': continue code.append(presence + ' = 1;') + ref_path = '.'.join(ref[:-1]) + if ref_path: + ref_path += '.' + code += self._free_lines(ri, var, ref_path) code += self._setter_lines(ri, member, presence) func_name = f"{op_prefix(ri, direction, deref=deref)}_set_{'_'.join(ref)}" @@ -279,7 +293,8 @@ class Type(SpecAttr): alloc = bool([x for x in code if 'alloc(' in x]) if free and not alloc: func_name = '__' + func_name - ri.cw.write_func('static inline void', func_name, body=code, + ri.cw.write_func('static inline void', func_name, local_vars=local_vars, + body=code, args=[f'{type_name(ri, direction, deref=deref)} *{var}'] + self.arg_member(ri)) @@ -482,8 +497,7 @@ class TypeString(Type): ['unsigned int len;'] def _setter_lines(self, ri, member, presence): - return [f"free({member});", - f"{presence}_len = strlen({self.c_name});", + return [f"{presence}_len = strlen({self.c_name});", f"{member} = malloc({presence}_len + 1);", f'memcpy({member}, {self.c_name}, {presence}_len);', f'{member}[{presence}_len] = 0;'] @@ -536,8 +550,7 @@ class TypeBinary(Type): ['unsigned int len;'] def _setter_lines(self, ri, member, presence): - return [f"free({member});", - f"{presence}_len = len;", + return [f"{presence}_len = len;", f"{member} = malloc({presence}_len);", f'memcpy({member}, {self.c_name}, {presence}_len);'] @@ -574,12 +587,14 @@ class TypeNest(Type): def _complex_member_type(self, ri): return self.nested_struct_type - def free(self, ri, var, ref): + def _free_lines(self, ri, var, ref): + lines = [] at = '&' if self.is_recursive_for_op(ri): at = '' - ri.cw.p(f'if ({var}->{ref}{self.c_name})') - ri.cw.p(f'{self.nested_render_name}_free({at}{var}->{ref}{self.c_name});') + lines += [f'if ({var}->{ref}{self.c_name})'] + lines += [f'{self.nested_render_name}_free({at}{var}->{ref}{self.c_name});'] + return lines def _attr_typol(self): return f'.type = YNL_PT_NEST, .nest = &{self.nested_render_name}_nest, ' @@ -632,15 +647,19 @@ class TypeMultiAttr(Type): def free_needs_iter(self): return 'type' not in self.attr or self.attr['type'] == 'nest' - def free(self, ri, var, ref): + def _free_lines(self, ri, var, ref): + lines = [] if self.attr['type'] in scalars: - ri.cw.p(f"free({var}->{ref}{self.c_name});") + lines += [f"free({var}->{ref}{self.c_name});"] elif 'type' not in self.attr or self.attr['type'] == 'nest': - ri.cw.p(f"for (i = 0; i < {var}->{ref}n_{self.c_name}; i++)") - ri.cw.p(f'{self.nested_render_name}_free(&{var}->{ref}{self.c_name}[i]);') - ri.cw.p(f"free({var}->{ref}{self.c_name});") + lines += [ + f"for (i = 0; i < {var}->{ref}n_{self.c_name}; i++)", + f'{self.nested_render_name}_free(&{var}->{ref}{self.c_name}[i]);', + f"free({var}->{ref}{self.c_name});", + ] else: raise Exception(f"Free of MultiAttr sub-type {self.attr['type']} not supported yet") + return lines def _attr_policy(self, policy): return self.base_type._attr_policy(policy) @@ -666,8 +685,7 @@ class TypeMultiAttr(Type): def _setter_lines(self, ri, member, presence): # For multi-attr we have a count, not presence, hack up the presence presence = presence[:-(len('_present.') + len(self.c_name))] + "n_" + self.c_name - return [f"free({member});", - f"{member} = {self.c_name};", + return [f"{member} = {self.c_name};", f"{presence} = n_{self.c_name};"] @@ -755,6 +773,7 @@ class Struct: self.request = False self.reply = False self.recursive = False + self.in_multi_val = False # used by a MultiAttr or and legacy arrays self.attr_list = [] self.attrs = dict() @@ -1122,6 +1141,10 @@ class Family(SpecFamily): if attr in rs_members['reply']: self.pure_nested_structs[nested].reply = True + if spec.is_multi_val(): + child = self.pure_nested_structs.get(nested) + child.in_multi_val = True + self._sort_pure_types() # Propagate the request / reply / recursive @@ -1136,6 +1159,8 @@ class Family(SpecFamily): struct.child_nests.update(child.child_nests) child.request |= struct.request child.reply |= struct.reply + if spec.is_multi_val(): + child.in_multi_val = True if attr_set in struct.child_nests: struct.recursive = True @@ -2958,6 +2983,9 @@ def main(): for attr_set, struct in parsed.pure_nested_structs.items(): ri = RenderInfo(cw, parsed, args.mode, "", "", attr_set) print_type_full(ri, struct) + if struct.request and struct.in_multi_val: + free_rsp_nested_prototype(ri) + cw.nl() for op_name, op in parsed.ops.items(): cw.p(f"/* ============== {op.enum_name} ============== */") -- 2.51.0 From 57e7dedf2b8c72caa6f04b9e08b19e4f370562fa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Apr 2025 14:18:47 -0700 Subject: [PATCH 02/16] tools: ynl-gen: make sure we validate subtype of array-nest ArrayNest AKA indexed-array support currently skips inner type validation. We count the attributes and then we parse them, make sure we call validate, too. Otherwise buggy / unexpected kernel response may lead to crashes. Fixes: be5bea1cc0bf ("net: add basic C code generators for Netlink") Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250414211851.602096-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/net/ynl/pyynl/ynl_gen_c.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/net/ynl/pyynl/ynl_gen_c.py b/tools/net/ynl/pyynl/ynl_gen_c.py index 2d856ccc88f4..30c0a34b2784 100755 --- a/tools/net/ynl/pyynl/ynl_gen_c.py +++ b/tools/net/ynl/pyynl/ynl_gen_c.py @@ -714,8 +714,11 @@ class TypeArrayNest(Type): def _attr_get(self, ri, var): local_vars = ['const struct nlattr *attr2;'] get_lines = [f'attr_{self.c_name} = attr;', - 'ynl_attr_for_each_nested(attr2, attr)', - f'\t{var}->n_{self.c_name}++;'] + 'ynl_attr_for_each_nested(attr2, attr) {', + '\tif (ynl_attr_validate(yarg, attr2))', + '\t\treturn YNL_PARSE_CB_ERROR;', + f'\t{var}->n_{self.c_name}++;', + '}'] return get_lines, None, local_vars -- 2.51.0 From acf4da17deada7f8b120e051aa6c9cac40dbd83b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Apr 2025 14:18:48 -0700 Subject: [PATCH 03/16] netlink: specs: rt-link: add an attr layer around alt-ifname alt-ifname attr is directly placed in requests (as an alternative to ifname) but in responses its wrapped up in IFLA_PROP_LIST and only there is may be multi-attr. See rtnl_fill_prop_list(). Fixes: b2f63d904e72 ("doc/netlink: Add spec for rt link messages") Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250414211851.602096-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_link.yaml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 31238455f8e9..200e9a7e5b11 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -1113,11 +1113,10 @@ attribute-sets: - name: prop-list type: nest - nested-attributes: link-attrs + nested-attributes: prop-list-link-attrs - name: alt-ifname type: string - multi-attr: true - name: perm-address type: binary @@ -1163,6 +1162,13 @@ attribute-sets: - name: netns-immutable type: u8 + - + name: prop-list-link-attrs + subset-of: link-attrs + attributes: + - + name: alt-ifname + multi-attr: true - name: af-spec-attrs attributes: @@ -2453,7 +2459,6 @@ operations: - min-mtu - max-mtu - prop-list - - alt-ifname - perm-address - proto-down-reason - parent-dev-name -- 2.51.0 From 540201c0ef7e8e7b169f68a238ade931a81a31a6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Apr 2025 14:18:49 -0700 Subject: [PATCH 04/16] netlink: specs: rtnetlink: attribute naming corrections Some attribute names diverge in very minor ways from the C names. These are most likely typos, and they prevent the C codegen from working. Fixes: bc515ed06652 ("netlink: specs: Add a spec for neighbor tables in rtnetlink") Fixes: b2f63d904e72 ("doc/netlink: Add spec for rt link messages") Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250414211851.602096-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_link.yaml | 6 +++--- Documentation/netlink/specs/rt_neigh.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 200e9a7e5b11..03323d7f58dc 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -1591,7 +1591,7 @@ attribute-sets: name: nf-call-iptables type: u8 - - name: nf-call-ip6-tables + name: nf-call-ip6tables type: u8 - name: nf-call-arptables @@ -2083,7 +2083,7 @@ attribute-sets: name: id type: u16 - - name: flag + name: flags type: binary struct: ifla-vlan-flags - @@ -2171,7 +2171,7 @@ attribute-sets: type: binary struct: ifla-cacheinfo - - name: icmp6-stats + name: icmp6stats type: binary struct: ifla-icmp6-stats - diff --git a/Documentation/netlink/specs/rt_neigh.yaml b/Documentation/netlink/specs/rt_neigh.yaml index e670b6dc07be..a1e137a16abd 100644 --- a/Documentation/netlink/specs/rt_neigh.yaml +++ b/Documentation/netlink/specs/rt_neigh.yaml @@ -189,7 +189,7 @@ attribute-sets: type: binary display-hint: ipv4 - - name: lladr + name: lladdr type: binary display-hint: mac - -- 2.51.0 From beb3c5ad8829b52057f48a776a9d9558b98c157f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Apr 2025 14:18:50 -0700 Subject: [PATCH 05/16] netlink: specs: rt-link: adjust mctp attribute naming MCTP attribute naming is inconsistent. In C we have: IFLA_MCTP_NET, IFLA_MCTP_PHYS_BINDING, ^^^^ but in YAML: - mctp-net - phys-binding ^ no "mctp" It's unclear whether the "mctp" part of the name is supposed to be a prefix or part of attribute name. Make it a prefix, seems cleaner, even tho technically phys-binding was added later. Fixes: b2f63d904e72 ("doc/netlink: Add spec for rt link messages") Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250414211851.602096-8-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_link.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/netlink/specs/rt_link.yaml b/Documentation/netlink/specs/rt_link.yaml index 03323d7f58dc..6b9d5ee87d93 100644 --- a/Documentation/netlink/specs/rt_link.yaml +++ b/Documentation/netlink/specs/rt_link.yaml @@ -2185,9 +2185,10 @@ attribute-sets: type: u32 - name: mctp-attrs + name-prefix: ifla-mctp- attributes: - - name: mctp-net + name: net type: u32 - name: phys-binding -- 2.51.0 From e31f86ee4b9ccb844baf2131da8e5d4d6f23aa1d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 14 Apr 2025 14:18:51 -0700 Subject: [PATCH 06/16] netlink: specs: rt-neigh: prefix struct nfmsg members with ndm Attach ndm- to all members of struct nfmsg. We could possibly use name-prefix just for C, but I don't think we have any precedent for using name-prefix on structs, and other rtnetlink sub-specs give full names for fixed header struct members. Fixes: bc515ed06652 ("netlink: specs: Add a spec for neighbor tables in rtnetlink") Reviewed-by: Donald Hunter Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250414211851.602096-9-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt_neigh.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/netlink/specs/rt_neigh.yaml b/Documentation/netlink/specs/rt_neigh.yaml index a1e137a16abd..a843caa72259 100644 --- a/Documentation/netlink/specs/rt_neigh.yaml +++ b/Documentation/netlink/specs/rt_neigh.yaml @@ -13,25 +13,25 @@ definitions: type: struct members: - - name: family + name: ndm-family type: u8 - - name: pad + name: ndm-pad type: pad len: 3 - - name: ifindex + name: ndm-ifindex type: s32 - - name: state + name: ndm-state type: u16 enum: nud-state - - name: flags + name: ndm-flags type: u8 enum: ntf-flags - - name: type + name: ndm-type type: u8 enum: rtm-type - -- 2.51.0 From 36355ddfe8955f226a88a543ed354b9f6b84cd70 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 14 Apr 2025 22:04:34 +0200 Subject: [PATCH 07/16] net: b53: enable BPDU reception for management port For STP to work, receiving BPDUs is essential, but the appropriate bit was never set. Without GC_RX_BPDU_EN, the switch chip will filter all BPDUs, even if an appropriate PVID VLAN was setup. Fixes: ff39c2d68679 ("net: dsa: b53: Add bridge support") Signed-off-by: Jonas Gorski Link: https://patch.msgid.link/20250414200434.194422-1-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 61d164ffb3ae..e5ba71897906 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -737,6 +737,15 @@ static void b53_enable_mib(struct b53_device *dev) b53_write8(dev, B53_MGMT_PAGE, B53_GLOBAL_CONFIG, gc); } +static void b53_enable_stp(struct b53_device *dev) +{ + u8 gc; + + b53_read8(dev, B53_MGMT_PAGE, B53_GLOBAL_CONFIG, &gc); + gc |= GC_RX_BPDU_EN; + b53_write8(dev, B53_MGMT_PAGE, B53_GLOBAL_CONFIG, gc); +} + static u16 b53_default_pvid(struct b53_device *dev) { if (is5325(dev) || is5365(dev)) @@ -876,6 +885,7 @@ static int b53_switch_reset(struct b53_device *dev) } b53_enable_mib(dev); + b53_enable_stp(dev); return b53_flush_arl(dev, FAST_AGE_STATIC); } -- 2.51.0 From eb25de13bd9cf025413a04f25e715d0e99847e30 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Mon, 14 Apr 2025 22:00:20 +0200 Subject: [PATCH 08/16] net: bridge: switchdev: do not notify new brentries as changed When adding a bridge vlan that is pvid or untagged after the vlan has already been added to any other switchdev backed port, the vlan change will be propagated as changed, since the flags change. This causes the vlan to not be added to the hardware for DSA switches, since the DSA handler ignores any vlans for the CPU or DSA ports that are changed. E.g. the following order of operations would work: $ ip link add swbridge type bridge vlan_filtering 1 vlan_default_pvid 0 $ ip link set lan1 master swbridge $ bridge vlan add dev swbridge vid 1 pvid untagged self $ bridge vlan add dev lan1 vid 1 pvid untagged but this order would break: $ ip link add swbridge type bridge vlan_filtering 1 vlan_default_pvid 0 $ ip link set lan1 master swbridge $ bridge vlan add dev lan1 vid 1 pvid untagged $ bridge vlan add dev swbridge vid 1 pvid untagged self Additionally, the vlan on the bridge itself would become undeletable: $ bridge vlan port vlan-id lan1 1 PVID Egress Untagged swbridge 1 PVID Egress Untagged $ bridge vlan del dev swbridge vid 1 self $ bridge vlan port vlan-id lan1 1 PVID Egress Untagged swbridge 1 Egress Untagged since the vlan was never added to DSA's vlan list, so deleting it will cause an error, causing the bridge code to not remove it. Fix this by checking if flags changed only for vlans that are already brentry and pass changed as false for those that become brentries, as these are a new vlan (member) from the switchdev point of view. Since *changed is set to true for becomes_brentry = true regardless of would_change's value, this will not change any rtnetlink notification delivery, just the value passed on to switchdev in vlan->changed. Fixes: 8d23a54f5bee ("net: bridge: switchdev: differentiate new VLANs from changed ones") Reviewed-by: Vladimir Oltean Signed-off-by: Jonas Gorski Reviewed-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250414200020.192715-1-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski --- net/bridge/br_vlan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index d9a69ec9affe..939a3aa78d5c 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -715,8 +715,8 @@ static int br_vlan_add_existing(struct net_bridge *br, u16 flags, bool *changed, struct netlink_ext_ack *extack) { - bool would_change = __vlan_flags_would_change(vlan, flags); bool becomes_brentry = false; + bool would_change = false; int err; if (!br_vlan_is_brentry(vlan)) { @@ -725,6 +725,8 @@ static int br_vlan_add_existing(struct net_bridge *br, return -EINVAL; becomes_brentry = true; + } else { + would_change = __vlan_flags_would_change(vlan, flags); } /* Master VLANs that aren't brentries weren't notified before, -- 2.51.0 From b2727326d0a53709380aa147018085d71a6d4843 Mon Sep 17 00:00:00 2001 From: Abdun Nihaal Date: Tue, 15 Apr 2025 08:59:09 +0530 Subject: [PATCH 09/16] net: txgbe: fix memory leak in txgbe_probe() error path When txgbe_sw_init() is called, memory is allocated for wx->rss_key in wx_init_rss_key(). However, in txgbe_probe() function, the subsequent error paths after txgbe_sw_init() don't free the rss_key. Fix that by freeing it in error path along with wx->mac_table. Also change the label to which execution jumps when txgbe_sw_init() fails, because otherwise, it could lead to a double free for rss_key, when the mac_table allocation fails in wx_sw_init(). Fixes: 937d46ecc5f9 ("net: wangxun: add ethtool_ops for channel number") Reported-by: Jiawen Wu Signed-off-by: Abdun Nihaal Reviewed-by: Jiawen Wu Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250415032910.13139-1-abdun.nihaal@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/wangxun/txgbe/txgbe_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c index a2e245e3b016..38206a46693b 100644 --- a/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c +++ b/drivers/net/ethernet/wangxun/txgbe/txgbe_main.c @@ -611,7 +611,7 @@ static int txgbe_probe(struct pci_dev *pdev, /* setup the private structure */ err = txgbe_sw_init(wx); if (err) - goto err_free_mac_table; + goto err_pci_release_regions; /* check if flash load is done after hw power up */ err = wx_check_flash_load(wx, TXGBE_SPI_ILDR_STATUS_PERST); @@ -769,6 +769,7 @@ err_release_hw: wx_clear_interrupt_scheme(wx); wx_control_hw(wx, false); err_free_mac_table: + kfree(wx->rss_key); kfree(wx->mac_table); err_pci_release_regions: pci_release_selected_regions(pdev, -- 2.51.0 From c84f6ce918a9e6f4996597cbc62536bbf2247c96 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Apr 2025 00:28:50 +0300 Subject: [PATCH 10/16] net: dsa: mv88e6xxx: avoid unregistering devlink regions which were never registered Russell King reports that a system with mv88e6xxx dereferences a NULL pointer when unbinding this driver: https://lore.kernel.org/netdev/Z_lRkMlTJ1KQ0kVX@shell.armlinux.org.uk/ The crash seems to be in devlink_region_destroy(), which is not NULL tolerant but is given a NULL devlink global region pointer. At least on some chips, some devlink regions are conditionally registered since the blamed commit, see mv88e6xxx_setup_devlink_regions_global(): if (cond && !cond(chip)) continue; These are MV88E6XXX_REGION_STU and MV88E6XXX_REGION_PVT. If the chip does not have an STU or PVT, it should crash like this. To fix the issue, avoid unregistering those regions which are NULL, i.e. were skipped at mv88e6xxx_setup_devlink_regions_global() time. Fixes: 836021a2d0e0 ("net: dsa: mv88e6xxx: Export cross-chip PVT as devlink region") Tested-by: Russell King (Oracle) Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250414212850.2953957-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/devlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/devlink.c b/drivers/net/dsa/mv88e6xxx/devlink.c index 795c8df7b6a7..195460a0a0d4 100644 --- a/drivers/net/dsa/mv88e6xxx/devlink.c +++ b/drivers/net/dsa/mv88e6xxx/devlink.c @@ -736,7 +736,8 @@ void mv88e6xxx_teardown_devlink_regions_global(struct dsa_switch *ds) int i; for (i = 0; i < ARRAY_SIZE(mv88e6xxx_regions); i++) - dsa_devlink_region_destroy(chip->regions[i]); + if (chip->regions[i]) + dsa_devlink_region_destroy(chip->regions[i]); } void mv88e6xxx_teardown_devlink_regions_port(struct dsa_switch *ds, int port) -- 2.51.0 From ea08dfc35f83cfc73493c52f63ae4f2e29edfe8d Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Apr 2025 00:29:13 +0300 Subject: [PATCH 11/16] net: dsa: mv88e6xxx: fix -ENOENT when deleting VLANs and MST is unsupported Russell King reports that on the ZII dev rev B, deleting a bridge VLAN from a user port fails with -ENOENT: https://lore.kernel.org/netdev/Z_lQXNP0s5-IiJzd@shell.armlinux.org.uk/ This comes from mv88e6xxx_port_vlan_leave() -> mv88e6xxx_mst_put(), which tries to find an MST entry in &chip->msts associated with the SID, but fails and returns -ENOENT as such. But we know that this chip does not support MST at all, so that is not surprising. The question is why does the guard in mv88e6xxx_mst_put() not exit early: if (!sid) return 0; And the answer seems to be simple: the sid comes from vlan.sid which supposedly was previously populated by mv88e6xxx_vtu_get(). But some chip->info->ops->vtu_getnext() implementations do not populate vlan.sid, for example see mv88e6185_g1_vtu_getnext(). In that case, later in mv88e6xxx_port_vlan_leave() we are using a garbage sid which is just residual stack memory. Testing for sid == 0 covers all cases of a non-bridge VLAN or a bridge VLAN mapped to the default MSTI. For some chips, SID 0 is valid and installed by mv88e6xxx_stu_setup(). A chip which does not support the STU would implicitly only support mapping all VLANs to the default MSTI, so although SID 0 is not valid, it would be sufficient, if we were to zero-initialize the vlan structure, to fix the bug, due to the coincidence that a test for vlan.sid == 0 already exists and leads to the same (correct) behavior. Another option which would be sufficient would be to add a test for mv88e6xxx_has_stu() inside mv88e6xxx_mst_put(), symmetric to the one which already exists in mv88e6xxx_mst_get(). But that placement means the caller will have to dereference vlan.sid, which means it will access uninitialized memory, which is not nice even if it ignores it later. So we end up making both modifications, in order to not rely just on the sid == 0 coincidence, but also to avoid having uninitialized structure fields which might get temporarily accessed. Fixes: acaf4d2e36b3 ("net: dsa: mv88e6xxx: MST Offloading") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250414212913.2955253-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 29a89ab4b789..08db846cda8d 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -1852,6 +1852,8 @@ static int mv88e6xxx_vtu_get(struct mv88e6xxx_chip *chip, u16 vid, if (!chip->info->ops->vtu_getnext) return -EOPNOTSUPP; + memset(entry, 0, sizeof(*entry)); + entry->vid = vid ? vid - 1 : mv88e6xxx_max_vid(chip); entry->valid = false; @@ -1960,7 +1962,16 @@ static int mv88e6xxx_mst_put(struct mv88e6xxx_chip *chip, u8 sid) struct mv88e6xxx_mst *mst, *tmp; int err; - if (!sid) + /* If the SID is zero, it is for a VLAN mapped to the default MSTI, + * and mv88e6xxx_stu_setup() made sure it is always present, and thus, + * should not be removed here. + * + * If the chip lacks STU support, numerically the "sid" variable will + * happen to also be zero, but we don't want to rely on that fact, so + * we explicitly test that first. In that case, there is also nothing + * to do here. + */ + if (!mv88e6xxx_has_stu(chip) || !sid) return 0; list_for_each_entry_safe(mst, tmp, &chip->msts, node) { -- 2.51.0 From 7afb5fb42d4950f33af2732b8147c552659f79b7 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Apr 2025 00:29:30 +0300 Subject: [PATCH 12/16] net: dsa: clean up FDB, MDB, VLAN entries on unbind As explained in many places such as commit b117e1e8a86d ("net: dsa: delete dsa_legacy_fdb_add and dsa_legacy_fdb_del"), DSA is written given the assumption that higher layers have balanced additions/deletions. As such, it only makes sense to be extremely vocal when those assumptions are violated and the driver unbinds with entries still present. But Ido Schimmel points out a very simple situation where that is wrong: https://lore.kernel.org/netdev/ZDazSM5UsPPjQuKr@shredder/ (also briefly discussed by me in the aforementioned commit). Basically, while the bridge bypass operations are not something that DSA explicitly documents, and for the majority of DSA drivers this API simply causes them to go to promiscuous mode, that isn't the case for all drivers. Some have the necessary requirements for bridge bypass operations to do something useful - see dsa_switch_supports_uc_filtering(). Although in tools/testing/selftests/net/forwarding/local_termination.sh, we made an effort to popularize better mechanisms to manage address filters on DSA interfaces from user space - namely macvlan for unicast, and setsockopt(IP_ADD_MEMBERSHIP) - through mtools - for multicast, the fact is that 'bridge fdb add ... self static local' also exists as kernel UAPI, and might be useful to someone, even if only for a quick hack. It seems counter-productive to block that path by implementing shim .ndo_fdb_add and .ndo_fdb_del operations which just return -EOPNOTSUPP in order to prevent the ndo_dflt_fdb_add() and ndo_dflt_fdb_del() from running, although we could do that. Accepting that cleanup is necessary seems to be the only option. Especially since we appear to be coming back at this from a different angle as well. Russell King is noticing that the WARN_ON() triggers even for VLANs: https://lore.kernel.org/netdev/Z_li8Bj8bD4-BYKQ@shell.armlinux.org.uk/ What happens in the bug report above is that dsa_port_do_vlan_del() fails, then the VLAN entry lingers on, and then we warn on unbind and leak it. This is not a straight revert of the blamed commit, but we now add an informational print to the kernel log (to still have a way to see that bugs exist), and some extra comments gathered from past years' experience, to justify the logic. Fixes: 0832cd9f1f02 ("net: dsa: warn if port lists aren't empty in dsa_port_teardown") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250414212930.2956310-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/dsa.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index e827775baf2e..e7e32956070a 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -1478,12 +1478,44 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd) static void dsa_switch_release_ports(struct dsa_switch *ds) { + struct dsa_mac_addr *a, *tmp; struct dsa_port *dp, *next; + struct dsa_vlan *v, *n; dsa_switch_for_each_port_safe(dp, next, ds) { - WARN_ON(!list_empty(&dp->fdbs)); - WARN_ON(!list_empty(&dp->mdbs)); - WARN_ON(!list_empty(&dp->vlans)); + /* These are either entries that upper layers lost track of + * (probably due to bugs), or installed through interfaces + * where one does not necessarily have to remove them, like + * ndo_dflt_fdb_add(). + */ + list_for_each_entry_safe(a, tmp, &dp->fdbs, list) { + dev_info(ds->dev, + "Cleaning up unicast address %pM vid %u from port %d\n", + a->addr, a->vid, dp->index); + list_del(&a->list); + kfree(a); + } + + list_for_each_entry_safe(a, tmp, &dp->mdbs, list) { + dev_info(ds->dev, + "Cleaning up multicast address %pM vid %u from port %d\n", + a->addr, a->vid, dp->index); + list_del(&a->list); + kfree(a); + } + + /* These are entries that upper layers have lost track of, + * probably due to bugs, but also due to dsa_port_do_vlan_del() + * having failed and the VLAN entry still lingering on. + */ + list_for_each_entry_safe(v, n, &dp->vlans, list) { + dev_info(ds->dev, + "Cleaning up vid %u from port %d\n", + v->vid, dp->index); + list_del(&v->list); + kfree(v); + } + list_del(&dp->list); kfree(dp); } -- 2.51.0 From 8bf108d7161ffc6880ad13a0cc109de3cf631727 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Apr 2025 00:30:01 +0300 Subject: [PATCH 13/16] net: dsa: free routing table on probe failure If complete = true in dsa_tree_setup(), it means that we are the last switch of the tree which is successfully probing, and we should be setting up all switches from our probe path. After "complete" becomes true, dsa_tree_setup_cpu_ports() or any subsequent function may fail. If that happens, the entire tree setup is in limbo: the first N-1 switches have successfully finished probing (doing nothing but having allocated persistent memory in the tree's dst->ports, and maybe dst->rtable), and switch N failed to probe, ending the tree setup process before anything is tangible from the user's PoV. If switch N fails to probe, its memory (ports) will be freed and removed from dst->ports. However, the dst->rtable elements pointing to its ports, as created by dsa_link_touch(), will remain there, and will lead to use-after-free if dereferenced. If dsa_tree_setup_switches() returns -EPROBE_DEFER, which is entirely possible because that is where ds->ops->setup() is, we get a kasan report like this: ================================================================== BUG: KASAN: slab-use-after-free in mv88e6xxx_setup_upstream_port+0x240/0x568 Read of size 8 at addr ffff000004f56020 by task kworker/u8:3/42 Call trace: __asan_report_load8_noabort+0x20/0x30 mv88e6xxx_setup_upstream_port+0x240/0x568 mv88e6xxx_setup+0xebc/0x1eb0 dsa_register_switch+0x1af4/0x2ae0 mv88e6xxx_register_switch+0x1b8/0x2a8 mv88e6xxx_probe+0xc4c/0xf60 mdio_probe+0x78/0xb8 really_probe+0x2b8/0x5a8 __driver_probe_device+0x164/0x298 driver_probe_device+0x78/0x258 __device_attach_driver+0x274/0x350 Allocated by task 42: __kasan_kmalloc+0x84/0xa0 __kmalloc_cache_noprof+0x298/0x490 dsa_switch_touch_ports+0x174/0x3d8 dsa_register_switch+0x800/0x2ae0 mv88e6xxx_register_switch+0x1b8/0x2a8 mv88e6xxx_probe+0xc4c/0xf60 mdio_probe+0x78/0xb8 really_probe+0x2b8/0x5a8 __driver_probe_device+0x164/0x298 driver_probe_device+0x78/0x258 __device_attach_driver+0x274/0x350 Freed by task 42: __kasan_slab_free+0x48/0x68 kfree+0x138/0x418 dsa_register_switch+0x2694/0x2ae0 mv88e6xxx_register_switch+0x1b8/0x2a8 mv88e6xxx_probe+0xc4c/0xf60 mdio_probe+0x78/0xb8 really_probe+0x2b8/0x5a8 __driver_probe_device+0x164/0x298 driver_probe_device+0x78/0x258 __device_attach_driver+0x274/0x350 The simplest way to fix the bug is to delete the routing table in its entirety. dsa_tree_setup_routing_table() has no problem in regenerating it even if we deleted links between ports other than those of switch N, because dsa_link_touch() first checks whether the port pair already exists in dst->rtable, allocating if not. The deletion of the routing table in its entirety already exists in dsa_tree_teardown(), so refactor that into a function that can also be called from the tree setup error path. In my analysis of the commit to blame, it is the one which added dsa_link elements to dst->rtable. Prior to that, each switch had its own ds->rtable which is freed when the switch fails to probe. But the tree is potentially persistent memory. Fixes: c5f51765a1f6 ("net: dsa: list DSA links in the fabric") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250414213001.2957964-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/dsa.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index e7e32956070a..436a7e1b412a 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -862,6 +862,16 @@ static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst) kfree(dst->lags); } +static void dsa_tree_teardown_routing_table(struct dsa_switch_tree *dst) +{ + struct dsa_link *dl, *next; + + list_for_each_entry_safe(dl, next, &dst->rtable, list) { + list_del(&dl->list); + kfree(dl); + } +} + static int dsa_tree_setup(struct dsa_switch_tree *dst) { bool complete; @@ -879,7 +889,7 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst) err = dsa_tree_setup_cpu_ports(dst); if (err) - return err; + goto teardown_rtable; err = dsa_tree_setup_switches(dst); if (err) @@ -911,14 +921,14 @@ teardown_switches: dsa_tree_teardown_switches(dst); teardown_cpu_ports: dsa_tree_teardown_cpu_ports(dst); +teardown_rtable: + dsa_tree_teardown_routing_table(dst); return err; } static void dsa_tree_teardown(struct dsa_switch_tree *dst) { - struct dsa_link *dl, *next; - if (!dst->setup) return; @@ -932,10 +942,7 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst) dsa_tree_teardown_cpu_ports(dst); - list_for_each_entry_safe(dl, next, &dst->rtable, list) { - list_del(&dl->list); - kfree(dl); - } + dsa_tree_teardown_routing_table(dst); pr_info("DSA: tree %d torn down\n", dst->index); -- 2.51.0 From 514eff7b0aa1c5eb645ddbb8676ef3e2d88a8b99 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Apr 2025 00:30:20 +0300 Subject: [PATCH 14/16] net: dsa: avoid refcount warnings when ds->ops->tag_8021q_vlan_del() fails This is very similar to the problem and solution from commit 232deb3f9567 ("net: dsa: avoid refcount warnings when ->port_{fdb,mdb}_del returns error"), except for the dsa_port_do_tag_8021q_vlan_del() operation. Fixes: c64b9c05045a ("net: dsa: tag_8021q: add proper cross-chip notifier support") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250414213020.2959021-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/tag_8021q.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index 3ee53e28ec2e..53e03fd8071b 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -197,7 +197,7 @@ static int dsa_port_do_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid) err = ds->ops->tag_8021q_vlan_del(ds, port, vid); if (err) { - refcount_inc(&v->refcount); + refcount_set(&v->refcount, 1); return err; } -- 2.51.0 From 2a5970d5aaff8f3e33ce3bfaa403ae88c40de40d Mon Sep 17 00:00:00 2001 From: Sagi Maimon Date: Tue, 15 Apr 2025 08:31:31 +0300 Subject: [PATCH 15/16] ptp: ocp: fix start time alignment in ptp_ocp_signal_set In ptp_ocp_signal_set, the start time for periodic signals is not aligned to the next period boundary. The current code rounds up the start time and divides by the period but fails to multiply back by the period, causing misaligned signal starts. Fix this by multiplying the rounded-up value by the period to ensure the start time is the closest next period. Fixes: 4bd46bb037f8e ("ptp: ocp: Use DIV64_U64_ROUND_UP for rounding.") Signed-off-by: Sagi Maimon Reviewed-by: Vadim Fedorenko Link: https://patch.msgid.link/20250415053131.129413-1-maimon.sagi@gmail.com Signed-off-by: Jakub Kicinski --- drivers/ptp/ptp_ocp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index 7945c6be1f7c..faf6e027f89a 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -2067,6 +2067,7 @@ ptp_ocp_signal_set(struct ptp_ocp *bp, int gen, struct ptp_ocp_signal *s) if (!s->start) { /* roundup() does not work on 32-bit systems */ s->start = DIV64_U64_ROUND_UP(start_ns, s->period); + s->start *= s->period; s->start = ktime_add(s->start, s->phase); } -- 2.51.0 From 4798cfa2097f0833d54d8f5ce20ef14631917839 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 15 Apr 2025 08:15:52 -0700 Subject: [PATCH 16/16] net: don't try to ops lock uninitialized devs We need to be careful when operating on dev while in rtnl_create_link(). Some devices (vxlan) initialize netdev_ops in ->newlink, so later on. Avoid using netdev_lock_ops(), the device isn't registered so we cannot legally call its ops or generate any notifications for it. netdev_ops_assert_locked_or_invisible() is safe to use, it checks registration status first. Reported-by: syzbot+de1c7d68a10e3f123bdd@syzkaller.appspotmail.com Fixes: 04efcee6ef8d ("net: hold instance lock during NETDEV_CHANGE") Acked-by: Stanislav Fomichev Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250415151552.768373-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 2 ++ net/core/rtnetlink.c | 5 +---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 5fcbc66d865e..1be7cb73a602 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1520,6 +1520,8 @@ EXPORT_SYMBOL(netdev_features_change); void netif_state_change(struct net_device *dev) { + netdev_ops_assert_locked_or_invisible(dev); + if (dev->flags & IFF_UP) { struct netdev_notifier_change_info change_info = { .info.dev = dev, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 39a5b72e861f..c5a7f41982a5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3676,11 +3676,8 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, nla_len(tb[IFLA_BROADCAST])); if (tb[IFLA_TXQLEN]) dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); - if (tb[IFLA_OPERSTATE]) { - netdev_lock_ops(dev); + if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); - netdev_unlock_ops(dev); - } if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) -- 2.51.0