From 0d3008d1a9aefb89e09e8dd39134512d678e3461 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:49 -0700 Subject: [PATCH 01/16] rtnetlink: Move ops->validate to rtnl_newlink(). ops->validate() does not require RTNL. Let's move it to rtnl_newlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 49 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index e708f0852602..9c9290a6c271 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3692,16 +3692,14 @@ out_unregister: static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct rtnl_newlink_tbs *tbs, + struct nlattr **data, struct netlink_ext_ack *extack) { - struct nlattr ** const linkinfo = tbs->linkinfo; struct nlattr ** const tb = tbs->tb; struct net *net = sock_net(skb->sk); struct net_device *dev; struct ifinfomsg *ifm; - struct nlattr **data; bool link_specified; - int err; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { @@ -3718,26 +3716,6 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, dev = NULL; } - data = NULL; - if (ops) { - if (ops->maxtype > RTNL_MAX_TYPE) - return -EINVAL; - - if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { - err = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, - linkinfo[IFLA_INFO_DATA], - ops->policy, extack); - if (err < 0) - return err; - data = tbs->attr; - } - if (ops->validate) { - err = ops->validate(tb, data, extack); - if (err < 0) - return err; - } - } - if (dev) return rtnl_changelink(skb, nlh, ops, dev, tbs, data, extack); @@ -3768,8 +3746,8 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct nlattr **tb, **linkinfo, **data = NULL; const struct rtnl_link_ops *ops = NULL; - struct nlattr **tb, **linkinfo; struct rtnl_newlink_tbs *tbs; int ret; @@ -3813,7 +3791,28 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, #endif } - ret = __rtnl_newlink(skb, nlh, ops, tbs, extack); + if (ops) { + if (ops->maxtype > RTNL_MAX_TYPE) + return -EINVAL; + + if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { + ret = nla_parse_nested_deprecated(tbs->attr, ops->maxtype, + linkinfo[IFLA_INFO_DATA], + ops->policy, extack); + if (ret < 0) + goto free; + + data = tbs->attr; + } + + if (ops->validate) { + ret = ops->validate(tb, data, extack); + if (ret < 0) + goto free; + } + } + + ret = __rtnl_newlink(skb, nlh, ops, tbs, data, extack); free: kfree(tbs); -- 2.51.0 From 43c7ce69d28e185f62fe2b8be2c681c5cac0bc6b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:50 -0700 Subject: [PATCH 02/16] rtnetlink: Protect struct rtnl_link_ops with SRCU. Once RTNL is replaced with rtnl_net_lock(), we need a mechanism to guarantee that rtnl_link_ops is alive during inflight RTM_NEWLINK even when its module is being unloaded. Let's use SRCU to protect ops. rtnl_link_ops_get() now iterates link_ops under RCU and returns SRCU-protected ops pointer. The caller must call rtnl_link_ops_put() to release the pointer after the use. Also, __rtnl_link_unregister() unlinks the ops first and calls synchronize_srcu() to wait for inflight RTM_NEWLINK requests to complete. Note that link_ops needs to be protected by its dedicated lock when RTNL is removed. Suggested-by: Eric Dumazet Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 5 ++- net/core/rtnetlink.c | 83 ++++++++++++++++++++++++++++++----------- 2 files changed, 65 insertions(+), 23 deletions(-) diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bb49c5708ce7..1a6aa5ca74f3 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -3,6 +3,7 @@ #define __NET_RTNETLINK_H #include +#include #include typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, @@ -69,7 +70,8 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) /** * struct rtnl_link_ops - rtnetlink link operations * - * @list: Used internally + * @list: Used internally, protected by RTNL and SRCU + * @srcu: Used internally * @kind: Identifier * @netns_refund: Physical device, move to init_net on netns exit * @maxtype: Highest device specific netlink attribute number @@ -100,6 +102,7 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh) */ struct rtnl_link_ops { struct list_head list; + struct srcu_struct srcu; const char *kind; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9c9290a6c271..31b105b3a834 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -457,15 +457,29 @@ EXPORT_SYMBOL_GPL(__rtnl_unregister_many); static LIST_HEAD(link_ops); -static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) +static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index) { - const struct rtnl_link_ops *ops; + struct rtnl_link_ops *ops; - list_for_each_entry(ops, &link_ops, list) { - if (!strcmp(ops->kind, kind)) - return ops; + rcu_read_lock(); + + list_for_each_entry_rcu(ops, &link_ops, list) { + if (!strcmp(ops->kind, kind)) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } } - return NULL; + + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; +} + +static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } /** @@ -480,8 +494,16 @@ static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) */ int __rtnl_link_register(struct rtnl_link_ops *ops) { - if (rtnl_link_ops_get(ops->kind)) - return -EEXIST; + struct rtnl_link_ops *tmp; + int err; + + /* When RTNL is removed, add lock for link_ops. */ + ASSERT_RTNL(); + + list_for_each_entry(tmp, &link_ops, list) { + if (!strcmp(ops->kind, tmp->kind)) + return -EEXIST; + } /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible @@ -491,7 +513,12 @@ int __rtnl_link_register(struct rtnl_link_ops *ops) if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; - list_add_tail(&ops->list, &link_ops); + err = init_srcu_struct(&ops->srcu); + if (err) + return err; + + list_add_tail_rcu(&ops->list, &link_ops); + return 0; } EXPORT_SYMBOL_GPL(__rtnl_link_register); @@ -542,10 +569,12 @@ void __rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; - for_each_net(net) { + list_del_rcu(&ops->list); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); + + for_each_net(net) __rtnl_kill_links(net, ops); - } - list_del(&ops->list); } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); @@ -2158,10 +2187,11 @@ static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; -static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) +static struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla, + int *ops_srcu_index) { - const struct rtnl_link_ops *ops = NULL; struct nlattr *linfo[IFLA_INFO_MAX + 1]; + struct rtnl_link_ops *ops = NULL; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; @@ -2170,7 +2200,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); + ops = rtnl_link_ops_get(kind, ops_srcu_index); } return ops; @@ -2290,8 +2320,8 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { - const struct rtnl_link_ops *kind_ops = NULL; struct netlink_ext_ack *extack = cb->extack; + struct rtnl_link_ops *kind_ops = NULL; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; @@ -2302,6 +2332,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net *tgt_net = net; u32 ext_filter_mask = 0; struct net_device *dev; + int ops_srcu_index; int master_idx = 0; int netnsid = -1; int err, i; @@ -2335,7 +2366,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: - kind_ops = linkinfo_to_kind_ops(tb[i]); + kind_ops = linkinfo_to_kind_ops(tb[i], &ops_srcu_index); break; default: if (cb->strict_check) { @@ -2361,6 +2392,10 @@ walk_entries: if (err < 0) break; } + + if (kind_ops) + rtnl_link_ops_put(kind_ops, ops_srcu_index); + cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); if (netnsid >= 0) @@ -3747,8 +3782,9 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr **tb, **linkinfo, **data = NULL; - const struct rtnl_link_ops *ops = NULL; + struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; + int ops_srcu_index; int ret; tbs = kmalloc(sizeof(*tbs), GFP_KERNEL); @@ -3780,13 +3816,13 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); - ops = rtnl_link_ops_get(kind); + ops = rtnl_link_ops_get(kind, &ops_srcu_index); #ifdef CONFIG_MODULES if (!ops) { __rtnl_unlock(); request_module("rtnl-link-%s", kind); rtnl_lock(); - ops = rtnl_link_ops_get(kind); + ops = rtnl_link_ops_get(kind, &ops_srcu_index); } #endif } @@ -3800,7 +3836,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, linkinfo[IFLA_INFO_DATA], ops->policy, extack); if (ret < 0) - goto free; + goto put_ops; data = tbs->attr; } @@ -3808,12 +3844,15 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (ops->validate) { ret = ops->validate(tb, data, extack); if (ret < 0) - goto free; + goto put_ops; } } ret = __rtnl_newlink(skb, nlh, ops, tbs, data, extack); +put_ops: + if (ops) + rtnl_link_ops_put(ops, ops_srcu_index); free: kfree(tbs); return ret; -- 2.51.0 From 0fef2a1212f1ff68fc3834abd41928b4353f8af6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:51 -0700 Subject: [PATCH 03/16] rtnetlink: Call rtnl_link_get_net_capable() in rtnl_newlink(). As a prerequisite of per-netns RTNL, we must fetch netns before looking up dev or moving it to another netns. rtnl_link_get_net_capable() is called in rtnl_newlink_create() and do_setlink(), but both of them need to be moved to the RTNL-independent region, which will be rtnl_newlink(). Let's call rtnl_link_get_net_capable() in rtnl_newlink() and pass the netns down to where needed. Note that the latter two have not passed the nets to do_setlink() yet but will do so after the remaining rtnl_link_get_net_capable() is moved to rtnl_setlink() later. While at it, dest_net is renamed to tgt_net in rtnl_newlink_create() to align with rtnl_{del,set}link(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 51 ++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 31b105b3a834..f6823c8d21ad 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3549,7 +3549,7 @@ struct rtnl_newlink_tbs { static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, - struct net_device *dev, + struct net_device *dev, struct net *tgt_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3613,10 +3613,10 @@ static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, } static int rtnl_group_changelink(const struct sk_buff *skb, - struct net *net, int group, - struct ifinfomsg *ifm, - struct netlink_ext_ack *extack, - struct nlattr **tb) + struct net *net, struct net *tgt_net, + int group, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, + struct nlattr **tb) { struct net_device *dev, *aux; int err; @@ -3634,6 +3634,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, + struct net *tgt_net, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3641,9 +3642,9 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, unsigned char name_assign_type = NET_NAME_USER; struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; - struct net *dest_net, *link_net; struct net_device *dev; char ifname[IFNAMSIZ]; + struct net *link_net; int err; if (!ops->alloc && !ops->setup) @@ -3656,14 +3657,10 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, name_assign_type = NET_NAME_ENUM; } - dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); - if (IS_ERR(dest_net)) - return PTR_ERR(dest_net); - if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - link_net = get_net_ns_by_id(dest_net, id); + link_net = get_net_ns_by_id(tgt_net, id); if (!link_net) { NL_SET_ERR_MSG(extack, "Unknown network namespace id"); err = -EINVAL; @@ -3676,7 +3673,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, link_net = NULL; } - dev = rtnl_create_link(link_net ? : dest_net, ifname, + dev = rtnl_create_link(link_net ? : tgt_net, ifname, name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); @@ -3698,7 +3695,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, if (err < 0) goto out_unregister; if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); + err = dev_change_net_namespace(dev, tgt_net, ifname); if (err < 0) goto out_unregister; } @@ -3710,7 +3707,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, out: if (link_net) put_net(link_net); - put_net(dest_net); + return err; out_unregister: if (ops->newlink) { @@ -3726,6 +3723,7 @@ out_unregister: static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, + struct net *tgt_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3752,19 +3750,18 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } if (dev) - return rtnl_changelink(skb, nlh, ops, dev, tbs, data, extack); + return rtnl_changelink(skb, nlh, ops, dev, tgt_net, tbs, data, extack); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { /* No dev found and NLM_F_CREATE not set. Requested dev does not exist, * or it's for a group */ - if (link_specified) + if (link_specified || !tb[IFLA_GROUP]) return -ENODEV; - if (tb[IFLA_GROUP]) - return rtnl_group_changelink(skb, net, - nla_get_u32(tb[IFLA_GROUP]), - ifm, extack, tb); - return -ENODEV; + + return rtnl_group_changelink(skb, net, tgt_net, + nla_get_u32(tb[IFLA_GROUP]), + ifm, extack, tb); } if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) @@ -3775,7 +3772,7 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -EOPNOTSUPP; } - return rtnl_newlink_create(skb, ifm, ops, nlh, tb, data, extack); + return rtnl_newlink_create(skb, ifm, ops, tgt_net, nlh, tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -3784,6 +3781,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **tb, **linkinfo, **data = NULL; struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; + struct net *tgt_net; int ops_srcu_index; int ret; @@ -3848,8 +3846,15 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } } - ret = __rtnl_newlink(skb, nlh, ops, tbs, data, extack); + tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN); + if (IS_ERR(tgt_net)) { + ret = PTR_ERR(tgt_net); + goto put_ops; + } + + ret = __rtnl_newlink(skb, nlh, ops, tgt_net, tbs, data, extack); + put_net(tgt_net); put_ops: if (ops) rtnl_link_ops_put(ops, ops_srcu_index); -- 2.51.0 From f7774eec20b41fae36a58e8ab04ff4dd48bb1845 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:52 -0700 Subject: [PATCH 04/16] rtnetlink: Fetch IFLA_LINK_NETNSID in rtnl_newlink(). Another netns option for RTM_NEWLINK is IFLA_LINK_NETNSID and is fetched in rtnl_newlink_create(). This must be done before holding rtnl_net_lock(). Let's move IFLA_LINK_NETNSID processing to rtnl_newlink(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 49 ++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f6823c8d21ad..eee0f820ddf6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3634,7 +3634,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, - struct net *tgt_net, + struct net *tgt_net, struct net *link_net, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3644,7 +3644,6 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, u32 portid = NETLINK_CB(skb).portid; struct net_device *dev; char ifname[IFNAMSIZ]; - struct net *link_net; int err; if (!ops->alloc && !ops->setup) @@ -3657,22 +3656,6 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, name_assign_type = NET_NAME_ENUM; } - if (tb[IFLA_LINK_NETNSID]) { - int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); - - link_net = get_net_ns_by_id(tgt_net, id); - if (!link_net) { - NL_SET_ERR_MSG(extack, "Unknown network namespace id"); - err = -EINVAL; - goto out; - } - err = -EPERM; - if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) - goto out; - } else { - link_net = NULL; - } - dev = rtnl_create_link(link_net ? : tgt_net, ifname, name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { @@ -3705,9 +3688,6 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, goto out_unregister; } out: - if (link_net) - put_net(link_net); - return err; out_unregister: if (ops->newlink) { @@ -3723,7 +3703,7 @@ out_unregister: static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, - struct net *tgt_net, + struct net *tgt_net, struct net *link_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3772,16 +3752,16 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -EOPNOTSUPP; } - return rtnl_newlink_create(skb, ifm, ops, tgt_net, nlh, tb, data, extack); + return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, nlh, tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr **tb, **linkinfo, **data = NULL; + struct net *tgt_net, *link_net = NULL; struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; - struct net *tgt_net; int ops_srcu_index; int ret; @@ -3852,8 +3832,27 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto put_ops; } - ret = __rtnl_newlink(skb, nlh, ops, tgt_net, tbs, data, extack); + if (tb[IFLA_LINK_NETNSID]) { + int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); + + link_net = get_net_ns_by_id(tgt_net, id); + if (!link_net) { + NL_SET_ERR_MSG(extack, "Unknown network namespace id"); + ret = -EINVAL; + goto put_net; + } + + if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + goto put_net; + } + } + + ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, tbs, data, extack); +put_net: + if (link_net) + put_net(link_net); put_net(tgt_net); put_ops: if (ops) -- 2.51.0 From 175cfc5cd373b6665ec145cafe742252453a7c0e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:53 -0700 Subject: [PATCH 05/16] rtnetlink: Clean up rtnl_dellink(). We will push RTNL down to rtnl_delink(). Let's unify the error path to make it easy to place rtnl_net_lock(). While at it, keep the variables in reverse xmas order. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index eee0f820ddf6..a19b2eb2727e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3368,14 +3368,14 @@ EXPORT_SYMBOL_GPL(rtnl_delete_link); static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); u32 portid = NETLINK_CB(skb).portid; - struct net *tgt_net = net; - struct net_device *dev = NULL; - struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; - int err; + struct net_device *dev = NULL; + struct net *tgt_net = net; int netnsid = -1; + int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); @@ -3393,27 +3393,20 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, return PTR_ERR(tgt_net); } - err = -EINVAL; - ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); + + if (dev) + err = rtnl_delete_link(dev, portid, nlh); + else if (ifm->ifi_index > 0 || tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) + err = -ENODEV; else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else - goto out; - - if (!dev) { - if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME] || ifm->ifi_index > 0) - err = -ENODEV; - - goto out; - } - - err = rtnl_delete_link(dev, portid, nlh); + err = -EINVAL; -out: if (netnsid >= 0) put_net(tgt_net); -- 2.51.0 From 6e495fad88ef7bdea70b0e4a1a714f6eccab2a5a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:54 -0700 Subject: [PATCH 06/16] rtnetlink: Clean up rtnl_setlink(). We will push RTNL down to rtnl_setlink(). Let's unify the error path to make it easy to place rtnl_net_lock(). While at it, keep the variables in reverse xmas order. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index a19b2eb2727e..f89a902458d6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3279,11 +3279,11 @@ static struct net_device *rtnl_dev_get(struct net *net, static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct ifinfomsg *ifm = nlmsg_data(nlh); struct net *net = sock_net(skb->sk); - struct ifinfomsg *ifm; - struct net_device *dev; - int err; struct nlattr *tb[IFLA_MAX+1]; + struct net_device *dev = NULL; + int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); @@ -3294,21 +3294,18 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; - err = -EINVAL; - ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else - goto errout; + err = -EINVAL; - if (dev == NULL) { + if (dev) + err = do_setlink(skb, dev, ifm, extack, tb, 0); + else if (!err) err = -ENODEV; - goto errout; - } - err = do_setlink(skb, dev, ifm, extack, tb, 0); errout: return err; } -- 2.51.0 From a0b63c6457e100b84b1ff9179bc328c0924de75c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:55 -0700 Subject: [PATCH 07/16] rtnetlink: Call rtnl_link_get_net_capable() in do_setlink(). We will push RTNL down to rtnl_setlink(). RTM_SETLINK could call rtnl_link_get_net_capable() in do_setlink() to move a dev to a new netns, but the netns needs to be fetched before holding rtnl_net_lock(). Let's move it to rtnl_setlink() and pass the netns to do_setlink(). Now, RTM_NEWLINK paths (rtnl_changelink() and rtnl_group_changelink()) can pass the prefetched netns to do_setlink(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f89a902458d6..445e6ffed75e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2881,8 +2881,8 @@ static int do_set_proto_down(struct net_device *dev, #define DO_SETLINK_MODIFIED 0x01 /* notify flag means notify + modified. */ #define DO_SETLINK_NOTIFY 0x03 -static int do_setlink(const struct sk_buff *skb, - struct net_device *dev, struct ifinfomsg *ifm, +static int do_setlink(const struct sk_buff *skb, struct net_device *dev, + struct net *tgt_net, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb, int status) { @@ -2899,27 +2899,19 @@ static int do_setlink(const struct sk_buff *skb, else ifname[0] = '\0'; - if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { + if (!net_eq(tgt_net, dev_net(dev))) { const char *pat = ifname[0] ? ifname : NULL; - struct net *net; int new_ifindex; - net = rtnl_link_get_net_capable(skb, dev_net(dev), - tb, CAP_NET_ADMIN); - if (IS_ERR(net)) { - err = PTR_ERR(net); - goto errout; - } - if (tb[IFLA_NEW_IFINDEX]) new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); else new_ifindex = 0; - err = __dev_change_net_namespace(dev, net, pat, new_ifindex); - put_net(net); + err = __dev_change_net_namespace(dev, tgt_net, pat, new_ifindex); if (err) goto errout; + status |= DO_SETLINK_MODIFIED; } @@ -3283,6 +3275,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; + struct net *tgt_net; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, @@ -3294,6 +3287,12 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; + tgt_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); + if (IS_ERR(tgt_net)) { + err = PTR_ERR(tgt_net); + goto errout; + } + if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) @@ -3302,10 +3301,11 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EINVAL; if (dev) - err = do_setlink(skb, dev, ifm, extack, tb, 0); + err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); else if (!err) err = -ENODEV; + put_net(tgt_net); errout: return err; } @@ -3599,7 +3599,7 @@ static int rtnl_changelink(const struct sk_buff *skb, struct nlmsghdr *nlh, status |= DO_SETLINK_NOTIFY; } - return do_setlink(skb, dev, nlmsg_data(nlh), extack, tb, status); + return do_setlink(skb, dev, tgt_net, nlmsg_data(nlh), extack, tb, status); } static int rtnl_group_changelink(const struct sk_buff *skb, @@ -3613,7 +3613,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { - err = do_setlink(skb, dev, ifm, extack, tb, 0); + err = do_setlink(skb, dev, tgt_net, ifm, extack, tb, 0); if (err < 0) return err; } -- 2.51.0 From 26eebdc4b005ccd4cf63f4fef4c9c0adf9bfa380 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:56 -0700 Subject: [PATCH 08/16] rtnetlink: Return int from rtnl_af_register(). The next patch will add init_srcu_struct() in rtnl_af_register(), then we need to handle its error. Let's add the error handling in advance to make the following patch cleaner. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Matt Johnston Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 2 +- net/bridge/br_netlink.c | 6 +++++- net/core/rtnetlink.c | 4 +++- net/ipv4/devinet.c | 3 ++- net/ipv6/addrconf.c | 5 ++++- net/mctp/device.c | 16 +++++++++++----- net/mpls/af_mpls.c | 5 ++++- 7 files changed, 30 insertions(+), 11 deletions(-) diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 1a6aa5ca74f3..969138ae2f4b 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -204,7 +204,7 @@ struct rtnl_af_ops { size_t (*get_stats_af_size)(const struct net_device *dev); }; -void rtnl_af_register(struct rtnl_af_ops *ops); +int rtnl_af_register(struct rtnl_af_ops *ops); void rtnl_af_unregister(struct rtnl_af_ops *ops); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 6b97ae47f855..3e0f47203f2a 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1924,7 +1924,9 @@ int __init br_netlink_init(void) if (err) goto out; - rtnl_af_register(&br_af_ops); + err = rtnl_af_register(&br_af_ops); + if (err) + goto out_vlan; err = rtnl_link_register(&br_link_ops); if (err) @@ -1934,6 +1936,8 @@ int __init br_netlink_init(void) out_af: rtnl_af_unregister(&br_af_ops); +out_vlan: + br_vlan_rtnl_uninit(); out: return err; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 445e6ffed75e..70b663aca209 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -686,11 +686,13 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) * * Returns 0 on success or a negative error code. */ -void rtnl_af_register(struct rtnl_af_ops *ops) +int rtnl_af_register(struct rtnl_af_ops *ops) { rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); + + return 0; } EXPORT_SYMBOL_GPL(rtnl_af_register); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ec29ead83e74..0ff9c0abfaa0 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2827,7 +2827,8 @@ void __init devinet_init(void) register_pernet_subsys(&devinet_ops); register_netdevice_notifier(&ip_netdev_notifier); - rtnl_af_register(&inet_af_ops); + if (rtnl_af_register(&inet_af_ops)) + panic("Unable to register inet_af_ops\n"); rtnl_register_many(devinet_rtnl_msg_handlers); } diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ac8645ad2537..d0a99710d65d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7468,7 +7468,9 @@ int __init addrconf_init(void) addrconf_verify(&init_net); - rtnl_af_register(&inet6_ops); + err = rtnl_af_register(&inet6_ops); + if (err) + goto erraf; err = rtnl_register_many(addrconf_rtnl_msg_handlers); if (err) @@ -7482,6 +7484,7 @@ int __init addrconf_init(void) errout: rtnl_unregister_all(PF_INET6); rtnl_af_unregister(&inet6_ops); +erraf: unregister_netdevice_notifier(&ipv6_dev_notf); errlo: destroy_workqueue(addrconf_wq); diff --git a/net/mctp/device.c b/net/mctp/device.c index 85cc5f31f1e7..3d75b919995d 100644 --- a/net/mctp/device.c +++ b/net/mctp/device.c @@ -535,14 +535,20 @@ int __init mctp_device_init(void) int err; register_netdevice_notifier(&mctp_dev_nb); - rtnl_af_register(&mctp_af_ops); + + err = rtnl_af_register(&mctp_af_ops); + if (err) + goto err_notifier; err = rtnl_register_many(mctp_device_rtnl_msg_handlers); - if (err) { - rtnl_af_unregister(&mctp_af_ops); - unregister_netdevice_notifier(&mctp_dev_nb); - } + if (err) + goto err_af; + return 0; +err_af: + rtnl_af_unregister(&mctp_af_ops); +err_notifier: + unregister_netdevice_notifier(&mctp_dev_nb); return err; } diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index a0573847bc55..1f63b32d76d6 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -2753,7 +2753,9 @@ static int __init mpls_init(void) dev_add_pack(&mpls_packet_type); - rtnl_af_register(&mpls_af_ops); + err = rtnl_af_register(&mpls_af_ops); + if (err) + goto out_unregister_dev_type; err = rtnl_register_many(mpls_rtnl_msg_handlers); if (err) @@ -2773,6 +2775,7 @@ out_unregister_rtnl: rtnl_unregister_many(mpls_rtnl_msg_handlers); out_unregister_rtnl_af: rtnl_af_unregister(&mpls_af_ops); +out_unregister_dev_type: dev_remove_pack(&mpls_packet_type); out_unregister_pernet: unregister_pernet_subsys(&mpls_net_ops); -- 2.51.0 From 6ab0f866948323724e95cf14d9e47fd77703c192 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Oct 2024 11:53:57 -0700 Subject: [PATCH 09/16] rtnetlink: Protect struct rtnl_af_ops with SRCU. Once RTNL is replaced with rtnl_net_lock(), we need a mechanism to guarantee that rtnl_af_ops is alive during inflight RTM_SETLINK even when its module is being unloaded. Let's use SRCU to protect ops. rtnl_af_lookup() now iterates rtnl_af_ops under RCU and returns SRCU-protected ops pointer. The caller must call rtnl_af_put() to release the pointer after the use. Also, rtnl_af_unregister() unlinks the ops first and calls synchronize_srcu() to wait for inflight RTM_SETLINK requests to complete. Note that rtnl_af_ops needs to be protected by its dedicated lock when RTNL is removed. Note also that BUG_ON() in do_setlink() is changed to the normal error handling as a different af_ops might be found after validate_linkmsg(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/rtnetlink.h | 5 +++- net/core/rtnetlink.c | 63 ++++++++++++++++++++++++++++++----------- 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 969138ae2f4b..e0d9a8eae6b6 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -172,7 +172,8 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops); /** * struct rtnl_af_ops - rtnetlink address family operations * - * @list: Used internally + * @list: Used internally, protected by RTNL and SRCU + * @srcu: Used internally * @family: Address family * @fill_link_af: Function to fill IFLA_AF_SPEC with address family * specific netlink attributes. @@ -185,6 +186,8 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops); */ struct rtnl_af_ops { struct list_head list; + struct srcu_struct srcu; + int family; int (*fill_link_af)(struct sk_buff *skb, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 70b663aca209..194a81e5f608 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -666,18 +666,31 @@ static size_t rtnl_link_get_size(const struct net_device *dev) static LIST_HEAD(rtnl_af_ops); -static const struct rtnl_af_ops *rtnl_af_lookup(const int family) +static struct rtnl_af_ops *rtnl_af_lookup(const int family, int *srcu_index) { - const struct rtnl_af_ops *ops; + struct rtnl_af_ops *ops; ASSERT_RTNL(); - list_for_each_entry(ops, &rtnl_af_ops, list) { - if (ops->family == family) - return ops; + rcu_read_lock(); + + list_for_each_entry_rcu(ops, &rtnl_af_ops, list) { + if (ops->family == family) { + *srcu_index = srcu_read_lock(&ops->srcu); + goto unlock; + } } - return NULL; + ops = NULL; +unlock: + rcu_read_unlock(); + + return ops; +} + +static void rtnl_af_put(struct rtnl_af_ops *ops, int srcu_index) +{ + srcu_read_unlock(&ops->srcu, srcu_index); } /** @@ -688,6 +701,11 @@ static const struct rtnl_af_ops *rtnl_af_lookup(const int family) */ int rtnl_af_register(struct rtnl_af_ops *ops) { + int err = init_srcu_struct(&ops->srcu); + + if (err) + return err; + rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); @@ -707,6 +725,8 @@ void rtnl_af_unregister(struct rtnl_af_ops *ops) rtnl_unlock(); synchronize_rcu(); + synchronize_srcu(&ops->srcu); + cleanup_srcu_struct(&ops->srcu); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); @@ -2579,20 +2599,24 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - af_ops = rtnl_af_lookup(nla_type(af)); + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) - return -EOPNOTSUPP; - - if (af_ops->validate_link_af) { + err = -EOPNOTSUPP; + else if (af_ops->validate_link_af) err = af_ops->validate_link_af(dev, af, extack); - if (err < 0) - return err; - } + else + err = 0; + + rtnl_af_put(af_ops, af_ops_srcu_index); + + if (err < 0) + return err; } } @@ -3172,11 +3196,18 @@ static int do_setlink(const struct sk_buff *skb, struct net_device *dev, int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { - const struct rtnl_af_ops *af_ops; + struct rtnl_af_ops *af_ops; + int af_ops_srcu_index; - BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); + af_ops = rtnl_af_lookup(nla_type(af), &af_ops_srcu_index); + if (!af_ops) { + err = -EAFNOSUPPORT; + goto errout; + } err = af_ops->set_link_af(dev, af, extack); + rtnl_af_put(af_ops, af_ops_srcu_index); + if (err < 0) goto errout; -- 2.51.0 From d10f1a4e44c3bf874701f86f8cc43490e1956acf Mon Sep 17 00:00:00 2001 From: Abhishek Chauhan Date: Wed, 16 Oct 2024 16:43:13 -0700 Subject: [PATCH 10/16] net: stmmac: Programming sequence for VLAN packets with split header Currently reset state configuration of split header works fine for non-tagged packets and we see no corruption in payload of any size We need additional programming sequence with reset configuration to handle VLAN tagged packets to avoid corruption in payload for packets of size greater than 256 bytes. Without this change ping application complains about corruption in payload when the size of the VLAN packet exceeds 256 bytes. With this change tagged and non-tagged packets of any size works fine and there is no corruption seen. Current configuration which has the issue for VLAN packet ---------------------------------------------------------- Split happens at the position at Layer 3 header |MAC-DA|MAC-SA|Vlan Tag|Ether type|IP header|IP data|Rest of the payload| 2 bytes ^ | With the fix we are making sure that the split happens now at Layer 2 which is end of ethernet header and start of IP payload Ip traffic split ----------------- Bits which take care of this are SPLM and SPLOFST SPLM = Split mode is set to Layer 2 SPLOFST = These bits indicate the value of offset from the beginning of Length/Type field at which header split should take place when the appropriate SPLM is selected. Reset value is 2bytes. Un-tagged data (without VLAN) |MAC-DA|MAC-SA|Ether type|IP header|IP data|Rest of the payload| 2bytes ^ | Tagged data (with VLAN) |MAC-DA|MAC-SA|VLAN Tag|Ether type|IP header|IP data|Rest of the payload| 2bytes ^ | Non-IP traffic split such AV packet ------------------------------------ Bits which take care of this are SAVE = Split AV Enable SAVO = Split AV Offset, similar to SPLOFST but this is for AVTP packets. |Preamble|MAC-DA|MAC-SA|VLAN tag|Ether type|IEEE 1722 payload|CRC| 2bytes ^ | Signed-off-by: Abhishek Chauhan Reviewed-by: Simon Horman Link: https://patch.msgid.link/20241016234313.3992214-1-quic_abchauha@quicinc.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/dwmac4.h | 5 +++++ drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h index 93a78fd0737b..28fff6cab812 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4.h @@ -44,6 +44,7 @@ #define GMAC_MDIO_DATA 0x00000204 #define GMAC_GPIO_STATUS 0x0000020C #define GMAC_ARP_ADDR 0x00000210 +#define GMAC_EXT_CFG1 0x00000238 #define GMAC_ADDR_HIGH(reg) (0x300 + reg * 8) #define GMAC_ADDR_LOW(reg) (0x304 + reg * 8) #define GMAC_L3L4_CTRL(reg) (0x900 + (reg) * 0x30) @@ -284,6 +285,10 @@ enum power_event { #define GMAC_HW_FEAT_DVLAN BIT(5) #define GMAC_HW_FEAT_NRVF GENMASK(2, 0) +/* MAC extended config 1 */ +#define GMAC_CONFIG1_SAVE_EN BIT(24) +#define GMAC_CONFIG1_SPLM(v) FIELD_PREP(GENMASK(9, 8), v) + /* GMAC GPIO Status reg */ #define GMAC_GPO0 BIT(16) #define GMAC_GPO1 BIT(17) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c index e0165358c4ac..7c895e0ae71f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_dma.c @@ -526,6 +526,11 @@ static void dwmac4_enable_sph(struct stmmac_priv *priv, void __iomem *ioaddr, value |= GMAC_CONFIG_HDSMS_256; /* Segment max 256 bytes */ writel(value, ioaddr + GMAC_EXT_CONFIG); + value = readl(ioaddr + GMAC_EXT_CFG1); + value |= GMAC_CONFIG1_SPLM(1); /* Split mode set to L2OFST */ + value |= GMAC_CONFIG1_SAVE_EN; /* Enable Split AV mode */ + writel(value, ioaddr + GMAC_EXT_CFG1); + value = readl(ioaddr + DMA_CHAN_CONTROL(dwmac4_addrs, chan)); if (en) value |= DMA_CONTROL_SPH; -- 2.51.0 From c797cb9c09882195d58989a40cc7d42c9c033a3b Mon Sep 17 00:00:00 2001 From: Paul Davey Date: Thu, 17 Oct 2024 14:50:25 +1300 Subject: [PATCH 11/16] net: phy: marvell: Add mdix status reporting Report MDI-X resolved state after link up. Tested on Linkstreet 88E6193X internal PHYs. Signed-off-by: Paul Davey Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241017015026.255224-1-paul.davey@alliedtelesis.co.nz Signed-off-by: Paolo Abeni --- drivers/net/phy/marvell.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/drivers/net/phy/marvell.c b/drivers/net/phy/marvell.c index 9964bf3dea2f..28aec37acd2c 100644 --- a/drivers/net/phy/marvell.c +++ b/drivers/net/phy/marvell.c @@ -176,6 +176,7 @@ #define MII_M1011_PHY_STATUS_FULLDUPLEX 0x2000 #define MII_M1011_PHY_STATUS_RESOLVED 0x0800 #define MII_M1011_PHY_STATUS_LINK 0x0400 +#define MII_M1011_PHY_STATUS_MDIX BIT(6) #define MII_88E3016_PHY_SPEC_CTRL 0x10 #define MII_88E3016_DISABLE_SCRAMBLER 0x0200 @@ -1722,6 +1723,19 @@ static int marvell_read_status_page(struct phy_device *phydev, int page) phydev->duplex = DUPLEX_UNKNOWN; phydev->port = fiber ? PORT_FIBRE : PORT_TP; + if (fiber) { + phydev->mdix = ETH_TP_MDI_INVALID; + } else { + /* The MDI-X state is set regardless of Autoneg being enabled + * and reflects forced MDI-X state as well as auto resolution + */ + if (status & MII_M1011_PHY_STATUS_RESOLVED) + phydev->mdix = status & MII_M1011_PHY_STATUS_MDIX ? + ETH_TP_MDI_X : ETH_TP_MDI; + else + phydev->mdix = ETH_TP_MDI_INVALID; + } + if (phydev->autoneg == AUTONEG_ENABLE) err = marvell_read_status_page_an(phydev, fiber, status); else -- 2.51.0 From a6e263f125cd7b10614a83159c453c061dbf6877 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 17 Oct 2024 11:45:43 +0200 Subject: [PATCH 12/16] selftests: net: lib: Introduce deferred commands In commit 8510801a9dbd ("selftests: drv-net: add ability to schedule cleanup with defer()"), a defer helper was added to Python selftests. The idea is to keep cleanup commands close to their dirtying counterparts, thereby making it more transparent what is cleaning up what, making it harder to miss a cleanup, and make the whole cleanup business exception safe. All these benefits are applicable to bash as well, exception safety can be interpreted in terms of safety vs. a SIGINT. This patch therefore introduces a framework of several helpers that serve to schedule cleanups in bash selftests: - defer_scope_push(), defer_scope_pop(): Deferred statements can be batched together in scopes. When a scope is popped, the deferred commands scheduled in that scope are executed in the order opposite to order of their scheduling. - defer(): Schedules a defer to the most recently pushed scope (or the default scope if none was pushed.) - defer_prio(): Schedules a defer on the priority track. The priority defer queue is run before the default defer queue when scope is popped. The issue that this is addressing is specifically the one of restoring devlink shared buffer threshold type. When setting up static thresholds, one has to first change the threshold type to static, then override the individual thresholds. When cleaning up, it would be natural to reset the threshold values first, then change the threshold type. But the values that are valid for dynamic thresholds are generally invalid for static thresholds and vice versa. Attempts to restore the values first would be bounced. Thus one has to first reset the threshold type, then adjust the thresholds. (You could argue that the shared buffer threshold type API is broken and you would be right, but here we are.) This cannot be solved by pure defers easily. I considered making it possible to disable an existing defer, so that one could then schedule a new defer and disable the original. But this forward-shifting of the defer job would have to take place after every threshold-adjusting command, which would make it very awkward to schedule these jobs. - defer_scopes_cleanup(): Pops any unpopped scopes, including the default one. The selftests that use defer should run this in their exit trap. This is important to get cleanups of interrupted scripts. - in_defer_scope(): Sometimes a function would like to introduce a new defer scope, then run whatever it is that it wants to run, and then pop the scope to run the deferred cleanups. The helper in_defer_scope() can be used to run another command within such environment, such that any scheduled defers run after the command finishes. The framework is added as a separate file lib/sh/defer.sh so that it can be used by all bash selftests, including those that do not currently use lib.sh. lib.sh however includes the file by default, because ideally all tests would use these helpers instead of hand-rolling their cleanups. Signed-off-by: Petr Machata Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 3 +- tools/testing/selftests/net/lib.sh | 3 + tools/testing/selftests/net/lib/Makefile | 2 +- tools/testing/selftests/net/lib/sh/defer.sh | 115 ++++++++++++++++++ 4 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/net/lib/sh/defer.sh diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index c992e385159c..d24b6af7ebfa 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1403,7 +1403,8 @@ tests_run() local current_test for current_test in ${TESTS:-$ALL_TESTS}; do - $current_test + in_defer_scope \ + $current_test done } diff --git a/tools/testing/selftests/net/lib.sh b/tools/testing/selftests/net/lib.sh index be8707bfb46e..c8991cc6bf28 100644 --- a/tools/testing/selftests/net/lib.sh +++ b/tools/testing/selftests/net/lib.sh @@ -1,6 +1,9 @@ #!/bin/bash # SPDX-License-Identifier: GPL-2.0 +net_dir=$(dirname "$(readlink -e "${BASH_SOURCE[0]}")") +source "$net_dir/lib/sh/defer.sh" + ############################################################################## # Defines diff --git a/tools/testing/selftests/net/lib/Makefile b/tools/testing/selftests/net/lib/Makefile index 82c3264b115e..18b9443454a9 100644 --- a/tools/testing/selftests/net/lib/Makefile +++ b/tools/testing/selftests/net/lib/Makefile @@ -10,6 +10,6 @@ TEST_FILES += ../../../../net/ynl TEST_GEN_FILES += csum -TEST_INCLUDES := $(wildcard py/*.py) +TEST_INCLUDES := $(wildcard py/*.py sh/*.sh) include ../../lib.mk diff --git a/tools/testing/selftests/net/lib/sh/defer.sh b/tools/testing/selftests/net/lib/sh/defer.sh new file mode 100644 index 000000000000..082f5d38321b --- /dev/null +++ b/tools/testing/selftests/net/lib/sh/defer.sh @@ -0,0 +1,115 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# map[(scope_id,track,cleanup_id) -> cleanup_command] +# track={d=default | p=priority} +declare -A __DEFER__JOBS + +# map[(scope_id,track) -> # cleanup_commands] +declare -A __DEFER__NJOBS + +# scope_id of the topmost scope. +__DEFER__SCOPE_ID=0 + +__defer__ndefer_key() +{ + local track=$1; shift + + echo $__DEFER__SCOPE_ID,$track +} + +__defer__defer_key() +{ + local track=$1; shift + local defer_ix=$1; shift + + echo $__DEFER__SCOPE_ID,$track,$defer_ix +} + +__defer__ndefers() +{ + local track=$1; shift + + echo ${__DEFER__NJOBS[$(__defer__ndefer_key $track)]} +} + +__defer__run() +{ + local track=$1; shift + local defer_ix=$1; shift + local defer_key=$(__defer__defer_key $track $defer_ix) + + ${__DEFER__JOBS[$defer_key]} + unset __DEFER__JOBS[$defer_key] +} + +__defer__schedule() +{ + local track=$1; shift + local ndefers=$(__defer__ndefers $track) + local ndefers_key=$(__defer__ndefer_key $track) + local defer_key=$(__defer__defer_key $track $ndefers) + local defer="$@" + + __DEFER__JOBS[$defer_key]="$defer" + __DEFER__NJOBS[$ndefers_key]=$((ndefers + 1)) +} + +__defer__scope_wipe() +{ + __DEFER__NJOBS[$(__defer__ndefer_key d)]=0 + __DEFER__NJOBS[$(__defer__ndefer_key p)]=0 +} + +defer_scope_push() +{ + ((__DEFER__SCOPE_ID++)) + __defer__scope_wipe +} + +defer_scope_pop() +{ + local defer_ix + + for ((defer_ix=$(__defer__ndefers p); defer_ix-->0; )); do + __defer__run p $defer_ix + done + + for ((defer_ix=$(__defer__ndefers d); defer_ix-->0; )); do + __defer__run d $defer_ix + done + + __defer__scope_wipe + ((__DEFER__SCOPE_ID--)) +} + +defer() +{ + __defer__schedule d "$@" +} + +defer_prio() +{ + __defer__schedule p "$@" +} + +defer_scopes_cleanup() +{ + while ((__DEFER__SCOPE_ID >= 0)); do + defer_scope_pop + done +} + +in_defer_scope() +{ + local ret + + defer_scope_push + "$@" + ret=$? + defer_scope_pop + + return $ret +} + +__defer__scope_wipe -- 2.51.0 From b4b0549a4e59747b49619b2edabfb0d04e37c0b9 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 17 Oct 2024 11:45:44 +0200 Subject: [PATCH 13/16] selftests: forwarding: Add a fallback cleanup() Consistent use of defers obviates the need for a separate test-specific cleanup function -- everything is just taken care of in defers. So in this patch, introduce a cleanup() helper in the forwarding lib.sh, which calls just pre_cleanup() and defer_scopes_cleanup(). Selftests are obviously still free to override the function. Since pre_cleanup() is too entangled with forwarding-specific minutia, the function cannot currently be in net/lib.sh. Signed-off-by: Petr Machata Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index d24b6af7ebfa..76e6d7698caf 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1408,6 +1408,12 @@ tests_run() done } +cleanup() +{ + pre_cleanup + defer_scopes_cleanup +} + multipath_eval() { local desc="$1" -- 2.51.0 From 0e07d5dbfbd9b0441ae4ec07a2a72738121356e2 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 17 Oct 2024 11:45:45 +0200 Subject: [PATCH 14/16] selftests: forwarding: lib: Allow passing PID to stop_traffic() Now that it is possible to schedule a deferral of stop_traffic() right after the traffic is started, we do not have to rely on the %% magic to kill the background process that was started last. Instead we can just give the PID explicitly. This makes it possible to start other background processes after the traffic is started without confusing the cleanup. Signed-off-by: Petr Machata Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/forwarding/lib.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/lib.sh b/tools/testing/selftests/net/forwarding/lib.sh index 76e6d7698caf..89c25f72b10c 100644 --- a/tools/testing/selftests/net/forwarding/lib.sh +++ b/tools/testing/selftests/net/forwarding/lib.sh @@ -1768,8 +1768,10 @@ start_tcp_traffic() stop_traffic() { + local pid=${1-%%}; shift + # Suppress noise from killing mausezahn. - { kill %% && wait %%; } 2>/dev/null + { kill $pid && wait $pid; } 2>/dev/null } declare -A cappid -- 2.51.0 From 7f46615d59373b65dcd0fea7784bf20f93c169f0 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 17 Oct 2024 11:45:46 +0200 Subject: [PATCH 15/16] selftests: RED: Use defer for test cleanup Instead of having a suite of dedicated cleanup functions, use the defer framework to schedule cleanups right as their setup functions are run. The sleep after stop_traffic() in mlxsw selftests is necessary, but scheduling it as "defer sleep; defer stop_traffic" is silly. Instead, add a local helper to stop traffic and sleep afterwards. Signed-off-by: Petr Machata Signed-off-by: Paolo Abeni --- .../drivers/net/mlxsw/sch_red_core.sh | 185 +++++++++--------- .../drivers/net/mlxsw/sch_red_ets.sh | 24 +-- .../drivers/net/mlxsw/sch_red_root.sh | 18 +- .../selftests/net/forwarding/sch_red.sh | 103 ++++------ 4 files changed, 149 insertions(+), 181 deletions(-) diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh index f4c324957dcc..537d6baa77b7 100644 --- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_core.sh @@ -75,6 +75,18 @@ source $lib_dir/lib.sh source $lib_dir/devlink_lib.sh source mlxsw_lib.sh +stop_traffic_sleep() +{ + local pid=$1; shift + + # Issuing a kill still leaves a bunch of packets lingering in the + # buffers. This traffic then arrives at the point where a follow-up test + # is already running, and can confuse the test. Therefore sleep after + # stopping traffic to flush any leftover packets. + stop_traffic "$pid" + sleep 1 +} + ipaddr() { local host=$1; shift @@ -89,39 +101,31 @@ host_create() local host=$1; shift simple_if_init $dev + defer simple_if_fini $dev + mtu_set $dev 10000 + defer mtu_restore $dev vlan_create $dev 10 v$dev $(ipaddr $host 10)/28 + defer vlan_destroy $dev 10 ip link set dev $dev.10 type vlan egress 0:0 vlan_create $dev 11 v$dev $(ipaddr $host 11)/28 + defer vlan_destroy $dev 11 ip link set dev $dev.11 type vlan egress 0:1 } -host_destroy() -{ - local dev=$1; shift - - vlan_destroy $dev 11 - vlan_destroy $dev 10 - mtu_restore $dev - simple_if_fini $dev -} - h1_create() { host_create $h1 1 } -h1_destroy() -{ - host_destroy $h1 -} - h2_create() { host_create $h2 2 + tc qdisc add dev $h2 clsact + defer tc qdisc del dev $h2 clsact # Some of the tests in this suite use multicast traffic. As this traffic # enters BR2_10 resp. BR2_11, it is flooded to all other ports. Thus @@ -139,13 +143,7 @@ h2_create() tc qdisc replace dev $h2 root handle 10: tbf rate 200mbit \ burst 128K limit 1G -} - -h2_destroy() -{ - tc qdisc del dev $h2 root handle 10: - tc qdisc del dev $h2 clsact - host_destroy $h2 + defer tc qdisc del dev $h2 root handle 10: } h3_create() @@ -153,40 +151,54 @@ h3_create() host_create $h3 3 } -h3_destroy() -{ - host_destroy $h3 -} - switch_create() { local intf local vlan ip link add dev br1_10 type bridge + defer ip link del dev br1_10 + ip link add dev br1_11 type bridge + defer ip link del dev br1_11 ip link add dev br2_10 type bridge + defer ip link del dev br2_10 + ip link add dev br2_11 type bridge + defer ip link del dev br2_11 for intf in $swp1 $swp2 $swp3 $swp4 $swp5; do ip link set dev $intf up + defer ip link set dev $intf down + mtu_set $intf 10000 + defer mtu_restore $intf done for intf in $swp1 $swp4; do for vlan in 10 11; do vlan_create $intf $vlan + defer vlan_destroy $intf $vlan + ip link set dev $intf.$vlan master br1_$vlan + defer ip link set dev $intf.$vlan nomaster + ip link set dev $intf.$vlan up + defer ip link set dev $intf.$vlan up done done for intf in $swp2 $swp3 $swp5; do for vlan in 10 11; do vlan_create $intf $vlan + defer vlan_destroy $intf $vlan + ip link set dev $intf.$vlan master br2_$vlan + defer ip link set dev $intf.$vlan nomaster + ip link set dev $intf.$vlan up + defer ip link set dev $intf.$vlan up done done @@ -201,49 +213,25 @@ switch_create() for intf in $swp3 $swp4; do tc qdisc replace dev $intf root handle 1: tbf rate 200mbit \ burst 128K limit 1G + defer tc qdisc del dev $intf root handle 1: done ip link set dev br1_10 up + defer ip link set dev br1_10 down + ip link set dev br1_11 up + defer ip link set dev br1_11 down + ip link set dev br2_10 up + defer ip link set dev br2_10 down + ip link set dev br2_11 up + defer ip link set dev br2_11 down local size=$(devlink_pool_size_thtype 0 | cut -d' ' -f 1) devlink_port_pool_th_save $swp3 8 devlink_port_pool_th_set $swp3 8 $size -} - -switch_destroy() -{ - local intf - local vlan - - devlink_port_pool_th_restore $swp3 8 - - ip link set dev br2_11 down - ip link set dev br2_10 down - ip link set dev br1_11 down - ip link set dev br1_10 down - - for intf in $swp4 $swp3; do - tc qdisc del dev $intf root handle 1: - done - - for intf in $swp5 $swp3 $swp2 $swp4 $swp1; do - for vlan in 11 10; do - ip link set dev $intf.$vlan down - ip link set dev $intf.$vlan nomaster - vlan_destroy $intf $vlan - done - - mtu_restore $intf - ip link set dev $intf down - done - - ip link del dev br2_11 - ip link del dev br2_10 - ip link del dev br1_11 - ip link del dev br1_10 + defer devlink_port_pool_th_restore $swp3 8 } setup_prepare() @@ -263,6 +251,7 @@ setup_prepare() h3_mac=$(mac_get $h3) vrf_prepare + defer vrf_cleanup h1_create h2_create @@ -270,18 +259,6 @@ setup_prepare() switch_create } -cleanup() -{ - pre_cleanup - - switch_destroy - h3_destroy - h2_destroy - h1_destroy - - vrf_cleanup -} - ping_ipv4() { ping_test $h1.10 $(ipaddr 3 10) " from host 1, vlan 10" @@ -450,6 +427,7 @@ __do_ecn_test() start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \ $h3_mac tos=0x01 + defer stop_traffic_sleep $! sleep 1 ecn_test_common "$name" "$get_nmarked" $vlan $limit @@ -461,9 +439,6 @@ __do_ecn_test() build_backlog $vlan $((2 * limit)) udp >/dev/null check_fail $? "UDP traffic went into backlog instead of being early-dropped" log_test "TC $((vlan - 10)): $name backlog > limit: UDP early-dropped" - - stop_traffic - sleep 1 } do_ecn_test() @@ -471,7 +446,8 @@ do_ecn_test() local vlan=$1; shift local limit=$1; shift - __do_ecn_test get_nmarked "$vlan" "$limit" + in_defer_scope \ + __do_ecn_test get_nmarked "$vlan" "$limit" } do_ecn_test_perband() @@ -480,10 +456,11 @@ do_ecn_test_perband() local limit=$1; shift mlxsw_only_on_spectrum 3+ || return - __do_ecn_test get_qdisc_nmarked "$vlan" "$limit" "per-band ECN" + in_defer_scope \ + __do_ecn_test get_qdisc_nmarked "$vlan" "$limit" "per-band ECN" } -do_ecn_nodrop_test() +__do_ecn_nodrop_test() { local vlan=$1; shift local limit=$1; shift @@ -491,6 +468,7 @@ do_ecn_nodrop_test() start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \ $h3_mac tos=0x01 + defer stop_traffic_sleep $! sleep 1 ecn_test_common "$name" get_nmarked $vlan $limit @@ -502,12 +480,15 @@ do_ecn_nodrop_test() build_backlog $vlan $((2 * limit)) udp >/dev/null check_err $? "UDP traffic was early-dropped instead of getting into backlog" log_test "TC $((vlan - 10)): $name backlog > limit: UDP not dropped" +} - stop_traffic - sleep 1 +do_ecn_nodrop_test() +{ + in_defer_scope \ + __do_ecn_nodrop_test "$@" } -do_red_test() +__do_red_test() { local vlan=$1; shift local limit=$1; shift @@ -518,6 +499,7 @@ do_red_test() # is above limit. start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \ $h3_mac tos=0x01 + defer stop_traffic_sleep $! # Pushing below the queue limit should work. RET=0 @@ -539,12 +521,15 @@ do_red_test() ((-15 <= pct && pct <= 15)) check_err $? "backlog $backlog / $limit expected <= 15% distance" log_test "TC $((vlan - 10)): RED backlog > limit" +} - stop_traffic - sleep 1 +do_red_test() +{ + in_defer_scope \ + __do_red_test "$@" } -do_mc_backlog_test() +__do_mc_backlog_test() { local vlan=$1; shift local limit=$1; shift @@ -554,7 +539,10 @@ do_mc_backlog_test() RET=0 start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) bc + defer stop_traffic_sleep $! + start_tcp_traffic $h2.$vlan $(ipaddr 2 $vlan) $(ipaddr 3 $vlan) bc + defer stop_traffic_sleep $! qbl=$(busywait 5000 until_counter_is ">= 500000" \ get_qdisc_backlog $vlan) @@ -567,13 +555,16 @@ do_mc_backlog_test() get_mc_transmit_queue $vlan) check_err $? "MC backlog reported by qdisc not visible in ethtool" - stop_traffic - stop_traffic - log_test "TC $((vlan - 10)): Qdisc reports MC backlog" } -do_mark_test() +do_mc_backlog_test() +{ + in_defer_scope \ + __do_mc_backlog_test "$@" +} + +__do_mark_test() { local vlan=$1; shift local limit=$1; shift @@ -588,6 +579,7 @@ do_mark_test() start_tcp_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) \ $h3_mac tos=0x01 + defer stop_traffic_sleep $! # Create a bit of a backlog and observe no mirroring due to marks. qevent_rule_install_$subtest @@ -617,12 +609,15 @@ do_mark_test() else log_test "TC $((vlan - 10)): marked packets $subtest'd" fi +} - stop_traffic - sleep 1 +do_mark_test() +{ + in_defer_scope \ + __do_mark_test "$@" } -do_drop_test() +__do_drop_test() { local vlan=$1; shift local limit=$1; shift @@ -637,6 +632,7 @@ do_drop_test() RET=0 start_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 3 $vlan) $h3_mac + defer stop_traffic_sleep $! # Create a bit of a backlog and observe no mirroring due to drops. qevent_rule_install_$subtest @@ -671,9 +667,12 @@ do_drop_test() check_fail $? "$((now - base)) spurious packets observed after uninstall" log_test "TC $((vlan - 10)): ${trigger}ped packets $subtest'd" +} - stop_traffic - sleep 1 +do_drop_test() +{ + in_defer_scope \ + __do_drop_test "$@" } qevent_rule_install_mirror() diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh index 576067b207a8..8902a115d9cd 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_ets.sh @@ -80,36 +80,34 @@ uninstall_qdisc() ecn_test() { install_qdisc ecn + defer uninstall_qdisc do_ecn_test 10 $BACKLOG1 do_ecn_test 11 $BACKLOG2 - - uninstall_qdisc } ecn_test_perband() { install_qdisc ecn + defer uninstall_qdisc do_ecn_test_perband 10 $BACKLOG1 do_ecn_test_perband 11 $BACKLOG2 - - uninstall_qdisc } ecn_nodrop_test() { install_qdisc ecn nodrop + defer uninstall_qdisc do_ecn_nodrop_test 10 $BACKLOG1 do_ecn_nodrop_test 11 $BACKLOG2 - - uninstall_qdisc } red_test() { install_qdisc + defer uninstall_qdisc # Make sure that we get the non-zero value if there is any. local cur=$(busywait 1100 until_counter_is "> 0" \ @@ -120,50 +118,44 @@ red_test() do_red_test 10 $BACKLOG1 do_red_test 11 $BACKLOG2 - - uninstall_qdisc } mc_backlog_test() { install_qdisc + defer uninstall_qdisc # Note that the backlog numbers here do not correspond to RED # configuration, but are arbitrary. do_mc_backlog_test 10 $BACKLOG1 do_mc_backlog_test 11 $BACKLOG2 - - uninstall_qdisc } red_mirror_test() { install_qdisc qevent early_drop block 10 + defer uninstall_qdisc do_drop_mirror_test 10 $BACKLOG1 early_drop do_drop_mirror_test 11 $BACKLOG2 early_drop - - uninstall_qdisc } red_trap_test() { install_qdisc qevent early_drop block 10 + defer uninstall_qdisc do_drop_trap_test 10 $BACKLOG1 early_drop do_drop_trap_test 11 $BACKLOG2 early_drop - - uninstall_qdisc } ecn_mirror_test() { install_qdisc ecn qevent mark block 10 + defer uninstall_qdisc do_mark_mirror_test 10 $BACKLOG1 do_mark_mirror_test 11 $BACKLOG2 - - uninstall_qdisc } bail_on_lldpad "configure DCB" "configure Qdiscs" diff --git a/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh b/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh index 159108d02895..e9043771787b 100755 --- a/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh +++ b/tools/testing/selftests/drivers/net/mlxsw/sch_red_root.sh @@ -32,45 +32,51 @@ uninstall_qdisc() ecn_test() { install_qdisc ecn + defer uninstall_qdisc + do_ecn_test 10 $BACKLOG - uninstall_qdisc } ecn_test_perband() { install_qdisc ecn + defer uninstall_qdisc + do_ecn_test_perband 10 $BACKLOG - uninstall_qdisc } ecn_nodrop_test() { install_qdisc ecn nodrop + defer uninstall_qdisc + do_ecn_nodrop_test 10 $BACKLOG - uninstall_qdisc } red_test() { install_qdisc + defer uninstall_qdisc + do_red_test 10 $BACKLOG - uninstall_qdisc } mc_backlog_test() { install_qdisc + defer uninstall_qdisc + # Note that the backlog value here does not correspond to RED # configuration, but is arbitrary. do_mc_backlog_test 10 $BACKLOG - uninstall_qdisc } red_mirror_test() { install_qdisc qevent early_drop block 10 + defer uninstall_qdisc + do_drop_mirror_test 10 $BACKLOG - uninstall_qdisc } bail_on_lldpad "configure DCB" "configure Qdiscs" diff --git a/tools/testing/selftests/net/forwarding/sch_red.sh b/tools/testing/selftests/net/forwarding/sch_red.sh index 17f28644568e..af166662b78a 100755 --- a/tools/testing/selftests/net/forwarding/sch_red.sh +++ b/tools/testing/selftests/net/forwarding/sch_red.sh @@ -53,71 +53,63 @@ PKTSZ=1400 h1_create() { simple_if_init $h1 192.0.2.1/28 + defer simple_if_fini $h1 192.0.2.1/28 + mtu_set $h1 10000 + defer mtu_restore $h1 + tc qdisc replace dev $h1 root handle 1: tbf \ rate 10Mbit burst 10K limit 1M -} - -h1_destroy() -{ - tc qdisc del dev $h1 root - mtu_restore $h1 - simple_if_fini $h1 192.0.2.1/28 + defer tc qdisc del dev $h1 root } h2_create() { simple_if_init $h2 192.0.2.2/28 - mtu_set $h2 10000 -} + defer simple_if_fini $h2 192.0.2.2/28 -h2_destroy() -{ - mtu_restore $h2 - simple_if_fini $h2 192.0.2.2/28 + mtu_set $h2 10000 + defer mtu_restore $h2 } h3_create() { simple_if_init $h3 192.0.2.3/28 - mtu_set $h3 10000 -} + defer simple_if_fini $h3 192.0.2.3/28 -h3_destroy() -{ - mtu_restore $h3 - simple_if_fini $h3 192.0.2.3/28 + mtu_set $h3 10000 + defer mtu_restore $h3 } switch_create() { ip link add dev br up type bridge + defer ip link del dev br + ip link set dev $swp1 up master br + defer ip link set dev $swp1 down nomaster + ip link set dev $swp2 up master br + defer ip link set dev $swp2 down nomaster + ip link set dev $swp3 up master br + defer ip link set dev $swp3 down nomaster mtu_set $swp1 10000 + defer mtu_restore $h1 + mtu_set $swp2 10000 + defer mtu_restore $h2 + mtu_set $swp3 10000 + defer mtu_restore $h3 tc qdisc replace dev $swp3 root handle 1: tbf \ rate 10Mbit burst 10K limit 1M - ip link add name _drop_test up type dummy -} + defer tc qdisc del dev $swp3 root -switch_destroy() -{ - ip link del dev _drop_test - tc qdisc del dev $swp3 root - - mtu_restore $h3 - mtu_restore $h2 - mtu_restore $h1 - - ip link set dev $swp3 down nomaster - ip link set dev $swp2 down nomaster - ip link set dev $swp1 down nomaster - ip link del dev br + ip link add name _drop_test up type dummy + defer ip link del dev _drop_test } setup_prepare() @@ -134,6 +126,7 @@ setup_prepare() h3_mac=$(mac_get $h3) vrf_prepare + defer vrf_cleanup h1_create h2_create @@ -141,18 +134,6 @@ setup_prepare() switch_create } -cleanup() -{ - pre_cleanup - - switch_destroy - h3_destroy - h2_destroy - h1_destroy - - vrf_cleanup -} - ping_ipv4() { ping_test $h1 192.0.2.3 " from host 1" @@ -287,6 +268,7 @@ do_ecn_test() $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \ -a own -b $h3_mac -t tcp -q tos=0x01 & + defer stop_traffic $! sleep 1 ecn_test_common "$name" $limit @@ -298,9 +280,6 @@ do_ecn_test() build_backlog $((2 * limit)) udp >/dev/null check_fail $? "UDP traffic went into backlog instead of being early-dropped" log_test "$name backlog > limit: UDP early-dropped" - - stop_traffic - sleep 1 } do_ecn_nodrop_test() @@ -310,6 +289,7 @@ do_ecn_nodrop_test() $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \ -a own -b $h3_mac -t tcp -q tos=0x01 & + defer stop_traffic $! sleep 1 ecn_test_common "$name" $limit @@ -321,9 +301,6 @@ do_ecn_nodrop_test() build_backlog $((2 * limit)) udp >/dev/null check_err $? "UDP traffic was early-dropped instead of getting into backlog" log_test "$name backlog > limit: UDP not dropped" - - stop_traffic - sleep 1 } do_red_test() @@ -336,6 +313,7 @@ do_red_test() # is above limit. $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \ -a own -b $h3_mac -t tcp -q tos=0x01 & + defer stop_traffic $! # Pushing below the queue limit should work. RET=0 @@ -352,9 +330,6 @@ do_red_test() pct=$(check_marking "== 0") check_err $? "backlog $backlog / $limit Got $pct% marked packets, expected == 0." log_test "RED backlog > limit" - - stop_traffic - sleep 1 } do_red_qevent_test() @@ -369,6 +344,7 @@ do_red_qevent_test() $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \ -a own -b $h3_mac -t udp -q & + defer stop_traffic $! sleep 1 tc filter add block 10 pref 1234 handle 102 matchall skip_hw \ @@ -396,9 +372,6 @@ do_red_qevent_test() check_err $? "Dropped packets still observed: 0 expected, $((now - base)) seen" log_test "RED early_dropped packets mirrored" - - stop_traffic - sleep 1 } do_ecn_qevent_test() @@ -410,6 +383,7 @@ do_ecn_qevent_test() $MZ $h1 -p $PKTSZ -A 192.0.2.1 -B 192.0.2.3 -c 0 \ -a own -b $h3_mac -t tcp -q tos=0x01 & + defer stop_traffic $! sleep 1 tc filter add block 10 pref 1234 handle 102 matchall skip_hw \ @@ -428,9 +402,6 @@ do_ecn_qevent_test() tc filter del block 10 pref 1234 handle 102 matchall log_test "ECN marked packets mirrored" - - stop_traffic - sleep 1 } install_qdisc() @@ -451,36 +422,36 @@ uninstall_qdisc() ecn_test() { install_qdisc ecn + defer uninstall_qdisc xfail_on_slow do_ecn_test $BACKLOG - uninstall_qdisc } ecn_nodrop_test() { install_qdisc ecn nodrop + defer uninstall_qdisc xfail_on_slow do_ecn_nodrop_test $BACKLOG - uninstall_qdisc } red_test() { install_qdisc + defer uninstall_qdisc xfail_on_slow do_red_test $BACKLOG - uninstall_qdisc } red_qevent_test() { install_qdisc qevent early_drop block 10 + defer uninstall_qdisc xfail_on_slow do_red_qevent_test $BACKLOG - uninstall_qdisc } ecn_qevent_test() { install_qdisc ecn qevent mark block 10 + defer uninstall_qdisc xfail_on_slow do_ecn_qevent_test $BACKLOG - uninstall_qdisc } trap cleanup EXIT -- 2.51.0 From a1b3741dcfd16bf1e337c89b9fca5fbb9110fbed Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Thu, 17 Oct 2024 11:45:47 +0200 Subject: [PATCH 16/16] selftests: TBF: Use defer for test cleanup Use the defer framework to schedule cleanups as soon as the command is executed. Signed-off-by: Petr Machata Signed-off-by: Paolo Abeni --- .../selftests/net/forwarding/sch_tbf_core.sh | 91 ++++++------------- .../net/forwarding/sch_tbf_etsprio.sh | 7 +- .../selftests/net/forwarding/sch_tbf_root.sh | 3 +- 3 files changed, 36 insertions(+), 65 deletions(-) diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh index 9cd884d4a5de..ec309a5086bc 100644 --- a/tools/testing/selftests/net/forwarding/sch_tbf_core.sh +++ b/tools/testing/selftests/net/forwarding/sch_tbf_core.sh @@ -60,68 +60,65 @@ host_create() local host=$1; shift simple_if_init $dev + defer simple_if_fini $dev + mtu_set $dev 10000 + defer mtu_restore $dev vlan_create $dev 10 v$dev $(ipaddr $host 10)/28 + defer vlan_destroy $dev 10 ip link set dev $dev.10 type vlan egress 0:0 vlan_create $dev 11 v$dev $(ipaddr $host 11)/28 + defer vlan_destroy $dev 11 ip link set dev $dev.11 type vlan egress 0:1 } -host_destroy() -{ - local dev=$1; shift - - vlan_destroy $dev 11 - vlan_destroy $dev 10 - mtu_restore $dev - simple_if_fini $dev -} - h1_create() { host_create $h1 1 } -h1_destroy() -{ - host_destroy $h1 -} - h2_create() { host_create $h2 2 tc qdisc add dev $h2 clsact + defer tc qdisc del dev $h2 clsact + tc filter add dev $h2 ingress pref 1010 prot 802.1q \ flower $TCFLAGS vlan_id 10 action pass tc filter add dev $h2 ingress pref 1011 prot 802.1q \ flower $TCFLAGS vlan_id 11 action pass } -h2_destroy() -{ - tc qdisc del dev $h2 clsact - host_destroy $h2 -} - switch_create() { local intf local vlan ip link add dev br10 type bridge + defer ip link del dev br10 + ip link add dev br11 type bridge + defer ip link del dev br11 for intf in $swp1 $swp2; do ip link set dev $intf up + defer ip link set dev $intf down + mtu_set $intf 10000 + defer mtu_restore $intf for vlan in 10 11; do vlan_create $intf $vlan + defer vlan_destroy $intf $vlan + ip link set dev $intf.$vlan master br$vlan + defer ip link set dev $intf.$vlan nomaster + ip link set dev $intf.$vlan up + defer ip link set dev $intf.$vlan down done done @@ -130,34 +127,10 @@ switch_create() done ip link set dev br10 up - ip link set dev br11 up -} - -switch_destroy() -{ - local intf - local vlan - - # A test may have been interrupted mid-run, with Qdisc installed. Delete - # it here. - tc qdisc del dev $swp2 root 2>/dev/null - - ip link set dev br11 down - ip link set dev br10 down + defer ip link set dev br10 down - for intf in $swp2 $swp1; do - for vlan in 11 10; do - ip link set dev $intf.$vlan down - ip link set dev $intf.$vlan nomaster - vlan_destroy $intf $vlan - done - - mtu_restore $intf - ip link set dev $intf down - done - - ip link del dev br11 - ip link del dev br10 + ip link set dev br11 up + defer ip link set dev br11 down } setup_prepare() @@ -177,23 +150,13 @@ setup_prepare() h2_mac=$(mac_get $h2) vrf_prepare + defer vrf_cleanup h1_create h2_create switch_create } -cleanup() -{ - pre_cleanup - - switch_destroy - h2_destroy - h1_destroy - - vrf_cleanup -} - ping_ipv4() { ping_test $h1.10 $(ipaddr 2 10) " vlan 10" @@ -207,18 +170,18 @@ tbf_get_counter() tc_rule_stats_get $h2 10$vlan ingress .bytes } -do_tbf_test() +__tbf_test() { local vlan=$1; shift local mbit=$1; shift start_traffic $h1.$vlan $(ipaddr 1 $vlan) $(ipaddr 2 $vlan) $h2_mac + defer stop_traffic $! sleep 5 # Wait for the burst to dwindle local t2=$(busywait_for_counter 1000 +1 tbf_get_counter $vlan) sleep 10 local t3=$(tbf_get_counter $vlan) - stop_traffic RET=0 @@ -231,3 +194,9 @@ do_tbf_test() log_test "TC $((vlan - 10)): TBF rate ${mbit}Mbit" } + +do_tbf_test() +{ + in_defer_scope \ + __tbf_test "$@" +} diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh b/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh index df9bcd6a811a..c182a04282bc 100644 --- a/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh +++ b/tools/testing/selftests/net/forwarding/sch_tbf_etsprio.sh @@ -30,8 +30,9 @@ tbf_test() # This test is used for both ETS and PRIO. Even though we only need two # bands, PRIO demands a minimum of three. tc qdisc add dev $swp2 root handle 10: $QDISC 3 priomap 2 1 0 + defer tc qdisc del dev $swp2 root + tbf_test_one 128K - tc qdisc del dev $swp2 root } tbf_root_test() @@ -42,6 +43,8 @@ tbf_root_test() tc qdisc replace dev $swp2 root handle 1: \ tbf rate 400Mbit burst $bs limit 1M + defer tc qdisc del dev $swp2 root + tc qdisc replace dev $swp2 parent 1:1 handle 10: \ $QDISC 3 priomap 2 1 0 tc qdisc replace dev $swp2 parent 10:3 handle 103: \ @@ -53,8 +56,6 @@ tbf_root_test() do_tbf_test 10 400 $bs do_tbf_test 11 400 $bs - - tc qdisc del dev $swp2 root } if type -t sch_tbf_pre_hook >/dev/null; then diff --git a/tools/testing/selftests/net/forwarding/sch_tbf_root.sh b/tools/testing/selftests/net/forwarding/sch_tbf_root.sh index 96c997be0d03..9f20320f8d84 100755 --- a/tools/testing/selftests/net/forwarding/sch_tbf_root.sh +++ b/tools/testing/selftests/net/forwarding/sch_tbf_root.sh @@ -14,13 +14,14 @@ tbf_test_one() tc qdisc replace dev $swp2 root handle 108: tbf \ rate 400Mbit burst $bs limit 1M + defer tc qdisc del dev $swp2 root + do_tbf_test 10 400 $bs } tbf_test() { tbf_test_one 128K - tc qdisc del dev $swp2 root } if type -t sch_tbf_pre_hook >/dev/null; then -- 2.51.0