From 986a93045183ae2f13e6d99d990ae8be36f6d6b0 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 7 Mar 2025 01:12:10 +0000 Subject: [PATCH 01/16] virtio-net: Refactor napi_disable paths Create virtnet_napi_disable helper and refactor virtnet_napi_tx_disable to take a struct send_queue. Signed-off-by: Joe Damato Acked-by: Michael S. Tsirkin Acked-by: Jason Wang Tested-by: Lei Yang Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20250307011215.266806-3-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 133b004c7a9a..e578885c1093 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2845,12 +2845,21 @@ static void virtnet_napi_tx_enable(struct send_queue *sq) virtnet_napi_do_enable(sq->vq, napi); } -static void virtnet_napi_tx_disable(struct napi_struct *napi) +static void virtnet_napi_tx_disable(struct send_queue *sq) { + struct napi_struct *napi = &sq->napi; + if (napi->weight) napi_disable(napi); } +static void virtnet_napi_disable(struct receive_queue *rq) +{ + struct napi_struct *napi = &rq->napi; + + napi_disable(napi); +} + static void refill_work(struct work_struct *work) { struct virtnet_info *vi = @@ -2861,7 +2870,7 @@ static void refill_work(struct work_struct *work) for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i]; - napi_disable(&rq->napi); + virtnet_napi_disable(rq); still_empty = !try_fill_recv(vi, rq, GFP_KERNEL); virtnet_napi_enable(rq); @@ -3060,8 +3069,8 @@ static int virtnet_poll(struct napi_struct *napi, int budget) static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index) { - virtnet_napi_tx_disable(&vi->sq[qp_index].napi); - napi_disable(&vi->rq[qp_index].napi); + virtnet_napi_tx_disable(&vi->sq[qp_index]); + virtnet_napi_disable(&vi->rq[qp_index]); 
xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq); } @@ -3333,7 +3342,7 @@ static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq) bool running = netif_running(vi->dev); if (running) { - napi_disable(&rq->napi); + virtnet_napi_disable(rq); virtnet_cancel_dim(vi, &rq->dim); } } @@ -3375,7 +3384,7 @@ static void virtnet_tx_pause(struct virtnet_info *vi, struct send_queue *sq) qindex = sq - vi->sq; if (running) - virtnet_napi_tx_disable(&sq->napi); + virtnet_napi_tx_disable(sq); txq = netdev_get_tx_queue(vi->dev, qindex); @@ -5952,8 +5961,8 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, /* Make sure NAPI is not using any XDP TX queues for RX. */ if (netif_running(dev)) { for (i = 0; i < vi->max_queue_pairs; i++) { - napi_disable(&vi->rq[i].napi); - virtnet_napi_tx_disable(&vi->sq[i].napi); + virtnet_napi_disable(&vi->rq[i]); + virtnet_napi_tx_disable(&vi->sq[i]); } } -- 2.51.0 From e7231f49d526823975bbfe0abde5c50b7e8dfe3a Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 7 Mar 2025 01:12:11 +0000 Subject: [PATCH 02/16] virtio-net: Map NAPIs to queues Use netif_queue_set_napi to map NAPIs to queue IDs so that the mapping can be accessed by user apps. Note that the netif_queue_set_napi currently requires RTNL, so care must be taken to ensure RTNL is held on paths where this API might be reached. The paths in the driver where this API can be reached appear to be: - ndo_open, ndo_close, which hold RTNL so no driver change is needed. - rx_pause, rx_resume, tx_pause, tx_resume are reached either via an ethtool ioctl or via XSK - neither path requires a driver change. - power management paths (which call open and close), which have been updated to hold/release RTNL. 
$ ethtool -i ens4 | grep driver driver: virtio_net $ sudo ethtool -L ens4 combined 4 $ ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/netdev.yaml \ --dump queue-get --json='{"ifindex": 2}' [{'id': 0, 'ifindex': 2, 'napi-id': 8289, 'type': 'rx'}, {'id': 1, 'ifindex': 2, 'napi-id': 8290, 'type': 'rx'}, {'id': 2, 'ifindex': 2, 'napi-id': 8291, 'type': 'rx'}, {'id': 3, 'ifindex': 2, 'napi-id': 8292, 'type': 'rx'}, {'id': 0, 'ifindex': 2, 'type': 'tx'}, {'id': 1, 'ifindex': 2, 'type': 'tx'}, {'id': 2, 'ifindex': 2, 'type': 'tx'}, {'id': 3, 'ifindex': 2, 'type': 'tx'}] Note that virtio_net has TX-only NAPIs which do not have NAPI IDs, so the lack of 'napi-id' in the above output is expected. Signed-off-by: Joe Damato Acked-by: Jason Wang Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20250307011215.266806-4-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index e578885c1093..7bd63a677123 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2823,13 +2823,18 @@ static void virtnet_napi_do_enable(struct virtqueue *vq, static void virtnet_napi_enable(struct receive_queue *rq) { + struct virtnet_info *vi = rq->vq->vdev->priv; + int qidx = vq2rxq(rq->vq); + virtnet_napi_do_enable(rq->vq, &rq->napi); + netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, &rq->napi); } static void virtnet_napi_tx_enable(struct send_queue *sq) { struct virtnet_info *vi = sq->vq->vdev->priv; struct napi_struct *napi = &sq->napi; + int qidx = vq2txq(sq->vq); if (!napi->weight) return; @@ -2843,20 +2848,28 @@ static void virtnet_napi_tx_enable(struct send_queue *sq) } virtnet_napi_do_enable(sq->vq, napi); + netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, napi); } static void virtnet_napi_tx_disable(struct send_queue *sq) { + struct virtnet_info *vi = 
sq->vq->vdev->priv; struct napi_struct *napi = &sq->napi; + int qidx = vq2txq(sq->vq); - if (napi->weight) + if (napi->weight) { + netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, NULL); napi_disable(napi); + } } static void virtnet_napi_disable(struct receive_queue *rq) { + struct virtnet_info *vi = rq->vq->vdev->priv; struct napi_struct *napi = &rq->napi; + int qidx = vq2rxq(rq->vq); + netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, NULL); napi_disable(napi); } @@ -2870,9 +2883,23 @@ static void refill_work(struct work_struct *work) for (i = 0; i < vi->curr_queue_pairs; i++) { struct receive_queue *rq = &vi->rq[i]; - virtnet_napi_disable(rq); + /* + * When queue API support is added in the future and the call + * below becomes napi_disable_locked, this driver will need to + * be refactored. + * + * One possible solution would be to: + * - cancel refill_work with cancel_delayed_work (note: + * non-sync) + * - cancel refill_work with cancel_delayed_work_sync in + * virtnet_remove after the netdev is unregistered + * - wrap all of the work in a lock (perhaps the netdev + * instance lock) + * - check netif_running() and return early to avoid a race + */ + napi_disable(&rq->napi); still_empty = !try_fill_recv(vi, rq, GFP_KERNEL); - virtnet_napi_enable(rq); + virtnet_napi_do_enable(rq->vq, &rq->napi); /* In theory, this can happen: if we don't get any buffers in * we will *never* try to fill again. 
@@ -5650,8 +5677,11 @@ static void virtnet_freeze_down(struct virtio_device *vdev) netif_tx_lock_bh(vi->dev); netif_device_detach(vi->dev); netif_tx_unlock_bh(vi->dev); - if (netif_running(vi->dev)) + if (netif_running(vi->dev)) { + rtnl_lock(); virtnet_close(vi->dev); + rtnl_unlock(); + } } static int init_vqs(struct virtnet_info *vi); @@ -5671,7 +5701,9 @@ static int virtnet_restore_up(struct virtio_device *vdev) enable_rx_mode_work(vi); if (netif_running(vi->dev)) { + rtnl_lock(); err = virtnet_open(vi->dev); + rtnl_unlock(); if (err) return err; } -- 2.51.0 From d5d715207e2911b84b92f10420d4a8d7653aa98d Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Fri, 7 Mar 2025 01:12:12 +0000 Subject: [PATCH 03/16] virtio_net: Use persistent NAPI config Use persistent NAPI config so that NAPI IDs are not renumbered as queue counts change. $ sudo ethtool -l ens4 | tail -5 | egrep -i '(current|combined)' Current hardware settings: Combined: 4 $ ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/netdev.yaml \ --dump queue-get --json='{"ifindex": 2}' [{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'}, {'id': 1, 'ifindex': 2, 'napi-id': 8194, 'type': 'rx'}, {'id': 2, 'ifindex': 2, 'napi-id': 8195, 'type': 'rx'}, {'id': 3, 'ifindex': 2, 'napi-id': 8196, 'type': 'rx'}, {'id': 0, 'ifindex': 2, 'type': 'tx'}, {'id': 1, 'ifindex': 2, 'type': 'tx'}, {'id': 2, 'ifindex': 2, 'type': 'tx'}, {'id': 3, 'ifindex': 2, 'type': 'tx'}] Now adjust the queue count, note that the NAPI IDs are not renumbered: $ sudo ethtool -L ens4 combined 1 $ ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/netdev.yaml \ --dump queue-get --json='{"ifindex": 2}' [{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'}, {'id': 0, 'ifindex': 2, 'type': 'tx'}] $ sudo ethtool -L ens4 combined 8 $ ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/netdev.yaml \ --dump queue-get --json='{"ifindex": 2}' [{'id': 0, 'ifindex': 2, 'napi-id': 8193, 'type': 'rx'}, {'id': 1, 
'ifindex': 2, 'napi-id': 8194, 'type': 'rx'}, {'id': 2, 'ifindex': 2, 'napi-id': 8195, 'type': 'rx'}, {'id': 3, 'ifindex': 2, 'napi-id': 8196, 'type': 'rx'}, {'id': 4, 'ifindex': 2, 'napi-id': 8197, 'type': 'rx'}, {'id': 5, 'ifindex': 2, 'napi-id': 8198, 'type': 'rx'}, {'id': 6, 'ifindex': 2, 'napi-id': 8199, 'type': 'rx'}, {'id': 7, 'ifindex': 2, 'napi-id': 8200, 'type': 'rx'}, [...] Signed-off-by: Joe Damato Reviewed-by: Gerhard Engleder Acked-by: Jason Wang Acked-by: Michael S. Tsirkin Tested-by: Lei Yang Reviewed-by: Xuan Zhuo Link: https://patch.msgid.link/20250307011215.266806-5-jdamato@fastly.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 7bd63a677123..34cec2b11b74 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -6455,8 +6455,9 @@ static int virtnet_alloc_queues(struct virtnet_info *vi) INIT_DELAYED_WORK(&vi->refill, refill_work); for (i = 0; i < vi->max_queue_pairs; i++) { vi->rq[i].pages = NULL; - netif_napi_add_weight(vi->dev, &vi->rq[i].napi, virtnet_poll, - napi_weight); + netif_napi_add_config(vi->dev, &vi->rq[i].napi, virtnet_poll, + i); + vi->rq[i].napi.weight = napi_weight; netif_napi_add_tx_weight(vi->dev, &vi->sq[i].napi, virtnet_poll_tx, napi_tx ? napi_weight : 0); -- 2.51.0 From 54580ccdd8a9c6821fd6f72171d435480867e4c3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 6 Mar 2025 22:34:08 -0500 Subject: [PATCH 04/16] ipv6: remove leftover ip6 cookie initializer As of the blamed commit ipc6.dontfrag is always initialized at the start of udpv6_sendmsg, by ipcm6_init_sk, to either 0 or 1. Later checks against -1 are no longer needed and the branches are now dead code. The blamed commit had removed those branches. But I had overlooked this one case. UDP has both a lockless fast path and a slower path for corked requests. This branch remained in the fast path. 
Fixes: 096208592b09 ("ipv6: replace ipcm6_init calls with ipcm6_init_sk") Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250307033620.411611-2-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_output.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d577bf2f3053..d91da522c34e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -2054,8 +2054,6 @@ struct sk_buff *ip6_make_skb(struct sock *sk, ip6_cork_release(cork, &v6_cork); return ERR_PTR(err); } - if (ipc6->dontfrag < 0) - ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk); err = __ip6_append_data(sk, &queue, cork, &v6_cork, &current->task_frag, getfrag, from, -- 2.51.0 From a18dfa9925b9ef6107ea3aa5814ca3c704d34a8a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 6 Mar 2025 22:34:09 -0500 Subject: [PATCH 05/16] ipv6: save dontfrag in cork When spanning datagram construction over multiple send calls using MSG_MORE, per datagram settings are configured on the first send. That is when ip(6)_setup_cork stores these settings for subsequent use in __ip(6)_append_data and others. The only flag that escaped this was dontfrag. As a result, a datagram could be constructed with df=0 on the first sendmsg, but df=1 on a next. Which is what cmsg_ip.sh does in an upcoming MSG_MORE test in the "diff" scenario. Changing datagram conditions in the middle of constructing an skb makes this already complex code path even more convoluted. It is here unintentional. Bring this flag in line with expected sockopt/cmsg behavior. And stop passing ipc6 to __ip6_append_data, to avoid such issues in the future. This is already the case for __ip_append_data. inet6_cork had a 6 byte hole, so the 1B flag has no impact. 
Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250307033620.411611-3-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/ipv6.h | 1 + net/ipv6/ip6_output.c | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index a6e2aadbb91b..5aeeed22f35b 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -207,6 +207,7 @@ struct inet6_cork { struct ipv6_txoptions *opt; u8 hop_limit; u8 tclass; + u8 dontfrag:1; }; /* struct ipv6_pinfo - ipv6 private area */ diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d91da522c34e..581bc6289081 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1386,6 +1386,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, } v6_cork->hop_limit = ipc6->hlimit; v6_cork->tclass = ipc6->tclass; + v6_cork->dontfrag = ipc6->dontfrag; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 
READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); @@ -1421,7 +1422,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, - unsigned int flags, struct ipcm6_cookie *ipc6) + unsigned int flags) { struct sk_buff *skb, *skb_prev = NULL; struct inet_cork *cork = &cork_full->base; @@ -1475,7 +1476,7 @@ static int __ip6_append_data(struct sock *sk, if (headersize + transhdrlen > mtu) goto emsgsize; - if (cork->length + length > mtu - headersize && ipc6->dontfrag && + if (cork->length + length > mtu - headersize && v6_cork->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_ICMPV6 || sk->sk_protocol == IPPROTO_RAW)) { @@ -1855,7 +1856,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, ipc6); + from, length, transhdrlen, flags); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -2058,7 +2059,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, err = __ip6_append_data(sk, &queue, cork, &v6_cork, &current->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, ipc6); + flags); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); -- 2.51.0 From 0922cb68edfde9e3920bb3aedea203d333af9f10 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 6 Mar 2025 22:34:10 -0500 Subject: [PATCH 06/16] selftests/net: expand cmsg_ip with MSG_MORE UDP send with MSG_MORE takes a slightly different path than the lockless fast path. For completeness, add coverage to this case too. Pass MSG_MORE on the initial sendmsg, then follow up with a zero byte write to unplug the cork. Unrelated: also add two missing endlines in usage(). 
Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250307033620.411611-4-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/cmsg_ip.sh | 11 +++++++---- tools/testing/selftests/net/cmsg_sender.c | 24 ++++++++++++++++++----- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/net/cmsg_ip.sh b/tools/testing/selftests/net/cmsg_ip.sh index 2a52520aca32..b55680e081ad 100755 --- a/tools/testing/selftests/net/cmsg_ip.sh +++ b/tools/testing/selftests/net/cmsg_ip.sh @@ -50,8 +50,9 @@ check_result() { # IPV6_DONTFRAG for ovr in setsock cmsg both diff; do for df in 0 1; do - for p in u i r; do + for p in u U i r; do [ $p == "u" ] && prot=UDP + [ $p == "U" ] && prot=UDP [ $p == "i" ] && prot=ICMP [ $p == "r" ] && prot=RAW @@ -81,8 +82,9 @@ test_dscp() { ip $IPVER -netns $NS route add table 300 prohibit any for ovr in setsock cmsg both diff; do - for p in u i r; do + for p in u U i r; do [ $p == "u" ] && prot=UDP + [ $p == "U" ] && prot=UDP [ $p == "i" ] && prot=ICMP [ $p == "r" ] && prot=RAW @@ -134,8 +136,9 @@ test_ttl_hoplimit() { local -r LIM=4 for ovr in setsock cmsg both diff; do - for p in u i r; do + for p in u U i r; do [ $p == "u" ] && prot=UDP + [ $p == "U" ] && prot=UDP [ $p == "i" ] && prot=ICMP [ $p == "r" ] && prot=RAW @@ -166,7 +169,7 @@ test_ttl_hoplimit -4 $TGT4 ttl test_ttl_hoplimit -6 $TGT6 hlim # IPV6 exthdr -for p in u i r; do +for p in u U i r; do # Very basic "does it crash" test for h in h d r; do $NSEXE ./cmsg_sender -p $p -6 -H $h $TGT6 1234 diff --git a/tools/testing/selftests/net/cmsg_sender.c b/tools/testing/selftests/net/cmsg_sender.c index 19bd8499031b..a825e628aee7 100644 --- a/tools/testing/selftests/net/cmsg_sender.c +++ b/tools/testing/selftests/net/cmsg_sender.c @@ -33,6 +33,7 @@ enum { ERN_RECVERR, ERN_CMSG_RD, ERN_CMSG_RCV, + ERN_SEND_MORE, }; struct option_cmsg_u32 { @@ -46,6 +47,7 @@ struct options { const char *service; 
unsigned int size; unsigned int num_pkt; + bool msg_more; struct { unsigned int mark; unsigned int dontfrag; @@ -94,7 +96,8 @@ static void __attribute__((noreturn)) cs_usage(const char *bin) "\t\t-S send() size\n" "\t\t-4/-6 Force IPv4 / IPv6 only\n" "\t\t-p prot Socket protocol\n" - "\t\t (u = UDP (default); i = ICMP; r = RAW)\n" + "\t\t (u = UDP (default); i = ICMP; r = RAW;\n" + "\t\t U = UDP with MSG_MORE)\n" "\n" "\t\t-m val Set SO_MARK with given value\n" "\t\t-M val Set SO_MARK via setsockopt\n" @@ -109,8 +112,8 @@ static void __attribute__((noreturn)) cs_usage(const char *bin) "\t\t-l val Set TTL/HOPLIMIT via cmsg\n" "\t\t-L val Set TTL/HOPLIMIT via setsockopt\n" "\t\t-H type Add an IPv6 header option\n" - "\t\t (h = HOP; d = DST; r = RTDST)" - ""); + "\t\t (h = HOP; d = DST; r = RTDST)\n" + "\n"); exit(ERN_HELP); } @@ -133,8 +136,11 @@ static void cs_parse_args(int argc, char *argv[]) opt.sock.family = AF_INET6; break; case 'p': - if (*optarg == 'u' || *optarg == 'U') { + if (*optarg == 'u') { opt.sock.proto = IPPROTO_UDP; + } else if (*optarg == 'U') { + opt.sock.proto = IPPROTO_UDP; + opt.msg_more = true; } else if (*optarg == 'i' || *optarg == 'I') { opt.sock.proto = IPPROTO_ICMP; } else if (*optarg == 'r') { @@ -531,7 +537,7 @@ int main(int argc, char *argv[]) cs_write_cmsg(fd, &msg, cbuf, sizeof(cbuf)); for (i = 0; i < opt.num_pkt; i++) { - err = sendmsg(fd, &msg, 0); + err = sendmsg(fd, &msg, opt.msg_more ? 
MSG_MORE : 0); if (err < 0) { if (!opt.silent_send) fprintf(stderr, "send failed: %s\n", strerror(errno)); @@ -542,6 +548,14 @@ int main(int argc, char *argv[]) err = ERN_SEND_SHORT; goto err_out; } + if (opt.msg_more) { + err = write(fd, NULL, 0); + if (err < 0) { + fprintf(stderr, "send more: %s\n", strerror(errno)); + err = ERN_SEND_MORE; + goto err_out; + } + } } err = ERN_SUCCESS; -- 2.51.0 From 473367a5ffe1607a61be481e2feda684eb5faea9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 7 Mar 2025 08:29:47 +0100 Subject: [PATCH 07/16] r8169: increase max jumbo packet size on RTL8125/RTL8126 Realtek confirmed that all RTL8125/RTL8126 chip versions support up to 16K jumbo packets. Reflect this in the driver. Tested by Rui on RTL8125B with 12K jumbo packets. Suggested-by: Rui Salvaterra Tested-by: Rui Salvaterra Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Link: https://patch.msgid.link/396762ad-cc65-4e60-b01e-8847db89e98b@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index fa339bd8c775..b18daeeda40d 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -89,6 +89,7 @@ #define JUMBO_6K (6 * SZ_1K - VLAN_ETH_HLEN - ETH_FCS_LEN) #define JUMBO_7K (7 * SZ_1K - VLAN_ETH_HLEN - ETH_FCS_LEN) #define JUMBO_9K (9 * SZ_1K - VLAN_ETH_HLEN - ETH_FCS_LEN) +#define JUMBO_16K (SZ_16K - VLAN_ETH_HLEN - ETH_FCS_LEN) static const struct { const char *name; @@ -5360,6 +5361,9 @@ static int rtl_jumbo_max(struct rtl8169_private *tp) /* RTL8168c */ case RTL_GIGA_MAC_VER_18 ... RTL_GIGA_MAC_VER_24: return JUMBO_6K; + /* RTL8125/8126 */ + case RTL_GIGA_MAC_VER_61 ... 
RTL_GIGA_MAC_VER_71: + return JUMBO_16K; default: return JUMBO_9K; } -- 2.51.0 From 991a1b09920bc15c66f64c1e7d15cdabd3816c46 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 7 Mar 2025 12:28:48 +0300 Subject: [PATCH 08/16] eth: fbnic: fix memory corruption in fbnic_tlv_attr_get_string() This code is trying to ensure that the last byte of the buffer is a NUL terminator. However, the problem is that attr->value[] is an array of __le32, not char, so it zeroes out 4 bytes way beyond the end of the buffer. Cast the buffer to char to address this. Fixes: e5cf5107c9e4 ("eth: fbnic: Update fbnic_tlv_attr_get_string() to work like nla_strscpy()") Signed-off-by: Dan Carpenter Reviewed-by: Lee Trager Link: https://patch.msgid.link/2791d4be-ade4-4e50-9b12-33307d8410f6@stanley.mountain Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/meta/fbnic/fbnic_tlv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/meta/fbnic/fbnic_tlv.c b/drivers/net/ethernet/meta/fbnic/fbnic_tlv.c index d558d176e0df..517ed8b2f1cb 100644 --- a/drivers/net/ethernet/meta/fbnic/fbnic_tlv.c +++ b/drivers/net/ethernet/meta/fbnic/fbnic_tlv.c @@ -261,7 +261,7 @@ ssize_t fbnic_tlv_attr_get_string(struct fbnic_tlv_msg *attr, char *dst, return -E2BIG; srclen = le16_to_cpu(attr->hdr.len) - sizeof(*attr); - if (srclen > 0 && attr->value[srclen - 1] == '\0') + if (srclen > 0 && ((char *)attr->value)[srclen - 1] == '\0') srclen--; if (srclen >= dstsize) { -- 2.51.0 From 7462fe22cc74321eb663768848976d42eba3ddbb Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 7 Mar 2025 12:21:45 +0100 Subject: [PATCH 09/16] mptcp: pm: use addr entry for get_local_id The following code in mptcp_userspace_pm_get_local_id() that assigns "skc" to "new_entry" is not allowed in BPF if we use the same code to implement the get_local_id() interface of a BPF path manager: memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); new_entry.addr = *skc; new_entry.addr.id = 0; 
new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; To solve the issue, this patch moves this assignment to "new_entry" forward to mptcp_pm_get_local_id(), and then passing "new_entry" as a parameter to both mptcp_pm_nl_get_local_id() and mptcp_userspace_pm_get_local_id(). No behavioural changes intended. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-1-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 9 ++++++--- net/mptcp/pm_netlink.c | 11 ++++------- net/mptcp/pm_userspace.c | 17 ++++++----------- net/mptcp/protocol.h | 6 ++++-- 4 files changed, 20 insertions(+), 23 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 6c8cadf84f31..f6030ce04efd 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -406,7 +406,7 @@ out_unlock: int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) { - struct mptcp_addr_info skc_local; + struct mptcp_pm_addr_entry skc_local = { 0 }; struct mptcp_addr_info msk_local; if (WARN_ON_ONCE(!msk)) @@ -416,10 +416,13 @@ int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc) * addr */ mptcp_local_address((struct sock_common *)msk, &msk_local); - mptcp_local_address((struct sock_common *)skc, &skc_local); - if (mptcp_addresses_equal(&msk_local, &skc_local, false)) + mptcp_local_address((struct sock_common *)skc, &skc_local.addr); + if (mptcp_addresses_equal(&msk_local, &skc_local.addr, false)) return 0; + skc_local.addr.id = 0; + skc_local.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; + if (mptcp_pm_is_userspace(msk)) return mptcp_userspace_pm_get_local_id(msk, &skc_local); return mptcp_pm_nl_get_local_id(msk, &skc_local); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a6387bcf848b..23c28e37ab8f 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1150,7 +1150,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, 
return err; } -int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc) +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc) { struct mptcp_pm_addr_entry *entry; struct pm_nl_pernet *pernet; @@ -1159,7 +1160,7 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc pernet = pm_nl_get_pernet_from_msk(msk); rcu_read_lock(); - entry = __lookup_addr(pernet, skc); + entry = __lookup_addr(pernet, &skc->addr); ret = entry ? entry->addr.id : -1; rcu_read_unlock(); if (ret >= 0) @@ -1170,12 +1171,8 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc if (!entry) return -ENOMEM; - entry->addr = *skc; - entry->addr.id = 0; + *entry = *skc; entry->addr.port = 0; - entry->ifindex = 0; - entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; - entry->lsk = NULL; ret = mptcp_pm_nl_append_new_local_addr(pernet, entry, true, false); if (ret < 0) kfree(entry); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 7e7d01bef5d4..8c45eebe9bbc 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -130,27 +130,22 @@ mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id) } int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, - struct mptcp_addr_info *skc) + struct mptcp_pm_addr_entry *skc) { - struct mptcp_pm_addr_entry *entry = NULL, new_entry; __be16 msk_sport = ((struct inet_sock *) inet_sk((struct sock *)msk))->inet_sport; + struct mptcp_pm_addr_entry *entry; spin_lock_bh(&msk->pm.lock); - entry = mptcp_userspace_pm_lookup_addr(msk, skc); + entry = mptcp_userspace_pm_lookup_addr(msk, &skc->addr); spin_unlock_bh(&msk->pm.lock); if (entry) return entry->addr.id; - memset(&new_entry, 0, sizeof(struct mptcp_pm_addr_entry)); - new_entry.addr = *skc; - new_entry.addr.id = 0; - new_entry.flags = MPTCP_PM_ADDR_FLAG_IMPLICIT; - - if (new_entry.addr.port == msk_sport) - new_entry.addr.port = 0; + if (skc->addr.port == 
msk_sport) + skc->addr.port = 0; - return mptcp_userspace_pm_append_new_local_addr(msk, &new_entry, true); + return mptcp_userspace_pm_append_new_local_addr(msk, skc, true); } bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 7b74dedc7936..333d20a018b4 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1121,8 +1121,10 @@ bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb, bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining, struct mptcp_rm_list *rm_list); int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc); -int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); -int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, struct mptcp_addr_info *skc); +int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc); +int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *skc); bool mptcp_pm_is_backup(struct mptcp_sock *msk, struct sock_common *skc); bool mptcp_pm_nl_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk, struct mptcp_addr_info *skc); -- 2.51.0 From fac7a6ddc75740ee3d24c7fa054921f9c495d8c2 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:46 +0100 Subject: [PATCH 10/16] mptcp: pm: remove '_nl' from mptcp_pm_nl_addr_send_ack Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. But here 'mptcp_pm_nl_addr_send_ack()' is not specific to this PM: it is used by both the in-kernel and userspace PMs. To avoid confusions, the '_nl' bit has been removed from the name. No behavioural changes intended. 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-2-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 2 +- net/mptcp/pm_netlink.c | 8 ++++---- net/mptcp/pm_userspace.c | 2 +- net/mptcp/protocol.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index f6030ce04efd..ece706e8ed22 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -57,7 +57,7 @@ int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_ msk->pm.rm_list_tx = *rm_list; rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL); WRITE_ONCE(msk->pm.addr_signal, rm_addr); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); return 0; } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 23c28e37ab8f..a70a688eae84 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -606,7 +606,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) local.addr.id = 0; mptcp_pm_announce_addr(msk, &local.addr, false); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); if (local.flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) signal_and_subflow = true; @@ -740,7 +740,7 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) remote = msk->pm.remote; mptcp_pm_announce_addr(msk, &remote, true); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) return; @@ -781,7 +781,7 @@ bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, return mptcp_addresses_equal(&mpc_remote, remote, remote->port); } -void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) +void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow, *alt = NULL; @@ -942,7 +942,7 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk) } if (pm->status & BIT(MPTCP_PM_ADD_ADDR_SEND_ACK)) { pm->status &= ~BIT(MPTCP_PM_ADD_ADDR_SEND_ACK); - 
mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 8c45eebe9bbc..b41e1aaa1d1c 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -234,7 +234,7 @@ int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info) if (mptcp_pm_alloc_anno_list(msk, &addr_val.addr)) { msk->pm.add_addr_signaled++; mptcp_pm_announce_addr(msk, &addr_val.addr, false); - mptcp_pm_nl_addr_send_ack(msk); + mptcp_pm_addr_send_ack(msk); } spin_unlock_bh(&msk->pm.lock); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 333d20a018b4..2a3eb2392b3b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1008,7 +1008,7 @@ void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *remote); -void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk); +void mptcp_pm_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); -- 2.51.0 From d1734987992c977f1515a5208688aced653c1869 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:47 +0100 Subject: [PATCH 11/16] mptcp: pm: remove '_nl' from mptcp_pm_nl_mp_prio_send_ack Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. But here 'mptcp_pm_nl_mp_prio_send_ack()' is not specific to this PM: it is used by both the in-kernel and userspace PMs. To avoid confusions, the '_nl' bit has been removed from the name. No behavioural changes intended. 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-3-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 10 +++++----- net/mptcp/pm_userspace.c | 4 ++-- net/mptcp/protocol.h | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a70a688eae84..5494b5b409dc 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -808,10 +808,10 @@ void mptcp_pm_addr_send_ack(struct mptcp_sock *msk) mptcp_pm_send_ack(msk, alt, false, false); } -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - struct mptcp_addr_info *rem, - u8 bkup) +int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup) { struct mptcp_subflow_context *subflow; @@ -1936,7 +1936,7 @@ static void mptcp_nl_set_flags(struct net *net, lock_sock(sk); if (changed & MPTCP_PM_ADDR_FLAG_BACKUP) - mptcp_pm_nl_mp_prio_send_ack(msk, &local->addr, NULL, bkup); + mptcp_pm_mp_prio_send_ack(msk, &local->addr, NULL, bkup); /* Subflows will only be recreated if the SUBFLOW flag is set */ if (is_subflow && (changed & MPTCP_PM_ADDR_FLAG_FULLMESH)) mptcp_pm_nl_fullmesh(msk, &local->addr); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index b41e1aaa1d1c..2626b2b092d4 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -605,10 +605,10 @@ int mptcp_userspace_pm_set_flags(struct mptcp_pm_addr_entry *local, spin_unlock_bh(&msk->pm.lock); lock_sock(sk); - ret = mptcp_pm_nl_mp_prio_send_ack(msk, &local->addr, &rem, bkup); + ret = mptcp_pm_mp_prio_send_ack(msk, &local->addr, &rem, bkup); release_sock(sk); - /* mptcp_pm_nl_mp_prio_send_ack() only fails in one case */ + /* mptcp_pm_mp_prio_send_ack() only fails in one case */ if (ret < 0) GENL_SET_ERR_MSG(info, "subflow not found"); diff --git 
a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 2a3eb2392b3b..5508343f2c69 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1013,10 +1013,10 @@ void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); void mptcp_pm_mp_prio_received(struct sock *sk, u8 bkup); void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq); -int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, - struct mptcp_addr_info *addr, - struct mptcp_addr_info *rem, - u8 bkup); +int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, + struct mptcp_addr_info *addr, + struct mptcp_addr_info *rem, + u8 bkup); bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_free_anno_list(struct mptcp_sock *msk); -- 2.51.0 From 551a9ad7879df1c7d604b27272fe84032a040074 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:48 +0100 Subject: [PATCH 12/16] mptcp: pm: remove '_nl' from mptcp_pm_nl_work Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. But here 'mptcp_pm_nl_work' is not specific to this PM: it is called from the core to call helpers, some of them needed by both the in-kernel and userspace PMs. To avoid confusions, the '_nl' bit has been removed from the name. Also used 'worker' instead of 'work', similar to protocol.c's worker. No behavioural changes intended. 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-4-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 2 +- net/mptcp/protocol.c | 2 +- net/mptcp/protocol.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 5494b5b409dc..f6f7ea25640b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -924,7 +924,7 @@ static void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); } -void mptcp_pm_nl_work(struct mptcp_sock *msk) +void mptcp_pm_worker(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ec23e65ef0f1..ac946263ec64 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2681,7 +2681,7 @@ static void mptcp_worker(struct work_struct *work) mptcp_check_fastclose(msk); - mptcp_pm_nl_work(msk); + mptcp_pm_worker(msk); mptcp_check_send_data_fin(sk); mptcp_check_data_fin_ack(sk); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 5508343f2c69..f29f4dd28fc5 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1147,7 +1147,7 @@ static inline u8 subflow_get_local_id(const struct mptcp_subflow_context *subflo } void __init mptcp_pm_nl_init(void); -void mptcp_pm_nl_work(struct mptcp_sock *msk); +void mptcp_pm_worker(struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); -- 2.51.0 From 63611391850850bf27f81afb0d0b6d1237a34006 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:49 +0100 Subject: [PATCH 13/16] mptcp: pm: remove '_nl' from mptcp_pm_nl_rm_addr_received Currently, in-kernel 
PM specific helpers are prefixed with 'mptcp_pm_nl_'. But here 'mptcp_pm_nl_rm_addr_received' is not specific to this PM: it is called from the PM worker, and used by both the in-kernel and userspace PMs. The helper has been renamed to 'mptcp_pm_rm_addr_recv' instead of '_received' to avoid confusions with the one from pm.c. 'mptcp_pm_nl_rm_addr_or_subflow', and 'mptcp_pm_nl_rm_subflow_received' have been updated too for the same reason. To avoid confusions, the '_nl' bit has been removed from the name. While at it, the in-kernel PM specific code has been moved from mptcp_pm_rm_addr_or_subflow to a new dedicated helper, clearer. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-5-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 55 +++++++++++++++++++++--------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index f6f7ea25640b..09ef3aa025e7 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -838,9 +838,20 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, return -EINVAL; } -static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list, - enum linux_mptcp_mib_field rm_type) +static void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) +{ + if (rm_id && !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + /* Note: if the subflow has been closed before, this + * add_addr_accepted counter will not be decremented. 
+ */ + if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) + WRITE_ONCE(msk->pm.accept_addr, true); + } +} + +static void mptcp_pm_rm_addr_or_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list, + enum linux_mptcp_mib_field rm_type) { struct mptcp_subflow_context *subflow, *tmp; struct sock *sk = (struct sock *)msk; @@ -893,35 +904,23 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, __MPTCP_INC_STATS(sock_net(sk), rm_type); } - if (rm_type == MPTCP_MIB_RMADDR) + if (rm_type == MPTCP_MIB_RMADDR) { __MPTCP_INC_STATS(sock_net(sk), rm_type); - - if (!removed) - continue; - - if (!mptcp_pm_is_kernel(msk)) - continue; - - if (rm_type == MPTCP_MIB_RMADDR && rm_id && - !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { - /* Note: if the subflow has been closed before, this - * add_addr_accepted counter will not be decremented. - */ - if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) - WRITE_ONCE(msk->pm.accept_addr, true); + if (removed && mptcp_pm_is_kernel(msk)) + mptcp_pm_nl_rm_addr(msk, rm_id); } } } -static void mptcp_pm_nl_rm_addr_received(struct mptcp_sock *msk) +static void mptcp_pm_rm_addr_recv(struct mptcp_sock *msk) { - mptcp_pm_nl_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); + mptcp_pm_rm_addr_or_subflow(msk, &msk->pm.rm_list_rx, MPTCP_MIB_RMADDR); } -static void mptcp_pm_nl_rm_subflow_received(struct mptcp_sock *msk, - const struct mptcp_rm_list *rm_list) +static void mptcp_pm_rm_subflow(struct mptcp_sock *msk, + const struct mptcp_rm_list *rm_list) { - mptcp_pm_nl_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); + mptcp_pm_rm_addr_or_subflow(msk, rm_list, MPTCP_MIB_RMSUBFLOW); } void mptcp_pm_worker(struct mptcp_sock *msk) @@ -946,7 +945,7 @@ void mptcp_pm_worker(struct mptcp_sock *msk) } if (pm->status & BIT(MPTCP_PM_RM_ADDR_RECEIVED)) { pm->status &= ~BIT(MPTCP_PM_RM_ADDR_RECEIVED); - mptcp_pm_nl_rm_addr_received(msk); + mptcp_pm_rm_addr_recv(msk); } if 
(pm->status & BIT(MPTCP_PM_ESTABLISHED)) { pm->status &= ~BIT(MPTCP_PM_ESTABLISHED); @@ -1538,7 +1537,7 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, list.ids[0] = mptcp_endp_get_local_id(msk, addr); if (remove_subflow) { spin_lock_bh(&msk->pm.lock); - mptcp_pm_nl_rm_subflow_received(msk, &list); + mptcp_pm_rm_subflow(msk, &list); spin_unlock_bh(&msk->pm.lock); } @@ -1583,7 +1582,7 @@ static int mptcp_nl_remove_id_zero_address(struct net *net, lock_sock(sk); spin_lock_bh(&msk->pm.lock); mptcp_pm_remove_addr(msk, &list); - mptcp_pm_nl_rm_subflow_received(msk, &list); + mptcp_pm_rm_subflow(msk, &list); __mark_subflow_endp_available(msk, 0); spin_unlock_bh(&msk->pm.lock); release_sock(sk); @@ -1670,7 +1669,7 @@ static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk, mptcp_pm_remove_addr(msk, &alist); } if (slist.nr) - mptcp_pm_nl_rm_subflow_received(msk, &slist); + mptcp_pm_rm_subflow(msk, &slist); /* Reset counters: maybe some subflows have been removed before */ bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); msk->pm.local_addr_used = 0; @@ -1910,7 +1909,7 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); spin_lock_bh(&msk->pm.lock); - mptcp_pm_nl_rm_subflow_received(msk, &list); + mptcp_pm_rm_subflow(msk, &list); __mark_subflow_endp_available(msk, list.ids[0]); mptcp_pm_create_subflow_or_signal_addr(msk); spin_unlock_bh(&msk->pm.lock); -- 2.51.0 From 550c50bbc2b7950dd4ea1082c016f84469509eab Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:50 +0100 Subject: [PATCH 14/16] mptcp: pm: remove '_nl' from mptcp_pm_nl_subflow_chk_stale() Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. But here 'mptcp_pm_nl_subflow_chk_stale' is not specific to this PM: it is called from pm.c for both the in-kernel and userspace PMs. To avoid confusions, the '_nl' bit has been removed from the name. 
No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-6-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 2 +- net/mptcp/pm_netlink.c | 2 +- net/mptcp/protocol.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index ece706e8ed22..14c7ff5c606c 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -567,7 +567,7 @@ void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) } else if (subflow->stale_rcv_tstamp == rcv_tstamp) { if (subflow->stale_count < U8_MAX) subflow->stale_count++; - mptcp_pm_nl_subflow_chk_stale(msk, ssk); + mptcp_pm_subflows_chk_stale(msk, ssk); } else { subflow->stale_count = 0; mptcp_subflow_set_active(subflow); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 09ef3aa025e7..43667ad4c4ae 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1203,7 +1203,7 @@ static const struct genl_multicast_group mptcp_pm_mcgrps[] = { }, }; -void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) +void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk) { struct mptcp_subflow_context *iter, *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index f29f4dd28fc5..a5db1a297fbc 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -992,7 +992,7 @@ bool mptcp_pm_addr_families_match(const struct sock *sk, const struct mptcp_addr_info *loc, const struct mptcp_addr_info *rem); void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); -void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); +void mptcp_pm_subflows_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, 
int server_side); void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); -- 2.51.0 From 498d7d8b75f16a5b6e4f6499f1e72e9296f2d0cd Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:51 +0100 Subject: [PATCH 15/16] mptcp: pm: remove '_nl' from mptcp_pm_nl_is_init_remote_addr Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. But here 'mptcp_pm_nl_is_init_remote_addr' is not specific to this PM: it is called from pm.c for both the in-kernel and userspace PMs. To avoid confusions, the '_nl' bit has been removed from the name. No behavioural changes intended. Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-7-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm.c | 2 +- net/mptcp/pm_netlink.c | 4 ++-- net/mptcp/protocol.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 14c7ff5c606c..ab443b9f9c5f 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -231,7 +231,7 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } /* id0 should not have a different address */ - } else if ((addr->id == 0 && !mptcp_pm_nl_is_init_remote_addr(msk, addr)) || + } else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) || (addr->id > 0 && !READ_ONCE(pm->accept_addr))) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 43667ad4c4ae..029a74162b0b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -772,8 +772,8 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) } } -bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, - const struct mptcp_addr_info *remote) +bool 
mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *remote) { struct mptcp_addr_info mpc_remote; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a5db1a297fbc..39bcad1def6b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -1006,8 +1006,8 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk, const struct mptcp_addr_info *addr); void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk); -bool mptcp_pm_nl_is_init_remote_addr(struct mptcp_sock *msk, - const struct mptcp_addr_info *remote); +bool mptcp_pm_is_init_remote_addr(struct mptcp_sock *msk, + const struct mptcp_addr_info *remote); void mptcp_pm_addr_send_ack(struct mptcp_sock *msk); void mptcp_pm_rm_addr_received(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list); -- 2.51.0 From 40aa7409d30df2b610f8e590cbb183faa2ac6f1f Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Fri, 7 Mar 2025 12:21:52 +0100 Subject: [PATCH 16/16] mptcp: pm: kernel: add '_pm' to mptcp_nl_set_flags Currently, in-kernel PM specific helpers are prefixed with 'mptcp_pm_nl_'. Here, '_pm' was missing from 'mptcp_nl_set_flags'. Add '_pm' to be similar to others, and add '_all' to avoid confusions with the global 'mptcp_pm_nl_set_flags'. No behavioural changes intended. 
Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250307-net-next-mptcp-pm-reorg-v1-8-abef20ada03b@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 029a74162b0b..781831c50691 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1915,9 +1915,9 @@ static void mptcp_pm_nl_fullmesh(struct mptcp_sock *msk, spin_unlock_bh(&msk->pm.lock); } -static void mptcp_nl_set_flags(struct net *net, - struct mptcp_pm_addr_entry *local, - u8 changed) +static void mptcp_pm_nl_set_flags_all(struct net *net, + struct mptcp_pm_addr_entry *local, + u8 changed) { u8 is_subflow = !!(local->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW); u8 bkup = !!(local->flags & MPTCP_PM_ADDR_FLAG_BACKUP); @@ -1992,7 +1992,7 @@ int mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, *local = *entry; spin_unlock_bh(&pernet->lock); - mptcp_nl_set_flags(net, local, changed); + mptcp_pm_nl_set_flags_all(net, local, changed); return 0; } -- 2.51.0