From 3a0b7fa095212b51ed63892540c4f249991a2d74 Mon Sep 17 00:00:00 2001 From: Liu Ye Date: Thu, 16 Jan 2025 09:30:37 +0800 Subject: [PATCH 01/16] selftests/net/ipsec: Fix Null pointer dereference in rtattr_pack() Address Null pointer dereference / undefined behavior in rtattr_pack (note that size is 0 in the bad case). Flagged by cppcheck as: tools/testing/selftests/net/ipsec.c:230:25: warning: Possible null pointer dereference: payload [nullPointer] memcpy(RTA_DATA(attr), payload, size); ^ tools/testing/selftests/net/ipsec.c:1618:54: note: Calling function 'rtattr_pack', 4th argument 'NULL' value is 0 if (rtattr_pack(&req.nh, sizeof(req), XFRMA_IF_ID, NULL, 0)) { ^ tools/testing/selftests/net/ipsec.c:230:25: note: Null pointer dereference memcpy(RTA_DATA(attr), payload, size); ^ Signed-off-by: Liu Ye Link: https://patch.msgid.link/20250116013037.29470-1-liuye@kylinos.cn Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/ipsec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/ipsec.c b/tools/testing/selftests/net/ipsec.c index be4a30a0d02a..9b44a091802c 100644 --- a/tools/testing/selftests/net/ipsec.c +++ b/tools/testing/selftests/net/ipsec.c @@ -227,7 +227,8 @@ static int rtattr_pack(struct nlmsghdr *nh, size_t req_sz, attr->rta_len = RTA_LENGTH(size); attr->rta_type = rta_type; - memcpy(RTA_DATA(attr), payload, size); + if (payload) + memcpy(RTA_DATA(attr), payload, size); return 0; } -- 2.51.0 From 454d402481d45af79ee7eea7e64bce02bbbe9766 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:34 +0900 Subject: [PATCH 02/16] net: dropreason: Gather SOCKET_ drop reasons. The following patch adds a new drop reason starting with the SOCKET_ prefix. Let's gather the existing SOCKET_ reasons. Note that the order is not part of uAPI. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index ed864934e20b..f3714cbea50d 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -6,9 +6,10 @@ #define DEFINE_DROP_REASON(FN, FNe) \ FN(NOT_SPECIFIED) \ FN(NO_SOCKET) \ + FN(SOCKET_FILTER) \ + FN(SOCKET_RCVBUFF) \ FN(PKT_TOO_SMALL) \ FN(TCP_CSUM) \ - FN(SOCKET_FILTER) \ FN(UDP_CSUM) \ FN(NETFILTER_DROP) \ FN(OTHERHOST) \ @@ -18,7 +19,6 @@ FN(UNICAST_IN_L2_MULTICAST) \ FN(XFRM_POLICY) \ FN(IP_NOPROTO) \ - FN(SOCKET_RCVBUFF) \ FN(PROTO_MEM) \ FN(TCP_AUTH_HDR) \ FN(TCP_MD5NOTFOUND) \ @@ -138,12 +138,14 @@ enum skb_drop_reason { * 3) no valid child socket during 3WHS process */ SKB_DROP_REASON_NO_SOCKET, + /** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */ + SKB_DROP_REASON_SOCKET_FILTER, + /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ + SKB_DROP_REASON_SOCKET_RCVBUFF, /** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */ SKB_DROP_REASON_PKT_TOO_SMALL, /** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */ SKB_DROP_REASON_TCP_CSUM, - /** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */ - SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_UDP_CSUM: UDP checksum error */ SKB_DROP_REASON_UDP_CSUM, /** @SKB_DROP_REASON_NETFILTER_DROP: dropped by netfilter */ @@ -174,8 +176,6 @@ enum skb_drop_reason { SKB_DROP_REASON_XFRM_POLICY, /** @SKB_DROP_REASON_IP_NOPROTO: no support for IP protocol */ SKB_DROP_REASON_IP_NOPROTO, - /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ - SKB_DROP_REASON_SOCKET_RCVBUFF, /** * @SKB_DROP_REASON_PROTO_MEM: proto memory limitation, such as * udp packet drop out of udp_memory_allocated. -- 2.51.0 From c32f0bd7d4838982c6724fca0da92353f27c6f88 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:35 +0900 Subject: [PATCH 03/16] af_unix: Set drop reason in unix_release_sock(). unix_release_sock() is called when the last refcnt of struct file is released. Let's define a new drop reason SKB_DROP_REASON_SOCKET_CLOSE and set it for kfree_skb() in unix_release_sock(). # echo 1 > /sys/kernel/tracing/events/skb/kfree_skb/enable # python3 >>> from socket import * >>> s1, s2 = socketpair(AF_UNIX) >>> s1.send(b'hello world') >>> s2.close() # cat /sys/kernel/tracing/trace_pipe ... python3-280 ... kfree_skb: ... protocol=0 location=unix_release_sock+0x260/0x420 reason: SOCKET_CLOSE To be precise, unix_release_sock() is also called for a new child socket in unix_stream_connect() when something fails, but the new sk does not have skb in the recv queue then and no event is logged. Note that only tcp_inbound_ao_hash() uses a similar drop reason, SKB_DROP_REASON_TCP_CLOSE, and this can be generalised later. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 3 +++ net/unix/af_unix.c | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index f3714cbea50d..b9e7ff853ce3 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -6,6 +6,7 @@ #define DEFINE_DROP_REASON(FN, FNe) \ FN(NOT_SPECIFIED) \ FN(NO_SOCKET) \ + FN(SOCKET_CLOSE) \ FN(SOCKET_FILTER) \ FN(SOCKET_RCVBUFF) \ FN(PKT_TOO_SMALL) \ @@ -138,6 +139,8 @@ enum skb_drop_reason { * 3) no valid child socket during 3WHS process */ SKB_DROP_REASON_NO_SOCKET, + /** @SKB_DROP_REASON_SOCKET_CLOSE: socket is close()d */ + SKB_DROP_REASON_SOCKET_CLOSE, /** @SKB_DROP_REASON_SOCKET_FILTER: dropped by socket filter */ SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8f2b605ce5b3..a05d25cc5545 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -715,8 +715,8 @@ static void unix_release_sock(struct sock *sk, int embrion) if (state == TCP_LISTEN) unix_release_sock(skb->sk, 1); - /* passed fds are erased in the kfree_skb hook */ - kfree_skb(skb); + /* passed fds are erased in the kfree_skb hook */ + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); } if (path.dentry) -- 2.51.0 From 4d0446b7a214e2aa28c0e914329610731f665ad2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:36 +0900 Subject: [PATCH 04/16] af_unix: Set drop reason in unix_sock_destructor(). unix_sock_destructor() is called as sk->sk_destruct() just before the socket is actually freed. Let's use SKB_DROP_REASON_SOCKET_CLOSE for skb_queue_purge(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index a05d25cc5545..41b99984008a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -640,7 +640,7 @@ static void unix_sock_destructor(struct sock *sk) { struct unix_sock *u = unix_sk(sk); - skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE); DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); -- 2.51.0 From c49a157c33c45cf00a1881e8c1f65bed5ff0023e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:37 +0900 Subject: [PATCH 05/16] af_unix: Set drop reason in __unix_gc(). Inflight file descriptors by SCM_RIGHTS hold references to the struct file. AF_UNIX sockets could hold references to each other, forming reference cycles. Once such sockets are close()d without the fd recv()ed, they will be unaccessible from userspace but remain in kernel. __unix_gc() garbage-collects skb with the dead file descriptors and frees them by __skb_queue_purge(). Let's set SKB_DROP_REASON_SOCKET_CLOSE there. # echo 1 > /sys/kernel/tracing/events/skb/kfree_skb/enable # python3 >>> from socket import * >>> from array import array >>> >>> # Create a reference cycle >>> s1 = socket(AF_UNIX, SOCK_DGRAM) >>> s1.bind('') >>> s1.sendmsg([b"nop"], [(SOL_SOCKET, SCM_RIGHTS, array("i", [s1.fileno()]))], 0, s1.getsockname()) >>> s1.close() >>> >>> # Trigger GC >>> s2 = socket(AF_UNIX) >>> s2.close() # cat /sys/kernel/tracing/trace_pipe ... kworker/u16:2-42 ... kfree_skb: ... location=__unix_gc+0x4ad/0x580 reason: SOCKET_CLOSE Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/garbage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 0068e758be4d..9848b7b78701 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -573,7 +573,7 @@ static void __unix_gc(struct work_struct *work) UNIXCB(skb).fp->dead = true; } - __skb_queue_purge(&hitlist); + __skb_queue_purge_reason(&hitlist, SKB_DROP_REASON_SOCKET_CLOSE); skip_gc: WRITE_ONCE(gc_in_progress, false); } -- 2.51.0 From 533643b091dd6e246d57caf81e6892fa9cbb1cc9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:38 +0900 Subject: [PATCH 06/16] af_unix: Set drop reason in manage_oob(). AF_UNIX SOCK_STREAM socket supports MSG_OOB. When OOB data is sent to a socket, recv() will break at that point. If the next recv() does not have MSG_OOB, the normal data following the OOB data is returned. Then, the OOB skb is dropped. Let's define a new drop reason for that case in manage_oob(). # echo 1 > /sys/kernel/tracing/events/skb/kfree_skb/enable # python3 >>> from socket import * >>> s1, s2 = socketpair(AF_UNIX) >>> s1.send(b'a', MSG_OOB) >>> s1.send(b'b') >>> s2.recv(2) b'b' # cat /sys/kernel/tracing/trace_pipe ... python3-223 ... kfree_skb: ... location=unix_stream_read_generic+0x59e/0xc20 reason: UNIX_SKIP_OOB Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 6 ++++++ net/unix/af_unix.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index b9e7ff853ce3..d6c9d841eb11 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -9,6 +9,7 @@ FN(SOCKET_CLOSE) \ FN(SOCKET_FILTER) \ FN(SOCKET_RCVBUFF) \ + FN(UNIX_SKIP_OOB) \ FN(PKT_TOO_SMALL) \ FN(TCP_CSUM) \ FN(UDP_CSUM) \ @@ -145,6 +146,11 @@ enum skb_drop_reason { SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ SKB_DROP_REASON_SOCKET_RCVBUFF, + /** + * @SKB_DROP_REASON_UNIX_SKIP_OOB: Out-Of-Band data is skipped by + * recv() without MSG_OOB so dropped. + */ + SKB_DROP_REASON_UNIX_SKIP_OOB, /** @SKB_DROP_REASON_PKT_TOO_SMALL: packet size is too small */ SKB_DROP_REASON_PKT_TOO_SMALL, /** @SKB_DROP_REASON_TCP_CSUM: TCP checksum error */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 41b99984008a..e31fda1d319f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2695,7 +2695,7 @@ unlock: spin_unlock(&sk->sk_receive_queue.lock); consume_skb(read_skb); - kfree_skb(unread_skb); + kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB); return skb; } -- 2.51.0 From bace4b468049a558295a0f59460fcb51e28f8fde Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:39 +0900 Subject: [PATCH 07/16] af_unix: Set drop reason in unix_stream_read_skb(). unix_stream_read_skb() is called when BPF SOCKMAP reads some data from a socket in the map. SOCKMAP does not support MSG_OOB, and reading OOB results in a drop. Let's set drop reasons respectively. * SOCKET_CLOSE : the socket in SOCKMAP was close()d * UNIX_SKIP_OOB : OOB was read from the socket in SOCKMAP Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e31fda1d319f..de4966e1b7ff 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2724,7 +2724,7 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (sock_flag(sk, SOCK_DEAD)) { unix_state_unlock(sk); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); return -ECONNRESET; } @@ -2738,7 +2738,7 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) unix_state_unlock(sk); if (drop) { - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); return -EAGAIN; } } -- 2.51.0 From b3e365bbf4f47b8f76b25b0fcf3f38916ca53e42 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:40 +0900 Subject: [PATCH 08/16] af_unix: Set drop reason in unix_dgram_disconnected(). unix_dgram_disconnected() is called from two places: 1. when a connect()ed socket dis-connect()s or re-connect()s to another socket 2. when sendmsg() fails because the peer socket that the client has connect()ed to has been close()d Then, the client's recv queue is purged to remove all messages from the old peer socket. Let's define a new drop reason for that case. # echo 1 > /sys/kernel/tracing/events/skb/kfree_skb/enable # python3 >>> from socket import * >>> >>> # s1 has a message from s2 >>> s1, s2 = socketpair(AF_UNIX, SOCK_DGRAM) >>> s2.send(b'hello world') >>> >>> # re-connect() drops the message from s2 >>> s3 = socket(AF_UNIX, SOCK_DGRAM) >>> s3.bind('') >>> s1.connect(s3.getsockname()) # cat /sys/kernel/tracing/trace_pipe python3-250 ... kfree_skb: ... location=skb_queue_purge_reason+0xdc/0x110 reason: UNIX_DISCONNECT Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 7 +++++++ net/unix/af_unix.c | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index d6c9d841eb11..32a34dfe8cc5 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -9,6 +9,7 @@ FN(SOCKET_CLOSE) \ FN(SOCKET_FILTER) \ FN(SOCKET_RCVBUFF) \ + FN(UNIX_DISCONNECT) \ FN(UNIX_SKIP_OOB) \ FN(PKT_TOO_SMALL) \ FN(TCP_CSUM) \ @@ -146,6 +147,12 @@ enum skb_drop_reason { SKB_DROP_REASON_SOCKET_FILTER, /** @SKB_DROP_REASON_SOCKET_RCVBUFF: socket receive buff is full */ SKB_DROP_REASON_SOCKET_RCVBUFF, + /** + * @SKB_DROP_REASON_UNIX_DISCONNECT: recv queue is purged when SOCK_DGRAM + * or SOCK_SEQPACKET socket re-connect()s to another socket or notices + * during send() that the peer has been close()d. + */ + SKB_DROP_REASON_UNIX_DISCONNECT, /** * @SKB_DROP_REASON_UNIX_SKIP_OOB: Out-Of-Band data is skipped by * recv() without MSG_OOB so dropped. diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index de4966e1b7ff..5e1b408c19da 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -622,7 +622,9 @@ static void unix_write_space(struct sock *sk) static void unix_dgram_disconnected(struct sock *sk, struct sock *other) { if (!skb_queue_empty(&sk->sk_receive_queue)) { - skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge_reason(&sk->sk_receive_queue, + SKB_DROP_REASON_UNIX_DISCONNECT); + wake_up_interruptible_all(&unix_sk(sk)->peer_wait); /* If one link of bidirectional dgram pipe is disconnected, -- 2.51.0 From 3b2d40dc13c26a4efde438beb664576d20a9fb4a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:41 +0900 Subject: [PATCH 09/16] af_unix: Reuse out_pipe label in unix_stream_sendmsg(). This is a follow-up of commit d460b04bc452 ("af_unix: Clean up error paths in unix_stream_sendmsg()."). If we initialise skb with NULL in unix_stream_sendmsg(), we can reuse the existing out_pipe label for the SEND_SHUTDOWN check. Let's rename it and adjust the existing label as out_pipe_lock. While at it, size and data_len are moved to the while loop scope. Suggested-by: Paolo Abeni Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5e1b408c19da..43a45cf06f2e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2238,13 +2238,11 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; + struct sk_buff *skb = NULL; struct sock *other = NULL; - int err, size; - struct sk_buff *skb; - int sent = 0; struct scm_cookie scm; bool fds_sent = false; - int data_len; + int err, sent = 0; err = scm_send(sock, msg, &scm, false); if (err < 0) @@ -2273,16 +2271,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, } } - if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) { - if (!(msg->msg_flags & MSG_NOSIGNAL)) - send_sig(SIGPIPE, current, 0); - - err = -EPIPE; - goto out_err; - } + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) + goto out_pipe; while (sent < len) { - size = len - sent; + int size = len - sent; + int data_len; if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb = sock_alloc_send_pskb(sk, 0, 0, @@ -2335,7 +2329,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) - goto out_pipe; + goto out_pipe_unlock; maybe_add_creds(skb, sock, other); scm_stat_add(other, skb); @@ -2358,8 +2352,9 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, return sent; -out_pipe: +out_pipe_unlock: unix_state_unlock(other); +out_pipe: if (!sent && !(msg->msg_flags & MSG_NOSIGNAL)) send_sig(SIGPIPE, current, 0); err = -EPIPE; -- 2.51.0 From 085e6cba85ca81fbb4ebfc238c934108f0e8467e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 16 Jan 2025 14:34:42 +0900 Subject: [PATCH 10/16] af_unix: Use consume_skb() in connect() and sendmsg(). This is based on Donald Hunter's patch. These functions could fail for various reasons, sometimes triggering kfree_skb(). * unix_stream_connect() : connect() * unix_stream_sendmsg() : sendmsg() * queue_oob() : sendmsg(MSG_OOB) * unix_dgram_sendmsg() : sendmsg() Such kfree_skb() is tied to the errno of connect() and sendmsg(), and we need not define skb drop reasons. Let's use consume_skb() not to churn kfree_skb() events. Link: https://lore.kernel.org/netdev/eb30b164-7f86-46bf-a5d3-0f8bda5e9398@redhat.com/ Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250116053441.5758-10-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 43a45cf06f2e..34945de1fb1f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1701,7 +1701,7 @@ out_unlock: unix_state_unlock(other); sock_put(other); out_free_skb: - kfree_skb(skb); + consume_skb(skb); out_free_sk: unix_release_sock(newsk, 0); out: @@ -2172,7 +2172,7 @@ out_unlock: out_sock_put: sock_put(other); out_free: - kfree_skb(skb); + consume_skb(skb); out: scm_destroy(&scm); return err; @@ -2189,7 +2189,7 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other { struct unix_sock *ousk = unix_sk(other); struct sk_buff *skb; - int err = 0; + int err; skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); @@ -2197,25 +2197,22 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other return err; err = unix_scm_to_skb(scm, skb, !fds_sent); - if (err < 0) { - kfree_skb(skb); - return err; - } + if (err < 0) + goto out; + skb_put(skb, 1); err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); - if (err) { - kfree_skb(skb); - return err; - } + if (err) + goto out; unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) { unix_state_unlock(other); - kfree_skb(skb); - return -EPIPE; + err = -EPIPE; + goto out; } maybe_add_creds(skb, sock, other); @@ -2230,6 +2227,9 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other unix_state_unlock(other); other->sk_data_ready(other); + return 0; +out: + consume_skb(skb); return err; } #endif @@ -2359,7 +2359,7 @@ out_pipe: send_sig(SIGPIPE, current, 0); err = -EPIPE; out_free: - kfree_skb(skb); + consume_skb(skb); out_err: scm_destroy(&scm); return sent ? : err; -- 2.51.0 From 3c836451ca9041cfb32a7d8f59ea15b3b991bbb3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Jan 2025 18:05:11 -0800 Subject: [PATCH 11/16] net: move HDS config from ethtool state Separate the HDS config from the ethtool state struct. The HDS config contains just simple parameters, not state. Having it as a separate struct will make it easier to clone / copy and also long term potentially make it per-queue. Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250119020518.1962249-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 4 ++-- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 5 +++-- drivers/net/netdevsim/ethtool.c | 9 +++++---- drivers/net/netdevsim/netdev.c | 10 +++++----- include/linux/ethtool.h | 4 ---- include/linux/netdevice.h | 3 +++ include/net/netdev_queues.h | 10 ++++++++++ net/core/dev.c | 10 ++++++++-- net/core/devmem.c | 4 ++-- net/ethtool/rings.c | 8 +++++--- 10 files changed, 43 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 748c9b1ea701..0998b20578b4 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4610,7 +4610,7 @@ void bnxt_set_tpa_flags(struct bnxt *bp) static void bnxt_init_ring_params(struct bnxt *bp) { bp->rx_copybreak = BNXT_DEFAULT_RX_COPYBREAK; - bp->dev->ethtool->hds_thresh = BNXT_DEFAULT_RX_COPYBREAK; + bp->dev->cfg->hds_thresh = BNXT_DEFAULT_RX_COPYBREAK; } /* bp->rx_ring_size, bp->tx_ring_size, dev->mtu, BNXT_FLAG_{G|L}RO flags must @@ -6585,7 +6585,7 @@ static void bnxt_hwrm_update_rss_hash_cfg(struct bnxt *bp) static int bnxt_hwrm_vnic_set_hds(struct bnxt *bp, struct bnxt_vnic_info *vnic) { - u16 hds_thresh = (u16)bp->dev->ethtool->hds_thresh; + u16 hds_thresh = (u16)bp->dev->cfg->hds_thresh; struct hwrm_vnic_plcmodes_cfg_input *req; int rc; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 65a20931c579..0a6d47d4d66b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "bnxt_hsi.h" #include "bnxt.h" @@ -834,7 +835,7 @@ static void bnxt_get_ringparam(struct net_device *dev, ering->rx_jumbo_pending = bp->rx_agg_ring_size; ering->tx_pending = bp->tx_ring_size; - kernel_ering->hds_thresh = dev->ethtool->hds_thresh; + kernel_ering->hds_thresh = dev->cfg->hds_thresh; kernel_ering->hds_thresh_max = BNXT_HDS_THRESHOLD_MAX; } @@ -852,7 +853,7 @@ static int bnxt_set_ringparam(struct net_device *dev, (ering->tx_pending < BNXT_MIN_TX_DESC_CNT)) return -EINVAL; - hds_config_mod = tcp_data_split != dev->ethtool->hds_config; + hds_config_mod = tcp_data_split != dev->cfg->hds_config; if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_DISABLED && hds_config_mod) return -EINVAL; diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index 12163635b759..189793debdb7 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -3,6 +3,7 @@ #include #include +#include #include "netdevsim.h" @@ -71,8 +72,8 @@ static void nsim_get_ringparam(struct net_device *dev, struct netdevsim *ns = netdev_priv(dev); memcpy(ring, &ns->ethtool.ring, sizeof(ns->ethtool.ring)); - kernel_ring->tcp_data_split = dev->ethtool->hds_config; - kernel_ring->hds_thresh = dev->ethtool->hds_thresh; + kernel_ring->tcp_data_split = dev->cfg->hds_config; + kernel_ring->hds_thresh = dev->cfg->hds_thresh; kernel_ring->hds_thresh_max = NSIM_HDS_THRESHOLD_MAX; if (kernel_ring->tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_UNKNOWN) @@ -190,8 +191,8 @@ static void nsim_ethtool_ring_init(struct netdevsim *ns) ns->ethtool.ring.rx_mini_max_pending = 4096; ns->ethtool.ring.tx_max_pending = 4096; - ns->netdev->ethtool->hds_config = ETHTOOL_TCP_DATA_SPLIT_UNKNOWN; - ns->netdev->ethtool->hds_thresh = 0; + ns->netdev->cfg->hds_config = ETHTOOL_TCP_DATA_SPLIT_UNKNOWN; + ns->netdev->cfg->hds_thresh = 0; } void nsim_ethtool_init(struct netdevsim *ns) diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c index f92b05ccdca9..42f247cbdcee 100644 --- a/drivers/net/netdevsim/netdev.c +++ b/drivers/net/netdevsim/netdev.c @@ -55,10 +55,10 @@ static int nsim_forward_skb(struct net_device *dev, struct sk_buff *skb, static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct netdevsim *ns = netdev_priv(dev); - struct ethtool_netdev_state *ethtool; struct net_device *peer_dev; unsigned int len = skb->len; struct netdevsim *peer_ns; + struct netdev_config *cfg; struct nsim_rq *rq; int rxq; @@ -76,11 +76,11 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) rxq = rxq % peer_dev->num_rx_queues; rq = peer_ns->rq[rxq]; - ethtool = peer_dev->ethtool; + cfg = peer_dev->cfg; if (skb_is_nonlinear(skb) && - (ethtool->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED || - (ethtool->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && - ethtool->hds_thresh > len))) + (cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED || + (cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + cfg->hds_thresh > len))) skb_linearize(skb); skb_tx_timestamp(skb); diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 64301ddf2f59..870994cc3ef7 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -1171,16 +1171,12 @@ int ethtool_virtdev_set_link_ksettings(struct net_device *dev, * @rss_ctx: XArray of custom RSS contexts * @rss_lock: Protects entries in @rss_ctx. May be taken from * within RTNL. - * @hds_thresh: HDS Threshold value. - * @hds_config: HDS value from userspace. * @wol_enabled: Wake-on-LAN is enabled * @module_fw_flash_in_progress: Module firmware flashing is in progress. */ struct ethtool_netdev_state { struct xarray rss_ctx; struct mutex rss_lock; - u32 hds_thresh; - u8 hds_config; unsigned wol_enabled:1; unsigned module_fw_flash_in_progress:1; }; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 8308d9c75918..173a8b3a9eb2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -63,6 +63,7 @@ struct dsa_port; struct ip_tunnel_parm_kern; struct macsec_context; struct macsec_ops; +struct netdev_config; struct netdev_name_node; struct sd_flow_limit; struct sfp_bus; @@ -2410,6 +2411,8 @@ struct net_device { const struct udp_tunnel_nic_info *udp_tunnel_nic_info; struct udp_tunnel_nic *udp_tunnel_nic; + /** @cfg: net_device queue-related configuration */ + struct netdev_config *cfg; struct ethtool_netdev_state *ethtool; /* protected by rtnl_lock */ diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index 5ca019d294ca..b02bb9f109d5 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -4,6 +4,16 @@ #include +/** + * struct netdev_config - queue-related configuration for a netdev + * @hds_thresh: HDS Threshold value. + * @hds_config: HDS value from userspace. + */ +struct netdev_config { + u32 hds_thresh; + u8 hds_config; +}; + /* See the netdev.yaml spec for definition of each statistic */ struct netdev_queue_stats_rx { u64 bytes; diff --git a/net/core/dev.c b/net/core/dev.c index 3dab6699b1c1..e37d47cf476b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -106,6 +106,7 @@ #include #include #include +#include #include #include #include @@ -9719,7 +9720,7 @@ int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf) if (!dev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; - if (dev->ethtool->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && bpf->command == XDP_SETUP_PROG && bpf->prog && !bpf->prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(bpf->extack, @@ -9764,7 +9765,7 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, struct netdev_bpf xdp; int err; - if (dev->ethtool->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && + if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED && prog && !prog->aux->xdp_has_frags) { NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split"); return -EBUSY; @@ -11542,6 +11543,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (!dev->ethtool) goto free_all; + dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); + if (!dev->cfg) + goto free_all; + napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); if (!dev->napi_config) @@ -11610,6 +11615,7 @@ void free_netdev(struct net_device *dev) return; } + kfree(dev->cfg); kfree(dev->ethtool); netif_free_tx_queues(dev); netif_free_rx_queues(dev); diff --git a/net/core/devmem.c b/net/core/devmem.c index c971b8aceac8..3bba3f018df0 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -141,12 +141,12 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, return -ERANGE; } - if (dev->ethtool->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { + if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) { NL_SET_ERR_MSG(extack, "tcp-data-split is disabled"); return -EINVAL; } - if (dev->ethtool->hds_thresh) { + if (dev->cfg->hds_thresh) { NL_SET_ERR_MSG(extack, "hds-thresh is not zero"); return -EINVAL; } diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index d8cd4e4d7762..7a3c2a2dff12 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -1,5 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only +#include + #include "netlink.h" #include "common.h" @@ -219,7 +221,7 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) dev->ethtool_ops->get_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); - kernel_ringparam.tcp_data_split = dev->ethtool->hds_config; + kernel_ringparam.tcp_data_split = dev->cfg->hds_config; ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod); ethnl_update_u32(&ringparam.rx_mini_pending, @@ -295,8 +297,8 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); if (!ret) { - dev->ethtool->hds_config = kernel_ringparam.tcp_data_split; - dev->ethtool->hds_thresh = kernel_ringparam.hds_thresh; + dev->cfg->hds_config = kernel_ringparam.tcp_data_split; + dev->cfg->hds_thresh = kernel_ringparam.hds_thresh; } return ret < 0 ? ret : 1; -- 2.51.0 From 743dea746ed6d581877c114ef1f362bc277fed6a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Jan 2025 18:05:12 -0800 Subject: [PATCH 12/16] net: ethtool: store netdev in a temp variable in ethnl_default_set_doit() For ease of review of the next patch store the dev pointer on the stack, instead of referring to req_info.dev every time. No functional changes. Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250119020518.1962249-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/ethtool/netlink.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 849c98e637c6..c17d8513d4c1 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -667,6 +667,7 @@ static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) const struct ethnl_request_ops *ops; struct ethnl_req_info req_info = {}; const u8 cmd = info->genlhdr->cmd; + struct net_device *dev; int ret; ops = ethnl_default_requests[cmd]; @@ -688,19 +689,21 @@ static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) goto out_dev; } + dev = req_info.dev; + rtnl_lock(); - ret = ethnl_ops_begin(req_info.dev); + ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; ret = ops->set(&req_info, info); if (ret <= 0) goto out_ops; - ethtool_notify(req_info.dev, ops->set_ntf_cmd, NULL); + ethtool_notify(dev, ops->set_ntf_cmd, NULL); ret = 0; out_ops: - ethnl_ops_complete(req_info.dev); + ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); out_dev: -- 2.51.0 From 32ad1f7a050d0c17e1e52e1dfdd9f6221ae20ef9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Jan 2025 18:05:13 -0800 Subject: [PATCH 13/16] net: provide pending ring configuration in net_device Record the pending configuration in net_device struct. ethtool core duplicates the current config and the specific handlers (for now just ringparam) can modify it. Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250119020518.1962249-4-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 6 ++++++ net/core/dev.c | 2 ++ net/ethtool/netlink.c | 21 ++++++++++++++++++--- net/ethtool/rings.c | 8 +++----- 4 files changed, 29 insertions(+), 8 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 173a8b3a9eb2..8da4c61f97b9 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2413,6 +2413,12 @@ struct net_device { /** @cfg: net_device queue-related configuration */ struct netdev_config *cfg; + /** + * @cfg_pending: same as @cfg but when device is being actively + * reconfigured includes any changes to the configuration + * requested by the user, but which may or may not be rejected. + */ + struct netdev_config *cfg_pending; struct ethtool_netdev_state *ethtool; /* protected by rtnl_lock */ diff --git a/net/core/dev.c b/net/core/dev.c index e37d47cf476b..afa2282f2604 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11546,6 +11546,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); if (!dev->cfg) goto free_all; + dev->cfg_pending = dev->cfg; napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); @@ -11615,6 +11616,7 @@ void free_netdev(struct net_device *dev) return; } + WARN_ON(dev->cfg != dev->cfg_pending); kfree(dev->cfg); kfree(dev->ethtool); netif_free_tx_queues(dev); diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index c17d8513d4c1..1d2f62ef6130 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include #include #include @@ -692,19 +693,33 @@ static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) dev = req_info.dev; rtnl_lock(); + dev->cfg_pending = kmemdup(dev->cfg, sizeof(*dev->cfg), + GFP_KERNEL_ACCOUNT); + if (!dev->cfg_pending) { + ret = -ENOMEM; + goto out_tie_cfg; + } + ret = ethnl_ops_begin(dev); if (ret < 0) - goto out_rtnl; + goto out_free_cfg; ret = ops->set(&req_info, info); - if (ret <= 0) + if (ret < 0) + goto out_ops; + + swap(dev->cfg, dev->cfg_pending); + if (!ret) goto out_ops; ethtool_notify(dev, ops->set_ntf_cmd, NULL); ret = 0; out_ops: ethnl_ops_complete(dev); -out_rtnl: +out_free_cfg: + kfree(dev->cfg_pending); +out_tie_cfg: + dev->cfg_pending = dev->cfg; rtnl_unlock(); out_dev: ethnl_parse_header_dev_put(&req_info); diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index 7a3c2a2dff12..5e8ba81fbb3e 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -294,13 +294,11 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info) return -EINVAL; } + dev->cfg_pending->hds_config = kernel_ringparam.tcp_data_split; + dev->cfg_pending->hds_thresh = kernel_ringparam.hds_thresh; + ret = dev->ethtool_ops->set_ringparam(dev, &ringparam, &kernel_ringparam, info->extack); - if (!ret) { - dev->cfg->hds_config = kernel_ringparam.tcp_data_split; - dev->cfg->hds_thresh = kernel_ringparam.hds_thresh; - } - return ret < 0 ? ret : 1; } -- 2.51.0 From e58263e9111733c4bfbbf07517d1f98f21134c52 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Jan 2025 18:05:14 -0800 Subject: [PATCH 14/16] eth: bnxt: apply hds_thrs settings correctly Use the pending config for hds_thrs. Core will only update the "current" one after we return success. Without this change 2 reconfigs would be required for the setting to reach the device. Fixes: 6b43673a25c3 ("bnxt_en: add support for hds-thresh ethtool command") Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250119020518.1962249-5-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 0998b20578b4..2eeed4c11b64 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6585,7 +6585,7 @@ static void bnxt_hwrm_update_rss_hash_cfg(struct bnxt *bp) static int bnxt_hwrm_vnic_set_hds(struct bnxt *bp, struct bnxt_vnic_info *vnic) { - u16 hds_thresh = (u16)bp->dev->cfg->hds_thresh; + u16 hds_thresh = (u16)bp->dev->cfg_pending->hds_thresh; struct hwrm_vnic_plcmodes_cfg_input *req; int rc; -- 2.51.0 From 928459bbda19eb47e09d9bd4386ac46fad088958 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Jan 2025 18:05:15 -0800 Subject: [PATCH 15/16] net: ethtool: populate the default HDS params in the core The core has the current HDS config, it can pre-populate the values for the drivers. While at it, remove the zero-setting in netdevsim. Zero are the default values since the config is zalloc'ed. Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250119020518.1962249-6-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 1 - drivers/net/netdevsim/ethtool.c | 5 ----- net/ethtool/rings.c | 4 ++++ 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c index 0a6d47d4d66b..9c5820839514 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c @@ -835,7 +835,6 @@ static void bnxt_get_ringparam(struct net_device *dev, ering->rx_jumbo_pending = bp->rx_agg_ring_size; ering->tx_pending = bp->tx_ring_size; - kernel_ering->hds_thresh = dev->cfg->hds_thresh; kernel_ering->hds_thresh_max = BNXT_HDS_THRESHOLD_MAX; } diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index 189793debdb7..3b23f3d3ca2b 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -72,8 +72,6 @@ static void nsim_get_ringparam(struct net_device *dev, struct netdevsim *ns = netdev_priv(dev); memcpy(ring, &ns->ethtool.ring, sizeof(ns->ethtool.ring)); - kernel_ring->tcp_data_split = dev->cfg->hds_config; - kernel_ring->hds_thresh = dev->cfg->hds_thresh; kernel_ring->hds_thresh_max = NSIM_HDS_THRESHOLD_MAX; if (kernel_ring->tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_UNKNOWN) @@ -190,9 +188,6 @@ static void nsim_ethtool_ring_init(struct netdevsim *ns) ns->ethtool.ring.rx_jumbo_max_pending = 4096; ns->ethtool.ring.rx_mini_max_pending = 4096; ns->ethtool.ring.tx_max_pending = 4096; - - ns->netdev->cfg->hds_config = ETHTOOL_TCP_DATA_SPLIT_UNKNOWN; - ns->netdev->cfg->hds_thresh = 0; } void nsim_ethtool_init(struct netdevsim *ns) diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c index 5e8ba81fbb3e..7839bfd1ac6a 100644 --- a/net/ethtool/rings.c +++ b/net/ethtool/rings.c @@ -39,6 +39,10 @@ static int rings_prepare_data(const struct ethnl_req_info *req_base, ret = ethnl_ops_begin(dev); if (ret < 0) return ret; + + data->kernel_ringparam.tcp_data_split = dev->cfg->hds_config; + data->kernel_ringparam.hds_thresh = dev->cfg->hds_thresh; + dev->ethtool_ops->get_ringparam(dev, &data->ringparam, &data->kernel_ringparam, info->extack); ethnl_ops_complete(dev); -- 2.51.0 From bee018052d1bbfa6d33ca016a42e8e2534429492 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Sat, 18 Jan 2025 18:05:16 -0800 Subject: [PATCH 16/16] eth: bnxt: allocate enough buffer space to meet HDS threshold Now that we can configure HDS threshold separately from the rx_copybreak HDS threshold may be higher than rx_copybreak. We need to make sure that we have enough space for the headers. Fixes: 6b43673a25c3 ("bnxt_en: add support for hds-thresh ethtool command") Reviewed-by: Michael Chan Link: https://patch.msgid.link/20250119020518.1962249-7-kuba@kernel.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 2eeed4c11b64..19e723493c4e 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4671,9 +4671,10 @@ void bnxt_set_ring_params(struct bnxt *bp) ALIGN(max(NET_SKB_PAD, XDP_PACKET_HEADROOM), 8) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); } else { - rx_size = SKB_DATA_ALIGN(max(BNXT_DEFAULT_RX_COPYBREAK, - bp->rx_copybreak) + - NET_IP_ALIGN); + rx_size = max3(BNXT_DEFAULT_RX_COPYBREAK, + bp->rx_copybreak, + bp->dev->cfg_pending->hds_thresh); + rx_size = SKB_DATA_ALIGN(rx_size + NET_IP_ALIGN); rx_space = rx_size + NET_SKB_PAD + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); } -- 2.51.0