From a0ffa68c70b367358b2672cdab6fa5bc4c40de2c Mon Sep 17 00:00:00 2001 From: Eddie James Date: Wed, 25 Sep 2024 10:55:23 -0500 Subject: [PATCH 01/16] net/ncsi: Disable the ncsi work before freeing the associated structure The work function can run after the ncsi device is freed, resulting in use-after-free bugs or kernel panic. Fixes: 2d283bdd079c ("net/ncsi: Resource management") Signed-off-by: Eddie James Link: https://patch.msgid.link/20240925155523.1017097-1-eajames@linux.ibm.com Signed-off-by: Paolo Abeni --- net/ncsi/ncsi-manage.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 5ecf611c8820..5cf55bde366d 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -1954,6 +1954,8 @@ void ncsi_unregister_dev(struct ncsi_dev *nd) list_del_rcu(&ndp->node); spin_unlock_irqrestore(&ncsi_dev_lock, flags); + disable_work_sync(&ndp->work); + kfree(ndp); } EXPORT_SYMBOL_GPL(ncsi_unregister_dev); -- 2.51.0 From f7a4874d977bf4202ad575031222e78809a36292 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 2 Oct 2024 08:00:40 -0700 Subject: [PATCH 02/16] iomap: don't bother unsharing delalloc extents If unshare encounters a delalloc reservation in the srcmap, that means that the file range isn't shared because delalloc reservations cannot be reflinked. Therefore, don't try to unshare them. Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/r/20241002150040.GB21853@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Christian Brauner --- fs/iomap/buffered-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 11ea747228ae..c1c559e0cc07 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1321,7 +1321,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) return length; /* - * Don't bother with holes or unwritten extents. + * Don't bother with delalloc reservations, holes or unwritten extents. * * Note that we use srcmap directly instead of iomap_iter_srcmap as * unsharing requires providing a separate source map, and the presence @@ -1330,6 +1330,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) * fork for XFS. */ if (iter->srcmap.type == IOMAP_HOLE || + iter->srcmap.type == IOMAP_DELALLOC || iter->srcmap.type == IOMAP_UNWRITTEN) return length; -- 2.51.0 From a311a08a4237241fb5b9d219d3e33346de6e83e0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 2 Oct 2024 08:02:13 -0700 Subject: [PATCH 03/16] iomap: constrain the file range passed to iomap_file_unshare File contents can only be shared (i.e. reflinked) below EOF, so it makes no sense to try to unshare ranges beyond EOF. Constrain the file range parameters here so that we don't have to do that in the callers. Fixes: 5f4e5752a8a3 ("fs: add iomap_file_dirty") Signed-off-by: Darrick J. Wong Link: https://lore.kernel.org/r/20241002150213.GC21853@frogsfrogsfrogs Reviewed-by: Christoph Hellwig Reviewed-by: Brian Foster Signed-off-by: Christian Brauner --- fs/dax.c | 6 +++++- fs/iomap/buffered-io.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index becb4a6920c6..c62acd2812f8 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -1305,11 +1305,15 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, struct iomap_iter iter = { .inode = inode, .pos = pos, - .len = len, .flags = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX, }; + loff_t size = i_size_read(inode); int ret; + if (pos < 0 || pos >= size) + return 0; + + iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = dax_unshare_iter(&iter); return ret; diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index c1c559e0cc07..78ebd265f425 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1375,11 +1375,15 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, struct iomap_iter iter = { .inode = inode, .pos = pos, - .len = len, .flags = IOMAP_WRITE | IOMAP_UNSHARE, }; + loff_t size = i_size_read(inode); int ret; + if (pos < 0 || pos >= size) + return 0; + + iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_unshare_iter(&iter); return ret; -- 2.51.0 From 7c2f1c2690a5965fa0913c7b8c1b833dccddbb39 Mon Sep 17 00:00:00 2001 From: zhang jiao Date: Fri, 27 Sep 2024 12:00:50 +0800 Subject: [PATCH 04/16] selftests/net: Add missing va_end. There is no va_end after va_copy, just add it. Signed-off-by: zhang jiao Reviewed-by: Simon Horman Link: https://patch.msgid.link/20240927040050.7851-1-zhangjiao2@cmss.chinamobile.com Signed-off-by: Paolo Abeni --- tools/testing/selftests/net/tcp_ao/lib/aolib.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/tcp_ao/lib/aolib.h b/tools/testing/selftests/net/tcp_ao/lib/aolib.h index db44e77428dd..5db2f65cddc4 100644 --- a/tools/testing/selftests/net/tcp_ao/lib/aolib.h +++ b/tools/testing/selftests/net/tcp_ao/lib/aolib.h @@ -46,6 +46,7 @@ static inline char *test_snprintf(const char *fmt, va_list vargs) va_copy(tmp, vargs); n = vsnprintf(ret, size, fmt, tmp); + va_end(tmp); if (n < 0) return NULL; -- 2.51.0 From e26a0c5d828b225b88f534e2fcf10bf617f85f23 Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Sun, 29 Sep 2024 20:44:35 -0700 Subject: [PATCH 05/16] net: mana: Increase the DEF_RX_BUFFERS_PER_QUEUE to 1024 Through some experiments, we found out that increasing the default RX buffers count from 512 to 1024, gives slightly better throughput and significantly reduces the no_wqe_rx errs on the receiver side. Along with these, other parameters like cpu usage, retrans seg etc also show some improvement with 1024 value. Following are some snippets from the experiments ntttcp tests with 512 Rx buffers --------------------------------------- connections| throughput| no_wqe errs| --------------------------------------- 1 | 40.93Gbps | 123,211 | 16 | 180.15Gbps | 190,120 | 128 | 180.20Gbps | 173,508 | 256 | 180.27Gbps | 189,884 | ntttcp tests with 1024 Rx buffers --------------------------------------- connections| throughput| no_wqe errs| --------------------------------------- 1 | 44.22Gbps | 19,864 | 16 | 180.19Gbps | 4,430 | 128 | 180.21Gbps | 2,560 | 256 | 180.29Gbps | 1,529 | So, increasing the default RX buffers per queue count to 1024 Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang Reviewed-by: Pavan Chebbi Link: https://patch.msgid.link/1727667875-29908-1-git-send-email-shradhagupta@linux.microsoft.com Signed-off-by: Paolo Abeni --- include/net/mana/mana.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index f2a5200d8a0f..9b0faa24b758 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -43,7 +43,7 @@ enum TRI_STATE { * size beyond this value gets rejected by __alloc_page() call. */ #define MAX_RX_BUFFERS_PER_QUEUE 8192 -#define DEF_RX_BUFFERS_PER_QUEUE 512 +#define DEF_RX_BUFFERS_PER_QUEUE 1024 #define MIN_RX_BUFFERS_PER_QUEUE 128 /* This max value for TX buffers is derived as the maximum allocatable -- 2.51.0 From b63ad06ddddfe792f93df0c24adb66622bd7b8c9 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Mon, 30 Sep 2024 11:39:54 -0400 Subject: [PATCH 06/16] doc: net: napi: Update documentation for napi_schedule_irqoff Since commit 8380c81d5c4f ("net: Treat __napi_schedule_irqoff() as __napi_schedule() on PREEMPT_RT"), napi_schedule_irqoff will do the right thing if IRQs are threaded. Therefore, there is no need to use IRQF_NO_THREAD. Signed-off-by: Sean Anderson Reviewed-by: Bagas Sanjaya Reviewed-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20240930153955.971657-1-sean.anderson@linux.dev Signed-off-by: Paolo Abeni --- Documentation/networking/napi.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/napi.rst b/Documentation/networking/napi.rst index 7bf7b95c4f7a..dfa5d549be9c 100644 --- a/Documentation/networking/napi.rst +++ b/Documentation/networking/napi.rst @@ -144,9 +144,8 @@ IRQ should only be unmasked after a successful call to napi_complete_done(): napi_schedule_irqoff() is a variant of napi_schedule() which takes advantage of guarantees given by being invoked in IRQ context (no need to -mask interrupts). Note that PREEMPT_RT forces all interrupts -to be threaded so the interrupt may need to be marked ``IRQF_NO_THREAD`` -to avoid issues on real-time kernel configurations. +mask interrupts). napi_schedule_irqoff() will fall back to napi_schedule() if +IRQs are threaded (such as if ``PREEMPT_RT`` is enabled). Instance to queue mapping ------------------------- -- 2.51.0 From c6929644c1e0d6108e57061d427eb966e1746351 Mon Sep 17 00:00:00 2001 From: Ravikanth Tuniki Date: Tue, 1 Oct 2024 00:43:35 +0530 Subject: [PATCH 07/16] dt-bindings: net: xlnx,axi-ethernet: Add missing reg minItems Add missing reg minItems as based on current binding document only ethernet MAC IO space is a supported configuration. There is a bug in schema, current examples contain 64-bit addressing as well as 32-bit addressing. The schema validation does pass incidentally considering one 64-bit reg address as two 32-bit reg address entries. If we change axi_ethernet_eth1 example node reg addressing to 32-bit schema validation reports: Documentation/devicetree/bindings/net/xlnx,axi-ethernet.example.dtb: ethernet@40000000: reg: [[1073741824, 262144]] is too short To fix it add missing reg minItems constraints and to make things clearer stick to 32-bit addressing in examples. Fixes: cbb1ca6d5f9a ("dt-bindings: net: xlnx,axi-ethernet: convert bindings document to yaml") Signed-off-by: Ravikanth Tuniki Signed-off-by: Radhey Shyam Pandey Acked-by: Conor Dooley Link: https://patch.msgid.link/1727723615-2109795-1-git-send-email-radhey.shyam.pandey@amd.com Signed-off-by: Paolo Abeni --- Documentation/devicetree/bindings/net/xlnx,axi-ethernet.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/net/xlnx,axi-ethernet.yaml b/Documentation/devicetree/bindings/net/xlnx,axi-ethernet.yaml index bbe89ea9590c..e95c21628281 100644 --- a/Documentation/devicetree/bindings/net/xlnx,axi-ethernet.yaml +++ b/Documentation/devicetree/bindings/net/xlnx,axi-ethernet.yaml @@ -34,6 +34,7 @@ properties: and length of the AXI DMA controller IO space, unless axistream-connected is specified, in which case the reg attribute of the node referenced by it is used. + minItems: 1 maxItems: 2 interrupts: @@ -181,7 +182,7 @@ examples: clock-names = "s_axi_lite_clk", "axis_clk", "ref_clk", "mgt_clk"; clocks = <&axi_clk>, <&axi_clk>, <&pl_enet_ref_clk>, <&mgt_clk>; phy-mode = "mii"; - reg = <0x00 0x40000000 0x00 0x40000>; + reg = <0x40000000 0x40000>; xlnx,rxcsum = <0x2>; xlnx,rxmem = <0x800>; xlnx,txcsum = <0x2>; -- 2.51.0 From 8beee4d8dee76b67c75dc91fd8185d91e845c160 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 30 Sep 2024 16:49:51 -0400 Subject: [PATCH 08/16] sctp: set sk_state back to CLOSED if autobind fails in sctp_listen_start In sctp_listen_start() invoked by sctp_inet_listen(), it should set the sk_state back to CLOSED if sctp_autobind() fails due to whatever reason. Otherwise, next time when calling sctp_inet_listen(), if sctp_sk(sk)->reuse is already set via setsockopt(SCTP_REUSE_PORT), sctp_sk(sk)->bind_hash will be dereferenced as sk_state is LISTENING, which causes a crash as bind_hash is NULL. KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] RIP: 0010:sctp_inet_listen+0x7f0/0xa20 net/sctp/socket.c:8617 Call Trace: __sys_listen_socket net/socket.c:1883 [inline] __sys_listen+0x1b7/0x230 net/socket.c:1894 __do_sys_listen net/socket.c:1902 [inline] Fixes: 5e8f3f703ae4 ("sctp: simplify sctp listening code") Reported-by: syzbot+f4e0f821e3a3b7cee51d@syzkaller.appspotmail.com Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Link: https://patch.msgid.link/a93e655b3c153dc8945d7a812e6d8ab0d52b7aa0.1727729391.git.lucien.xin@gmail.com Signed-off-by: Paolo Abeni --- net/sctp/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 32f76f1298da..078bcb3858c7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -8557,8 +8557,10 @@ static int sctp_listen_start(struct sock *sk, int backlog) */ inet_sk_set_state(sk, SCTP_SS_LISTENING); if (!ep->base.bind_addr.port) { - if (sctp_autobind(sk)) + if (sctp_autobind(sk)) { + inet_sk_set_state(sk, SCTP_SS_CLOSED); return -EAGAIN; + } } else { if (sctp_get_port(sk, inet_sk(sk)->inet_num)) { inet_sk_set_state(sk, SCTP_SS_CLOSED); -- 2.51.0 From c30a3f54e661d01df2bf193398336155089dd502 Mon Sep 17 00:00:00 2001 From: Erni Sri Satya Vennela Date: Sun, 29 Sep 2024 22:42:14 -0700 Subject: [PATCH 09/16] net: mana: Add get_link and get_link_ksettings in ethtool Add support for the ethtool get_link and get_link_ksettings operations. Display standard port information using ethtool. Before the change: $ethtool enP30832s1 > No data available After the change: $ethtool enP30832s1 > Settings for enP30832s1: Supported ports: [ ] Supported link modes: Not reported Supported pause frame use: No Supports auto-negotiation: No Supported FEC modes: Not reported Advertised link modes: Not reported Advertised pause frame use: No Advertised auto-negotiation: No Advertised FEC modes: Not reported Speed: Unknown! Duplex: Full Auto-negotiation: off Port: Other PHYAD: 0 Transceiver: internal Link detected: yes Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Haiyang Zhang Reviewed-by: Shradha Gupta Reviewed-by: Simon Horman Link: https://patch.msgid.link/1727674934-12130-1-git-send-email-ernis@linux.microsoft.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c index dc3864377538..349f11bf8e64 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c @@ -443,6 +443,15 @@ out: return err; } +static int mana_get_link_ksettings(struct net_device *ndev, + struct ethtool_link_ksettings *cmd) +{ + cmd->base.duplex = DUPLEX_FULL; + cmd->base.port = PORT_OTHER; + + return 0; +} + const struct ethtool_ops mana_ethtool_ops = { .get_ethtool_stats = mana_get_ethtool_stats, .get_sset_count = mana_get_sset_count, @@ -456,4 +465,6 @@ const struct ethtool_ops mana_ethtool_ops = { .set_channels = mana_set_channels, .get_ringparam = mana_get_ringparam, .set_ringparam = mana_set_ringparam, + .get_link_ksettings = mana_get_link_ksettings, + .get_link = ethtool_op_get_link, }; -- 2.51.0 From 6c959fd5e17387201dba3619b2e6af213939a0a7 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Mon, 30 Sep 2024 02:58:54 -0700 Subject: [PATCH 10/16] netfilter: Make legacy configs user selectable This option makes legacy Netfilter Kconfig user selectable, giving users the option to configure iptables without enabling any other config. Make the following KConfig entries user selectable: * BRIDGE_NF_EBTABLES_LEGACY * IP_NF_ARPTABLES * IP_NF_IPTABLES_LEGACY * IP6_NF_IPTABLES_LEGACY Signed-off-by: Breno Leitao Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/Kconfig | 8 +++++++- net/ipv4/netfilter/Kconfig | 16 ++++++++++++++-- net/ipv6/netfilter/Kconfig | 9 ++++++++- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 104c0125e32e..f16bbbbb9481 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -41,7 +41,13 @@ config NF_CONNTRACK_BRIDGE # old sockopt interface and eval loop config BRIDGE_NF_EBTABLES_LEGACY - tristate + tristate "Legacy EBTABLES support" + depends on BRIDGE && NETFILTER_XTABLES + default n + help + Legacy ebtables packet/frame classifier. + This is not needed if you are using ebtables over nftables + (iptables-nft). menuconfig BRIDGE_NF_EBTABLES tristate "Ethernet Bridge tables (ebtables) support" diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 1b991b889506..ef8009281da5 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -12,7 +12,13 @@ config NF_DEFRAG_IPV4 # old sockopt interface and eval loop config IP_NF_IPTABLES_LEGACY - tristate + tristate "Legacy IP tables support" + default n + select NETFILTER_XTABLES + help + iptables is a legacy packet classifier. + This is not needed if you are using iptables over nftables + (iptables-nft). config NF_SOCKET_IPV4 tristate "IPv4 socket lookup support" @@ -318,7 +324,13 @@ endif # IP_NF_IPTABLES # ARP tables config IP_NF_ARPTABLES - tristate + tristate "Legacy ARPTABLES support" + depends on NETFILTER_XTABLES + default n + help + arptables is a legacy packet classifier. + This is not needed if you are using arptables over nftables + (iptables-nft). config NFT_COMPAT_ARP tristate diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index f3c8e2d918e1..e087a8e97ba7 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -8,7 +8,14 @@ menu "IPv6: Netfilter Configuration" # old sockopt interface and eval loop config IP6_NF_IPTABLES_LEGACY - tristate + tristate "Legacy IP6 tables support" + depends on INET && IPV6 + select NETFILTER_XTABLES + default n + help + ip6tables is a legacy packet classifier. + This is not needed if you are using iptables over nftables + (iptables-nft). config NF_SOCKET_IPV6 tristate "IPv6 socket lookup support" -- 2.51.0 From 0741f55593547d7f25ec003b355a21d6d5fef01e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Thu, 29 Aug 2024 17:29:32 +0200 Subject: [PATCH 11/16] netfilter: nf_tables: Fix percpu address space issues in nf_tables_api.c Compiling nf_tables_api.c results in several sparse warnings: nf_tables_api.c:2077:31: warning: incorrect type in return expression (different address spaces) nf_tables_api.c:2080:31: warning: incorrect type in return expression (different address spaces) nf_tables_api.c:2084:31: warning: incorrect type in return expression (different address spaces) nf_tables_api.c:2740:23: warning: incorrect type in assignment (different address spaces) nf_tables_api.c:2752:38: warning: incorrect type in assignment (different address spaces) nf_tables_api.c:2798:21: warning: incorrect type in argument 1 (different address spaces) Use {ERR_PTR,IS_ERR,PTR_ERR}_PCPU() macros when crossing between generic and percpu address spaces and add __percpu annotation to *stats pointer to fix these warnings. Found by GCC's named address space checks. There were no changes in the resulting object files. Signed-off-by: Uros Bizjak Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a24fe62650a7..6552ec616745 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2082,14 +2082,14 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) err = nla_parse_nested_deprecated(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy, NULL); if (err < 0) - return ERR_PTR(err); + return ERR_PTR_PCPU(err); if (!tb[NFTA_COUNTER_BYTES] || !tb[NFTA_COUNTER_PACKETS]) - return ERR_PTR(-EINVAL); + return ERR_PTR_PCPU(-EINVAL); newstats = netdev_alloc_pcpu_stats(struct nft_stats); if (newstats == NULL) - return ERR_PTR(-ENOMEM); + return ERR_PTR_PCPU(-ENOMEM); /* Restore old counters on this cpu, no problem. Per-cpu statistics * are not exposed to userspace. @@ -2533,10 +2533,10 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (nla[NFTA_CHAIN_COUNTERS]) { stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); - if (IS_ERR(stats)) { + if (IS_ERR_PCPU(stats)) { nft_chain_release_hook(&hook); kfree(basechain); - return PTR_ERR(stats); + return PTR_ERR_PCPU(stats); } rcu_assign_pointer(basechain->stats, stats); } @@ -2650,7 +2650,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, struct nft_table *table = ctx->table; struct nft_chain *chain = ctx->chain; struct nft_chain_hook hook = {}; - struct nft_stats *stats = NULL; + struct nft_stats __percpu *stats = NULL; struct nft_hook *h, *next; struct nf_hook_ops *ops; struct nft_trans *trans; @@ -2746,8 +2746,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, } stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); - if (IS_ERR(stats)) { - err = PTR_ERR(stats); + if (IS_ERR_PCPU(stats)) { + err = PTR_ERR_PCPU(stats); goto err_hooks; } } -- 2.51.0 From 544dded8cb6317c2d3ecf4bba8412e616e70bb86 Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Mon, 9 Sep 2024 15:48:39 -0700 Subject: [PATCH 12/16] netfilter: nf_tables: replace deprecated strncpy with strscpy_pad strncpy() is deprecated for use on NUL-terminated destination strings [1] and as such we should prefer more robust and less ambiguous string interfaces. In this particular instance, the usage of strncpy() is fine and works as expected. However, towards the goal of [2], we should consider replacing it with an alternative as many instances of strncpy() are bug-prone. Its removal from the kernel promotes better long term health for the codebase. The current usage of strncpy() likely just wants the NUL-padding behavior offered by strncpy() and doesn't care about the NUL-termination. Since the compiler doesn't know the size of @dest, we can't use strtomem_pad(). Instead, use strscpy_pad() which behaves functionally the same as strncpy() in this context -- as we expect br_dev->name to be NUL-terminated itself. Link: https://www.kernel.org/doc/html/latest/process/deprecated.html#strncpy-on-nul-terminated-strings [1] Link: https://github.com/KSPP/linux/issues/90 [2] Link: https://manpages.debian.org/testing/linux-manual-4.8/strscpy.9.en.html Cc: Kees Cook Cc: linux-hardening@vger.kernel.org Signed-off-by: Justin Stitt Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/nft_meta_bridge.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index d12a221366d6..5adced1e7d0c 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -63,7 +63,7 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, return nft_meta_get_eval(expr, regs, pkt); } - strncpy((char *)dest, br_dev ? br_dev->name : "", IFNAMSIZ); + strscpy_pad((char *)dest, br_dev ? br_dev->name : "", IFNAMSIZ); return; err: regs->verdict.code = NFT_BREAK; -- 2.51.0 From 08e52cccae11a4148a5810f0bd5614796994d221 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Jul 2024 15:08:16 +0200 Subject: [PATCH 13/16] netfilter: nf_tables: prefer nft_trans_elem_alloc helper Reduce references to sizeof(struct nft_trans_elem). Preparation patch to move this to a flexiable array to store elem references. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6552ec616745..30331688301e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -6409,7 +6409,7 @@ err: nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); } -static struct nft_trans *nft_trans_elem_alloc(struct nft_ctx *ctx, +static struct nft_trans *nft_trans_elem_alloc(const struct nft_ctx *ctx, int msg_type, struct nft_set *set) { @@ -7471,13 +7471,11 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx, { struct nft_trans *trans; - trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, - sizeof(struct nft_trans_elem), GFP_KERNEL); + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (!trans) return -ENOMEM; nft_setelem_data_deactivate(ctx->net, set, elem_priv); - nft_trans_elem_set(trans) = set; nft_trans_elem_priv(trans) = elem_priv; nft_trans_commit_list_add_tail(ctx->net, trans); -- 2.51.0 From 9adbb4198bf6cf3634032871118a7052aeaa573f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Nov 2024 10:41:13 +0100 Subject: [PATCH 14/16] netfilter: nf_tables: avoid false-positive lockdep splat on rule deletion On rule delete we get: WARNING: suspicious RCU usage net/netfilter/nf_tables_api.c:3420 RCU-list traversed in non-reader section!! 1 lock held by iptables/134: #0: ffff888008c4fcc8 (&nft_net->commit_mutex){+.+.}-{3:3}, at: nf_tables_valid_genid (include/linux/jiffies.h:101) nf_tables Code is fine, no other CPU can change the list because we're holding transaction mutex. Pass the needed lockdep annotation to the iterator and fix two comments for functions that are no longer restricted to rcu-only context. This is enough to resolve rule delete, but there are several other missing annotations, added in followup-patches. Fixes: 28875945ba98 ("rcu: Add support for consolidated-RCU reader checking") Reported-by: Matthieu Baerts Tested-by: Matthieu Baerts Closes: https://lore.kernel.org/netfilter-devel/da27f17f-3145-47af-ad0f-7fd2a823623e@kernel.org/ Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 30331688301e..80c285ac7e07 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3411,13 +3411,15 @@ void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr) * Rules */ -static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, +static struct nft_rule *__nft_rule_lookup(const struct net *net, + const struct nft_chain *chain, u64 handle) { struct nft_rule *rule; // FIXME: this sucks - list_for_each_entry_rcu(rule, &chain->rules, list) { + list_for_each_entry_rcu(rule, &chain->rules, list, + lockdep_commit_lock_is_held(net)) { if (handle == rule->handle) return rule; } @@ -3425,13 +3427,14 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, return ERR_PTR(-ENOENT); } -static struct nft_rule *nft_rule_lookup(const struct nft_chain *chain, +static struct nft_rule *nft_rule_lookup(const struct net *net, + const struct nft_chain *chain, const struct nlattr *nla) { if (nla == NULL) return ERR_PTR(-EINVAL); - return __nft_rule_lookup(chain, be64_to_cpu(nla_get_be64(nla))); + return __nft_rule_lookup(net, chain, be64_to_cpu(nla_get_be64(nla))); } static const struct nla_policy nft_rule_policy[NFTA_RULE_MAX + 1] = { @@ -3732,7 +3735,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) return 0; } -/* called with rcu_read_lock held */ +/* Caller must hold rcu read lock or transaction mutex */ static struct sk_buff * nf_tables_getrule_single(u32 portid, const struct nfnl_info *info, const struct nlattr * const nla[], bool reset) @@ -3759,7 +3762,7 @@ nf_tables_getrule_single(u32 portid, const struct nfnl_info *info, return ERR_CAST(chain); } - rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return ERR_CAST(rule); @@ -4057,7 +4060,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_RULE_HANDLE]) { handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_HANDLE])); - rule = __nft_rule_lookup(chain, handle); + rule = __nft_rule_lookup(net, chain, handle); if (IS_ERR(rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_HANDLE]); return PTR_ERR(rule); @@ -4079,7 +4082,7 @@ static int nf_tables_newrule(struct sk_buff *skb, const struct nfnl_info *info, if (nla[NFTA_RULE_POSITION]) { pos_handle = be64_to_cpu(nla_get_be64(nla[NFTA_RULE_POSITION])); - old_rule = __nft_rule_lookup(chain, pos_handle); + old_rule = __nft_rule_lookup(net, chain, pos_handle); if (IS_ERR(old_rule)) { NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); return PTR_ERR(old_rule); @@ -4296,7 +4299,7 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, if (chain) { if (nla[NFTA_RULE_HANDLE]) { - rule = nft_rule_lookup(chain, nla[NFTA_RULE_HANDLE]); + rule = nft_rule_lookup(info->net, chain, nla[NFTA_RULE_HANDLE]); if (IS_ERR(rule)) { if (PTR_ERR(rule) == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYRULE) @@ -8101,7 +8104,7 @@ static int nf_tables_dump_obj_done(struct netlink_callback *cb) return 0; } -/* called with rcu_read_lock held */ +/* Caller must hold rcu read lock or transaction mutex */ static struct sk_buff * nf_tables_getobj_single(u32 portid, const struct nfnl_info *info, const struct nlattr * const nla[], bool reset) -- 2.51.0 From 8f5f3786dba765099f12da00e6be0c26f69f2fbd Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Nov 2024 10:41:14 +0100 Subject: [PATCH 15/16] netfilter: nf_tables: avoid false-positive lockdep splats with sets Same as previous patch. All set handling functions here can be called with transaction mutex held (but not the rcu read lock). The transaction mutex prevents concurrent add/delete, so this is fine. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 80c285ac7e07..a51731d76401 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3986,7 +3986,8 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set) struct nft_set_ext *ext; int ret = 0; - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, dummy_iter.genmask)) continue; @@ -4459,7 +4460,8 @@ static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = { [NFTA_SET_DESC_CONCAT] = NLA_POLICY_NESTED_ARRAY(nft_concat_policy), }; -static struct nft_set *nft_set_lookup(const struct nft_table *table, +static struct nft_set *nft_set_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -4467,7 +4469,8 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table, if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry_rcu(set, &table->sets, list) { + list_for_each_entry_rcu(set, &table->sets, list, + lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, set->name) && nft_active_genmask(set, genmask)) return set; @@ -4517,7 +4520,7 @@ struct nft_set *nft_set_lookup_global(const struct net *net, { struct nft_set *set; - set = nft_set_lookup(table, nla_set_name, genmask); + set = nft_set_lookup(net, table, nla_set_name, genmask); if (IS_ERR(set)) { if (!nla_set_id) return set; @@ -4893,7 +4896,7 @@ static int nf_tables_getset(struct sk_buff *skb, const struct nfnl_info *info, if (!nla[NFTA_SET_TABLE]) return -EINVAL; - set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); return PTR_ERR(set); @@ -5229,7 +5232,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); - set = nft_set_lookup(table, nla[NFTA_SET_NAME], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_NAME], genmask); if (IS_ERR(set)) { if (PTR_ERR(set) != -ENOENT) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_NAME]); @@ -5431,7 +5434,7 @@ static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, set = nft_set_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_SET_NAME]; - set = nft_set_lookup(table, attr, genmask); + set = nft_set_lookup(net, table, attr, genmask); } if (IS_ERR(set)) { @@ -5495,7 +5498,8 @@ static int nft_set_catchall_bind_check(const struct nft_ctx *ctx, struct nft_set_ext *ext; int ret = 0; - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; @@ -6261,7 +6265,7 @@ static int nft_set_dump_ctx_init(struct nft_set_dump_ctx *dump_ctx, return PTR_ERR(table); } - set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); @@ -7493,7 +7497,8 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, struct nft_set_ext *ext; int ret = 0; - list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + list_for_each_entry_rcu(catchall, &set->catchall_list, list, + lockdep_commit_lock_is_held(ctx->net)) { ext = nft_set_elem_ext(set, catchall->elem); if (!nft_set_elem_active(ext, genmask)) continue; @@ -7543,7 +7548,7 @@ static int nf_tables_delsetelem(struct sk_buff *skb, return PTR_ERR(table); } - set = nft_set_lookup(table, nla[NFTA_SET_ELEM_LIST_SET], genmask); + set = nft_set_lookup(net, table, nla[NFTA_SET_ELEM_LIST_SET], genmask); if (IS_ERR(set)) { NL_SET_BAD_ATTR(extack, nla[NFTA_SET_ELEM_LIST_SET]); return PTR_ERR(set); -- 2.51.0 From b3e8f29d6b45e865bab9a9964709ff7413e33e85 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 4 Nov 2024 10:41:15 +0100 Subject: [PATCH 16/16] netfilter: nf_tables: avoid false-positive lockdep splats with flowtables The transaction mutex prevents concurrent add/delete, its ok to iterate those lists outside of rcu read side critical sections. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- net/netfilter/nf_tables_api.c | 15 +++++++++------ net/netfilter/nft_flow_offload.c | 4 ++-- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 91ae20cb7648..c1513bd14568 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1463,7 +1463,8 @@ struct nft_flowtable { struct nf_flowtable data; }; -struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, +struct nft_flowtable *nft_flowtable_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index a51731d76401..9e367e134691 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8378,12 +8378,14 @@ static const struct nla_policy nft_flowtable_policy[NFTA_FLOWTABLE_MAX + 1] = { [NFTA_FLOWTABLE_FLAGS] = { .type = NLA_U32 }, }; -struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, +struct nft_flowtable *nft_flowtable_lookup(const struct net *net, + const struct nft_table *table, const struct nlattr *nla, u8 genmask) { struct nft_flowtable *flowtable; - list_for_each_entry_rcu(flowtable, &table->flowtables, list) { + list_for_each_entry_rcu(flowtable, &table->flowtables, list, + lockdep_commit_lock_is_held(net)) { if (!nla_strcmp(nla, flowtable->name) && nft_active_genmask(flowtable, genmask)) return flowtable; @@ -8739,7 +8741,7 @@ static int nf_tables_newflowtable(struct sk_buff *skb, return PTR_ERR(table); } - flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { err = PTR_ERR(flowtable); @@ -8933,7 +8935,7 @@ static int nf_tables_delflowtable(struct sk_buff *skb, flowtable = nft_flowtable_lookup_byhandle(table, attr, genmask); } else { attr = nla[NFTA_FLOWTABLE_NAME]; - flowtable = nft_flowtable_lookup(table, attr, genmask); + flowtable = nft_flowtable_lookup(net, table, attr, genmask); } if (IS_ERR(flowtable)) { @@ -9003,7 +9005,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, if (!hook_list) hook_list = &flowtable->hook_list; - list_for_each_entry_rcu(hook, hook_list, list) { + list_for_each_entry_rcu(hook, hook_list, list, + lockdep_commit_lock_is_held(net)) { if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name)) goto nla_put_failure; } @@ -9145,7 +9148,7 @@ static int nf_tables_getflowtable(struct sk_buff *skb, return PTR_ERR(table); } - flowtable = nft_flowtable_lookup(table, nla[NFTA_FLOWTABLE_NAME], + flowtable = nft_flowtable_lookup(net, table, nla[NFTA_FLOWTABLE_NAME], genmask); if (IS_ERR(flowtable)) { NL_SET_BAD_ATTR(extack, nla[NFTA_FLOWTABLE_NAME]); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 2f732fae5a83..65199c23c75c 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -409,8 +409,8 @@ static int nft_flow_offload_init(const struct nft_ctx *ctx, if (!tb[NFTA_FLOW_TABLE_NAME]) return -EINVAL; - flowtable = nft_flowtable_lookup(ctx->table, tb[NFTA_FLOW_TABLE_NAME], - genmask); + flowtable = nft_flowtable_lookup(ctx->net, ctx->table, + tb[NFTA_FLOW_TABLE_NAME], genmask); if (IS_ERR(flowtable)) return PTR_ERR(flowtable); -- 2.51.0