From 6fd27ea183c208e478129a85e11d880fc70040f2 Mon Sep 17 00:00:00 2001 From: "D. Wythe" Date: Wed, 9 Oct 2024 14:55:16 +0800 Subject: [PATCH 01/16] net/smc: fix lacks of icsk_syn_mss with IPPROTO_SMC Eric reported a panic on IPPROTO_SMC and pointed out that when INET_PROTOSW_ICSK is set, icsk->icsk_sync_mss must be set too. Bug: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 Mem abort info: ESR = 0x0000000086000005 EC = 0x21: IABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x05: level 1 translation fault user pgtable: 4k pages, 48-bit VAs, pgdp=00000001195d1000 [0000000000000000] pgd=0800000109c46003, p4d=0800000109c46003, pud=0000000000000000 Internal error: Oops: 0000000086000005 [#1] PREEMPT SMP Modules linked in: CPU: 1 UID: 0 PID: 8037 Comm: syz.3.265 Not tainted 6.11.0-rc7-syzkaller-g5f5673607153 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024 pstate: 80400005 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : 0x0 lr : cipso_v4_sock_setattr+0x2a8/0x3c0 net/ipv4/cipso_ipv4.c:1910 sp : ffff80009b887a90 x29: ffff80009b887aa0 x28: ffff80008db94050 x27: 0000000000000000 x26: 1fffe0001aa6f5b3 x25: dfff800000000000 x24: ffff0000db75da00 x23: 0000000000000000 x22: ffff0000d8b78518 x21: 0000000000000000 x20: ffff0000d537ad80 x19: ffff0000d8b78000 x18: 1fffe000366d79ee x17: ffff8000800614a8 x16: ffff800080569b84 x15: 0000000000000001 x14: 000000008b336894 x13: 00000000cd96feaa x12: 0000000000000003 x11: 0000000000040000 x10: 00000000000020a3 x9 : 1fffe0001b16f0f1 x8 : 0000000000000000 x7 : 0000000000000000 x6 : 000000000000003f x5 : 0000000000000040 x4 : 0000000000000001 x3 : 0000000000000000 x2 : 0000000000000002 x1 : 0000000000000000 x0 : ffff0000d8b78000 Call trace: 0x0 netlbl_sock_setattr+0x2e4/0x338 net/netlabel/netlabel_kapi.c:1000 smack_netlbl_add+0xa4/0x154 security/smack/smack_lsm.c:2593 smack_socket_post_create+0xa8/0x14c security/smack/smack_lsm.c:2973 security_socket_post_create+0x94/0xd4 security/security.c:4425 __sock_create+0x4c8/0x884 net/socket.c:1587 sock_create net/socket.c:1622 [inline] __sys_socket_create net/socket.c:1659 [inline] __sys_socket+0x134/0x340 net/socket.c:1706 __do_sys_socket net/socket.c:1720 [inline] __se_sys_socket net/socket.c:1718 [inline] __arm64_sys_socket+0x7c/0x94 net/socket.c:1718 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:132 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:151 el0_svc+0x54/0x168 arch/arm64/kernel/entry-common.c:712 el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730 el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598 Code: ???????? ???????? ???????? ???????? (????????) ---[ end trace 0000000000000000 ]--- This patch adds a toy implementation that performs a simple return to prevent such a panic. This is because MSS can be set in sock_create_kern or smc_setsockopt, similar to how it's done in AF_SMC. However, for AF_SMC, there is currently no way to synchronize MSS within __sys_connect_file. This toy implementation lays the groundwork for us to support such a feature for IPPROTO_SMC in the future. Fixes: d25a92ccae6b ("net/smc: Introduce IPPROTO_SMC") Reported-by: Eric Dumazet Signed-off-by: D.
Wythe Reviewed-by: Eric Dumazet Reviewed-by: Wenjia Zhang Link: https://patch.msgid.link/1728456916-67035-1-git-send-email-alibuda@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/smc/smc_inet.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/smc/smc_inet.c b/net/smc/smc_inet.c index a5b2041600f9..a944e7dcb8b9 100644 --- a/net/smc/smc_inet.c +++ b/net/smc/smc_inet.c @@ -108,12 +108,23 @@ static struct inet_protosw smc_inet6_protosw = { }; #endif /* CONFIG_IPV6 */ +static unsigned int smc_sync_mss(struct sock *sk, u32 pmtu) +{ + /* No need pass it through to clcsock, mss can always be set by + * sock_create_kern or smc_setsockopt. + */ + return 0; +} + static int smc_inet_init_sock(struct sock *sk) { struct net *net = sock_net(sk); /* init common smc sock */ smc_sk_init(net, sk, IPPROTO_SMC); + + inet_csk(sk)->icsk_sync_mss = smc_sync_mss; + /* create clcsock */ return smc_create_clcsk(net, sk, sk->sk_family); } -- 2.50.1 From 7d3fce8cbe3a70a1c7c06c9b53696be5d5d8dd5c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2024 09:11:32 +0000 Subject: [PATCH 02/16] slip: make slhc_remember() more robust against malicious packets syzbot found that slhc_remember() was missing checks against malicious packets [1]. slhc_remember() only checked the size of the packet was at least 20, which is not good enough. We need to make sure the packet includes the IPv4 and TCP header that are supposed to be carried. Add iph and th pointers to make the code more readable. [1] BUG: KMSAN: uninit-value in slhc_remember+0x2e8/0x7b0 drivers/net/slip/slhc.c:666 slhc_remember+0x2e8/0x7b0 drivers/net/slip/slhc.c:666 ppp_receive_nonmp_frame+0xe45/0x35e0 drivers/net/ppp/ppp_generic.c:2455 ppp_receive_frame drivers/net/ppp/ppp_generic.c:2372 [inline] ppp_do_recv+0x65f/0x40d0 drivers/net/ppp/ppp_generic.c:2212 ppp_input+0x7dc/0xe60 drivers/net/ppp/ppp_generic.c:2327 pppoe_rcv_core+0x1d3/0x720 drivers/net/ppp/pppoe.c:379 sk_backlog_rcv+0x13b/0x420 include/net/sock.h:1113 __release_sock+0x1da/0x330 net/core/sock.c:3072 release_sock+0x6b/0x250 net/core/sock.c:3626 pppoe_sendmsg+0x2b8/0xb90 drivers/net/ppp/pppoe.c:903 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:744 ____sys_sendmsg+0x903/0xb60 net/socket.c:2602 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2656 __sys_sendmmsg+0x3c1/0x960 net/socket.c:2742 __do_sys_sendmmsg net/socket.c:2771 [inline] __se_sys_sendmmsg net/socket.c:2768 [inline] __x64_sys_sendmmsg+0xbc/0x120 net/socket.c:2768 x64_sys_call+0xb6e/0x3ba0 arch/x86/include/generated/asm/syscalls_64.h:308 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was created at: slab_post_alloc_hook mm/slub.c:4091 [inline] slab_alloc_node mm/slub.c:4134 [inline] kmem_cache_alloc_node_noprof+0x6bf/0xb80 mm/slub.c:4186 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:587 __alloc_skb+0x363/0x7b0 net/core/skbuff.c:678 alloc_skb include/linux/skbuff.h:1322 [inline] sock_wmalloc+0xfe/0x1a0 net/core/sock.c:2732 pppoe_sendmsg+0x3a7/0xb90 drivers/net/ppp/pppoe.c:867 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:744 ____sys_sendmsg+0x903/0xb60 net/socket.c:2602 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2656 __sys_sendmmsg+0x3c1/0x960 net/socket.c:2742 __do_sys_sendmmsg net/socket.c:2771 [inline] __se_sys_sendmmsg net/socket.c:2768 [inline] __x64_sys_sendmmsg+0xbc/0x120 net/socket.c:2768 x64_sys_call+0xb6e/0x3ba0 
arch/x86/include/generated/asm/syscalls_64.h:308 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f CPU: 0 UID: 0 PID: 5460 Comm: syz.2.33 Not tainted 6.12.0-rc2-syzkaller-00006-g87d6aab2389e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Fixes: b5451d783ade ("slip: Move the SLIP drivers") Reported-by: syzbot+2ada1bc857496353be5a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/670646db.050a0220.3f80e.0027.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241009091132.2136321-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- drivers/net/slip/slhc.c | 57 ++++++++++++++++++++++++----------------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/drivers/net/slip/slhc.c b/drivers/net/slip/slhc.c index 252cd757d3a2..ee9fd3a94b96 100644 --- a/drivers/net/slip/slhc.c +++ b/drivers/net/slip/slhc.c @@ -643,46 +643,57 @@ bad: int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) { - struct cstate *cs; - unsigned ihl; - + const struct tcphdr *th; unsigned char index; + struct iphdr *iph; + struct cstate *cs; + unsigned int ihl; - if(isize < 20) { - /* The packet is shorter than a legal IP header */ + /* The packet is shorter than a legal IP header. + * Also make sure isize is positive. + */ + if (isize < (int)sizeof(struct iphdr)) { +runt: comp->sls_i_runt++; - return slhc_toss( comp ); + return slhc_toss(comp); } + iph = (struct iphdr *)icp; /* Peek at the IP header's IHL field to find its length */ - ihl = icp[0] & 0xf; - if(ihl < 20 / 4){ - /* The IP header length field is too small */ - comp->sls_i_runt++; - return slhc_toss( comp ); - } - index = icp[9]; - icp[9] = IPPROTO_TCP; + ihl = iph->ihl; + /* The IP header length field is too small, + * or packet is shorter than the IP header followed + * by minimal tcp header. 
+ */ + if (ihl < 5 || isize < ihl * 4 + sizeof(struct tcphdr)) + goto runt; + + index = iph->protocol; + iph->protocol = IPPROTO_TCP; if (ip_fast_csum(icp, ihl)) { /* Bad IP header checksum; discard */ comp->sls_i_badcheck++; - return slhc_toss( comp ); + return slhc_toss(comp); } - if(index > comp->rslot_limit) { + if (index > comp->rslot_limit) { comp->sls_i_error++; return slhc_toss(comp); } - + th = (struct tcphdr *)(icp + ihl * 4); + if (th->doff < sizeof(struct tcphdr) / 4) + goto runt; + if (isize < ihl * 4 + th->doff * 4) + goto runt; /* Update local state */ cs = &comp->rstate[comp->recv_current = index]; comp->flags &=~ SLF_TOSS; - memcpy(&cs->cs_ip,icp,20); - memcpy(&cs->cs_tcp,icp + ihl*4,20); + memcpy(&cs->cs_ip, iph, sizeof(*iph)); + memcpy(&cs->cs_tcp, th, sizeof(*th)); if (ihl > 5) - memcpy(cs->cs_ipopt, icp + sizeof(struct iphdr), (ihl - 5) * 4); - if (cs->cs_tcp.doff > 5) - memcpy(cs->cs_tcpopt, icp + ihl*4 + sizeof(struct tcphdr), (cs->cs_tcp.doff - 5) * 4); - cs->cs_hsize = ihl*2 + cs->cs_tcp.doff*2; + memcpy(cs->cs_ipopt, &iph[1], (ihl - 5) * 4); + if (th->doff > 5) + memcpy(cs->cs_tcpopt, &th[1], (th->doff - 5) * 4); + cs->cs_hsize = ihl*2 + th->doff*2; cs->initialized = true; /* Put headers back on packet * Neither header checksum is recalculated -- 2.50.1 From 9937aae39bc09645cd67d53e0320926cd91570de Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 9 Oct 2024 09:47:22 +0100 Subject: [PATCH 03/16] MAINTAINERS: consistently exclude wireless files from NETWORKING [GENERAL] We already exclude wireless drivers from the netdev@ traffic, to delegate it to linux-wireless@, and avoid overwhelming netdev@. Many of the following wireless-related sections MAINTAINERS are already not included in the NETWORKING [GENERAL] section. For consistency, exclude those that are. * 802.11 (including CFG80211/NL80211) * MAC80211 * RFKILL Acked-by: Johannes Berg Signed-off-by: Simon Horman Link: https://patch.msgid.link/20241009-maint-net-hdrs-v2-1-f2c86e7309c8@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 58380aeafbf0..89789bddfb6a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16197,8 +16197,19 @@ F: lib/random32.c F: net/ F: tools/net/ F: tools/testing/selftests/net/ +X: Documentation/networking/mac80211-injection.rst +X: Documentation/networking/mac80211_hwsim/ +X: Documentation/networking/regulatory.rst +X: include/net/cfg80211.h +X: include/net/ieee80211_radiotap.h +X: include/net/iw_handler.h +X: include/net/mac80211.h +X: include/net/wext.h X: net/9p/ X: net/bluetooth/ +X: net/mac80211/ +X: net/rfkill/ +X: net/wireless/ NETWORKING [IPSEC] M: Steffen Klassert -- 2.50.1 From 5404b5a2fea9831a1f5be4ab9a94de07d976b177 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 9 Oct 2024 09:47:23 +0100 Subject: [PATCH 04/16] MAINTAINERS: Add headers and mailing list to UDP section Add netdev mailing list and some more udp.h headers to the UDP section. This is now more consistent with the TCP section. 
Acked-by: Willem de Bruijn Signed-off-by: Simon Horman Link: https://patch.msgid.link/20241009-maint-net-hdrs-v2-2-f2c86e7309c8@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 89789bddfb6a..baf2ba9dc070 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -24184,8 +24184,12 @@ F: drivers/usb/host/xhci* USER DATAGRAM PROTOCOL (UDP) M: Willem de Bruijn +L: netdev@vger.kernel.org S: Maintained F: include/linux/udp.h +F: include/net/udp.h +F: include/trace/events/udp.h +F: include/uapi/linux/udp.h F: net/ipv4/udp.c F: net/ipv6/udp.c -- 2.50.1 From f7345ccc62a4b880cf76458db5f320725f28e400 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 10 Oct 2024 18:36:09 +0200 Subject: [PATCH 05/16] rcu/nocb: Fix rcuog wake-up from offline softirq After a CPU has set itself offline and before it eventually calls rcutree_report_cpu_dead(), there are still opportunities for callbacks to be enqueued, for example from a softirq. When that happens on NOCB, the rcuog wake-up is deferred through an IPI to an online CPU in order not to call into the scheduler and risk arming the RT-bandwidth after hrtimers have been migrated out and disabled. But performing a synchronized IPI from a softirq is buggy as reported in the following scenario: WARNING: CPU: 1 PID: 26 at kernel/smp.c:633 smp_call_function_single Modules linked in: rcutorture torture CPU: 1 UID: 0 PID: 26 Comm: migration/1 Not tainted 6.11.0-rc1-00012-g9139f93209d1 #1 Stopper: multi_cpu_stop+0x0/0x320 <- __stop_cpus+0xd0/0x120 RIP: 0010:smp_call_function_single swake_up_one_online __call_rcu_nocb_wake __call_rcu_common ? rcu_torture_one_read call_timer_fn __run_timers run_timer_softirq handle_softirqs irq_exit_rcu ? tick_handle_periodic sysvec_apic_timer_interrupt Fix this with forcing deferred rcuog wake up through the NOCB timer when the CPU is offline. The actual wake up will happen from rcutree_report_cpu_dead(). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202409231644.4c55582d-lkp@intel.com Fixes: 9139f93209d1 ("rcu/nocb: Fix RT throttling hrtimer armed from offline CPU") Reviewed-by: "Joel Fernandes (Google)" Signed-off-by: Frederic Weisbecker Signed-off-by: Neeraj Upadhyay --- kernel/rcu/tree_nocb.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 97b99cd06923..16865475120b 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -554,13 +554,19 @@ static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone, rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_LAZY, TPS("WakeLazy")); - } else if (!irqs_disabled_flags(flags)) { + } else if (!irqs_disabled_flags(flags) && cpu_online(rdp->cpu)) { /* ... if queue was empty ... */ rcu_nocb_unlock(rdp); wake_nocb_gp(rdp, false); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeEmpty")); } else { + /* + * Don't do the wake-up upfront on fragile paths. + * Also offline CPUs can't call swake_up_one_online() from + * (soft-)IRQs. Rely on the final deferred wake-up from + * rcutree_report_cpu_dead() + */ rcu_nocb_unlock(rdp); wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE, TPS("WakeEmptyIsDeferred")); -- 2.50.1 From 6e0391e48cf9fb8b1b5e27c0cbbaf2e4639f2c33 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 9 Oct 2024 13:41:31 -0700 Subject: [PATCH 06/16] of: Skip kunit tests when arm64+ACPI doesn't populate root node A root node is required to apply DT overlays. 
A root node is usually present after commit 7b937cc243e5 ("of: Create of_root if no dtb provided by firmware"), except for on arm64 systems booted with ACPI tables. In that case, the root node is intentionally not populated because it would "allow DT devices to be instantiated atop an ACPI base system"[1]. Introduce an OF function that skips the kunit test if the root node isn't populated. Limit the test to when both CONFIG_ARM64 and CONFIG_ACPI are set, because otherwise the lack of a root node is a bug. Make the function private and take a kunit test parameter so that it can't be abused to test for the presence of the root node in non-test code. Use this function to skip tests that require the root node. Currently that's the DT tests and any tests that apply overlays. Reported-by: Guenter Roeck Closes: https://lore.kernel.org/r/6cd337fb-38f0-41cb-b942-5844b84433db@roeck-us.net Link: https://lore.kernel.org/r/Zd4dQpHO7em1ji67@FVFF77S0Q05N.cambridge.arm.com [1] Fixes: 893ecc6d2d61 ("of: Add KUnit test to confirm DTB is loaded") Signed-off-by: Stephen Boyd Tested-by: Guenter Roeck Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20241009204133.1169931-1-sboyd@kernel.org Signed-off-by: Rob Herring (Arm) --- drivers/of/of_kunit_helpers.c | 15 +++++++++++++++ drivers/of/of_private.h | 3 +++ drivers/of/of_test.c | 3 +++ drivers/of/overlay_test.c | 3 +++ 4 files changed, 24 insertions(+) diff --git a/drivers/of/of_kunit_helpers.c b/drivers/of/of_kunit_helpers.c index 287d6c91bb37..7b3ed5a382aa 100644 --- a/drivers/of/of_kunit_helpers.c +++ b/drivers/of/of_kunit_helpers.c @@ -10,6 +10,19 @@ #include #include +#include "of_private.h" + +/** + * of_root_kunit_skip() - Skip test if the root node isn't populated + * @test: test to skip if the root node isn't populated + */ +void of_root_kunit_skip(struct kunit *test) +{ + if (IS_ENABLED(CONFIG_ARM64) && IS_ENABLED(CONFIG_ACPI) && !of_root) + kunit_skip(test, "arm64+acpi doesn't populate a root node"); +} +EXPORT_SYMBOL_GPL(of_root_kunit_skip); + #if defined(CONFIG_OF_OVERLAY) && defined(CONFIG_OF_EARLY_FLATTREE) static void of_overlay_fdt_apply_kunit_exit(void *ovcs_id) @@ -36,6 +49,8 @@ int of_overlay_fdt_apply_kunit(struct kunit *test, void *overlay_fdt, int ret; int *copy_id; + of_root_kunit_skip(test); + copy_id = kunit_kmalloc(test, sizeof(*copy_id), GFP_KERNEL); if (!copy_id) return -ENOMEM; diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h index 04aa2a91f851..c235d6c909a1 100644 --- a/drivers/of/of_private.h +++ b/drivers/of/of_private.h @@ -42,6 +42,9 @@ extern raw_spinlock_t devtree_lock; extern struct list_head aliases_lookup; extern struct kset *of_kset; +struct kunit; +extern void of_root_kunit_skip(struct kunit *test); + #if defined(CONFIG_OF_DYNAMIC) extern int of_property_notify(int action, struct device_node *np, struct property *prop, struct property *old_prop); diff --git a/drivers/of/of_test.c b/drivers/of/of_test.c index c85a258bc6ae..b0557ded838f 100644 --- a/drivers/of/of_test.c +++ b/drivers/of/of_test.c @@ -7,6 +7,8 @@ #include +#include "of_private.h" + /* * Test that the root node "/" can be found by path. 
*/ @@ -36,6 +38,7 @@ static struct kunit_case of_dtb_test_cases[] = { static int of_dtb_test_init(struct kunit *test) { + of_root_kunit_skip(test); if (!IS_ENABLED(CONFIG_OF_EARLY_FLATTREE)) kunit_skip(test, "requires CONFIG_OF_EARLY_FLATTREE"); diff --git a/drivers/of/overlay_test.c b/drivers/of/overlay_test.c index 19695bdf77be..1f76d50fb16a 100644 --- a/drivers/of/overlay_test.c +++ b/drivers/of/overlay_test.c @@ -11,6 +11,8 @@ #include #include +#include "of_private.h" + static const char * const kunit_node_name = "kunit-test"; static const char * const kunit_compatible = "test,empty"; @@ -62,6 +64,7 @@ static void of_overlay_apply_kunit_cleanup(struct kunit *test) struct device *dev; struct device_node *np; + of_root_kunit_skip(test); if (!IS_ENABLED(CONFIG_OF_EARLY_FLATTREE)) kunit_skip(test, "requires CONFIG_OF_EARLY_FLATTREE for root node"); -- 2.50.1 From 8956c582ac6b1693a351230179f898979dd00bdf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Sat, 5 Oct 2024 10:53:29 +0200 Subject: [PATCH 07/16] powerpc/8xx: Fix kernel DTLB miss on dcbz Following OOPS is encountered while loading test_bpf module on powerpc 8xx: [ 218.835567] BUG: Unable to handle kernel data access on write at 0xcb000000 [ 218.842473] Faulting instruction address: 0xc0017a80 [ 218.847451] Oops: Kernel access of bad area, sig: 11 [#1] [ 218.852854] BE PAGE_SIZE=16K PREEMPT CMPC885 [ 218.857207] SAF3000 DIE NOTIFICATION [ 218.860713] Modules linked in: test_bpf(+) test_module [ 218.865867] CPU: 0 UID: 0 PID: 527 Comm: insmod Not tainted 6.11.0-s3k-dev-09856-g3de3d71ae2e6-dirty #1280 [ 218.875546] Hardware name: MIAE 8xx 0x500000 CMPC885 [ 218.880521] NIP: c0017a80 LR: beab859c CTR: 000101d4 [ 218.885584] REGS: cac2bc90 TRAP: 0300 Not tainted (6.11.0-s3k-dev-09856-g3de3d71ae2e6-dirty) [ 218.894308] MSR: 00009032 CR: 55005555 XER: a0007100 [ 218.901290] DAR: cb000000 DSISR: c2000000 [ 218.901290] GPR00: 000185d1 cac2bd50 c21b9580 caf7c030 c3883fcc 00000008 cafffffc 00000000 [ 218.901290] GPR08: 00040000 18300000 20000000 00000004 99005555 100d815e ca669d08 00000369 [ 218.901290] GPR16: ca730000 00000000 ca2c004c 00000000 00000000 0000035d 00000311 00000369 [ 218.901290] GPR24: ca732240 00000001 00030ba3 c3800000 00000000 00185d48 caf7c000 ca2c004c [ 218.941087] NIP [c0017a80] memcpy+0x88/0xec [ 218.945277] LR [beab859c] test_bpf_init+0x22c/0x3c90 [test_bpf] [ 218.951476] Call Trace: [ 218.953916] [cac2bd50] [beab8570] test_bpf_init+0x200/0x3c90 [test_bpf] (unreliable) [ 218.962034] [cac2bde0] [c0004c04] do_one_initcall+0x4c/0x1fc [ 218.967706] [cac2be40] [c00a2ec4] do_init_module+0x68/0x360 [ 218.973292] [cac2be60] [c00a5194] init_module_from_file+0x8c/0xc0 [ 218.979401] [cac2bed0] [c00a5568] sys_finit_module+0x250/0x3f0 [ 218.985248] [cac2bf20] [c000e390] system_call_exception+0x8c/0x15c [ 218.991444] [cac2bf30] [c00120a8] ret_from_syscall+0x0/0x28 This happens in the main loop of memcpy() ==> c0017a80: 7c 0b 37 ec dcbz r11,r6 c0017a84: 80 e4 00 04 lwz r7,4(r4) c0017a88: 81 04 00 08 lwz r8,8(r4) c0017a8c: 81 24 00 0c lwz r9,12(r4) c0017a90: 85 44 00 10 lwzu r10,16(r4) c0017a94: 90 e6 00 04 stw r7,4(r6) c0017a98: 91 06 00 08 stw r8,8(r6) c0017a9c: 91 26 00 0c stw r9,12(r6) c0017aa0: 95 46 00 10 stwu r10,16(r6) c0017aa4: 42 00 ff dc bdnz c0017a80 Commit ac9f97ff8b32 ("powerpc/8xx: Inconditionally use task PGDIR in DTLB misses") relies on re-reading DAR register to know if an error is due to a missing copy of a PMD entry in task's PGDIR, allthough DAR was already read in the exception prolog and copied into thread 
struct. This is because is it done very early in the exception and there are not enough registers available to keep a pointer to thread struct. However, dcbz instruction is buggy and doesn't update DAR register on fault. That is detected and generates a call to FixupDAR workaround which updates DAR copy in thread struct but doesn't fix DAR register. Let's fix DAR in addition to the update of DAR copy in thread struct. Fixes: ac9f97ff8b32 ("powerpc/8xx: Inconditionally use task PGDIR in DTLB misses") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/2b851399bd87e81c6ccb87ea3a7a6b32c7aa04d7.1728118396.git.christophe.leroy@csgroup.eu --- arch/powerpc/kernel/head_8xx.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/head_8xx.S b/arch/powerpc/kernel/head_8xx.S index 811a7130505c..56c5ebe21b99 100644 --- a/arch/powerpc/kernel/head_8xx.S +++ b/arch/powerpc/kernel/head_8xx.S @@ -494,6 +494,7 @@ FixupDAR:/* Entry point for dcbx workaround. */ bctr /* jump into table */ 152: mfdar r11 + mtdar r10 mtctr r11 /* restore ctr reg from DAR */ mfspr r11, SPRN_SPRG_THREAD stw r10, DAR(r11) -- 2.50.1 From a0cc649353bb726d4aa0db60dce467432197b746 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 8 Oct 2024 21:28:01 -0400 Subject: [PATCH 08/16] selftests/rseq: Fix mm_cid test failure Adapt the rseq.c/rseq.h code to follow GNU C library changes introduced by: glibc commit 2e456ccf0c34 ("Linux: Make __rseq_size useful for feature detection (bug 31965)") Without this fix, rseq selftests for mm_cid fail: ./run_param_test.sh Default parameters Running test spinlock Running compare-twice test spinlock Running mm_cid test spinlock Error: cpu id getter unavailable Fixes: 18c2355838e7 ("selftests/rseq: Implement rseq mm_cid field support") Signed-off-by: Mathieu Desnoyers Cc: Peter Zijlstra CC: Boqun Feng CC: "Paul E. McKenney" Cc: Shuah Khan CC: Carlos O'Donell CC: Florian Weimer CC: linux-kselftest@vger.kernel.org CC: stable@vger.kernel.org Signed-off-by: Shuah Khan --- tools/testing/selftests/rseq/rseq.c | 110 +++++++++++++++++++--------- tools/testing/selftests/rseq/rseq.h | 10 +-- 2 files changed, 77 insertions(+), 43 deletions(-) diff --git a/tools/testing/selftests/rseq/rseq.c b/tools/testing/selftests/rseq/rseq.c index 96e812bdf8a4..5b9772cdf265 100644 --- a/tools/testing/selftests/rseq/rseq.c +++ b/tools/testing/selftests/rseq/rseq.c @@ -60,12 +60,6 @@ unsigned int rseq_size = -1U; /* Flags used during rseq registration. */ unsigned int rseq_flags; -/* - * rseq feature size supported by the kernel. 0 if the registration was - * unsuccessful. - */ -unsigned int rseq_feature_size = -1U; - static int rseq_ownership; static int rseq_reg_success; /* At least one rseq registration has succeded. */ @@ -111,6 +105,43 @@ int rseq_available(void) } } +/* The rseq areas need to be at least 32 bytes. */ +static +unsigned int get_rseq_min_alloc_size(void) +{ + unsigned int alloc_size = rseq_size; + + if (alloc_size < ORIG_RSEQ_ALLOC_SIZE) + alloc_size = ORIG_RSEQ_ALLOC_SIZE; + return alloc_size; +} + +/* + * Return the feature size supported by the kernel. + * + * Depending on the value returned by getauxval(AT_RSEQ_FEATURE_SIZE): + * + * 0: Return ORIG_RSEQ_FEATURE_SIZE (20) + * > 0: Return the value from getauxval(AT_RSEQ_FEATURE_SIZE). + * + * It should never return a value below ORIG_RSEQ_FEATURE_SIZE. 
+ */ +static +unsigned int get_rseq_kernel_feature_size(void) +{ + unsigned long auxv_rseq_feature_size, auxv_rseq_align; + + auxv_rseq_align = getauxval(AT_RSEQ_ALIGN); + assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE); + + auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE); + assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE); + if (auxv_rseq_feature_size) + return auxv_rseq_feature_size; + else + return ORIG_RSEQ_FEATURE_SIZE; +} + int rseq_register_current_thread(void) { int rc; @@ -119,7 +150,7 @@ int rseq_register_current_thread(void) /* Treat libc's ownership as a successful registration. */ return 0; } - rc = sys_rseq(&__rseq_abi, rseq_size, 0, RSEQ_SIG); + rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), 0, RSEQ_SIG); if (rc) { if (RSEQ_READ_ONCE(rseq_reg_success)) { /* Incoherent success/failure within process. */ @@ -140,28 +171,12 @@ int rseq_unregister_current_thread(void) /* Treat libc's ownership as a successful unregistration. */ return 0; } - rc = sys_rseq(&__rseq_abi, rseq_size, RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); + rc = sys_rseq(&__rseq_abi, get_rseq_min_alloc_size(), RSEQ_ABI_FLAG_UNREGISTER, RSEQ_SIG); if (rc) return -1; return 0; } -static -unsigned int get_rseq_feature_size(void) -{ - unsigned long auxv_rseq_feature_size, auxv_rseq_align; - - auxv_rseq_align = getauxval(AT_RSEQ_ALIGN); - assert(!auxv_rseq_align || auxv_rseq_align <= RSEQ_THREAD_AREA_ALLOC_SIZE); - - auxv_rseq_feature_size = getauxval(AT_RSEQ_FEATURE_SIZE); - assert(!auxv_rseq_feature_size || auxv_rseq_feature_size <= RSEQ_THREAD_AREA_ALLOC_SIZE); - if (auxv_rseq_feature_size) - return auxv_rseq_feature_size; - else - return ORIG_RSEQ_FEATURE_SIZE; -} - static __attribute__((constructor)) void rseq_init(void) { @@ -178,28 +193,54 @@ void rseq_init(void) } if (libc_rseq_size_p && libc_rseq_offset_p && libc_rseq_flags_p && *libc_rseq_size_p != 0) { + unsigned int libc_rseq_size; + /* rseq registration owned by glibc */ rseq_offset = *libc_rseq_offset_p; - rseq_size = *libc_rseq_size_p; + libc_rseq_size = *libc_rseq_size_p; rseq_flags = *libc_rseq_flags_p; - rseq_feature_size = get_rseq_feature_size(); - if (rseq_feature_size > rseq_size) - rseq_feature_size = rseq_size; + + /* + * Previous versions of glibc expose the value + * 32 even though the kernel only supported 20 + * bytes initially. Therefore treat 32 as a + * special-case. glibc 2.40 exposes a 20 bytes + * __rseq_size without using getauxval(3) to + * query the supported size, while still allocating a 32 + * bytes area. Also treat 20 as a special-case. + * + * Special-cases are handled by using the following + * value as active feature set size: + * + * rseq_size = min(32, get_rseq_kernel_feature_size()) + */ + switch (libc_rseq_size) { + case ORIG_RSEQ_FEATURE_SIZE: + fallthrough; + case ORIG_RSEQ_ALLOC_SIZE: + { + unsigned int rseq_kernel_feature_size = get_rseq_kernel_feature_size(); + + if (rseq_kernel_feature_size < ORIG_RSEQ_ALLOC_SIZE) + rseq_size = rseq_kernel_feature_size; + else + rseq_size = ORIG_RSEQ_ALLOC_SIZE; + break; + } + default: + /* Otherwise just use the __rseq_size from libc as rseq_size. 
*/ + rseq_size = libc_rseq_size; + break; + } return; } rseq_ownership = 1; if (!rseq_available()) { rseq_size = 0; - rseq_feature_size = 0; return; } rseq_offset = (void *)&__rseq_abi - rseq_thread_pointer(); rseq_flags = 0; - rseq_feature_size = get_rseq_feature_size(); - if (rseq_feature_size == ORIG_RSEQ_FEATURE_SIZE) - rseq_size = ORIG_RSEQ_ALLOC_SIZE; - else - rseq_size = RSEQ_THREAD_AREA_ALLOC_SIZE; } static __attribute__((destructor)) @@ -209,7 +250,6 @@ void rseq_exit(void) return; rseq_offset = 0; rseq_size = -1U; - rseq_feature_size = -1U; rseq_ownership = 0; } diff --git a/tools/testing/selftests/rseq/rseq.h b/tools/testing/selftests/rseq/rseq.h index d7364ea4d201..4e217b620e0c 100644 --- a/tools/testing/selftests/rseq/rseq.h +++ b/tools/testing/selftests/rseq/rseq.h @@ -68,12 +68,6 @@ extern unsigned int rseq_size; /* Flags used during rseq registration. */ extern unsigned int rseq_flags; -/* - * rseq feature size supported by the kernel. 0 if the registration was - * unsuccessful. - */ -extern unsigned int rseq_feature_size; - enum rseq_mo { RSEQ_MO_RELAXED = 0, RSEQ_MO_CONSUME = 1, /* Unused */ @@ -193,7 +187,7 @@ static inline uint32_t rseq_current_cpu(void) static inline bool rseq_node_id_available(void) { - return (int) rseq_feature_size >= rseq_offsetofend(struct rseq_abi, node_id); + return (int) rseq_size >= rseq_offsetofend(struct rseq_abi, node_id); } /* @@ -207,7 +201,7 @@ static inline uint32_t rseq_current_node_id(void) static inline bool rseq_mm_cid_available(void) { - return (int) rseq_feature_size >= rseq_offsetofend(struct rseq_abi, mm_cid); + return (int) rseq_size >= rseq_offsetofend(struct rseq_abi, mm_cid); } static inline uint32_t rseq_current_mm_cid(void) -- 2.50.1 From 4ee5ca9a29384fcf3f18232fdf8474166dea8dca Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Oct 2024 16:52:35 -0400 Subject: [PATCH 09/16] ftrace/selftest: Test combination of function_graph tracer and function profiler Masami reported a bug when running function graph tracing and then the function profiler. The following commands would cause a kernel crash: # cd /sys/kernel/tracing/ # echo function_graph > current_tracer # echo 1 > function_profile_enabled In that order. Create a test to test these two to make sure this does not come back as a regression. Link: https://lore.kernel.org/172398528350.293426.8347220120333730248.stgit@devnote2 Link: https://lore.kernel.org/all/20241010165235.35122877@gandalf.local.home/ Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) Signed-off-by: Shuah Khan --- .../ftrace/test.d/ftrace/fgraph-profiler.tc | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 tools/testing/selftests/ftrace/test.d/ftrace/fgraph-profiler.tc diff --git a/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-profiler.tc b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-profiler.tc new file mode 100644 index 000000000000..ffff8646733c --- /dev/null +++ b/tools/testing/selftests/ftrace/test.d/ftrace/fgraph-profiler.tc @@ -0,0 +1,31 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# description: ftrace - function profiler with function graph tracing +# requires: function_profile_enabled set_ftrace_filter function_graph:tracer + +# The function graph tracer can now be run along side of the function +# profiler. But there was a bug that caused the combination of the two +# to crash. It also required the function graph tracer to be started +# first.
+# +# This test triggers that bug +# +# We need both function_graph and profiling to run this test + +fail() { # mesg + echo $1 + exit_fail +} + +echo "Enabling function graph tracer:" +echo function_graph > current_tracer +echo "enable profiler" + +# Older kernels do not allow function_profile to be enabled with +# function graph tracer. If the below fails, mark it as unsupported +echo 1 > function_profile_enabled || exit_unsupported + +# Let it run for a bit to make sure nothing explodes +sleep 1 + +exit 0 -- 2.50.1 From 8e929cb546ee42c9a61d24fae60605e9e3192354 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 13 Oct 2024 14:33:32 -0700 Subject: [PATCH 10/16] Linux 6.12-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c5493c0c0ca1..8cf3cf528892 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 12 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.50.1 From ad361ed4771da6aebb3ca6184a81ae4b8ad9f0b6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 22 Oct 2024 15:40:15 +0100 Subject: [PATCH 11/16] KVM: arm64: Just advertise SEIS as 0 when emulating ICC_CTLR_EL1 ICC_CTLR_EL1 accesses from a guest are trapped and emulated on systems with broken SEIS support and without FEAT_GICv3_TDIR. On such systems, we mask SEIS support in 'kvm_vgic_global_state.ich_vtr_el2' and so the value of ICC_CTLR_EL1.SEIS visible to the guest is always zero. Simplify the ICC_CTLR_EL1 read emulation to return 0 for the SEIS field, rather than reading an always-zero value from the global state. Cc: Marc Zyngier Cc: Oliver Upton Signed-off-by: Will Deacon Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241022144016.27350-2-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index 18d4677002b1..3f9741e51d41 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -1012,9 +1012,6 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt) val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT; /* IDbits */ val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT; - /* SEIS */ - if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) - val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT); /* A3V */ val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT; /* EOImode */ -- 2.50.1 From 8aaf3f7dce74605a2e9f31bd421825a996ac7de3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 22 Oct 2024 15:40:16 +0100 Subject: [PATCH 12/16] KVM: arm64: Don't map 'kvm_vgic_global_state' at EL2 with pKVM Now that 'kvm_vgic_global_state' is no longer needed for ICC_CTLR_EL1 emulation on machines with a broken SEIS implementation, drop the pKVM hypervisor mapping of the page. Note that kvm_vgic_global_state is still mapped in non-protected hypervisor configurations (i.e. {n,h}VHE) through the rodata section mapping. 
Cc: Marc Zyngier Cc: Oliver Upton Signed-off-by: Will Deacon Reviewed-by: Marc Zyngier Link: https://lore.kernel.org/r/20241022144016.27350-3-will@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/setup.c | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 174007f3fadd..8fec099c2775 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -95,7 +95,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, { void *start, *end, *virt = hyp_phys_to_virt(phys); unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT; - enum kvm_pgtable_prot prot; int ret, i; /* Recreate the hyp page-table using the early page allocator */ @@ -148,22 +147,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, } pkvm_create_host_sve_mappings(); - - /* - * Map the host sections RO in the hypervisor, but transfer the - * ownership from the host to the hypervisor itself to make sure they - * can't be donated or shared with another entity. - * - * The ownership transition requires matching changes in the host - * stage-2. This will be done later (see finalize_host_mappings()) once - * the hyp_vmemmap is addressable. - */ - prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); - ret = pkvm_create_mappings(&kvm_vgic_global_state, - &kvm_vgic_global_state + 1, prot); - if (ret) - return ret; - return 0; } -- 2.50.1 From 0546d4a925a6ce52b362e528f6d962efd72c84b9 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 18 Oct 2024 08:48:29 +0100 Subject: [PATCH 13/16] KVM: arm64: Move pkvm_vcpu_init_traps() to init_pkvm_hyp_vcpu() Move pkvm_vcpu_init_traps() to the initialization of the hypervisor's vcpu state in init_pkvm_hyp_vcpu(), and remove the associated hypercall. In protected mode, traps need to be initialized whenever a VCPU is initialized anyway, and not only for protected VMs. This also saves an unnecessary hypercall. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241018074833.2563674-2-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_asm.h | 1 - arch/arm64/kvm/arm.c | 8 -------- arch/arm64/kvm/hyp/include/nvhe/trap_handler.h | 2 -- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 8 -------- arch/arm64/kvm/hyp/nvhe/pkvm.c | 4 +++- 5 files changed, 3 insertions(+), 20 deletions(-) diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index b36a3b6cc011..7f7866d2bfc2 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -76,7 +76,6 @@ enum __kvm_host_smccc_func { __KVM_HOST_SMCCC_FUNC___kvm_timer_set_cntvoff, __KVM_HOST_SMCCC_FUNC___vgic_v3_save_vmcr_aprs, __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs, - __KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps, __KVM_HOST_SMCCC_FUNC___pkvm_init_vm, __KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu, __KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm, diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index a0d01c46e408..ece934bb4531 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -856,14 +856,6 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) static_branch_inc(&userspace_irqchip_in_use); } - /* - * Initialize traps for protected VMs. - * NOTE: Move to run in EL2 directly, rather than via a hypercall, once - * the code is in place for first run initialization at EL2. 
- */ - if (kvm_vm_is_protected(kvm)) - kvm_call_hyp_nvhe(__pkvm_vcpu_init_traps, vcpu); - mutex_lock(&kvm->arch.config_lock); set_bit(KVM_ARCH_FLAG_HAS_RAN_ONCE, &kvm->arch.flags); mutex_unlock(&kvm->arch.config_lock); diff --git a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h index 45a84f0ade04..1e6d995968a1 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h +++ b/arch/arm64/kvm/hyp/include/nvhe/trap_handler.h @@ -15,6 +15,4 @@ #define DECLARE_REG(type, name, ctxt, reg) \ type name = (type)cpu_reg(ctxt, (reg)) -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu); - #endif /* __ARM64_KVM_NVHE_TRAP_HANDLER_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index fefc89209f9e..1a224d5df207 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -349,13 +349,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } -static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt) -{ - DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1); - - __pkvm_vcpu_init_traps(kern_hyp_va(vcpu)); -} - static void handle___pkvm_init_vm(struct kvm_cpu_context *host_ctxt) { DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1); @@ -411,7 +404,6 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__kvm_timer_set_cntvoff), HANDLE_FUNC(__vgic_v3_save_vmcr_aprs), HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs), - HANDLE_FUNC(__pkvm_vcpu_init_traps), HANDLE_FUNC(__pkvm_init_vm), HANDLE_FUNC(__pkvm_init_vcpu), HANDLE_FUNC(__pkvm_teardown_vm), diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 077d4098548d..869955e551a0 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -204,7 +204,7 @@ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) /* * Initialize trap register values in protected mode. */ -void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) +static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) { pvm_init_trap_regs(vcpu); pvm_init_traps_aa64pfr0(vcpu); @@ -335,6 +335,8 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); + + pkvm_vcpu_init_traps(&hyp_vcpu->vcpu); done: if (ret) unpin_host_vcpu(host_vcpu); -- 2.50.1 From 3663b258f7231c7f3888c96e5c1eee33547873a6 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 18 Oct 2024 08:48:30 +0100 Subject: [PATCH 14/16] KVM: arm64: Refactor kvm_vcpu_enable_ptrauth() for hyp use Move kvm_vcpu_enable_ptrauth() to a shared header to be used by hypervisor code in protected mode. No functional change intended. 
Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241018074833.2563674-3-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/include/asm/kvm_emulate.h | 4 ++++ arch/arm64/kvm/reset.c | 5 ----- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h index a601a9305b10..9a68befbeecf 100644 --- a/arch/arm64/include/asm/kvm_emulate.h +++ b/arch/arm64/include/asm/kvm_emulate.h @@ -693,4 +693,8 @@ static inline bool guest_hyp_sve_traps_enabled(const struct kvm_vcpu *vcpu) return __guest_hyp_cptr_xen_trap_enabled(vcpu, ZEN); } +static inline void kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) +{ + vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); +} #endif /* __ARM64_KVM_EMULATE_H__ */ diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 0b0ae5ae7bc2..470524b31951 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -167,11 +167,6 @@ static void kvm_vcpu_reset_sve(struct kvm_vcpu *vcpu) memset(vcpu->arch.sve_state, 0, vcpu_sve_state_size(vcpu)); } -static void kvm_vcpu_enable_ptrauth(struct kvm_vcpu *vcpu) -{ - vcpu_set_flag(vcpu, GUEST_HAS_PTRAUTH); -} - /** * kvm_reset_vcpu - sets core registers and sys_regs to reset value * @vcpu: The VCPU pointer -- 2.50.1 From cb0c272acebdf099431c34972963377f83872a08 Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 18 Oct 2024 08:48:31 +0100 Subject: [PATCH 15/16] KVM: arm64: Initialize the hypervisor's VM state at EL2 Do not trust the state of the VM as provided by the host when initializing the hypervisor's view of the VM state. Initialize it instead at EL2 to a known good and safe state, as pKVM already does with hypervisor VCPU states. Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241018074833.2563674-4-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/pkvm.c | 77 ++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 869955e551a0..954df57a935f 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -6,6 +6,9 @@ #include #include + +#include + #include #include #include @@ -289,6 +292,65 @@ void pkvm_put_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_spin_unlock(&vm_table_lock); } +static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struct kvm *host_kvm) +{ + struct kvm *kvm = &hyp_vm->kvm; + DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* No restrictions for non-protected VMs.
*/ + if (!kvm_vm_is_protected(kvm)) { + bitmap_copy(kvm->arch.vcpu_features, + host_kvm->arch.vcpu_features, + KVM_VCPU_MAX_FEATURES); + return; + } + + bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES); + + /* + * For protected VMs, always allow: + * - CPU starting in poweroff state + * - PSCI v0.2 + */ + set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features); + set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features); + + /* + * Check if remaining features are allowed: + * - Performance Monitoring + * - Scalable Vectors + * - Pointer Authentication + */ + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_EL1_PMUVer), PVM_ID_AA64DFR0_ALLOW)) + set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_EL1_SVE), PVM_ID_AA64PFR0_ALLOW)) + set_bit(KVM_ARM_VCPU_SVE, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_API), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_APA), PVM_ID_AA64ISAR1_RESTRICT_UNSIGNED)) + set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features); + + if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPI), PVM_ID_AA64ISAR1_ALLOW) && + FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_EL1_GPA), PVM_ID_AA64ISAR1_ALLOW)) + set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features); + + bitmap_and(kvm->arch.vcpu_features, host_kvm->arch.vcpu_features, + allowed_features, KVM_VCPU_MAX_FEATURES); +} + +static void pkvm_vcpu_init_ptrauth(struct pkvm_hyp_vcpu *hyp_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + if (vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_ADDRESS) || + vcpu_has_feature(vcpu, KVM_ARM_VCPU_PTRAUTH_GENERIC)) { + kvm_vcpu_enable_ptrauth(vcpu); + } else { + vcpu_clear_flag(&hyp_vcpu->vcpu, GUEST_HAS_PTRAUTH); + } +} + static void unpin_host_vcpu(struct kvm_vcpu *host_vcpu) { if (host_vcpu) @@ -310,6 +372,18 @@ static void init_pkvm_hyp_vm(struct kvm *host_kvm, struct pkvm_hyp_vm *hyp_vm, hyp_vm->host_kvm = host_kvm; hyp_vm->kvm.created_vcpus = nr_vcpus; hyp_vm->kvm.arch.mmu.vtcr = host_mmu.arch.mmu.vtcr; + hyp_vm->kvm.arch.pkvm.enabled = READ_ONCE(host_kvm->arch.pkvm.enabled); + pkvm_init_features_from_host(hyp_vm, host_kvm); +} + +static void pkvm_vcpu_init_sve(struct pkvm_hyp_vcpu *hyp_vcpu, struct kvm_vcpu *host_vcpu) +{ + struct kvm_vcpu *vcpu = &hyp_vcpu->vcpu; + + if (!vcpu_has_feature(vcpu, KVM_ARM_VCPU_SVE)) { + vcpu_clear_flag(vcpu, GUEST_HAS_SVE); + vcpu_clear_flag(vcpu, VCPU_SVE_FINALIZED); + } } static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, @@ -335,7 +409,10 @@ static int init_pkvm_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu, hyp_vcpu->vcpu.arch.hw_mmu = &hyp_vm->kvm.arch.mmu; hyp_vcpu->vcpu.arch.cflags = READ_ONCE(host_vcpu->arch.cflags); + hyp_vcpu->vcpu.arch.mp_state.mp_state = KVM_MP_STATE_STOPPED; + pkvm_vcpu_init_sve(hyp_vcpu, host_vcpu); + pkvm_vcpu_init_ptrauth(hyp_vcpu); pkvm_vcpu_init_traps(&hyp_vcpu->vcpu); done: if (ret) -- 2.50.1 From b56680de9c6489f2aed707fad2811e714b52fe4c Mon Sep 17 00:00:00 2001 From: Fuad Tabba Date: Fri, 18 Oct 2024 08:48:32 +0100 Subject: [PATCH 16/16] KVM: arm64: Initialize trap register values in hyp in pKVM Handle the initialization of trap registers at the hypervisor in pKVM, even for non-protected guests. The host is not trusted with the values of the trap registers, regardless of the VM type. Therefore, when switching between the host and the guests, only flush the HCR_EL2 TWI and TWE bits. The host is allowed to configure these for opportunistic scheduling, as neither affects the protection of VMs or the hypervisor. 
Reported-by: Will Deacon Fixes: 814ad8f96e92 ("KVM: arm64: Drop trapping of PAuth instructions/keys") Signed-off-by: Fuad Tabba Link: https://lore.kernel.org/r/20241018074833.2563674-5-tabba@google.com Signed-off-by: Oliver Upton --- arch/arm64/kvm/hyp/nvhe/hyp-main.c | 4 +++- arch/arm64/kvm/hyp/nvhe/pkvm.c | 35 ++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 1a224d5df207..6aa0b13d86e5 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -105,8 +105,10 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu) hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu; - hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2; hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2; + hyp_vcpu->vcpu.arch.hcr_el2 &= ~(HCR_TWI | HCR_TWE); + hyp_vcpu->vcpu.arch.hcr_el2 |= READ_ONCE(host_vcpu->arch.hcr_el2) & + (HCR_TWI | HCR_TWE); hyp_vcpu->vcpu.arch.iflags = host_vcpu->arch.iflags; diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c index 954df57a935f..01616c39a810 100644 --- a/arch/arm64/kvm/hyp/nvhe/pkvm.c +++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c @@ -204,11 +204,46 @@ static void pvm_init_trap_regs(struct kvm_vcpu *vcpu) } } +static void pkvm_vcpu_reset_hcr(struct kvm_vcpu *vcpu) +{ + vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS; + + if (has_hvhe()) + vcpu->arch.hcr_el2 |= HCR_E2H; + + if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN)) { + /* route synchronous external abort exceptions to EL2 */ + vcpu->arch.hcr_el2 |= HCR_TEA; + /* trap error record accesses */ + vcpu->arch.hcr_el2 |= HCR_TERR; + } + + if (cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) + vcpu->arch.hcr_el2 |= HCR_FWB; + + if (cpus_have_final_cap(ARM64_HAS_EVT) && + !cpus_have_final_cap(ARM64_MISMATCHED_CACHE_TYPE)) + vcpu->arch.hcr_el2 |= HCR_TID4; + else + vcpu->arch.hcr_el2 |= HCR_TID2; + + if (vcpu_has_ptrauth(vcpu)) + vcpu->arch.hcr_el2 |= (HCR_API | HCR_APK); +} + /* * Initialize trap register values in protected mode. */ static void pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu) { + vcpu->arch.cptr_el2 = kvm_get_reset_cptr_el2(vcpu); + vcpu->arch.mdcr_el2 = 0; + + pkvm_vcpu_reset_hcr(vcpu); + + if ((!vcpu_is_protected(vcpu))) + return; + pvm_init_trap_regs(vcpu); pvm_init_traps_aa64pfr0(vcpu); pvm_init_traps_aa64pfr1(vcpu); -- 2.50.1