From 711f9b8fbe4f4936302804e246e206f0829f628f Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Mon, 3 Feb 2025 23:32:05 +0100 Subject: [PATCH 01/16] fsnotify: disable pre-content and permission events by default After introducing pre-content events, we had a regression related to disabling huge faults on files that should never have pre-content events enabled. This happened because the default f_mode of allocated files (0) does not disable pre-content events. Pre-content events are disabled in file_set_fsnotify_mode_by_watchers() but internal files may not get to call this helper. Initialize f_mode to disable permission and pre-content events for all files and if needed they will be enabled for the callers of file_set_fsnotify_mode_by_watchers(). Fixes: 20bf82a898b6 ("mm: don't allow huge faults for files with pre content watches") Reported-by: Alex Williamson Closes: https://lore.kernel.org/linux-fsdevel/20250131121703.1e4d00a7.alex.williamson@redhat.com/ Tested-by: Alex Williamson Signed-off-by: Amir Goldstein Link: https://lore.kernel.org/r/20250203223205.861346-4-amir73il@gmail.com Signed-off-by: Christian Brauner --- fs/file_table.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/file_table.c b/fs/file_table.c index 35b93da6c5cb..5c00dc38558d 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -194,6 +194,11 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * refcount bumps we should reinitialize the reused file first. */ file_ref_init(&f->f_ref, 1); + /* + * Disable permission and pre-content events for all files by default. + * They may be enabled later by file_set_fsnotify_mode_from_watchers(). + */ + file_set_fsnotify_mode(f, FMODE_NONOTIFY_PERM); return 0; } -- 2.51.0 From 091ee63e36e8289f9067f659a48d497911e49d6f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Feb 2025 14:51:20 +0100 Subject: [PATCH 02/16] pidfs: improve ioctl handling Pidfs supports extensible and non-extensible ioctls. The extensible ioctls need to check for the ioctl number itself not just the ioctl command otherwise both backward- and forward compatibility are broken. The pidfs ioctl handler also needs to look at the type of the ioctl command to guard against cases where "[...] a daemon receives some random file descriptor from a (potentially less privileged) client and expects the FD to be of some specific type, it might call ioctl() on this FD with some type-specific command and expect the call to fail if the FD is of the wrong type; but due to the missing type check, the kernel instead performs some action that userspace didn't expect." (cf. [1]] Link: https://lore.kernel.org/r/20250204-work-pidfs-ioctl-v1-1-04987d239575@kernel.org Link: https://lore.kernel.org/r/CAG48ez2K9A5GwtgqO31u9ZL292we8ZwAA=TJwwEv7wRuJ3j4Lw@mail.gmail.com [1] Fixes: 8ce352818820 ("pidfs: check for valid ioctl commands") Acked-by: Luca Boccassi Reported-by: Jann Horn Cc: stable@vger.kernel.org # v6.13; please backport with 8ce352818820 ("pidfs: check for valid ioctl commands") Signed-off-by: Christian Brauner --- fs/pidfs.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/pidfs.c b/fs/pidfs.c index 049352f973de..63f9699ebac3 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -287,7 +287,6 @@ static bool pidfs_ioctl_valid(unsigned int cmd) switch (cmd) { case FS_IOC_GETVERSION: case PIDFD_GET_CGROUP_NAMESPACE: - case PIDFD_GET_INFO: case PIDFD_GET_IPC_NAMESPACE: case PIDFD_GET_MNT_NAMESPACE: case PIDFD_GET_NET_NAMESPACE: @@ -300,6 +299,17 @@ static bool pidfs_ioctl_valid(unsigned int cmd) return true; } + /* Extensible ioctls require some more careful checks. */ + switch (_IOC_NR(cmd)) { + case _IOC_NR(PIDFD_GET_INFO): + /* + * Try to prevent performing a pidfd ioctl when someone + * erronously mistook the file descriptor for a pidfd. + * This is not perfect but will catch most cases. + */ + return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); + } + return false; } -- 2.51.0 From 37d11cfc63604b3886308e2111d845d148ced8bc Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 4 Feb 2025 22:32:07 +0100 Subject: [PATCH 03/16] vfs: sanity check the length passed to inode_set_cached_link() This costs a strlen() call when instatianating a symlink. Preferably it would be hidden behind VFS_WARN_ON (or compatible), but there is no such facility at the moment. With the facility in place the call can be patched out in production kernels. In the meantime, since the cost is being paid unconditionally, use the result to a fixup the bad caller. This is not expected to persist in the long run (tm). Sample splat: bad length passed for symlink [/tmp/syz-imagegen43743633/file0/file0] (got 131109, expected 37) [rest of WARN blurp goes here] Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/r/20250204213207.337980-1-mjguzik@gmail.com Signed-off-by: Christian Brauner --- include/linux/fs.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index 7620547432a8..2c3b2f8a621f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -790,6 +790,19 @@ struct inode { static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { + int testlen; + + /* + * TODO: patch it into a debug-only check if relevant macros show up. + * In the meantime, since we are suffering strlen even on production kernels + * to find the right length, do a fixup if the wrong value got passed. + */ + testlen = strlen(link); + if (testlen != linklen) { + WARN_ONCE(1, "bad length passed for symlink [%s] (got %d, expected %d)", + link, linklen, testlen); + linklen = testlen; + } inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; -- 2.51.0 From 511121a48bbd12df4ae50a099a8936e833df8c46 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Wed, 5 Feb 2025 19:42:01 +0100 Subject: [PATCH 04/16] MAINTAINERS: Move Pavel to kernel.org address I need to filter my emails better, switch to pavel@kernel.org address to help with that. Signed-off-by: Pavel Machek Signed-off-by: Linus Torvalds --- CREDITS | 6 ++---- MAINTAINERS | 10 +++++----- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/CREDITS b/CREDITS index 1f9f0f078b4a..53d11a46fd69 100644 --- a/CREDITS +++ b/CREDITS @@ -2515,11 +2515,9 @@ D: SLS distribution D: Initial implementation of VC's, pty's and select() N: Pavel Machek -E: pavel@ucw.cz +E: pavel@kernel.org P: 4096R/92DFCE96 4FA7 9EEF FCD4 C44F C585 B8C7 C060 2241 92DF CE96 -D: Softcursor for vga, hypertech cdrom support, vcsa bugfix, nbd, -D: sun4/330 port, capabilities for elf, speedup for rm on ext2, USB, -D: work on suspend-to-ram/disk, killing duplicates from ioctl32, +D: NBD, Sun4/330 port, USB, work on suspend-to-ram/disk, D: Altera SoCFPGA and Nokia N900 support. S: Czech Republic diff --git a/MAINTAINERS b/MAINTAINERS index 873aa2cce4d7..157818de0b55 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9418,7 +9418,7 @@ F: fs/freevxfs/ FREEZER M: "Rafael J. Wysocki" -M: Pavel Machek +M: Pavel Machek L: linux-pm@vger.kernel.org S: Supported F: Documentation/power/freezing-of-tasks.rst @@ -10253,7 +10253,7 @@ F: drivers/video/fbdev/hgafb.c HIBERNATION (aka Software Suspend, aka swsusp) M: "Rafael J. Wysocki" -M: Pavel Machek +M: Pavel Machek L: linux-pm@vger.kernel.org S: Supported B: https://bugzilla.kernel.org @@ -13124,8 +13124,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/har F: scripts/leaking_addresses.pl LED SUBSYSTEM -M: Pavel Machek M: Lee Jones +M: Pavel Machek L: linux-leds@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/lee/leds.git @@ -16823,7 +16823,7 @@ F: include/linux/tick.h F: kernel/time/tick*.* NOKIA N900 CAMERA SUPPORT (ET8EK8 SENSOR, AD5820 FOCUS) -M: Pavel Machek +M: Pavel Machek M: Sakari Ailus L: linux-media@vger.kernel.org S: Maintained @@ -22849,7 +22849,7 @@ F: drivers/sh/ SUSPEND TO RAM M: "Rafael J. Wysocki" M: Len Brown -M: Pavel Machek +M: Pavel Machek L: linux-pm@vger.kernel.org S: Supported B: https://bugzilla.kernel.org -- 2.51.0 From 1b3291f00013c86a9bb349d6158a9a7a4f0334fe Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Fri, 7 Feb 2025 03:21:46 +0900 Subject: [PATCH 05/16] MAINTAINERS: Remove myself I no longer have any faith left in the kernel development process or community management approach. Apple/ARM platform development will continue downstream. If I feel like sending some patches upstream in the future myself for whatever subtree I may, or I may not. Anyone who feels like fighting the upstreaming fight themselves is welcome to do so. Signed-off-by: Hector Martin Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 157818de0b55..20c9e0871215 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2209,7 +2209,6 @@ F: sound/soc/codecs/cs42l84.* F: sound/soc/codecs/ssm3515.c ARM/APPLE MACHINE SUPPORT -M: Hector Martin M: Sven Peter R: Alyssa Rosenzweig L: asahi@lists.linux.dev -- 2.51.0 From f354fc88a72ae83dacd68370f6fa040e5733bcfe Mon Sep 17 00:00:00 2001 From: WangYuli Date: Fri, 7 Feb 2025 15:08:55 +0800 Subject: [PATCH 06/16] kbuild: install-extmod-build: add missing quotation marks for CC variable While attempting to build a Debian packages with CC="ccache gcc", I saw the following error as builddeb builds linux-headers-$KERNELVERSION: make HOSTCC=ccache gcc VPATH= srcroot=. -f ./scripts/Makefile.build obj=debian/linux-headers-6.14.0-rc1/usr/src/linux-headers-6.14.0-rc1/scripts make[6]: *** No rule to make target 'gcc'. Stop. Upon investigation, it seems that one instance of $(CC) variable reference in ./scripts/package/install-extmod-build was missing quotation marks, causing the above error. Add the missing quotation marks around $(CC) to fix build. Fixes: 5f73e7d0386d ("kbuild: refactor cross-compiling linux-headers package") Co-developed-by: Mingcong Bai Signed-off-by: Mingcong Bai Tested-by: WangYuli Signed-off-by: WangYuli Signed-off-by: Masahiro Yamada --- scripts/package/install-extmod-build | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/package/install-extmod-build b/scripts/package/install-extmod-build index bb6e23c1174e..b724626ea0ca 100755 --- a/scripts/package/install-extmod-build +++ b/scripts/package/install-extmod-build @@ -63,7 +63,7 @@ if [ "${CC}" != "${HOSTCC}" ]; then # Clear VPATH and srcroot because the source files reside in the output # directory. # shellcheck disable=SC2016 # $(MAKE), $(CC), and $(build) will be expanded by Make - "${MAKE}" run-command KBUILD_RUN_COMMAND='+$(MAKE) HOSTCC=$(CC) VPATH= srcroot=. $(build)='"${destdir}"/scripts + "${MAKE}" run-command KBUILD_RUN_COMMAND='+$(MAKE) HOSTCC="$(CC)" VPATH= srcroot=. $(build)='"${destdir}"/scripts rm -f "${destdir}/scripts/Kbuild" fi -- 2.51.0 From 8f6629c004b193d23612641c3607e785819e97ab Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 17 Oct 2024 10:09:22 -0700 Subject: [PATCH 07/16] kbuild: Move -Wenum-enum-conversion to W=2 -Wenum-enum-conversion was strengthened in clang-19 to warn for C, which caused the kernel to move it to W=1 in commit 75b5ab134bb5 ("kbuild: Move -Wenum-{compare-conditional,enum-conversion} into W=1") because there were numerous instances that would break builds with -Werror. Unfortunately, this is not a full solution, as more and more developers, subsystems, and distributors are building with W=1 as well, so they continue to see the numerous instances of this warning. Since the move to W=1, there have not been many new instances that have appeared through various build reports and the ones that have appeared seem to be following similar existing patterns, suggesting that most instances of this warning will not be real issues. The only alternatives for silencing this warning are adding casts (which is generally seen as an ugly practice) or refactoring the enums to macro defines or a unified enum (which may be undesirable because of type safety in other parts of the code). Move the warning to W=2, where warnings that occur frequently but may be relevant should reside. Cc: stable@vger.kernel.org Fixes: 75b5ab134bb5 ("kbuild: Move -Wenum-{compare-conditional,enum-conversion} into W=1") Link: https://lore.kernel.org/ZwRA9SOcOjjLJcpi@google.com/ Signed-off-by: Nathan Chancellor Acked-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- scripts/Makefile.extrawarn | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index eb719f6d8d53..a7003c1e66c7 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -133,7 +133,6 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast) KBUILD_CFLAGS += -Wno-tautological-constant-out-of-range-compare KBUILD_CFLAGS += $(call cc-disable-warning, unaligned-access) KBUILD_CFLAGS += -Wno-enum-compare-conditional -KBUILD_CFLAGS += -Wno-enum-enum-conversion endif endif @@ -157,6 +156,10 @@ KBUILD_CFLAGS += -Wno-missing-field-initializers KBUILD_CFLAGS += -Wno-type-limits KBUILD_CFLAGS += -Wno-shift-negative-value +ifdef CONFIG_CC_IS_CLANG +KBUILD_CFLAGS += -Wno-enum-enum-conversion +endif + ifdef CONFIG_CC_IS_GCC KBUILD_CFLAGS += -Wno-maybe-uninitialized endif -- 2.51.0 From c8c9b1d2d5b4377c72a979f5a26e842a869aefc9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 8 Feb 2025 00:15:11 -0500 Subject: [PATCH 08/16] fgraph: Fix set_graph_notrace with setting TRACE_GRAPH_NOTRACE_BIT The code was restructured where the function graph notrace code, that would not trace a function and all its children is done by setting a NOTRACE flag when the function that is not to be traced is hit. There's a TRACE_GRAPH_NOTRACE_BIT which defines the bit in the flags and a TRACE_GRAPH_NOTRACE which is the mask with that bit set. But the restructuring used TRACE_GRAPH_NOTRACE_BIT when it should have used TRACE_GRAPH_NOTRACE. For example: # cd /sys/kernel/tracing # echo set_track_prepare stack_trace_save > set_graph_notrace # echo function_graph > current_tracer # cat trace [..] 0) | __slab_free() { 0) | free_to_partial_list() { 0) | arch_stack_walk() { 0) | __unwind_start() { 0) 0.501 us | get_stack_info(); Where a non filter trace looks like: # echo > set_graph_notrace # cat trace 0) | free_to_partial_list() { 0) | set_track_prepare() { 0) | stack_trace_save() { 0) | arch_stack_walk() { 0) | __unwind_start() { Where the filter should look like: # cat trace 0) | free_to_partial_list() { 0) | _raw_spin_lock_irqsave() { 0) 0.350 us | preempt_count_add(); 0) 0.351 us | do_raw_spin_lock(); 0) 2.440 us | } Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250208001511.535be150@batman.local.home Fixes: b84214890a9bc ("function_graph: Move graph notrace bit to shadow stack global var") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_functions_graph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 54d850997c0a..136c750b0b4d 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -198,7 +198,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace, * returning from the function. */ if (ftrace_graph_notrace_addr(trace->func)) { - *task_var |= TRACE_GRAPH_NOTRACE_BIT; + *task_var |= TRACE_GRAPH_NOTRACE; /* * Need to return 1 to have the return called * that will clear the NOTRACE bit. -- 2.51.0 From 7585946243d614bd2cd4e13377be2c711c9539e0 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 8 Feb 2025 18:54:28 +0100 Subject: [PATCH 09/16] PM: sleep: core: Restrict power.set_active propagation Commit 3775fc538f53 ("PM: sleep: core: Synchronize runtime PM status of parents and children") exposed an issue related to simple_pm_bus_pm_ops that uses pm_runtime_force_suspend() and pm_runtime_force_resume() as bus type PM callbacks for the noirq phases of system-wide suspend and resume. The problem is that pm_runtime_force_suspend() does not distinguish runtime-suspended devices from devices for which runtime PM has never been enabled, so if it sees a device with runtime PM status set to RPM_ACTIVE, it will assume that runtime PM is enabled for that device and so it will attempt to suspend it with the help of its runtime PM callbacks which may not be ready for that. As it turns out, this causes simple_pm_bus_runtime_suspend() to crash due to a NULL pointer dereference. Another problem related to the above commit and simple_pm_bus_pm_ops is that setting runtime PM status of a device handled by the latter to RPM_ACTIVE will actually prevent it from being resumed because pm_runtime_force_resume() only resumes devices with runtime PM status set to RPM_SUSPENDED. To mitigate these issues, do not allow power.set_active to propagate beyond the parent of the device with DPM_FLAG_SMART_SUSPEND set that will need to be resumed, which should be a sufficient stop-gap for the time being, but they will need to be properly addressed in the future because in general during system-wide resume it is necessary to resume all devices in a dependency chain in which at least one device is going to be resumed. Fixes: 3775fc538f53 ("PM: sleep: core: Synchronize runtime PM status of parents and children") Closes: https://lore.kernel.org/linux-pm/1c2433d4-7e0f-4395-b841-b8eac7c25651@nvidia.com/ Reported-by: Jon Hunter Tested-by: Johan Hovold Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/6137505.lOV4Wx5bFT@rjwysocki.net --- drivers/base/power/main.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index d497d448e4b2..40e1d8d8a589 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -1191,24 +1191,18 @@ static pm_message_t resume_event(pm_message_t sleep_state) return PMSG_ON; } -static void dpm_superior_set_must_resume(struct device *dev, bool set_active) +static void dpm_superior_set_must_resume(struct device *dev) { struct device_link *link; int idx; - if (dev->parent) { + if (dev->parent) dev->parent->power.must_resume = true; - if (set_active) - dev->parent->power.set_active = true; - } idx = device_links_read_lock(); - list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) { + list_for_each_entry_rcu_locked(link, &dev->links.suppliers, c_node) link->supplier->power.must_resume = true; - if (set_active) - link->supplier->power.set_active = true; - } device_links_read_unlock(idx); } @@ -1287,9 +1281,12 @@ Skip: dev->power.must_resume = true; if (dev->power.must_resume) { - dev->power.set_active = dev->power.set_active || - dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND); - dpm_superior_set_must_resume(dev, dev->power.set_active); + if (dev_pm_test_driver_flags(dev, DPM_FLAG_SMART_SUSPEND)) { + dev->power.set_active = true; + if (dev->parent && !dev->parent->power.ignore_children) + dev->parent->power.set_active = true; + } + dpm_superior_set_must_resume(dev); } Complete: -- 2.51.0 From a64dcfb451e254085a7daee5fe51bf22959d52d3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 9 Feb 2025 12:45:03 -0800 Subject: [PATCH 10/16] Linux 6.14-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9e0d63d9d94b..89628e354ca7 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From a8de7f100bb5989d9c3627d3a223ee1c863f3b69 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2025 16:34:51 -0800 Subject: [PATCH 11/16] KVM: x86: Reject Hyper-V's SEND_IPI hypercalls if local APIC isn't in-kernel Advertise support for Hyper-V's SEND_IPI and SEND_IPI_EX hypercalls if and only if the local API is emulated/virtualized by KVM, and explicitly reject said hypercalls if the local APIC is emulated in userspace, i.e. don't rely on userspace to opt-in to KVM_CAP_HYPERV_ENFORCE_CPUID. Rejecting SEND_IPI and SEND_IPI_EX fixes a NULL-pointer dereference if Hyper-V enlightenments are exposed to the guest without an in-kernel local APIC: dump_stack+0xbe/0xfd __kasan_report.cold+0x34/0x84 kasan_report+0x3a/0x50 __apic_accept_irq+0x3a/0x5c0 kvm_hv_send_ipi.isra.0+0x34e/0x820 kvm_hv_hypercall+0x8d9/0x9d0 kvm_emulate_hypercall+0x506/0x7e0 __vmx_handle_exit+0x283/0xb60 vmx_handle_exit+0x1d/0xd0 vcpu_enter_guest+0x16b0/0x24c0 vcpu_run+0xc0/0x550 kvm_arch_vcpu_ioctl_run+0x170/0x6d0 kvm_vcpu_ioctl+0x413/0xb20 __se_sys_ioctl+0x111/0x160 do_syscal1_64+0x30/0x40 entry_SYSCALL_64_after_hwframe+0x67/0xd1 Note, checking the sending vCPU is sufficient, as the per-VM irqchip_mode can't be modified after vCPUs are created, i.e. if one vCPU has an in-kernel local APIC, then all vCPUs have an in-kernel local APIC. Reported-by: Dongjie Zou Fixes: 214ff83d4473 ("KVM: x86: hyperv: implement PV IPI send hypercalls") Fixes: 2bc39970e932 ("x86/kvm/hyper-v: Introduce KVM_GET_SUPPORTED_HV_CPUID") Cc: stable@vger.kernel.org Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20250118003454.2619573-2-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/hyperv.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 6a6dd5a84f22..6ebeb6cea6c0 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -2226,6 +2226,9 @@ static u64 kvm_hv_send_ipi(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc) u32 vector; bool all_cpus; + if (!lapic_in_kernel(vcpu)) + return HV_STATUS_INVALID_HYPERCALL_INPUT; + if (hc->code == HVCALL_SEND_IPI) { if (!hc->fast) { if (unlikely(kvm_read_guest(kvm, hc->ingpa, &send_ipi, @@ -2852,7 +2855,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED; ent->eax |= HV_X64_APIC_ACCESS_RECOMMENDED; ent->eax |= HV_X64_RELAXED_TIMING_RECOMMENDED; - ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; + if (!vcpu || lapic_in_kernel(vcpu)) + ent->eax |= HV_X64_CLUSTER_IPI_RECOMMENDED; ent->eax |= HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED; if (evmcs_ver) ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; -- 2.51.0 From 0b6db0dc43eefb4f89181546785c3609fd276524 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2025 16:34:52 -0800 Subject: [PATCH 12/16] KVM: selftests: Mark test_hv_cpuid_e2big() static in Hyper-V CPUID test Make the Hyper-V CPUID test's local helper test_hv_cpuid_e2big() static, it's not used outside of the test (and isn't intended to be). Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20250118003454.2619573-3-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/hyperv_cpuid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c index 4f5881d4ef66..9a0fcc713350 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c @@ -111,7 +111,7 @@ static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries, } } -void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +static void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { static struct kvm_cpuid2 cpuid = {.nent = 0}; int ret; -- 2.51.0 From cd5a0c2f0faeb4a3fab3b78f6693a2d55ee51efa Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2025 16:34:53 -0800 Subject: [PATCH 13/16] KVM: selftests: Manage CPUID array in Hyper-V CPUID test's core helper Allocate, get, and free the CPUID array in the Hyper-V CPUID test in the test's core helper, instead of copy+pasting code at each call site. In addition to deduplicating a small amount of code, restricting visibility of the array to a single invocation of the core test prevents "leaking" an array across test cases. Passing in @vcpu to the helper will also allow pivoting on VM-scoped information without needing to pass more booleans, e.g. to conditionally assert on features that require an in-kernel APIC. To avoid use-after-free bugs due to overzealous and careless developers, opportunstically add a comment to explain that the system-scoped helper caches the Hyper-V CPUID entries, i.e. that the caller is not responsible for freeing the memory. Cc: Vitaly Kuznetsov Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20250118003454.2619573-4-seanjc@google.com Signed-off-by: Sean Christopherson --- .../testing/selftests/kvm/x86/hyperv_cpuid.c | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c index 9a0fcc713350..3188749ec6e1 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c @@ -41,13 +41,18 @@ static bool smt_possible(void) return res; } -static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries, - bool evmcs_expected) +static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected) { + const struct kvm_cpuid2 *hv_cpuid_entries; int i; int nent_expected = 10; u32 test_val; + if (vcpu) + hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); + else + hv_cpuid_entries = kvm_get_supported_hv_cpuid(); + TEST_ASSERT(hv_cpuid_entries->nent == nent_expected, "KVM_GET_SUPPORTED_HV_CPUID should return %d entries" " (returned %d)", @@ -109,6 +114,13 @@ static void test_hv_cpuid(const struct kvm_cpuid2 *hv_cpuid_entries, * entry->edx); */ } + + /* + * Note, the CPUID array returned by the system-scoped helper is a one- + * time allocation, i.e. must not be freed. + */ + if (vcpu) + free((void *)hv_cpuid_entries); } static void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) @@ -129,7 +141,6 @@ static void test_hv_cpuid_e2big(struct kvm_vm *vm, struct kvm_vcpu *vcpu) int main(int argc, char *argv[]) { struct kvm_vm *vm; - const struct kvm_cpuid2 *hv_cpuid_entries; struct kvm_vcpu *vcpu; TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); @@ -138,10 +149,7 @@ int main(int argc, char *argv[]) /* Test vCPU ioctl version */ test_hv_cpuid_e2big(vm, vcpu); - - hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); - test_hv_cpuid(hv_cpuid_entries, false); - free((void *)hv_cpuid_entries); + test_hv_cpuid(vcpu, false); if (!kvm_cpu_has(X86_FEATURE_VMX) || !kvm_has_cap(KVM_CAP_HYPERV_ENLIGHTENED_VMCS)) { @@ -149,9 +157,7 @@ int main(int argc, char *argv[]) goto do_sys; } vcpu_enable_evmcs(vcpu); - hv_cpuid_entries = vcpu_get_supported_hv_cpuid(vcpu); - test_hv_cpuid(hv_cpuid_entries, true); - free((void *)hv_cpuid_entries); + test_hv_cpuid(vcpu, true); do_sys: /* Test system ioctl version */ @@ -161,9 +167,7 @@ do_sys: } test_hv_cpuid_e2big(vm, NULL); - - hv_cpuid_entries = kvm_get_supported_hv_cpuid(); - test_hv_cpuid(hv_cpuid_entries, kvm_cpu_has(X86_FEATURE_VMX)); + test_hv_cpuid(NULL, kvm_cpu_has(X86_FEATURE_VMX)); out: kvm_vm_free(vm); -- 2.51.0 From e36454461c5ebe6372952560b2abad5dc9ac579d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 17 Jan 2025 16:34:54 -0800 Subject: [PATCH 14/16] KVM: selftests: Add CPUID tests for Hyper-V features that need in-kernel APIC Add testcases to x86's Hyper-V CPUID test to verify that KVM advertises support for features that require an in-kernel local APIC appropriately, i.e. that KVM hides support from the vCPU-scoped ioctl if the VM doesn't have an in-kernel local APIC. Cc: Vitaly Kuznetsov Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20250118003454.2619573-5-seanjc@google.com Signed-off-by: Sean Christopherson --- tools/testing/selftests/kvm/x86/hyperv_cpuid.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c index 3188749ec6e1..4e920705681a 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_cpuid.c +++ b/tools/testing/selftests/kvm/x86/hyperv_cpuid.c @@ -43,6 +43,7 @@ static bool smt_possible(void) static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected) { + const bool has_irqchip = !vcpu || vcpu->vm->has_irqchip; const struct kvm_cpuid2 *hv_cpuid_entries; int i; int nent_expected = 10; @@ -85,12 +86,19 @@ static void test_hv_cpuid(struct kvm_vcpu *vcpu, bool evmcs_expected) entry->eax, evmcs_expected ); break; + case 0x40000003: + TEST_ASSERT(has_irqchip || !(entry->edx & BIT(19)), + "\"Direct\" Synthetic Timers should require in-kernel APIC"); + break; case 0x40000004: test_val = entry->eax & (1UL << 18); TEST_ASSERT(!!test_val == !smt_possible(), "NoNonArchitecturalCoreSharing bit" " doesn't reflect SMT setting"); + + TEST_ASSERT(has_irqchip || !(entry->eax & BIT(10)), + "Cluster IPI (i.e. SEND_IPI) should require in-kernel APIC"); break; case 0x4000000A: TEST_ASSERT(entry->eax & (1UL << 19), @@ -145,9 +153,14 @@ int main(int argc, char *argv[]) TEST_REQUIRE(kvm_has_cap(KVM_CAP_HYPERV_CPUID)); - vm = vm_create_with_one_vcpu(&vcpu, guest_code); + /* Test the vCPU ioctl without an in-kernel local APIC. */ + vm = vm_create_barebones(); + vcpu = __vm_vcpu_add(vm, 0); + test_hv_cpuid(vcpu, false); + kvm_vm_free(vm); /* Test vCPU ioctl version */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code); test_hv_cpuid_e2big(vm, vcpu); test_hv_cpuid(vcpu, false); -- 2.51.0 From 46d6c6f3ef0eaff71c2db6d77d4e2ebb7adac34f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 29 Jan 2025 17:08:25 -0800 Subject: [PATCH 15/16] KVM: nSVM: Enter guest mode before initializing nested NPT MMU When preparing vmcb02 for nested VMRUN (or state restore), "enter" guest mode prior to initializing the MMU for nested NPT so that guest_mode is set in the MMU's role. KVM's model is that all L2 MMUs are tagged with guest_mode, as the behavior of hypervisor MMUs tends to be significantly different than kernel MMUs. Practically speaking, the bug is relatively benign, as KVM only directly queries role.guest_mode in kvm_mmu_free_guest_mode_roots() and kvm_mmu_page_ad_need_write_protect(), which SVM doesn't use, and in paths that are optimizations (mmu_page_zap_pte() and shadow_mmu_try_split_huge_pages()). And while the role is incorprated into shadow page usage, because nested NPT requires KVM to be using NPT for L1, reusing shadow pages across L1 and L2 is impossible as L1 MMUs will always have direct=1, while L2 MMUs will have direct=0. Hoist the TLB processing and setting of HF_GUEST_MASK to the beginning of the flow instead of forcing guest_mode in the MMU, as nothing in nested_vmcb02_prepare_control() between the old and new locations touches TLB flush requests or HF_GUEST_MASK, i.e. there's no reason to present inconsistent vCPU state to the MMU. Fixes: 69cb877487de ("KVM: nSVM: move MMU setup to nested_prepare_vmcb_control") Cc: stable@vger.kernel.org Reported-by: Yosry Ahmed Reviewed-by: Yosry Ahmed Link: https://lore.kernel.org/r/20250130010825.220346-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/svm/nested.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 74c20dbb92da..d4ac4a1f8b81 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5540,7 +5540,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0, union kvm_mmu_page_role root_role; /* NPT requires CR0.PG=1. */ - WARN_ON_ONCE(cpu_role.base.direct); + WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode); root_role = cpu_role.base; root_role.level = kvm_mmu_get_tdp_level(vcpu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index d77b094d9a4d..04c375bf1ac2 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -646,6 +646,11 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, u32 pause_count12; u32 pause_thresh12; + nested_svm_transition_tlb_flush(vcpu); + + /* Enter Guest-Mode */ + enter_guest_mode(vcpu); + /* * Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2, * exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes. @@ -762,11 +767,6 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm, } } - nested_svm_transition_tlb_flush(vcpu); - - /* Enter Guest-Mode */ - enter_guest_mode(vcpu); - /* * Merge guest and host intercepts - must be called with vcpu in * guest-mode to take effect. -- 2.51.0 From c2fee09fc167c74a64adb08656cb993ea475197e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 24 Jan 2025 17:18:33 -0800 Subject: [PATCH 16/16] KVM: x86: Load DR6 with guest value only before entering .vcpu_run() loop Move the conditional loading of hardware DR6 with the guest's DR6 value out of the core .vcpu_run() loop to fix a bug where KVM can load hardware with a stale vcpu->arch.dr6. When the guest accesses a DR and host userspace isn't debugging the guest, KVM disables DR interception and loads the guest's values into hardware on VM-Enter and saves them on VM-Exit. This allows the guest to access DRs at will, e.g. so that a sequence of DR accesses to configure a breakpoint only generates one VM-Exit. For DR0-DR3, the logic/behavior is identical between VMX and SVM, and also identical between KVM_DEBUGREG_BP_ENABLED (userspace debugging the guest) and KVM_DEBUGREG_WONT_EXIT (guest using DRs), and so KVM handles loading DR0-DR3 in common code, _outside_ of the core kvm_x86_ops.vcpu_run() loop. But for DR6, the guest's value doesn't need to be loaded into hardware for KVM_DEBUGREG_BP_ENABLED, and SVM provides a dedicated VMCB field whereas VMX requires software to manually load the guest value, and so loading the guest's value into DR6 is handled by {svm,vmx}_vcpu_run(), i.e. is done _inside_ the core run loop. Unfortunately, saving the guest values on VM-Exit is initiated by common x86, again outside of the core run loop. If the guest modifies DR6 (in hardware, when DR interception is disabled), and then the next VM-Exit is a fastpath VM-Exit, KVM will reload hardware DR6 with vcpu->arch.dr6 and clobber the guest's actual value. The bug shows up primarily with nested VMX because KVM handles the VMX preemption timer in the fastpath, and the window between hardware DR6 being modified (in guest context) and DR6 being read by guest software is orders of magnitude larger in a nested setup. E.g. in non-nested, the VMX preemption timer would need to fire precisely between #DB injection and the #DB handler's read of DR6, whereas with a KVM-on-KVM setup, the window where hardware DR6 is "dirty" extends all the way from L1 writing DR6 to VMRESUME (in L1). L1's view: ========== CPU 0/KVM-7289 [023] d.... 2925.640961: kvm_entry: vcpu 0 A: L1 Writes DR6 CPU 0/KVM-7289 [023] d.... 2925.640963: : Set DRs, DR6 = 0xffff0ff1 B: CPU 0/KVM-7289 [023] d.... 2925.640967: kvm_exit: vcpu 0 reason EXTERNAL_INTERRUPT intr_info 0x800000ec D: L1 reads DR6, arch.dr6 = 0 CPU 0/KVM-7289 [023] d.... 2925.640969: : Sync DRs, DR6 = 0xffff0ff0 CPU 0/KVM-7289 [023] d.... 2925.640976: kvm_entry: vcpu 0 L2 reads DR6, L1 disables DR interception CPU 0/KVM-7289 [023] d.... 2925.640980: kvm_exit: vcpu 0 reason DR_ACCESS info1 0x0000000000000216 CPU 0/KVM-7289 [023] d.... 2925.640983: kvm_entry: vcpu 0 CPU 0/KVM-7289 [023] d.... 2925.640983: : Set DRs, DR6 = 0xffff0ff0 L2 detects failure CPU 0/KVM-7289 [023] d.... 2925.640987: kvm_exit: vcpu 0 reason HLT L1 reads DR6 (confirms failure) CPU 0/KVM-7289 [023] d.... 2925.640990: : Sync DRs, DR6 = 0xffff0ff0 L0's view: ========== L2 reads DR6, arch.dr6 = 0 CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216 CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit: vcpu 23 reason DR_ACCESS info1 0x0000000000000216 L2 => L1 nested VM-Exit CPU 23/KVM-5046 [001] ..... 3410.005610: kvm_nested_vmexit_inject: reason: DR_ACCESS ext_inf1: 0x0000000000000216 CPU 23/KVM-5046 [001] d.... 3410.005610: kvm_entry: vcpu 23 CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_exit: vcpu 23 reason VMREAD CPU 23/KVM-5046 [001] d.... 3410.005611: kvm_entry: vcpu 23 CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason VMREAD CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_entry: vcpu 23 L1 writes DR7, L0 disables DR interception CPU 23/KVM-5046 [001] d.... 3410.005612: kvm_exit: vcpu 23 reason DR_ACCESS info1 0x0000000000000007 CPU 23/KVM-5046 [001] d.... 3410.005613: kvm_entry: vcpu 23 L0 writes DR6 = 0 (arch.dr6) CPU 23/KVM-5046 [001] d.... 3410.005613: : Set DRs, DR6 = 0xffff0ff0 A: B: CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_exit: vcpu 23 reason PREEMPTION_TIMER CPU 23/KVM-5046 [001] d.... 3410.005614: kvm_entry: vcpu 23 C: L0 writes DR6 = 0 (arch.dr6) CPU 23/KVM-5046 [001] d.... 3410.005614: : Set DRs, DR6 = 0xffff0ff0 L1 => L2 nested VM-Enter CPU 23/KVM-5046 [001] d.... 3410.005616: kvm_exit: vcpu 23 reason VMRESUME L0 reads DR6, arch.dr6 = 0 Reported-by: John Stultz Closes: https://lkml.kernel.org/r/CANDhNCq5_F3HfFYABqFGCA1bPd_%2BxgNj-iDQhH4tDk%2Bwi8iZZg%40mail.gmail.com Fixes: 375e28ffc0cf ("KVM: X86: Set host DR6 only on VMX and for KVM_DEBUGREG_WONT_EXIT") Fixes: d67668e9dd76 ("KVM: x86, SVM: isolate vcpu->arch.dr6 from vmcb->save.dr6") Cc: stable@vger.kernel.org Cc: Jim Mattson Tested-by: John Stultz Link: https://lore.kernel.org/r/20250125011833.3644371-1-seanjc@google.com Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm-x86-ops.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/svm.c | 13 ++++++------- arch/x86/kvm/vmx/main.c | 1 + arch/x86/kvm/vmx/vmx.c | 10 ++++++---- arch/x86/kvm/vmx/x86_ops.h | 1 + arch/x86/kvm/x86.c | 3 +++ 7 files changed, 19 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index c35550581da0..823c0434bbad 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -48,6 +48,7 @@ KVM_X86_OP(set_idt) KVM_X86_OP(get_gdt) KVM_X86_OP(set_gdt) KVM_X86_OP(sync_dirty_debug_regs) +KVM_X86_OP(set_dr6) KVM_X86_OP(set_dr7) KVM_X86_OP(cache_reg) KVM_X86_OP(get_rflags) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b15cde0a9b5c..0b7af5902ff7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1696,6 +1696,7 @@ struct kvm_x86_ops { void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); + void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7640a84e554a..a713c803a3a3 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1991,11 +1991,11 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid = sd->next_asid++; } -static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) +static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { - struct vmcb *vmcb = svm->vmcb; + struct vmcb *vmcb = to_svm(vcpu)->vmcb; - if (svm->vcpu.arch.guest_state_protected) + if (vcpu->arch.guest_state_protected) return; if (unlikely(value != vmcb->save.dr6)) { @@ -4247,10 +4247,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, * Run with all-zero DR6 unless needed, so that we can get the exact cause * of a #DB. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - svm_set_dr6(svm, vcpu->arch.dr6); - else - svm_set_dr6(svm, DR6_ACTIVE_LOW); + if (likely(!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))) + svm_set_dr6(vcpu, DR6_ACTIVE_LOW); clgi(); kvm_load_guest_xsave_state(vcpu); @@ -5043,6 +5041,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, + .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 2427f918e763..43ee9ed11291 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -61,6 +61,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, + .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f72835e85b6d..6c56d5235f0f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5648,6 +5648,12 @@ void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) set_debugreg(DR6_RESERVED, 6); } +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) +{ + lockdep_assert_irqs_disabled(); + set_debugreg(vcpu->arch.dr6, 6); +} + void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) { vmcs_writel(GUEST_DR7, val); @@ -7417,10 +7423,6 @@ fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit) vmx->loaded_vmcs->host_state.cr4 = cr4; } - /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ - if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) - set_debugreg(vcpu->arch.dr6, 6); - /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h index ce3295a67c04..430773a5ef8e 100644 --- a/arch/x86/kvm/vmx/x86_ops.h +++ b/arch/x86/kvm/vmx/x86_ops.h @@ -73,6 +73,7 @@ void vmx_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void vmx_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt); +void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val); void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val); void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu); void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8e77e61d4fbd..02159c967d29 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10961,6 +10961,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + kvm_x86_call(set_dr6)(vcpu, vcpu->arch.dr6); } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } -- 2.51.0