From 64407f4b5807dc9dec8135e1bfd45d2cb11b4ea0 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 28 Feb 2025 16:03:47 +0300 Subject: [PATCH 01/16] gpiolib: Fix Oops in gpiod_direction_input_nonotify() The gpiod_direction_input_nonotify() function is supposed to return zero if the direction for the pin is input. But instead it accidentally returns GPIO_LINE_DIRECTION_IN (1) which will be cast into an ERR_PTR() in gpiochip_request_own_desc(). The callers dereference it and it leads to a crash. I changed gpiod_direction_output_raw_commit() just for consistency but returning GPIO_LINE_DIRECTION_OUT (0) is fine. Cc: stable@vger.kernel.org Fixes: 9d846b1aebbe ("gpiolib: check the return value of gpio_chip::get_direction()") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/254f3925-3015-4c9d-aac5-bb9b4b2cd2c5@stanley.mountain [Bartosz: moved the variable declarations to the top of the functions] Signed-off-by: Bartosz Golaszewski --- drivers/gpio/gpiolib.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c index fc19df5a64c2..8741600af7ef 100644 --- a/drivers/gpio/gpiolib.c +++ b/drivers/gpio/gpiolib.c @@ -2712,7 +2712,7 @@ EXPORT_SYMBOL_GPL(gpiod_direction_input); int gpiod_direction_input_nonotify(struct gpio_desc *desc) { - int ret = 0; + int ret = 0, dir; CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) @@ -2740,12 +2740,12 @@ int gpiod_direction_input_nonotify(struct gpio_desc *desc) ret = guard.gc->direction_input(guard.gc, gpio_chip_hwgpio(desc)); } else if (guard.gc->get_direction) { - ret = guard.gc->get_direction(guard.gc, + dir = guard.gc->get_direction(guard.gc, gpio_chip_hwgpio(desc)); - if (ret < 0) - return ret; + if (dir < 0) + return dir; - if (ret != GPIO_LINE_DIRECTION_IN) { + if (dir != GPIO_LINE_DIRECTION_IN) { gpiod_warn(desc, "%s: missing direction_input() operation and line is output\n", __func__); @@ -2764,7 +2764,7 @@ int gpiod_direction_input_nonotify(struct gpio_desc *desc) static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value) { - int val = !!value, ret = 0; + int val = !!value, ret = 0, dir; CLASS(gpio_chip_guard, guard)(desc); if (!guard.gc) @@ -2788,12 +2788,12 @@ static int gpiod_direction_output_raw_commit(struct gpio_desc *desc, int value) } else { /* Check that we are in output mode if we can */ if (guard.gc->get_direction) { - ret = guard.gc->get_direction(guard.gc, + dir = guard.gc->get_direction(guard.gc, gpio_chip_hwgpio(desc)); - if (ret < 0) - return ret; + if (dir < 0) + return dir; - if (ret != GPIO_LINE_DIRECTION_OUT) { + if (dir != GPIO_LINE_DIRECTION_OUT) { gpiod_warn(desc, "%s: missing direction_output() operation\n", __func__); -- 2.51.0 From c157d351460bcf202970e97e611cb6b54a3dd4a4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 25 Feb 2025 23:37:08 +0100 Subject: [PATCH 02/16] intel_idle: Handle older CPUs, which stop the TSC in deeper C states, correctly The Intel idle driver is preferred over the ACPI processor idle driver, but fails to implement the work around for Core2 generation CPUs, where the TSC stops in C2 and deeper C-states. This causes stalls and boot delays, when the clocksource watchdog does not catch the unstable TSC before the CPU goes deep idle for the first time. The ACPI driver marks the TSC unstable when it detects that the CPU supports C2 or deeper and the CPU does not have a non-stop TSC. Add the equivivalent work around to the Intel idle driver to cure that. 
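[Editor's note: the ACPI-side workaround referenced above boils down to the shape sketched below (a paraphrase, not a verbatim copy of drivers/acpi/processor_idle.c); the intel_idle hunk in this patch adds the same condition while importing the _CST data.]

/* Paraphrased sketch of the ACPI processor-idle workaround being mirrored. */
static void tsc_check_state_sketch(int state)
{
        /* A TSC that keeps ticking in deep C-states needs no workaround. */
        if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                return;

        /* C2 and deeper may stop the TSC on these older CPUs. */
        if (state > ACPI_STATE_C1)
                mark_tsc_unstable("TSC halts in idle");
}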
Fixes: 18734958e9bf ("intel_idle: Use ACPI _CST for processor models without C-state tables") Reported-by: Fab Stz Signed-off-by: Thomas Gleixner Tested-by: Fab Stz Cc: All applicable Closes: https://lore.kernel.org/all/10cf96aa-1276-4bd4-8966-c890377030c3@yahoo.fr Link: https://patch.msgid.link/87bjupfy7f.ffs@tglx Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 118fe1d37c22..0fdb1d1316c4 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #define INTEL_IDLE_VERSION "0.5.1" @@ -1799,6 +1800,9 @@ static void __init intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) if (intel_idle_state_needs_timer_stop(state)) state->flags |= CPUIDLE_FLAG_TIMER_STOP; + if (cx->type > ACPI_STATE_C1 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halts in idle"); + state->enter = intel_idle; state->enter_s2idle = intel_idle_s2idle; } -- 2.51.0 From cb380909ae3b1ebf14d6a455a4f92d7916d790cb Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 15:06:30 -0800 Subject: [PATCH 03/16] vhost: return task creation error instead of NULL Lets callers distinguish why the vhost task creation failed. No one currently cares why it failed, so no real runtime change from this patch, but that will not be the case for long. Signed-off-by: Keith Busch Message-ID: <20250227230631.303431-2-kbusch@meta.com> Reviewed-by: Mike Christie Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- drivers/vhost/vhost.c | 2 +- kernel/vhost_task.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d4ac4a1f8b81..18ca1ea6dc24 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7471,7 +7471,7 @@ static void kvm_mmu_start_lpage_recovery(struct once *once) kvm_nx_huge_page_recovery_worker_kill, kvm, "kvm-nx-lpage-recovery"); - if (!nx_thread) + if (IS_ERR(nx_thread)) return; vhost_task_start(nx_thread); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 9ac25d08f473..63612faeab72 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -666,7 +666,7 @@ static struct vhost_worker *vhost_worker_create(struct vhost_dev *dev) vtsk = vhost_task_create(vhost_run_work_list, vhost_worker_killed, worker, name); - if (!vtsk) + if (IS_ERR(vtsk)) goto free_worker; mutex_init(&worker->mutex); diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 8800f5acc007..2ef2e1b80091 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -133,7 +133,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), vtsk = kzalloc(sizeof(*vtsk), GFP_KERNEL); if (!vtsk) - return NULL; + return ERR_PTR(-ENOMEM); init_completion(&vtsk->exited); mutex_init(&vtsk->exit_mutex); vtsk->data = arg; @@ -145,7 +145,7 @@ struct vhost_task *vhost_task_create(bool (*fn)(void *), tsk = copy_process(NULL, 0, NUMA_NO_NODE, &args); if (IS_ERR(tsk)) { kfree(vtsk); - return NULL; + return ERR_PTR(PTR_ERR(tsk)); } vtsk->task = tsk; -- 2.51.0 From 916b7f42b3b3b539a71c204a9b49fdc4ca92cd82 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 15:06:31 -0800 Subject: [PATCH 04/16] kvm: retry nx_huge_page_recovery_thread creation A VMM may send a non-fatal signal to its threads, including vCPU tasks, at any time, and thus may signal vCPU tasks during KVM_RUN. 
If a vCPU task receives the signal while its trying to spawn the huge page recovery vhost task, then KVM_RUN will fail due to copy_process() returning -ERESTARTNOINTR. Rework call_once() to mark the call complete if and only if the called function succeeds, and plumb the function's true error code back to the call_once() invoker. This provides userspace with the correct, non-fatal error code so that the VMM doesn't terminate the VM on -ENOMEM, and allows subsequent KVM_RUN a succeed by virtue of retrying creation of the NX huge page task. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson [implemented the kvm user side] Signed-off-by: Keith Busch Message-ID: <20250227230631.303431-3-kbusch@meta.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 ++++----- include/linux/call_once.h | 47 ++++++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 18ca1ea6dc24..8160870398b9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7460,7 +7460,7 @@ static bool kvm_nx_huge_page_recovery_worker(void *data) return true; } -static void kvm_mmu_start_lpage_recovery(struct once *once) +static int kvm_mmu_start_lpage_recovery(struct once *once) { struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); struct kvm *kvm = container_of(ka, struct kvm, arch); @@ -7472,12 +7472,13 @@ static void kvm_mmu_start_lpage_recovery(struct once *once) kvm, "kvm-nx-lpage-recovery"); if (IS_ERR(nx_thread)) - return; + return PTR_ERR(nx_thread); vhost_task_start(nx_thread); /* Make the task visible only once it is fully started. */ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); + return 0; } int kvm_mmu_post_init_vm(struct kvm *kvm) @@ -7485,10 +7486,7 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) if (nx_hugepage_mitigation_hard_disabled) return 0; - call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); - if (!kvm->arch.nx_huge_page_recovery_thread) - return -ENOMEM; - return 0; + return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) diff --git a/include/linux/call_once.h b/include/linux/call_once.h index 6261aa0b3fb0..13cd6469e7e5 100644 --- a/include/linux/call_once.h +++ b/include/linux/call_once.h @@ -26,20 +26,41 @@ do { \ __once_init((once), #once, &__key); \ } while (0) -static inline void call_once(struct once *once, void (*cb)(struct once *)) +/* + * call_once - Ensure a function has been called exactly once + * + * @once: Tracking struct + * @cb: Function to be called + * + * If @once has never completed successfully before, call @cb and, if + * it returns a zero or positive value, mark @once as completed. Return + * the value returned by @cb + * + * If @once has completed succesfully before, return 0. + * + * The call to @cb is implicitly surrounded by a mutex, though for + * efficiency the * function avoids taking it after the first call. + */ +static inline int call_once(struct once *once, int (*cb)(struct once *)) { - /* Pairs with atomic_set_release() below. */ - if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) - return; - - guard(mutex)(&once->lock); - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); - if (atomic_read(&once->state) != ONCE_NOT_STARTED) - return; - - atomic_set(&once->state, ONCE_RUNNING); - cb(once); - atomic_set_release(&once->state, ONCE_COMPLETED); + int r, state; + + /* Pairs with atomic_set_release() below. 
*/ + if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) + return 0; + + guard(mutex)(&once->lock); + state = atomic_read(&once->state); + if (unlikely(state != ONCE_NOT_STARTED)) + return WARN_ON_ONCE(state != ONCE_COMPLETED) ? -EINVAL : 0; + + atomic_set(&once->state, ONCE_RUNNING); + r = cb(once); + if (r < 0) + atomic_set(&once->state, ONCE_NOT_STARTED); + else + atomic_set_release(&once->state, ONCE_COMPLETED); + return r; } #endif /* _LINUX_CALL_ONCE_H */ -- 2.51.0 From a2f925a2f62254119cdaa360cfc9c0424bccd531 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 28 Feb 2025 13:26:04 +0100 Subject: [PATCH 05/16] Revert "ata: libata-core: Add ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives" This reverts commit cc77e2ce187d26cc66af3577bf896d7410eb25ab. It was reported that adding ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives breaks entering lower package states for certain systems. It turns out that Samsung SSD 870 QVO actually has working LPM when using a recent SSD firmware version. The author of commit cc77e2ce187d ("ata: libata-core: Add ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives") reported himself that only older SSD firmware versions have broken LPM: https://lore.kernel.org/stable/93c10d38-718c-459d-84a5-4d87680b4da7@debian.org/ Unfortunately, he did not specify which older firmware version he was using which had broken LPM. Let's revert this quirk, which has FW version field specified as NULL (which means that it applies for all Samsung SSD 870 QVO firmware versions) for now. Once the author reports which older firmware version(s) that are broken, we can create a more fine grained quirk, which populates the FW version field accordingly. Fixes: cc77e2ce187d ("ata: libata-core: Add ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives") Reported-by: Dieter Mummenschanz Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219747 Link: https://lore.kernel.org/r/20250228122603.91814-2-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 63ec2f218431..c085dd81ebe7 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4143,10 +4143,6 @@ static const struct ata_dev_quirks_entry __ata_dev_quirks[] = { { "Samsung SSD 860*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI }, - { "Samsung SSD 870 QVO*", NULL, ATA_QUIRK_NO_NCQ_TRIM | - ATA_QUIRK_ZERO_AFTER_TRIM | - ATA_QUIRK_NO_NCQ_ON_ATI | - ATA_QUIRK_NOLPM }, { "Samsung SSD 870*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI }, -- 2.51.0 From 7eb172143d5508b4da468ed59ee857c6e5e01da6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 2 Mar 2025 11:48:20 -0800 Subject: [PATCH 06/16] Linux 6.14-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 30dab4c8b012..70bdbf2218fc 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From 5550187c4c21740942c32a9ae56f9f472a104cb4 Mon Sep 17 00:00:00 2001 From: David Gow Date: Mon, 10 Feb 2025 18:53:51 +0800 Subject: [PATCH 07/16] um: Pass the correct Rust target and options with gcc In order to work around some issues with disabling SSE on older versions of gcc (compilation would fail upon seeing a function declaration containing a float, even if it was never called or defined), 
the corresponding CFLAGS and RUSTFLAGS were only set when using clang. However, this led to two problems: - Newer gcc versions also wouldn't get the correct flags, despite not having the bug. - The RUSTFLAGS for setting the rust target definition were not set, despite being unrelated. This works by chance for x86_64, as the built-in default target is close enough, but not for 32-bit x86. Move the target definition outside the conditional block, and update the condition to take into account the gcc version. Fixes: a3046a618a28 ("um: Only disable SSE on clang to work around old GCC bugs") Signed-off-by: David Gow Link: https://patch.msgid.link/20250210105353.2238769-2-davidgow@google.com Signed-off-by: Johannes Berg --- arch/x86/Makefile.um | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index a46b1397ad01..c86cbd9cbba3 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -7,12 +7,13 @@ core-y += arch/x86/crypto/ # GCC versions < 11. See: # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99652 # -ifeq ($(CONFIG_CC_IS_CLANG),y) -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json +ifeq ($(call gcc-min-version, 110000)$(CONFIG_CC_IS_CLANG),y) +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 endif +KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json + ifeq ($(CONFIG_X86_32),y) START := 0x8048000 -- 2.51.0 From d1d7f01f7cd35e16c6bcef5a0e31988b5c9980f9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 10 Feb 2025 17:09:25 +0100 Subject: [PATCH 08/16] um: mark rodata read-only and implement _nofault accesses Mark read-only data actually read-only (simple mprotect), and to be able to test it also implement _nofault accesses. This works by setting up a new "segv_continue" pointer in current, and then when we hit a segfault we change the signal return context so that we continue at that address. The code using this sets it up so that it jumps to a label and then aborts the access that way, returning -EFAULT. It's possible to optimize the ___backtrack_faulted() thing by using asm goto (compiler version dependent) and/or gcc's (not sure if clang has it) &&label extension, but at least in one attempt I made the && caused the compiler to not load -EFAULT into the register in case of jumping to the &&label from the fault handler. So leave it like this for now. 
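[Editor's note: for context, the accessors implemented here are consumed by generic code along the lines of the simplified sketch below (modelled loosely on mm/maccess.c, not the exact upstream loop). The point is that a fault taken inside __get_kernel_nofault() must resume at the error label via current->thread.segv_continue instead of panicking.]

/* Simplified, byte-at-a-time sketch of a _nofault copy loop. */
static long copy_from_kernel_nofault_sketch(void *dst, const void *src,
                                            size_t size)
{
        pagefault_disable();    /* current->pagefault_disabled, tested in segv() */
        while (size) {
                /*
                 * On UML this now expands to ___backtrack_faulted(), which
                 * records a recovery address in current->thread.segv_continue;
                 * a SIGSEGV during the access jumps there, so we land at
                 * Efault below instead of the "Segfault with no mm" panic.
                 */
                __get_kernel_nofault(dst, src, u8, Efault);
                dst++;
                src++;
                size--;
        }
        pagefault_enable();
        return 0;
Efault:
        pagefault_enable();
        return -EFAULT;
}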
Signed-off-by: Johannes Berg Co-developed-by: Benjamin Berg Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250210160926.420133-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/Kconfig | 1 + arch/um/include/asm/processor-generic.h | 2 ++ arch/um/include/asm/uaccess.h | 20 ++++++++++++----- arch/um/include/shared/arch.h | 2 ++ arch/um/include/shared/as-layout.h | 2 +- arch/um/include/shared/irq_user.h | 3 ++- arch/um/include/shared/kern_util.h | 12 ++++++---- arch/um/kernel/irq.c | 3 ++- arch/um/kernel/mem.c | 10 +++++++++ arch/um/kernel/trap.c | 28 +++++++++++++++++++----- arch/um/os-Linux/signal.c | 4 ++-- arch/um/os-Linux/skas/process.c | 8 +++---- arch/x86/um/os-Linux/mcontext.c | 12 ++++++++++ arch/x86/um/shared/sysdep/faultinfo_32.h | 12 ++++++++++ arch/x86/um/shared/sysdep/faultinfo_64.h | 12 ++++++++++ 15 files changed, 108 insertions(+), 23 deletions(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 18051b1cfce0..79509c7f39de 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -12,6 +12,7 @@ config UML select ARCH_HAS_KCOV select ARCH_HAS_STRNCPY_FROM_USER select ARCH_HAS_STRNLEN_USER + select ARCH_HAS_STRICT_KERNEL_RWX select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index 5d6356eafffe..8a789c17acd8 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -31,6 +31,8 @@ struct thread_struct { } thread; } request; + void *segv_continue; + /* Contains variable sized FP registers */ struct pt_regs regs; }; diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h index 1d4b6bbc1b65..3a08f9029a3f 100644 --- a/arch/um/include/asm/uaccess.h +++ b/arch/um/include/asm/uaccess.h @@ -9,6 +9,7 @@ #include #include +#include #define __under_task_size(addr, size) \ (((unsigned long) (addr) < TASK_SIZE) && \ @@ -44,19 +45,28 @@ static inline int __access_ok(const void __user *ptr, unsigned long size) __access_ok_vsyscall(addr, size)); } -/* no pagefaults for kernel addresses in um */ #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ - *((type *)dst) = get_unaligned((type *)(src)); \ - if (0) /* make sure the label looks used to the compiler */ \ + int __faulted; \ + \ + ___backtrack_faulted(__faulted); \ + if (__faulted) { \ + *((type *)dst) = (type) 0; \ goto err_label; \ + } \ + *((type *)dst) = get_unaligned((type *)(src)); \ + current->thread.segv_continue = NULL; \ } while (0) #define __put_kernel_nofault(dst, src, type, err_label) \ do { \ - put_unaligned(*((type *)src), (type *)(dst)); \ - if (0) /* make sure the label looks used to the compiler */ \ + int __faulted; \ + \ + ___backtrack_faulted(__faulted); \ + if (__faulted) \ goto err_label; \ + put_unaligned(*((type *)src), (type *)(dst)); \ + current->thread.segv_continue = NULL; \ } while (0) #endif diff --git a/arch/um/include/shared/arch.h b/arch/um/include/shared/arch.h index 880ee42a3329..cc398a21ad96 100644 --- a/arch/um/include/shared/arch.h +++ b/arch/um/include/shared/arch.h @@ -12,4 +12,6 @@ extern void arch_check_bugs(void); extern int arch_fixup(unsigned long address, struct uml_pt_regs *regs); extern void arch_examine_signal(int sig, struct uml_pt_regs *regs); +void mc_set_rip(void *_mc, void *target); + #endif diff --git a/arch/um/include/shared/as-layout.h b/arch/um/include/shared/as-layout.h index ea65f151bf48..4f44dcce8a7c 100644 --- 
a/arch/um/include/shared/as-layout.h +++ b/arch/um/include/shared/as-layout.h @@ -50,7 +50,7 @@ extern int linux_main(int argc, char **argv, char **envp); extern void uml_finishsetup(void); struct siginfo; -extern void (*sig_info[])(int, struct siginfo *si, struct uml_pt_regs *); +extern void (*sig_info[])(int, struct siginfo *si, struct uml_pt_regs *, void *); #endif diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h index da0f6eea30d0..88835b52ae2b 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -15,7 +15,8 @@ enum um_irq_type { }; struct siginfo; -extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); +extern void sigio_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc); void sigio_run_timetravel_handlers(void); extern void free_irq_by_fd(int fd); extern void deactivate_fd(int fd, int irqnum); diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index f21dc8517538..00ca3e12fd9a 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -24,10 +24,12 @@ extern void free_stack(unsigned long stack, int order); struct pt_regs; extern void do_signal(struct pt_regs *regs); extern void interrupt_end(void); -extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs); +extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc); extern unsigned long segv(struct faultinfo fi, unsigned long ip, - int is_user, struct uml_pt_regs *regs); + int is_user, struct uml_pt_regs *regs, + void *mc); extern int handle_page_fault(unsigned long address, unsigned long ip, int is_write, int is_user, int *code_out); @@ -59,8 +61,10 @@ extern unsigned long from_irq_stack(int nested); extern int singlestepping(void); -extern void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); -extern void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); +extern void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc); +extern void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc); extern void fatal_sigsegv(void) __attribute__ ((noreturn)); void um_idle_sleep(void); diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index a4991746f5ea..abe8f30a521c 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -236,7 +236,8 @@ static void _sigio_handler(struct uml_pt_regs *regs, free_irqs(); } -void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { preempt_disable(); _sigio_handler(regs, irqs_suspended); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index befed230aac2..05ffceb555b4 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include #include @@ -241,3 +243,11 @@ static const pgprot_t protection_map[16] = { [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED }; DECLARE_VM_GET_PAGE_PROT + +void mark_rodata_ro(void) +{ + unsigned long rodata_start = PFN_ALIGN(__start_rodata); + unsigned long rodata_end = PFN_ALIGN(__end_rodata); + + os_protect_memory((void *)rodata_start, rodata_end - rodata_start, 1, 0, 0); +} diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index cdaee3e94273..ce073150dc20 100644 --- a/arch/um/kernel/trap.c +++ 
b/arch/um/kernel/trap.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by @@ -175,12 +176,14 @@ void fatal_sigsegv(void) * @sig: the signal number * @unused_si: the signal info struct; unused in this handler * @regs: the ptrace register information + * @mc: the mcontext of the signal * * The handler first extracts the faultinfo from the UML ptrace regs struct. * If the userfault did not happen in an UML userspace process, bad_segv is called. * Otherwise the signal did happen in a cloned userspace process, handle it. */ -void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { struct faultinfo * fi = UPT_FAULTINFO(regs); @@ -189,7 +192,7 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) bad_segv(*fi, UPT_IP(regs)); return; } - segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs); + segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc); } /* @@ -199,7 +202,7 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) * give us bad data! */ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, - struct uml_pt_regs *regs) + struct uml_pt_regs *regs, void *mc) { int si_code; int err; @@ -223,6 +226,19 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, goto out; } else if (current->mm == NULL) { + if (current->pagefault_disabled) { + if (!mc) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault with pagefaults disabled but no mcontext"); + } + if (!current->thread.segv_continue) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault without recovery target"); + } + mc_set_rip(mc, current->thread.segv_continue); + current->thread.segv_continue = NULL; + goto out; + } show_regs(container_of(regs, struct pt_regs, regs)); panic("Segfault with no mm"); } @@ -274,7 +290,8 @@ out: return 0; } -void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) +void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc) { int code, err; if (!UPT_IS_USER(regs)) { @@ -302,7 +319,8 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) } } -void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { do_IRQ(WINCH_IRQ, regs); } diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 9ea7269ffb77..e71e5b4878d1 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -21,7 +21,7 @@ #include #include -void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = { +void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *, void *mc) = { [SIGTRAP] = relay_signal, [SIGFPE] = relay_signal, [SIGILL] = relay_signal, @@ -47,7 +47,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) if ((sig != SIGIO) && (sig != SIGWINCH)) unblock_signals_trace(); - (*sig_info[sig])(sig, si, &r); + (*sig_info[sig])(sig, si, &r, mc); errno = save_errno; } diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index e2f8f156402f..ae2aea062f06 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -166,7 +166,7 @@ static void get_skas_faultinfo(int pid, struct faultinfo *fi) static void handle_segv(int pid, struct uml_pt_regs *regs) { 
get_skas_faultinfo(pid, ®s->faultinfo); - segv(regs->faultinfo, 0, 1, NULL); + segv(regs->faultinfo, 0, 1, NULL, NULL); } static void handle_trap(int pid, struct uml_pt_regs *regs) @@ -525,7 +525,7 @@ void userspace(struct uml_pt_regs *regs) get_skas_faultinfo(pid, ®s->faultinfo); (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, - regs); + regs, NULL); } else handle_segv(pid, regs); break; @@ -533,7 +533,7 @@ void userspace(struct uml_pt_regs *regs) handle_trap(pid, regs); break; case SIGTRAP: - relay_signal(SIGTRAP, (struct siginfo *)&si, regs); + relay_signal(SIGTRAP, (struct siginfo *)&si, regs, NULL); break; case SIGALRM: break; @@ -543,7 +543,7 @@ void userspace(struct uml_pt_regs *regs) case SIGFPE: case SIGWINCH: block_signals_trace(); - (*sig_info[sig])(sig, (struct siginfo *)&si, regs); + (*sig_info[sig])(sig, (struct siginfo *)&si, regs, NULL); unblock_signals_trace(); break; default: diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c index e80ab7d28117..d2f3a595b4ef 100644 --- a/arch/x86/um/os-Linux/mcontext.c +++ b/arch/x86/um/os-Linux/mcontext.c @@ -4,6 +4,7 @@ #include #include #include +#include void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) { @@ -31,3 +32,14 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) regs->gp[CS / sizeof(unsigned long)] |= 3; #endif } + +void mc_set_rip(void *_mc, void *target) +{ + mcontext_t *mc = _mc; + +#ifdef __i386__ + mc->gregs[REG_EIP] = (unsigned long)target; +#else + mc->gregs[REG_RIP] = (unsigned long)target; +#endif +} diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h index b6f2437ec29c..ab5c8e47049c 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_32.h +++ b/arch/x86/um/shared/sysdep/faultinfo_32.h @@ -29,4 +29,16 @@ struct faultinfo { #define PTRACE_FULL_FAULTINFO 0 +#define ___backtrack_faulted(_faulted) \ + asm volatile ( \ + "mov $0, %0\n" \ + "movl $__get_kernel_nofault_faulted_%=,%1\n" \ + "jmp _end_%=\n" \ + "__get_kernel_nofault_faulted_%=:\n" \ + "mov $1, %0;" \ + "_end_%=:" \ + : "=r" (_faulted), \ + "=m" (current->thread.segv_continue) :: \ + ) + #endif diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h index ee88f88974ea..26fb4835d3e9 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_64.h +++ b/arch/x86/um/shared/sysdep/faultinfo_64.h @@ -29,4 +29,16 @@ struct faultinfo { #define PTRACE_FULL_FAULTINFO 1 +#define ___backtrack_faulted(_faulted) \ + asm volatile ( \ + "mov $0, %0\n" \ + "movq $__get_kernel_nofault_faulted_%=,%1\n" \ + "jmp _end_%=\n" \ + "__get_kernel_nofault_faulted_%=:\n" \ + "mov $1, %0;" \ + "_end_%=:" \ + : "=r" (_faulted), \ + "=m" (current->thread.segv_continue) :: \ + ) + #endif -- 2.51.0 From 84a6fc378471fbeaf48f8604566a5a33a3d63c18 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 10 Feb 2025 17:09:26 +0100 Subject: [PATCH 09/16] um: remove copy_from_kernel_nofault_allowed There is no need to override the default version of this function anymore as UML now has proper _nofault memory access functions. Doing this also fixes the fact that the implementation was incorrect as using mincore() will incorrectly flag pages as inaccessible if they were swapped out by the host. 
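[Editor's note: with the override removed, UML now uses the generic weak fallback, which simply allows the access and relies on the fault-recovery path added in the previous patch. Roughly (paraphrased from mm/maccess.c):]

/*
 * Generic fallback (paraphrased): permit the access and let the _nofault
 * machinery handle any fault, rather than pre-filtering addresses with
 * mincore(), which cannot distinguish "swapped out by the host" from
 * "not mapped at all".
 */
bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
                                             size_t size)
{
        return true;
}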
Fixes: f75b1b1bedfb ("um: Implement probe_kernel_read()") Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250210160926.420133-3-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 1 - arch/um/kernel/Makefile | 2 +- arch/um/kernel/maccess.c | 19 -------------- arch/um/os-Linux/process.c | 51 ------------------------------------- 4 files changed, 1 insertion(+), 72 deletions(-) delete mode 100644 arch/um/kernel/maccess.c diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 5babad8c5f75..bc02767f0639 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -213,7 +213,6 @@ extern int os_protect_memory(void *addr, unsigned long len, extern int os_unmap_memory(void *addr, int len); extern int os_drop_memory(void *addr, int length); extern int can_drop_memory(void); -extern int os_mincore(void *addr, unsigned long len); void os_set_pdeathsig(void); diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index f8567b933ffa..4df1cd0d2017 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -17,7 +17,7 @@ extra-y := vmlinux.lds obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ physmem.o process.o ptrace.o reboot.o sigio.o \ signal.o sysrq.o time.o tlb.o trap.o \ - um_arch.o umid.o maccess.o kmsg_dump.o capflags.o skas/ + um_arch.o umid.o kmsg_dump.o capflags.o skas/ obj-y += load_file.o obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o diff --git a/arch/um/kernel/maccess.c b/arch/um/kernel/maccess.c deleted file mode 100644 index 8ccd56813f68..000000000000 --- a/arch/um/kernel/maccess.c +++ /dev/null @@ -1,19 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2013 Richard Weinberger - */ - -#include -#include -#include - -bool copy_from_kernel_nofault_allowed(const void *src, size_t size) -{ - void *psrc = (void *)rounddown((unsigned long)src, PAGE_SIZE); - - if ((unsigned long)src < PAGE_SIZE || size <= 0) - return false; - if (os_mincore(psrc, size + src - psrc) <= 0) - return false; - return true; -} diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index 9f086f939420..184566edeee9 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -142,57 +142,6 @@ out: return ok; } -static int os_page_mincore(void *addr) -{ - char vec[2]; - int ret; - - ret = mincore(addr, UM_KERN_PAGE_SIZE, vec); - if (ret < 0) { - if (errno == ENOMEM || errno == EINVAL) - return 0; - else - return -errno; - } - - return vec[0] & 1; -} - -int os_mincore(void *addr, unsigned long len) -{ - char *vec; - int ret, i; - - if (len <= UM_KERN_PAGE_SIZE) - return os_page_mincore(addr); - - vec = calloc(1, (len + UM_KERN_PAGE_SIZE - 1) / UM_KERN_PAGE_SIZE); - if (!vec) - return -ENOMEM; - - ret = mincore(addr, UM_KERN_PAGE_SIZE, vec); - if (ret < 0) { - if (errno == ENOMEM || errno == EINVAL) - ret = 0; - else - ret = -errno; - - goto out; - } - - for (i = 0; i < ((len + UM_KERN_PAGE_SIZE - 1) / UM_KERN_PAGE_SIZE); i++) { - if (!(vec[i] & 1)) { - ret = 0; - goto out; - } - } - - ret = 1; -out: - free(vec); - return ret; -} - void init_new_thread_signals(void) { set_handler(SIGSEGV); -- 2.51.0 From 1fc350eed627762f4f6db3f35776d481e7f02c5c Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Wed, 12 Feb 2025 12:57:56 +0800 Subject: [PATCH 10/16] um: Allocate vdso page pointer statically Instead of dynamically allocating the pointer to the vdso page during boot, we can just allocate it statically. 
Doing so will reduce error handling and make the code slightly more readable. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250212045756.164977-1-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/x86/um/vdso/vma.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index f238f7b33cdd..dc8dfb2abd80 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -12,33 +12,22 @@ static unsigned int __read_mostly vdso_enabled = 1; unsigned long um_vdso_addr; +static struct page *um_vdso; extern unsigned long task_size; extern char vdso_start[], vdso_end[]; -static struct page **vdsop; - static int __init init_vdso(void) { - struct page *um_vdso; - BUG_ON(vdso_end - vdso_start > PAGE_SIZE); um_vdso_addr = task_size - PAGE_SIZE; - vdsop = kmalloc(sizeof(struct page *), GFP_KERNEL); - if (!vdsop) - goto oom; - um_vdso = alloc_page(GFP_KERNEL); - if (!um_vdso) { - kfree(vdsop); - + if (!um_vdso) goto oom; - } copy_page(page_address(um_vdso), vdso_start); - *vdsop = um_vdso; return 0; @@ -56,6 +45,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) struct mm_struct *mm = current->mm; static struct vm_special_mapping vdso_mapping = { .name = "[vdso]", + .pages = &um_vdso, }; if (!vdso_enabled) @@ -64,7 +54,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (mmap_write_lock_killable(mm)) return -EINTR; - vdso_mapping.pages = vdsop; vma = _install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, -- 2.51.0 From 0bc754d1e31f40f4a343b692096d9e092ccc0370 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Fri, 14 Feb 2025 10:28:22 +0100 Subject: [PATCH 11/16] um: hostfs: avoid issues on inode number reuse by host MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Some file systems (e.g. ext4) may reuse inode numbers once the inode is not in use anymore. Usually hostfs will keep an FD open for each inode, but this is not always the case. In the case of sockets, this cannot even be done properly. As such, the following sequence of events was possible: * application creates and deletes a socket * hostfs creates/deletes the socket on the host * inode is still in the hostfs cache * hostfs creates a new file * ext4 on the outside reuses the inode number * hostfs finds the socket inode for the newly created file * application receives -ENXIO when opening the file As mentioned, this can only happen if the deleted file is a special file that is never opened on the host (i.e. no .open fop). As such, to prevent issues, it is sufficient to check that the inode has the expected type. That said, also add a check for the inode birth time, just to be on the safe side. 
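[Editor's note: one subtlety is that not every host filesystem reports a birth time, which is why the user-side hunk below only trusts stx_btime when STATX_BTIME is set in stx_mask and zeroes it otherwise. A small standalone host-side check (illustrative only, not part of the patch) might look like this:]

/* Illustrative host-side check: does this path report a birth time? */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
        struct statx stx;

        if (argc < 2)
                return 1;

        if (statx(AT_FDCWD, argv[1], AT_SYMLINK_NOFOLLOW,
                  STATX_BASIC_STATS | STATX_BTIME, &stx) < 0) {
                perror("statx");
                return 1;
        }

        if (stx.stx_mask & STATX_BTIME)
                printf("btime: %lld.%09u\n",
                       (long long)stx.stx_btime.tv_sec, stx.stx_btime.tv_nsec);
        else
                printf("no btime reported; hostfs will treat it as zero\n");

        return 0;
}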
Fixes: 74ce793bcbde ("hostfs: Fix ephemeral inodes") Signed-off-by: Benjamin Berg Reviewed-by: Mickaël Salaün Tested-by: Mickaël Salaün Link: https://patch.msgid.link/20250214092822.1241575-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- fs/hostfs/hostfs.h | 2 +- fs/hostfs/hostfs_kern.c | 7 ++++- fs/hostfs/hostfs_user.c | 59 ++++++++++++++++++++++++----------------- 3 files changed, 41 insertions(+), 27 deletions(-) diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 8b39c15c408c..15b2f094d36e 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -60,7 +60,7 @@ struct hostfs_stat { unsigned int uid; unsigned int gid; unsigned long long size; - struct hostfs_timespec atime, mtime, ctime; + struct hostfs_timespec atime, mtime, ctime, btime; unsigned int blksize; unsigned long long blocks; struct { diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e0741e468956..e6e247235728 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -33,6 +33,7 @@ struct hostfs_inode_info { struct inode vfs_inode; struct mutex open_mutex; dev_t dev; + struct hostfs_timespec btime; }; static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) @@ -547,6 +548,7 @@ static int hostfs_inode_set(struct inode *ino, void *data) } HOSTFS_I(ino)->dev = dev; + HOSTFS_I(ino)->btime = st->btime; ino->i_ino = st->ino; ino->i_mode = st->mode; return hostfs_inode_update(ino, st); @@ -557,7 +559,10 @@ static int hostfs_inode_test(struct inode *inode, void *data) const struct hostfs_stat *st = data; dev_t dev = MKDEV(st->dev.maj, st->dev.min); - return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev; + return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev && + (inode->i_mode & S_IFMT) == (st->mode & S_IFMT) && + HOSTFS_I(inode)->btime.tv_sec == st->btime.tv_sec && + HOSTFS_I(inode)->btime.tv_nsec == st->btime.tv_nsec; } static struct inode *hostfs_iget(struct super_block *sb, char *name) diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 97e9c40a9448..3bcd9f35e70b 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -18,39 +18,48 @@ #include "hostfs.h" #include -static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) +static void statx_to_hostfs(const struct statx *buf, struct hostfs_stat *p) { - p->ino = buf->st_ino; - p->mode = buf->st_mode; - p->nlink = buf->st_nlink; - p->uid = buf->st_uid; - p->gid = buf->st_gid; - p->size = buf->st_size; - p->atime.tv_sec = buf->st_atime; - p->atime.tv_nsec = 0; - p->ctime.tv_sec = buf->st_ctime; - p->ctime.tv_nsec = 0; - p->mtime.tv_sec = buf->st_mtime; - p->mtime.tv_nsec = 0; - p->blksize = buf->st_blksize; - p->blocks = buf->st_blocks; - p->rdev.maj = os_major(buf->st_rdev); - p->rdev.min = os_minor(buf->st_rdev); - p->dev.maj = os_major(buf->st_dev); - p->dev.min = os_minor(buf->st_dev); + p->ino = buf->stx_ino; + p->mode = buf->stx_mode; + p->nlink = buf->stx_nlink; + p->uid = buf->stx_uid; + p->gid = buf->stx_gid; + p->size = buf->stx_size; + p->atime.tv_sec = buf->stx_atime.tv_sec; + p->atime.tv_nsec = buf->stx_atime.tv_nsec; + p->ctime.tv_sec = buf->stx_ctime.tv_sec; + p->ctime.tv_nsec = buf->stx_ctime.tv_nsec; + p->mtime.tv_sec = buf->stx_mtime.tv_sec; + p->mtime.tv_nsec = buf->stx_mtime.tv_nsec; + if (buf->stx_mask & STATX_BTIME) { + p->btime.tv_sec = buf->stx_btime.tv_sec; + p->btime.tv_nsec = buf->stx_btime.tv_nsec; + } else { + memset(&p->btime, 0, sizeof(p->btime)); + } + p->blksize = buf->stx_blksize; + p->blocks = buf->stx_blocks; + 
p->rdev.maj = buf->stx_rdev_major; + p->rdev.min = buf->stx_rdev_minor; + p->dev.maj = buf->stx_dev_major; + p->dev.min = buf->stx_dev_minor; } int stat_file(const char *path, struct hostfs_stat *p, int fd) { - struct stat64 buf; + struct statx buf; + int flags = AT_SYMLINK_NOFOLLOW; if (fd >= 0) { - if (fstat64(fd, &buf) < 0) - return -errno; - } else if (lstat64(path, &buf) < 0) { - return -errno; + flags |= AT_EMPTY_PATH; + path = ""; } - stat64_to_hostfs(&buf, p); + + if ((statx(fd, path, flags, STATX_BASIC_STATS | STATX_BTIME, &buf)) < 0) + return -errno; + + statx_to_hostfs(&buf, p); return 0; } -- 2.51.0 From f664a1399633c63706f763d4bb58c96110355330 Mon Sep 17 00:00:00 2001 From: Ethan Carter Edwards Date: Thu, 20 Feb 2025 17:39:40 -0500 Subject: [PATCH 12/16] um: use str_yes_no() to remove hardcoded "yes" and "no" Remove hard-coded strings by using the str_yes_no() helper function provided by . Signed-off-by: Ethan Carter Edwards Link: https://patch.msgid.link/20250220-um_yes_no-v1-1-2a355ed2d225@ethancedwards.com Signed-off-by: Johannes Berg --- arch/um/kernel/um_arch.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 79ea97d4797e..7f050783885a 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -78,7 +79,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "model name\t: UML\n"); seq_printf(m, "mode\t\t: skas\n"); seq_printf(m, "host\t\t: %s\n", host_info); - seq_printf(m, "fpu\t\t: %s\n", cpu_has(&boot_cpu_data, X86_FEATURE_FPU) ? "yes" : "no"); + seq_printf(m, "fpu\t\t: %s\n", str_yes_no(cpu_has(&boot_cpu_data, X86_FEATURE_FPU))); seq_printf(m, "flags\t\t:"); for (i = 0; i < 32*NCAPINTS; i++) if (cpu_has(&boot_cpu_data, i) && (x86_cap_flags[i] != NULL)) -- 2.51.0 From e82cf3051e6193f61e03898f8dba035199064d36 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 21 Feb 2025 12:18:55 +0800 Subject: [PATCH 13/16] um: Update min_low_pfn to match changes in uml_reserved When uml_reserved is updated, min_low_pfn must also be updated accordingly. Otherwise, min_low_pfn will not accurately reflect the lowest available PFN. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250221041855.1156109-1-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/kernel/mem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 05ffceb555b4..61b5a5ede01c 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -68,6 +68,7 @@ void __init mem_init(void) map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); memblock_free((void *)brk_end, uml_reserved - brk_end); uml_reserved = brk_end; + min_low_pfn = PFN_UP(__pa(uml_reserved)); /* this will put all low memory onto the freelists */ memblock_free_all(); -- 2.51.0 From 089db01ea7eb4f366be45b9390a04f1c601c0071 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 28 Feb 2025 10:00:08 +0100 Subject: [PATCH 14/16] um/locking: Remove semicolon from "lock" prefix Minimum version of binutils required to compile the kernel is 2.25. This version correctly handles the "lock" prefix, so it is possible to remove the semicolon, which was used to support ancient versions of GNU as. Due to the semicolon, the compiler considers "lock; insn" as two separate instructions. 
Removing the semicolon makes asm length calculations more accurate, consequently making scheduling and inlining decisions of the compiler more accurate. Removing the semicolon also enables assembler checks involving lock prefix. Trying to assemble e.g. "lock andl %eax, %ebx" results in: Error: expecting lockable instruction after `lock' Signed-off-by: Uros Bizjak Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Link: https://patch.msgid.link/20250228090058.2499163-1-ubizjak@gmail.com Signed-off-by: Johannes Berg --- arch/x86/um/asm/barrier.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 4da336965698..b51aefd6ec2b 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -12,9 +12,9 @@ */ #ifdef CONFIG_X86_32 -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#define mb() alternative("lock addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) #else /* CONFIG_X86_32 */ -- 2.51.0 From 887c5c12e80c8424bd471122d2e8b6b462e12874 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Fri, 14 Mar 2025 14:08:15 +0100 Subject: [PATCH 15/16] um: work around sched_yield not yielding in time-travel mode sched_yield by a userspace may not actually cause scheduling in time-travel mode as no time has passed. In the case seen it appears to be a badly implemented userspace spinlock in ASAN. Unfortunately, with time-travel it causes an extreme slowdown or even deadlock depending on the kernel configuration (CONFIG_UML_MAX_USERSPACE_ITERATIONS). Work around it by accounting time to the process whenever it executes a sched_yield syscall. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250314130815.226872-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/linux/time-internal.h | 2 ++ arch/um/kernel/skas/syscall.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h index b22226634ff6..138908b999d7 100644 --- a/arch/um/include/linux/time-internal.h +++ b/arch/um/include/linux/time-internal.h @@ -83,6 +83,8 @@ extern void time_travel_not_configured(void); #define time_travel_del_event(...) time_travel_not_configured() #endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ +extern unsigned long tt_extra_sched_jiffies; + /* * Without CONFIG_UML_TIME_TRAVEL_SUPPORT this is a linker error if used, * which is intentional since we really shouldn't link it in that case. diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index b09e85279d2b..a5beaea2967e 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -31,6 +31,17 @@ void handle_syscall(struct uml_pt_regs *r) goto out; syscall = UPT_SYSCALL_NR(r); + + /* + * If no time passes, then sched_yield may not actually yield, causing + * broken spinlock implementations in userspace (ASAN) to hang for long + * periods of time. 
+ */ + if ((time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) && + syscall == __NR_sched_yield) + tt_extra_sched_jiffies += 1; + if (syscall >= 0 && syscall < __NR_syscalls) { unsigned long ret = EXECUTE_SYSCALL(syscall, regs); -- 2.51.0 From 24ffa71b0f15fe4827d3672d5918f97564b2fba1 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Sun, 16 Mar 2025 00:19:09 +0800 Subject: [PATCH 16/16] um: virt-pci: Refactor virtio_pcidev into its own module Decouple virt-pci and virtio_pcidev, refactoring virtio_pcidev into its own module. Define a set of APIs for virt-pci. This allows for future addition of more PCI emulation implementations. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250315161910.4082396-3-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/Kconfig | 12 +- arch/um/drivers/Makefile | 3 +- arch/um/drivers/virt-pci.c | 699 ++++++-------------------------- arch/um/drivers/virt-pci.h | 41 ++ arch/um/drivers/virtio_pcidev.c | 628 ++++++++++++++++++++++++++++ 5 files changed, 794 insertions(+), 589 deletions(-) create mode 100644 arch/um/drivers/virt-pci.h create mode 100644 arch/um/drivers/virtio_pcidev.c diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index ede40a160c5e..9cb196070614 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -345,16 +345,20 @@ config UML_RTC by providing a fake RTC clock that causes a wakeup at the right time. -config UML_PCI_OVER_VIRTIO - bool "Enable PCI over VIRTIO device simulation" - # in theory, just VIRTIO is enough, but that causes recursion - depends on VIRTIO_UML +config UML_PCI + bool select FORCE_PCI select UML_IOMEM_EMULATION select UML_DMA_EMULATION select PCI_MSI select PCI_LOCKLESS_CONFIG +config UML_PCI_OVER_VIRTIO + bool "Enable PCI over VIRTIO device simulation" + # in theory, just VIRTIO is enough, but that causes recursion + depends on VIRTIO_UML + select UML_PCI + config UML_PCI_OVER_VIRTIO_DEVICE_ID int "set the virtio device ID for PCI emulation" default -1 diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index 57882e6bc215..0a5820343ad3 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -60,7 +60,8 @@ obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o obj-$(CONFIG_UML_RANDOM) += random.o obj-$(CONFIG_VIRTIO_UML) += virtio_uml.o obj-$(CONFIG_UML_RTC) += rtc.o -obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virt-pci.o +obj-$(CONFIG_UML_PCI) += virt-pci.o +obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o # pcap_user.o must be added explicitly. 
USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index dd5580f975cc..b83b5a765d4e 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -5,52 +5,19 @@ */ #include #include -#include -#include #include #include #include -#include -#include -#include #include #include #include +#include "virt-pci.h" + #define MAX_DEVICES 8 #define MAX_MSI_VECTORS 32 #define CFG_SPACE_SIZE 4096 -/* for MSI-X we have a 32-bit payload */ -#define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32)) -#define NUM_IRQ_MSGS 10 - -struct um_pci_message_buffer { - struct virtio_pcidev_msg hdr; - u8 data[8]; -}; - -struct um_pci_device { - struct virtio_device *vdev; - - /* for now just standard BARs */ - u8 resptr[PCI_STD_NUM_BARS]; - - struct virtqueue *cmd_vq, *irq_vq; - -#define UM_PCI_WRITE_BUFS 20 - struct um_pci_message_buffer bufs[UM_PCI_WRITE_BUFS + 1]; - void *extra_ptrs[UM_PCI_WRITE_BUFS + 1]; - DECLARE_BITMAP(used_bufs, UM_PCI_WRITE_BUFS); - -#define UM_PCI_STAT_WAITING 0 - unsigned long status; - - int irq; - - bool platform; -}; - struct um_pci_device_reg { struct um_pci_device *dev; void __iomem *iomem; @@ -65,179 +32,15 @@ static struct irq_domain *um_pci_inner_domain; static struct irq_domain *um_pci_msi_domain; static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)]; -static unsigned int um_pci_max_delay_us = 40000; -module_param_named(max_delay_us, um_pci_max_delay_us, uint, 0644); - -static int um_pci_get_buf(struct um_pci_device *dev, bool *posted) -{ - int i; - - for (i = 0; i < UM_PCI_WRITE_BUFS; i++) { - if (!test_and_set_bit(i, dev->used_bufs)) - return i; - } - - *posted = false; - return UM_PCI_WRITE_BUFS; -} - -static void um_pci_free_buf(struct um_pci_device *dev, void *buf) -{ - int i; - - if (buf == &dev->bufs[UM_PCI_WRITE_BUFS]) { - kfree(dev->extra_ptrs[UM_PCI_WRITE_BUFS]); - dev->extra_ptrs[UM_PCI_WRITE_BUFS] = NULL; - return; - } - - for (i = 0; i < UM_PCI_WRITE_BUFS; i++) { - if (buf == &dev->bufs[i]) { - kfree(dev->extra_ptrs[i]); - dev->extra_ptrs[i] = NULL; - WARN_ON(!test_and_clear_bit(i, dev->used_bufs)); - return; - } - } - - WARN_ON(1); -} - -static int um_pci_send_cmd(struct um_pci_device *dev, - struct virtio_pcidev_msg *cmd, - unsigned int cmd_size, - const void *extra, unsigned int extra_size, - void *out, unsigned int out_size) -{ - struct scatterlist out_sg, extra_sg, in_sg; - struct scatterlist *sgs_list[] = { - [0] = &out_sg, - [1] = extra ? &extra_sg : &in_sg, - [2] = extra ? 
&in_sg : NULL, - }; - struct um_pci_message_buffer *buf; - int delay_count = 0; - bool bounce_out; - int ret, len; - int buf_idx; - bool posted; - - if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf))) - return -EINVAL; - - switch (cmd->op) { - case VIRTIO_PCIDEV_OP_CFG_WRITE: - case VIRTIO_PCIDEV_OP_MMIO_WRITE: - case VIRTIO_PCIDEV_OP_MMIO_MEMSET: - /* in PCI, writes are posted, so don't wait */ - posted = !out; - WARN_ON(!posted); - break; - default: - posted = false; - break; - } - - bounce_out = !posted && cmd_size <= sizeof(*cmd) && - out && out_size <= sizeof(buf->data); - - buf_idx = um_pci_get_buf(dev, &posted); - buf = &dev->bufs[buf_idx]; - memcpy(buf, cmd, cmd_size); - - if (posted && extra && extra_size > sizeof(buf) - cmd_size) { - dev->extra_ptrs[buf_idx] = kmemdup(extra, extra_size, - GFP_ATOMIC); - - if (!dev->extra_ptrs[buf_idx]) { - um_pci_free_buf(dev, buf); - return -ENOMEM; - } - extra = dev->extra_ptrs[buf_idx]; - } else if (extra && extra_size <= sizeof(buf) - cmd_size) { - memcpy((u8 *)buf + cmd_size, extra, extra_size); - cmd_size += extra_size; - extra_size = 0; - extra = NULL; - cmd = (void *)buf; - } else { - cmd = (void *)buf; - } - - sg_init_one(&out_sg, cmd, cmd_size); - if (extra) - sg_init_one(&extra_sg, extra, extra_size); - /* allow stack for small buffers */ - if (bounce_out) - sg_init_one(&in_sg, buf->data, out_size); - else if (out) - sg_init_one(&in_sg, out, out_size); - - /* add to internal virtio queue */ - ret = virtqueue_add_sgs(dev->cmd_vq, sgs_list, - extra ? 2 : 1, - out ? 1 : 0, - cmd, GFP_ATOMIC); - if (ret) { - um_pci_free_buf(dev, buf); - return ret; - } - - if (posted) { - virtqueue_kick(dev->cmd_vq); - return 0; - } - - /* kick and poll for getting a response on the queue */ - set_bit(UM_PCI_STAT_WAITING, &dev->status); - virtqueue_kick(dev->cmd_vq); - ret = 0; - - while (1) { - void *completed = virtqueue_get_buf(dev->cmd_vq, &len); - - if (completed == buf) - break; - - if (completed) - um_pci_free_buf(dev, completed); - - if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || - ++delay_count > um_pci_max_delay_us, - "um virt-pci delay: %d", delay_count)) { - ret = -EIO; - break; - } - udelay(1); - } - clear_bit(UM_PCI_STAT_WAITING, &dev->status); - - if (bounce_out) - memcpy(out, buf->data, out_size); - - um_pci_free_buf(dev, buf); - - return ret; -} - static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, int size) { struct um_pci_device_reg *reg = priv; struct um_pci_device *dev = reg->dev; - struct virtio_pcidev_msg hdr = { - .op = VIRTIO_PCIDEV_OP_CFG_READ, - .size = size, - .addr = offset, - }; - /* max 8, we might not use it all */ - u8 data[8]; if (!dev) return ULONG_MAX; - memset(data, 0xff, sizeof(data)); - switch (size) { case 1: case 2: @@ -251,23 +54,7 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, return ULONG_MAX; } - if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, size)) - return ULONG_MAX; - - switch (size) { - case 1: - return data[0]; - case 2: - return le16_to_cpup((void *)data); - case 4: - return le32_to_cpup((void *)data); -#ifdef CONFIG_64BIT - case 8: - return le64_to_cpup((void *)data); -#endif - default: - return ULONG_MAX; - } + return dev->ops->cfgspace_read(dev, offset, size); } static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size, @@ -275,42 +62,24 @@ static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size, { struct um_pci_device_reg *reg = priv; struct um_pci_device *dev = reg->dev; - struct { - 
struct virtio_pcidev_msg hdr; - /* maximum size - we may only use parts of it */ - u8 data[8]; - } msg = { - .hdr = { - .op = VIRTIO_PCIDEV_OP_CFG_WRITE, - .size = size, - .addr = offset, - }, - }; if (!dev) return; switch (size) { case 1: - msg.data[0] = (u8)val; - break; case 2: - put_unaligned_le16(val, (void *)msg.data); - break; case 4: - put_unaligned_le32(val, (void *)msg.data); - break; #ifdef CONFIG_64BIT case 8: - put_unaligned_le64(val, (void *)msg.data); - break; #endif + break; default: WARN(1, "invalid config space write size %d\n", size); return; } - WARN_ON(um_pci_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0)); + dev->ops->cfgspace_write(dev, offset, size, val); } static const struct logic_iomem_ops um_pci_device_cfgspace_ops = { @@ -318,30 +87,14 @@ static const struct logic_iomem_ops um_pci_device_cfgspace_ops = { .write = um_pci_cfgspace_write, }; -static void um_pci_bar_copy_from(void *priv, void *buffer, - unsigned int offset, int size) +static unsigned long um_pci_bar_read(void *priv, unsigned int offset, + int size) { u8 *resptr = priv; struct um_pci_device *dev = container_of(resptr - *resptr, struct um_pci_device, resptr[0]); - struct virtio_pcidev_msg hdr = { - .op = VIRTIO_PCIDEV_OP_MMIO_READ, - .bar = *resptr, - .size = size, - .addr = offset, - }; - - memset(buffer, 0xff, size); - - um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, buffer, size); -} - -static unsigned long um_pci_bar_read(void *priv, unsigned int offset, - int size) -{ - /* 8 is maximum size - we may only use parts of it */ - u8 data[8]; + u8 bar = *resptr; switch (size) { case 1: @@ -352,72 +105,60 @@ static unsigned long um_pci_bar_read(void *priv, unsigned int offset, #endif break; default: - WARN(1, "invalid config space read size %d\n", size); + WARN(1, "invalid bar read size %d\n", size); return ULONG_MAX; } - um_pci_bar_copy_from(priv, data, offset, size); + return dev->ops->bar_read(dev, bar, offset, size); +} + +static void um_pci_bar_write(void *priv, unsigned int offset, int size, + unsigned long val) +{ + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + u8 bar = *resptr; switch (size) { case 1: - return data[0]; case 2: - return le16_to_cpup((void *)data); case 4: - return le32_to_cpup((void *)data); #ifdef CONFIG_64BIT case 8: - return le64_to_cpup((void *)data); #endif + break; default: - return ULONG_MAX; + WARN(1, "invalid bar write size %d\n", size); + return; } + + dev->ops->bar_write(dev, bar, offset, size, val); } -static void um_pci_bar_copy_to(void *priv, unsigned int offset, - const void *buffer, int size) +static void um_pci_bar_copy_from(void *priv, void *buffer, + unsigned int offset, int size) { u8 *resptr = priv; struct um_pci_device *dev = container_of(resptr - *resptr, struct um_pci_device, resptr[0]); - struct virtio_pcidev_msg hdr = { - .op = VIRTIO_PCIDEV_OP_MMIO_WRITE, - .bar = *resptr, - .size = size, - .addr = offset, - }; + u8 bar = *resptr; - um_pci_send_cmd(dev, &hdr, sizeof(hdr), buffer, size, NULL, 0); + dev->ops->bar_copy_from(dev, bar, buffer, offset, size); } -static void um_pci_bar_write(void *priv, unsigned int offset, int size, - unsigned long val) +static void um_pci_bar_copy_to(void *priv, unsigned int offset, + const void *buffer, int size) { - /* maximum size - we may only use parts of it */ - u8 data[8]; - - switch (size) { - case 1: - data[0] = (u8)val; - break; - case 2: - put_unaligned_le16(val, (void *)data); - break; - case 4: - put_unaligned_le32(val, (void 
*)data); - break; -#ifdef CONFIG_64BIT - case 8: - put_unaligned_le64(val, (void *)data); - break; -#endif - default: - WARN(1, "invalid config space write size %d\n", size); - return; - } + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + u8 bar = *resptr; - um_pci_bar_copy_to(priv, offset, data, size); + dev->ops->bar_copy_to(dev, bar, offset, buffer, size); } static void um_pci_bar_set(void *priv, unsigned int offset, u8 value, int size) @@ -426,20 +167,9 @@ static void um_pci_bar_set(void *priv, unsigned int offset, u8 value, int size) struct um_pci_device *dev = container_of(resptr - *resptr, struct um_pci_device, resptr[0]); - struct { - struct virtio_pcidev_msg hdr; - u8 data; - } msg = { - .hdr = { - .op = VIRTIO_PCIDEV_OP_CFG_WRITE, - .bar = *resptr, - .size = size, - .addr = offset, - }, - .data = value, - }; + u8 bar = *resptr; - um_pci_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0); + dev->ops->bar_set(dev, bar, offset, value, size); } static const struct logic_iomem_ops um_pci_device_bar_ops = { @@ -486,76 +216,6 @@ static void um_pci_rescan(void) pci_unlock_rescan_remove(); } -static void um_pci_irq_vq_addbuf(struct virtqueue *vq, void *buf, bool kick) -{ - struct scatterlist sg[1]; - - sg_init_one(sg, buf, MAX_IRQ_MSG_SIZE); - if (virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC)) - kfree(buf); - else if (kick) - virtqueue_kick(vq); -} - -static void um_pci_handle_irq_message(struct virtqueue *vq, - struct virtio_pcidev_msg *msg) -{ - struct virtio_device *vdev = vq->vdev; - struct um_pci_device *dev = vdev->priv; - - if (!dev->irq) - return; - - /* we should properly chain interrupts, but on ARCH=um we don't care */ - - switch (msg->op) { - case VIRTIO_PCIDEV_OP_INT: - generic_handle_irq(dev->irq); - break; - case VIRTIO_PCIDEV_OP_MSI: - /* our MSI message is just the interrupt number */ - if (msg->size == sizeof(u32)) - generic_handle_irq(le32_to_cpup((void *)msg->data)); - else - generic_handle_irq(le16_to_cpup((void *)msg->data)); - break; - case VIRTIO_PCIDEV_OP_PME: - /* nothing to do - we already woke up due to the message */ - break; - default: - dev_err(&vdev->dev, "unexpected virt-pci message %d\n", msg->op); - break; - } -} - -static void um_pci_cmd_vq_cb(struct virtqueue *vq) -{ - struct virtio_device *vdev = vq->vdev; - struct um_pci_device *dev = vdev->priv; - void *cmd; - int len; - - if (test_bit(UM_PCI_STAT_WAITING, &dev->status)) - return; - - while ((cmd = virtqueue_get_buf(vq, &len))) - um_pci_free_buf(dev, cmd); -} - -static void um_pci_irq_vq_cb(struct virtqueue *vq) -{ - struct virtio_pcidev_msg *msg; - int len; - - while ((msg = virtqueue_get_buf(vq, &len))) { - if (len >= sizeof(*msg)) - um_pci_handle_irq_message(vq, msg); - - /* recycle the message buffer */ - um_pci_irq_vq_addbuf(vq, msg, true); - } -} - #ifdef CONFIG_OF /* Copied from arch/x86/kernel/devicetree.c */ struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) @@ -577,200 +237,6 @@ struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) } #endif -static int um_pci_init_vqs(struct um_pci_device *dev) -{ - struct virtqueue_info vqs_info[] = { - { "cmd", um_pci_cmd_vq_cb }, - { "irq", um_pci_irq_vq_cb }, - }; - struct virtqueue *vqs[2]; - int err, i; - - err = virtio_find_vqs(dev->vdev, 2, vqs, vqs_info, NULL); - if (err) - return err; - - dev->cmd_vq = vqs[0]; - dev->irq_vq = vqs[1]; - - virtio_device_ready(dev->vdev); - - for (i = 0; i < NUM_IRQ_MSGS; i++) { - void *msg = 
kzalloc(MAX_IRQ_MSG_SIZE, GFP_KERNEL); - - if (msg) - um_pci_irq_vq_addbuf(dev->irq_vq, msg, false); - } - - virtqueue_kick(dev->irq_vq); - - return 0; -} - -static void __um_pci_virtio_platform_remove(struct virtio_device *vdev, - struct um_pci_device *dev) -{ - virtio_reset_device(vdev); - vdev->config->del_vqs(vdev); - - mutex_lock(&um_pci_mtx); - um_pci_platform_device = NULL; - mutex_unlock(&um_pci_mtx); - - kfree(dev); -} - -static int um_pci_virtio_platform_probe(struct virtio_device *vdev, - struct um_pci_device *dev) -{ - int ret; - - dev->platform = true; - - mutex_lock(&um_pci_mtx); - - if (um_pci_platform_device) { - mutex_unlock(&um_pci_mtx); - ret = -EBUSY; - goto out_free; - } - - ret = um_pci_init_vqs(dev); - if (ret) { - mutex_unlock(&um_pci_mtx); - goto out_free; - } - - um_pci_platform_device = dev; - - mutex_unlock(&um_pci_mtx); - - ret = of_platform_default_populate(vdev->dev.of_node, NULL, &vdev->dev); - if (ret) - __um_pci_virtio_platform_remove(vdev, dev); - - return ret; - -out_free: - kfree(dev); - return ret; -} - -static int um_pci_virtio_probe(struct virtio_device *vdev) -{ - struct um_pci_device *dev; - int i, free = -1; - int err = -ENOSPC; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - - dev->vdev = vdev; - vdev->priv = dev; - - if (of_device_is_compatible(vdev->dev.of_node, "simple-bus")) - return um_pci_virtio_platform_probe(vdev, dev); - - mutex_lock(&um_pci_mtx); - for (i = 0; i < MAX_DEVICES; i++) { - if (um_pci_devices[i].dev) - continue; - free = i; - break; - } - - if (free < 0) - goto error; - - err = um_pci_init_vqs(dev); - if (err) - goto error; - - dev->irq = irq_alloc_desc(numa_node_id()); - if (dev->irq < 0) { - err = dev->irq; - goto err_reset; - } - um_pci_devices[free].dev = dev; - vdev->priv = dev; - - mutex_unlock(&um_pci_mtx); - - device_set_wakeup_enable(&vdev->dev, true); - - /* - * In order to do suspend-resume properly, don't allow VQs - * to be suspended. 
- */ - virtio_uml_set_no_vq_suspend(vdev, true); - - um_pci_rescan(); - return 0; -err_reset: - virtio_reset_device(vdev); - vdev->config->del_vqs(vdev); -error: - mutex_unlock(&um_pci_mtx); - kfree(dev); - return err; -} - -static void um_pci_virtio_remove(struct virtio_device *vdev) -{ - struct um_pci_device *dev = vdev->priv; - int i; - - if (dev->platform) { - of_platform_depopulate(&vdev->dev); - __um_pci_virtio_platform_remove(vdev, dev); - return; - } - - device_set_wakeup_enable(&vdev->dev, false); - - mutex_lock(&um_pci_mtx); - for (i = 0; i < MAX_DEVICES; i++) { - if (um_pci_devices[i].dev != dev) - continue; - - um_pci_devices[i].dev = NULL; - irq_free_desc(dev->irq); - - break; - } - mutex_unlock(&um_pci_mtx); - - if (i < MAX_DEVICES) { - struct pci_dev *pci_dev; - - pci_dev = pci_get_slot(bridge->bus, i); - if (pci_dev) - pci_stop_and_remove_bus_device_locked(pci_dev); - } - - /* Stop all virtqueues */ - virtio_reset_device(vdev); - dev->cmd_vq = NULL; - dev->irq_vq = NULL; - vdev->config->del_vqs(vdev); - - kfree(dev); -} - -static struct virtio_device_id id_table[] = { - { CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID, VIRTIO_DEV_ANY_ID }, - { 0 }, -}; -MODULE_DEVICE_TABLE(virtio, id_table); - -static struct virtio_driver um_pci_virtio_driver = { - .driver.name = "virtio-pci", - .id_table = id_table, - .probe = um_pci_virtio_probe, - .remove = um_pci_virtio_remove, -}; - static struct resource virt_cfgspace_resource = { .name = "PCI config space", .start = 0xf0000000 - MAX_DEVICES * CFG_SPACE_SIZE, @@ -889,7 +355,7 @@ static void um_pci_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) } static struct irq_chip um_pci_msi_bottom_irq_chip = { - .name = "UM virtio MSI", + .name = "UM virtual MSI", .irq_compose_msi_msg = um_pci_compose_msi_msg, }; @@ -939,7 +405,7 @@ static const struct irq_domain_ops um_pci_inner_domain_ops = { }; static struct irq_chip um_pci_msi_irq_chip = { - .name = "UM virtio PCIe MSI", + .name = "UM virtual PCIe MSI", .irq_mask = pci_msi_mask_irq, .irq_unmask = pci_msi_unmask_irq, }; @@ -998,6 +464,78 @@ static struct resource virt_platform_resource = { .flags = IORESOURCE_MEM, }; +int um_pci_device_register(struct um_pci_device *dev) +{ + int i, free = -1; + int err = 0; + + mutex_lock(&um_pci_mtx); + for (i = 0; i < MAX_DEVICES; i++) { + if (um_pci_devices[i].dev) + continue; + free = i; + break; + } + + if (free < 0) { + err = -ENOSPC; + goto out; + } + + dev->irq = irq_alloc_desc(numa_node_id()); + if (dev->irq < 0) { + err = dev->irq; + goto out; + } + + um_pci_devices[free].dev = dev; + +out: + mutex_unlock(&um_pci_mtx); + if (!err) + um_pci_rescan(); + return err; +} + +void um_pci_device_unregister(struct um_pci_device *dev) +{ + int i; + + mutex_lock(&um_pci_mtx); + for (i = 0; i < MAX_DEVICES; i++) { + if (um_pci_devices[i].dev != dev) + continue; + um_pci_devices[i].dev = NULL; + irq_free_desc(dev->irq); + break; + } + mutex_unlock(&um_pci_mtx); + + if (i < MAX_DEVICES) { + struct pci_dev *pci_dev; + + pci_dev = pci_get_slot(bridge->bus, i); + if (pci_dev) + pci_stop_and_remove_bus_device_locked(pci_dev); + } +} + +int um_pci_platform_device_register(struct um_pci_device *dev) +{ + guard(mutex)(&um_pci_mtx); + if (um_pci_platform_device) + return -EBUSY; + um_pci_platform_device = dev; + return 0; +} + +void um_pci_platform_device_unregister(struct um_pci_device *dev) +{ + guard(mutex)(&um_pci_mtx); + if (um_pci_platform_device == dev) + um_pci_platform_device = NULL; +} + static int __init um_pci_init(void) { struct irq_domain_info 
inner_domain_info = { @@ -1014,10 +552,6 @@ static int __init um_pci_init(void) WARN_ON(logic_iomem_add_region(&virt_platform_resource, &um_pci_platform_ops)); - if (WARN(CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID < 0, - "No virtio device ID configured for PCI - no PCI support\n")) - return 0; - bridge = pci_alloc_host_bridge(0); if (!bridge) { err = -ENOMEM; @@ -1065,10 +599,8 @@ static int __init um_pci_init(void) if (err) goto free; - err = register_virtio_driver(&um_pci_virtio_driver); - if (err) - goto free; return 0; + free: if (!IS_ERR_OR_NULL(um_pci_inner_domain)) irq_domain_remove(um_pci_inner_domain); @@ -1080,11 +612,10 @@ free: } return err; } -module_init(um_pci_init); +device_initcall(um_pci_init); static void __exit um_pci_exit(void) { - unregister_virtio_driver(&um_pci_virtio_driver); irq_domain_remove(um_pci_msi_domain); irq_domain_remove(um_pci_inner_domain); pci_free_resource_list(&bridge->windows); diff --git a/arch/um/drivers/virt-pci.h b/arch/um/drivers/virt-pci.h new file mode 100644 index 000000000000..b20d1475d1eb --- /dev/null +++ b/arch/um/drivers/virt-pci.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_VIRT_PCI_H +#define __UM_VIRT_PCI_H + +#include + +struct um_pci_device { + const struct um_pci_ops *ops; + + /* for now just standard BARs */ + u8 resptr[PCI_STD_NUM_BARS]; + + int irq; +}; + +struct um_pci_ops { + unsigned long (*cfgspace_read)(struct um_pci_device *dev, + unsigned int offset, int size); + void (*cfgspace_write)(struct um_pci_device *dev, unsigned int offset, + int size, unsigned long val); + + unsigned long (*bar_read)(struct um_pci_device *dev, int bar, + unsigned int offset, int size); + void (*bar_write)(struct um_pci_device *dev, int bar, + unsigned int offset, int size, unsigned long val); + + void (*bar_copy_from)(struct um_pci_device *dev, int bar, void *buffer, + unsigned int offset, int size); + void (*bar_copy_to)(struct um_pci_device *dev, int bar, + unsigned int offset, const void *buffer, int size); + void (*bar_set)(struct um_pci_device *dev, int bar, + unsigned int offset, u8 value, int size); +}; + +int um_pci_device_register(struct um_pci_device *dev); +void um_pci_device_unregister(struct um_pci_device *dev); + +int um_pci_platform_device_register(struct um_pci_device *dev); +void um_pci_platform_device_unregister(struct um_pci_device *dev); + +#endif /* __UM_VIRT_PCI_H */ diff --git a/arch/um/drivers/virtio_pcidev.c b/arch/um/drivers/virtio_pcidev.c new file mode 100644 index 000000000000..3c4c4c928fdd --- /dev/null +++ b/arch/um/drivers/virtio_pcidev.c @@ -0,0 +1,628 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Intel Corporation + * Author: Johannes Berg + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "virt-pci.h" + +#define to_virtio_pcidev(_pdev) \ + container_of(_pdev, struct virtio_pcidev_device, pdev) + +/* for MSI-X we have a 32-bit payload */ +#define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32)) +#define NUM_IRQ_MSGS 10 + +struct virtio_pcidev_message_buffer { + struct virtio_pcidev_msg hdr; + u8 data[8]; +}; + +struct virtio_pcidev_device { + struct um_pci_device pdev; + struct virtio_device *vdev; + + struct virtqueue *cmd_vq, *irq_vq; + +#define VIRTIO_PCIDEV_WRITE_BUFS 20 + struct virtio_pcidev_message_buffer bufs[VIRTIO_PCIDEV_WRITE_BUFS + 1]; + void *extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS + 1]; + DECLARE_BITMAP(used_bufs, VIRTIO_PCIDEV_WRITE_BUFS); + 
+#define UM_PCI_STAT_WAITING 0 + unsigned long status; + + bool platform; +}; + +static unsigned int virtio_pcidev_max_delay_us = 40000; +module_param_named(max_delay_us, virtio_pcidev_max_delay_us, uint, 0644); + +static int virtio_pcidev_get_buf(struct virtio_pcidev_device *dev, bool *posted) +{ + int i; + + for (i = 0; i < VIRTIO_PCIDEV_WRITE_BUFS; i++) { + if (!test_and_set_bit(i, dev->used_bufs)) + return i; + } + + *posted = false; + return VIRTIO_PCIDEV_WRITE_BUFS; +} + +static void virtio_pcidev_free_buf(struct virtio_pcidev_device *dev, void *buf) +{ + int i; + + if (buf == &dev->bufs[VIRTIO_PCIDEV_WRITE_BUFS]) { + kfree(dev->extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS]); + dev->extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS] = NULL; + return; + } + + for (i = 0; i < VIRTIO_PCIDEV_WRITE_BUFS; i++) { + if (buf == &dev->bufs[i]) { + kfree(dev->extra_ptrs[i]); + dev->extra_ptrs[i] = NULL; + WARN_ON(!test_and_clear_bit(i, dev->used_bufs)); + return; + } + } + + WARN_ON(1); +} + +static int virtio_pcidev_send_cmd(struct virtio_pcidev_device *dev, + struct virtio_pcidev_msg *cmd, + unsigned int cmd_size, + const void *extra, unsigned int extra_size, + void *out, unsigned int out_size) +{ + struct scatterlist out_sg, extra_sg, in_sg; + struct scatterlist *sgs_list[] = { + [0] = &out_sg, + [1] = extra ? &extra_sg : &in_sg, + [2] = extra ? &in_sg : NULL, + }; + struct virtio_pcidev_message_buffer *buf; + int delay_count = 0; + bool bounce_out; + int ret, len; + int buf_idx; + bool posted; + + if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf))) + return -EINVAL; + + switch (cmd->op) { + case VIRTIO_PCIDEV_OP_CFG_WRITE: + case VIRTIO_PCIDEV_OP_MMIO_WRITE: + case VIRTIO_PCIDEV_OP_MMIO_MEMSET: + /* in PCI, writes are posted, so don't wait */ + posted = !out; + WARN_ON(!posted); + break; + default: + posted = false; + break; + } + + bounce_out = !posted && cmd_size <= sizeof(*cmd) && + out && out_size <= sizeof(buf->data); + + buf_idx = virtio_pcidev_get_buf(dev, &posted); + buf = &dev->bufs[buf_idx]; + memcpy(buf, cmd, cmd_size); + + if (posted && extra && extra_size > sizeof(buf) - cmd_size) { + dev->extra_ptrs[buf_idx] = kmemdup(extra, extra_size, + GFP_ATOMIC); + + if (!dev->extra_ptrs[buf_idx]) { + virtio_pcidev_free_buf(dev, buf); + return -ENOMEM; + } + extra = dev->extra_ptrs[buf_idx]; + } else if (extra && extra_size <= sizeof(buf) - cmd_size) { + memcpy((u8 *)buf + cmd_size, extra, extra_size); + cmd_size += extra_size; + extra_size = 0; + extra = NULL; + cmd = (void *)buf; + } else { + cmd = (void *)buf; + } + + sg_init_one(&out_sg, cmd, cmd_size); + if (extra) + sg_init_one(&extra_sg, extra, extra_size); + /* allow stack for small buffers */ + if (bounce_out) + sg_init_one(&in_sg, buf->data, out_size); + else if (out) + sg_init_one(&in_sg, out, out_size); + + /* add to internal virtio queue */ + ret = virtqueue_add_sgs(dev->cmd_vq, sgs_list, + extra ? 2 : 1, + out ? 
1 : 0, + cmd, GFP_ATOMIC); + if (ret) { + virtio_pcidev_free_buf(dev, buf); + return ret; + } + + if (posted) { + virtqueue_kick(dev->cmd_vq); + return 0; + } + + /* kick and poll for getting a response on the queue */ + set_bit(UM_PCI_STAT_WAITING, &dev->status); + virtqueue_kick(dev->cmd_vq); + ret = 0; + + while (1) { + void *completed = virtqueue_get_buf(dev->cmd_vq, &len); + + if (completed == buf) + break; + + if (completed) + virtio_pcidev_free_buf(dev, completed); + + if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || + ++delay_count > virtio_pcidev_max_delay_us, + "um virt-pci delay: %d", delay_count)) { + ret = -EIO; + break; + } + udelay(1); + } + clear_bit(UM_PCI_STAT_WAITING, &dev->status); + + if (bounce_out) + memcpy(out, buf->data, out_size); + + virtio_pcidev_free_buf(dev, buf); + + return ret; +} + +static unsigned long virtio_pcidev_cfgspace_read(struct um_pci_device *pdev, + unsigned int offset, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_READ, + .size = size, + .addr = offset, + }; + /* max 8, we might not use it all */ + u8 data[8]; + + memset(data, 0xff, sizeof(data)); + + /* size has been checked in um_pci_cfgspace_read() */ + if (virtio_pcidev_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, size)) + return ULONG_MAX; + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static void virtio_pcidev_cfgspace_write(struct um_pci_device *pdev, + unsigned int offset, int size, + unsigned long val) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct { + struct virtio_pcidev_msg hdr; + /* maximum size - we may only use parts of it */ + u8 data[8]; + } msg = { + .hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_WRITE, + .size = size, + .addr = offset, + }, + }; + + /* size has been checked in um_pci_cfgspace_write() */ + switch (size) { + case 1: + msg.data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)msg.data); + break; + case 4: + put_unaligned_le32(val, (void *)msg.data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)msg.data); + break; +#endif + } + + WARN_ON(virtio_pcidev_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0)); +} + +static void virtio_pcidev_bar_copy_from(struct um_pci_device *pdev, + int bar, void *buffer, + unsigned int offset, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_MMIO_READ, + .bar = bar, + .size = size, + .addr = offset, + }; + + memset(buffer, 0xff, size); + + virtio_pcidev_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, buffer, size); +} + +static unsigned long virtio_pcidev_bar_read(struct um_pci_device *pdev, int bar, + unsigned int offset, int size) +{ + /* 8 is maximum size - we may only use parts of it */ + u8 data[8]; + + /* size has been checked in um_pci_bar_read() */ + virtio_pcidev_bar_copy_from(pdev, bar, data, offset, size); + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static void virtio_pcidev_bar_copy_to(struct um_pci_device *pdev, + int bar, unsigned int offset, + const void *buffer, 
int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_MMIO_WRITE, + .bar = bar, + .size = size, + .addr = offset, + }; + + virtio_pcidev_send_cmd(dev, &hdr, sizeof(hdr), buffer, size, NULL, 0); +} + +static void virtio_pcidev_bar_write(struct um_pci_device *pdev, int bar, + unsigned int offset, int size, + unsigned long val) +{ + /* maximum size - we may only use parts of it */ + u8 data[8]; + + /* size has been checked in um_pci_bar_write() */ + switch (size) { + case 1: + data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)data); + break; + case 4: + put_unaligned_le32(val, (void *)data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)data); + break; +#endif + } + + virtio_pcidev_bar_copy_to(pdev, bar, offset, data, size); +} + +static void virtio_pcidev_bar_set(struct um_pci_device *pdev, int bar, + unsigned int offset, u8 value, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct { + struct virtio_pcidev_msg hdr; + u8 data; + } msg = { + .hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_WRITE, + .bar = bar, + .size = size, + .addr = offset, + }, + .data = value, + }; + + virtio_pcidev_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0); +} + +static const struct um_pci_ops virtio_pcidev_um_pci_ops = { + .cfgspace_read = virtio_pcidev_cfgspace_read, + .cfgspace_write = virtio_pcidev_cfgspace_write, + .bar_read = virtio_pcidev_bar_read, + .bar_write = virtio_pcidev_bar_write, + .bar_copy_from = virtio_pcidev_bar_copy_from, + .bar_copy_to = virtio_pcidev_bar_copy_to, + .bar_set = virtio_pcidev_bar_set, +}; + +static void virtio_pcidev_irq_vq_addbuf(struct virtqueue *vq, void *buf, bool kick) +{ + struct scatterlist sg[1]; + + sg_init_one(sg, buf, MAX_IRQ_MSG_SIZE); + if (virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC)) + kfree(buf); + else if (kick) + virtqueue_kick(vq); +} + +static void virtio_pcidev_handle_irq_message(struct virtqueue *vq, + struct virtio_pcidev_msg *msg) +{ + struct virtio_device *vdev = vq->vdev; + struct virtio_pcidev_device *dev = vdev->priv; + + if (!dev->pdev.irq) + return; + + /* we should properly chain interrupts, but on ARCH=um we don't care */ + + switch (msg->op) { + case VIRTIO_PCIDEV_OP_INT: + generic_handle_irq(dev->pdev.irq); + break; + case VIRTIO_PCIDEV_OP_MSI: + /* our MSI message is just the interrupt number */ + if (msg->size == sizeof(u32)) + generic_handle_irq(le32_to_cpup((void *)msg->data)); + else + generic_handle_irq(le16_to_cpup((void *)msg->data)); + break; + case VIRTIO_PCIDEV_OP_PME: + /* nothing to do - we already woke up due to the message */ + break; + default: + dev_err(&vdev->dev, "unexpected virt-pci message %d\n", msg->op); + break; + } +} + +static void virtio_pcidev_cmd_vq_cb(struct virtqueue *vq) +{ + struct virtio_device *vdev = vq->vdev; + struct virtio_pcidev_device *dev = vdev->priv; + void *cmd; + int len; + + if (test_bit(UM_PCI_STAT_WAITING, &dev->status)) + return; + + while ((cmd = virtqueue_get_buf(vq, &len))) + virtio_pcidev_free_buf(dev, cmd); +} + +static void virtio_pcidev_irq_vq_cb(struct virtqueue *vq) +{ + struct virtio_pcidev_msg *msg; + int len; + + while ((msg = virtqueue_get_buf(vq, &len))) { + if (len >= sizeof(*msg)) + virtio_pcidev_handle_irq_message(vq, msg); + + /* recycle the message buffer */ + virtio_pcidev_irq_vq_addbuf(vq, msg, true); + } +} + +static int virtio_pcidev_init_vqs(struct virtio_pcidev_device *dev) +{ + struct virtqueue_info vqs_info[] = { 
+ { "cmd", virtio_pcidev_cmd_vq_cb }, + { "irq", virtio_pcidev_irq_vq_cb }, + }; + struct virtqueue *vqs[2]; + int err, i; + + err = virtio_find_vqs(dev->vdev, 2, vqs, vqs_info, NULL); + if (err) + return err; + + dev->cmd_vq = vqs[0]; + dev->irq_vq = vqs[1]; + + virtio_device_ready(dev->vdev); + + for (i = 0; i < NUM_IRQ_MSGS; i++) { + void *msg = kzalloc(MAX_IRQ_MSG_SIZE, GFP_KERNEL); + + if (msg) + virtio_pcidev_irq_vq_addbuf(dev->irq_vq, msg, false); + } + + virtqueue_kick(dev->irq_vq); + + return 0; +} + +static void __virtio_pcidev_virtio_platform_remove(struct virtio_device *vdev, + struct virtio_pcidev_device *dev) +{ + um_pci_platform_device_unregister(&dev->pdev); + + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); + + kfree(dev); +} + +static int virtio_pcidev_virtio_platform_probe(struct virtio_device *vdev, + struct virtio_pcidev_device *dev) +{ + int err; + + dev->platform = true; + + err = virtio_pcidev_init_vqs(dev); + if (err) + goto err_free; + + err = um_pci_platform_device_register(&dev->pdev); + if (err) + goto err_reset; + + err = of_platform_default_populate(vdev->dev.of_node, NULL, &vdev->dev); + if (err) + goto err_unregister; + + return 0; + +err_unregister: + um_pci_platform_device_unregister(&dev->pdev); +err_reset: + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); +err_free: + kfree(dev); + return err; +} + +static int virtio_pcidev_virtio_probe(struct virtio_device *vdev) +{ + struct virtio_pcidev_device *dev; + int err; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->vdev = vdev; + vdev->priv = dev; + + dev->pdev.ops = &virtio_pcidev_um_pci_ops; + + if (of_device_is_compatible(vdev->dev.of_node, "simple-bus")) + return virtio_pcidev_virtio_platform_probe(vdev, dev); + + err = virtio_pcidev_init_vqs(dev); + if (err) + goto err_free; + + err = um_pci_device_register(&dev->pdev); + if (err) + goto err_reset; + + device_set_wakeup_enable(&vdev->dev, true); + + /* + * In order to do suspend-resume properly, don't allow VQs + * to be suspended. + */ + virtio_uml_set_no_vq_suspend(vdev, true); + + return 0; + +err_reset: + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); +err_free: + kfree(dev); + return err; +} + +static void virtio_pcidev_virtio_remove(struct virtio_device *vdev) +{ + struct virtio_pcidev_device *dev = vdev->priv; + + if (dev->platform) { + of_platform_depopulate(&vdev->dev); + __virtio_pcidev_virtio_platform_remove(vdev, dev); + return; + } + + device_set_wakeup_enable(&vdev->dev, false); + + um_pci_device_unregister(&dev->pdev); + + /* Stop all virtqueues */ + virtio_reset_device(vdev); + dev->cmd_vq = NULL; + dev->irq_vq = NULL; + vdev->config->del_vqs(vdev); + + kfree(dev); +} + +static struct virtio_device_id id_table[] = { + { CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; +MODULE_DEVICE_TABLE(virtio, id_table); + +static struct virtio_driver virtio_pcidev_virtio_driver = { + .driver.name = "virtio-pci", + .id_table = id_table, + .probe = virtio_pcidev_virtio_probe, + .remove = virtio_pcidev_virtio_remove, +}; + +static int __init virtio_pcidev_init(void) +{ + if (WARN(CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID < 0, + "No virtio device ID configured for PCI - no PCI support\n")) + return 0; + + return register_virtio_driver(&virtio_pcidev_virtio_driver); +} +late_initcall(virtio_pcidev_init); + +static void __exit virtio_pcidev_exit(void) +{ + unregister_virtio_driver(&virtio_pcidev_virtio_driver); +} +module_exit(virtio_pcidev_exit); -- 2.51.0