From a2f925a2f62254119cdaa360cfc9c0424bccd531 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Fri, 28 Feb 2025 13:26:04 +0100 Subject: [PATCH 01/16] Revert "ata: libata-core: Add ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives" This reverts commit cc77e2ce187d26cc66af3577bf896d7410eb25ab. It was reported that adding ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives breaks entering lower package states for certain systems. It turns out that Samsung SSD 870 QVO actually has working LPM when using a recent SSD firmware version. The author of commit cc77e2ce187d ("ata: libata-core: Add ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives") reported himself that only older SSD firmware versions have broken LPM: https://lore.kernel.org/stable/93c10d38-718c-459d-84a5-4d87680b4da7@debian.org/ Unfortunately, he did not specify which older firmware version he was using which had broken LPM. Let's revert this quirk, which has FW version field specified as NULL (which means that it applies for all Samsung SSD 870 QVO firmware versions) for now. Once the author reports which older firmware version(s) that are broken, we can create a more fine grained quirk, which populates the FW version field accordingly. Fixes: cc77e2ce187d ("ata: libata-core: Add ATA_QUIRK_NOLPM for Samsung SSD 870 QVO drives") Reported-by: Dieter Mummenschanz Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219747 Link: https://lore.kernel.org/r/20250228122603.91814-2-cassel@kernel.org Signed-off-by: Niklas Cassel --- drivers/ata/libata-core.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 63ec2f218431..c085dd81ebe7 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4143,10 +4143,6 @@ static const struct ata_dev_quirks_entry __ata_dev_quirks[] = { { "Samsung SSD 860*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI }, - { "Samsung SSD 870 QVO*", NULL, ATA_QUIRK_NO_NCQ_TRIM | - ATA_QUIRK_ZERO_AFTER_TRIM | - ATA_QUIRK_NO_NCQ_ON_ATI | - ATA_QUIRK_NOLPM }, { "Samsung SSD 870*", NULL, ATA_QUIRK_NO_NCQ_TRIM | ATA_QUIRK_ZERO_AFTER_TRIM | ATA_QUIRK_NO_NCQ_ON_ATI }, -- 2.51.0 From 7eb172143d5508b4da468ed59ee857c6e5e01da6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 2 Mar 2025 11:48:20 -0800 Subject: [PATCH 02/16] Linux 6.14-rc5 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 30dab4c8b012..70bdbf2218fc 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 14 SUBLEVEL = 0 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From 5550187c4c21740942c32a9ae56f9f472a104cb4 Mon Sep 17 00:00:00 2001 From: David Gow Date: Mon, 10 Feb 2025 18:53:51 +0800 Subject: [PATCH 03/16] um: Pass the correct Rust target and options with gcc In order to work around some issues with disabling SSE on older versions of gcc (compilation would fail upon seeing a function declaration containing a float, even if it was never called or defined), the corresponding CFLAGS and RUSTFLAGS were only set when using clang. However, this led to two problems: - Newer gcc versions also wouldn't get the correct flags, despite not having the bug. - The RUSTFLAGS for setting the rust target definition were not set, despite being unrelated. This works by chance for x86_64, as the built-in default target is close enough, but not for 32-bit x86. Move the target definition outside the conditional block, and update the condition to take into account the gcc version. Fixes: a3046a618a28 ("um: Only disable SSE on clang to work around old GCC bugs") Signed-off-by: David Gow Link: https://patch.msgid.link/20250210105353.2238769-2-davidgow@google.com Signed-off-by: Johannes Berg --- arch/x86/Makefile.um | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index a46b1397ad01..c86cbd9cbba3 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -7,12 +7,13 @@ core-y += arch/x86/crypto/ # GCC versions < 11. See: # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99652 # -ifeq ($(CONFIG_CC_IS_CLANG),y) -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json +ifeq ($(call gcc-min-version, 110000)$(CONFIG_CC_IS_CLANG),y) +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 endif +KBUILD_RUSTFLAGS += --target=$(objtree)/scripts/target.json + ifeq ($(CONFIG_X86_32),y) START := 0x8048000 -- 2.51.0 From d1d7f01f7cd35e16c6bcef5a0e31988b5c9980f9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 10 Feb 2025 17:09:25 +0100 Subject: [PATCH 04/16] um: mark rodata read-only and implement _nofault accesses Mark read-only data actually read-only (simple mprotect), and to be able to test it also implement _nofault accesses. This works by setting up a new "segv_continue" pointer in current, and then when we hit a segfault we change the signal return context so that we continue at that address. The code using this sets it up so that it jumps to a label and then aborts the access that way, returning -EFAULT. It's possible to optimize the ___backtrack_faulted() thing by using asm goto (compiler version dependent) and/or gcc's (not sure if clang has it) &&label extension, but at least in one attempt I made the && caused the compiler to not load -EFAULT into the register in case of jumping to the &&label from the fault handler. So leave it like this for now. Signed-off-by: Johannes Berg Co-developed-by: Benjamin Berg Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250210160926.420133-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/Kconfig | 1 + arch/um/include/asm/processor-generic.h | 2 ++ arch/um/include/asm/uaccess.h | 20 ++++++++++++----- arch/um/include/shared/arch.h | 2 ++ arch/um/include/shared/as-layout.h | 2 +- arch/um/include/shared/irq_user.h | 3 ++- arch/um/include/shared/kern_util.h | 12 ++++++---- arch/um/kernel/irq.c | 3 ++- arch/um/kernel/mem.c | 10 +++++++++ arch/um/kernel/trap.c | 28 +++++++++++++++++++----- arch/um/os-Linux/signal.c | 4 ++-- arch/um/os-Linux/skas/process.c | 8 +++---- arch/x86/um/os-Linux/mcontext.c | 12 ++++++++++ arch/x86/um/shared/sysdep/faultinfo_32.h | 12 ++++++++++ arch/x86/um/shared/sysdep/faultinfo_64.h | 12 ++++++++++ 15 files changed, 108 insertions(+), 23 deletions(-) diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 18051b1cfce0..79509c7f39de 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -12,6 +12,7 @@ config UML select ARCH_HAS_KCOV select ARCH_HAS_STRNCPY_FROM_USER select ARCH_HAS_STRNLEN_USER + select ARCH_HAS_STRICT_KERNEL_RWX select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h index 5d6356eafffe..8a789c17acd8 100644 --- a/arch/um/include/asm/processor-generic.h +++ b/arch/um/include/asm/processor-generic.h @@ -31,6 +31,8 @@ struct thread_struct { } thread; } request; + void *segv_continue; + /* Contains variable sized FP registers */ struct pt_regs regs; }; diff --git a/arch/um/include/asm/uaccess.h b/arch/um/include/asm/uaccess.h index 1d4b6bbc1b65..3a08f9029a3f 100644 --- a/arch/um/include/asm/uaccess.h +++ b/arch/um/include/asm/uaccess.h @@ -9,6 +9,7 @@ #include #include +#include #define __under_task_size(addr, size) \ (((unsigned long) (addr) < TASK_SIZE) && \ @@ -44,19 +45,28 @@ static inline int __access_ok(const void __user *ptr, unsigned long size) __access_ok_vsyscall(addr, size)); } -/* no pagefaults for kernel addresses in um */ #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ - *((type *)dst) = get_unaligned((type *)(src)); \ - if (0) /* make sure the label looks used to the compiler */ \ + int __faulted; \ + \ + ___backtrack_faulted(__faulted); \ + if (__faulted) { \ + *((type *)dst) = (type) 0; \ goto err_label; \ + } \ + *((type *)dst) = get_unaligned((type *)(src)); \ + current->thread.segv_continue = NULL; \ } while (0) #define __put_kernel_nofault(dst, src, type, err_label) \ do { \ - put_unaligned(*((type *)src), (type *)(dst)); \ - if (0) /* make sure the label looks used to the compiler */ \ + int __faulted; \ + \ + ___backtrack_faulted(__faulted); \ + if (__faulted) \ goto err_label; \ + put_unaligned(*((type *)src), (type *)(dst)); \ + current->thread.segv_continue = NULL; \ } while (0) #endif diff --git a/arch/um/include/shared/arch.h b/arch/um/include/shared/arch.h index 880ee42a3329..cc398a21ad96 100644 --- a/arch/um/include/shared/arch.h +++ b/arch/um/include/shared/arch.h @@ -12,4 +12,6 @@ extern void arch_check_bugs(void); extern int arch_fixup(unsigned long address, struct uml_pt_regs *regs); extern void arch_examine_signal(int sig, struct uml_pt_regs *regs); +void mc_set_rip(void *_mc, void *target); + #endif diff --git a/arch/um/include/shared/as-layout.h b/arch/um/include/shared/as-layout.h index ea65f151bf48..4f44dcce8a7c 100644 --- a/arch/um/include/shared/as-layout.h +++ b/arch/um/include/shared/as-layout.h @@ -50,7 +50,7 @@ extern int linux_main(int argc, char **argv, char **envp); extern void uml_finishsetup(void); struct siginfo; -extern void (*sig_info[])(int, struct siginfo *si, struct uml_pt_regs *); +extern void (*sig_info[])(int, struct siginfo *si, struct uml_pt_regs *, void *); #endif diff --git a/arch/um/include/shared/irq_user.h b/arch/um/include/shared/irq_user.h index da0f6eea30d0..88835b52ae2b 100644 --- a/arch/um/include/shared/irq_user.h +++ b/arch/um/include/shared/irq_user.h @@ -15,7 +15,8 @@ enum um_irq_type { }; struct siginfo; -extern void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); +extern void sigio_handler(int sig, struct siginfo *unused_si, + struct uml_pt_regs *regs, void *mc); void sigio_run_timetravel_handlers(void); extern void free_irq_by_fd(int fd); extern void deactivate_fd(int fd, int irqnum); diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index f21dc8517538..00ca3e12fd9a 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -24,10 +24,12 @@ extern void free_stack(unsigned long stack, int order); struct pt_regs; extern void do_signal(struct pt_regs *regs); extern void interrupt_end(void); -extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs); +extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc); extern unsigned long segv(struct faultinfo fi, unsigned long ip, - int is_user, struct uml_pt_regs *regs); + int is_user, struct uml_pt_regs *regs, + void *mc); extern int handle_page_fault(unsigned long address, unsigned long ip, int is_write, int is_user, int *code_out); @@ -59,8 +61,10 @@ extern unsigned long from_irq_stack(int nested); extern int singlestepping(void); -extern void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); -extern void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs); +extern void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc); +extern void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc); extern void fatal_sigsegv(void) __attribute__ ((noreturn)); void um_idle_sleep(void); diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index a4991746f5ea..abe8f30a521c 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -236,7 +236,8 @@ static void _sigio_handler(struct uml_pt_regs *regs, free_irqs(); } -void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { preempt_disable(); _sigio_handler(regs, irqs_suspended); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index befed230aac2..05ffceb555b4 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include #include @@ -241,3 +243,11 @@ static const pgprot_t protection_map[16] = { [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED }; DECLARE_VM_GET_PAGE_PROT + +void mark_rodata_ro(void) +{ + unsigned long rodata_start = PFN_ALIGN(__start_rodata); + unsigned long rodata_end = PFN_ALIGN(__end_rodata); + + os_protect_memory((void *)rodata_start, rodata_end - rodata_start, 1, 0, 0); +} diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index cdaee3e94273..ce073150dc20 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -16,6 +16,7 @@ #include #include #include +#include /* * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by @@ -175,12 +176,14 @@ void fatal_sigsegv(void) * @sig: the signal number * @unused_si: the signal info struct; unused in this handler * @regs: the ptrace register information + * @mc: the mcontext of the signal * * The handler first extracts the faultinfo from the UML ptrace regs struct. * If the userfault did not happen in an UML userspace process, bad_segv is called. * Otherwise the signal did happen in a cloned userspace process, handle it. */ -void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { struct faultinfo * fi = UPT_FAULTINFO(regs); @@ -189,7 +192,7 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) bad_segv(*fi, UPT_IP(regs)); return; } - segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs); + segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc); } /* @@ -199,7 +202,7 @@ void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) * give us bad data! */ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, - struct uml_pt_regs *regs) + struct uml_pt_regs *regs, void *mc) { int si_code; int err; @@ -223,6 +226,19 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user, goto out; } else if (current->mm == NULL) { + if (current->pagefault_disabled) { + if (!mc) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault with pagefaults disabled but no mcontext"); + } + if (!current->thread.segv_continue) { + show_regs(container_of(regs, struct pt_regs, regs)); + panic("Segfault without recovery target"); + } + mc_set_rip(mc, current->thread.segv_continue); + current->thread.segv_continue = NULL; + goto out; + } show_regs(container_of(regs, struct pt_regs, regs)); panic("Segfault with no mm"); } @@ -274,7 +290,8 @@ out: return 0; } -void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) +void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs, + void *mc) { int code, err; if (!UPT_IS_USER(regs)) { @@ -302,7 +319,8 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) } } -void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs) +void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs, + void *mc) { do_IRQ(WINCH_IRQ, regs); } diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c index 9ea7269ffb77..e71e5b4878d1 100644 --- a/arch/um/os-Linux/signal.c +++ b/arch/um/os-Linux/signal.c @@ -21,7 +21,7 @@ #include #include -void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *) = { +void (*sig_info[NSIG])(int, struct siginfo *, struct uml_pt_regs *, void *mc) = { [SIGTRAP] = relay_signal, [SIGFPE] = relay_signal, [SIGILL] = relay_signal, @@ -47,7 +47,7 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc) if ((sig != SIGIO) && (sig != SIGWINCH)) unblock_signals_trace(); - (*sig_info[sig])(sig, si, &r); + (*sig_info[sig])(sig, si, &r, mc); errno = save_errno; } diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c index e2f8f156402f..ae2aea062f06 100644 --- a/arch/um/os-Linux/skas/process.c +++ b/arch/um/os-Linux/skas/process.c @@ -166,7 +166,7 @@ static void get_skas_faultinfo(int pid, struct faultinfo *fi) static void handle_segv(int pid, struct uml_pt_regs *regs) { get_skas_faultinfo(pid, ®s->faultinfo); - segv(regs->faultinfo, 0, 1, NULL); + segv(regs->faultinfo, 0, 1, NULL, NULL); } static void handle_trap(int pid, struct uml_pt_regs *regs) @@ -525,7 +525,7 @@ void userspace(struct uml_pt_regs *regs) get_skas_faultinfo(pid, ®s->faultinfo); (*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si, - regs); + regs, NULL); } else handle_segv(pid, regs); break; @@ -533,7 +533,7 @@ void userspace(struct uml_pt_regs *regs) handle_trap(pid, regs); break; case SIGTRAP: - relay_signal(SIGTRAP, (struct siginfo *)&si, regs); + relay_signal(SIGTRAP, (struct siginfo *)&si, regs, NULL); break; case SIGALRM: break; @@ -543,7 +543,7 @@ void userspace(struct uml_pt_regs *regs) case SIGFPE: case SIGWINCH: block_signals_trace(); - (*sig_info[sig])(sig, (struct siginfo *)&si, regs); + (*sig_info[sig])(sig, (struct siginfo *)&si, regs, NULL); unblock_signals_trace(); break; default: diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c index e80ab7d28117..d2f3a595b4ef 100644 --- a/arch/x86/um/os-Linux/mcontext.c +++ b/arch/x86/um/os-Linux/mcontext.c @@ -4,6 +4,7 @@ #include #include #include +#include void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) { @@ -31,3 +32,14 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) regs->gp[CS / sizeof(unsigned long)] |= 3; #endif } + +void mc_set_rip(void *_mc, void *target) +{ + mcontext_t *mc = _mc; + +#ifdef __i386__ + mc->gregs[REG_EIP] = (unsigned long)target; +#else + mc->gregs[REG_RIP] = (unsigned long)target; +#endif +} diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h index b6f2437ec29c..ab5c8e47049c 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_32.h +++ b/arch/x86/um/shared/sysdep/faultinfo_32.h @@ -29,4 +29,16 @@ struct faultinfo { #define PTRACE_FULL_FAULTINFO 0 +#define ___backtrack_faulted(_faulted) \ + asm volatile ( \ + "mov $0, %0\n" \ + "movl $__get_kernel_nofault_faulted_%=,%1\n" \ + "jmp _end_%=\n" \ + "__get_kernel_nofault_faulted_%=:\n" \ + "mov $1, %0;" \ + "_end_%=:" \ + : "=r" (_faulted), \ + "=m" (current->thread.segv_continue) :: \ + ) + #endif diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h index ee88f88974ea..26fb4835d3e9 100644 --- a/arch/x86/um/shared/sysdep/faultinfo_64.h +++ b/arch/x86/um/shared/sysdep/faultinfo_64.h @@ -29,4 +29,16 @@ struct faultinfo { #define PTRACE_FULL_FAULTINFO 1 +#define ___backtrack_faulted(_faulted) \ + asm volatile ( \ + "mov $0, %0\n" \ + "movq $__get_kernel_nofault_faulted_%=,%1\n" \ + "jmp _end_%=\n" \ + "__get_kernel_nofault_faulted_%=:\n" \ + "mov $1, %0;" \ + "_end_%=:" \ + : "=r" (_faulted), \ + "=m" (current->thread.segv_continue) :: \ + ) + #endif -- 2.51.0 From 84a6fc378471fbeaf48f8604566a5a33a3d63c18 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 10 Feb 2025 17:09:26 +0100 Subject: [PATCH 05/16] um: remove copy_from_kernel_nofault_allowed There is no need to override the default version of this function anymore as UML now has proper _nofault memory access functions. Doing this also fixes the fact that the implementation was incorrect as using mincore() will incorrectly flag pages as inaccessible if they were swapped out by the host. Fixes: f75b1b1bedfb ("um: Implement probe_kernel_read()") Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250210160926.420133-3-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 1 - arch/um/kernel/Makefile | 2 +- arch/um/kernel/maccess.c | 19 -------------- arch/um/os-Linux/process.c | 51 ------------------------------------- 4 files changed, 1 insertion(+), 72 deletions(-) delete mode 100644 arch/um/kernel/maccess.c diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index 5babad8c5f75..bc02767f0639 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -213,7 +213,6 @@ extern int os_protect_memory(void *addr, unsigned long len, extern int os_unmap_memory(void *addr, int len); extern int os_drop_memory(void *addr, int length); extern int can_drop_memory(void); -extern int os_mincore(void *addr, unsigned long len); void os_set_pdeathsig(void); diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index f8567b933ffa..4df1cd0d2017 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -17,7 +17,7 @@ extra-y := vmlinux.lds obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \ physmem.o process.o ptrace.o reboot.o sigio.o \ signal.o sysrq.o time.o tlb.o trap.o \ - um_arch.o umid.o maccess.o kmsg_dump.o capflags.o skas/ + um_arch.o umid.o kmsg_dump.o capflags.o skas/ obj-y += load_file.o obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o diff --git a/arch/um/kernel/maccess.c b/arch/um/kernel/maccess.c deleted file mode 100644 index 8ccd56813f68..000000000000 --- a/arch/um/kernel/maccess.c +++ /dev/null @@ -1,19 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 2013 Richard Weinberger - */ - -#include -#include -#include - -bool copy_from_kernel_nofault_allowed(const void *src, size_t size) -{ - void *psrc = (void *)rounddown((unsigned long)src, PAGE_SIZE); - - if ((unsigned long)src < PAGE_SIZE || size <= 0) - return false; - if (os_mincore(psrc, size + src - psrc) <= 0) - return false; - return true; -} diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index 9f086f939420..184566edeee9 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -142,57 +142,6 @@ out: return ok; } -static int os_page_mincore(void *addr) -{ - char vec[2]; - int ret; - - ret = mincore(addr, UM_KERN_PAGE_SIZE, vec); - if (ret < 0) { - if (errno == ENOMEM || errno == EINVAL) - return 0; - else - return -errno; - } - - return vec[0] & 1; -} - -int os_mincore(void *addr, unsigned long len) -{ - char *vec; - int ret, i; - - if (len <= UM_KERN_PAGE_SIZE) - return os_page_mincore(addr); - - vec = calloc(1, (len + UM_KERN_PAGE_SIZE - 1) / UM_KERN_PAGE_SIZE); - if (!vec) - return -ENOMEM; - - ret = mincore(addr, UM_KERN_PAGE_SIZE, vec); - if (ret < 0) { - if (errno == ENOMEM || errno == EINVAL) - ret = 0; - else - ret = -errno; - - goto out; - } - - for (i = 0; i < ((len + UM_KERN_PAGE_SIZE - 1) / UM_KERN_PAGE_SIZE); i++) { - if (!(vec[i] & 1)) { - ret = 0; - goto out; - } - } - - ret = 1; -out: - free(vec); - return ret; -} - void init_new_thread_signals(void) { set_handler(SIGSEGV); -- 2.51.0 From 1fc350eed627762f4f6db3f35776d481e7f02c5c Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Wed, 12 Feb 2025 12:57:56 +0800 Subject: [PATCH 06/16] um: Allocate vdso page pointer statically Instead of dynamically allocating the pointer to the vdso page during boot, we can just allocate it statically. Doing so will reduce error handling and make the code slightly more readable. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250212045756.164977-1-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/x86/um/vdso/vma.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index f238f7b33cdd..dc8dfb2abd80 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -12,33 +12,22 @@ static unsigned int __read_mostly vdso_enabled = 1; unsigned long um_vdso_addr; +static struct page *um_vdso; extern unsigned long task_size; extern char vdso_start[], vdso_end[]; -static struct page **vdsop; - static int __init init_vdso(void) { - struct page *um_vdso; - BUG_ON(vdso_end - vdso_start > PAGE_SIZE); um_vdso_addr = task_size - PAGE_SIZE; - vdsop = kmalloc(sizeof(struct page *), GFP_KERNEL); - if (!vdsop) - goto oom; - um_vdso = alloc_page(GFP_KERNEL); - if (!um_vdso) { - kfree(vdsop); - + if (!um_vdso) goto oom; - } copy_page(page_address(um_vdso), vdso_start); - *vdsop = um_vdso; return 0; @@ -56,6 +45,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) struct mm_struct *mm = current->mm; static struct vm_special_mapping vdso_mapping = { .name = "[vdso]", + .pages = &um_vdso, }; if (!vdso_enabled) @@ -64,7 +54,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (mmap_write_lock_killable(mm)) return -EINTR; - vdso_mapping.pages = vdsop; vma = _install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, -- 2.51.0 From 0bc754d1e31f40f4a343b692096d9e092ccc0370 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Fri, 14 Feb 2025 10:28:22 +0100 Subject: [PATCH 07/16] um: hostfs: avoid issues on inode number reuse by host MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Some file systems (e.g. ext4) may reuse inode numbers once the inode is not in use anymore. Usually hostfs will keep an FD open for each inode, but this is not always the case. In the case of sockets, this cannot even be done properly. As such, the following sequence of events was possible: * application creates and deletes a socket * hostfs creates/deletes the socket on the host * inode is still in the hostfs cache * hostfs creates a new file * ext4 on the outside reuses the inode number * hostfs finds the socket inode for the newly created file * application receives -ENXIO when opening the file As mentioned, this can only happen if the deleted file is a special file that is never opened on the host (i.e. no .open fop). As such, to prevent issues, it is sufficient to check that the inode has the expected type. That said, also add a check for the inode birth time, just to be on the safe side. Fixes: 74ce793bcbde ("hostfs: Fix ephemeral inodes") Signed-off-by: Benjamin Berg Reviewed-by: Mickaël Salaün Tested-by: Mickaël Salaün Link: https://patch.msgid.link/20250214092822.1241575-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- fs/hostfs/hostfs.h | 2 +- fs/hostfs/hostfs_kern.c | 7 ++++- fs/hostfs/hostfs_user.c | 59 ++++++++++++++++++++++++----------------- 3 files changed, 41 insertions(+), 27 deletions(-) diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 8b39c15c408c..15b2f094d36e 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -60,7 +60,7 @@ struct hostfs_stat { unsigned int uid; unsigned int gid; unsigned long long size; - struct hostfs_timespec atime, mtime, ctime; + struct hostfs_timespec atime, mtime, ctime, btime; unsigned int blksize; unsigned long long blocks; struct { diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e0741e468956..e6e247235728 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -33,6 +33,7 @@ struct hostfs_inode_info { struct inode vfs_inode; struct mutex open_mutex; dev_t dev; + struct hostfs_timespec btime; }; static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) @@ -547,6 +548,7 @@ static int hostfs_inode_set(struct inode *ino, void *data) } HOSTFS_I(ino)->dev = dev; + HOSTFS_I(ino)->btime = st->btime; ino->i_ino = st->ino; ino->i_mode = st->mode; return hostfs_inode_update(ino, st); @@ -557,7 +559,10 @@ static int hostfs_inode_test(struct inode *inode, void *data) const struct hostfs_stat *st = data; dev_t dev = MKDEV(st->dev.maj, st->dev.min); - return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev; + return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev && + (inode->i_mode & S_IFMT) == (st->mode & S_IFMT) && + HOSTFS_I(inode)->btime.tv_sec == st->btime.tv_sec && + HOSTFS_I(inode)->btime.tv_nsec == st->btime.tv_nsec; } static struct inode *hostfs_iget(struct super_block *sb, char *name) diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 97e9c40a9448..3bcd9f35e70b 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -18,39 +18,48 @@ #include "hostfs.h" #include -static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) +static void statx_to_hostfs(const struct statx *buf, struct hostfs_stat *p) { - p->ino = buf->st_ino; - p->mode = buf->st_mode; - p->nlink = buf->st_nlink; - p->uid = buf->st_uid; - p->gid = buf->st_gid; - p->size = buf->st_size; - p->atime.tv_sec = buf->st_atime; - p->atime.tv_nsec = 0; - p->ctime.tv_sec = buf->st_ctime; - p->ctime.tv_nsec = 0; - p->mtime.tv_sec = buf->st_mtime; - p->mtime.tv_nsec = 0; - p->blksize = buf->st_blksize; - p->blocks = buf->st_blocks; - p->rdev.maj = os_major(buf->st_rdev); - p->rdev.min = os_minor(buf->st_rdev); - p->dev.maj = os_major(buf->st_dev); - p->dev.min = os_minor(buf->st_dev); + p->ino = buf->stx_ino; + p->mode = buf->stx_mode; + p->nlink = buf->stx_nlink; + p->uid = buf->stx_uid; + p->gid = buf->stx_gid; + p->size = buf->stx_size; + p->atime.tv_sec = buf->stx_atime.tv_sec; + p->atime.tv_nsec = buf->stx_atime.tv_nsec; + p->ctime.tv_sec = buf->stx_ctime.tv_sec; + p->ctime.tv_nsec = buf->stx_ctime.tv_nsec; + p->mtime.tv_sec = buf->stx_mtime.tv_sec; + p->mtime.tv_nsec = buf->stx_mtime.tv_nsec; + if (buf->stx_mask & STATX_BTIME) { + p->btime.tv_sec = buf->stx_btime.tv_sec; + p->btime.tv_nsec = buf->stx_btime.tv_nsec; + } else { + memset(&p->btime, 0, sizeof(p->btime)); + } + p->blksize = buf->stx_blksize; + p->blocks = buf->stx_blocks; + p->rdev.maj = buf->stx_rdev_major; + p->rdev.min = buf->stx_rdev_minor; + p->dev.maj = buf->stx_dev_major; + p->dev.min = buf->stx_dev_minor; } int stat_file(const char *path, struct hostfs_stat *p, int fd) { - struct stat64 buf; + struct statx buf; + int flags = AT_SYMLINK_NOFOLLOW; if (fd >= 0) { - if (fstat64(fd, &buf) < 0) - return -errno; - } else if (lstat64(path, &buf) < 0) { - return -errno; + flags |= AT_EMPTY_PATH; + path = ""; } - stat64_to_hostfs(&buf, p); + + if ((statx(fd, path, flags, STATX_BASIC_STATS | STATX_BTIME, &buf)) < 0) + return -errno; + + statx_to_hostfs(&buf, p); return 0; } -- 2.51.0 From f664a1399633c63706f763d4bb58c96110355330 Mon Sep 17 00:00:00 2001 From: Ethan Carter Edwards Date: Thu, 20 Feb 2025 17:39:40 -0500 Subject: [PATCH 08/16] um: use str_yes_no() to remove hardcoded "yes" and "no" Remove hard-coded strings by using the str_yes_no() helper function provided by . Signed-off-by: Ethan Carter Edwards Link: https://patch.msgid.link/20250220-um_yes_no-v1-1-2a355ed2d225@ethancedwards.com Signed-off-by: Johannes Berg --- arch/um/kernel/um_arch.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 79ea97d4797e..7f050783885a 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -78,7 +79,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "model name\t: UML\n"); seq_printf(m, "mode\t\t: skas\n"); seq_printf(m, "host\t\t: %s\n", host_info); - seq_printf(m, "fpu\t\t: %s\n", cpu_has(&boot_cpu_data, X86_FEATURE_FPU) ? "yes" : "no"); + seq_printf(m, "fpu\t\t: %s\n", str_yes_no(cpu_has(&boot_cpu_data, X86_FEATURE_FPU))); seq_printf(m, "flags\t\t:"); for (i = 0; i < 32*NCAPINTS; i++) if (cpu_has(&boot_cpu_data, i) && (x86_cap_flags[i] != NULL)) -- 2.51.0 From e82cf3051e6193f61e03898f8dba035199064d36 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 21 Feb 2025 12:18:55 +0800 Subject: [PATCH 09/16] um: Update min_low_pfn to match changes in uml_reserved When uml_reserved is updated, min_low_pfn must also be updated accordingly. Otherwise, min_low_pfn will not accurately reflect the lowest available PFN. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250221041855.1156109-1-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/kernel/mem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 05ffceb555b4..61b5a5ede01c 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -68,6 +68,7 @@ void __init mem_init(void) map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); memblock_free((void *)brk_end, uml_reserved - brk_end); uml_reserved = brk_end; + min_low_pfn = PFN_UP(__pa(uml_reserved)); /* this will put all low memory onto the freelists */ memblock_free_all(); -- 2.51.0 From 089db01ea7eb4f366be45b9390a04f1c601c0071 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 28 Feb 2025 10:00:08 +0100 Subject: [PATCH 10/16] um/locking: Remove semicolon from "lock" prefix Minimum version of binutils required to compile the kernel is 2.25. This version correctly handles the "lock" prefix, so it is possible to remove the semicolon, which was used to support ancient versions of GNU as. Due to the semicolon, the compiler considers "lock; insn" as two separate instructions. Removing the semicolon makes asm length calculations more accurate, consequently making scheduling and inlining decisions of the compiler more accurate. Removing the semicolon also enables assembler checks involving lock prefix. Trying to assemble e.g. "lock andl %eax, %ebx" results in: Error: expecting lockable instruction after `lock' Signed-off-by: Uros Bizjak Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Link: https://patch.msgid.link/20250228090058.2499163-1-ubizjak@gmail.com Signed-off-by: Johannes Berg --- arch/x86/um/asm/barrier.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 4da336965698..b51aefd6ec2b 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -12,9 +12,9 @@ */ #ifdef CONFIG_X86_32 -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#define mb() alternative("lock addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) #else /* CONFIG_X86_32 */ -- 2.51.0 From 887c5c12e80c8424bd471122d2e8b6b462e12874 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Fri, 14 Mar 2025 14:08:15 +0100 Subject: [PATCH 11/16] um: work around sched_yield not yielding in time-travel mode sched_yield by a userspace may not actually cause scheduling in time-travel mode as no time has passed. In the case seen it appears to be a badly implemented userspace spinlock in ASAN. Unfortunately, with time-travel it causes an extreme slowdown or even deadlock depending on the kernel configuration (CONFIG_UML_MAX_USERSPACE_ITERATIONS). Work around it by accounting time to the process whenever it executes a sched_yield syscall. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250314130815.226872-1-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/um/include/linux/time-internal.h | 2 ++ arch/um/kernel/skas/syscall.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h index b22226634ff6..138908b999d7 100644 --- a/arch/um/include/linux/time-internal.h +++ b/arch/um/include/linux/time-internal.h @@ -83,6 +83,8 @@ extern void time_travel_not_configured(void); #define time_travel_del_event(...) time_travel_not_configured() #endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */ +extern unsigned long tt_extra_sched_jiffies; + /* * Without CONFIG_UML_TIME_TRAVEL_SUPPORT this is a linker error if used, * which is intentional since we really shouldn't link it in that case. diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c index b09e85279d2b..a5beaea2967e 100644 --- a/arch/um/kernel/skas/syscall.c +++ b/arch/um/kernel/skas/syscall.c @@ -31,6 +31,17 @@ void handle_syscall(struct uml_pt_regs *r) goto out; syscall = UPT_SYSCALL_NR(r); + + /* + * If no time passes, then sched_yield may not actually yield, causing + * broken spinlock implementations in userspace (ASAN) to hang for long + * periods of time. + */ + if ((time_travel_mode == TT_MODE_INFCPU || + time_travel_mode == TT_MODE_EXTERNAL) && + syscall == __NR_sched_yield) + tt_extra_sched_jiffies += 1; + if (syscall >= 0 && syscall < __NR_syscalls) { unsigned long ret = EXECUTE_SYSCALL(syscall, regs); -- 2.51.0 From 24ffa71b0f15fe4827d3672d5918f97564b2fba1 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Sun, 16 Mar 2025 00:19:09 +0800 Subject: [PATCH 12/16] um: virt-pci: Refactor virtio_pcidev into its own module Decouple virt-pci and virtio_pcidev, refactoring virtio_pcidev into its own module. Define a set of APIs for virt-pci. This allows for future addition of more PCI emulation implementations. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250315161910.4082396-3-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/Kconfig | 12 +- arch/um/drivers/Makefile | 3 +- arch/um/drivers/virt-pci.c | 699 ++++++-------------------------- arch/um/drivers/virt-pci.h | 41 ++ arch/um/drivers/virtio_pcidev.c | 628 ++++++++++++++++++++++++++++ 5 files changed, 794 insertions(+), 589 deletions(-) create mode 100644 arch/um/drivers/virt-pci.h create mode 100644 arch/um/drivers/virtio_pcidev.c diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig index ede40a160c5e..9cb196070614 100644 --- a/arch/um/drivers/Kconfig +++ b/arch/um/drivers/Kconfig @@ -345,16 +345,20 @@ config UML_RTC by providing a fake RTC clock that causes a wakeup at the right time. -config UML_PCI_OVER_VIRTIO - bool "Enable PCI over VIRTIO device simulation" - # in theory, just VIRTIO is enough, but that causes recursion - depends on VIRTIO_UML +config UML_PCI + bool select FORCE_PCI select UML_IOMEM_EMULATION select UML_DMA_EMULATION select PCI_MSI select PCI_LOCKLESS_CONFIG +config UML_PCI_OVER_VIRTIO + bool "Enable PCI over VIRTIO device simulation" + # in theory, just VIRTIO is enough, but that causes recursion + depends on VIRTIO_UML + select UML_PCI + config UML_PCI_OVER_VIRTIO_DEVICE_ID int "set the virtio device ID for PCI emulation" default -1 diff --git a/arch/um/drivers/Makefile b/arch/um/drivers/Makefile index 57882e6bc215..0a5820343ad3 100644 --- a/arch/um/drivers/Makefile +++ b/arch/um/drivers/Makefile @@ -60,7 +60,8 @@ obj-$(CONFIG_BLK_DEV_COW_COMMON) += cow_user.o obj-$(CONFIG_UML_RANDOM) += random.o obj-$(CONFIG_VIRTIO_UML) += virtio_uml.o obj-$(CONFIG_UML_RTC) += rtc.o -obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virt-pci.o +obj-$(CONFIG_UML_PCI) += virt-pci.o +obj-$(CONFIG_UML_PCI_OVER_VIRTIO) += virtio_pcidev.o # pcap_user.o must be added explicitly. USER_OBJS := fd.o null.o pty.o tty.o xterm.o slip_common.o vde_user.o vector_user.o diff --git a/arch/um/drivers/virt-pci.c b/arch/um/drivers/virt-pci.c index dd5580f975cc..b83b5a765d4e 100644 --- a/arch/um/drivers/virt-pci.c +++ b/arch/um/drivers/virt-pci.c @@ -5,52 +5,19 @@ */ #include #include -#include -#include #include #include #include -#include -#include -#include #include #include #include +#include "virt-pci.h" + #define MAX_DEVICES 8 #define MAX_MSI_VECTORS 32 #define CFG_SPACE_SIZE 4096 -/* for MSI-X we have a 32-bit payload */ -#define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32)) -#define NUM_IRQ_MSGS 10 - -struct um_pci_message_buffer { - struct virtio_pcidev_msg hdr; - u8 data[8]; -}; - -struct um_pci_device { - struct virtio_device *vdev; - - /* for now just standard BARs */ - u8 resptr[PCI_STD_NUM_BARS]; - - struct virtqueue *cmd_vq, *irq_vq; - -#define UM_PCI_WRITE_BUFS 20 - struct um_pci_message_buffer bufs[UM_PCI_WRITE_BUFS + 1]; - void *extra_ptrs[UM_PCI_WRITE_BUFS + 1]; - DECLARE_BITMAP(used_bufs, UM_PCI_WRITE_BUFS); - -#define UM_PCI_STAT_WAITING 0 - unsigned long status; - - int irq; - - bool platform; -}; - struct um_pci_device_reg { struct um_pci_device *dev; void __iomem *iomem; @@ -65,179 +32,15 @@ static struct irq_domain *um_pci_inner_domain; static struct irq_domain *um_pci_msi_domain; static unsigned long um_pci_msi_used[BITS_TO_LONGS(MAX_MSI_VECTORS)]; -static unsigned int um_pci_max_delay_us = 40000; -module_param_named(max_delay_us, um_pci_max_delay_us, uint, 0644); - -static int um_pci_get_buf(struct um_pci_device *dev, bool *posted) -{ - int i; - - for (i = 0; i < UM_PCI_WRITE_BUFS; i++) { - if (!test_and_set_bit(i, dev->used_bufs)) - return i; - } - - *posted = false; - return UM_PCI_WRITE_BUFS; -} - -static void um_pci_free_buf(struct um_pci_device *dev, void *buf) -{ - int i; - - if (buf == &dev->bufs[UM_PCI_WRITE_BUFS]) { - kfree(dev->extra_ptrs[UM_PCI_WRITE_BUFS]); - dev->extra_ptrs[UM_PCI_WRITE_BUFS] = NULL; - return; - } - - for (i = 0; i < UM_PCI_WRITE_BUFS; i++) { - if (buf == &dev->bufs[i]) { - kfree(dev->extra_ptrs[i]); - dev->extra_ptrs[i] = NULL; - WARN_ON(!test_and_clear_bit(i, dev->used_bufs)); - return; - } - } - - WARN_ON(1); -} - -static int um_pci_send_cmd(struct um_pci_device *dev, - struct virtio_pcidev_msg *cmd, - unsigned int cmd_size, - const void *extra, unsigned int extra_size, - void *out, unsigned int out_size) -{ - struct scatterlist out_sg, extra_sg, in_sg; - struct scatterlist *sgs_list[] = { - [0] = &out_sg, - [1] = extra ? &extra_sg : &in_sg, - [2] = extra ? &in_sg : NULL, - }; - struct um_pci_message_buffer *buf; - int delay_count = 0; - bool bounce_out; - int ret, len; - int buf_idx; - bool posted; - - if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf))) - return -EINVAL; - - switch (cmd->op) { - case VIRTIO_PCIDEV_OP_CFG_WRITE: - case VIRTIO_PCIDEV_OP_MMIO_WRITE: - case VIRTIO_PCIDEV_OP_MMIO_MEMSET: - /* in PCI, writes are posted, so don't wait */ - posted = !out; - WARN_ON(!posted); - break; - default: - posted = false; - break; - } - - bounce_out = !posted && cmd_size <= sizeof(*cmd) && - out && out_size <= sizeof(buf->data); - - buf_idx = um_pci_get_buf(dev, &posted); - buf = &dev->bufs[buf_idx]; - memcpy(buf, cmd, cmd_size); - - if (posted && extra && extra_size > sizeof(buf) - cmd_size) { - dev->extra_ptrs[buf_idx] = kmemdup(extra, extra_size, - GFP_ATOMIC); - - if (!dev->extra_ptrs[buf_idx]) { - um_pci_free_buf(dev, buf); - return -ENOMEM; - } - extra = dev->extra_ptrs[buf_idx]; - } else if (extra && extra_size <= sizeof(buf) - cmd_size) { - memcpy((u8 *)buf + cmd_size, extra, extra_size); - cmd_size += extra_size; - extra_size = 0; - extra = NULL; - cmd = (void *)buf; - } else { - cmd = (void *)buf; - } - - sg_init_one(&out_sg, cmd, cmd_size); - if (extra) - sg_init_one(&extra_sg, extra, extra_size); - /* allow stack for small buffers */ - if (bounce_out) - sg_init_one(&in_sg, buf->data, out_size); - else if (out) - sg_init_one(&in_sg, out, out_size); - - /* add to internal virtio queue */ - ret = virtqueue_add_sgs(dev->cmd_vq, sgs_list, - extra ? 2 : 1, - out ? 1 : 0, - cmd, GFP_ATOMIC); - if (ret) { - um_pci_free_buf(dev, buf); - return ret; - } - - if (posted) { - virtqueue_kick(dev->cmd_vq); - return 0; - } - - /* kick and poll for getting a response on the queue */ - set_bit(UM_PCI_STAT_WAITING, &dev->status); - virtqueue_kick(dev->cmd_vq); - ret = 0; - - while (1) { - void *completed = virtqueue_get_buf(dev->cmd_vq, &len); - - if (completed == buf) - break; - - if (completed) - um_pci_free_buf(dev, completed); - - if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || - ++delay_count > um_pci_max_delay_us, - "um virt-pci delay: %d", delay_count)) { - ret = -EIO; - break; - } - udelay(1); - } - clear_bit(UM_PCI_STAT_WAITING, &dev->status); - - if (bounce_out) - memcpy(out, buf->data, out_size); - - um_pci_free_buf(dev, buf); - - return ret; -} - static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, int size) { struct um_pci_device_reg *reg = priv; struct um_pci_device *dev = reg->dev; - struct virtio_pcidev_msg hdr = { - .op = VIRTIO_PCIDEV_OP_CFG_READ, - .size = size, - .addr = offset, - }; - /* max 8, we might not use it all */ - u8 data[8]; if (!dev) return ULONG_MAX; - memset(data, 0xff, sizeof(data)); - switch (size) { case 1: case 2: @@ -251,23 +54,7 @@ static unsigned long um_pci_cfgspace_read(void *priv, unsigned int offset, return ULONG_MAX; } - if (um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, size)) - return ULONG_MAX; - - switch (size) { - case 1: - return data[0]; - case 2: - return le16_to_cpup((void *)data); - case 4: - return le32_to_cpup((void *)data); -#ifdef CONFIG_64BIT - case 8: - return le64_to_cpup((void *)data); -#endif - default: - return ULONG_MAX; - } + return dev->ops->cfgspace_read(dev, offset, size); } static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size, @@ -275,42 +62,24 @@ static void um_pci_cfgspace_write(void *priv, unsigned int offset, int size, { struct um_pci_device_reg *reg = priv; struct um_pci_device *dev = reg->dev; - struct { - struct virtio_pcidev_msg hdr; - /* maximum size - we may only use parts of it */ - u8 data[8]; - } msg = { - .hdr = { - .op = VIRTIO_PCIDEV_OP_CFG_WRITE, - .size = size, - .addr = offset, - }, - }; if (!dev) return; switch (size) { case 1: - msg.data[0] = (u8)val; - break; case 2: - put_unaligned_le16(val, (void *)msg.data); - break; case 4: - put_unaligned_le32(val, (void *)msg.data); - break; #ifdef CONFIG_64BIT case 8: - put_unaligned_le64(val, (void *)msg.data); - break; #endif + break; default: WARN(1, "invalid config space write size %d\n", size); return; } - WARN_ON(um_pci_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0)); + dev->ops->cfgspace_write(dev, offset, size, val); } static const struct logic_iomem_ops um_pci_device_cfgspace_ops = { @@ -318,30 +87,14 @@ static const struct logic_iomem_ops um_pci_device_cfgspace_ops = { .write = um_pci_cfgspace_write, }; -static void um_pci_bar_copy_from(void *priv, void *buffer, - unsigned int offset, int size) +static unsigned long um_pci_bar_read(void *priv, unsigned int offset, + int size) { u8 *resptr = priv; struct um_pci_device *dev = container_of(resptr - *resptr, struct um_pci_device, resptr[0]); - struct virtio_pcidev_msg hdr = { - .op = VIRTIO_PCIDEV_OP_MMIO_READ, - .bar = *resptr, - .size = size, - .addr = offset, - }; - - memset(buffer, 0xff, size); - - um_pci_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, buffer, size); -} - -static unsigned long um_pci_bar_read(void *priv, unsigned int offset, - int size) -{ - /* 8 is maximum size - we may only use parts of it */ - u8 data[8]; + u8 bar = *resptr; switch (size) { case 1: @@ -352,72 +105,60 @@ static unsigned long um_pci_bar_read(void *priv, unsigned int offset, #endif break; default: - WARN(1, "invalid config space read size %d\n", size); + WARN(1, "invalid bar read size %d\n", size); return ULONG_MAX; } - um_pci_bar_copy_from(priv, data, offset, size); + return dev->ops->bar_read(dev, bar, offset, size); +} + +static void um_pci_bar_write(void *priv, unsigned int offset, int size, + unsigned long val) +{ + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + u8 bar = *resptr; switch (size) { case 1: - return data[0]; case 2: - return le16_to_cpup((void *)data); case 4: - return le32_to_cpup((void *)data); #ifdef CONFIG_64BIT case 8: - return le64_to_cpup((void *)data); #endif + break; default: - return ULONG_MAX; + WARN(1, "invalid bar write size %d\n", size); + return; } + + dev->ops->bar_write(dev, bar, offset, size, val); } -static void um_pci_bar_copy_to(void *priv, unsigned int offset, - const void *buffer, int size) +static void um_pci_bar_copy_from(void *priv, void *buffer, + unsigned int offset, int size) { u8 *resptr = priv; struct um_pci_device *dev = container_of(resptr - *resptr, struct um_pci_device, resptr[0]); - struct virtio_pcidev_msg hdr = { - .op = VIRTIO_PCIDEV_OP_MMIO_WRITE, - .bar = *resptr, - .size = size, - .addr = offset, - }; + u8 bar = *resptr; - um_pci_send_cmd(dev, &hdr, sizeof(hdr), buffer, size, NULL, 0); + dev->ops->bar_copy_from(dev, bar, buffer, offset, size); } -static void um_pci_bar_write(void *priv, unsigned int offset, int size, - unsigned long val) +static void um_pci_bar_copy_to(void *priv, unsigned int offset, + const void *buffer, int size) { - /* maximum size - we may only use parts of it */ - u8 data[8]; - - switch (size) { - case 1: - data[0] = (u8)val; - break; - case 2: - put_unaligned_le16(val, (void *)data); - break; - case 4: - put_unaligned_le32(val, (void *)data); - break; -#ifdef CONFIG_64BIT - case 8: - put_unaligned_le64(val, (void *)data); - break; -#endif - default: - WARN(1, "invalid config space write size %d\n", size); - return; - } + u8 *resptr = priv; + struct um_pci_device *dev = container_of(resptr - *resptr, + struct um_pci_device, + resptr[0]); + u8 bar = *resptr; - um_pci_bar_copy_to(priv, offset, data, size); + dev->ops->bar_copy_to(dev, bar, offset, buffer, size); } static void um_pci_bar_set(void *priv, unsigned int offset, u8 value, int size) @@ -426,20 +167,9 @@ static void um_pci_bar_set(void *priv, unsigned int offset, u8 value, int size) struct um_pci_device *dev = container_of(resptr - *resptr, struct um_pci_device, resptr[0]); - struct { - struct virtio_pcidev_msg hdr; - u8 data; - } msg = { - .hdr = { - .op = VIRTIO_PCIDEV_OP_CFG_WRITE, - .bar = *resptr, - .size = size, - .addr = offset, - }, - .data = value, - }; + u8 bar = *resptr; - um_pci_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0); + dev->ops->bar_set(dev, bar, offset, value, size); } static const struct logic_iomem_ops um_pci_device_bar_ops = { @@ -486,76 +216,6 @@ static void um_pci_rescan(void) pci_unlock_rescan_remove(); } -static void um_pci_irq_vq_addbuf(struct virtqueue *vq, void *buf, bool kick) -{ - struct scatterlist sg[1]; - - sg_init_one(sg, buf, MAX_IRQ_MSG_SIZE); - if (virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC)) - kfree(buf); - else if (kick) - virtqueue_kick(vq); -} - -static void um_pci_handle_irq_message(struct virtqueue *vq, - struct virtio_pcidev_msg *msg) -{ - struct virtio_device *vdev = vq->vdev; - struct um_pci_device *dev = vdev->priv; - - if (!dev->irq) - return; - - /* we should properly chain interrupts, but on ARCH=um we don't care */ - - switch (msg->op) { - case VIRTIO_PCIDEV_OP_INT: - generic_handle_irq(dev->irq); - break; - case VIRTIO_PCIDEV_OP_MSI: - /* our MSI message is just the interrupt number */ - if (msg->size == sizeof(u32)) - generic_handle_irq(le32_to_cpup((void *)msg->data)); - else - generic_handle_irq(le16_to_cpup((void *)msg->data)); - break; - case VIRTIO_PCIDEV_OP_PME: - /* nothing to do - we already woke up due to the message */ - break; - default: - dev_err(&vdev->dev, "unexpected virt-pci message %d\n", msg->op); - break; - } -} - -static void um_pci_cmd_vq_cb(struct virtqueue *vq) -{ - struct virtio_device *vdev = vq->vdev; - struct um_pci_device *dev = vdev->priv; - void *cmd; - int len; - - if (test_bit(UM_PCI_STAT_WAITING, &dev->status)) - return; - - while ((cmd = virtqueue_get_buf(vq, &len))) - um_pci_free_buf(dev, cmd); -} - -static void um_pci_irq_vq_cb(struct virtqueue *vq) -{ - struct virtio_pcidev_msg *msg; - int len; - - while ((msg = virtqueue_get_buf(vq, &len))) { - if (len >= sizeof(*msg)) - um_pci_handle_irq_message(vq, msg); - - /* recycle the message buffer */ - um_pci_irq_vq_addbuf(vq, msg, true); - } -} - #ifdef CONFIG_OF /* Copied from arch/x86/kernel/devicetree.c */ struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) @@ -577,200 +237,6 @@ struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) } #endif -static int um_pci_init_vqs(struct um_pci_device *dev) -{ - struct virtqueue_info vqs_info[] = { - { "cmd", um_pci_cmd_vq_cb }, - { "irq", um_pci_irq_vq_cb }, - }; - struct virtqueue *vqs[2]; - int err, i; - - err = virtio_find_vqs(dev->vdev, 2, vqs, vqs_info, NULL); - if (err) - return err; - - dev->cmd_vq = vqs[0]; - dev->irq_vq = vqs[1]; - - virtio_device_ready(dev->vdev); - - for (i = 0; i < NUM_IRQ_MSGS; i++) { - void *msg = kzalloc(MAX_IRQ_MSG_SIZE, GFP_KERNEL); - - if (msg) - um_pci_irq_vq_addbuf(dev->irq_vq, msg, false); - } - - virtqueue_kick(dev->irq_vq); - - return 0; -} - -static void __um_pci_virtio_platform_remove(struct virtio_device *vdev, - struct um_pci_device *dev) -{ - virtio_reset_device(vdev); - vdev->config->del_vqs(vdev); - - mutex_lock(&um_pci_mtx); - um_pci_platform_device = NULL; - mutex_unlock(&um_pci_mtx); - - kfree(dev); -} - -static int um_pci_virtio_platform_probe(struct virtio_device *vdev, - struct um_pci_device *dev) -{ - int ret; - - dev->platform = true; - - mutex_lock(&um_pci_mtx); - - if (um_pci_platform_device) { - mutex_unlock(&um_pci_mtx); - ret = -EBUSY; - goto out_free; - } - - ret = um_pci_init_vqs(dev); - if (ret) { - mutex_unlock(&um_pci_mtx); - goto out_free; - } - - um_pci_platform_device = dev; - - mutex_unlock(&um_pci_mtx); - - ret = of_platform_default_populate(vdev->dev.of_node, NULL, &vdev->dev); - if (ret) - __um_pci_virtio_platform_remove(vdev, dev); - - return ret; - -out_free: - kfree(dev); - return ret; -} - -static int um_pci_virtio_probe(struct virtio_device *vdev) -{ - struct um_pci_device *dev; - int i, free = -1; - int err = -ENOSPC; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - - dev->vdev = vdev; - vdev->priv = dev; - - if (of_device_is_compatible(vdev->dev.of_node, "simple-bus")) - return um_pci_virtio_platform_probe(vdev, dev); - - mutex_lock(&um_pci_mtx); - for (i = 0; i < MAX_DEVICES; i++) { - if (um_pci_devices[i].dev) - continue; - free = i; - break; - } - - if (free < 0) - goto error; - - err = um_pci_init_vqs(dev); - if (err) - goto error; - - dev->irq = irq_alloc_desc(numa_node_id()); - if (dev->irq < 0) { - err = dev->irq; - goto err_reset; - } - um_pci_devices[free].dev = dev; - vdev->priv = dev; - - mutex_unlock(&um_pci_mtx); - - device_set_wakeup_enable(&vdev->dev, true); - - /* - * In order to do suspend-resume properly, don't allow VQs - * to be suspended. - */ - virtio_uml_set_no_vq_suspend(vdev, true); - - um_pci_rescan(); - return 0; -err_reset: - virtio_reset_device(vdev); - vdev->config->del_vqs(vdev); -error: - mutex_unlock(&um_pci_mtx); - kfree(dev); - return err; -} - -static void um_pci_virtio_remove(struct virtio_device *vdev) -{ - struct um_pci_device *dev = vdev->priv; - int i; - - if (dev->platform) { - of_platform_depopulate(&vdev->dev); - __um_pci_virtio_platform_remove(vdev, dev); - return; - } - - device_set_wakeup_enable(&vdev->dev, false); - - mutex_lock(&um_pci_mtx); - for (i = 0; i < MAX_DEVICES; i++) { - if (um_pci_devices[i].dev != dev) - continue; - - um_pci_devices[i].dev = NULL; - irq_free_desc(dev->irq); - - break; - } - mutex_unlock(&um_pci_mtx); - - if (i < MAX_DEVICES) { - struct pci_dev *pci_dev; - - pci_dev = pci_get_slot(bridge->bus, i); - if (pci_dev) - pci_stop_and_remove_bus_device_locked(pci_dev); - } - - /* Stop all virtqueues */ - virtio_reset_device(vdev); - dev->cmd_vq = NULL; - dev->irq_vq = NULL; - vdev->config->del_vqs(vdev); - - kfree(dev); -} - -static struct virtio_device_id id_table[] = { - { CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID, VIRTIO_DEV_ANY_ID }, - { 0 }, -}; -MODULE_DEVICE_TABLE(virtio, id_table); - -static struct virtio_driver um_pci_virtio_driver = { - .driver.name = "virtio-pci", - .id_table = id_table, - .probe = um_pci_virtio_probe, - .remove = um_pci_virtio_remove, -}; - static struct resource virt_cfgspace_resource = { .name = "PCI config space", .start = 0xf0000000 - MAX_DEVICES * CFG_SPACE_SIZE, @@ -889,7 +355,7 @@ static void um_pci_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) } static struct irq_chip um_pci_msi_bottom_irq_chip = { - .name = "UM virtio MSI", + .name = "UM virtual MSI", .irq_compose_msi_msg = um_pci_compose_msi_msg, }; @@ -939,7 +405,7 @@ static const struct irq_domain_ops um_pci_inner_domain_ops = { }; static struct irq_chip um_pci_msi_irq_chip = { - .name = "UM virtio PCIe MSI", + .name = "UM virtual PCIe MSI", .irq_mask = pci_msi_mask_irq, .irq_unmask = pci_msi_unmask_irq, }; @@ -998,6 +464,78 @@ static struct resource virt_platform_resource = { .flags = IORESOURCE_MEM, }; +int um_pci_device_register(struct um_pci_device *dev) +{ + int i, free = -1; + int err = 0; + + mutex_lock(&um_pci_mtx); + for (i = 0; i < MAX_DEVICES; i++) { + if (um_pci_devices[i].dev) + continue; + free = i; + break; + } + + if (free < 0) { + err = -ENOSPC; + goto out; + } + + dev->irq = irq_alloc_desc(numa_node_id()); + if (dev->irq < 0) { + err = dev->irq; + goto out; + } + + um_pci_devices[free].dev = dev; + +out: + mutex_unlock(&um_pci_mtx); + if (!err) + um_pci_rescan(); + return err; +} + +void um_pci_device_unregister(struct um_pci_device *dev) +{ + int i; + + mutex_lock(&um_pci_mtx); + for (i = 0; i < MAX_DEVICES; i++) { + if (um_pci_devices[i].dev != dev) + continue; + um_pci_devices[i].dev = NULL; + irq_free_desc(dev->irq); + break; + } + mutex_unlock(&um_pci_mtx); + + if (i < MAX_DEVICES) { + struct pci_dev *pci_dev; + + pci_dev = pci_get_slot(bridge->bus, i); + if (pci_dev) + pci_stop_and_remove_bus_device_locked(pci_dev); + } +} + +int um_pci_platform_device_register(struct um_pci_device *dev) +{ + guard(mutex)(&um_pci_mtx); + if (um_pci_platform_device) + return -EBUSY; + um_pci_platform_device = dev; + return 0; +} + +void um_pci_platform_device_unregister(struct um_pci_device *dev) +{ + guard(mutex)(&um_pci_mtx); + if (um_pci_platform_device == dev) + um_pci_platform_device = NULL; +} + static int __init um_pci_init(void) { struct irq_domain_info inner_domain_info = { @@ -1014,10 +552,6 @@ static int __init um_pci_init(void) WARN_ON(logic_iomem_add_region(&virt_platform_resource, &um_pci_platform_ops)); - if (WARN(CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID < 0, - "No virtio device ID configured for PCI - no PCI support\n")) - return 0; - bridge = pci_alloc_host_bridge(0); if (!bridge) { err = -ENOMEM; @@ -1065,10 +599,8 @@ static int __init um_pci_init(void) if (err) goto free; - err = register_virtio_driver(&um_pci_virtio_driver); - if (err) - goto free; return 0; + free: if (!IS_ERR_OR_NULL(um_pci_inner_domain)) irq_domain_remove(um_pci_inner_domain); @@ -1080,11 +612,10 @@ free: } return err; } -module_init(um_pci_init); +device_initcall(um_pci_init); static void __exit um_pci_exit(void) { - unregister_virtio_driver(&um_pci_virtio_driver); irq_domain_remove(um_pci_msi_domain); irq_domain_remove(um_pci_inner_domain); pci_free_resource_list(&bridge->windows); diff --git a/arch/um/drivers/virt-pci.h b/arch/um/drivers/virt-pci.h new file mode 100644 index 000000000000..b20d1475d1eb --- /dev/null +++ b/arch/um/drivers/virt-pci.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __UM_VIRT_PCI_H +#define __UM_VIRT_PCI_H + +#include + +struct um_pci_device { + const struct um_pci_ops *ops; + + /* for now just standard BARs */ + u8 resptr[PCI_STD_NUM_BARS]; + + int irq; +}; + +struct um_pci_ops { + unsigned long (*cfgspace_read)(struct um_pci_device *dev, + unsigned int offset, int size); + void (*cfgspace_write)(struct um_pci_device *dev, unsigned int offset, + int size, unsigned long val); + + unsigned long (*bar_read)(struct um_pci_device *dev, int bar, + unsigned int offset, int size); + void (*bar_write)(struct um_pci_device *dev, int bar, + unsigned int offset, int size, unsigned long val); + + void (*bar_copy_from)(struct um_pci_device *dev, int bar, void *buffer, + unsigned int offset, int size); + void (*bar_copy_to)(struct um_pci_device *dev, int bar, + unsigned int offset, const void *buffer, int size); + void (*bar_set)(struct um_pci_device *dev, int bar, + unsigned int offset, u8 value, int size); +}; + +int um_pci_device_register(struct um_pci_device *dev); +void um_pci_device_unregister(struct um_pci_device *dev); + +int um_pci_platform_device_register(struct um_pci_device *dev); +void um_pci_platform_device_unregister(struct um_pci_device *dev); + +#endif /* __UM_VIRT_PCI_H */ diff --git a/arch/um/drivers/virtio_pcidev.c b/arch/um/drivers/virtio_pcidev.c new file mode 100644 index 000000000000..3c4c4c928fdd --- /dev/null +++ b/arch/um/drivers/virtio_pcidev.c @@ -0,0 +1,628 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Intel Corporation + * Author: Johannes Berg + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "virt-pci.h" + +#define to_virtio_pcidev(_pdev) \ + container_of(_pdev, struct virtio_pcidev_device, pdev) + +/* for MSI-X we have a 32-bit payload */ +#define MAX_IRQ_MSG_SIZE (sizeof(struct virtio_pcidev_msg) + sizeof(u32)) +#define NUM_IRQ_MSGS 10 + +struct virtio_pcidev_message_buffer { + struct virtio_pcidev_msg hdr; + u8 data[8]; +}; + +struct virtio_pcidev_device { + struct um_pci_device pdev; + struct virtio_device *vdev; + + struct virtqueue *cmd_vq, *irq_vq; + +#define VIRTIO_PCIDEV_WRITE_BUFS 20 + struct virtio_pcidev_message_buffer bufs[VIRTIO_PCIDEV_WRITE_BUFS + 1]; + void *extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS + 1]; + DECLARE_BITMAP(used_bufs, VIRTIO_PCIDEV_WRITE_BUFS); + +#define UM_PCI_STAT_WAITING 0 + unsigned long status; + + bool platform; +}; + +static unsigned int virtio_pcidev_max_delay_us = 40000; +module_param_named(max_delay_us, virtio_pcidev_max_delay_us, uint, 0644); + +static int virtio_pcidev_get_buf(struct virtio_pcidev_device *dev, bool *posted) +{ + int i; + + for (i = 0; i < VIRTIO_PCIDEV_WRITE_BUFS; i++) { + if (!test_and_set_bit(i, dev->used_bufs)) + return i; + } + + *posted = false; + return VIRTIO_PCIDEV_WRITE_BUFS; +} + +static void virtio_pcidev_free_buf(struct virtio_pcidev_device *dev, void *buf) +{ + int i; + + if (buf == &dev->bufs[VIRTIO_PCIDEV_WRITE_BUFS]) { + kfree(dev->extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS]); + dev->extra_ptrs[VIRTIO_PCIDEV_WRITE_BUFS] = NULL; + return; + } + + for (i = 0; i < VIRTIO_PCIDEV_WRITE_BUFS; i++) { + if (buf == &dev->bufs[i]) { + kfree(dev->extra_ptrs[i]); + dev->extra_ptrs[i] = NULL; + WARN_ON(!test_and_clear_bit(i, dev->used_bufs)); + return; + } + } + + WARN_ON(1); +} + +static int virtio_pcidev_send_cmd(struct virtio_pcidev_device *dev, + struct virtio_pcidev_msg *cmd, + unsigned int cmd_size, + const void *extra, unsigned int extra_size, + void *out, unsigned int out_size) +{ + struct scatterlist out_sg, extra_sg, in_sg; + struct scatterlist *sgs_list[] = { + [0] = &out_sg, + [1] = extra ? &extra_sg : &in_sg, + [2] = extra ? &in_sg : NULL, + }; + struct virtio_pcidev_message_buffer *buf; + int delay_count = 0; + bool bounce_out; + int ret, len; + int buf_idx; + bool posted; + + if (WARN_ON(cmd_size < sizeof(*cmd) || cmd_size > sizeof(*buf))) + return -EINVAL; + + switch (cmd->op) { + case VIRTIO_PCIDEV_OP_CFG_WRITE: + case VIRTIO_PCIDEV_OP_MMIO_WRITE: + case VIRTIO_PCIDEV_OP_MMIO_MEMSET: + /* in PCI, writes are posted, so don't wait */ + posted = !out; + WARN_ON(!posted); + break; + default: + posted = false; + break; + } + + bounce_out = !posted && cmd_size <= sizeof(*cmd) && + out && out_size <= sizeof(buf->data); + + buf_idx = virtio_pcidev_get_buf(dev, &posted); + buf = &dev->bufs[buf_idx]; + memcpy(buf, cmd, cmd_size); + + if (posted && extra && extra_size > sizeof(buf) - cmd_size) { + dev->extra_ptrs[buf_idx] = kmemdup(extra, extra_size, + GFP_ATOMIC); + + if (!dev->extra_ptrs[buf_idx]) { + virtio_pcidev_free_buf(dev, buf); + return -ENOMEM; + } + extra = dev->extra_ptrs[buf_idx]; + } else if (extra && extra_size <= sizeof(buf) - cmd_size) { + memcpy((u8 *)buf + cmd_size, extra, extra_size); + cmd_size += extra_size; + extra_size = 0; + extra = NULL; + cmd = (void *)buf; + } else { + cmd = (void *)buf; + } + + sg_init_one(&out_sg, cmd, cmd_size); + if (extra) + sg_init_one(&extra_sg, extra, extra_size); + /* allow stack for small buffers */ + if (bounce_out) + sg_init_one(&in_sg, buf->data, out_size); + else if (out) + sg_init_one(&in_sg, out, out_size); + + /* add to internal virtio queue */ + ret = virtqueue_add_sgs(dev->cmd_vq, sgs_list, + extra ? 2 : 1, + out ? 1 : 0, + cmd, GFP_ATOMIC); + if (ret) { + virtio_pcidev_free_buf(dev, buf); + return ret; + } + + if (posted) { + virtqueue_kick(dev->cmd_vq); + return 0; + } + + /* kick and poll for getting a response on the queue */ + set_bit(UM_PCI_STAT_WAITING, &dev->status); + virtqueue_kick(dev->cmd_vq); + ret = 0; + + while (1) { + void *completed = virtqueue_get_buf(dev->cmd_vq, &len); + + if (completed == buf) + break; + + if (completed) + virtio_pcidev_free_buf(dev, completed); + + if (WARN_ONCE(virtqueue_is_broken(dev->cmd_vq) || + ++delay_count > virtio_pcidev_max_delay_us, + "um virt-pci delay: %d", delay_count)) { + ret = -EIO; + break; + } + udelay(1); + } + clear_bit(UM_PCI_STAT_WAITING, &dev->status); + + if (bounce_out) + memcpy(out, buf->data, out_size); + + virtio_pcidev_free_buf(dev, buf); + + return ret; +} + +static unsigned long virtio_pcidev_cfgspace_read(struct um_pci_device *pdev, + unsigned int offset, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_READ, + .size = size, + .addr = offset, + }; + /* max 8, we might not use it all */ + u8 data[8]; + + memset(data, 0xff, sizeof(data)); + + /* size has been checked in um_pci_cfgspace_read() */ + if (virtio_pcidev_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, data, size)) + return ULONG_MAX; + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static void virtio_pcidev_cfgspace_write(struct um_pci_device *pdev, + unsigned int offset, int size, + unsigned long val) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct { + struct virtio_pcidev_msg hdr; + /* maximum size - we may only use parts of it */ + u8 data[8]; + } msg = { + .hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_WRITE, + .size = size, + .addr = offset, + }, + }; + + /* size has been checked in um_pci_cfgspace_write() */ + switch (size) { + case 1: + msg.data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)msg.data); + break; + case 4: + put_unaligned_le32(val, (void *)msg.data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)msg.data); + break; +#endif + } + + WARN_ON(virtio_pcidev_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0)); +} + +static void virtio_pcidev_bar_copy_from(struct um_pci_device *pdev, + int bar, void *buffer, + unsigned int offset, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_MMIO_READ, + .bar = bar, + .size = size, + .addr = offset, + }; + + memset(buffer, 0xff, size); + + virtio_pcidev_send_cmd(dev, &hdr, sizeof(hdr), NULL, 0, buffer, size); +} + +static unsigned long virtio_pcidev_bar_read(struct um_pci_device *pdev, int bar, + unsigned int offset, int size) +{ + /* 8 is maximum size - we may only use parts of it */ + u8 data[8]; + + /* size has been checked in um_pci_bar_read() */ + virtio_pcidev_bar_copy_from(pdev, bar, data, offset, size); + + switch (size) { + case 1: + return data[0]; + case 2: + return le16_to_cpup((void *)data); + case 4: + return le32_to_cpup((void *)data); +#ifdef CONFIG_64BIT + case 8: + return le64_to_cpup((void *)data); +#endif + default: + return ULONG_MAX; + } +} + +static void virtio_pcidev_bar_copy_to(struct um_pci_device *pdev, + int bar, unsigned int offset, + const void *buffer, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct virtio_pcidev_msg hdr = { + .op = VIRTIO_PCIDEV_OP_MMIO_WRITE, + .bar = bar, + .size = size, + .addr = offset, + }; + + virtio_pcidev_send_cmd(dev, &hdr, sizeof(hdr), buffer, size, NULL, 0); +} + +static void virtio_pcidev_bar_write(struct um_pci_device *pdev, int bar, + unsigned int offset, int size, + unsigned long val) +{ + /* maximum size - we may only use parts of it */ + u8 data[8]; + + /* size has been checked in um_pci_bar_write() */ + switch (size) { + case 1: + data[0] = (u8)val; + break; + case 2: + put_unaligned_le16(val, (void *)data); + break; + case 4: + put_unaligned_le32(val, (void *)data); + break; +#ifdef CONFIG_64BIT + case 8: + put_unaligned_le64(val, (void *)data); + break; +#endif + } + + virtio_pcidev_bar_copy_to(pdev, bar, offset, data, size); +} + +static void virtio_pcidev_bar_set(struct um_pci_device *pdev, int bar, + unsigned int offset, u8 value, int size) +{ + struct virtio_pcidev_device *dev = to_virtio_pcidev(pdev); + struct { + struct virtio_pcidev_msg hdr; + u8 data; + } msg = { + .hdr = { + .op = VIRTIO_PCIDEV_OP_CFG_WRITE, + .bar = bar, + .size = size, + .addr = offset, + }, + .data = value, + }; + + virtio_pcidev_send_cmd(dev, &msg.hdr, sizeof(msg), NULL, 0, NULL, 0); +} + +static const struct um_pci_ops virtio_pcidev_um_pci_ops = { + .cfgspace_read = virtio_pcidev_cfgspace_read, + .cfgspace_write = virtio_pcidev_cfgspace_write, + .bar_read = virtio_pcidev_bar_read, + .bar_write = virtio_pcidev_bar_write, + .bar_copy_from = virtio_pcidev_bar_copy_from, + .bar_copy_to = virtio_pcidev_bar_copy_to, + .bar_set = virtio_pcidev_bar_set, +}; + +static void virtio_pcidev_irq_vq_addbuf(struct virtqueue *vq, void *buf, bool kick) +{ + struct scatterlist sg[1]; + + sg_init_one(sg, buf, MAX_IRQ_MSG_SIZE); + if (virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC)) + kfree(buf); + else if (kick) + virtqueue_kick(vq); +} + +static void virtio_pcidev_handle_irq_message(struct virtqueue *vq, + struct virtio_pcidev_msg *msg) +{ + struct virtio_device *vdev = vq->vdev; + struct virtio_pcidev_device *dev = vdev->priv; + + if (!dev->pdev.irq) + return; + + /* we should properly chain interrupts, but on ARCH=um we don't care */ + + switch (msg->op) { + case VIRTIO_PCIDEV_OP_INT: + generic_handle_irq(dev->pdev.irq); + break; + case VIRTIO_PCIDEV_OP_MSI: + /* our MSI message is just the interrupt number */ + if (msg->size == sizeof(u32)) + generic_handle_irq(le32_to_cpup((void *)msg->data)); + else + generic_handle_irq(le16_to_cpup((void *)msg->data)); + break; + case VIRTIO_PCIDEV_OP_PME: + /* nothing to do - we already woke up due to the message */ + break; + default: + dev_err(&vdev->dev, "unexpected virt-pci message %d\n", msg->op); + break; + } +} + +static void virtio_pcidev_cmd_vq_cb(struct virtqueue *vq) +{ + struct virtio_device *vdev = vq->vdev; + struct virtio_pcidev_device *dev = vdev->priv; + void *cmd; + int len; + + if (test_bit(UM_PCI_STAT_WAITING, &dev->status)) + return; + + while ((cmd = virtqueue_get_buf(vq, &len))) + virtio_pcidev_free_buf(dev, cmd); +} + +static void virtio_pcidev_irq_vq_cb(struct virtqueue *vq) +{ + struct virtio_pcidev_msg *msg; + int len; + + while ((msg = virtqueue_get_buf(vq, &len))) { + if (len >= sizeof(*msg)) + virtio_pcidev_handle_irq_message(vq, msg); + + /* recycle the message buffer */ + virtio_pcidev_irq_vq_addbuf(vq, msg, true); + } +} + +static int virtio_pcidev_init_vqs(struct virtio_pcidev_device *dev) +{ + struct virtqueue_info vqs_info[] = { + { "cmd", virtio_pcidev_cmd_vq_cb }, + { "irq", virtio_pcidev_irq_vq_cb }, + }; + struct virtqueue *vqs[2]; + int err, i; + + err = virtio_find_vqs(dev->vdev, 2, vqs, vqs_info, NULL); + if (err) + return err; + + dev->cmd_vq = vqs[0]; + dev->irq_vq = vqs[1]; + + virtio_device_ready(dev->vdev); + + for (i = 0; i < NUM_IRQ_MSGS; i++) { + void *msg = kzalloc(MAX_IRQ_MSG_SIZE, GFP_KERNEL); + + if (msg) + virtio_pcidev_irq_vq_addbuf(dev->irq_vq, msg, false); + } + + virtqueue_kick(dev->irq_vq); + + return 0; +} + +static void __virtio_pcidev_virtio_platform_remove(struct virtio_device *vdev, + struct virtio_pcidev_device *dev) +{ + um_pci_platform_device_unregister(&dev->pdev); + + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); + + kfree(dev); +} + +static int virtio_pcidev_virtio_platform_probe(struct virtio_device *vdev, + struct virtio_pcidev_device *dev) +{ + int err; + + dev->platform = true; + + err = virtio_pcidev_init_vqs(dev); + if (err) + goto err_free; + + err = um_pci_platform_device_register(&dev->pdev); + if (err) + goto err_reset; + + err = of_platform_default_populate(vdev->dev.of_node, NULL, &vdev->dev); + if (err) + goto err_unregister; + + return 0; + +err_unregister: + um_pci_platform_device_unregister(&dev->pdev); +err_reset: + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); +err_free: + kfree(dev); + return err; +} + +static int virtio_pcidev_virtio_probe(struct virtio_device *vdev) +{ + struct virtio_pcidev_device *dev; + int err; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + dev->vdev = vdev; + vdev->priv = dev; + + dev->pdev.ops = &virtio_pcidev_um_pci_ops; + + if (of_device_is_compatible(vdev->dev.of_node, "simple-bus")) + return virtio_pcidev_virtio_platform_probe(vdev, dev); + + err = virtio_pcidev_init_vqs(dev); + if (err) + goto err_free; + + err = um_pci_device_register(&dev->pdev); + if (err) + goto err_reset; + + device_set_wakeup_enable(&vdev->dev, true); + + /* + * In order to do suspend-resume properly, don't allow VQs + * to be suspended. + */ + virtio_uml_set_no_vq_suspend(vdev, true); + + return 0; + +err_reset: + virtio_reset_device(vdev); + vdev->config->del_vqs(vdev); +err_free: + kfree(dev); + return err; +} + +static void virtio_pcidev_virtio_remove(struct virtio_device *vdev) +{ + struct virtio_pcidev_device *dev = vdev->priv; + + if (dev->platform) { + of_platform_depopulate(&vdev->dev); + __virtio_pcidev_virtio_platform_remove(vdev, dev); + return; + } + + device_set_wakeup_enable(&vdev->dev, false); + + um_pci_device_unregister(&dev->pdev); + + /* Stop all virtqueues */ + virtio_reset_device(vdev); + dev->cmd_vq = NULL; + dev->irq_vq = NULL; + vdev->config->del_vqs(vdev); + + kfree(dev); +} + +static struct virtio_device_id id_table[] = { + { CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; +MODULE_DEVICE_TABLE(virtio, id_table); + +static struct virtio_driver virtio_pcidev_virtio_driver = { + .driver.name = "virtio-pci", + .id_table = id_table, + .probe = virtio_pcidev_virtio_probe, + .remove = virtio_pcidev_virtio_remove, +}; + +static int __init virtio_pcidev_init(void) +{ + if (WARN(CONFIG_UML_PCI_OVER_VIRTIO_DEVICE_ID < 0, + "No virtio device ID configured for PCI - no PCI support\n")) + return 0; + + return register_virtio_driver(&virtio_pcidev_virtio_driver); +} +late_initcall(virtio_pcidev_init); + +static void __exit virtio_pcidev_exit(void) +{ + unregister_virtio_driver(&virtio_pcidev_virtio_driver); +} +module_exit(virtio_pcidev_exit); -- 2.51.0 From cef721e0d53d2b64f2ba177c63a0dfdd7c0daf17 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 24 Feb 2025 19:18:19 +0100 Subject: [PATCH 13/16] um: Store full CSGSFS and SS register from mcontext Doing this allows using registers as retrieved from an mcontext to be pushed to a process using PTRACE_SETREGS. It is not entirely clear to me why CSGSFS was masked. Doing so creates issues when using the mcontext as process state in seccomp and simply copying the register appears to work perfectly fine for ptrace. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250224181827.647129-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg --- arch/x86/um/os-Linux/mcontext.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c index d2f3a595b4ef..37decaa74761 100644 --- a/arch/x86/um/os-Linux/mcontext.c +++ b/arch/x86/um/os-Linux/mcontext.c @@ -28,8 +28,7 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) COPY(RIP); COPY2(EFLAGS, EFL); COPY2(CS, CSGSFS); - regs->gp[CS / sizeof(unsigned long)] &= 0xffff; - regs->gp[CS / sizeof(unsigned long)] |= 3; + regs->gp[SS / sizeof(unsigned long)] = mc->gregs[REG_CSGSFS] >> 48; #endif } -- 2.51.0 From 16a0ca5e4e79ca52fb1c4a9194dc6bdadacae4fb Mon Sep 17 00:00:00 2001 From: Hajime Tazaki Date: Mon, 20 Jan 2025 15:00:03 +0900 Subject: [PATCH 14/16] um: x86: clean up elf specific definitions The file arch/x86/um/asm/module.h is equivalent to the definition of asm-generic. Thus this commit cleans up to use it. Signed-off-by: Hajime Tazaki Link: https://patch.msgid.link/2d70a0ed79ee0a0bef80ad4790063f4833dd9bed.1737348399.git.thehajime@gmail.com Signed-off-by: Johannes Berg --- arch/um/include/asm/Kbuild | 1 + arch/x86/um/asm/module.h | 24 ------------------------ 2 files changed, 1 insertion(+), 24 deletions(-) delete mode 100644 arch/x86/um/asm/module.h diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 428f2c5158c2..04ab3b653a48 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -13,6 +13,7 @@ generic-y += irq_work.h generic-y += kdebug.h generic-y += mcs_spinlock.h generic-y += mmiowb.h +generic-y += module.h generic-y += module.lds.h generic-y += param.h generic-y += parport.h diff --git a/arch/x86/um/asm/module.h b/arch/x86/um/asm/module.h deleted file mode 100644 index a3b061d66082..000000000000 --- a/arch/x86/um/asm/module.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __UM_MODULE_H -#define __UM_MODULE_H - -/* UML is simple */ -struct mod_arch_specific -{ -}; - -#ifdef CONFIG_X86_32 - -#define Elf_Shdr Elf32_Shdr -#define Elf_Sym Elf32_Sym -#define Elf_Ehdr Elf32_Ehdr - -#else - -#define Elf_Shdr Elf64_Shdr -#define Elf_Sym Elf64_Sym -#define Elf_Ehdr Elf64_Ehdr - -#endif - -#endif -- 2.51.0 From 4f087eafdcef24b7160b097ddb9704084767b77a Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Wed, 19 Mar 2025 21:55:20 +0800 Subject: [PATCH 15/16] um: Add pthread-based helper support Introduce a new set of utility functions that can be used to create pthread-based helpers. Helper threads created in this way will ensure thread safety for errno while sharing the same memory space. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250319135523.97050-2-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/include/shared/os.h | 5 +++ arch/um/os-Linux/helper.c | 63 +++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h index bc02767f0639..d0ae42911cb5 100644 --- a/arch/um/include/shared/os.h +++ b/arch/um/include/shared/os.h @@ -224,6 +224,11 @@ extern int run_helper_thread(int (*proc)(void *), void *arg, unsigned int flags, unsigned long *stack_out); extern int helper_wait(int pid); +struct os_helper_thread; +int os_run_helper_thread(struct os_helper_thread **td_out, + void *(*routine)(void *), void *arg); +void os_kill_helper_thread(struct os_helper_thread *td); +void os_fix_helper_thread_signals(void); /* umid.c */ extern int umid_file_name(char *name, char *buf, int len); diff --git a/arch/um/os-Linux/helper.c b/arch/um/os-Linux/helper.c index 3cb8ac63be6e..df22cba24d82 100644 --- a/arch/um/os-Linux/helper.c +++ b/arch/um/os-Linux/helper.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -167,3 +168,65 @@ int helper_wait(int pid) } else return 0; } + +struct os_helper_thread { + pthread_t handle; +}; + +int os_run_helper_thread(struct os_helper_thread **td_out, + void *(*routine)(void *), void *arg) +{ + struct os_helper_thread *td; + sigset_t sigset, oset; + int err, flags; + + flags = __uml_cant_sleep() ? UM_GFP_ATOMIC : UM_GFP_KERNEL; + td = uml_kmalloc(sizeof(*td), flags); + if (!td) + return -ENOMEM; + + sigfillset(&sigset); + if (sigprocmask(SIG_SETMASK, &sigset, &oset) < 0) { + err = -errno; + kfree(td); + return err; + } + + err = pthread_create(&td->handle, NULL, routine, arg); + + if (sigprocmask(SIG_SETMASK, &oset, NULL) < 0) + panic("Failed to restore the signal mask: %d", errno); + + if (err != 0) + kfree(td); + else + *td_out = td; + + return -err; +} + +void os_kill_helper_thread(struct os_helper_thread *td) +{ + pthread_cancel(td->handle); + pthread_join(td->handle, NULL); + kfree(td); +} + +void os_fix_helper_thread_signals(void) +{ + sigset_t sigset; + + sigemptyset(&sigset); + + sigaddset(&sigset, SIGWINCH); + sigaddset(&sigset, SIGPIPE); + sigaddset(&sigset, SIGPROF); + sigaddset(&sigset, SIGINT); + sigaddset(&sigset, SIGTERM); + sigaddset(&sigset, SIGCHLD); + sigaddset(&sigset, SIGALRM); + sigaddset(&sigset, SIGIO); + sigaddset(&sigset, SIGUSR1); + + pthread_sigmask(SIG_SETMASK, &sigset, NULL); +} -- 2.51.0 From d7f89a9da4328eee450d2e72631d1ec7e52976f0 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Wed, 19 Mar 2025 21:55:21 +0800 Subject: [PATCH 16/16] um: ubd: Switch to the pthread-based helper The ubd io thread and UML kernel thread share the same errno, which can lead to conflicts when both call syscalls concurrently. Switch to the pthread-based helper to address this issue. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250319135523.97050-3-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg --- arch/um/drivers/ubd.h | 6 ++++-- arch/um/drivers/ubd_kern.c | 25 +++++++++++-------------- arch/um/drivers/ubd_user.c | 14 +++++++------- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/arch/um/drivers/ubd.h b/arch/um/drivers/ubd.h index f016fe15499f..2985c14661f4 100644 --- a/arch/um/drivers/ubd.h +++ b/arch/um/drivers/ubd.h @@ -7,8 +7,10 @@ #ifndef __UM_UBD_USER_H #define __UM_UBD_USER_H -extern int start_io_thread(unsigned long sp, int *fds_out); -extern int io_thread(void *arg); +#include + +int start_io_thread(struct os_helper_thread **td_out, int *fd_out); +void *io_thread(void *arg); extern int kernel_fd; extern int ubd_read_poll(int timeout); diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 0b1e61f72fb3..4de6613e7468 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -474,12 +474,12 @@ static irqreturn_t ubd_intr(int irq, void *dev) } /* Only changed by ubd_init, which is an initcall. */ -static int io_pid = -1; +static struct os_helper_thread *io_td; static void kill_io_thread(void) { - if(io_pid != -1) - os_kill_process(io_pid, 1); + if (io_td) + os_kill_helper_thread(io_td); } __uml_exitcall(kill_io_thread); @@ -1104,8 +1104,8 @@ static int __init ubd_init(void) late_initcall(ubd_init); -static int __init ubd_driver_init(void){ - unsigned long stack; +static int __init ubd_driver_init(void) +{ int err; /* Set by CONFIG_BLK_DEV_UBD_SYNC or ubd=sync.*/ @@ -1114,13 +1114,11 @@ static int __init ubd_driver_init(void){ /* Letting ubd=sync be like using ubd#s= instead of ubd#= is * enough. So use anyway the io thread. */ } - stack = alloc_stack(0, 0); - io_pid = start_io_thread(stack + PAGE_SIZE, &thread_fd); - if(io_pid < 0){ + err = start_io_thread(&io_td, &thread_fd); + if (err < 0) { printk(KERN_ERR "ubd : Failed to start I/O thread (errno = %d) - " - "falling back to synchronous I/O\n", -io_pid); - io_pid = -1; + "falling back to synchronous I/O\n", -err); return 0; } err = um_request_irq(UBD_IRQ, thread_fd, IRQ_READ, ubd_intr, @@ -1496,12 +1494,11 @@ int kernel_fd = -1; /* Only changed by the io thread. XXX: currently unused. */ static int io_count; -int io_thread(void *arg) +void *io_thread(void *arg) { int n, count, written, res; - os_set_pdeathsig(); - os_fix_helper_signals(); + os_fix_helper_thread_signals(); while(1){ n = bulk_req_safe_read( @@ -1543,5 +1540,5 @@ int io_thread(void *arg) } while (written < n); } - return 0; + return NULL; } diff --git a/arch/um/drivers/ubd_user.c b/arch/um/drivers/ubd_user.c index b4f8b8e60564..c5e6545f6fcf 100644 --- a/arch/um/drivers/ubd_user.c +++ b/arch/um/drivers/ubd_user.c @@ -25,9 +25,9 @@ static struct pollfd kernel_pollfd; -int start_io_thread(unsigned long sp, int *fd_out) +int start_io_thread(struct os_helper_thread **td_out, int *fd_out) { - int pid, fds[2], err; + int fds[2], err; err = os_pipe(fds, 1, 1); if(err < 0){ @@ -47,14 +47,14 @@ int start_io_thread(unsigned long sp, int *fd_out) goto out_close; } - pid = clone(io_thread, (void *) sp, CLONE_FILES | CLONE_VM, NULL); - if(pid < 0){ - err = -errno; - printk("start_io_thread - clone failed : errno = %d\n", errno); + err = os_run_helper_thread(td_out, io_thread, NULL); + if (err < 0) { + printk("%s - failed to run helper thread, err = %d\n", + __func__, -err); goto out_close; } - return(pid); + return 0; out_close: os_close_file(fds[0]); -- 2.51.0