From 7bc1ee28f4d21fc1e2f3d09534baf86ce7f3ba5e Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Tue, 19 Nov 2024 13:22:47 +0100 Subject: [PATCH 01/16] s390/cpum_sf: Simplify release of SDBs and SDBTs free_sampling_buffer() releases the Sampling Data Buffers (SDBs) and Sampling Data Buffer Tables (SDBTs) allocated at event initialization. Both buffers are PAGE_SIZE bytes in size. Each SDBT consists of 512 entries. The first 511 entries point to SDBs; the last entry points to a successor SDBT. The last SDBT in the list points to the origin of all SDBTs. SDBTs do not contain holes, that is, an entry always points to an SDB. If fewer than 511 SDBs have been allocated, the last entry points to the origin SDBT. Simplify the release of the SDBs and SDBTs: walk along the SDBT chain, release the SDBs and SDBTs, and stop when the origin is reached again. Signed-off-by: Thomas Richter Reviewed-by: Sumanth Korikkar Signed-off-by: Heiko Carstens --- arch/s390/kernel/perf_cpum_sf.c | 38 +++++++++++---------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 0cde42f8af6e..1e99514fb7ae 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -180,39 +180,27 @@ static int sf_buffer_available(struct cpu_hw_sf *cpuhw) */ static void free_sampling_buffer(struct sf_buffer *sfb) { - unsigned long *sdbt, *curr; - - if (!sfb->sdbt) - return; + unsigned long *sdbt, *curr, *head; sdbt = sfb->sdbt; - curr = sdbt; - + if (!sdbt) + return; + sfb->sdbt = NULL; /* Free the SDBT after all SDBs are processed... */ - while (1) { - if (!*curr || !sdbt) - break; - - /* Process table-link entries */ + head = sdbt; + curr = sdbt; + do { if (is_link_entry(curr)) { + /* Process table-link entries */ curr = get_next_sdbt(curr); - if (sdbt) - free_page((unsigned long)sdbt); - - /* If the origin is reached, sampling buffer is freed */ - if (curr == sfb->sdbt) - break; - else - sdbt = curr; + free_page((unsigned long)sdbt); + sdbt = curr; } else { /* Process SDB pointer */ - if (*curr) { - free_page((unsigned long)phys_to_virt(*curr)); - curr++; - } + free_page((unsigned long)phys_to_virt(*curr)); + curr++; } - } - + } while (curr != head); memset(sfb, 0, sizeof(*sfb)); } -- 2.51.0 From 45c9f2b856a075a34873d00788d2e8a250c1effd Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Tue, 19 Nov 2024 14:54:07 +0100 Subject: [PATCH 02/16] s390/entry: Mark IRQ entries to fix stack depot warnings The stack depot filters out everything outside of the top interrupt context as an uninteresting or irrelevant part of the stack traces. This helps with stack trace de-duplication, avoiding an explosion of saved stack traces that share the same IRQ context code path but originate from different randomly interrupted points, eventually exhausting the stack depot. Filtering uses in_irqentry_text() to identify functions within the .irqentry.text and .softirqentry.text sections, which then become the last stack trace entries being saved. While __do_softirq() is placed into the .softirqentry.text section by common code, populating .irqentry.text is architecture-specific. Currently, the .irqentry.text section on s390 is empty, which prevents stack depot filtering and de-duplication and could result in warnings like: Stack depot reached limit capacity WARNING: CPU: 0 PID: 286113 at lib/stackdepot.c:252 depot_alloc_stack+0x39a/0x3c8 with PREEMPT and KASAN enabled.
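For reference, the filtering mentioned above amounts to an address-range check against the linker-provided section bounds. A simplified sketch of the lib/stackdepot.c logic (details may vary by kernel version):

extern char __irqentry_text_start[], __irqentry_text_end[];
extern char __softirqentry_text_start[], __softirqentry_text_end[];

static int in_irqentry_text(unsigned long ptr)
{
        return (ptr >= (unsigned long)__irqentry_text_start &&
                ptr < (unsigned long)__irqentry_text_end) ||
               (ptr >= (unsigned long)__softirqentry_text_start &&
                ptr < (unsigned long)__softirqentry_text_end);
}

unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries)
{
        unsigned int i;

        for (i = 0; i < nr_entries; i++) {
                /* Keep the irqentry function, drop everything beyond it */
                if (in_irqentry_text(entries[i]))
                        return i + 1;
        }
        return nr_entries;
}

With .irqentry.text empty on s390, in_irqentry_text() never matches, so every randomly interrupted trace is stored in full and the depot fills up.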
Fix this by moving the IO/EXT interrupt handlers from .kprobes.text into the .irqentry.text section and updating the kprobes blacklist to include the .irqentry.text section. This is done only for asynchronous interrupts and explicitly not for program checks, which are synchronous and where the context beyond the program check is important to preserve. Despite machine checks being somewhat in between, they are extremely rare, and preserving context when possible is also of value. SVCs and Restart Interrupts are not relevant: one is always at the boundary to user space, and the other is a one-time event. IRQ entry filtering is also optionally used by the ftrace function graph tracer, where the same logic applies. Cc: stable@vger.kernel.org # 5.15+ Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- arch/s390/kernel/entry.S | 4 ++++ arch/s390/kernel/kprobes.c | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 1ff13239d4e5..960c08700cf6 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -430,9 +430,13 @@ SYM_CODE_START(\name) SYM_CODE_END(\name) .endm + .section .irqentry.text, "ax" + INT_HANDLER ext_int_handler,__LC_EXT_OLD_PSW,do_ext_irq INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq + .section .kprobes.text, "ax" + /* * Machine check handler routines */ diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 6295faf0987d..8b80ea57125f 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -489,6 +489,12 @@ int __init arch_init_kprobes(void) return 0; } +int __init arch_populate_kprobe_blacklist(void) +{ + return kprobe_add_area_blacklist((unsigned long)__irqentry_text_start, + (unsigned long)__irqentry_text_end); +} + int arch_trampoline_kprobe(struct kprobe *p) { return 0; -- 2.51.0 From 546d7bd47973790073b824d69ee59440337b407b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 21 Nov 2024 12:41:07 +0100 Subject: [PATCH 03/16] s390: Add missing _TIF defines Add missing _TIF defines so that for every TIF bit its corresponding _TIF mask exists. Sort the _TIF list to match the TIF order. Also remove two leftover comments from the time before the generic entry code.
Signed-off-by: Heiko Carstens --- arch/s390/include/asm/thread_info.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 00ac01874a12..3802f281eeed 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -61,7 +61,6 @@ void arch_setup_new_exec(void); /* * thread information flags bit numbers */ -/* _TIF_WORK bits */ #define TIF_NOTIFY_RESUME 0 /* callback before returning to user */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ @@ -72,33 +71,33 @@ void arch_setup_new_exec(void); #define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ #define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ #define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ - #define TIF_31BIT 16 /* 32bit process */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ #define TIF_RESTORE_SIGMASK 18 /* restore signal mask in do_signal() */ #define TIF_SINGLE_STEP 19 /* This task is single stepped */ #define TIF_BLOCK_STEP 20 /* This task is block stepped */ #define TIF_UPROBE_SINGLESTEP 21 /* This task is uprobe single stepped */ - -/* _TIF_TRACE bits */ #define TIF_SYSCALL_TRACE 24 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 25 /* syscall auditing active */ #define TIF_SECCOMP 26 /* secure computing */ #define TIF_SYSCALL_TRACEPOINT 27 /* syscall tracepoint instrumentation */ #define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) -#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) #define _TIF_SIGPENDING BIT(TIF_SIGPENDING) #define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) #define _TIF_UPROBE BIT(TIF_UPROBE) #define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) +#define _TIF_PGSTE BIT(TIF_PGSTE) +#define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) #define _TIF_PER_TRAP BIT(TIF_PER_TRAP) - #define _TIF_31BIT BIT(TIF_31BIT) +#define _TIF_MEMDIE BIT(TIF_MEMDIE) +#define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #define _TIF_SINGLE_STEP BIT(TIF_SINGLE_STEP) - +#define _TIF_BLOCK_STEP BIT(TIF_BLOCK_STEP) +#define _TIF_UPROBE_SINGLESTEP BIT(TIF_UPROBE_SINGLESTEP) #define _TIF_SYSCALL_TRACE BIT(TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT BIT(TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP BIT(TIF_SECCOMP) -- 2.51.0 From 9de3e4bf6cfbcb62e9ec658e3b291cd479018b43 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 21 Nov 2024 12:41:08 +0100 Subject: [PATCH 04/16] s390: Add ARCH_HAS_PREEMPT_LAZY support Just add the required TIF bit for ARCH_HAS_PREEMPT_LAZY support. Shuffle TIF bits to get TIF_NEED_RESCHED_LAZY next to TIF_NEED_RESCHED. 
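Keeping the lazy bit right next to TIF_NEED_RESCHED lets exit-to-user code test both conditions with a single combined mask. A minimal userspace sketch of that pattern (bit values mirror the patch; the check itself is illustrative, not the kernel's entry code):

#include <stdio.h>

#define BIT(nr) (1UL << (nr))

/* Bit numbers as laid out by the patch */
#define TIF_NEED_RESCHED        2
#define TIF_NEED_RESCHED_LAZY   3
#define _TIF_NEED_RESCHED       BIT(TIF_NEED_RESCHED)
#define _TIF_NEED_RESCHED_LAZY  BIT(TIF_NEED_RESCHED_LAZY)

int main(void)
{
        unsigned long flags = _TIF_NEED_RESCHED_LAZY;

        /* One mask covers both the eager and the lazy flag */
        if (flags & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
                puts("rescheduling needed (eager or lazy)");
        return 0;
}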
Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 + arch/s390/include/asm/thread_info.h | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a45259d4c0a5..b7e1eec7903e 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -87,6 +87,7 @@ config S390 select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS + select ARCH_HAS_PREEMPT_LAZY select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SCALED_CPUTIME select ARCH_HAS_SET_DIRECT_MAP diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 3802f281eeed..c33f7144d1b9 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -64,11 +64,12 @@ void arch_setup_new_exec(void); #define TIF_NOTIFY_RESUME 0 /* callback before returning to user */ #define TIF_SIGPENDING 1 /* signal pending */ #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ -#define TIF_UPROBE 3 /* breakpointed or single-stepping */ -#define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */ +#define TIF_NEED_RESCHED_LAZY 3 /* lazy rescheduling needed */ +#define TIF_UPROBE 4 /* breakpointed or single-stepping */ #define TIF_PATCH_PENDING 5 /* pending live patching update */ #define TIF_PGSTE 6 /* New mm's will use 4K page tables */ #define TIF_NOTIFY_SIGNAL 7 /* signal notifications exist */ +#define TIF_GUARDED_STORAGE 8 /* load guarded storage control block */ #define TIF_ISOLATE_BP_GUEST 9 /* Run KVM guests with isolated BP */ #define TIF_PER_TRAP 10 /* Need to handle PER trap on exit to usermode */ #define TIF_31BIT 16 /* 32bit process */ @@ -85,11 +86,12 @@ void arch_setup_new_exec(void); #define _TIF_NOTIFY_RESUME BIT(TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING BIT(TIF_SIGPENDING) #define _TIF_NEED_RESCHED BIT(TIF_NEED_RESCHED) +#define _TIF_NEED_RESCHED_LAZY BIT(TIF_NEED_RESCHED_LAZY) #define _TIF_UPROBE BIT(TIF_UPROBE) -#define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) #define _TIF_PGSTE BIT(TIF_PGSTE) #define _TIF_NOTIFY_SIGNAL BIT(TIF_NOTIFY_SIGNAL) +#define _TIF_GUARDED_STORAGE BIT(TIF_GUARDED_STORAGE) #define _TIF_ISOLATE_BP_GUEST BIT(TIF_ISOLATE_BP_GUEST) #define _TIF_PER_TRAP BIT(TIF_PER_TRAP) #define _TIF_31BIT BIT(TIF_31BIT) -- 2.51.0 From ff123eb7741638d55abf82fac090bb3a543c1e74 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Fri, 22 Nov 2024 13:41:33 +0100 Subject: [PATCH 05/16] s390/mm: Allow large pages for KASAN shadow mapping Commit c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") introduced a large_allowed() helper that restricts which mapping modes can use large pages. This change unintentionally prevented KASAN shadow mappings from using large pages, despite there being no reason to avoid them. In fact, large pages are preferred for performance. Add POPULATE_KASAN_MAP_SHADOW to the allowed list in large_allowed() to restore large page mappings for KASAN shadows. While large_allowed() isn't strictly necessary with current mapping modes since disallowed modes either don't map anything or fail alignment and size checks, keep it for clarity. 
Fixes: c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") Acked-by: Alexander Gordeev Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- arch/s390/boot/vmem.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 145035f84a0e..8476b6f965a9 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -264,7 +264,17 @@ static unsigned long _pa(unsigned long addr, unsigned long size, enum populate_m static bool large_allowed(enum populate_mode mode) { - return (mode == POPULATE_DIRECT) || (mode == POPULATE_IDENTITY) || (mode == POPULATE_KERNEL); + switch (mode) { + case POPULATE_DIRECT: + case POPULATE_IDENTITY: + case POPULATE_KERNEL: +#ifdef CONFIG_KASAN + case POPULATE_KASAN_MAP_SHADOW: +#endif + return true; + default: + return false; + } } static bool can_large_pud(pud_t *pu_dir, unsigned long addr, unsigned long end, -- 2.51.0 From 7726b55b5d6c22dc0e66570597bf6e43d410d093 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 25 Nov 2024 12:54:30 +0100 Subject: [PATCH 06/16] s390/ap: Replace xchg() with WRITE_ONCE() The result of xchg() is not used, and in addition it is used on a one byte memory area which leads to inefficient code. Use WRITE_ONCE() instead to achieve the same result with much less generated code. Acked-by: Harald Freudenberger Acked-by: Alexander Gordeev Signed-off-by: Heiko Carstens --- drivers/s390/crypto/ap_bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index e14638936de6..26e1ea1940ec 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -453,7 +453,7 @@ static void ap_tasklet_fn(unsigned long dummy) * important that no requests on any AP get lost. */ if (ap_irq_flag) - xchg(ap_airq.lsi_ptr, 0); + WRITE_ONCE(*ap_airq.lsi_ptr, 0); spin_lock_bh(&ap_queues_lock); hash_for_each(ap_queues, bkt, aq, hnode) { -- 2.51.0 From 5618c53d96d1bf2df07df8b9204773db28acbc1b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Nov 2024 11:25:13 +0100 Subject: [PATCH 07/16] KVM: s390: Use try_cmpxchg() instead of cmpxchg() loops Convert all cmpxchg() loops to try_cmpxchg() loops. With gcc 14 and the usage of flag output operands in try_cmpxchg() this allows the compiler to generate slightly better code. 
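The conversions below all follow one pattern: try_cmpxchg() writes the observed value back into its expected-value argument on failure, so the initial read moves out of the loop and the manual comparison disappears. A userspace analogue using C11 atomics (illustrative only, mirroring the gisa_set_iam()-style loops):

#include <stdatomic.h>
#include <stdio.h>

/* On failure, atomic_compare_exchange_strong() stores the current value
 * into "old", just as the kernel's try_cmpxchg() does, so no explicit
 * re-read is needed inside the loop.
 */
static void set_low_byte(atomic_ulong *v, unsigned long b)
{
        unsigned long old = atomic_load(v);
        unsigned long new;

        do {
                new = (old & ~0xffUL) | b;
        } while (!atomic_compare_exchange_strong(v, &old, new));
}

int main(void)
{
        atomic_ulong v = 0x1200;

        set_low_byte(&v, 0xab);
        printf("0x%lx\n", (unsigned long)atomic_load(&v)); /* 0x12ab */
        return 0;
}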
Acked-by: Claudio Imbrenda Acked-by: Janosch Frank Link: https://lore.kernel.org/r/20241126102515.3178914-2-hca@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/kvm/gaccess.c | 16 ++++++++-------- arch/s390/kvm/interrupt.c | 12 ++++++------ arch/s390/kvm/kvm-s390.c | 4 ++-- arch/s390/kvm/pci.c | 5 ++--- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index a688351f4ab5..9816b0060fbe 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -129,8 +129,8 @@ static void ipte_lock_simple(struct kvm *kvm) retry: read_lock(&kvm->arch.sca_lock); ic = kvm_s390_get_ipte_control(kvm); + old = READ_ONCE(*ic); do { - old = READ_ONCE(*ic); if (old.k) { read_unlock(&kvm->arch.sca_lock); cond_resched(); @@ -138,7 +138,7 @@ retry: } new = old; new.k = 1; - } while (cmpxchg(&ic->val, old.val, new.val) != old.val); + } while (!try_cmpxchg(&ic->val, &old.val, new.val)); read_unlock(&kvm->arch.sca_lock); out: mutex_unlock(&kvm->arch.ipte_mutex); @@ -154,11 +154,11 @@ static void ipte_unlock_simple(struct kvm *kvm) goto out; read_lock(&kvm->arch.sca_lock); ic = kvm_s390_get_ipte_control(kvm); + old = READ_ONCE(*ic); do { - old = READ_ONCE(*ic); new = old; new.k = 0; - } while (cmpxchg(&ic->val, old.val, new.val) != old.val); + } while (!try_cmpxchg(&ic->val, &old.val, new.val)); read_unlock(&kvm->arch.sca_lock); wake_up(&kvm->arch.ipte_wq); out: @@ -172,8 +172,8 @@ static void ipte_lock_siif(struct kvm *kvm) retry: read_lock(&kvm->arch.sca_lock); ic = kvm_s390_get_ipte_control(kvm); + old = READ_ONCE(*ic); do { - old = READ_ONCE(*ic); if (old.kg) { read_unlock(&kvm->arch.sca_lock); cond_resched(); @@ -182,7 +182,7 @@ retry: new = old; new.k = 1; new.kh++; - } while (cmpxchg(&ic->val, old.val, new.val) != old.val); + } while (!try_cmpxchg(&ic->val, &old.val, new.val)); read_unlock(&kvm->arch.sca_lock); } @@ -192,13 +192,13 @@ static void ipte_unlock_siif(struct kvm *kvm) read_lock(&kvm->arch.sca_lock); ic = kvm_s390_get_ipte_control(kvm); + old = READ_ONCE(*ic); do { - old = READ_ONCE(*ic); new = old; new.kh--; if (!new.kh) new.k = 0; - } while (cmpxchg(&ic->val, old.val, new.val) != old.val); + } while (!try_cmpxchg(&ic->val, &old.val, new.val)); read_unlock(&kvm->arch.sca_lock); if (!new.kh) wake_up(&kvm->arch.ipte_wq); diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 4f0e7f61edf7..eff69018cbeb 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -247,12 +247,12 @@ static inline int gisa_set_iam(struct kvm_s390_gisa *gisa, u8 iam) { u64 word, _word; + word = READ_ONCE(gisa->u64.word[0]); do { - word = READ_ONCE(gisa->u64.word[0]); if ((u64)gisa != word >> 32) return -EBUSY; _word = (word & ~0xffUL) | iam; - } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); + } while (!try_cmpxchg(&gisa->u64.word[0], &word, _word)); return 0; } @@ -270,10 +270,10 @@ static inline void gisa_clear_ipm(struct kvm_s390_gisa *gisa) { u64 word, _word; + word = READ_ONCE(gisa->u64.word[0]); do { - word = READ_ONCE(gisa->u64.word[0]); _word = word & ~(0xffUL << 24); - } while (cmpxchg(&gisa->u64.word[0], word, _word) != word); + } while (!try_cmpxchg(&gisa->u64.word[0], &word, _word)); } /** @@ -291,14 +291,14 @@ static inline u8 gisa_get_ipm_or_restore_iam(struct kvm_s390_gisa_interrupt *gi) u8 pending_mask, alert_mask; u64 word, _word; + word = READ_ONCE(gi->origin->u64.word[0]); do { - word = READ_ONCE(gi->origin->u64.word[0]); alert_mask = READ_ONCE(gi->alert.mask); pending_mask = (u8)(word 
>> 24) & alert_mask; if (pending_mask) return pending_mask; _word = (word & ~0xffUL) | alert_mask; - } while (cmpxchg(&gi->origin->u64.word[0], word, _word) != word); + } while (!try_cmpxchg(&gi->origin->u64.word[0], &word, _word)); return 0; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index deeb32034ad5..a1fcdb2479a1 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1907,11 +1907,11 @@ static void kvm_s390_update_topology_change_report(struct kvm *kvm, bool val) read_lock(&kvm->arch.sca_lock); sca = kvm->arch.sca; + old = READ_ONCE(sca->utility); do { - old = READ_ONCE(sca->utility); new = old; new.mtcr = val; - } while (cmpxchg(&sca->utility.val, old.val, new.val) != old.val); + } while (!try_cmpxchg(&sca->utility.val, &old.val, new.val)); read_unlock(&kvm->arch.sca_lock); } diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index a61518b549f0..9b9e7fdd5380 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -208,13 +208,12 @@ static inline int account_mem(unsigned long nr_pages) page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur_pages = atomic_long_read(&user->locked_vm); do { - cur_pages = atomic_long_read(&user->locked_vm); new_pages = cur_pages + nr_pages; if (new_pages > page_limit) return -ENOMEM; - } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, - new_pages) != cur_pages); + } while (!atomic_long_try_cmpxchg(&user->locked_vm, &cur_pages, new_pages)); atomic64_add(nr_pages, ¤t->mm->pinned_vm); -- 2.51.0 From 7061c63919bde3bb0daa63e700a117ab1f2c97b1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Nov 2024 11:25:14 +0100 Subject: [PATCH 08/16] KVM: s390: Remove one byte cmpxchg() usage Within sca_clear_ext_call() cmpxchg() is used to clear one or two bytes (depending on sca format). The cmpxchg() calls are not supposed to fail; if so that would be a bug. Given that cmpxchg() usage on one and two byte areas generates very inefficient code, replace them with block concurrent WRITE_ONCE() calls, and remove the WARN_ON(). Acked-by: Claudio Imbrenda Acked-by: Janosch Frank Link: https://lore.kernel.org/r/20241126102515.3178914-3-hca@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/kvm/interrupt.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index eff69018cbeb..ea8dce299954 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -118,8 +118,6 @@ static int sca_inject_ext_call(struct kvm_vcpu *vcpu, int src_id) static void sca_clear_ext_call(struct kvm_vcpu *vcpu) { - int rc, expect; - if (!kvm_s390_use_sca_entries()) return; kvm_s390_clear_cpuflags(vcpu, CPUSTAT_ECALL_PEND); @@ -128,23 +126,16 @@ static void sca_clear_ext_call(struct kvm_vcpu *vcpu) struct esca_block *sca = vcpu->kvm->arch.sca; union esca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union esca_sigp_ctrl old; - old = READ_ONCE(*sigp_ctrl); - expect = old.value; - rc = cmpxchg(&sigp_ctrl->value, old.value, 0); + WRITE_ONCE(sigp_ctrl->value, 0); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; union bsca_sigp_ctrl *sigp_ctrl = &(sca->cpu[vcpu->vcpu_id].sigp_ctrl); - union bsca_sigp_ctrl old; - old = READ_ONCE(*sigp_ctrl); - expect = old.value; - rc = cmpxchg(&sigp_ctrl->value, old.value, 0); + WRITE_ONCE(sigp_ctrl->value, 0); } read_unlock(&vcpu->kvm->arch.sca_lock); - WARN_ON(rc != expect); /* cannot clear? 
*/ } int psw_extint_disabled(struct kvm_vcpu *vcpu) -- 2.51.0 From f93d6d62e4697b0df1474f0b1314c3c158335f0a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 26 Nov 2024 11:25:15 +0100 Subject: [PATCH 09/16] KVM: s390: Increase size of union sca_utility to four bytes kvm_s390_update_topology_change_report() modifies a single bit within sca_utility using cmpxchg(). Given that the size of the sca_utility union is two bytes this generates very inefficient code. Change the size to four bytes, so better code can be generated. Even though the size of sca_utility no longer reflects the architecture, this seems to be the easiest and most pragmatic approach to avoid inefficient code. Acked-by: Claudio Imbrenda Acked-by: Janosch Frank Link: https://lore.kernel.org/r/20241126102515.3178914-4-hca@linux.ibm.com Signed-off-by: Heiko Carstens --- arch/s390/include/asm/kvm_host.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 51201b4ac93a..7ef37f746d47 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -94,11 +94,16 @@ union ipte_control { }; }; +/* + * Utility is defined as two bytes but having it four bytes wide + * generates more efficient code. Since the following bytes are + * reserved this makes no functional difference. + */ union sca_utility { - __u16 val; + __u32 val; struct { - __u16 mtcr : 1; - __u16 reserved : 15; + __u32 mtcr : 1; + __u32 : 31; }; }; @@ -107,7 +112,7 @@ struct bsca_block { __u64 reserved[5]; __u64 mcn; union sca_utility utility; - __u8 reserved2[6]; + __u8 reserved2[4]; struct bsca_entry cpu[KVM_S390_BSCA_CPU_SLOTS]; }; @@ -115,7 +120,7 @@ struct esca_block { union ipte_control ipte_control; __u64 reserved1[6]; union sca_utility utility; - __u8 reserved2[6]; + __u8 reserved2[4]; __u64 mcn[4]; __u64 reserved3[20]; struct esca_entry cpu[KVM_S390_ESCA_CPU_SLOTS]; -- 2.51.0 From ae1b9fb2d556e95001399dcd4e228525aead5122 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 21 Nov 2024 18:45:20 +0100 Subject: [PATCH 10/16] s390/mm: Rearrange region-third and segment table entry SW bits Rearrange region-third and segment table entry SW bits, in order to make room for future encoding of region/segment table swap entries. Also adjust _SEGMENT_ENTRY_GMAP_UC and _SEGMENT_ENTRY_GMAP_IN bits in gmap code. Those should only apply to gmap PMDs, and not really depend on or conflict with host PMD bits, but for consistency also adjust them: - _SEGMENT_ENTRY_GMAP_UC "dirty (migration)" was using the same bit as _SEGMENT_ENTRY_SOFT_DIRTY in the host PMD -> make it use the new SOFT_DIRTY bit 63 (0x0002) - _SEGMENT_ENTRY_GMAP_IN "invalidation notify bit" was using 0x8000, which was an unused bit in the host PMD, that is now used for _SEGMENT_ENTRY_WRITE -> make it use bit 52 (0x0800) instead, which is still unused in the host PMD This is a prerequisite for hugetlbfs PTE_MARKER support on s390, which is needed to fix a regression introduced with commit 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs"). That commit depends on the availability of swap entries for hugetlbfs, which were not available for s390 so far.
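The new bit values can be sanity-checked mechanically. A compile-time sketch (constants copied from the patch; the check itself is illustrative and not part of the kernel):

#include <assert.h>

/* Host PMD SW bits after the rearrangement */
#define _SEGMENT_ENTRY_WRITE            0x8000
#define _SEGMENT_ENTRY_READ             0x4000
#define _SEGMENT_ENTRY_SOFT_DIRTY       0x0002
/* gmap-only bits; GMAP_UC intentionally shares the new SOFT_DIRTY value */
#define _SEGMENT_ENTRY_GMAP_IN          0x0800
#define _SEGMENT_ENTRY_GMAP_UC          0x0002

/* The invalidation-notify bit must stay clear of the host SW bits */
static_assert((_SEGMENT_ENTRY_GMAP_IN &
               (_SEGMENT_ENTRY_WRITE | _SEGMENT_ENTRY_READ |
                _SEGMENT_ENTRY_SOFT_DIRTY)) == 0,
              "GMAP_IN collides with a host PMD SW bit");

int main(void) { return 0; }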
Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Heiko Carstens --- arch/s390/include/asm/gmap.h | 4 ++-- arch/s390/include/asm/pgtable.h | 13 +++++++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h index 64761c78f774..13f51a6a5bb1 100644 --- a/arch/s390/include/asm/gmap.h +++ b/arch/s390/include/asm/gmap.h @@ -17,8 +17,8 @@ #define GMAP_NOTIFY_MPROT 0x1 /* Status bits only for huge segment entries */ -#define _SEGMENT_ENTRY_GMAP_IN 0x8000 /* invalidation notify bit */ -#define _SEGMENT_ENTRY_GMAP_UC 0x4000 /* dirty (migration) */ +#define _SEGMENT_ENTRY_GMAP_IN 0x0800 /* invalidation notify bit */ +#define _SEGMENT_ENTRY_GMAP_UC 0x0002 /* dirty (migration) */ /** * struct gmap_struct - guest address space diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0ffbaf741955..ee5d047a2d67 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -286,11 +286,11 @@ static inline int is_module_addr(void *addr) #define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */ #define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */ #define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */ -#define _REGION3_ENTRY_WRITE 0x0002 /* SW region write bit */ -#define _REGION3_ENTRY_READ 0x0001 /* SW region read bit */ +#define _REGION3_ENTRY_WRITE 0x8000 /* SW region write bit */ +#define _REGION3_ENTRY_READ 0x4000 /* SW region read bit */ #ifdef CONFIG_MEM_SOFT_DIRTY -#define _REGION3_ENTRY_SOFT_DIRTY 0x4000 /* SW region soft dirty bit */ +#define _REGION3_ENTRY_SOFT_DIRTY 0x0002 /* SW region soft dirty bit */ #else #define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */ #endif @@ -313,12 +313,13 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ #define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ + #define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ -#define _SEGMENT_ENTRY_WRITE 0x0002 /* SW segment write bit */ -#define _SEGMENT_ENTRY_READ 0x0001 /* SW segment read bit */ +#define _SEGMENT_ENTRY_WRITE 0x8000 /* SW segment write bit */ +#define _SEGMENT_ENTRY_READ 0x4000 /* SW segment read bit */ #ifdef CONFIG_MEM_SOFT_DIRTY -#define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */ +#define _SEGMENT_ENTRY_SOFT_DIRTY 0x0002 /* SW segment soft dirty bit */ #else #define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */ #endif -- 2.51.0 From 03e6db16b808afbe23e10617f2d18578846bdce0 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 21 Nov 2024 18:45:21 +0100 Subject: [PATCH 11/16] s390/mm: Introduce region-third and segment table entry present bits Introduce region-third and segment table entry present SW bits, and adjust pmd/pud_present() accordingly. Also add pmd/pud_present() checks to pmd/pud_leaf(), to return false for future swap entries. Same logic applies to pmd_trans_huge(), make that return pmd_leaf() instead of duplicating the same check. huge_pte_offset() also needs to be adjusted, current code would return NULL for !pud_present(). Use the same logic as in the generic version, which allows for !pud_present() swap entries. Similar to PTE, bit 63 can be used for the new SW present bit in region and segment table entries. For segment-table entries (PMD) the architecture says that "Bits 62-63 are available for programming", so they are safe to use. 
The same is true for large leaf region-third-table entries (PUD). However, for non-leaf region-third-table entries, bits 62-63 indicate the TABLE LENGTH and both must be set to 1. But such entries would always be considered as present, so it is safe to use bit 63 as PRESENT bit for PUD. They also should not conflict with bit 62 potentially later used for preserving SOFT_DIRTY in swap entries, because they are not swap entries. Valid PMDs / PUDs should always have the present bit set, so add it to the various pgprot defines, and also _SEGMENT_ENTRY which is OR'ed e.g. in pmd_populate(). _REGION3_ENTRY wouldn't need any change, as the present bit is already included in the TABLE LENGTH, but also explicitly add it there, for completeness, and just in case the bit would ever be changed. gmap code needs some adjustment, to also OR the _SEGMENT_ENTRY, like it is already done in gmap_shadow_pgt() when creating new PMDs, but not in __gmap_link(). Otherwise, the gmap PMDs would not be considered present, e.g. when using pmd_leaf() checks in gmap code. The various WARN_ON checks in gmap code also need adjustment, to tolerate the new present bit. This is a prerequisite for hugetlbfs PTE_MARKER support on s390, which is needed to fix a regression introduced with commit 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs"). That commit depends on the availability of swap entries for hugetlbfs, which were not available for s390 so far. Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Heiko Carstens --- arch/s390/include/asm/pgtable.h | 51 ++++++++++++++++++++++----------- arch/s390/mm/gmap.c | 12 +++++--- arch/s390/mm/hugetlbpage.c | 8 +++--- 3 files changed, 47 insertions(+), 24 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index ee5d047a2d67..8213e05770f8 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -277,7 +277,8 @@ static inline int is_module_addr(void *addr) #define _REGION1_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID) #define _REGION2_ENTRY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_LENGTH) #define _REGION2_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID) -#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH) +#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH | \ + _REGION3_ENTRY_PRESENT) #define _REGION3_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID) #define _REGION3_ENTRY_HARDWARE_BITS 0xfffffffffffff6ffUL @@ -297,6 +298,14 @@ static inline int is_module_addr(void *addr) #define _REGION_ENTRY_BITS 0xfffffffffffff22fUL +/* + * SW region present bit. For non-leaf region-third-table entries, bits 62-63 + * indicate the TABLE LENGTH and both must be set to 1. But such entries + * would always be considered as present, so it is safe to use bit 63 as + * PRESENT bit for PUD.
+ */ +#define _REGION3_ENTRY_PRESENT 0x0001 + /* Bits in the segment table entry */ #define _SEGMENT_ENTRY_BITS 0xfffffffffffffe3fUL #define _SEGMENT_ENTRY_HARDWARE_BITS 0xfffffffffffffe3cUL @@ -308,7 +317,7 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */ #define _SEGMENT_ENTRY_TYPE_MASK 0x0c /* segment table type mask */ -#define _SEGMENT_ENTRY (0) +#define _SEGMENT_ENTRY (_SEGMENT_ENTRY_PRESENT) #define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INVALID) #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ @@ -324,6 +333,8 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */ #endif +#define _SEGMENT_ENTRY_PRESENT 0x0001 /* SW segment present bit */ + #define _CRST_ENTRIES 2048 /* number of region/segment table entries */ #define _PAGE_ENTRIES 256 /* number of page table entries */ @@ -455,17 +466,22 @@ static inline int is_module_addr(void *addr) /* * Segment entry (large page) protection definitions. */ -#define SEGMENT_NONE __pgprot(_SEGMENT_ENTRY_INVALID | \ +#define SEGMENT_NONE __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_INVALID | \ _SEGMENT_ENTRY_PROTECT) -#define SEGMENT_RO __pgprot(_SEGMENT_ENTRY_PROTECT | \ +#define SEGMENT_RO __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_PROTECT | \ _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_NOEXEC) -#define SEGMENT_RX __pgprot(_SEGMENT_ENTRY_PROTECT | \ +#define SEGMENT_RX __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_PROTECT | \ _SEGMENT_ENTRY_READ) -#define SEGMENT_RW __pgprot(_SEGMENT_ENTRY_READ | \ +#define SEGMENT_RW __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_WRITE | \ _SEGMENT_ENTRY_NOEXEC) -#define SEGMENT_RWX __pgprot(_SEGMENT_ENTRY_READ | \ +#define SEGMENT_RWX __pgprot(_SEGMENT_ENTRY_PRESENT | \ + _SEGMENT_ENTRY_READ | \ _SEGMENT_ENTRY_WRITE) #define SEGMENT_KERNEL __pgprot(_SEGMENT_ENTRY | \ _SEGMENT_ENTRY_LARGE | \ @@ -492,6 +508,7 @@ static inline int is_module_addr(void *addr) */ #define REGION3_KERNEL __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_PRESENT | \ _REGION3_ENTRY_LARGE | \ _REGION3_ENTRY_READ | \ _REGION3_ENTRY_WRITE | \ @@ -499,12 +516,14 @@ static inline int is_module_addr(void *addr) _REGION3_ENTRY_DIRTY | \ _REGION_ENTRY_NOEXEC) #define REGION3_KERNEL_RO __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_PRESENT | \ _REGION3_ENTRY_LARGE | \ _REGION3_ENTRY_READ | \ _REGION3_ENTRY_YOUNG | \ _REGION_ENTRY_PROTECT | \ _REGION_ENTRY_NOEXEC) #define REGION3_KERNEL_EXEC __pgprot(_REGION_ENTRY_TYPE_R3 | \ + _REGION3_ENTRY_PRESENT | \ _REGION3_ENTRY_LARGE | \ _REGION3_ENTRY_READ | \ _REGION3_ENTRY_WRITE | \ @@ -747,7 +766,7 @@ static inline int pud_present(pud_t pud) { if (pud_folded(pud)) return 1; - return (pud_val(pud) & _REGION_ENTRY_ORIGIN) != 0UL; + return (pud_val(pud) & _REGION3_ENTRY_PRESENT) != 0; } static inline int pud_none(pud_t pud) @@ -762,13 +781,18 @@ static inline bool pud_leaf(pud_t pud) { if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) != _REGION_ENTRY_TYPE_R3) return 0; - return !!(pud_val(pud) & _REGION3_ENTRY_LARGE); + return (pud_present(pud) && (pud_val(pud) & _REGION3_ENTRY_LARGE) != 0); +} + +static inline int pmd_present(pmd_t pmd) +{ + return (pmd_val(pmd) & _SEGMENT_ENTRY_PRESENT) != 0; } #define pmd_leaf pmd_leaf static inline bool pmd_leaf(pmd_t pmd) { - return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0; + return (pmd_present(pmd) && (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0); } static inline int 
pmd_bad(pmd_t pmd) @@ -800,11 +824,6 @@ static inline int p4d_bad(p4d_t p4d) return (p4d_val(p4d) & ~_REGION_ENTRY_BITS) != 0; } -static inline int pmd_present(pmd_t pmd) -{ - return pmd_val(pmd) != _SEGMENT_ENTRY_EMPTY; -} - static inline int pmd_none(pmd_t pmd) { return pmd_val(pmd) == _SEGMENT_ENTRY_EMPTY; @@ -1852,7 +1871,7 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, static inline int pmd_trans_huge(pmd_t pmd) { - return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE; + return pmd_leaf(pmd); } #define has_transparent_hugepage has_transparent_hugepage diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 329682655af2..404489ecaf8a 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -587,7 +587,8 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) if (pmd_leaf(*pmd)) { *table = (pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS_LARGE) - | _SEGMENT_ENTRY_GMAP_UC; + | _SEGMENT_ENTRY_GMAP_UC + | _SEGMENT_ENTRY; } else *table = pmd_val(*pmd) & _SEGMENT_ENTRY_HARDWARE_BITS; @@ -2396,7 +2397,8 @@ static void gmap_pmdp_clear(struct mm_struct *mm, unsigned long vmaddr, gaddr = __gmap_segment_gaddr((unsigned long *)pmdp); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(pmd_val(*pmdp) & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (purge) __pmdp_csp(pmdp); set_pmd(pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); @@ -2450,7 +2452,8 @@ void gmap_pmdp_idte_local(struct mm_struct *mm, unsigned long vmaddr) gaddr = __gmap_segment_gaddr(entry); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_LOCAL); @@ -2485,7 +2488,8 @@ void gmap_pmdp_idte_global(struct mm_struct *mm, unsigned long vmaddr) gaddr = __gmap_segment_gaddr(entry); pmdp_notify_gmap(gmap, pmdp, gaddr); WARN_ON(*entry & ~(_SEGMENT_ENTRY_HARDWARE_BITS_LARGE | - _SEGMENT_ENTRY_GMAP_UC)); + _SEGMENT_ENTRY_GMAP_UC | + _SEGMENT_ENTRY)); if (MACHINE_HAS_TLB_GUEST) __pmdp_idte(gaddr, pmdp, IDTE_GUEST_ASCE, gmap->asce, IDTE_GLOBAL); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index ded0eff58a19..a65bebbb5a34 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -48,6 +48,7 @@ static inline unsigned long __pte_to_rste(pte_t pte) */ if (pte_present(pte)) { rste = pte_val(pte) & PAGE_MASK; + rste |= _SEGMENT_ENTRY_PRESENT; rste |= move_set_bit(pte_val(pte), _PAGE_READ, _SEGMENT_ENTRY_READ); rste |= move_set_bit(pte_val(pte), _PAGE_WRITE, @@ -223,11 +224,10 @@ pte_t *huge_pte_offset(struct mm_struct *mm, p4dp = p4d_offset(pgdp, addr); if (p4d_present(*p4dp)) { pudp = pud_offset(p4dp, addr); - if (pud_present(*pudp)) { - if (pud_leaf(*pudp)) - return (pte_t *) pudp; + if (sz == PUD_SIZE) + return (pte_t *)pudp; + if (pud_present(*pudp)) pmdp = pmd_offset(pudp, addr); - } } } return (pte_t *) pmdp; -- 2.51.0 From f934f6be76c1d033692f7fbfcbb06ec44a25bf3f Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 21 Nov 2024 18:45:22 +0100 Subject: [PATCH 12/16] s390/mm: Introduce region-third and segment table swap entries Introduce region-third (PUD) and segment table (PMD) swap entries, and make hugetlbfs RSTE <-> PTE conversion code aware of them, so that they can be used for hugetlbfs PTE_MARKER entries. Future work could also build on this to enable THP_SWAP and THP_MIGRATION for s390. 
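The encoding described in the following paragraphs can be exercised in isolation. A userspace round-trip sketch (constants and packing copied from the patch; main() is purely an illustrative driver):

#include <stdio.h>

#define _RST_ENTRY_INVALID      0x0020UL
#define _RST_ENTRY_COMM         0x0010UL
#define __SWP_OFFSET_MASK_RSTE  ((1UL << 52) - 1)
#define __SWP_OFFSET_SHIFT_RSTE 12
#define __SWP_TYPE_MASK_RSTE    ((1UL << 5) - 1)
#define __SWP_TYPE_SHIFT_RSTE   6

/* Offset lives in bits 0-51 and type in bits 53-57 (IBM bit numbering) */
static unsigned long mk_swap_rste(unsigned long type, unsigned long offset)
{
        unsigned long rste = _RST_ENTRY_INVALID | _RST_ENTRY_COMM;

        rste |= (offset & __SWP_OFFSET_MASK_RSTE) << __SWP_OFFSET_SHIFT_RSTE;
        rste |= (type & __SWP_TYPE_MASK_RSTE) << __SWP_TYPE_SHIFT_RSTE;
        return rste;
}

int main(void)
{
        unsigned long rste = mk_swap_rste(3, 0x123456);

        /* Swap marker: COMM set, PRESENT clear, i.e. (rste & 0x11) == 0x10 */
        printf("type=%lu offset=0x%lx swap-marker=%d\n",
               (rste >> __SWP_TYPE_SHIFT_RSTE) & __SWP_TYPE_MASK_RSTE,
               (rste >> __SWP_OFFSET_SHIFT_RSTE) & __SWP_OFFSET_MASK_RSTE,
               (rste & 0x11UL) == 0x10UL);
        return 0;
}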
Similar to PTE swap entries, bits 0-51 can be used to store the swap offset, but bits 57-61 cannot be used for swap type because that overlaps with the INVALID and TABLE TYPE bits. PMD/PUD swap entries must be invalid, and have a correct table type so that pud_folded() check still works. Bits 53-57 can be used for swap type, but those include the PROTECT bit. So unlike swap PTEs, the PROTECT bit cannot be used to mark the swap entry. Use the "Common-Segment/Region" bit 59 instead for that. Also remove the !MACHINE_HAS_NX check in __set_huge_pte_at(). Otherwise, that would clear the _SEGMENT_ENTRY_NOEXEC bit also for swap entries, where it is used for encoding the swap type. The architecture only requires this bit to be 0 for PTEs, with !MACHINE_HAS_NX, not for segment or region-third entries. And the check is also redundant, because after __pte_to_rste() conversion, for non-swap PTEs it would only be set if it was already set in the PTE, which should never be the case for !MACHINE_HAS_NX. This is a prerequisite for hugetlbfs PTE_MARKER support on s390, which is needed to fix a regression introduced with commit 8a13897fb0da ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs"). That commit depends on the availability of swap entries for hugetlbfs, which were not available for s390 so far. Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Heiko Carstens --- arch/s390/include/asm/pgtable.h | 53 +++++++++++++++++++++++++++++++++ arch/s390/mm/hugetlbpage.c | 23 ++++++++++---- 2 files changed, 71 insertions(+), 5 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 8213e05770f8..300affac1698 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -286,6 +286,7 @@ static inline int is_module_addr(void *addr) #define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address */ #define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */ #define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */ +#define _REGION3_ENTRY_COMM 0x0010 /* Common-Region, marks swap entry */ #define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */ #define _REGION3_ENTRY_WRITE 0x8000 /* SW region write bit */ #define _REGION3_ENTRY_READ 0x4000 /* SW region read bit */ @@ -323,6 +324,7 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */ #define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */ +#define _SEGMENT_ENTRY_COMM 0x0010 /* Common-Segment, marks swap entry */ #define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */ #define _SEGMENT_ENTRY_WRITE 0x8000 /* SW segment write bit */ #define _SEGMENT_ENTRY_READ 0x4000 /* SW segment read bit */ @@ -335,6 +337,10 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_PRESENT 0x0001 /* SW segment present bit */ +/* Common bits in region and segment table entries, for swap entries */ +#define _RST_ENTRY_COMM 0x0010 /* Common-Region/Segment, marks swap entry */ +#define _RST_ENTRY_INVALID 0x0020 /* invalid region/segment table entry */ + #define _CRST_ENTRIES 2048 /* number of region/segment table entries */ #define _PAGE_ENTRIES 256 /* number of page table entries */ @@ -1931,6 +1937,53 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val(pte) }) #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +/* + * 64 bit swap entry format for REGION3 and SEGMENT table entries 
(RSTE) + * Bits 59 and 63 are used to indicate the swap entry. Bit 58 marks the rste + * as invalid. + * A swap entry is indicated by bit pattern (rste & 0x011) == 0x010 + * | offset |Xtype |11TT|S0| + * |0000000000111111111122222222223333333333444444444455|555555|5566|66| + * |0123456789012345678901234567890123456789012345678901|234567|8901|23| + * + * Bits 0-51 store the offset. + * Bits 53-57 store the type. + * Bit 62 (S) is used for softdirty tracking. + * Bits 60-61 (TT) indicate the table type: 0x01 for REGION3 and 0x00 for SEGMENT. + * Bit 52 (X) is unused. + */ + +#define __SWP_OFFSET_MASK_RSTE ((1UL << 52) - 1) +#define __SWP_OFFSET_SHIFT_RSTE 12 +#define __SWP_TYPE_MASK_RSTE ((1UL << 5) - 1) +#define __SWP_TYPE_SHIFT_RSTE 6 + +/* + * TT bits set to 0x00 == SEGMENT. For REGION3 entries, caller must add R3 + * bits 0x01. See also __set_huge_pte_at(). + */ +static inline unsigned long mk_swap_rste(unsigned long type, unsigned long offset) +{ + unsigned long rste; + + rste = _RST_ENTRY_INVALID | _RST_ENTRY_COMM; + rste |= (offset & __SWP_OFFSET_MASK_RSTE) << __SWP_OFFSET_SHIFT_RSTE; + rste |= (type & __SWP_TYPE_MASK_RSTE) << __SWP_TYPE_SHIFT_RSTE; + return rste; +} + +static inline unsigned long __swp_type_rste(swp_entry_t entry) +{ + return (entry.val >> __SWP_TYPE_SHIFT_RSTE) & __SWP_TYPE_MASK_RSTE; +} + +static inline unsigned long __swp_offset_rste(swp_entry_t entry) +{ + return (entry.val >> __SWP_OFFSET_SHIFT_RSTE) & __SWP_OFFSET_MASK_RSTE; +} + +#define __rste_to_swp_entry(rste) ((swp_entry_t) { rste }) + extern int vmem_add_mapping(unsigned long start, unsigned long size); extern void vmem_remove_mapping(unsigned long start, unsigned long size); extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index a65bebbb5a34..c58a67416a04 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -24,6 +24,7 @@ static inline unsigned long __pte_to_rste(pte_t pte) { + swp_entry_t arch_entry; unsigned long rste; /* @@ -67,6 +68,10 @@ static inline unsigned long __pte_to_rste(pte_t pte) #endif rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC, _SEGMENT_ENTRY_NOEXEC); + } else if (!pte_none(pte)) { + /* swap pte */ + arch_entry = __pte_to_swp_entry(pte); + rste = mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry)); } else rste = _SEGMENT_ENTRY_EMPTY; return rste; @@ -74,13 +79,18 @@ static inline unsigned long __pte_to_rste(pte_t pte) static inline pte_t __rste_to_pte(unsigned long rste) { + swp_entry_t arch_entry; unsigned long pteval; - int present; + int present, none; + pte_t pte; - if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) + if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { present = pud_present(__pud(rste)); - else + none = pud_none(__pud(rste)); + } else { present = pmd_present(__pmd(rste)); + none = pmd_none(__pmd(rste)); + } /* * Convert encoding pmd / pud bits pte bits @@ -115,6 +125,11 @@ static inline pte_t __rste_to_pte(unsigned long rste) pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY); #endif pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC); + } else if (!none) { + /* swap rste */ + arch_entry = __rste_to_swp_entry(rste); + pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry)); + pteval = pte_val(pte); } else pteval = _PAGE_INVALID; return __pte(pteval); @@ -149,8 +164,6 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long 
addr, unsigned long rste; rste = __pte_to_rste(pte); - if (!MACHINE_HAS_NX) - rste &= ~_SEGMENT_ENTRY_NOEXEC; /* Set correct table type for 2G hugepages */ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) { -- 2.51.0 From 487ef5d4d912f6a32556d8a61cede17870925295 Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Thu, 21 Nov 2024 18:45:23 +0100 Subject: [PATCH 13/16] s390/mm: Add PTE_MARKER support for hugetlbfs mappings Commit 8a13897fb0daa ("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs") added support for PTE_MARKER_POISONED for hugetlbfs, but PTE_MARKER also needs support for swap entries. For s390, swap entries were only supported on PTE level, not on the PMD/PUD levels that are used for large hugetlbfs mappings. Therefore, when writing a PTE_MARKER_POISONED entry, the resulting entry on PMD/PUD level would be an invalid / empty entry. Further access would then generate a pagefault loop, instead of the expected SIGBUS. It is a loop inside the kernel, but interruptible and uffd fault handling also calls schedule() in between, so at least it won't completely block the system. Previous commits prepared support for swap entries on PMD/PUD levels. PTE_MARKER support for hugetlbfs can now be enabled by simply adding an extra is_pte_marker() check to huge_pte_none_mostly(). Fault handling code also needs to be adjusted to expect the VM_FAULT_HWPOISON_LARGE fault flag, which was not possible on s390 before. Reviewed-by: Alexander Gordeev Signed-off-by: Gerald Schaefer Signed-off-by: Heiko Carstens --- arch/s390/include/asm/hugetlb.h | 2 +- arch/s390/mm/fault.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index cf1b5d6fb1a6..0f621c99b216 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -91,7 +91,7 @@ static inline int huge_pte_none(pte_t pte) static inline int huge_pte_none_mostly(pte_t pte) { - return huge_pte_none(pte); + return huge_pte_none(pte) || is_pte_marker(pte); } static inline int huge_pte_write(pte_t pte) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 94cb2e092075..4ca35545010f 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -338,7 +338,8 @@ done: handle_fault_error_nolock(regs, 0); else do_sigsegv(regs, SEGV_MAPERR); - } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)) { + } else if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { if (!user_mode(regs)) handle_fault_error_nolock(regs, 0); else -- 2.51.0 From adb44a4bfc8a3312d2a13cc09d2b89d5828e7702 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 28 Nov 2024 09:43:29 +0100 Subject: [PATCH 14/16] s390/mm/hugetlbfs: Add missing includes Add missing includes to fix this randconfig compile error: All errors (new ones prefixed by >>): In file included from mm/pagewalk.c:5: In file included from include/linux/hugetlb.h:798: >> arch/s390/include/asm/hugetlb.h:94:31: error: call to undeclared function 'is_pte_marker'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] 94 | return huge_pte_none(pte) || is_pte_marker(pte); | ^ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202411281002.IPkRpIcR-lkp@intel.com/ Fixes: 487ef5d4d912 ("s390/mm: Add PTE_MARKER support for hugetlbfs mappings") Signed-off-by: Heiko Carstens --- arch/s390/include/asm/hugetlb.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/include/asm/hugetlb.h 
b/arch/s390/include/asm/hugetlb.h index 0f621c99b216..5bdb35991ecb 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -10,6 +10,8 @@ #define _ASM_S390_HUGETLB_H #include +#include +#include #include #define hugetlb_free_pgd_range free_pgd_range -- 2.51.0 From 48796104c864cf4dafa80bd8c2ce88f9c92a65ea Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Mon, 25 Nov 2024 10:35:10 +0100 Subject: [PATCH 15/16] s390/pci: Fix leak of struct zpci_dev when zpci_add_device() fails Prior to commit 0467cdde8c43 ("s390/pci: Sort PCI functions prior to creating virtual busses") the IOMMU was initialized and the device was registered as part of zpci_create_device() with the struct zpci_dev freed if either resulted in an error. With that commit this was moved into a separate function called zpci_add_device(). While this new function logs when adding failed, it expects the caller not to use and to free the struct zpci_dev on error. This difference between it and zpci_create_device() was missed while changing the callers and the incompletely initialized struct zpci_dev may get used in zpci_scan_configured_device in the error path. This then leads to a crash due to the device not being registered with the zbus. It was also not freed in this case. Fix this by handling the error return of zpci_add_device(). Since in this case the zdev was not added to the zpci_list it can simply be discarded and freed. Also make this more explicit by moving the kref_init() into zpci_add_device() and document that zpci_zdev_get()/zpci_zdev_put() must be used after adding. Cc: stable@vger.kernel.org Fixes: 0467cdde8c43 ("s390/pci: Sort PCI functions prior to creating virtual busses") Reviewed-by: Gerd Bayer Reviewed-by: Matthew Rosato Signed-off-by: Niklas Schnelle Signed-off-by: Heiko Carstens --- arch/s390/pci/pci.c | 21 +++++++++++++++++---- arch/s390/pci/pci_event.c | 10 ++++++++-- 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index b7efa96776ea..6a011d040dfe 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -776,8 +776,9 @@ int zpci_hot_reset_device(struct zpci_dev *zdev) * @fh: Current Function Handle of the device to be created * @state: Initial state after creation either Standby or Configured * - * Creates a new zpci device and adds it to its, possibly newly created, zbus - * as well as zpci_list. + * Allocates a new struct zpci_dev and queries the platform for its details. + * If successful the device can subsequently be added to the zPCI subsystem + * using zpci_add_device(). * * Returns: the zdev on success or an error pointer otherwise */ @@ -800,7 +801,6 @@ struct zpci_dev *zpci_create_device(u32 fid, u32 fh, enum zpci_state state) goto error; zdev->state = state; - kref_init(&zdev->kref); mutex_init(&zdev->state_lock); mutex_init(&zdev->fmb_lock); mutex_init(&zdev->kzdev_lock); @@ -813,6 +813,17 @@ error: return ERR_PTR(rc); } +/** + * zpci_add_device() - Add a previously created zPCI device to the zPCI subsystem + * @zdev: The zPCI device to be added + * + * A struct zpci_dev is added to the zPCI subsystem and to a virtual PCI bus creating + * a new one as necessary. A hotplug slot is created and events start to be handled. + * If successful from this point on zpci_zdev_get() and zpci_zdev_put() must be used. + * If adding the struct zpci_dev fails the device was not added and should be freed. 
+ * + * Return: 0 on success, or an error code otherwise + */ int zpci_add_device(struct zpci_dev *zdev) { int rc; @@ -826,6 +837,7 @@ int zpci_add_device(struct zpci_dev *zdev) if (rc) goto error_destroy_iommu; + kref_init(&zdev->kref); spin_lock(&zpci_list_lock); list_add_tail(&zdev->entry, &zpci_list); spin_unlock(&zpci_list_lock); @@ -1118,7 +1130,8 @@ static void zpci_add_devices(struct list_head *scan_list) list_sort(NULL, scan_list, &zpci_cmp_rid); list_for_each_entry_safe(zdev, tmp, scan_list, entry) { list_del_init(&zdev->entry); - zpci_add_device(zdev); + if (zpci_add_device(zdev)) + kfree(zdev); } } diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index 47f934f4e828..7f7b732b3f3e 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -340,7 +340,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_CONFIGURED); if (IS_ERR(zdev)) break; - zpci_add_device(zdev); + if (zpci_add_device(zdev)) { + kfree(zdev); + break; + } } else { /* the configuration request may be stale */ if (zdev->state != ZPCI_FN_STATE_STANDBY) @@ -354,7 +357,10 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) zdev = zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY); if (IS_ERR(zdev)) break; - zpci_add_device(zdev); + if (zpci_add_device(zdev)) { + kfree(zdev); + break; + } } else { zpci_update_fh(zdev, ccdf->fh); } -- 2.51.0 From c4a585e952ca403a370586d3f16e8331a7564901 Mon Sep 17 00:00:00 2001 From: Niklas Schnelle Date: Mon, 25 Nov 2024 16:02:38 +0100 Subject: [PATCH 16/16] s390/pci: Fix potential double remove of hotplug slot In commit 6ee600bfbe0f ("s390/pci: remove hotplug slot when releasing the device") zpci_exit_slot() was moved from zpci_device_reserved() to zpci_release_device() with the intention of keeping the hotplug slot around until the device is actually removed. Now zpci_release_device() is only called once all references are dropped. Since the zPCI subsystem only drops its reference once the device is in the reserved state, it follows that zpci_release_device() must only deal with devices in the reserved state. Despite that, it contains code to tear down from both the configured and standby states. For the standby case this already includes the removal of the hotplug slot, and thus would cause a double removal if a device was ever removed in either configured or standby state. Instead of causing a potential double removal in a case that should never happen, explicitly WARN_ON() if a device in non-reserved state is released, and get rid of the dead code cases.
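The resulting lifetime rule is simple: the final reference is only ever dropped on a device in the reserved state, so the release path has to handle exactly that one state. A hypothetical userspace miniature of the discipline (not the kernel code; assert() stands in for WARN_ON()):

#include <assert.h>
#include <stdio.h>

enum zdev_state { ZDEV_CONFIGURED, ZDEV_STANDBY, ZDEV_RESERVED };

struct zdev {
        int kref;
        enum zdev_state state;
};

/* Release handles only ZDEV_RESERVED instead of switching over all states */
static void zdev_release(struct zdev *z)
{
        assert(z->state == ZDEV_RESERVED);
        puts("tearing down reserved device");
}

static void zdev_put(struct zdev *z)
{
        if (--z->kref == 0)
                zdev_release(z);
}

int main(void)
{
        struct zdev z = { .kref = 1, .state = ZDEV_RESERVED };

        zdev_put(&z);
        return 0;
}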
Fixes: 6ee600bfbe0f ("s390/pci: remove hotplug slot when releasing the device") Reviewed-by: Matthew Rosato Reviewed-by: Gerd Bayer Tested-by: Gerd Bayer Signed-off-by: Niklas Schnelle Signed-off-by: Heiko Carstens --- arch/s390/pci/pci.c | 34 +++++++++------------------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 6a011d040dfe..7aeab522f28d 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -937,10 +937,8 @@ void zpci_device_reserved(struct zpci_dev *zdev) void zpci_release_device(struct kref *kref) { struct zpci_dev *zdev = container_of(kref, struct zpci_dev, kref); - int ret; - if (zdev->has_hp_slot) - zpci_exit_slot(zdev); + WARN_ON(zdev->state != ZPCI_FN_STATE_RESERVED); if (zdev->zbus->bus) zpci_bus_remove_device(zdev, false); @@ -948,28 +946,14 @@ void zpci_release_device(struct kref *kref) if (zdev_enabled(zdev)) zpci_disable_device(zdev); - switch (zdev->state) { - case ZPCI_FN_STATE_CONFIGURED: - ret = sclp_pci_deconfigure(zdev->fid); - zpci_dbg(3, "deconf fid:%x, rc:%d\n", zdev->fid, ret); - fallthrough; - case ZPCI_FN_STATE_STANDBY: - if (zdev->has_hp_slot) - zpci_exit_slot(zdev); - spin_lock(&zpci_list_lock); - list_del(&zdev->entry); - spin_unlock(&zpci_list_lock); - zpci_dbg(3, "rsv fid:%x\n", zdev->fid); - fallthrough; - case ZPCI_FN_STATE_RESERVED: - if (zdev->has_resources) - zpci_cleanup_bus_resources(zdev); - zpci_bus_device_unregister(zdev); - zpci_destroy_iommu(zdev); - fallthrough; - default: - break; - } + if (zdev->has_hp_slot) + zpci_exit_slot(zdev); + + if (zdev->has_resources) + zpci_cleanup_bus_resources(zdev); + + zpci_bus_device_unregister(zdev); + zpci_destroy_iommu(zdev); zpci_dbg(3, "rem fid:%x\n", zdev->fid); kfree_rcu(zdev, rcu); } -- 2.51.0