From: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Date: Thu, 21 Nov 2024 17:45:22 +0000 (+0100)
Subject: s390/mm: Introduce region-third and segment table swap entries
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=f934f6be76c1d033692f7fbfcbb06ec44a25bf3f;p=users%2Fdwmw2%2Flinux.git

s390/mm: Introduce region-third and segment table swap entries

Introduce region-third (PUD) and segment table (PMD) swap entries, and
make hugetlbfs RSTE <-> PTE conversion code aware of them, so that they
can be used for hugetlbfs PTE_MARKER entries. Future work could also
build on this to enable THP_SWAP and THP_MIGRATION for s390.

Similar to PTE swap entries, bits 0-51 can be used to store the swap
offset, but bits 57-61 cannot be used for swap type because that overlaps
with the INVALID and TABLE TYPE bits. PMD/PUD swap entries must be invalid,
and have a correct table type so that pud_folded() check still works.

Bits 53-57 can be used for swap type, but those include the PROTECT bit.
So unlike swap PTEs, the PROTECT bit cannot be used to mark the swap entry.
Use the "Common-Segment/Region" bit 59 instead for that.

Also remove the !MACHINE_HAS_NX check in __set_huge_pte_at(). Otherwise,
that would clear the _SEGMENT_ENTRY_NOEXEC bit also for swap entries, where
it is used for encoding the swap type. The architecture only requires this
bit to be 0 for PTEs, with !MACHINE_HAS_NX, not for segment or region-third
entries. And the check is also redundant, because after __pte_to_rste()
conversion, for non-swap PTEs it would only be set if it was already set in
the PTE, which should never be the case for !MACHINE_HAS_NX.

This is a prerequisite for hugetlbfs PTE_MARKER support on s390, which
is needed to fix a regression introduced with commit 8a13897fb0da
("mm: userfaultfd: support UFFDIO_POISON for hugetlbfs"). That commit
depends on the availability of swap entries for hugetlbfs, which were
not available for s390 so far.

Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
---

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 8213e05770f8b..300affac1698b 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -286,6 +286,7 @@ static inline int is_module_addr(void *addr)
 #define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address	     */
 #define _REGION3_ENTRY_DIRTY	0x2000	/* SW region dirty bit */
 #define _REGION3_ENTRY_YOUNG	0x1000	/* SW region young bit */
+#define _REGION3_ENTRY_COMM	0x0010	/* Common-Region, marks swap entry */
 #define _REGION3_ENTRY_LARGE	0x0400	/* RTTE-format control, large page  */
 #define _REGION3_ENTRY_WRITE	0x8000	/* SW region write bit */
 #define _REGION3_ENTRY_READ	0x4000	/* SW region read bit */
@@ -323,6 +324,7 @@ static inline int is_module_addr(void *addr)
 #define _SEGMENT_ENTRY_DIRTY	0x2000	/* SW segment dirty bit */
 #define _SEGMENT_ENTRY_YOUNG	0x1000	/* SW segment young bit */
 
+#define _SEGMENT_ENTRY_COMM	0x0010	/* Common-Segment, marks swap entry */
 #define _SEGMENT_ENTRY_LARGE	0x0400	/* STE-format control, large page */
 #define _SEGMENT_ENTRY_WRITE	0x8000	/* SW segment write bit */
 #define _SEGMENT_ENTRY_READ	0x4000	/* SW segment read bit */
@@ -335,6 +337,10 @@ static inline int is_module_addr(void *addr)
 
 #define _SEGMENT_ENTRY_PRESENT	0x0001	/* SW segment present bit */
 
+/* Common bits in region and segment table entries, for swap entries */
+#define _RST_ENTRY_COMM		0x0010	/* Common-Region/Segment, marks swap entry */
+#define _RST_ENTRY_INVALID	0x0020	/* invalid region/segment table entry */
+
 #define _CRST_ENTRIES	2048	/* number of region/segment table entries */
 #define _PAGE_ENTRIES	256	/* number of page table entries	*/
 
@@ -1931,6 +1937,53 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset)
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
 #define __swp_entry_to_pte(x)	((pte_t) { (x).val })
 
+/*
+ * 64 bit swap entry format for REGION3 and SEGMENT table entries (RSTE)
+ * Bits 59 and 63 are used to indicate the swap entry. Bit 58 marks the rste
+ * as invalid.
+ * A swap entry is indicated by bit pattern (rste & 0x011) == 0x010
+ * |			  offset			|Xtype |11TT|S0|
+ * |0000000000111111111122222222223333333333444444444455|555555|5566|66|
+ * |0123456789012345678901234567890123456789012345678901|234567|8901|23|
+ *
+ * Bits 0-51 store the offset.
+ * Bits 53-57 store the type.
+ * Bit 62 (S) is used for softdirty tracking.
+ * Bits 60-61 (TT) indicate the table type: 0x01 for REGION3 and 0x00 for SEGMENT.
+ * Bit 52 (X) is unused.
+ */
+
+#define __SWP_OFFSET_MASK_RSTE	((1UL << 52) - 1)
+#define __SWP_OFFSET_SHIFT_RSTE	12
+#define __SWP_TYPE_MASK_RSTE		((1UL << 5) - 1)
+#define __SWP_TYPE_SHIFT_RSTE	6
+
+/*
+ * TT bits set to 0x00 == SEGMENT. For REGION3 entries, caller must add R3
+ * bits 0x01. See also __set_huge_pte_at().
+ */
+static inline unsigned long mk_swap_rste(unsigned long type, unsigned long offset)
+{
+	unsigned long rste;
+
+	rste = _RST_ENTRY_INVALID | _RST_ENTRY_COMM;
+	rste |= (offset & __SWP_OFFSET_MASK_RSTE) << __SWP_OFFSET_SHIFT_RSTE;
+	rste |= (type & __SWP_TYPE_MASK_RSTE) << __SWP_TYPE_SHIFT_RSTE;
+	return rste;
+}
+
+static inline unsigned long __swp_type_rste(swp_entry_t entry)
+{
+	return (entry.val >> __SWP_TYPE_SHIFT_RSTE) & __SWP_TYPE_MASK_RSTE;
+}
+
+static inline unsigned long __swp_offset_rste(swp_entry_t entry)
+{
+	return (entry.val >> __SWP_OFFSET_SHIFT_RSTE) & __SWP_OFFSET_MASK_RSTE;
+}
+
+#define __rste_to_swp_entry(rste)	((swp_entry_t) { rste })
+
 extern int vmem_add_mapping(unsigned long start, unsigned long size);
 extern void vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int __vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot, bool alloc);
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index a65bebbb5a347..c58a67416a043 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -24,6 +24,7 @@
 
 static inline unsigned long __pte_to_rste(pte_t pte)
 {
+	swp_entry_t arch_entry;
 	unsigned long rste;
 
 	/*
@@ -67,6 +68,10 @@ static inline unsigned long __pte_to_rste(pte_t pte)
 #endif
 		rste |= move_set_bit(pte_val(pte), _PAGE_NOEXEC,
 				     _SEGMENT_ENTRY_NOEXEC);
+	} else if (!pte_none(pte)) {
+		/* swap pte */
+		arch_entry = __pte_to_swp_entry(pte);
+		rste = mk_swap_rste(__swp_type(arch_entry), __swp_offset(arch_entry));
 	} else
 		rste = _SEGMENT_ENTRY_EMPTY;
 	return rste;
@@ -74,13 +79,18 @@ static inline unsigned long __pte_to_rste(pte_t pte)
 
 static inline pte_t __rste_to_pte(unsigned long rste)
 {
+	swp_entry_t arch_entry;
 	unsigned long pteval;
-	int present;
+	int present, none;
+	pte_t pte;
 
-	if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+	if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {
 		present = pud_present(__pud(rste));
-	else
+		none = pud_none(__pud(rste));
+	} else {
 		present = pmd_present(__pmd(rste));
+		none = pmd_none(__pmd(rste));
+	}
 
 	/*
 	 * Convert encoding		pmd / pud bits	    pte bits
@@ -115,6 +125,11 @@ static inline pte_t __rste_to_pte(unsigned long rste)
 		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY, _PAGE_SOFT_DIRTY);
 #endif
 		pteval |= move_set_bit(rste, _SEGMENT_ENTRY_NOEXEC, _PAGE_NOEXEC);
+	} else if (!none) {
+		/* swap rste */
+		arch_entry = __rste_to_swp_entry(rste);
+		pte = mk_swap_pte(__swp_type_rste(arch_entry), __swp_offset_rste(arch_entry));
+		pteval = pte_val(pte);
 	} else
 		pteval = _PAGE_INVALID;
 	return __pte(pteval);
@@ -149,8 +164,6 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 	unsigned long rste;
 
 	rste = __pte_to_rste(pte);
-	if (!MACHINE_HAS_NX)
-		rste &= ~_SEGMENT_ENTRY_NOEXEC;
 
 	/* Set correct table type for 2G hugepages */
 	if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) {