From 8e8299bf386a0e41b1fd2106e81060d56d1b7b31 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 8 Apr 2025 10:52:15 +0100 Subject: [PATCH 01/16] powerpc: mm: call ctor/dtor for kernel PTEs The generic implementation of pte_{alloc_one,free}_kernel now calls the [cd]tor, without initialising the ptlock needlessly as pagetable_pte_ctor() skips it for init_mm. On powerpc, all functions related to PTE allocation are implemented by common helpers, which are passed a boolean to differentiate user from kernel pgtables. This patch aligns the powerpc implementation with the generic one by calling pagetable_pte_[cd]tor() unconditionally in those helpers. Link: https://lkml.kernel.org/r/20250408095222.860601-6-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Cc: Albert Ou Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Linus Waleij Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ryan Roberts Cc: Will Deacon Cc: Cc: Yang Shi Cc: Dave Hansen Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- arch/powerpc/mm/pgtable-frag.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 387e9b1fe12c..77e55eac16e4 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -56,19 +56,17 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) { void *ret = NULL; struct ptdesc *ptdesc; + gfp_t gfp = PGALLOC_GFP; - if (!kernel) { - ptdesc = pagetable_alloc(PGALLOC_GFP | __GFP_ACCOUNT, 0); - if (!ptdesc) - return NULL; - if (!pagetable_pte_ctor(mm, ptdesc)) { - pagetable_free(ptdesc); - return NULL; - } - } else { - ptdesc = pagetable_alloc(PGALLOC_GFP, 0); - if (!ptdesc) - return NULL; + if (!kernel) + gfp |= __GFP_ACCOUNT; + + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) + return NULL; + if (!pagetable_pte_ctor(mm, ptdesc)) { + pagetable_free(ptdesc); + return NULL; } atomic_set(&ptdesc->pt_frag_refcount, 1); @@ -124,12 +122,10 @@ void pte_fragment_free(unsigned long *table, int kernel) BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { - if (kernel) - pagetable_free(ptdesc); - else if (folio_test_clear_active(ptdesc_folio(ptdesc))) - call_rcu(&ptdesc->pt_rcu_head, pte_free_now); - else + if (kernel || !folio_test_clear_active(ptdesc_folio(ptdesc))) pte_free_now(&ptdesc->pt_rcu_head); + else + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); } } -- 2.50.1 From 10a2e444e4ade64a1b12ccc698a77bd0ff5a57db Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 8 Apr 2025 10:52:16 +0100 Subject: [PATCH 02/16] sparc64: mm: call ctor/dtor for kernel PTEs The generic implementation of pte_{alloc_one,free}_kernel now calls the [cd]tor, without initialising the ptlock needlessly as pagetable_pte_ctor() skips it for init_mm. Align sparc64 with the generic implementation by ensuring pagetable_pte_[cd]tor() are called for kernel PTEs. As a result the kernel and user alloc/free functions have the same implementation, and since pgtable_t is defined as pte_t *, we can have both call a common helper. Link: https://lkml.kernel.org/r/20250408095222.860601-7-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Cc: Albert Ou Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Linus Waleij Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ryan Roberts Cc: Will Deacon Cc: Cc: Yang Shi Cc: Dave Hansen Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- arch/sparc/mm/init_64.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 5c8eabda1d17..25ae4c897aae 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2878,18 +2878,7 @@ void __flush_tlb_all(void) : : "r" (pstate)); } -pte_t *pte_alloc_one_kernel(struct mm_struct *mm) -{ - struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); - pte_t *pte = NULL; - - if (page) - pte = (pte_t *) page_address(page); - - return pte; -} - -pgtable_t pte_alloc_one(struct mm_struct *mm) +static pte_t *__pte_alloc_one(struct mm_struct *mm) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0); @@ -2902,9 +2891,14 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) return ptdesc_address(ptdesc); } -void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - free_page((unsigned long)pte); + return __pte_alloc_one(mm); +} + +pgtable_t pte_alloc_one(struct mm_struct *mm) +{ + return __pte_alloc_one(mm); } static void __pte_free(pgtable_t pte) @@ -2915,6 +2909,11 @@ static void __pte_free(pgtable_t pte) pagetable_free(ptdesc); } +void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +{ + __pte_free(pte); +} + void pte_free(struct mm_struct *mm, pgtable_t pte) { __pte_free(pte); -- 2.50.1 From 8240d8d3c5fb65fbdace7ae4db909b51f2253da7 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 8 Apr 2025 10:52:17 +0100 Subject: [PATCH 03/16] mm: skip ptlock_init() for kernel PMDs Split page table locks are not used for pgtables associated to init_mm, at any level. pte_alloc_kernel() does not call ptlock_init() as a result. There is however no separate alloc/free functions for kernel PMDs, and pmd_ptlock_init() is called unconditionally. When ALLOC_SPLIT_PTLOCKS is true (e.g. 32-bit architectures or if CONFIG_PREEMPT_RT is selected), this results in unnecessary dynamic memory allocation every time a kernel PMD is allocated. Now that pagetable_pmd_ctor() is passed the associated mm, we can easily remove this overhead by skipping pmd_ptlock_init() if the pgtable is associated to init_mm. No special-casing is needed on the dtor path, as ptlock_free() is already called unconditionally for all levels. (ptlock_free() is a no-op unless a ptlock was allocated for the given PTP.) Link: https://lkml.kernel.org/r/20250408095222.860601-8-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Cc: Albert Ou Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Linus Waleij Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ryan Roberts Cc: Will Deacon Cc: Cc: Yang Shi Cc: Dave Hansen Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ce6832787f6d..5eb0d77c4438 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3257,7 +3257,7 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) static inline bool pagetable_pmd_ctor(struct mm_struct *mm, struct ptdesc *ptdesc) { - if (!pmd_ptlock_init(ptdesc)) + if (mm != &init_mm && !pmd_ptlock_init(ptdesc)) return false; ptdesc_pmd_pts_init(ptdesc); __pagetable_ctor(ptdesc); -- 2.50.1 From c64f46ee1377961602832fc4e6768bb2f76642c2 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 8 Apr 2025 10:52:18 +0100 Subject: [PATCH 04/16] arm64: mm: use enum to identify pgtable level instead of *_SHIFT Commit 90292aca9854 ("arm64: mm: use appropriate ctors for page tables") introduced pgtable ctor calls in pgd_pgtable_alloc(). To identify the pgtable level and call the appropriate ctor, the *_SHIFT value associated with the pgtable level is used. However, those values do not unambiguously identify a level, because if a given level is folded, the *_SHIFT value will be equal to that of the upper level (e.g. PMD_SHIFT == PUD_SHIFT if PMD is folded). As things stand, there is probably not much damaged done by calling the ctor for a different level, and ARCH_ENABLE_SPLIT_PMD_PTLOCK is only selected if PMD isn't folded (so we don't needlessly initialise pmd_ptlock). Still, this is pretty confusing, and it would get even more confusing when adding ctor calls for the remaining levels. Let's simplify all this by using an enum to identify the pgtable level instead; this way folding becomes irrelevant. This is inspired by one of the m68k pgtable allocators (arch/m68k/include/asm/motorola_pgalloc.h). Link: https://lkml.kernel.org/r/20250408095222.860601-9-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Cc: Albert Ou Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Linus Waleij Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ryan Roberts Cc: Will Deacon Cc: Cc: Yang Shi Cc: Dave Hansen Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 54 +++++++++++++++++++++++++++------------------ 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 8c5c471cfb06..eca324b3a6fc 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -46,6 +46,13 @@ #define NO_CONT_MAPPINGS BIT(1) #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */ +enum pgtable_type { + TABLE_PTE, + TABLE_PMD, + TABLE_PUD, + TABLE_P4D, +}; + u64 kimage_voffset __ro_after_init; EXPORT_SYMBOL(kimage_voffset); @@ -107,7 +114,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static phys_addr_t __init early_pgtable_alloc(int shift) +static phys_addr_t __init early_pgtable_alloc(enum pgtable_type pgtable_type) { phys_addr_t phys; @@ -192,7 +199,7 @@ static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end, static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { unsigned long next; @@ -207,7 +214,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, if (flags & NO_EXEC_MAPPINGS) pmdval |= PMD_TABLE_PXN; BUG_ON(!pgtable_alloc); - pte_phys = pgtable_alloc(PAGE_SHIFT); + pte_phys = pgtable_alloc(TABLE_PTE); ptep = pte_set_fixmap(pte_phys); init_clear_pgtable(ptep); ptep += pte_index(addr); @@ -243,7 +250,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), int flags) + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { unsigned long next; @@ -277,7 +284,8 @@ static void init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end, static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), int flags) + phys_addr_t (*pgtable_alloc)(enum pgtable_type), + int flags) { unsigned long next; pud_t pud = READ_ONCE(*pudp); @@ -294,7 +302,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, if (flags & NO_EXEC_MAPPINGS) pudval |= PUD_TABLE_PXN; BUG_ON(!pgtable_alloc); - pmd_phys = pgtable_alloc(PMD_SHIFT); + pmd_phys = pgtable_alloc(TABLE_PMD); pmdp = pmd_set_fixmap(pmd_phys); init_clear_pgtable(pmdp); pmdp += pmd_index(addr); @@ -325,7 +333,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { unsigned long next; @@ -339,7 +347,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, if (flags & NO_EXEC_MAPPINGS) p4dval |= P4D_TABLE_PXN; BUG_ON(!pgtable_alloc); - pud_phys = pgtable_alloc(PUD_SHIFT); + pud_phys = pgtable_alloc(TABLE_PUD); pudp = pud_set_fixmap(pud_phys); init_clear_pgtable(pudp); pudp += pud_index(addr); @@ -383,7 +391,7 @@ static void alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end, static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { unsigned long next; @@ -397,7 +405,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, if (flags & NO_EXEC_MAPPINGS) pgdval |= PGD_TABLE_PXN; BUG_ON(!pgtable_alloc); - p4d_phys = pgtable_alloc(P4D_SHIFT); + p4d_phys = pgtable_alloc(TABLE_P4D); p4dp = p4d_set_fixmap(p4d_phys); init_clear_pgtable(p4dp); p4dp += p4d_index(addr); @@ -427,7 +435,7 @@ static void alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end, static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { unsigned long addr, end, next; @@ -455,7 +463,7 @@ static void __create_pgd_mapping_locked(pgd_t *pgdir, phys_addr_t phys, static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), + phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags) { mutex_lock(&fixmap_lock); @@ -468,10 +476,11 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, extern __alias(__create_pgd_mapping_locked) void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(int), int flags); + phys_addr_t (*pgtable_alloc)(enum pgtable_type), + int flags); #endif -static phys_addr_t __pgd_pgtable_alloc(int shift) +static phys_addr_t __pgd_pgtable_alloc(enum pgtable_type pgtable_type) { /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */ void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO); @@ -480,23 +489,26 @@ static phys_addr_t __pgd_pgtable_alloc(int shift) return __pa(ptr); } -static phys_addr_t pgd_pgtable_alloc(int shift) +static phys_addr_t pgd_pgtable_alloc(enum pgtable_type pgtable_type) { - phys_addr_t pa = __pgd_pgtable_alloc(shift); + phys_addr_t pa = __pgd_pgtable_alloc(pgtable_type); struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa)); /* * Call proper page table ctor in case later we need to * call core mm functions like apply_to_page_range() on * this pre-allocated page table. - * - * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is - * folded, and if so pagetable_pte_ctor() becomes nop. */ - if (shift == PAGE_SHIFT) + switch (pgtable_type) { + case TABLE_PTE: BUG_ON(!pagetable_pte_ctor(NULL, ptdesc)); - else if (shift == PMD_SHIFT) + break; + case TABLE_PMD: BUG_ON(!pagetable_pmd_ctor(NULL, ptdesc)); + break; + default: + break; + } return pa; } -- 2.50.1 From 5e8eb9aeeda3a7aaf48efa1d34ae804e894e307f Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 8 Apr 2025 10:52:19 +0100 Subject: [PATCH 05/16] arm64: mm: always call PTE/PMD ctor in __create_pgd_mapping() TL;DR: always call the PTE/PMD ctor, passing the appropriate mm to skip ptlock_init() if unneeded. __create_pgd_mapping() is used for creating different kinds of mappings, and may allocate page table pages if passed an allocator callback. There are currently three such cases: 1. create_pgd_mapping(), which is used to create the EFI mapping 2. arch_add_memory() 3. map_entry_trampoline() 1. uses pgd_pgtable_alloc() as allocator callback, which calls the PTE/PMD ctor, while 2. and 3. use __pgd_pgtable_alloc(), which does not. The rationale is most likely that pgtables associated with init_mm do not make use of split page table locks, and it is therefore unnecessary to initialise them by calling the ctor. 2. operates on swapper_pg_dir so the allocated pgtables are clearly associated with init_mm, this is arguably the case for 3. too (the trampoline mapping is never modified so ptlocks are anyway irrelevant). 1. corresponds to efi_mm so ptlocks need to be initialised in that case. We are now moving towards calling the ctor for all page tables, even those associated with init_mm. pagetable_{pte,pmd}_ctor() have become aware of the associated mm so that the ptlock initialisation can be skipped for init_mm. This patch therefore amends the allocator callbacks so that the PTE/PMD ctor are always called, with an appropriate mm pointer to avoid unnecessary ptlock overhead. Modifying the prototype of the allocator callbacks to take the mm and propagating that pointer all the way down would be pretty invasive. Instead: * __pgd_pgtable_alloc() (cases 2. and 3. above) is replaced with pgd_pgtable_alloc_init_mm(), resulting in the ctors being called with &init_mm. This is the main functional change in this patch; the ptlock still isn't initialised, but other ctor actions (e.g. accounting-related) are now carried out for those allocated pgtables. * pgd_pgtable_alloc() (case 1. above) is replaced with pgd_pgtable_alloc_special_mm(), resulting in the ctors being called with NULL as mm. No functional change here; NULL essentially means "not init_mm", and the ptlock is still initialised. __pgd_pgtable_alloc() is now the common implementation of those two helpers. While at it we switch it to using pagetable_alloc() like standard pgtable allocator functions and remove the comment regarding ctor calls (ctors are now always expected to be called). Link: https://lkml.kernel.org/r/20250408095222.860601-10-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Cc: Albert Ou Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Linus Waleij Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ryan Roberts Cc: Will Deacon Cc: Cc: Yang Shi Cc: Dave Hansen Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index eca324b3a6fc..51cfc891f6a1 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -480,31 +480,22 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, int flags); #endif -static phys_addr_t __pgd_pgtable_alloc(enum pgtable_type pgtable_type) +static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, + enum pgtable_type pgtable_type) { /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */ - void *ptr = (void *)__get_free_page(GFP_PGTABLE_KERNEL & ~__GFP_ZERO); + struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0); + phys_addr_t pa; - BUG_ON(!ptr); - return __pa(ptr); -} - -static phys_addr_t pgd_pgtable_alloc(enum pgtable_type pgtable_type) -{ - phys_addr_t pa = __pgd_pgtable_alloc(pgtable_type); - struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa)); + BUG_ON(!ptdesc); + pa = page_to_phys(ptdesc_page(ptdesc)); - /* - * Call proper page table ctor in case later we need to - * call core mm functions like apply_to_page_range() on - * this pre-allocated page table. - */ switch (pgtable_type) { case TABLE_PTE: - BUG_ON(!pagetable_pte_ctor(NULL, ptdesc)); + BUG_ON(!pagetable_pte_ctor(mm, ptdesc)); break; case TABLE_PMD: - BUG_ON(!pagetable_pmd_ctor(NULL, ptdesc)); + BUG_ON(!pagetable_pmd_ctor(mm, ptdesc)); break; default: break; @@ -513,6 +504,18 @@ static phys_addr_t pgd_pgtable_alloc(enum pgtable_type pgtable_type) return pa; } +static phys_addr_t __maybe_unused +pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type) +{ + return __pgd_pgtable_alloc(&init_mm, pgtable_type); +} + +static phys_addr_t +pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type) +{ + return __pgd_pgtable_alloc(NULL, pgtable_type); +} + /* * This function can only be used to modify existing table entries, * without allocating new levels of table. Note that this permits the @@ -542,7 +545,7 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(mm->pgd, phys, virt, size, prot, - pgd_pgtable_alloc, flags); + pgd_pgtable_alloc_special_mm, flags); } static void update_mapping_prot(phys_addr_t phys, unsigned long virt, @@ -756,7 +759,7 @@ static int __init map_entry_trampoline(void) memset(tramp_pg_dir, 0, PGD_SIZE); __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, entry_tramp_text_size(), prot, - __pgd_pgtable_alloc, NO_BLOCK_MAPPINGS); + pgd_pgtable_alloc_init_mm, NO_BLOCK_MAPPINGS); /* Map both the text and data into the kernel page table */ for (i = 0; i < DIV_ROUND_UP(entry_tramp_text_size(), PAGE_SIZE); i++) @@ -1362,7 +1365,7 @@ int arch_add_memory(int nid, u64 start, u64 size, flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), - size, params->pgprot, __pgd_pgtable_alloc, + size, params->pgprot, pgd_pgtable_alloc_init_mm, flags); memblock_clear_nomap(start, size); -- 2.50.1 From 0e3a16a760c6aececc7f8211f6a6807d1719c7bd Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 8 Apr 2025 10:52:20 +0100 Subject: [PATCH 06/16] riscv: mm: clarify ctor mm argument in alloc_{pte,pmd}_late pagetable_{pte,pmd}_ctor(mm, ptdesc) skip the ptlock initialisation if mm is &init_mm. To avoid unnecessary overhead, it is therefore preferable to pass the actual mm associated to the PTE/PMD. Unfortunately, this proves challenging for alloc_{pte,pmd}_late() as the associated mm is not available at the point where they are called - in fact not even top-level functions like create_pgd_mapping() are passed the mm. As a result they both call the ctor with NULL as mm; this is safe but potentially wasteful. This is not a new situation, but let's add a couple of comments to clarify it. Link: https://lkml.kernel.org/r/20250408095222.860601-11-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Cc: Albert Ou Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Linus Waleij Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ryan Roberts Cc: Will Deacon Cc: Cc: Yang Shi Cc: Dave Hansen Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- arch/riscv/mm/init.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index e5ef693fc778..59a982f88908 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -442,6 +442,11 @@ static phys_addr_t __meminit alloc_pte_late(uintptr_t va) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); + /* + * We do not know which mm the PTE page is associated to at this point. + * Passing NULL to the ctor is the safe option, though it may result + * in unnecessary work (e.g. initialising the ptlock for init_mm). + */ BUG_ON(!ptdesc || !pagetable_pte_ctor(NULL, ptdesc)); return __pa((pte_t *)ptdesc_address(ptdesc)); } @@ -522,6 +527,7 @@ static phys_addr_t __meminit alloc_pmd_late(uintptr_t va) { struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); + /* See comment in alloc_pte_late() regarding NULL passed the ctor */ BUG_ON(!ptdesc || !pagetable_pmd_ctor(NULL, ptdesc)); return __pa((pmd_t *)ptdesc_address(ptdesc)); } -- 2.50.1 From cb5d2be83862d67eea4d08493a3ad5e4e4cb2f89 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 8 Apr 2025 10:52:21 +0100 Subject: [PATCH 07/16] arm64: mm: call PUD/P4D ctor in __create_pgd_mapping() Constructors for PUD/P4D-level pgtables were recently introduced. They should be called for all pgtables; make sure they are called for special kernel mappings created by __create_pgd_mapping() too. Link: https://lkml.kernel.org/r/20250408095222.860601-12-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Cc: Albert Ou Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Linus Waleij Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ryan Roberts Cc: Will Deacon Cc: Cc: Yang Shi Cc: Dave Hansen Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- arch/arm64/mm/mmu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 51cfc891f6a1..8fcf59ba39db 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -497,7 +497,11 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, case TABLE_PMD: BUG_ON(!pagetable_pmd_ctor(mm, ptdesc)); break; - default: + case TABLE_PUD: + pagetable_pud_ctor(ptdesc); + break; + case TABLE_P4D: + pagetable_p4d_ctor(ptdesc); break; } -- 2.50.1 From 8472cc4503eb8efac9a9a264814bab3b89ddbd87 Mon Sep 17 00:00:00 2001 From: Kevin Brodsky Date: Tue, 8 Apr 2025 10:52:22 +0100 Subject: [PATCH 08/16] riscv: mm: call PUD/P4D ctor in special kernel pgtable alloc Constructors for PUD/P4D-level pgtables were recently introduced. They should be called for all pgtables; make sure they are called for special kernel mappings created by create_pgd_mapping() too. While at it also switch to using pagetable_alloc() like in alloc_{pte,pmd}_late(). Link: https://lkml.kernel.org/r/20250408095222.860601-13-kevin.brodsky@arm.com Signed-off-by: Kevin Brodsky Cc: Albert Ou Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Geert Uytterhoeven Cc: Linus Waleij Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Michael Ellerman Cc: Mike Rapoport Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ryan Roberts Cc: Will Deacon Cc: Cc: Yang Shi Cc: Dave Hansen Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- arch/riscv/mm/init.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 59a982f88908..8d0374d7ce8e 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -590,11 +590,11 @@ static phys_addr_t __init alloc_pud_fixmap(uintptr_t va) static phys_addr_t __meminit alloc_pud_late(uintptr_t va) { - unsigned long vaddr; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr); - return __pa(vaddr); + BUG_ON(!ptdesc); + pagetable_pud_ctor(ptdesc); + return __pa((pud_t *)ptdesc_address(ptdesc)); } static p4d_t *__init get_p4d_virt_early(phys_addr_t pa) @@ -628,11 +628,11 @@ static phys_addr_t __init alloc_p4d_fixmap(uintptr_t va) static phys_addr_t __meminit alloc_p4d_late(uintptr_t va) { - unsigned long vaddr; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, 0); - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr); - return __pa(vaddr); + BUG_ON(!ptdesc); + pagetable_p4d_ctor(ptdesc); + return __pa((p4d_t *)ptdesc_address(ptdesc)); } static void __meminit create_pud_mapping(pud_t *pudp, uintptr_t va, phys_addr_t pa, phys_addr_t sz, -- 2.50.1 From 5bb9ed6cdfeb75883652fd0ed3e3885083a92b4b Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 8 Apr 2025 09:22:38 +0000 Subject: [PATCH 09/16] mm: rust: add abstraction for struct mm_struct MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Patch series "Rust support for mm_struct, vm_area_struct, and mmap", v16. This updates the vm_area_struct support to use the approach we discussed at LPC where there are several different Rust wrappers for vm_area_struct depending on the kind of access you have to the vma. Each case allows a different set of operations on the vma. This includes an MM MAINTAINERS entry as proposed by Lorenzo: https://lore.kernel.org/all/33e64b12-aa07-4e78-933a-b07c37ff1d84@lucifer.local/ This patch (of 9): These abstractions allow you to reference a `struct mm_struct` using both mmgrab and mmget refcounts. This is done using two Rust types: * Mm - represents an mm_struct where you don't know anything about the value of mm_users. * MmWithUser - represents an mm_struct where you know at compile time that mm_users is non-zero. This allows us to encode in the type system whether a method requires that mm_users is non-zero or not. For instance, you can always call `mmget_not_zero` but you can only call `mmap_read_lock` when mm_users is non-zero. The struct is called Mm to keep consistency with the C side. The ability to obtain `current->mm` is added later in this series. The mm module is defined to only exist when CONFIG_MMU is set. This avoids various errors due to missing types and functions when CONFIG_MMU is disabled. More fine-grained cfgs can be considered in the future. See the thread at [1] for more info. Link: https://lkml.kernel.org/r/20250408-vma-v16-9-d8b446e885d9@google.com Link: https://lkml.kernel.org/r/20250408-vma-v16-1-d8b446e885d9@google.com Link: https://lore.kernel.org/all/202503091916.QousmtcY-lkp@intel.com/ Signed-off-by: Alice Ryhl Acked-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Acked-by: Balbir Singh Reviewed-by: Andreas Hindborg Reviewed-by: Gary Guo Cc: Alex Gaynor Cc: Arnd Bergmann Cc: Benno Lossin Cc: Björn Roy Baron Cc: Boqun Feng Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miguel Ojeda Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- rust/helpers/helpers.c | 1 + rust/helpers/mm.c | 39 ++++++++ rust/kernel/lib.rs | 1 + rust/kernel/mm.rs | 210 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 251 insertions(+) create mode 100644 rust/helpers/mm.c create mode 100644 rust/kernel/mm.rs diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index 1e7c84df7252..0aea103c16be 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -20,6 +20,7 @@ #include "io.c" #include "jump_label.c" #include "kunit.c" +#include "mm.c" #include "mutex.c" #include "page.c" #include "platform.c" diff --git a/rust/helpers/mm.c b/rust/helpers/mm.c new file mode 100644 index 000000000000..7201747a5d31 --- /dev/null +++ b/rust/helpers/mm.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +void rust_helper_mmgrab(struct mm_struct *mm) +{ + mmgrab(mm); +} + +void rust_helper_mmdrop(struct mm_struct *mm) +{ + mmdrop(mm); +} + +void rust_helper_mmget(struct mm_struct *mm) +{ + mmget(mm); +} + +bool rust_helper_mmget_not_zero(struct mm_struct *mm) +{ + return mmget_not_zero(mm); +} + +void rust_helper_mmap_read_lock(struct mm_struct *mm) +{ + mmap_read_lock(mm); +} + +bool rust_helper_mmap_read_trylock(struct mm_struct *mm) +{ + return mmap_read_trylock(mm); +} + +void rust_helper_mmap_read_unlock(struct mm_struct *mm) +{ + mmap_read_unlock(mm); +} diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs index de07aadd1ff5..42ab6cf4053f 100644 --- a/rust/kernel/lib.rs +++ b/rust/kernel/lib.rs @@ -61,6 +61,7 @@ pub mod jump_label; pub mod kunit; pub mod list; pub mod miscdevice; +pub mod mm; #[cfg(CONFIG_NET)] pub mod net; pub mod of; diff --git a/rust/kernel/mm.rs b/rust/kernel/mm.rs new file mode 100644 index 000000000000..eda7a479cff7 --- /dev/null +++ b/rust/kernel/mm.rs @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2024 Google LLC. + +//! Memory management. +//! +//! This module deals with managing the address space of userspace processes. Each process has an +//! instance of [`Mm`], which keeps track of multiple VMAs (virtual memory areas). Each VMA +//! corresponds to a region of memory that the userspace process can access, and the VMA lets you +//! control what happens when userspace reads or writes to that region of memory. +//! +//! C header: [`include/linux/mm.h`](srctree/include/linux/mm.h) +#![cfg(CONFIG_MMU)] + +use crate::{ + bindings, + types::{ARef, AlwaysRefCounted, NotThreadSafe, Opaque}, +}; +use core::{ops::Deref, ptr::NonNull}; + +/// A wrapper for the kernel's `struct mm_struct`. +/// +/// This represents the address space of a userspace process, so each process has one `Mm` +/// instance. It may hold many VMAs internally. +/// +/// There is a counter called `mm_users` that counts the users of the address space; this includes +/// the userspace process itself, but can also include kernel threads accessing the address space. +/// Once `mm_users` reaches zero, this indicates that the address space can be destroyed. To access +/// the address space, you must prevent `mm_users` from reaching zero while you are accessing it. +/// The [`MmWithUser`] type represents an address space where this is guaranteed, and you can +/// create one using [`mmget_not_zero`]. +/// +/// The `ARef` smart pointer holds an `mmgrab` refcount. Its destructor may sleep. +/// +/// # Invariants +/// +/// Values of this type are always refcounted using `mmgrab`. +/// +/// [`mmget_not_zero`]: Mm::mmget_not_zero +#[repr(transparent)] +pub struct Mm { + mm: Opaque, +} + +// SAFETY: It is safe to call `mmdrop` on another thread than where `mmgrab` was called. +unsafe impl Send for Mm {} +// SAFETY: All methods on `Mm` can be called in parallel from several threads. +unsafe impl Sync for Mm {} + +// SAFETY: By the type invariants, this type is always refcounted. +unsafe impl AlwaysRefCounted for Mm { + #[inline] + fn inc_ref(&self) { + // SAFETY: The pointer is valid since self is a reference. + unsafe { bindings::mmgrab(self.as_raw()) }; + } + + #[inline] + unsafe fn dec_ref(obj: NonNull) { + // SAFETY: The caller is giving up their refcount. + unsafe { bindings::mmdrop(obj.cast().as_ptr()) }; + } +} + +/// A wrapper for the kernel's `struct mm_struct`. +/// +/// This type is like [`Mm`], but with non-zero `mm_users`. It can only be used when `mm_users` can +/// be proven to be non-zero at compile-time, usually because the relevant code holds an `mmget` +/// refcount. It can be used to access the associated address space. +/// +/// The `ARef` smart pointer holds an `mmget` refcount. Its destructor may sleep. +/// +/// # Invariants +/// +/// Values of this type are always refcounted using `mmget`. The value of `mm_users` is non-zero. +#[repr(transparent)] +pub struct MmWithUser { + mm: Mm, +} + +// SAFETY: It is safe to call `mmput` on another thread than where `mmget` was called. +unsafe impl Send for MmWithUser {} +// SAFETY: All methods on `MmWithUser` can be called in parallel from several threads. +unsafe impl Sync for MmWithUser {} + +// SAFETY: By the type invariants, this type is always refcounted. +unsafe impl AlwaysRefCounted for MmWithUser { + #[inline] + fn inc_ref(&self) { + // SAFETY: The pointer is valid since self is a reference. + unsafe { bindings::mmget(self.as_raw()) }; + } + + #[inline] + unsafe fn dec_ref(obj: NonNull) { + // SAFETY: The caller is giving up their refcount. + unsafe { bindings::mmput(obj.cast().as_ptr()) }; + } +} + +// Make all `Mm` methods available on `MmWithUser`. +impl Deref for MmWithUser { + type Target = Mm; + + #[inline] + fn deref(&self) -> &Mm { + &self.mm + } +} + +// These methods are safe to call even if `mm_users` is zero. +impl Mm { + /// Returns a raw pointer to the inner `mm_struct`. + #[inline] + pub fn as_raw(&self) -> *mut bindings::mm_struct { + self.mm.get() + } + + /// Obtain a reference from a raw pointer. + /// + /// # Safety + /// + /// The caller must ensure that `ptr` points at an `mm_struct`, and that it is not deallocated + /// during the lifetime 'a. + #[inline] + pub unsafe fn from_raw<'a>(ptr: *const bindings::mm_struct) -> &'a Mm { + // SAFETY: Caller promises that the pointer is valid for 'a. Layouts are compatible due to + // repr(transparent). + unsafe { &*ptr.cast() } + } + + /// Calls `mmget_not_zero` and returns a handle if it succeeds. + #[inline] + pub fn mmget_not_zero(&self) -> Option> { + // SAFETY: The pointer is valid since self is a reference. + let success = unsafe { bindings::mmget_not_zero(self.as_raw()) }; + + if success { + // SAFETY: We just created an `mmget` refcount. + Some(unsafe { ARef::from_raw(NonNull::new_unchecked(self.as_raw().cast())) }) + } else { + None + } + } +} + +// These methods require `mm_users` to be non-zero. +impl MmWithUser { + /// Obtain a reference from a raw pointer. + /// + /// # Safety + /// + /// The caller must ensure that `ptr` points at an `mm_struct`, and that `mm_users` remains + /// non-zero for the duration of the lifetime 'a. + #[inline] + pub unsafe fn from_raw<'a>(ptr: *const bindings::mm_struct) -> &'a MmWithUser { + // SAFETY: Caller promises that the pointer is valid for 'a. The layout is compatible due + // to repr(transparent). + unsafe { &*ptr.cast() } + } + + /// Lock the mmap read lock. + #[inline] + pub fn mmap_read_lock(&self) -> MmapReadGuard<'_> { + // SAFETY: The pointer is valid since self is a reference. + unsafe { bindings::mmap_read_lock(self.as_raw()) }; + + // INVARIANT: We just acquired the read lock. + MmapReadGuard { + mm: self, + _nts: NotThreadSafe, + } + } + + /// Try to lock the mmap read lock. + #[inline] + pub fn mmap_read_trylock(&self) -> Option> { + // SAFETY: The pointer is valid since self is a reference. + let success = unsafe { bindings::mmap_read_trylock(self.as_raw()) }; + + if success { + // INVARIANT: We just acquired the read lock. + Some(MmapReadGuard { + mm: self, + _nts: NotThreadSafe, + }) + } else { + None + } + } +} + +/// A guard for the mmap read lock. +/// +/// # Invariants +/// +/// This `MmapReadGuard` guard owns the mmap read lock. +pub struct MmapReadGuard<'a> { + mm: &'a MmWithUser, + // `mmap_read_lock` and `mmap_read_unlock` must be called on the same thread + _nts: NotThreadSafe, +} + +impl Drop for MmapReadGuard<'_> { + #[inline] + fn drop(&mut self) { + // SAFETY: We hold the read lock by the type invariants. + unsafe { bindings::mmap_read_unlock(self.mm.as_raw()) }; + } +} -- 2.50.1 From 040f404b731207935ed644b14bcc2bb8b8488d00 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 8 Apr 2025 09:22:39 +0000 Subject: [PATCH 10/16] mm: rust: add vm_area_struct methods that require read access MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This adds a type called VmaRef which is used when referencing a vma that you have read access to. Here, read access means that you hold either the mmap read lock or the vma read lock (or stronger). Additionally, a vma_lookup method is added to the mmap read guard, which enables you to obtain a &VmaRef in safe Rust code. This patch only provides a way to lock the mmap read lock, but a follow-up patch also provides a way to just lock the vma read lock. Link: https://lkml.kernel.org/r/20250408-vma-v16-2-d8b446e885d9@google.com Signed-off-by: Alice Ryhl Acked-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Reviewed-by: Jann Horn Reviewed-by: Andreas Hindborg Reviewed-by: Gary Guo Cc: Alex Gaynor Cc: Arnd Bergmann Cc: Balbir Singh Cc: Benno Lossin Cc: Björn Roy Baron Cc: Boqun Feng Cc: Greg Kroah-Hartman Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miguel Ojeda Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- rust/helpers/mm.c | 6 ++ rust/kernel/mm.rs | 23 +++++ rust/kernel/mm/virt.rs | 210 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 239 insertions(+) create mode 100644 rust/kernel/mm/virt.rs diff --git a/rust/helpers/mm.c b/rust/helpers/mm.c index 7201747a5d31..7b72eb065a3e 100644 --- a/rust/helpers/mm.c +++ b/rust/helpers/mm.c @@ -37,3 +37,9 @@ void rust_helper_mmap_read_unlock(struct mm_struct *mm) { mmap_read_unlock(mm); } + +struct vm_area_struct *rust_helper_vma_lookup(struct mm_struct *mm, + unsigned long addr) +{ + return vma_lookup(mm, addr); +} diff --git a/rust/kernel/mm.rs b/rust/kernel/mm.rs index eda7a479cff7..f1689ccb3740 100644 --- a/rust/kernel/mm.rs +++ b/rust/kernel/mm.rs @@ -18,6 +18,8 @@ use crate::{ }; use core::{ops::Deref, ptr::NonNull}; +pub mod virt; + /// A wrapper for the kernel's `struct mm_struct`. /// /// This represents the address space of a userspace process, so each process has one `Mm` @@ -201,6 +203,27 @@ pub struct MmapReadGuard<'a> { _nts: NotThreadSafe, } +impl<'a> MmapReadGuard<'a> { + /// Look up a vma at the given address. + #[inline] + pub fn vma_lookup(&self, vma_addr: usize) -> Option<&virt::VmaRef> { + // SAFETY: By the type invariants we hold the mmap read guard, so we can safely call this + // method. Any value is okay for `vma_addr`. + let vma = unsafe { bindings::vma_lookup(self.mm.as_raw(), vma_addr) }; + + if vma.is_null() { + None + } else { + // SAFETY: We just checked that a vma was found, so the pointer references a valid vma. + // + // Furthermore, the returned vma is still under the protection of the read lock guard + // and can be used while the mmap read lock is still held. That the vma is not used + // after the MmapReadGuard gets dropped is enforced by the borrow-checker. + unsafe { Some(virt::VmaRef::from_raw(vma)) } + } + } +} + impl Drop for MmapReadGuard<'_> { #[inline] fn drop(&mut self) { diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs new file mode 100644 index 000000000000..a66be649f0b8 --- /dev/null +++ b/rust/kernel/mm/virt.rs @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 + +// Copyright (C) 2024 Google LLC. + +//! Virtual memory. +//! +//! This module deals with managing a single VMA in the address space of a userspace process. Each +//! VMA corresponds to a region of memory that the userspace process can access, and the VMA lets +//! you control what happens when userspace reads or writes to that region of memory. +//! +//! The module has several different Rust types that all correspond to the C type called +//! `vm_area_struct`. The different structs represent what kind of access you have to the VMA, e.g. +//! [`VmaRef`] is used when you hold the mmap or vma read lock. Using the appropriate struct +//! ensures that you can't, for example, accidentally call a function that requires holding the +//! write lock when you only hold the read lock. + +use crate::{bindings, mm::MmWithUser, types::Opaque}; + +/// A wrapper for the kernel's `struct vm_area_struct` with read access. +/// +/// It represents an area of virtual memory. +/// +/// # Invariants +/// +/// The caller must hold the mmap read lock or the vma read lock. +#[repr(transparent)] +pub struct VmaRef { + vma: Opaque, +} + +// Methods you can call when holding the mmap or vma read lock (or stronger). They must be usable +// no matter what the vma flags are. +impl VmaRef { + /// Access a virtual memory area given a raw pointer. + /// + /// # Safety + /// + /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap or vma + /// read lock (or stronger) is held for at least the duration of 'a. + #[inline] + pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self { + // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a. + unsafe { &*vma.cast() } + } + + /// Returns a raw pointer to this area. + #[inline] + pub fn as_ptr(&self) -> *mut bindings::vm_area_struct { + self.vma.get() + } + + /// Access the underlying `mm_struct`. + #[inline] + pub fn mm(&self) -> &MmWithUser { + // SAFETY: By the type invariants, this `vm_area_struct` is valid and we hold the mmap/vma + // read lock or stronger. This implies that the underlying mm has a non-zero value of + // `mm_users`. + unsafe { MmWithUser::from_raw((*self.as_ptr()).vm_mm) } + } + + /// Returns the flags associated with the virtual memory area. + /// + /// The possible flags are a combination of the constants in [`flags`]. + #[inline] + pub fn flags(&self) -> vm_flags_t { + // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this + // access is not a data race. + unsafe { (*self.as_ptr()).__bindgen_anon_2.vm_flags } + } + + /// Returns the (inclusive) start address of the virtual memory area. + #[inline] + pub fn start(&self) -> usize { + // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this + // access is not a data race. + unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_start } + } + + /// Returns the (exclusive) end address of the virtual memory area. + #[inline] + pub fn end(&self) -> usize { + // SAFETY: By the type invariants, the caller holds at least the mmap read lock, so this + // access is not a data race. + unsafe { (*self.as_ptr()).__bindgen_anon_1.__bindgen_anon_1.vm_end } + } + + /// Zap pages in the given page range. + /// + /// This clears page table mappings for the range at the leaf level, leaving all other page + /// tables intact, and freeing any memory referenced by the VMA in this range. That is, + /// anonymous memory is completely freed, file-backed memory has its reference count on page + /// cache folio's dropped, any dirty data will still be written back to disk as usual. + /// + /// It may seem odd that we clear at the leaf level, this is however a product of the page + /// table structure used to map physical memory into a virtual address space - each virtual + /// address actually consists of a bitmap of array indices into page tables, which form a + /// hierarchical page table level structure. + /// + /// As a result, each page table level maps a multiple of page table levels below, and thus + /// span ever increasing ranges of pages. At the leaf or PTE level, we map the actual physical + /// memory. + /// + /// It is here where a zap operates, as it the only place we can be certain of clearing without + /// impacting any other virtual mappings. It is an implementation detail as to whether the + /// kernel goes further in freeing unused page tables, but for the purposes of this operation + /// we must only assume that the leaf level is cleared. + #[inline] + pub fn zap_page_range_single(&self, address: usize, size: usize) { + let (end, did_overflow) = address.overflowing_add(size); + if did_overflow || address < self.start() || self.end() < end { + // TODO: call WARN_ONCE once Rust version of it is added + return; + } + + // SAFETY: By the type invariants, the caller has read access to this VMA, which is + // sufficient for this method call. This method has no requirements on the vma flags. The + // address range is checked to be within the vma. + unsafe { + bindings::zap_page_range_single(self.as_ptr(), address, size, core::ptr::null_mut()) + }; + } +} + +/// The integer type used for vma flags. +#[doc(inline)] +pub use bindings::vm_flags_t; + +/// All possible flags for [`VmaRef`]. +pub mod flags { + use super::vm_flags_t; + use crate::bindings; + + /// No flags are set. + pub const NONE: vm_flags_t = bindings::VM_NONE as _; + + /// Mapping allows reads. + pub const READ: vm_flags_t = bindings::VM_READ as _; + + /// Mapping allows writes. + pub const WRITE: vm_flags_t = bindings::VM_WRITE as _; + + /// Mapping allows execution. + pub const EXEC: vm_flags_t = bindings::VM_EXEC as _; + + /// Mapping is shared. + pub const SHARED: vm_flags_t = bindings::VM_SHARED as _; + + /// Mapping may be updated to allow reads. + pub const MAYREAD: vm_flags_t = bindings::VM_MAYREAD as _; + + /// Mapping may be updated to allow writes. + pub const MAYWRITE: vm_flags_t = bindings::VM_MAYWRITE as _; + + /// Mapping may be updated to allow execution. + pub const MAYEXEC: vm_flags_t = bindings::VM_MAYEXEC as _; + + /// Mapping may be updated to be shared. + pub const MAYSHARE: vm_flags_t = bindings::VM_MAYSHARE as _; + + /// Page-ranges managed without `struct page`, just pure PFN. + pub const PFNMAP: vm_flags_t = bindings::VM_PFNMAP as _; + + /// Memory mapped I/O or similar. + pub const IO: vm_flags_t = bindings::VM_IO as _; + + /// Do not copy this vma on fork. + pub const DONTCOPY: vm_flags_t = bindings::VM_DONTCOPY as _; + + /// Cannot expand with mremap(). + pub const DONTEXPAND: vm_flags_t = bindings::VM_DONTEXPAND as _; + + /// Lock the pages covered when they are faulted in. + pub const LOCKONFAULT: vm_flags_t = bindings::VM_LOCKONFAULT as _; + + /// Is a VM accounted object. + pub const ACCOUNT: vm_flags_t = bindings::VM_ACCOUNT as _; + + /// Should the VM suppress accounting. + pub const NORESERVE: vm_flags_t = bindings::VM_NORESERVE as _; + + /// Huge TLB Page VM. + pub const HUGETLB: vm_flags_t = bindings::VM_HUGETLB as _; + + /// Synchronous page faults. (DAX-specific) + pub const SYNC: vm_flags_t = bindings::VM_SYNC as _; + + /// Architecture-specific flag. + pub const ARCH_1: vm_flags_t = bindings::VM_ARCH_1 as _; + + /// Wipe VMA contents in child on fork. + pub const WIPEONFORK: vm_flags_t = bindings::VM_WIPEONFORK as _; + + /// Do not include in the core dump. + pub const DONTDUMP: vm_flags_t = bindings::VM_DONTDUMP as _; + + /// Not soft dirty clean area. + pub const SOFTDIRTY: vm_flags_t = bindings::VM_SOFTDIRTY as _; + + /// Can contain `struct page` and pure PFN pages. + pub const MIXEDMAP: vm_flags_t = bindings::VM_MIXEDMAP as _; + + /// MADV_HUGEPAGE marked this vma. + pub const HUGEPAGE: vm_flags_t = bindings::VM_HUGEPAGE as _; + + /// MADV_NOHUGEPAGE marked this vma. + pub const NOHUGEPAGE: vm_flags_t = bindings::VM_NOHUGEPAGE as _; + + /// KSM may merge identical pages. + pub const MERGEABLE: vm_flags_t = bindings::VM_MERGEABLE as _; +} -- 2.50.1 From bf3d331bb80749542cf299f94c471f611fb113b1 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 8 Apr 2025 09:22:40 +0000 Subject: [PATCH 11/16] mm: rust: add vm_insert_page MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The vm_insert_page method is only usable on vmas with the VM_MIXEDMAP flag, so we introduce a new type to keep track of such vmas. The approach used in this patch assumes that we will not need to encode many flag combinations in the type. I don't think we need to encode more than VM_MIXEDMAP and VM_PFNMAP as things are now. However, if that becomes necessary, using generic parameters in a single type would scale better as the number of flags increases. Link: https://lkml.kernel.org/r/20250408-vma-v16-3-d8b446e885d9@google.com Signed-off-by: Alice Ryhl Acked-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Reviewed-by: Andreas Hindborg Reviewed-by: Gary Guo Cc: Alex Gaynor Cc: Arnd Bergmann Cc: Balbir Singh Cc: Benno Lossin Cc: Björn Roy Baron Cc: Boqun Feng Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miguel Ojeda Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- rust/kernel/mm/virt.rs | 79 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs index a66be649f0b8..3e2eabcc2145 100644 --- a/rust/kernel/mm/virt.rs +++ b/rust/kernel/mm/virt.rs @@ -14,7 +14,15 @@ //! ensures that you can't, for example, accidentally call a function that requires holding the //! write lock when you only hold the read lock. -use crate::{bindings, mm::MmWithUser, types::Opaque}; +use crate::{ + bindings, + error::{to_result, Result}, + mm::MmWithUser, + page::Page, + types::Opaque, +}; + +use core::ops::Deref; /// A wrapper for the kernel's `struct vm_area_struct` with read access. /// @@ -119,6 +127,75 @@ impl VmaRef { bindings::zap_page_range_single(self.as_ptr(), address, size, core::ptr::null_mut()) }; } + + /// If the [`VM_MIXEDMAP`] flag is set, returns a [`VmaMixedMap`] to this VMA, otherwise + /// returns `None`. + /// + /// This can be used to access methods that require [`VM_MIXEDMAP`] to be set. + /// + /// [`VM_MIXEDMAP`]: flags::MIXEDMAP + #[inline] + pub fn as_mixedmap_vma(&self) -> Option<&VmaMixedMap> { + if self.flags() & flags::MIXEDMAP != 0 { + // SAFETY: We just checked that `VM_MIXEDMAP` is set. All other requirements are + // satisfied by the type invariants of `VmaRef`. + Some(unsafe { VmaMixedMap::from_raw(self.as_ptr()) }) + } else { + None + } + } +} + +/// A wrapper for the kernel's `struct vm_area_struct` with read access and [`VM_MIXEDMAP`] set. +/// +/// It represents an area of virtual memory. +/// +/// This struct is identical to [`VmaRef`] except that it must only be used when the +/// [`VM_MIXEDMAP`] flag is set on the vma. +/// +/// # Invariants +/// +/// The caller must hold the mmap read lock or the vma read lock. The `VM_MIXEDMAP` flag must be +/// set. +/// +/// [`VM_MIXEDMAP`]: flags::MIXEDMAP +#[repr(transparent)] +pub struct VmaMixedMap { + vma: VmaRef, +} + +// Make all `VmaRef` methods available on `VmaMixedMap`. +impl Deref for VmaMixedMap { + type Target = VmaRef; + + #[inline] + fn deref(&self) -> &VmaRef { + &self.vma + } +} + +impl VmaMixedMap { + /// Access a virtual memory area given a raw pointer. + /// + /// # Safety + /// + /// Callers must ensure that `vma` is valid for the duration of 'a, and that the mmap read lock + /// (or stronger) is held for at least the duration of 'a. The `VM_MIXEDMAP` flag must be set. + #[inline] + pub unsafe fn from_raw<'a>(vma: *const bindings::vm_area_struct) -> &'a Self { + // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a. + unsafe { &*vma.cast() } + } + + /// Maps a single page at the given address within the virtual memory area. + /// + /// This operation does not take ownership of the page. + #[inline] + pub fn vm_insert_page(&self, address: usize, page: &Page) -> Result { + // SAFETY: By the type invariant of `Self` caller has read access and has verified that + // `VM_MIXEDMAP` is set. By invariant on `Page` the page has order 0. + to_result(unsafe { bindings::vm_insert_page(self.as_ptr(), address, page.as_ptr()) }) + } } /// The integer type used for vma flags. -- 2.50.1 From 3105f8f391ce00153c553bfe89efbb30b120d858 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 8 Apr 2025 09:22:41 +0000 Subject: [PATCH 12/16] mm: rust: add lock_vma_under_rcu MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Currently, the binder driver always uses the mmap lock to make changes to its vma. Because the mmap lock is global to the process, this can involve significant contention. However, the kernel has a feature called per-vma locks, which can significantly reduce contention. For example, you can take a vma lock in parallel with an mmap write lock. This is important because contention on the mmap lock has been a long-term recurring challenge for the Binder driver. This patch introduces support for using `lock_vma_under_rcu` from Rust. The Rust Binder driver will be able to use this to reduce contention on the mmap lock. Link: https://lkml.kernel.org/r/20250408-vma-v16-4-d8b446e885d9@google.com Signed-off-by: Alice Ryhl Acked-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Reviewed-by: Jann Horn Reviewed-by: Andreas Hindborg Reviewed-by: Gary Guo Cc: Alex Gaynor Cc: Arnd Bergmann Cc: Balbir Singh Cc: Benno Lossin Cc: Björn Roy Baron Cc: Boqun Feng Cc: Greg Kroah-Hartman Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miguel Ojeda Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- rust/helpers/mm.c | 5 ++++ rust/kernel/mm.rs | 60 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/rust/helpers/mm.c b/rust/helpers/mm.c index 7b72eb065a3e..81b510c96fd2 100644 --- a/rust/helpers/mm.c +++ b/rust/helpers/mm.c @@ -43,3 +43,8 @@ struct vm_area_struct *rust_helper_vma_lookup(struct mm_struct *mm, { return vma_lookup(mm, addr); } + +void rust_helper_vma_end_read(struct vm_area_struct *vma) +{ + vma_end_read(vma); +} diff --git a/rust/kernel/mm.rs b/rust/kernel/mm.rs index f1689ccb3740..c160fb52603f 100644 --- a/rust/kernel/mm.rs +++ b/rust/kernel/mm.rs @@ -19,6 +19,7 @@ use crate::{ use core::{ops::Deref, ptr::NonNull}; pub mod virt; +use virt::VmaRef; /// A wrapper for the kernel's `struct mm_struct`. /// @@ -161,6 +162,36 @@ impl MmWithUser { unsafe { &*ptr.cast() } } + /// Attempt to access a vma using the vma read lock. + /// + /// This is an optimistic trylock operation, so it may fail if there is contention. In that + /// case, you should fall back to taking the mmap read lock. + /// + /// When per-vma locks are disabled, this always returns `None`. + #[inline] + pub fn lock_vma_under_rcu(&self, vma_addr: usize) -> Option> { + #[cfg(CONFIG_PER_VMA_LOCK)] + { + // SAFETY: Calling `bindings::lock_vma_under_rcu` is always okay given an mm where + // `mm_users` is non-zero. + let vma = unsafe { bindings::lock_vma_under_rcu(self.as_raw(), vma_addr) }; + if !vma.is_null() { + return Some(VmaReadGuard { + // SAFETY: If `lock_vma_under_rcu` returns a non-null ptr, then it points at a + // valid vma. The vma is stable for as long as the vma read lock is held. + vma: unsafe { VmaRef::from_raw(vma) }, + _nts: NotThreadSafe, + }); + } + } + + // Silence warnings about unused variables. + #[cfg(not(CONFIG_PER_VMA_LOCK))] + let _ = vma_addr; + + None + } + /// Lock the mmap read lock. #[inline] pub fn mmap_read_lock(&self) -> MmapReadGuard<'_> { @@ -231,3 +262,32 @@ impl Drop for MmapReadGuard<'_> { unsafe { bindings::mmap_read_unlock(self.mm.as_raw()) }; } } + +/// A guard for the vma read lock. +/// +/// # Invariants +/// +/// This `VmaReadGuard` guard owns the vma read lock. +pub struct VmaReadGuard<'a> { + vma: &'a VmaRef, + // `vma_end_read` must be called on the same thread as where the lock was taken + _nts: NotThreadSafe, +} + +// Make all `VmaRef` methods available on `VmaReadGuard`. +impl Deref for VmaReadGuard<'_> { + type Target = VmaRef; + + #[inline] + fn deref(&self) -> &VmaRef { + self.vma + } +} + +impl Drop for VmaReadGuard<'_> { + #[inline] + fn drop(&mut self) { + // SAFETY: We hold the read lock by the type invariants. + unsafe { bindings::vma_end_read(self.vma.as_ptr()) }; + } +} -- 2.50.1 From 114ba9b9e8191925438365089c0d300f4f1b6fb1 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 8 Apr 2025 09:22:42 +0000 Subject: [PATCH 13/16] mm: rust: add mmput_async support MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Adds an MmWithUserAsync type that uses mmput_async when dropped but is otherwise identical to MmWithUser. This has to be done using a separate type because the thing we are changing is the destructor. Rust Binder needs this to avoid a certain deadlock. See commit 9a9ab0d96362 ("binder: fix race between mmput() and do_exit()") for details. It's also needed in the shrinker to avoid cleaning up the mm in the shrinker's context. Link: https://lkml.kernel.org/r/20250408-vma-v16-5-d8b446e885d9@google.com Signed-off-by: Alice Ryhl Acked-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Reviewed-by: Andreas Hindborg Reviewed-by: Gary Guo Cc: Alex Gaynor Cc: Arnd Bergmann Cc: Balbir Singh Cc: Benno Lossin Cc: Björn Roy Baron Cc: Boqun Feng Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miguel Ojeda Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- rust/kernel/mm.rs | 51 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/rust/kernel/mm.rs b/rust/kernel/mm.rs index c160fb52603f..615907a0f3b4 100644 --- a/rust/kernel/mm.rs +++ b/rust/kernel/mm.rs @@ -111,6 +111,50 @@ impl Deref for MmWithUser { } } +/// A wrapper for the kernel's `struct mm_struct`. +/// +/// This type is identical to `MmWithUser` except that it uses `mmput_async` when dropping a +/// refcount. This means that the destructor of `ARef` is safe to call in atomic +/// context. +/// +/// # Invariants +/// +/// Values of this type are always refcounted using `mmget`. The value of `mm_users` is non-zero. +#[repr(transparent)] +pub struct MmWithUserAsync { + mm: MmWithUser, +} + +// SAFETY: It is safe to call `mmput_async` on another thread than where `mmget` was called. +unsafe impl Send for MmWithUserAsync {} +// SAFETY: All methods on `MmWithUserAsync` can be called in parallel from several threads. +unsafe impl Sync for MmWithUserAsync {} + +// SAFETY: By the type invariants, this type is always refcounted. +unsafe impl AlwaysRefCounted for MmWithUserAsync { + #[inline] + fn inc_ref(&self) { + // SAFETY: The pointer is valid since self is a reference. + unsafe { bindings::mmget(self.as_raw()) }; + } + + #[inline] + unsafe fn dec_ref(obj: NonNull) { + // SAFETY: The caller is giving up their refcount. + unsafe { bindings::mmput_async(obj.cast().as_ptr()) }; + } +} + +// Make all `MmWithUser` methods available on `MmWithUserAsync`. +impl Deref for MmWithUserAsync { + type Target = MmWithUser; + + #[inline] + fn deref(&self) -> &MmWithUser { + &self.mm + } +} + // These methods are safe to call even if `mm_users` is zero. impl Mm { /// Returns a raw pointer to the inner `mm_struct`. @@ -162,6 +206,13 @@ impl MmWithUser { unsafe { &*ptr.cast() } } + /// Use `mmput_async` when dropping this refcount. + #[inline] + pub fn into_mmput_async(me: ARef) -> ARef { + // SAFETY: The layouts and invariants are compatible. + unsafe { ARef::from_raw(ARef::into_raw(me).cast()) } + } + /// Attempt to access a vma using the vma read lock. /// /// This is an optimistic trylock operation, so it may fail if there is contention. In that -- 2.50.1 From dcb81aeab406e417bc0b4cf68de6eb07a1d2e6ce Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 8 Apr 2025 09:22:43 +0000 Subject: [PATCH 14/16] mm: rust: add VmaNew for f_ops->mmap() MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This type will be used when setting up a new vma in an f_ops->mmap() hook. Using a separate type from VmaRef allows us to have a separate set of operations that you are only able to use during the mmap() hook. For example, the VM_MIXEDMAP flag must not be changed after the initial setup that happens during the f_ops->mmap() hook. To avoid setting invalid flag values, the methods for clearing VM_MAYWRITE and similar involve a check of VM_WRITE, and return an error if VM_WRITE is set. Trying to use `try_clear_maywrite` without checking the return value results in a compilation error because the `Result` type is marked #[must_use]. For now, there's only a method for VM_MIXEDMAP and not VM_PFNMAP. When we add a VM_PFNMAP method, we will need some way to prevent you from setting both VM_MIXEDMAP and VM_PFNMAP on the same vma. Link: https://lkml.kernel.org/r/20250408-vma-v16-6-d8b446e885d9@google.com Signed-off-by: Alice Ryhl Acked-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Reviewed-by: Jann Horn Reviewed-by: Andreas Hindborg Cc: Alex Gaynor Cc: Arnd Bergmann Cc: Balbir Singh Cc: Benno Lossin Cc: Björn Roy Baron Cc: Boqun Feng Cc: Gary Guo Cc: Greg Kroah-Hartman Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miguel Ojeda Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- rust/kernel/mm/virt.rs | 186 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 185 insertions(+), 1 deletion(-) diff --git a/rust/kernel/mm/virt.rs b/rust/kernel/mm/virt.rs index 3e2eabcc2145..31803674aecc 100644 --- a/rust/kernel/mm/virt.rs +++ b/rust/kernel/mm/virt.rs @@ -16,7 +16,7 @@ use crate::{ bindings, - error::{to_result, Result}, + error::{code::EINVAL, to_result, Result}, mm::MmWithUser, page::Page, types::Opaque, @@ -198,6 +198,190 @@ impl VmaMixedMap { } } +/// A configuration object for setting up a VMA in an `f_ops->mmap()` hook. +/// +/// The `f_ops->mmap()` hook is called when a new VMA is being created, and the hook is able to +/// configure the VMA in various ways to fit the driver that owns it. Using `VmaNew` indicates that +/// you are allowed to perform operations on the VMA that can only be performed before the VMA is +/// fully initialized. +/// +/// # Invariants +/// +/// For the duration of 'a, the referenced vma must be undergoing initialization in an +/// `f_ops->mmap()` hook. +pub struct VmaNew { + vma: VmaRef, +} + +// Make all `VmaRef` methods available on `VmaNew`. +impl Deref for VmaNew { + type Target = VmaRef; + + #[inline] + fn deref(&self) -> &VmaRef { + &self.vma + } +} + +impl VmaNew { + /// Access a virtual memory area given a raw pointer. + /// + /// # Safety + /// + /// Callers must ensure that `vma` is undergoing initial vma setup for the duration of 'a. + #[inline] + pub unsafe fn from_raw<'a>(vma: *mut bindings::vm_area_struct) -> &'a Self { + // SAFETY: The caller ensures that the invariants are satisfied for the duration of 'a. + unsafe { &*vma.cast() } + } + + /// Internal method for updating the vma flags. + /// + /// # Safety + /// + /// This must not be used to set the flags to an invalid value. + #[inline] + unsafe fn update_flags(&self, set: vm_flags_t, unset: vm_flags_t) { + let mut flags = self.flags(); + flags |= set; + flags &= !unset; + + // SAFETY: This is not a data race: the vma is undergoing initial setup, so it's not yet + // shared. Additionally, `VmaNew` is `!Sync`, so it cannot be used to write in parallel. + // The caller promises that this does not set the flags to an invalid value. + unsafe { (*self.as_ptr()).__bindgen_anon_2.__vm_flags = flags }; + } + + /// Set the `VM_MIXEDMAP` flag on this vma. + /// + /// This enables the vma to contain both `struct page` and pure PFN pages. Returns a reference + /// that can be used to call `vm_insert_page` on the vma. + #[inline] + pub fn set_mixedmap(&self) -> &VmaMixedMap { + // SAFETY: We don't yet provide a way to set VM_PFNMAP, so this cannot put the flags in an + // invalid state. + unsafe { self.update_flags(flags::MIXEDMAP, 0) }; + + // SAFETY: We just set `VM_MIXEDMAP` on the vma. + unsafe { VmaMixedMap::from_raw(self.vma.as_ptr()) } + } + + /// Set the `VM_IO` flag on this vma. + /// + /// This is used for memory mapped IO and similar. The flag tells other parts of the kernel to + /// avoid looking at the pages. For memory mapped IO this is useful as accesses to the pages + /// could have side effects. + #[inline] + pub fn set_io(&self) { + // SAFETY: Setting the VM_IO flag is always okay. + unsafe { self.update_flags(flags::IO, 0) }; + } + + /// Set the `VM_DONTEXPAND` flag on this vma. + /// + /// This prevents the vma from being expanded with `mremap()`. + #[inline] + pub fn set_dontexpand(&self) { + // SAFETY: Setting the VM_DONTEXPAND flag is always okay. + unsafe { self.update_flags(flags::DONTEXPAND, 0) }; + } + + /// Set the `VM_DONTCOPY` flag on this vma. + /// + /// This prevents the vma from being copied on fork. This option is only permanent if `VM_IO` + /// is set. + #[inline] + pub fn set_dontcopy(&self) { + // SAFETY: Setting the VM_DONTCOPY flag is always okay. + unsafe { self.update_flags(flags::DONTCOPY, 0) }; + } + + /// Set the `VM_DONTDUMP` flag on this vma. + /// + /// This prevents the vma from being included in core dumps. This option is only permanent if + /// `VM_IO` is set. + #[inline] + pub fn set_dontdump(&self) { + // SAFETY: Setting the VM_DONTDUMP flag is always okay. + unsafe { self.update_flags(flags::DONTDUMP, 0) }; + } + + /// Returns whether `VM_READ` is set. + /// + /// This flag indicates whether userspace is mapping this vma as readable. + #[inline] + pub fn readable(&self) -> bool { + (self.flags() & flags::READ) != 0 + } + + /// Try to clear the `VM_MAYREAD` flag, failing if `VM_READ` is set. + /// + /// This flag indicates whether userspace is allowed to make this vma readable with + /// `mprotect()`. + /// + /// Note that this operation is irreversible. Once `VM_MAYREAD` has been cleared, it can never + /// be set again. + #[inline] + pub fn try_clear_mayread(&self) -> Result { + if self.readable() { + return Err(EINVAL); + } + // SAFETY: Clearing `VM_MAYREAD` is okay when `VM_READ` is not set. + unsafe { self.update_flags(0, flags::MAYREAD) }; + Ok(()) + } + + /// Returns whether `VM_WRITE` is set. + /// + /// This flag indicates whether userspace is mapping this vma as writable. + #[inline] + pub fn writable(&self) -> bool { + (self.flags() & flags::WRITE) != 0 + } + + /// Try to clear the `VM_MAYWRITE` flag, failing if `VM_WRITE` is set. + /// + /// This flag indicates whether userspace is allowed to make this vma writable with + /// `mprotect()`. + /// + /// Note that this operation is irreversible. Once `VM_MAYWRITE` has been cleared, it can never + /// be set again. + #[inline] + pub fn try_clear_maywrite(&self) -> Result { + if self.writable() { + return Err(EINVAL); + } + // SAFETY: Clearing `VM_MAYWRITE` is okay when `VM_WRITE` is not set. + unsafe { self.update_flags(0, flags::MAYWRITE) }; + Ok(()) + } + + /// Returns whether `VM_EXEC` is set. + /// + /// This flag indicates whether userspace is mapping this vma as executable. + #[inline] + pub fn executable(&self) -> bool { + (self.flags() & flags::EXEC) != 0 + } + + /// Try to clear the `VM_MAYEXEC` flag, failing if `VM_EXEC` is set. + /// + /// This flag indicates whether userspace is allowed to make this vma executable with + /// `mprotect()`. + /// + /// Note that this operation is irreversible. Once `VM_MAYEXEC` has been cleared, it can never + /// be set again. + #[inline] + pub fn try_clear_mayexec(&self) -> Result { + if self.executable() { + return Err(EINVAL); + } + // SAFETY: Clearing `VM_MAYEXEC` is okay when `VM_EXEC` is not set. + unsafe { self.update_flags(0, flags::MAYEXEC) }; + Ok(()) + } +} + /// The integer type used for vma flags. #[doc(inline)] pub use bindings::vm_flags_t; -- 2.50.1 From f8c78198816f04619da6eebfc5b67741bff56325 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 8 Apr 2025 09:22:44 +0000 Subject: [PATCH 15/16] rust: miscdevice: add mmap support MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Add the ability to write a file_operations->mmap hook in Rust when using the miscdevice abstraction. The `vma` argument to the `mmap` hook uses the `VmaNew` type from the previous commit; this type provides the correct set of operations for a file_operations->mmap hook. Link: https://lkml.kernel.org/r/20250408-vma-v16-7-d8b446e885d9@google.com Signed-off-by: Alice Ryhl Acked-by: Greg Kroah-Hartman Acked-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Reviewed-by: Andreas Hindborg Reviewed-by: Gary Guo Cc: Alex Gaynor Cc: Arnd Bergmann Cc: Balbir Singh Cc: Benno Lossin Cc: Björn Roy Baron Cc: Boqun Feng Cc: Jann Horn Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miguel Ojeda Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- rust/kernel/miscdevice.rs | 45 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/rust/kernel/miscdevice.rs b/rust/kernel/miscdevice.rs index fa9ecc42602a..9d9771247c38 100644 --- a/rust/kernel/miscdevice.rs +++ b/rust/kernel/miscdevice.rs @@ -14,6 +14,7 @@ use crate::{ error::{to_result, Error, Result, VTABLE_DEFAULT_ERROR}, ffi::{c_int, c_long, c_uint, c_ulong}, fs::File, + mm::virt::VmaNew, prelude::*, seq_file::SeqFile, str::CStr, @@ -119,6 +120,22 @@ pub trait MiscDevice: Sized { drop(device); } + /// Handle for mmap. + /// + /// This function is invoked when a user space process invokes the `mmap` system call on + /// `file`. The function is a callback that is part of the VMA initializer. The kernel will do + /// initial setup of the VMA before calling this function. The function can then interact with + /// the VMA initialization by calling methods of `vma`. If the function does not return an + /// error, the kernel will complete initialization of the VMA according to the properties of + /// `vma`. + fn mmap( + _device: ::Borrowed<'_>, + _file: &File, + _vma: &VmaNew, + ) -> Result { + build_error!(VTABLE_DEFAULT_ERROR) + } + /// Handler for ioctls. /// /// The `cmd` argument is usually manipulated using the utilties in [`kernel::ioctl`]. @@ -223,6 +240,33 @@ impl MiscdeviceVTable { 0 } + /// # Safety + /// + /// `file` must be a valid file that is associated with a `MiscDeviceRegistration`. + /// `vma` must be a vma that is currently being mmap'ed with this file. + unsafe extern "C" fn mmap( + file: *mut bindings::file, + vma: *mut bindings::vm_area_struct, + ) -> c_int { + // SAFETY: The mmap call of a file can access the private data. + let private = unsafe { (*file).private_data }; + // SAFETY: This is a Rust Miscdevice, so we call `into_foreign` in `open` and + // `from_foreign` in `release`, and `fops_mmap` is guaranteed to be called between those + // two operations. + let device = unsafe { ::borrow(private) }; + // SAFETY: The caller provides a vma that is undergoing initial VMA setup. + let area = unsafe { VmaNew::from_raw(vma) }; + // SAFETY: + // * The file is valid for the duration of this call. + // * There is no active fdget_pos region on the file on this thread. + let file = unsafe { File::from_raw_file(file) }; + + match T::mmap(device, file, area) { + Ok(()) => 0, + Err(err) => err.to_errno(), + } + } + /// # Safety /// /// `file` must be a valid file that is associated with a `MiscDeviceRegistration`. @@ -291,6 +335,7 @@ impl MiscdeviceVTable { const VTABLE: bindings::file_operations = bindings::file_operations { open: Some(Self::open), release: Some(Self::release), + mmap: if T::HAS_MMAP { Some(Self::mmap) } else { None }, unlocked_ioctl: if T::HAS_IOCTL { Some(Self::ioctl) } else { -- 2.50.1 From 6acb75ad7b9e1548ae7f7532312295f74e48c973 Mon Sep 17 00:00:00 2001 From: Alice Ryhl Date: Tue, 8 Apr 2025 09:22:45 +0000 Subject: [PATCH 16/16] task: rust: rework how current is accessed MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Introduce a new type called `CurrentTask` that lets you perform various operations that are only safe on the `current` task. Use the new type to provide a way to access the current mm without incrementing its refcount. With this change, you can write stuff such as let vma = current!().mm().lock_vma_under_rcu(addr); without incrementing any refcounts. This replaces the existing abstractions for accessing the current pid namespace. With the old approach, every field access to current involves both a macro and a unsafe helper function. The new approach simplifies that to a single safe function on the `CurrentTask` type. This makes it less heavy-weight to add additional current accessors in the future. That said, creating a `CurrentTask` type like the one in this patch requires that we are careful to ensure that it cannot escape the current task or otherwise access things after they are freed. To do this, I declared that it cannot escape the current "task context" where I defined a "task context" as essentially the region in which `current` remains unchanged. So e.g., release_task() or begin_new_exec() would leave the task context. If a userspace thread returns to userspace and later makes another syscall, then I consider the two syscalls to be different task contexts. This allows values stored in that task to be modified between syscalls, even if they're guaranteed to be immutable during a syscall. Ensuring correctness of `CurrentTask` is slightly tricky if we also want the ability to have a safe `kthread_use_mm()` implementation in Rust. To support that safely, there are two patterns we need to ensure are safe: // Case 1: current!() called inside the scope. let mm; kthread_use_mm(some_mm, || { mm = current!().mm(); }); drop(some_mm); mm.do_something(); // UAF and: // Case 2: current!() called before the scope. let mm; let task = current!(); kthread_use_mm(some_mm, || { mm = task.mm(); }); drop(some_mm); mm.do_something(); // UAF The existing `current!()` abstraction already natively prevents the first case: The `&CurrentTask` would be tied to the inner scope, so the borrow-checker ensures that no reference derived from it can escape the scope. Fixing the second case is a bit more tricky. The solution is to essentially pretend that the contents of the scope execute on an different thread, which means that only thread-safe types can cross the boundary. Since `CurrentTask` is marked `NotThreadSafe`, attempts to move it to another thread will fail, and this includes our fake pretend thread boundary. This has the disadvantage that other types that aren't thread-safe for reasons unrelated to `current` also cannot be moved across the `kthread_use_mm()` boundary. I consider this an acceptable tradeoff. Link: https://lkml.kernel.org/r/20250408-vma-v16-8-d8b446e885d9@google.com Signed-off-by: Alice Ryhl Acked-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Reviewed-by: Boqun Feng Reviewed-by: Andreas Hindborg Reviewed-by: Gary Guo Cc: Alex Gaynor Cc: Arnd Bergmann Cc: Balbir Singh Cc: Benno Lossin Cc: Björn Roy Baron Cc: Greg Kroah-Hartman Cc: Jann Horn Cc: John Hubbard Cc: Matthew Wilcox (Oracle) Cc: Miguel Ojeda Cc: Suren Baghdasaryan Cc: Trevor Gross Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- rust/kernel/task.rs | 247 +++++++++++++++++++++++--------------------- 1 file changed, 129 insertions(+), 118 deletions(-) diff --git a/rust/kernel/task.rs b/rust/kernel/task.rs index 9e6f6854948d..927413d85484 100644 --- a/rust/kernel/task.rs +++ b/rust/kernel/task.rs @@ -7,6 +7,7 @@ use crate::{ bindings, ffi::{c_int, c_long, c_uint}, + mm::MmWithUser, pid_namespace::PidNamespace, types::{ARef, NotThreadSafe, Opaque}, }; @@ -33,22 +34,20 @@ pub const TASK_NORMAL: c_uint = bindings::TASK_NORMAL as c_uint; #[macro_export] macro_rules! current { () => { - // SAFETY: Deref + addr-of below create a temporary `TaskRef` that cannot outlive the - // caller. + // SAFETY: This expression creates a temporary value that is dropped at the end of the + // caller's scope. The following mechanisms ensure that the resulting `&CurrentTask` cannot + // leave current task context: + // + // * To return to userspace, the caller must leave the current scope. + // * Operations such as `begin_new_exec()` are necessarily unsafe and the caller of + // `begin_new_exec()` is responsible for safety. + // * Rust abstractions for things such as a `kthread_use_mm()` scope must require the + // closure to be `Send`, so the `NotThreadSafe` field of `CurrentTask` ensures that the + // `&CurrentTask` cannot cross the scope in either direction. unsafe { &*$crate::task::Task::current() } }; } -/// Returns the currently running task's pid namespace. -#[macro_export] -macro_rules! current_pid_ns { - () => { - // SAFETY: Deref + addr-of below create a temporary `PidNamespaceRef` that cannot outlive - // the caller. - unsafe { &*$crate::task::Task::current_pid_ns() } - }; -} - /// Wraps the kernel's `struct task_struct`. /// /// # Invariants @@ -87,7 +86,7 @@ macro_rules! current_pid_ns { /// impl State { /// fn new() -> Self { /// Self { -/// creator: current!().into(), +/// creator: ARef::from(&**current!()), /// index: 0, /// } /// } @@ -107,6 +106,44 @@ unsafe impl Send for Task {} // synchronised by C code (e.g., `signal_pending`). unsafe impl Sync for Task {} +/// Represents the [`Task`] in the `current` global. +/// +/// This type exists to provide more efficient operations that are only valid on the current task. +/// For example, to retrieve the pid-namespace of a task, you must use rcu protection unless it is +/// the current task. +/// +/// # Invariants +/// +/// Each value of this type must only be accessed from the task context it was created within. +/// +/// Of course, every thread is in a different task context, but for the purposes of this invariant, +/// these operations also permanently leave the task context: +/// +/// * Returning to userspace from system call context. +/// * Calling `release_task()`. +/// * Calling `begin_new_exec()` in a binary format loader. +/// +/// Other operations temporarily create a new sub-context: +/// +/// * Calling `kthread_use_mm()` creates a new context, and `kthread_unuse_mm()` returns to the +/// old context. +/// +/// This means that a `CurrentTask` obtained before a `kthread_use_mm()` call may be used again +/// once `kthread_unuse_mm()` is called, but it must not be used between these two calls. +/// Conversely, a `CurrentTask` obtained between a `kthread_use_mm()`/`kthread_unuse_mm()` pair +/// must not be used after `kthread_unuse_mm()`. +#[repr(transparent)] +pub struct CurrentTask(Task, NotThreadSafe); + +// Make all `Task` methods available on `CurrentTask`. +impl Deref for CurrentTask { + type Target = Task; + #[inline] + fn deref(&self) -> &Task { + &self.0 + } +} + /// The type of process identifiers (PIDs). pub type Pid = bindings::pid_t; @@ -133,119 +170,29 @@ impl Task { /// /// # Safety /// - /// Callers must ensure that the returned object doesn't outlive the current task/thread. - pub unsafe fn current() -> impl Deref { - struct TaskRef<'a> { - task: &'a Task, - _not_send: NotThreadSafe, + /// Callers must ensure that the returned object is only used to access a [`CurrentTask`] + /// within the task context that was active when this function was called. For more details, + /// see the invariants section for [`CurrentTask`]. + pub unsafe fn current() -> impl Deref { + struct TaskRef { + task: *const CurrentTask, } - impl Deref for TaskRef<'_> { - type Target = Task; + impl Deref for TaskRef { + type Target = CurrentTask; fn deref(&self) -> &Self::Target { - self.task + // SAFETY: The returned reference borrows from this `TaskRef`, so it cannot outlive + // the `TaskRef`, which the caller of `Task::current()` has promised will not + // outlive the task/thread for which `self.task` is the `current` pointer. Thus, it + // is okay to return a `CurrentTask` reference here. + unsafe { &*self.task } } } - let current = Task::current_raw(); TaskRef { - // SAFETY: If the current thread is still running, the current task is valid. Given - // that `TaskRef` is not `Send`, we know it cannot be transferred to another thread - // (where it could potentially outlive the caller). - task: unsafe { &*current.cast() }, - _not_send: NotThreadSafe, - } - } - - /// Returns a PidNamespace reference for the currently executing task's/thread's pid namespace. - /// - /// This function can be used to create an unbounded lifetime by e.g., storing the returned - /// PidNamespace in a global variable which would be a bug. So the recommended way to get the - /// current task's/thread's pid namespace is to use the [`current_pid_ns`] macro because it is - /// safe. - /// - /// # Safety - /// - /// Callers must ensure that the returned object doesn't outlive the current task/thread. - pub unsafe fn current_pid_ns() -> impl Deref { - struct PidNamespaceRef<'a> { - task: &'a PidNamespace, - _not_send: NotThreadSafe, - } - - impl Deref for PidNamespaceRef<'_> { - type Target = PidNamespace; - - fn deref(&self) -> &Self::Target { - self.task - } - } - - // The lifetime of `PidNamespace` is bound to `Task` and `struct pid`. - // - // The `PidNamespace` of a `Task` doesn't ever change once the `Task` is alive. A - // `unshare(CLONE_NEWPID)` or `setns(fd_pidns/pidfd, CLONE_NEWPID)` will not have an effect - // on the calling `Task`'s pid namespace. It will only effect the pid namespace of children - // created by the calling `Task`. This invariant guarantees that after having acquired a - // reference to a `Task`'s pid namespace it will remain unchanged. - // - // When a task has exited and been reaped `release_task()` will be called. This will set - // the `PidNamespace` of the task to `NULL`. So retrieving the `PidNamespace` of a task - // that is dead will return `NULL`. Note, that neither holding the RCU lock nor holding a - // referencing count to - // the `Task` will prevent `release_task()` being called. - // - // In order to retrieve the `PidNamespace` of a `Task` the `task_active_pid_ns()` function - // can be used. There are two cases to consider: - // - // (1) retrieving the `PidNamespace` of the `current` task - // (2) retrieving the `PidNamespace` of a non-`current` task - // - // From system call context retrieving the `PidNamespace` for case (1) is always safe and - // requires neither RCU locking nor a reference count to be held. Retrieving the - // `PidNamespace` after `release_task()` for current will return `NULL` but no codepath - // like that is exposed to Rust. - // - // Retrieving the `PidNamespace` from system call context for (2) requires RCU protection. - // Accessing `PidNamespace` outside of RCU protection requires a reference count that - // must've been acquired while holding the RCU lock. Note that accessing a non-`current` - // task means `NULL` can be returned as the non-`current` task could have already passed - // through `release_task()`. - // - // To retrieve (1) the `current_pid_ns!()` macro should be used which ensure that the - // returned `PidNamespace` cannot outlive the calling scope. The associated - // `current_pid_ns()` function should not be called directly as it could be abused to - // created an unbounded lifetime for `PidNamespace`. The `current_pid_ns!()` macro allows - // Rust to handle the common case of accessing `current`'s `PidNamespace` without RCU - // protection and without having to acquire a reference count. - // - // For (2) the `task_get_pid_ns()` method must be used. This will always acquire a - // reference on `PidNamespace` and will return an `Option` to force the caller to - // explicitly handle the case where `PidNamespace` is `None`, something that tends to be - // forgotten when doing the equivalent operation in `C`. Missing RCU primitives make it - // difficult to perform operations that are otherwise safe without holding a reference - // count as long as RCU protection is guaranteed. But it is not important currently. But we - // do want it in the future. - // - // Note for (2) the required RCU protection around calling `task_active_pid_ns()` - // synchronizes against putting the last reference of the associated `struct pid` of - // `task->thread_pid`. The `struct pid` stored in that field is used to retrieve the - // `PidNamespace` of the caller. When `release_task()` is called `task->thread_pid` will be - // `NULL`ed and `put_pid()` on said `struct pid` will be delayed in `free_pid()` via - // `call_rcu()` allowing everyone with an RCU protected access to the `struct pid` acquired - // from `task->thread_pid` to finish. - // - // SAFETY: The current task's pid namespace is valid as long as the current task is running. - let pidns = unsafe { bindings::task_active_pid_ns(Task::current_raw()) }; - PidNamespaceRef { - // SAFETY: If the current thread is still running, the current task and its associated - // pid namespace are valid. `PidNamespaceRef` is not `Send`, so we know it cannot be - // transferred to another thread (where it could potentially outlive the current - // `Task`). The caller needs to ensure that the PidNamespaceRef doesn't outlive the - // current task/thread. - task: unsafe { PidNamespace::from_ptr(pidns) }, - _not_send: NotThreadSafe, + // CAST: The layout of `struct task_struct` and `CurrentTask` is identical. + task: Task::current_raw().cast(), } } @@ -328,6 +275,70 @@ impl Task { } } +impl CurrentTask { + /// Access the address space of the current task. + /// + /// This function does not touch the refcount of the mm. + #[inline] + pub fn mm(&self) -> Option<&MmWithUser> { + // SAFETY: The `mm` field of `current` is not modified from other threads, so reading it is + // not a data race. + let mm = unsafe { (*self.as_ptr()).mm }; + + if mm.is_null() { + return None; + } + + // SAFETY: If `current->mm` is non-null, then it references a valid mm with a non-zero + // value of `mm_users`. Furthermore, the returned `&MmWithUser` borrows from this + // `CurrentTask`, so it cannot escape the scope in which the current pointer was obtained. + // + // This is safe even if `kthread_use_mm()`/`kthread_unuse_mm()` are used. There are two + // relevant cases: + // * If the `&CurrentTask` was created before `kthread_use_mm()`, then it cannot be + // accessed during the `kthread_use_mm()`/`kthread_unuse_mm()` scope due to the + // `NotThreadSafe` field of `CurrentTask`. + // * If the `&CurrentTask` was created within a `kthread_use_mm()`/`kthread_unuse_mm()` + // scope, then the `&CurrentTask` cannot escape that scope, so the returned `&MmWithUser` + // also cannot escape that scope. + // In either case, it's not possible to read `current->mm` and keep using it after the + // scope is ended with `kthread_unuse_mm()`. + Some(unsafe { MmWithUser::from_raw(mm) }) + } + + /// Access the pid namespace of the current task. + /// + /// This function does not touch the refcount of the namespace or use RCU protection. + /// + /// To access the pid namespace of another task, see [`Task::get_pid_ns`]. + #[doc(alias = "task_active_pid_ns")] + #[inline] + pub fn active_pid_ns(&self) -> Option<&PidNamespace> { + // SAFETY: It is safe to call `task_active_pid_ns` without RCU protection when calling it + // on the current task. + let active_ns = unsafe { bindings::task_active_pid_ns(self.as_ptr()) }; + + if active_ns.is_null() { + return None; + } + + // The lifetime of `PidNamespace` is bound to `Task` and `struct pid`. + // + // The `PidNamespace` of a `Task` doesn't ever change once the `Task` is alive. + // + // From system call context retrieving the `PidNamespace` for the current task is always + // safe and requires neither RCU locking nor a reference count to be held. Retrieving the + // `PidNamespace` after `release_task()` for current will return `NULL` but no codepath + // like that is exposed to Rust. + // + // SAFETY: If `current`'s pid ns is non-null, then it references a valid pid ns. + // Furthermore, the returned `&PidNamespace` borrows from this `CurrentTask`, so it cannot + // escape the scope in which the current pointer was obtained, e.g. it cannot live past a + // `release_task()` call. + Some(unsafe { PidNamespace::from_ptr(active_ns) }) + } +} + // SAFETY: The type invariants guarantee that `Task` is always refcounted. unsafe impl crate::types::AlwaysRefCounted for Task { fn inc_ref(&self) { -- 2.50.1