From a8d457b29b017a8499ff885d64804de3ff203dee Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:32 +0200 Subject: [PATCH 01/16] arch/sparc: teach arch_get_unmapped_area{_topdown} to handle hugetlb mappings We want to stop special casing hugetlb mappings and make them go through generic channels, so teach arch_get_unmapped_area{_topdown} to handle those. sparc specific hugetlb function does not set info.align_offset, and does not care about adjusting the align_mask for MAP_SHARED cases, so the same here for compatibility. Link: https://lkml.kernel.org/r/20241007075037.267650-5-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/sparc/kernel/sys_sparc_32.c | 17 +++++++++++---- arch/sparc/kernel/sys_sparc_64.c | 37 +++++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index 80822f922e76..fb31bc0c5b48 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -42,12 +43,16 @@ SYSCALL_DEFINE0(getpagesize) unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct vm_unmapped_area_info info = {}; + bool file_hugepage = false; + + if (filp && is_file_hugepages(filp)) + file_hugepage = true; if (flags & MAP_FIXED) { /* We do not accept a shared mapping if it would violate * cache aliasing constraints. */ - if ((flags & MAP_SHARED) && + if (!file_hugepage && (flags & MAP_SHARED) && ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) return -EINVAL; return addr; @@ -62,9 +67,13 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi info.length = len; info.low_limit = addr; info.high_limit = TASK_SIZE; - info.align_mask = (flags & MAP_SHARED) ? - (PAGE_MASK & (SHMLBA - 1)) : 0; - info.align_offset = pgoff << PAGE_SHIFT; + if (!file_hugepage) { + info.align_mask = (flags & MAP_SHARED) ? 
+ (PAGE_MASK & (SHMLBA - 1)) : 0; + info.align_offset = pgoff << PAGE_SHIFT; + } else { + info.align_mask = huge_page_mask_align(filp); + } return vm_unmapped_area(&info); } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index acade309dc2f..c5a284df7b41 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -87,6 +88,16 @@ static inline unsigned long COLOR_ALIGN(unsigned long addr, return base + off; } +static unsigned long get_align_mask(struct file *filp, unsigned long flags) +{ + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); + if (filp || (flags & MAP_SHARED)) + return PAGE_MASK & (SHMLBA - 1); + + return 0; +} + unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; @@ -94,12 +105,16 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi unsigned long task_size = TASK_SIZE; int do_color_align; struct vm_unmapped_area_info info = {}; + bool file_hugepage = false; + + if (filp && is_file_hugepages(filp)) + file_hugepage = true; if (flags & MAP_FIXED) { /* We do not accept a shared mapping if it would violate * cache aliasing constraints. */ - if ((flags & MAP_SHARED) && + if (!file_hugepage && (flags & MAP_SHARED) && ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) return -EINVAL; return addr; @@ -111,7 +126,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi return -ENOMEM; do_color_align = 0; - if (filp || (flags & MAP_SHARED)) + if ((filp || (flags & MAP_SHARED)) && !file_hugepage) do_color_align = 1; if (addr) { @@ -129,8 +144,9 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi info.length = len; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = min(task_size, VA_EXCLUDE_START); - info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0; - info.align_offset = pgoff << PAGE_SHIFT; + info.align_mask = get_align_mask(filp, flags); + if (!file_hugepage) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) { @@ -154,15 +170,19 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, unsigned long addr = addr0; int do_color_align; struct vm_unmapped_area_info info = {}; + bool file_hugepage = false; /* This should only ever run for 32-bit processes. */ BUG_ON(!test_thread_flag(TIF_32BIT)); + if (filp && is_file_hugepages(filp)) + file_hugepage = true; + if (flags & MAP_FIXED) { /* We do not accept a shared mapping if it would violate * cache aliasing constraints. */ - if ((flags & MAP_SHARED) && + if (!file_hugepage && (flags & MAP_SHARED) && ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) return -EINVAL; return addr; @@ -172,7 +192,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return -ENOMEM; do_color_align = 0; - if (filp || (flags & MAP_SHARED)) + if ((filp || (flags & MAP_SHARED)) && !file_hugepage) do_color_align = 1; /* requesting a specific address */ @@ -192,8 +212,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; - info.align_mask = do_color_align ? 
(PAGE_MASK & (SHMLBA - 1)) : 0; - info.align_offset = pgoff << PAGE_SHIFT; + info.align_mask = get_align_mask(filp, flags); + if (!file_hugepage) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); /* -- 2.51.0 From 5959ffabbb67dfdd0cd4623e675a24005de66393 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:33 +0200 Subject: [PATCH 02/16] arch/powerpc: teach book3s64 arch_get_unmapped_area{_topdown} to handle hugetlb mappings We want to stop special casing hugetlb mappings and make them go through generic channels, so teach arch_get_unmapped_area{_topdown} to handle those. Reshuffle file_to_psize() definition so arch_get_unmapped_area{_topdown} can make use of it. Link: https://lkml.kernel.org/r/20241007075037.267650-6-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/slice.c | 40 ++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c index 87307d0fc3b8..3a858f6b7270 100644 --- a/arch/powerpc/mm/book3s64/slice.c +++ b/arch/powerpc/mm/book3s64/slice.c @@ -633,6 +633,20 @@ return_addr: } EXPORT_SYMBOL_GPL(slice_get_unmapped_area); +#ifdef CONFIG_HUGETLB_PAGE +static int file_to_psize(struct file *file) +{ + struct hstate *hstate = hstate_file(file); + + return shift_to_mmu_psize(huge_page_shift(hstate)); +} +#else +static int file_to_psize(struct file *file) +{ + return 0; +} +#endif + unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, @@ -640,11 +654,17 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long flags, vm_flags_t vm_flags) { + unsigned int psize; + if (radix_enabled()) return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); - return slice_get_unmapped_area(addr, len, flags, - mm_ctx_user_psize(¤t->mm->context), 0); + if (filp && is_file_hugepages(filp)) + psize = file_to_psize(filp); + else + psize = mm_ctx_user_psize(¤t->mm->context); + + return slice_get_unmapped_area(addr, len, flags, psize, 0); } unsigned long arch_get_unmapped_area_topdown(struct file *filp, @@ -654,11 +674,17 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long flags, vm_flags_t vm_flags) { + unsigned int psize; + if (radix_enabled()) return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags, vm_flags); - return slice_get_unmapped_area(addr0, len, flags, - mm_ctx_user_psize(¤t->mm->context), 1); + if (filp && is_file_hugepages(filp)) + psize = file_to_psize(filp); + else + psize = mm_ctx_user_psize(¤t->mm->context); + + return slice_get_unmapped_area(addr0, len, flags, psize, 1); } unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr) @@ -789,12 +815,6 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start)); } -static int file_to_psize(struct file *file) -{ - struct hstate *hstate = hstate_file(file); - return shift_to_mmu_psize(huge_page_shift(hstate)); -} - unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) -- 2.51.0 From 7bd3f1e1a9ae7f7508c88dd056755a0a4741ae88 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:34 +0200 Subject: [PATCH 
03/16] mm: make hugetlb mappings go through mm_get_unmapped_area_vmflags Hugetlb mappings will no longer be special cased but rather go through the generic mm_get_unmapped_area_vmflags function. For that to happen, let us remove the .get_unmapped_area from hugetlbfs_file_operations struct, and hint __get_unmapped_area that it should not send hugetlb mappings through thp_get_unmapped_area_vmflags but through mm_get_unmapped_area_vmflags. Create also a function called hugetlb_mmap_check_and_align() where a couple of safety checks are being done and the addr is aligned to the huge page size. Otherwise we will have to do this in every single function, which duplicates quite a lot of code. Link: https://lkml.kernel.org/r/20241007075037.267650-7-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 24 ++++++++++++++++-------- include/linux/hugetlb.h | 9 ++++----- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5cf327337e22..2c5f34e315d2 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -258,15 +258,23 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, pgoff, flags); } -#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA -static unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) +unsigned long +__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long flags) { - return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); + unsigned long addr0 = 0; + struct hstate *h = hstate_file(file); + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if ((flags & MAP_FIXED) && prepare_hugepage_range(file, addr, len)) + return -EINVAL; + if (addr) + addr0 = ALIGN(addr, huge_page_size(h)); + + return mm_get_unmapped_area_vmflags(current->mm, file, addr, len, pgoff, + flags, 0); } -#endif /* * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. 
@@ -1300,7 +1308,7 @@ static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, - .get_unmapped_area = hugetlb_get_unmapped_area, + .get_unmapped_area = __hugetlb_get_unmapped_area, .llseek = default_llseek, .fallocate = hugetlbfs_fallocate, .fop_flags = FOP_HUGE_PAGES, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 368d552e4860..3a81b6126f62 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -546,11 +546,10 @@ static inline struct hstate *hstate_inode(struct inode *i) } #endif /* !CONFIG_HUGETLBFS */ -#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); -#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ +unsigned long +__generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); unsigned long generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, -- 2.51.0 From cc92882ee218d62ef017fa545b3c8a2d1e060a5a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:35 +0200 Subject: [PATCH 04/16] mm: drop hugetlb_get_unmapped_area{_*} functions Hugetlb mappings are now handled through normal channels just like any other mapping, so we no longer need hugetlb_get_unmapped_area* specific functions. Link: https://lkml.kernel.org/r/20241007075037.267650-8-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/parisc/mm/hugetlbpage.c | 21 ------ arch/powerpc/mm/book3s64/slice.c | 10 --- arch/s390/mm/hugetlbpage.c | 85 ------------------------ arch/sparc/mm/hugetlbpage.c | 108 ------------------------------- arch/x86/mm/hugetlbpage.c | 101 ----------------------------- fs/hugetlbfs/inode.c | 96 ++------------------------- include/linux/hugetlb.h | 7 +- 7 files changed, 6 insertions(+), 422 deletions(-) diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index aa664f7ddb63..e9d18cf25b79 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -21,27 +21,6 @@ #include -unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > TASK_SIZE) - return -ENOMEM; - - if (flags & MAP_FIXED) - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - - if (addr) - addr = ALIGN(addr, huge_page_size(h)); - - /* we need to make sure the colouring is OK */ - return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0); -} pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c index 3a858f6b7270..bc9a39821d1c 100644 --- a/arch/powerpc/mm/book3s64/slice.c +++ b/arch/powerpc/mm/book3s64/slice.c @@ -814,14 +814,4 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start)); } - -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - if (radix_enabled()) - return 
generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); - - return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1); -} #endif diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index ded0eff58a19..7c79cf1bc7d7 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -242,88 +242,3 @@ bool __init arch_hugetlb_valid_size(unsigned long size) else return false; } - -static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.length = len; - info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - return vm_unmapped_area(&info); -} - -static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr0, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - unsigned long addr; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = current->mm->mmap_base; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > TASK_SIZE - mmap_min_addr) - return -ENOMEM; - - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - goto check_asce_limit; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - goto check_asce_limit; - } - - if (!test_bit(MMF_TOPDOWN, &mm->flags)) - addr = hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); - else - addr = hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); - if (offset_in_page(addr)) - return addr; - -check_asce_limit: - return check_asce_limit(mm, addr, len); -} diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index cc91ca7a1e18..eee601a0d2cf 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -19,114 +19,6 @@ #include #include -/* Slightly simplified from the non-hugepage variant because by - * definition we don't have to worry about any page coloring stuff - */ - -static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp, - unsigned long addr, - unsigned long len, - unsigned long pgoff, - unsigned long flags) -{ - struct hstate *h = hstate_file(filp); - unsigned long task_size = TASK_SIZE; - struct vm_unmapped_area_info info = {}; - - if (test_thread_flag(TIF_32BIT)) - task_size = STACK_TOP32; - - info.length = len; - info.low_limit = 
TASK_UNMAPPED_BASE; - info.high_limit = min(task_size, VA_EXCLUDE_START); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) { - VM_BUG_ON(addr != -ENOMEM); - info.low_limit = VA_EXCLUDE_END; - info.high_limit = task_size; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -static unsigned long -hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, - const unsigned long pgoff, - const unsigned long flags) -{ - struct hstate *h = hstate_file(filp); - struct mm_struct *mm = current->mm; - unsigned long addr = addr0; - struct vm_unmapped_area_info info = {}; - - /* This should only ever run for 32-bit processes. */ - BUG_ON(!test_thread_flag(TIF_32BIT)); - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = mm->mmap_base; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = STACK_TOP32; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long task_size = TASK_SIZE; - - if (test_thread_flag(TIF_32BIT)) - task_size = STACK_TOP32; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > task_size) - return -ENOMEM; - - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (task_size - len >= addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - if (!test_bit(MMF_TOPDOWN, &mm->flags)) - return hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); - else - return hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); -} static pte_t sun4u_hugepage_shift_to_tte(pte_t entry, unsigned int shift) { diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 807a5859a3c4..58f7f2bd535d 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -19,107 +19,6 @@ #include #include -#ifdef CONFIG_HUGETLB_PAGE -static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.length = len; - info.low_limit = get_mmap_base(1); - - /* - * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area - * in the full address space. - */ - info.high_limit = in_32bit_syscall() ? 
- task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); - - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - return vm_unmapped_area(&info); -} - -static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = get_mmap_base(0); - - /* - * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area - * in the full address space. - */ - if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) - info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; - - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE_LOW; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - - if (len > TASK_SIZE) - return -ENOMEM; - - /* No address checking. See comment at mmap_address_hint_valid() */ - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr &= huge_page_mask(h); - if (!mmap_address_hint_valid(addr, len)) - goto get_unmapped_area; - - vma = find_vma(mm, addr); - if (!vma || addr + len <= vm_start_gap(vma)) - return addr; - } - -get_unmapped_area: - if (!test_bit(MMF_TOPDOWN, &mm->flags)) - return hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); - else - return hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); -} -#endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_X86_64 bool __init arch_hugetlb_valid_size(unsigned long size) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2c5f34e315d2..935c0ed3aa1e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -171,96 +171,10 @@ out: * Called under mmap_write_lock(mm). 
*/ -static unsigned long -hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.length = len; - info.low_limit = current->mm->mmap_base; - info.high_limit = arch_get_mmap_end(addr, len, flags); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - return vm_unmapped_area(&info); -} - -static unsigned long -hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - if (unlikely(offset_in_page(addr))) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = current->mm->mmap_base; - info.high_limit = arch_get_mmap_end(addr, len, flags); - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long -generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev; - struct hstate *h = hstate_file(file); - const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > mmap_end - mmap_min_addr) - return -ENOMEM; - - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma_prev(mm, addr, &prev); - if (mmap_end - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma)) && - (!prev || addr >= vm_end_gap(prev))) - return addr; - } - - /* - * Use MMF_TOPDOWN flag as a hint to use topdown routine. - * If architectures have special needs, they should define their own - * version of hugetlb_get_unmapped_area. 
- */ - if (test_bit(MMF_TOPDOWN, &mm->flags)) - return hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); - return hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); -} - unsigned long -__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long flags) +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { unsigned long addr0 = 0; struct hstate *h = hstate_file(file); @@ -272,7 +186,7 @@ __hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) addr0 = ALIGN(addr, huge_page_size(h)); - return mm_get_unmapped_area_vmflags(current->mm, file, addr, len, pgoff, + return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff, flags, 0); } @@ -1308,7 +1222,7 @@ static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, - .get_unmapped_area = __hugetlb_get_unmapped_area, + .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, .fallocate = hugetlbfs_fallocate, .fop_flags = FOP_HUGE_PAGES, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3a81b6126f62..ae4fe8615bb6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -547,15 +547,10 @@ static inline struct hstate *hstate_inode(struct inode *i) #endif /* !CONFIG_HUGETLBFS */ unsigned long -__generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -unsigned long -generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); - /* * huegtlb page specific state flags. These flags are located in page.private * of the hugetlb head page. Functions created via the below macros should be -- 2.51.0 From 5b2f650d593ed4d020228df8563e7ad23abc847f Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:36 +0200 Subject: [PATCH 05/16] arch/s390: clean up hugetlb definitions s390 redefines functions that are already defined (and the same) in include/asm-generic/hugetlb.h. Do as the other architectures: 1) include include/asm-generic/hugetlb.h 2) drop the already defined functions in the generic hugetlb.h and 3) use the __HAVE_ARCH_HUGE_* macros to define our own. This gets rid of quite some code. 
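As a rough sketch of the pattern this relies on (using huge_pte_none(), one of the helpers this patch actually converts; the generic header only compiles its default when the architecture has not claimed the override):

	/* include/asm-generic/hugetlb.h: default, skipped if overridden */
	#ifndef __HAVE_ARCH_HUGE_PTE_NONE
	static inline int huge_pte_none(pte_t pte)
	{
		return pte_none(pte);
	}
	#endif

	/* arch/s390/include/asm/hugetlb.h: claim the override, provide the
	 * arch version, then pull in the generic defaults for the rest. */
	#define __HAVE_ARCH_HUGE_PTE_NONE
	static inline int huge_pte_none(pte_t pte)
	{
		return pte_none(pte);
	}
	#include <asm-generic/hugetlb.h>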
Link: https://lkml.kernel.org/r/20241007075037.267650-9-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/s390/include/asm/hugetlb.h | 58 +++++++++------------------------ include/asm-generic/hugetlb.h | 8 +++++ 2 files changed, 24 insertions(+), 42 deletions(-) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index cf1b5d6fb1a6..37e80a32623a 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -12,21 +12,24 @@ #include #include -#define hugetlb_free_pgd_range free_pgd_range #define hugepages_supported() (MACHINE_HAS_EDAT1) +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz); void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); -pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, +#define __HAVE_ARCH_HUGE_PTEP_GET +extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ +#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { @@ -45,6 +48,7 @@ static inline void arch_clear_hugetlb_flags(struct folio *folio) } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags +#define __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { @@ -54,12 +58,14 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, set_pte(ptep, __pte(_SEGMENT_ENTRY_EMPTY)); } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) @@ -72,6 +78,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -79,69 +86,36 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } -static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) -{ - return mk_pte(page, pgprot); -} - +#define __HAVE_ARCH_HUGE_PTE_NONE static inline int huge_pte_none(pte_t pte) { return pte_none(pte); } +#define __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY static inline int huge_pte_none_mostly(pte_t pte) { return huge_pte_none(pte); } -static inline int huge_pte_write(pte_t pte) -{ - return pte_write(pte); -} - -static inline int huge_pte_dirty(pte_t pte) -{ - return pte_dirty(pte); -} - -static inline pte_t huge_pte_mkwrite(pte_t pte) -{ - return pte_mkwrite_novma(pte); -} - -static inline pte_t huge_pte_mkdirty(pte_t pte) -{ - return pte_mkdirty(pte); -} - -static inline pte_t 
huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - -static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) -{ - return pte_modify(pte, newprot); -} - +#define __HAVE_ARCH_HUGE_PTE_MKUFFD_WP static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { return pte; } +#define __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) { return pte; } +#define __HAVE_ARCH_HUGE_PTE_UFFD_WP static inline int huge_pte_uffd_wp(pte_t pte) { return 0; } -static inline bool gigantic_page_runtime_supported(void) -{ - return true; -} +#include #endif /* _ASM_S390_HUGETLB_H */ diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 594d5905f615..67bbdafcfc22 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -42,20 +42,26 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +#ifndef __HAVE_ARCH_HUGE_PTE_MKUFFD_WP static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { return huge_pte_wrprotect(pte_mkuffd_wp(pte)); } +#endif +#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) { return pte_clear_uffd_wp(pte); } +#endif +#ifndef __HAVE_ARCH_HUGE_PTE_UFFD_WP static inline int huge_pte_uffd_wp(pte_t pte) { return pte_uffd_wp(pte); } +#endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, @@ -106,10 +112,12 @@ static inline int huge_pte_none(pte_t pte) #endif /* Please refer to comments above pte_none_mostly() for the usage */ +#ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY static inline int huge_pte_none_mostly(pte_t pte) { return huge_pte_none(pte) || is_pte_marker(pte); } +#endif #ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, -- 2.51.0 From bd40b053fabe27209cb240d205a0c817cbe5fb87 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:37 +0200 Subject: [PATCH 06/16] mm: consolidate common checks in hugetlb_get_unmapped_area prepare_hugepage_range() performs almost the same checks for all architectures that define it, with the exception of mips and loongarch that also check for overflows. The rest checks for the addr and len to be properly aligned, so we can move that to hugetlb_get_unmapped_area() and get rid of a fair amount of duplicated code. 
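With those per-arch copies gone, the common alignment handling reduces to the following sketch, mirroring the fs/hugetlbfs/inode.c hunk further below; prepare_hugepage_range() is then only responsible for any remaining arch-specific checks:

	struct hstate *h = hstate_file(file);

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if (flags & MAP_FIXED) {
		/* a fixed mapping must already be huge page aligned */
		if (addr & ~huge_page_mask(h))
			return -EINVAL;
		if (prepare_hugepage_range(file, addr, len))
			return -EINVAL;
	}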
[akpm@linux-foundation.org: remove now-unused local] Link: https://lore.kernel.org/oe-kbuild-all/202410081210.uNLbf3Jk-lkp@intel.com/ Link: https://lkml.kernel.org/r/20241007075037.267650-10-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/hugetlb.h | 5 ----- arch/mips/include/asm/hugetlb.h | 5 ----- arch/parisc/include/asm/hugetlb.h | 15 --------------- arch/s390/include/asm/hugetlb.h | 17 ----------------- arch/sh/include/asm/hugetlb.h | 15 --------------- fs/hugetlbfs/inode.c | 8 ++++++-- include/asm-generic/hugetlb.h | 7 ------- 7 files changed, 6 insertions(+), 66 deletions(-) diff --git a/arch/loongarch/include/asm/hugetlb.h b/arch/loongarch/include/asm/hugetlb.h index 5da32c00d483..b837c65a4894 100644 --- a/arch/loongarch/include/asm/hugetlb.h +++ b/arch/loongarch/include/asm/hugetlb.h @@ -16,12 +16,7 @@ static inline int prepare_hugepage_range(struct file *file, unsigned long len) { unsigned long task_size = STACK_TOP; - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; if (len > task_size) return -ENOMEM; if (task_size - len < addr) diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index fd69c8808554..d0a86ce83de9 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -17,12 +17,7 @@ static inline int prepare_hugepage_range(struct file *file, unsigned long len) { unsigned long task_size = STACK_TOP; - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; if (len > task_size) return -ENOMEM; if (task_size - len < addr) diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 72daacc472a0..5b3a5429f71b 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -12,21 +12,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. - */ -#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 37e80a32623a..6f815d4ba0ca 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -25,23 +25,6 @@ extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. 
- */ -#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - static inline void arch_clear_hugetlb_flags(struct folio *folio) { clear_bit(PG_arch_1, &folio->flags); diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 75028bd568ba..4a92e6e4d627 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -5,21 +5,6 @@ #include #include -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. - */ -#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 935c0ed3aa1e..c6191a6118b8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -181,8 +181,12 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len & ~huge_page_mask(h)) return -EINVAL; - if ((flags & MAP_FIXED) && prepare_hugepage_range(file, addr, len)) - return -EINVAL; + if (flags & MAP_FIXED) { + if (addr & ~huge_page_mask(h)) + return -EINVAL; + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + } if (addr) addr0 = ALIGN(addr, huge_page_size(h)); diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 67bbdafcfc22..f42133dae68e 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -123,13 +123,6 @@ static inline int huge_pte_none_mostly(pte_t pte) static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { - struct hstate *h = hstate_file(file); - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; } #endif -- 2.51.0 From 018d24539d9ed7531245a381ba24f5d9e8714682 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Mon, 7 Oct 2024 17:19:42 -0700 Subject: [PATCH 07/16] percpu: fix data race with pcpu_nr_empty_pop_pages Fixes the data race by moving the read to be behind the pcpu_lock. This is okay because the code (initially) above it will not increase the empty populated page count because it is populating backing pages that already have allocations served out of them. 
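Sketched against the mm/percpu.c hunk below, the read now happens before the lock is dropped:

	area_found:
		pcpu_stats_area_alloc(chunk, size);

		/* pcpu_nr_empty_pop_pages is read under pcpu_lock */
		if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
			pcpu_schedule_balance_work();

		spin_unlock_irqrestore(&pcpu_lock, flags);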
Link: https://lkml.kernel.org/r/20241008001942.8114-1-dennis@kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202407191651.f24e499d-oliver.sang@intel.com Signed-off-by: Dennis Zhou Cc: Christoph Lameter Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/percpu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index da21680ff294..d1a73cf65c53 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1864,6 +1864,10 @@ restart: area_found: pcpu_stats_area_alloc(chunk, size); + + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + pcpu_schedule_balance_work(); + spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ @@ -1891,9 +1895,6 @@ area_found: mutex_unlock(&pcpu_alloc_mutex); } - if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) - pcpu_schedule_balance_work(); - /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); -- 2.51.0 From 077c7c1e099f46b4abd0babf233d476597a4823a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 7 Oct 2024 18:20:09 -0700 Subject: [PATCH 08/16] mm/memory.c: remove stray newline at top of file Fixes: d61ea1cb0095 ("userfaultfd: UFFD_FEATURE_WP_ASYNC") Reported-by: Jeongjun Park Closes: https://lkml.kernel.org/r/20241007065307.4158-1-aha310510@gmail.com Cc: Muhammad Usama Anjum Cc: Peter Xu Cc: Greg KH Signed-off-by: Andrew Morton --- mm/memory.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 6bda739a60e8..5e9d6a22eb08 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1,4 +1,3 @@ - // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory.c -- 2.51.0 From 150e0fb86d69caec7aec48705e563e9336b7a4a6 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Tue, 8 Oct 2024 14:23:53 +0100 Subject: [PATCH 09/16] MAINTAINERS: mailmap: update Alexey Klimov's email address My new address is alexey.klimov@linaro.org Link: https://lkml.kernel.org/r/20241008132353.68767-1-alexey.klimov@linaro.org Signed-off-by: Alexey Klimov Cc: Srinivas Kandagatla Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.mailmap b/.mailmap index 5378f04b2566..b2cc9ee33caa 100644 --- a/.mailmap +++ b/.mailmap @@ -37,6 +37,7 @@ Alexei Avshalom Lazar Alexei Starovoitov Alexei Starovoitov Alexei Starovoitov +Alexey Klimov Alexey Makhalov Alex Elder Alex Elder diff --git a/MAINTAINERS b/MAINTAINERS index bdae0faf000c..27005b0fada9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7855,7 +7855,7 @@ F: Documentation/gpu/automated_testing.rst F: drivers/gpu/drm/ci/ DSBR100 USB FM RADIO DRIVER -M: Alexey Klimov +M: Alexey Klimov L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git @@ -13568,7 +13568,7 @@ Q: http://patchwork.linuxtv.org/project/linux-media/list/ F: drivers/media/dvb-frontends/m88rs2000* MA901 MASTERKIT USB FM RADIO DRIVER -M: Alexey Klimov +M: Alexey Klimov L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git @@ -15704,7 +15704,7 @@ F: Documentation/hwmon/mp9941.rst F: drivers/hwmon/pmbus/mp9941.c MR800 AVERMEDIA USB FM RADIO DRIVER -M: Alexey Klimov +M: Alexey Klimov L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git -- 2.51.0 From ebcfc63d6bca3cce1bfca30092712f3468b4ecff Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Tue, 8 Oct 2024 11:47:45 +0530 Subject: [PATCH 
10/16] mm: abstract THP allocation Patch series "Do not shatter hugezeropage on wp-fault", v7. It was observed at [1] and [2] that the current kernel behaviour of shattering a hugezeropage is inconsistent and suboptimal. For a VMA with a THP allowable order, when we write-fault on it, the kernel installs a PMD-mapped THP. On the other hand, if we first get a read fault, we get a PMD pointing to the hugezeropage; subsequent write will trigger a write-protection fault, shattering the hugezeropage into one writable page, and all the other PTEs write-protected. The conclusion being, as compared to the case of a single write-fault, applications have to suffer 512 extra page faults if they were to use the VMA as such, plus we get the overhead of khugepaged trying to replace that area with a THP anyway. Instead, replace the hugezeropage with a THP on wp-fault. [1]: https://lore.kernel.org/all/3743d7e1-0b79-4eaf-82d5-d1ca29fe347d@arm.com/ [2]: https://lore.kernel.org/all/1cfae0c0-96a2-4308-9c62-f7a640520242@arm.com/ This patch (of 2): In preparation for the second patch, abstract away the THP allocation logic present in the create_huge_pmd() path, which corresponds to the faulting case when no page is present. There should be no functional change as a result of applying this patch, except that, as David notes at [1], a PMD-aligned address should be passed to update_mmu_cache_pmd(). [1]: https://lore.kernel.org/all/ddd3fcd2-48b3-4170-bcaa-2fe66e093f43@redhat.com/ Link: https://lkml.kernel.org/r/20241008061746.285961-1-dev.jain@arm.com Link: https://lkml.kernel.org/r/20241008061746.285961-2-dev.jain@arm.com Signed-off-by: Dev Jain Acked-by: David Hildenbrand Reviewed-by: Kefeng Wang Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: Christoph Lameter Cc: Dave Hansen Cc: Hugh Dickins Cc: Jan Kara Cc: Kirill A. 
Shutemov Cc: Lance Yang Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michal Hocko Cc: Peter Xu Cc: Ryan Roberts Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 98 ++++++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 41 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e71b58d84cba..4cf29ebed92b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1136,47 +1136,81 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, - struct page *page, gfp_t gfp) +static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, + unsigned long addr) { - struct vm_area_struct *vma = vmf->vma; - struct folio *folio = page_folio(page); - pgtable_t pgtable; - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - vm_fault_t ret = 0; + gfp_t gfp = vma_thp_gfp_mask(vma); + const int order = HPAGE_PMD_ORDER; + struct folio *folio; - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true); + if (unlikely(!folio)) { + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + return NULL; + } + + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); - return VM_FAULT_FALLBACK; + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + return NULL; } folio_throttle_swaprate(folio, gfp); - pgtable = pte_alloc_one(vma->vm_mm); - if (unlikely(!pgtable)) { - ret = VM_FAULT_OOM; - goto release; - } - - folio_zero_user(folio, vmf->address); + folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that * folio_zero_user writes become visible before the set_pmd_at() * write. 
*/ __folio_mark_uptodate(folio); + return folio; +} + +static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr) +{ + pmd_t entry; + + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, vma); + set_pmd_at(vma->vm_mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, haddr, pmd); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); +} + +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) +{ + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct folio *folio; + pgtable_t pgtable; + vm_fault_t ret = 0; + + folio = vma_alloc_anon_folio_pmd(vma, vmf->address); + if (unlikely(!folio)) + return VM_FAULT_FALLBACK; + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { goto unlock_release; } else { - pmd_t entry; - ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock_release; @@ -1190,21 +1224,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } - - entry = mk_huge_pmd(page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); mm_inc_nr_ptes(vma->vm_mm); deferred_split_folio(folio, false); spin_unlock(vmf->ptl); - count_vm_event(THP_FAULT_ALLOC); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); - count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } return 0; @@ -1271,8 +1295,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - gfp_t gfp; - struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret; @@ -1323,14 +1345,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } return ret; } - gfp = vma_thp_gfp_mask(vma); - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); - if (unlikely(!folio)) { - count_vm_event(THP_FAULT_FALLBACK); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } - return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); + + return __do_huge_pmd_anonymous_page(vmf); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, -- 2.51.0 From 1ced09e0331f6cc4ca7eae75bc0ef03957129a94 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Tue, 8 Oct 2024 11:47:46 +0530 Subject: [PATCH 11/16] mm: allocate THP on hugezeropage wp-fault Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and replace it with a PMD-mapped THP. Remember to flush TLB entry corresponding to the hugezeropage. In case of failure, fallback to splitting the PMD. 
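The resulting dispatch in do_huge_pmd_wp_page() is sketched below (it matches the hunk that follows): a successful allocation returns directly, while VM_FAULT_FALLBACK drops through to the existing PMD-split path.

	if (is_huge_zero_pmd(orig_pmd)) {
		vm_fault_t ret = do_huge_zero_wp_pmd(vmf);

		if (!(ret & VM_FAULT_FALLBACK))
			return ret;

		/* Fallback to splitting PMD if THP cannot be allocated */
		goto fallback;
	}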
Link: https://lkml.kernel.org/r/20241008061746.285961-3-dev.jain@arm.com Signed-off-by: Dev Jain Acked-by: David Hildenbrand Reviewed-by: Kefeng Wang Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: Christoph Lameter Cc: Dave Hansen Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michal Hocko Cc: Peter Xu Cc: Ryan Roberts Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4cf29ebed92b..c674afd16245 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1778,6 +1778,38 @@ unlock: spin_unlock(vmf->ptl); } +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) +{ + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct mmu_notifier_range range; + struct folio *folio; + vm_fault_t ret = 0; + + folio = vma_alloc_anon_folio_pmd(vma, vmf->address); + if (unlikely(!folio)) + return VM_FAULT_FALLBACK; + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) + goto release; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto release; + (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd); + map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + goto unlock; +release: + folio_put(folio); +unlock: + spin_unlock(vmf->ptl); + mmu_notifier_invalidate_range_end(&range); + return ret; +} + vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; @@ -1790,8 +1822,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); - if (is_huge_zero_pmd(orig_pmd)) + if (is_huge_zero_pmd(orig_pmd)) { + vm_fault_t ret = do_huge_zero_wp_pmd(vmf); + + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + + /* Fallback to splitting PMD if THP cannot be allocated */ goto fallback; + } spin_lock(vmf->ptl); -- 2.51.0 From 01a9097aa3ce4c6aef296779c163169ac403260e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Oct 2024 13:28:00 +0900 Subject: [PATCH 12/16] zram: do not open-code comp priority 0 A cosmetic change: do not open-code compression priority 0, use ZRAM_PRIMARY_COMP instead. 
Link: https://lkml.kernel.org/r/20241009042908.750260-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 263795c4aef7..e6d12e81241d 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2285,7 +2285,7 @@ static void zram_destroy_comps(struct zram *zram) { u32 prio; - for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) { + for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { struct zcomp *comp = zram->comps[prio]; zram->comps[prio] = NULL; @@ -2357,7 +2357,7 @@ static ssize_t disksize_store(struct device *dev, goto out_unlock; } - for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) { + for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { if (!zram->comp_algs[prio]) continue; -- 2.51.0 From afe789b7367ad43ba8f079981d40851f8bd319ce Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 8 Oct 2024 19:50:24 -0700 Subject: [PATCH 13/16] kaslr: rename physmem_end and PHYSMEM_END to direct_map_physmem_end For clarity. It's increasingly hard to reason about the code, when KASLR is moving around the boundaries. In this case where KASLR is randomizing the location of the kernel image within physical memory, the maximum number of address bits for physical memory has not changed. What has changed is the ending address of memory that is allowed to be directly mapped by the kernel. Let's name the variable, and the associated macro accordingly. Also, enhance the comment above the direct_map_physmem_end definition, to further clarify how this all works. Link: https://lkml.kernel.org/r/20241009025024.89813-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Pankaj Gupta Acked-by: David Hildenbrand Acked-by: Will Deacon Reviewed-by: Mike Rapoport (Microsoft) Cc: Thomas Gleixner Cc: Alistair Popple Cc: Jordan Niethe Signed-off-by: Andrew Morton --- arch/arm64/include/asm/memory.h | 2 +- arch/x86/include/asm/page_64.h | 2 +- arch/x86/include/asm/pgtable_64_types.h | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/kaslr.c | 14 +++++++++----- include/linux/mm.h | 6 +++--- kernel/resource.c | 4 ++-- mm/memory_hotplug.c | 2 +- mm/sparse.c | 2 +- 9 files changed, 20 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 0480c61dbb4f..73eaa8c2536a 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -110,7 +110,7 @@ #define PAGE_END (_PAGE_END(VA_BITS_MIN)) #endif /* CONFIG_KASAN */ -#define PHYSMEM_END __pa(PAGE_END - 1) +#define DIRECT_MAP_PHYSMEM_END __pa(PAGE_END - 1) #define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index f3d257c45225..d63576608ce7 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -17,7 +17,7 @@ extern unsigned long phys_base; extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; -extern unsigned long physmem_end; +extern unsigned long direct_map_physmem_end; static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index a98e53491a4e..ec68f8369bdc 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -141,7 
+141,7 @@ extern unsigned int ptrs_per_p4d; #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ #ifdef CONFIG_RANDOMIZE_MEMORY -# define PHYSMEM_END physmem_end +# define DIRECT_MAP_PHYSMEM_END direct_map_physmem_end #endif /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ff253648706f..5a564130b9d0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -961,7 +961,7 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1; int ret; - if (WARN_ON_ONCE(end > PHYSMEM_END)) + if (WARN_ON_ONCE(end > DIRECT_MAP_PHYSMEM_END)) return -ERANGE; ret = __add_pages(nid, start_pfn, nr_pages, params); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 230f1dee4f09..70d3353c92fa 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -52,7 +52,7 @@ static __initdata struct kaslr_memory_region { } kaslr_regions[] = { { .base = &page_offset_base, - .end = &physmem_end, + .end = &direct_map_physmem_end, }, { .base = &vmalloc_base, @@ -62,8 +62,12 @@ static __initdata struct kaslr_memory_region { }, }; -/* The end of the possible address space for physical memory */ -unsigned long physmem_end __ro_after_init; +/* + * The end of the physical address space that can be mapped directly by the + * kernel. This starts out at (1<<MAX_PHYSMEM_BITS) - 1), but KASLR may shrink + * it in order to increase the available entropy for mapping other regions. + */ +unsigned long direct_map_physmem_end __ro_after_init; /* Default values */ unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; @@ -100,7 +104,7 @@ void __init kernel_randomize_memory(void) BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); /* Preset the end of the possible address space for physical memory */ - physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1); + direct_map_physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1); if (!kaslr_memory_enabled()) return; @@ -145,7 +149,7 @@ void __init kernel_randomize_memory(void) vaddr += get_padding(&kaslr_regions[i]); /* * KASLR trims the maximum possible size of the - * direct-map. Update the physmem_end boundary. + * direct-map. Update the direct_map_physmem_end boundary. * No rounding required as the region starts * PUD aligned and size is in units of TB. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f5394d75ce2..4570f33e2429 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -97,11 +97,11 @@ extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif -#ifndef PHYSMEM_END +#ifndef DIRECT_MAP_PHYSMEM_END # ifdef MAX_PHYSMEM_BITS -# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) +# define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) # else -# define PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) +# define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) # endif #endif diff --git a/kernel/resource.c b/kernel/resource.c index 4101016e8b20..d2c8143ae4ff 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1869,7 +1869,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size, if (flags & GFR_DESCENDING) { resource_size_t end; - end = min_t(resource_size_t, base->end, PHYSMEM_END); + end = min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END); return end - size + 1; } @@ -1886,7 +1886,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr, * @size did not wrap 0. 
*/ return addr > addr - size && - addr <= min_t(resource_size_t, base->end, PHYSMEM_END); + addr <= min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END); } static resource_size_t gfr_next(resource_size_t addr, resource_size_t size, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 621ae1015106..c43b4e7fb298 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void) struct range mhp_get_pluggable_range(bool need_mapping) { - const u64 max_phys = PHYSMEM_END; + const u64 max_phys = DIRECT_MAP_PHYSMEM_END; struct range mhp_range; if (need_mapping) { diff --git a/mm/sparse.c b/mm/sparse.c index dc38539f8560..4cb9793f0b52 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section) static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, unsigned long *end_pfn) { - unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT; + unsigned long max_sparsemem_pfn = (DIRECT_MAP_PHYSMEM_END + 1) >> PAGE_SHIFT; /* * Sanity checks - do not allow an architecture to pass -- 2.51.0 From 002c5d1ca89c153e889e7fc3e0380cd807e40107 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 10 Oct 2024 18:54:39 +0300 Subject: [PATCH 14/16] mm/kmemleak: fix typo in object_no_scan() comment Replace "corresponding to the give pointer" with "corresponding to the given pointer" Link: https://lkml.kernel.org/r/20241010155439.554416-1-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 0400f5e8ac60..17006d8a2afa 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1011,7 +1011,7 @@ static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref) } /* - * Set the OBJECT_NO_SCAN flag for the object corresponding to the give + * Set the OBJECT_NO_SCAN flag for the object corresponding to the given * pointer. Such object will not be scanned by kmemleak but references to it * are searched. */ -- 2.51.0 From f8780515fe914ac03189213c7e485264d65e2ece Mon Sep 17 00:00:00 2001 From: MengEn Sun Date: Thu, 10 Oct 2024 20:09:36 +0800 Subject: [PATCH 15/16] mm: add pcp high_min high_max to proc zoneinfo When we do not set percpu_pagelist_high_fraction the kernel will compute the pcp high_min/max by itself, which makes it hard to determine the current high_min/max values. So output the pcp high_min/max values to /proc/zoneinfo. 
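With this applied, each per-cpu block in /proc/zoneinfo gains the two new fields; the output would read roughly as follows (the numbers are illustrative, not taken from a real system):

    cpu: 3
              count:    87
              high:     1238
              batch:    63
              high_min: 1238
              high_max: 2476
  vm stats threshold: 125

high_min and high_max are the bounds the kernel auto-tunes pcp->high between, which previously were not visible anywhere in procfs.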
Link: https://lkml.kernel.org/r/20241010120935.656619-1-mengensun@tencent.com Signed-off-by: MengEn Sun Reviewed-by: Jinliang Zheng Signed-off-by: Andrew Morton --- mm/vmstat.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index b5a4cea423e1..1917c034c045 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1791,13 +1791,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, pcp = per_cpu_ptr(zone->per_cpu_pageset, i); seq_printf(m, "\n    cpu: %i" - "\n              count: %i" - "\n              high:  %i" - "\n              batch: %i", + "\n              count:    %i" + "\n              high:     %i" + "\n              batch:    %i" + "\n              high_min: %i" + "\n              high_max: %i", i, pcp->count, pcp->high, - pcp->batch); + pcp->batch, + pcp->high_min, + pcp->high_max); #ifdef CONFIG_SMP pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i); seq_printf(m, "\n  vm stats threshold: %d", -- 2.51.0 From 6359c39c9de66dede8ff5ff257c9e117483dbc7c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 10 Oct 2024 14:15:56 +0800 Subject: [PATCH 16/16] mm: remove unused hugepage for vma_alloc_folio() The hugepage parameter has been deprecated since commit ddc1a5cbc05d ("mempolicy: alloc_pages_mpol() for NUMA policy without vma"); for PMD-sized THP, vma_alloc_folio() still tries only the preferred node if possible, by checking the order of the folio allocation. Link: https://lkml.kernel.org/r/20241010061556.1846751-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Barry Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Ryan Roberts Signed-off-by: Andrew Morton --- arch/alpha/include/asm/page.h | 2 +- arch/arm64/mm/fault.c | 2 +- arch/m68k/include/asm/page_no.h | 2 +- arch/s390/include/asm/page.h | 2 +- arch/x86/include/asm/page.h | 2 +- include/linux/gfp.h | 6 +++--- include/linux/highmem.h | 2 +- mm/huge_memory.c | 2 +- mm/ksm.c | 2 +- mm/memory.c | 10 ++++------ mm/mempolicy.c | 3 +-- mm/userfaultfd.c | 2 +- 12 files changed, 17 insertions(+), 20 deletions(-) diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 70419e6be1a3..3dffa2a461d7 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -18,7 +18,7 @@ extern void clear_page(void *page); #define clear_user_page(page, vaddr, pg) clear_page(page) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) extern void copy_page(void * _to, void * _from); #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 8b281cf308b3..d95dca561f7a 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -983,7 +983,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, if (vma->vm_flags & VM_MTE) flags |= __GFP_ZEROTAGS; - return vma_alloc_folio(flags, 0, vma, vaddr, false); + return vma_alloc_folio(flags, 0, vma, vaddr); } void tag_clear_highpage(struct page *page) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index af3a10973233..63c0e706084b 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -14,7 +14,7 @@ extern unsigned long memory_end; #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | 
__GFP_ZERO, 0, vma, vaddr) #define __pa(vaddr) ((unsigned long)(vaddr)) #define __va(paddr) ((void *)((unsigned long)(paddr))) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 73e1e03317b4..d02058f96bcf 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -74,7 +74,7 @@ static inline void copy_page(void *to, void *from) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) /* * These are used to make use of C type-checking.. diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 1b93ff80b43b..c9fe207916f4 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -35,7 +35,7 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, } #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a951de920e20..b65724c3427d 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -306,7 +306,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage); + unsigned long addr); #else static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) { @@ -326,7 +326,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde { return folio_alloc_noprof(gfp, order); } -#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \ +#define vma_alloc_folio_noprof(gfp, order, vma, addr) \ folio_alloc_noprof(gfp, order) #endif @@ -341,7 +341,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde static inline struct page *alloc_page_vma_noprof(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false); + struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr); return &folio->page; } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 930a591b9b61..bec9bd715acf 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -226,7 +226,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, { struct folio *folio; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr, false); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio) clear_user_highpage(&folio->page, vaddr); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c674afd16245..387c046a389e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1143,7 +1143,7 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, const int order = HPAGE_PMD_ORDER; struct folio *folio; - folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true); + folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK); if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); diff --git a/mm/ksm.c b/mm/ksm.c index 556b8a8f37d0..e596bc1b5fa7 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ 
-2971,7 +2971,7 @@ struct folio *ksm_might_need_to_copy(struct folio *folio, if (!folio_test_uptodate(folio)) return folio; /* let do_swap_page report the error */ - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (new_folio && mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { folio_put(new_folio); diff --git a/mm/memory.c b/mm/memory.c index 5e9d6a22eb08..c51bc45a7009 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1059,8 +1059,7 @@ static inline struct folio *folio_prealloc(struct mm_struct *src_mm, if (need_zero) new_folio = vma_alloc_zeroed_movable_folio(vma, addr); else - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, - addr, false); + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (!new_folio) return NULL; @@ -4017,8 +4016,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) struct folio *folio; swp_entry_t entry; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, - vmf->address, false); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); if (!folio) return NULL; @@ -4174,7 +4172,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); - folio = vma_alloc_folio(gfp, order, vma, addr, true); + folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) @@ -4713,7 +4711,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); - folio = vma_alloc_folio(gfp, order, vma, addr, true); + folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9e18a6fc3061..a29eff5d0585 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2290,7 +2290,6 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, * @order: Order of the folio. * @vma: Pointer to VMA. * @addr: Virtual address of the allocation. Must be inside @vma. - * @hugepage: Unused (was: For hugepages try only preferred node if possible). * * Allocate a folio for a specific address in @vma, using the appropriate * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the @@ -2301,7 +2300,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, * Return: The folio on success or NULL if allocation fails. */ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) + unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 48b87c62fc3d..60a0be33766f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -251,7 +251,7 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, if (!*foliop) { ret = -ENOMEM; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, - dst_addr, false); + dst_addr); if (!folio) goto out; -- 2.51.0
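For reference, a minimal caller of the trimmed API might look like this sketch (the wrapper name alloc_movable_folio is hypothetical; it assumes a valid vma with addr inside it and the mmap_lock held, as the vma_alloc_folio() kerneldoc requires):

	static struct folio *alloc_movable_folio(struct vm_area_struct *vma,
						 unsigned long addr)
	{
		/*
		 * Order-0 allocation at @addr; the VMA's NUMA policy is
		 * applied internally, with no separate hugepage hint.
		 */
		return vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
	}

For PMD-sized THP the preferred-node behaviour is now keyed off the allocation order alone, so callers such as vma_alloc_anon_folio_pmd() simply pass HPAGE_PMD_ORDER and a PMD-aligned address.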