From 2b1d532e106ee63acb61a8e11608fafd75e52c4d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:52 +0200 Subject: [PATCH 01/16] csky: move setup_initrd() to setup.c Memory used by initrd should be reserved as soon as possible before there any memblock allocations that might overwrite that memory. This will also help with pulling out memblock_free_all() to the generic code and reducing code duplication in arch::mem_init(). Link: https://lkml.kernel.org/r/20250313135003.836600-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Guo Ren (csky) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/csky/kernel/setup.c | 43 ++++++++++++++++++++++++++++++++++++++++ arch/csky/mm/init.c | 43 ---------------------------------------- 2 files changed, 43 insertions(+), 43 deletions(-) diff --git a/arch/csky/kernel/setup.c b/arch/csky/kernel/setup.c index fe715b707fd0..e0d6ca86ea8c 100644 --- a/arch/csky/kernel/setup.c +++ b/arch/csky/kernel/setup.c @@ -12,6 +12,45 @@ #include #include +#ifdef CONFIG_BLK_DEV_INITRD +static void __init setup_initrd(void) +{ + unsigned long size; + + if (initrd_start >= initrd_end) { + pr_err("initrd not found or empty"); + goto disable; + } + + if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) { + pr_err("initrd extends beyond end of memory"); + goto disable; + } + + size = initrd_end - initrd_start; + + if (memblock_is_region_reserved(__pa(initrd_start), size)) { + pr_err("INITRD: 0x%08lx+0x%08lx overlaps in-use memory region", + __pa(initrd_start), size); + goto disable; + } + + memblock_reserve(__pa(initrd_start), size); + + pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n", + (void *)(initrd_start), size); + + initrd_below_start_ok = 1; + + return; + +disable: + initrd_start = initrd_end = 0; + + pr_err(" - disabling initrd\n"); +} +#endif + static void __init csky_memblock_init(void) { unsigned long lowmem_size = PFN_DOWN(LOWMEM_LIMIT - PHYS_OFFSET_OFFSET); @@ -40,6 +79,10 @@ static void __init csky_memblock_init(void) max_low_pfn = min_low_pfn + sseg_size; } +#ifdef CONFIG_BLK_DEV_INITRD + setup_initrd(); +#endif + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; mmu_init(min_low_pfn, max_low_pfn); diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index bde7cabd23df..ab51acbc19b2 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -42,45 +42,6 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); -#ifdef CONFIG_BLK_DEV_INITRD -static void __init setup_initrd(void) -{ - unsigned long size; - - if (initrd_start >= initrd_end) { - pr_err("initrd not found or empty"); - goto disable; - } - - if (__pa(initrd_end) > PFN_PHYS(max_low_pfn)) { - pr_err("initrd extends beyond end of memory"); - goto disable; - } - - size = initrd_end - initrd_start; - - if (memblock_is_region_reserved(__pa(initrd_start), size)) { - pr_err("INITRD: 0x%08lx+0x%08lx overlaps in-use memory region", - __pa(initrd_start), size); - goto disable; - } - - memblock_reserve(__pa(initrd_start), size); - - pr_info("Initial ramdisk at: 0x%p (%lu bytes)\n", - (void *)(initrd_start), size); - - initrd_below_start_ok = 1; - - return; - -disable: - initrd_start = initrd_end = 0; - - pr_err(" - disabling initrd\n"); -} -#endif - void __init mem_init(void) { #ifdef CONFIG_HIGHMEM @@ -92,10 +53,6 @@ void __init mem_init(void) #endif high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); -#ifdef CONFIG_BLK_DEV_INITRD - setup_initrd(); -#endif - memblock_free_all(); #ifdef CONFIG_HIGHMEM -- 2.51.0 From 30686816214b6062246e4918f3eadd1f55382425 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:53 +0200 Subject: [PATCH 02/16] hexagon: move initialization of init_mm.context init to paging_init() This will help with pulling out memblock_free_all() to the generic code and reducing code duplication in arch::mem_init(). Link: https://lkml.kernel.org/r/20250313135003.836600-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/hexagon/mm/init.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 3458f39ca2ac..508bb6a8dcc9 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -59,14 +59,6 @@ void __init mem_init(void) * To-Do: someone somewhere should wipe out the bootmem map * after we're done? */ - - /* - * This can be moved to some more virtual-memory-specific - * initialization hook at some point. Set the init_mm - * descriptors "context" value to point to the initial - * kernel segment table's physical address. - */ - init_mm.context.ptbase = __pa(init_mm.pgd); } void sync_icache_dcache(pte_t pte) @@ -103,6 +95,12 @@ static void __init paging_init(void) free_area_init(max_zone_pfn); /* sets up the zonelists and mem_map */ + /* + * Set the init_mm descriptors "context" value to point to the + * initial kernel segment table's physical address. + */ + init_mm.context.ptbase = __pa(init_mm.pgd); + /* * Start of high memory area. Will probably need something more * fancy if we... get more fancy. -- 2.51.0 From 67e7a600869ca557e259f1ce20f8b89bb95ca97d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:54 +0200 Subject: [PATCH 03/16] MIPS: consolidate mem_init() for NUMA machines Both MIPS systems that support numa (loongsoon3 and sgi-ip27) have identical mem_init() for NUMA case. Move that into arch/mips/mm/init.c and drop duplicate per-machine definitions. Link: https://lkml.kernel.org/r/20250313135003.836600-5-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/mips/loongson64/numa.c | 7 ------- arch/mips/mm/init.c | 7 +++++++ arch/mips/sgi-ip27/ip27-memory.c | 9 --------- 3 files changed, 7 insertions(+), 16 deletions(-) diff --git a/arch/mips/loongson64/numa.c b/arch/mips/loongson64/numa.c index 8388400d052f..95d5f553ce19 100644 --- a/arch/mips/loongson64/numa.c +++ b/arch/mips/loongson64/numa.c @@ -164,13 +164,6 @@ void __init paging_init(void) free_area_init(zones_size); } -void __init mem_init(void) -{ - high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); - memblock_free_all(); - setup_zero_pages(); /* This comes from node 0 */ -} - /* All PCI device belongs to logical Node-0 */ int pcibus_to_node(struct pci_bus *bus) { diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 4583d1a2a73e..3db6082c611e 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -482,6 +482,13 @@ void __init mem_init(void) 0x80000000 - 4, KCORE_TEXT); #endif } +#else /* CONFIG_NUMA */ +void __init mem_init(void) +{ + high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); + memblock_free_all(); + setup_zero_pages(); /* This comes from node 0 */ +} #endif /* !CONFIG_NUMA */ void free_init_pages(const char *what, unsigned long begin, unsigned long end) diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index 1963313f55d8..2b3e46e2e607 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -406,8 +406,6 @@ void __init prom_meminit(void) } } -extern void setup_zero_pages(void); - void __init paging_init(void) { unsigned long zones_size[MAX_NR_ZONES] = {0, }; @@ -416,10 +414,3 @@ void __init paging_init(void) zones_size[ZONE_NORMAL] = max_low_pfn; free_area_init(zones_size); } - -void __init mem_init(void) -{ - high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); - memblock_free_all(); - setup_zero_pages(); /* This comes from node 0 */ -} -- 2.51.0 From e74e2b8eb424c26dff35727d242437edb87684aa Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:55 +0200 Subject: [PATCH 04/16] MIPS: make setup_zero_pages() use memblock Allocating the zero pages from memblock is simpler because the memory is already reserved. This will also help with pulling out memblock_free_all() to the generic code and reducing code duplication in arch::mem_init(). Link: https://lkml.kernel.org/r/20250313135003.836600-6-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/mips/include/asm/mmzone.h | 2 -- arch/mips/mm/init.c | 18 +++++------------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/arch/mips/include/asm/mmzone.h b/arch/mips/include/asm/mmzone.h index 14226ea42036..602a21aee9d4 100644 --- a/arch/mips/include/asm/mmzone.h +++ b/arch/mips/include/asm/mmzone.h @@ -20,6 +20,4 @@ #define nid_to_addrbase(nid) 0 #endif -extern void setup_zero_pages(void); - #endif /* _ASM_MMZONE_H_ */ diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 3db6082c611e..820e35a59d4d 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -59,24 +59,16 @@ EXPORT_SYMBOL(zero_page_mask); /* * Not static inline because used by IP27 special magic initialization code */ -void setup_zero_pages(void) +static void __init setup_zero_pages(void) { - unsigned int order, i; - struct page *page; + unsigned int order; if (cpu_has_vce) order = 3; else order = 0; - empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!empty_zero_page) - panic("Oh boy, that early out of memory?"); - - page = virt_to_page((void *)empty_zero_page); - split_page(page, order); - for (i = 0; i < (1 << order); i++, page++) - mark_page_reserved(page); + empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE); zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; } @@ -470,9 +462,9 @@ void __init mem_init(void) BUILD_BUG_ON(IS_ENABLED(CONFIG_32BIT) && (PFN_PTE_SHIFT > PAGE_SHIFT)); maar_init(); - memblock_free_all(); setup_zero_pages(); /* Setup zeroed pages. */ mem_init_free_highmem(); + memblock_free_all(); #ifdef CONFIG_64BIT if ((unsigned long) &_text > (unsigned long) CKSEG0) @@ -486,8 +478,8 @@ void __init mem_init(void) void __init mem_init(void) { high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); - memblock_free_all(); setup_zero_pages(); /* This comes from node 0 */ + memblock_free_all(); } #endif /* !CONFIG_NUMA */ -- 2.51.0 From be971f957a80e2bbd7747a295886df1472803ff1 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:56 +0200 Subject: [PATCH 05/16] nios2: move pr_debug() about memory start and end to setup_arch() This will help with pulling out memblock_free_all() to the generic code and reducing code duplication in arch::mem_init(). Link: https://lkml.kernel.org/r/20250313135003.836600-7-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/nios2/kernel/setup.c | 2 ++ arch/nios2/mm/init.c | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index da122a5fa43b..a4cffbfc1399 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -149,6 +149,8 @@ void __init setup_arch(char **cmdline_p) memory_start = memblock_start_of_DRAM(); memory_end = memblock_end_of_DRAM(); + pr_debug("%s: start=%lx, end=%lx\n", __func__, memory_start, memory_end); + setup_initial_init_mm(_stext, _etext, _edata, _end); init_task.thread.kregs = &fake_regs; diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index a2278485de19..aa692ad30044 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -65,8 +65,6 @@ void __init mem_init(void) unsigned long end_mem = memory_end; /* this must not include kernel stack at top */ - pr_debug("mem_init: start=%lx, end=%lx\n", memory_start, memory_end); - end_mem &= PAGE_MASK; high_memory = __va(end_mem); -- 2.51.0 From 54ccf66f99d6d2630895eb13156be19a033d4566 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:57 +0200 Subject: [PATCH 06/16] s390: make setup_zero_pages() use memblock Allocating the zero pages from memblock is simpler because the memory is already reserved. This will also help with pulling out memblock_free_all() to the generic code and reducing code duplication in arch::mem_init(). Link: https://lkml.kernel.org/r/20250313135003.836600-8-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Heiko Carstens Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/s390/mm/init.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index d88cb1c13f7d..2b41dc9b1fa3 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -73,8 +73,6 @@ static void __init setup_zero_pages(void) { unsigned long total_pages = memblock_estimated_nr_free_pages(); unsigned int order; - struct page *page; - int i; /* Latest machines require a mapping granularity of 512KB */ order = 7; @@ -83,16 +81,7 @@ static void __init setup_zero_pages(void) while (order > 2 && (total_pages >> 10) < (1UL << order)) order--; - empty_zero_page = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!empty_zero_page) - panic("Out of memory in setup_zero_pages"); - - page = virt_to_page((void *) empty_zero_page); - split_page(page, order); - for (i = 1 << order; i > 0; i--) { - mark_page_reserved(page); - page++; - } + empty_zero_page = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE << order, PAGE_SIZE); zero_page_mask = ((PAGE_SIZE << order) - 1) & PAGE_MASK; } @@ -176,9 +165,10 @@ void __init mem_init(void) pv_init(); kfence_split_mapping(); + setup_zero_pages(); /* Setup zeroed pages. */ + /* this will put all low memory onto the freelists */ memblock_free_all(); - setup_zero_pages(); /* Setup zeroed pages. */ } unsigned long memory_block_size_bytes(void) -- 2.51.0 From d319c8b4918d24aea6fd90bd39cd5bc9fcf40859 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:58 +0200 Subject: [PATCH 07/16] xtensa: split out printing of virtual memory layout to a function This will help with pulling out memblock_free_all() to the generic code and reducing code duplication in arch::mem_init(). Link: https://lkml.kernel.org/r/20250313135003.836600-9-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Max Filippov Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: Dave Hansen Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Mark Brown Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/xtensa/mm/init.c | 97 ++++++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 47 deletions(-) diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index b2587a1a7c46..01577d33e602 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -66,6 +66,55 @@ void __init bootmem_init(void) memblock_dump_all(); } +static void __init print_vm_layout(void) +{ + pr_info("virtual kernel memory layout:\n" +#ifdef CONFIG_KASAN + " kasan : 0x%08lx - 0x%08lx (%5lu MB)\n" +#endif +#ifdef CONFIG_MMU + " vmalloc : 0x%08lx - 0x%08lx (%5lu MB)\n" +#endif +#ifdef CONFIG_HIGHMEM + " pkmap : 0x%08lx - 0x%08lx (%5lu kB)\n" + " fixmap : 0x%08lx - 0x%08lx (%5lu kB)\n" +#endif + " lowmem : 0x%08lx - 0x%08lx (%5lu MB)\n" + " .text : 0x%08lx - 0x%08lx (%5lu kB)\n" + " .rodata : 0x%08lx - 0x%08lx (%5lu kB)\n" + " .data : 0x%08lx - 0x%08lx (%5lu kB)\n" + " .init : 0x%08lx - 0x%08lx (%5lu kB)\n" + " .bss : 0x%08lx - 0x%08lx (%5lu kB)\n", +#ifdef CONFIG_KASAN + KASAN_SHADOW_START, KASAN_SHADOW_START + KASAN_SHADOW_SIZE, + KASAN_SHADOW_SIZE >> 20, +#endif +#ifdef CONFIG_MMU + VMALLOC_START, VMALLOC_END, + (VMALLOC_END - VMALLOC_START) >> 20, +#ifdef CONFIG_HIGHMEM + PKMAP_BASE, PKMAP_BASE + LAST_PKMAP * PAGE_SIZE, + (LAST_PKMAP*PAGE_SIZE) >> 10, + FIXADDR_START, FIXADDR_END, + (FIXADDR_END - FIXADDR_START) >> 10, +#endif + PAGE_OFFSET, PAGE_OFFSET + + (max_low_pfn - min_low_pfn) * PAGE_SIZE, +#else + min_low_pfn * PAGE_SIZE, max_low_pfn * PAGE_SIZE, +#endif + ((max_low_pfn - min_low_pfn) * PAGE_SIZE) >> 20, + (unsigned long)_text, (unsigned long)_etext, + (unsigned long)(_etext - _text) >> 10, + (unsigned long)__start_rodata, (unsigned long)__end_rodata, + (unsigned long)(__end_rodata - __start_rodata) >> 10, + (unsigned long)_sdata, (unsigned long)_edata, + (unsigned long)(_edata - _sdata) >> 10, + (unsigned long)__init_begin, (unsigned long)__init_end, + (unsigned long)(__init_end - __init_begin) >> 10, + (unsigned long)__bss_start, (unsigned long)__bss_stop, + (unsigned long)(__bss_stop - __bss_start) >> 10); +} void __init zones_init(void) { @@ -77,6 +126,7 @@ void __init zones_init(void) #endif }; free_area_init(max_zone_pfn); + print_vm_layout(); } static void __init free_highpages(void) @@ -118,53 +168,6 @@ void __init mem_init(void) high_memory = (void *)__va(max_low_pfn << PAGE_SHIFT); memblock_free_all(); - - pr_info("virtual kernel memory layout:\n" -#ifdef CONFIG_KASAN - " kasan : 0x%08lx - 0x%08lx (%5lu MB)\n" -#endif -#ifdef CONFIG_MMU - " vmalloc : 0x%08lx - 0x%08lx (%5lu MB)\n" -#endif -#ifdef CONFIG_HIGHMEM - " pkmap : 0x%08lx - 0x%08lx (%5lu kB)\n" - " fixmap : 0x%08lx - 0x%08lx (%5lu kB)\n" -#endif - " lowmem : 0x%08lx - 0x%08lx (%5lu MB)\n" - " .text : 0x%08lx - 0x%08lx (%5lu kB)\n" - " .rodata : 0x%08lx - 0x%08lx (%5lu kB)\n" - " .data : 0x%08lx - 0x%08lx (%5lu kB)\n" - " .init : 0x%08lx - 0x%08lx (%5lu kB)\n" - " .bss : 0x%08lx - 0x%08lx (%5lu kB)\n", -#ifdef CONFIG_KASAN - KASAN_SHADOW_START, KASAN_SHADOW_START + KASAN_SHADOW_SIZE, - KASAN_SHADOW_SIZE >> 20, -#endif -#ifdef CONFIG_MMU - VMALLOC_START, VMALLOC_END, - (VMALLOC_END - VMALLOC_START) >> 20, -#ifdef CONFIG_HIGHMEM - PKMAP_BASE, PKMAP_BASE + LAST_PKMAP * PAGE_SIZE, - (LAST_PKMAP*PAGE_SIZE) >> 10, - FIXADDR_START, FIXADDR_END, - (FIXADDR_END - FIXADDR_START) >> 10, -#endif - PAGE_OFFSET, PAGE_OFFSET + - (max_low_pfn - min_low_pfn) * PAGE_SIZE, -#else - min_low_pfn * PAGE_SIZE, max_low_pfn * PAGE_SIZE, -#endif - ((max_low_pfn - min_low_pfn) * PAGE_SIZE) >> 20, - (unsigned long)_text, (unsigned long)_etext, - (unsigned long)(_etext - _text) >> 10, - (unsigned long)__start_rodata, (unsigned long)__end_rodata, - (unsigned long)(__end_rodata - __start_rodata) >> 10, - (unsigned long)_sdata, (unsigned long)_edata, - (unsigned long)(_edata - _sdata) >> 10, - (unsigned long)__init_begin, (unsigned long)__init_end, - (unsigned long)(__init_end - __init_begin) >> 10, - (unsigned long)__bss_start, (unsigned long)__bss_stop, - (unsigned long)(__bss_stop - __bss_start) >> 10); } static void __init parse_memmap_one(char *p) -- 2.51.0 From 8268af309d07d1c6279080b4e6fd16ec75cc977c Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:49:59 +0200 Subject: [PATCH 08/16] arch, mm: set max_mapnr when allocating memory map for FLATMEM max_mapnr is essentially the size of the memory map for systems that use FLATMEM. There is no reason to calculate it in each and every architecture when it's anyway calculated in alloc_node_mem_map(). Drop setting of max_mapnr from architecture code and set it once in alloc_node_mem_map(). While on it, move definition of mem_map and max_mapnr to mm/mm_init.c so there won't be two copies for MMU and !MMU variants. Link: https://lkml.kernel.org/r/20250313135003.836600-10-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/mm/init.c | 1 - arch/arc/mm/init.c | 5 ----- arch/arm/mm/init.c | 2 -- arch/csky/mm/init.c | 4 ---- arch/loongarch/mm/init.c | 1 - arch/microblaze/mm/init.c | 4 ---- arch/mips/mm/init.c | 8 -------- arch/nios2/kernel/setup.c | 1 - arch/nios2/mm/init.c | 2 +- arch/openrisc/mm/init.c | 1 - arch/parisc/mm/init.c | 1 - arch/powerpc/kernel/setup-common.c | 2 -- arch/riscv/mm/init.c | 1 - arch/s390/mm/init.c | 1 - arch/sh/mm/init.c | 1 - arch/sparc/mm/init_32.c | 1 - arch/um/include/shared/mem_user.h | 1 - arch/um/kernel/physmem.c | 12 ------------ arch/um/kernel/um_arch.c | 1 - arch/x86/mm/init_32.c | 3 --- arch/xtensa/mm/init.c | 1 - include/asm-generic/memory_model.h | 5 +++-- include/linux/mm.h | 11 ----------- mm/memory.c | 8 -------- mm/mm_init.c | 25 +++++++++++++++++-------- mm/nommu.c | 4 ---- 26 files changed, 21 insertions(+), 86 deletions(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 61c2198b1359..ec0eeae9c653 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -276,7 +276,6 @@ srm_paging_stop (void) void __init mem_init(void) { - set_max_mapnr(max_low_pfn); high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); memblock_free_all(); } diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 6a71b23f1383..7ef883d58dc1 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -154,11 +154,6 @@ void __init setup_arch_memory(void) arch_pfn_offset = min(min_low_pfn, min_high_pfn); kmap_init(); - -#else /* CONFIG_HIGHMEM */ - /* pfn_valid() uses this when FLATMEM=y and HIGHMEM=n */ - max_mapnr = max_low_pfn - min_low_pfn; - #endif /* CONFIG_HIGHMEM */ free_area_init(max_zone_pfn); diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 9aec1cb2386f..d4bcc745a044 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -275,8 +275,6 @@ void __init mem_init(void) swiotlb_init(max_pfn > arm_dma_pfn_limit, SWIOTLB_VERBOSE); #endif - set_max_mapnr(pfn_to_page(max_pfn) - mem_map); - #ifdef CONFIG_SA1111 /* now that our DMA memory is actually so designated, we can free it */ memblock_phys_free(PHYS_OFFSET, __pa(swapper_pg_dir) - PHYS_OFFSET); diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index ab51acbc19b2..ba6694d6170a 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -46,10 +46,6 @@ void __init mem_init(void) { #ifdef CONFIG_HIGHMEM unsigned long tmp; - - set_max_mapnr(highend_pfn - ARCH_PFN_OFFSET); -#else - set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); #endif high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index ca5aa5f46a9f..00449df50db1 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -78,7 +78,6 @@ void __init paging_init(void) void __init mem_init(void) { - max_mapnr = max_low_pfn; high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); memblock_free_all(); diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 4520c5741579..857cd2b44bcf 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -104,17 +104,13 @@ void __init setup_memory(void) * * min_low_pfn - the first page (mm/bootmem.c - node_boot_start) * max_low_pfn - * max_mapnr - the first unused page (mm/bootmem.c - node_low_pfn) */ /* memory start is from the kernel end (aligned) to higher addr */ min_low_pfn = memory_start >> PAGE_SHIFT; /* minimum for allocation */ - /* RAM is assumed contiguous */ - max_mapnr = memory_size >> PAGE_SHIFT; max_low_pfn = ((u64)memory_start + (u64)lowmem_size) >> PAGE_SHIFT; max_pfn = ((u64)memory_start + (u64)memory_size) >> PAGE_SHIFT; - pr_info("%s: max_mapnr: %#lx\n", __func__, max_mapnr); pr_info("%s: min_low_pfn: %#lx\n", __func__, min_low_pfn); pr_info("%s: max_low_pfn: %#lx\n", __func__, max_low_pfn); pr_info("%s: max_pfn: %#lx\n", __func__, max_pfn); diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 820e35a59d4d..eb61a73520a0 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -415,15 +415,7 @@ void __init paging_init(void) " %ldk highmem ignored\n", (highend_pfn - max_low_pfn) << (PAGE_SHIFT - 10)); max_zone_pfns[ZONE_HIGHMEM] = max_low_pfn; - - max_mapnr = max_low_pfn; - } else if (highend_pfn) { - max_mapnr = highend_pfn; - } else { - max_mapnr = max_low_pfn; } -#else - max_mapnr = max_low_pfn; #endif high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); diff --git a/arch/nios2/kernel/setup.c b/arch/nios2/kernel/setup.c index a4cffbfc1399..2a40150142c3 100644 --- a/arch/nios2/kernel/setup.c +++ b/arch/nios2/kernel/setup.c @@ -158,7 +158,6 @@ void __init setup_arch(char **cmdline_p) *cmdline_p = boot_command_line; find_limits(&min_low_pfn, &max_low_pfn, &max_pfn); - max_mapnr = max_low_pfn; memblock_reserve(__pa_symbol(_stext), _end - _stext); #ifdef CONFIG_BLK_DEV_INITRD diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index aa692ad30044..3cafa87ead9e 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -51,7 +51,7 @@ void __init paging_init(void) pagetable_init(); pgd_current = swapper_pg_dir; - max_zone_pfn[ZONE_NORMAL] = max_mapnr; + max_zone_pfn[ZONE_NORMAL] = max_low_pfn; /* pass the memory from the bootmem allocator to the main allocator */ free_area_init(max_zone_pfn); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index d0cb1a0126f9..9093c336e158 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -193,7 +193,6 @@ void __init mem_init(void) { BUG_ON(!mem_map); - max_mapnr = max_low_pfn; high_memory = (void *)__va(max_low_pfn * PAGE_SIZE); /* clear the zero-page */ diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 61c0a2477072..2cdfc0b1195c 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -563,7 +563,6 @@ void __init mem_init(void) #endif high_memory = __va((max_pfn << PAGE_SHIFT)); - set_max_mapnr(max_low_pfn); memblock_free_all(); #ifdef CONFIG_PA11 diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index a08b0ede4e64..68d47c53876c 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -957,8 +957,6 @@ void __init setup_arch(char **cmdline_p) /* Parse memory topology */ mem_topology_setup(); - /* Set max_mapnr before paging_init() */ - set_max_mapnr(max_pfn); high_memory = (void *)__va(max_low_pfn * PAGE_SIZE); /* diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 15b2eda4c364..157c9ca51541 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -298,7 +298,6 @@ static void __init setup_bootmem(void) high_memory = (void *)(__va(PFN_PHYS(max_low_pfn))); dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn)); - set_max_mapnr(max_low_pfn - ARCH_PFN_OFFSET); reserve_initrd_mem(); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 2b41dc9b1fa3..ad567e2100b7 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -159,7 +159,6 @@ void __init mem_init(void) cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(0, mm_cpumask(&init_mm)); - set_max_mapnr(max_low_pfn); high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); pv_init(); diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 289a2fecebef..72aea5cd1b85 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -290,7 +290,6 @@ void __init paging_init(void) */ max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT; min_low_pfn = __MEMORY_START >> PAGE_SHIFT; - set_max_mapnr(max_low_pfn - min_low_pfn); nodes_clear(node_online_map); diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index d96a14ffceeb..6b58da14edc6 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -275,7 +275,6 @@ void __init mem_init(void) taint_real_pages(); - max_mapnr = last_valid_pfn - pfn_base; high_memory = __va(max_low_pfn << PAGE_SHIFT); memblock_free_all(); diff --git a/arch/um/include/shared/mem_user.h b/arch/um/include/shared/mem_user.h index adfa08062f88..d4727efcf23d 100644 --- a/arch/um/include/shared/mem_user.h +++ b/arch/um/include/shared/mem_user.h @@ -47,7 +47,6 @@ extern int iomem_size; #define ROUND_4M(n) ((((unsigned long) (n)) + (1 << 22)) & ~((1 << 22) - 1)) extern unsigned long find_iomem(char *driver, unsigned long *len_out); -extern void mem_total_pages(unsigned long physmem, unsigned long iomem); extern void setup_physmem(unsigned long start, unsigned long usable, unsigned long len); extern void map_memory(unsigned long virt, unsigned long phys, diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index a74f17b033c4..af02b5f9911d 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -22,18 +22,6 @@ static int physmem_fd = -1; unsigned long high_physmem; EXPORT_SYMBOL(high_physmem); -void __init mem_total_pages(unsigned long physmem, unsigned long iomem) -{ - unsigned long phys_pages, iomem_pages, total_pages; - - phys_pages = physmem >> PAGE_SHIFT; - iomem_pages = iomem >> PAGE_SHIFT; - - total_pages = phys_pages + iomem_pages; - - max_mapnr = total_pages; -} - void map_memory(unsigned long virt, unsigned long phys, unsigned long len, int r, int w, int x) { diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 79ea97d4797e..6414cbf00572 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -419,7 +419,6 @@ void __init setup_arch(char **cmdline_p) stack_protections((unsigned long) init_task.stack); setup_physmem(uml_physmem, uml_reserved, physmem_size); - mem_total_pages(physmem_size, iomem_size); uml_dtb_init(); read_initrd(); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ac41b1e0940d..6d2f8cb9451e 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -650,9 +650,6 @@ void __init initmem_init(void) memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); -#ifdef CONFIG_FLATMEM - max_mapnr = IS_ENABLED(CONFIG_HIGHMEM) ? highend_pfn : max_low_pfn; -#endif __vmalloc_start_set = true; printk(KERN_NOTICE "%ldMB LOWMEM available.\n", diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 01577d33e602..9f1b0d5fccc7 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -164,7 +164,6 @@ void __init mem_init(void) { free_highpages(); - max_mapnr = max_pfn - ARCH_PFN_OFFSET; high_memory = (void *)__va(max_low_pfn << PAGE_SHIFT); memblock_free_all(); diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h index 6d1fb6162ac1..a3b5029aebbd 100644 --- a/include/asm-generic/memory_model.h +++ b/include/asm-generic/memory_model.h @@ -19,11 +19,12 @@ #define __page_to_pfn(page) ((unsigned long)((page) - mem_map) + \ ARCH_PFN_OFFSET) +/* avoid include hell */ +extern unsigned long max_mapnr; + #ifndef pfn_valid static inline int pfn_valid(unsigned long pfn) { - /* avoid include hell */ - extern unsigned long max_mapnr; unsigned long pfn_offset = ARCH_PFN_OFFSET; return pfn >= pfn_offset && (pfn - pfn_offset) < max_mapnr; diff --git a/include/linux/mm.h b/include/linux/mm.h index 82776b409391..8c4cb8c28507 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -46,17 +46,6 @@ extern int sysctl_page_lock_unfairness; void mm_core_init(void); void init_mm_internals(void); -#ifndef CONFIG_NUMA /* Don't use mapnrs, do it properly */ -extern unsigned long max_mapnr; - -static inline void set_max_mapnr(unsigned long limit) -{ - max_mapnr = limit; -} -#else -static inline void set_max_mapnr(unsigned long limit) { } -#endif - extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { diff --git a/mm/memory.c b/mm/memory.c index 8873b7a4962c..a1d7664855f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -95,14 +95,6 @@ #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif -#ifndef CONFIG_NUMA -unsigned long max_mapnr; -EXPORT_SYMBOL(max_mapnr); - -struct page *mem_map; -EXPORT_SYMBOL(mem_map); -#endif - static vm_fault_t do_fault(struct vm_fault *vmf); static vm_fault_t do_anonymous_page(struct vm_fault *vmf); static bool vmf_pte_changed(struct vm_fault *vmf); diff --git a/mm/mm_init.c b/mm/mm_init.c index 133640a93d1d..7fd48d2d5064 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -37,6 +37,14 @@ #include +#ifndef CONFIG_NUMA +unsigned long max_mapnr; +EXPORT_SYMBOL(max_mapnr); + +struct page *mem_map; +EXPORT_SYMBOL(mem_map); +#endif + #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; @@ -1639,7 +1647,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); offset = pgdat->node_start_pfn - start; /* - * The zone's endpoints aren't required to be MAX_PAGE_ORDER + * The zone's endpoints aren't required to be MAX_PAGE_ORDER * aligned but the node_mem_map endpoints must be in order * for the buddy allocator to function correctly. */ @@ -1655,14 +1663,15 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", __func__, pgdat->node_id, (unsigned long)pgdat, (unsigned long)pgdat->node_mem_map); -#ifndef CONFIG_NUMA + /* the global mem_map is just set as node 0's */ - if (pgdat == NODE_DATA(0)) { - mem_map = NODE_DATA(0)->node_mem_map; - if (page_to_pfn(mem_map) != pgdat->node_start_pfn) - mem_map -= offset; - } -#endif + WARN_ON(pgdat != NODE_DATA(0)); + + mem_map = pgdat->node_mem_map; + if (page_to_pfn(mem_map) != pgdat->node_start_pfn) + mem_map -= offset; + + max_mapnr = end - start; } #else static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } diff --git a/mm/nommu.c b/mm/nommu.c index 8b31d8396297..43751726f977 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -44,16 +44,12 @@ void *high_memory; EXPORT_SYMBOL(high_memory); -struct page *mem_map; -unsigned long max_mapnr; -EXPORT_SYMBOL(max_mapnr); unsigned long highest_memmap_pfn; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; -EXPORT_SYMBOL(mem_map); /* list of mapped, potentially shareable regions */ static struct kmem_cache *vm_region_jar; -- 2.51.0 From e120d1bc12da5c1bb871c346f741296610fd6fcb Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:00 +0200 Subject: [PATCH 09/16] arch, mm: set high_memory in free_area_init() high_memory defines upper bound on the directly mapped memory. This bound is defined by the beginning of ZONE_HIGHMEM when a system has high memory and by the end of memory otherwise. All this is known to generic memory management initialization code that can set high_memory while initializing core mm structures. Add a generic calculation of high_memory to free_area_init() and remove per-architecture calculation except for the architectures that set and use high_memory earlier than that. Link: https://lkml.kernel.org/r/20250313135003.836600-11-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/mm/init.c | 1 - arch/arc/mm/init.c | 2 -- arch/arm64/mm/init.c | 2 -- arch/csky/mm/init.c | 1 - arch/hexagon/mm/init.c | 6 ------ arch/loongarch/kernel/numa.c | 1 - arch/loongarch/mm/init.c | 2 -- arch/microblaze/mm/init.c | 2 -- arch/mips/mm/init.c | 2 -- arch/nios2/mm/init.c | 6 ------ arch/openrisc/mm/init.c | 2 -- arch/parisc/mm/init.c | 1 - arch/riscv/mm/init.c | 1 - arch/s390/mm/init.c | 2 -- arch/sh/mm/init.c | 7 ------- arch/sparc/mm/init_32.c | 1 - arch/sparc/mm/init_64.c | 2 -- arch/um/kernel/um_arch.c | 1 - arch/x86/kernel/setup.c | 2 -- arch/x86/mm/init_32.c | 3 --- arch/x86/mm/numa_32.c | 3 --- arch/xtensa/mm/init.c | 2 -- mm/memory.c | 8 -------- mm/mm_init.c | 30 ++++++++++++++++++++++++++++++ mm/nommu.c | 2 -- 25 files changed, 30 insertions(+), 62 deletions(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index ec0eeae9c653..3ab2d2f3c917 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -276,7 +276,6 @@ srm_paging_stop (void) void __init mem_init(void) { - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); memblock_free_all(); } diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 7ef883d58dc1..05025122e965 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -150,8 +150,6 @@ void __init setup_arch_memory(void) */ max_zone_pfn[ZONE_HIGHMEM] = max_high_pfn; - high_memory = (void *)(min_high_pfn << PAGE_SHIFT); - arch_pfn_offset = min(min_low_pfn, min_high_pfn); kmap_init(); #endif /* CONFIG_HIGHMEM */ diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index ccdef53872a0..53a0b105890b 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -309,8 +309,6 @@ void __init arm64_memblock_init(void) } early_init_fdt_scan_reserved_mem(); - - high_memory = __va(memblock_end_of_DRAM() - 1) + 1; } void __init bootmem_init(void) diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index ba6694d6170a..a22801aa503a 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -47,7 +47,6 @@ void __init mem_init(void) #ifdef CONFIG_HIGHMEM unsigned long tmp; #endif - high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); memblock_free_all(); diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index 508bb6a8dcc9..d412c2314509 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -100,12 +100,6 @@ static void __init paging_init(void) * initial kernel segment table's physical address. */ init_mm.context.ptbase = __pa(init_mm.pgd); - - /* - * Start of high memory area. Will probably need something more - * fancy if we... get more fancy. - */ - high_memory = (void *)((bootmem_lastpg + 1) << PAGE_SHIFT); } #ifndef DMA_RESERVE diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index 84fe7f854820..8eb489725b1a 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -389,7 +389,6 @@ void __init paging_init(void) void __init mem_init(void) { - high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); memblock_free_all(); } diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 00449df50db1..6affa3609188 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -78,8 +78,6 @@ void __init paging_init(void) void __init mem_init(void) { - high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); - memblock_free_all(); } #endif /* !CONFIG_NUMA */ diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 857cd2b44bcf..7e2e342e84c5 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -120,8 +120,6 @@ void __init setup_memory(void) void __init mem_init(void) { - high_memory = (void *)__va(memory_start + lowmem_size - 1); - /* this will put all memory onto the freelists */ memblock_free_all(); #ifdef CONFIG_HIGHMEM diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index eb61a73520a0..ed9dde6a00f7 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -417,7 +417,6 @@ void __init paging_init(void) max_zone_pfns[ZONE_HIGHMEM] = max_low_pfn; } #endif - high_memory = (void *) __va(max_low_pfn << PAGE_SHIFT); free_area_init(max_zone_pfns); } @@ -469,7 +468,6 @@ void __init mem_init(void) #else /* CONFIG_NUMA */ void __init mem_init(void) { - high_memory = (void *) __va(get_num_physpages() << PAGE_SHIFT); setup_zero_pages(); /* This comes from node 0 */ memblock_free_all(); } diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 3cafa87ead9e..4ba8dfa0d238 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -62,12 +62,6 @@ void __init paging_init(void) void __init mem_init(void) { - unsigned long end_mem = memory_end; /* this must not include - kernel stack at top */ - - end_mem &= PAGE_MASK; - high_memory = __va(end_mem); - /* this will put all memory onto the freelists */ memblock_free_all(); } diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 9093c336e158..72c5952607ac 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -193,8 +193,6 @@ void __init mem_init(void) { BUG_ON(!mem_map); - high_memory = (void *)__va(max_low_pfn * PAGE_SIZE); - /* clear the zero-page */ memset((void *)empty_zero_page, 0, PAGE_SIZE); diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 2cdfc0b1195c..4fbe354dc9b4 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -562,7 +562,6 @@ void __init mem_init(void) BUILD_BUG_ON(TMPALIAS_MAP_START >= 0x80000000); #endif - high_memory = __va((max_pfn << PAGE_SHIFT)); memblock_free_all(); #ifdef CONFIG_PA11 diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 157c9ca51541..ac6d41e86243 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -295,7 +295,6 @@ static void __init setup_bootmem(void) phys_ram_end = memblock_end_of_DRAM(); min_low_pfn = PFN_UP(phys_ram_base); max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end); - high_memory = (void *)(__va(PFN_PHYS(max_low_pfn))); dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn)); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index ad567e2100b7..4bd6f316d71f 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -159,8 +159,6 @@ void __init mem_init(void) cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(0, mm_cpumask(&init_mm)); - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); - pv_init(); kfence_split_mapping(); diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 72aea5cd1b85..6d459ffba4bc 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -330,13 +330,6 @@ unsigned int mem_init_done = 0; void __init mem_init(void) { - pg_data_t *pgdat; - - high_memory = NULL; - for_each_online_pgdat(pgdat) - high_memory = max_t(void *, high_memory, - __va(pgdat_end_pfn(pgdat) << PAGE_SHIFT)); - memblock_free_all(); /* Set this up early, so we can take care of the zero page */ diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 6b58da14edc6..81a468a9c223 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -275,7 +275,6 @@ void __init mem_init(void) taint_real_pages(); - high_memory = __va(max_low_pfn << PAGE_SHIFT); memblock_free_all(); for (i = 0; sp_banks[i].num_bytes != 0; i++) { diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 05882bca5b73..34d46adb9571 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2505,8 +2505,6 @@ static void __init register_page_bootmem_info(void) } void __init mem_init(void) { - high_memory = __va(last_valid_pfn << PAGE_SHIFT); - memblock_free_all(); /* diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 6414cbf00572..f24a3ce37ab7 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -385,7 +385,6 @@ int __init linux_main(int argc, char **argv, char **envp) high_physmem = uml_physmem + physmem_size; end_iomem = high_physmem + iomem_size; - high_memory = (void *) end_iomem; start_vm = VMALLOC_START; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ff8604007b08..74ac686d441a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -972,8 +972,6 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = e820__end_of_low_ram_pfn(); else max_low_pfn = max_pfn; - - high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif /* Find and reserve MPTABLE area */ diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 6d2f8cb9451e..801b659ead0c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -643,9 +643,6 @@ void __init initmem_init(void) highstart_pfn = max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif memblock_set_node(0, PHYS_ADDR_MAX, &memblock.memory, 0); diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 65fda406e6f2..442ef3facff0 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -41,9 +41,6 @@ void __init initmem_init(void) highstart_pfn = max_low_pfn; printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif printk(KERN_NOTICE "%ldMB LOWMEM available.\n", pages_to_mb(max_low_pfn)); diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 9f1b0d5fccc7..9b662477b3d4 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -164,8 +164,6 @@ void __init mem_init(void) { free_highpages(); - high_memory = (void *)__va(max_low_pfn << PAGE_SHIFT); - memblock_free_all(); } diff --git a/mm/memory.c b/mm/memory.c index a1d7664855f2..3900225d99c5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -113,14 +113,6 @@ static __always_inline bool vmf_orig_pte_uffd_wp(struct vm_fault *vmf) return pte_marker_uffd_wp(vmf->orig_pte); } -/* - * A number of key systems in x86 including ioremap() rely on the assumption - * that high_memory defines the upper bound on direct map memory, then end - * of ZONE_NORMAL. - */ -void *high_memory; -EXPORT_SYMBOL(high_memory); - /* * Randomize the address space (stacks, mmaps, brk, etc.). * diff --git a/mm/mm_init.c b/mm/mm_init.c index 7fd48d2d5064..bd7071c32a44 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -45,6 +45,13 @@ struct page *mem_map; EXPORT_SYMBOL(mem_map); #endif +/* + * high_memory defines the upper bound on direct map memory, then end + * of ZONE_NORMAL. + */ +void *high_memory; +EXPORT_SYMBOL(high_memory); + #ifdef CONFIG_DEBUG_MEMORY_INIT int __meminitdata mminit_loglevel; @@ -1778,6 +1785,27 @@ static bool arch_has_descending_max_zone_pfns(void) return IS_ENABLED(CONFIG_ARC) && !IS_ENABLED(CONFIG_ARC_HAS_PAE40); } +static void set_high_memory(void) +{ + phys_addr_t highmem = memblock_end_of_DRAM(); + + /* + * Some architectures (e.g. ARM) set high_memory very early and + * use it in arch setup code. + * If an architecture already set high_memory don't overwrite it + */ + if (high_memory) + return; + +#ifdef CONFIG_HIGHMEM + if (arch_has_descending_max_zone_pfns() || + highmem > PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM])) + highmem = PFN_PHYS(arch_zone_lowest_possible_pfn[ZONE_HIGHMEM]); +#endif + + high_memory = phys_to_virt(highmem - 1) + 1; +} + /** * free_area_init - Initialise all pg_data_t and zone data * @max_zone_pfn: an array of max PFNs for each zone @@ -1900,6 +1928,8 @@ void __init free_area_init(unsigned long *max_zone_pfn) /* disable hash distribution for systems with a single node */ fixup_hashdist(); + + set_high_memory(); } /** diff --git a/mm/nommu.c b/mm/nommu.c index 43751726f977..15a396ce2553 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -42,8 +42,6 @@ #include #include "internal.h" -void *high_memory; -EXPORT_SYMBOL(high_memory); unsigned long highest_memmap_pfn; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; -- 2.51.0 From 6faea3422e3b4e8de44a55aa3e6e843320da66d2 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:01 +0200 Subject: [PATCH 10/16] arch, mm: streamline HIGHMEM freeing All architectures that support HIGHMEM have their code that frees high memory pages to the buddy allocator while __free_memory_core() is limited to freeing only low memory. There is no actual reason for that. The memory map is completely ready by the time memblock_free_all() is called and high pages can be released to the buddy allocator along with low memory. Remove low memory limit from __free_memory_core() and drop per-architecture code that frees high memory pages. Link: https://lkml.kernel.org/r/20250313135003.836600-12-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arc/mm/init.c | 6 +----- arch/arm/mm/init.c | 29 ----------------------------- arch/csky/mm/init.c | 14 -------------- arch/microblaze/mm/init.c | 16 ---------------- arch/mips/mm/init.c | 20 -------------------- arch/powerpc/mm/mem.c | 14 -------------- arch/sparc/mm/init_32.c | 25 ------------------------- arch/x86/include/asm/highmem.h | 3 --- arch/x86/include/asm/numa.h | 4 ---- arch/x86/include/asm/numa_32.h | 13 ------------- arch/x86/mm/Makefile | 2 -- arch/x86/mm/highmem_32.c | 34 ---------------------------------- arch/x86/mm/init_32.c | 28 ---------------------------- arch/xtensa/mm/init.c | 29 ----------------------------- include/linux/mm.h | 1 - mm/memblock.c | 3 +-- 16 files changed, 2 insertions(+), 239 deletions(-) delete mode 100644 arch/x86/include/asm/numa_32.h delete mode 100644 arch/x86/mm/highmem_32.c diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 05025122e965..11ce638731c9 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -160,11 +160,7 @@ void __init setup_arch_memory(void) static void __init highmem_init(void) { #ifdef CONFIG_HIGHMEM - unsigned long tmp; - memblock_phys_free(high_mem_start, high_mem_sz); - for (tmp = min_high_pfn; tmp < max_high_pfn; tmp++) - free_highmem_page(pfn_to_page(tmp)); #endif } @@ -176,8 +172,8 @@ static void __init highmem_init(void) */ void __init mem_init(void) { - memblock_free_all(); highmem_init(); + memblock_free_all(); BUILD_BUG_ON((PTRS_PER_PGD * sizeof(pgd_t)) > PAGE_SIZE); BUILD_BUG_ON((PTRS_PER_PUD * sizeof(pud_t)) > PAGE_SIZE); diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index d4bcc745a044..7bb5ce02b9b5 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -237,33 +237,6 @@ static inline void poison_init_mem(void *s, size_t count) *p++ = 0xe7fddef0; } -static void __init free_highpages(void) -{ -#ifdef CONFIG_HIGHMEM - unsigned long max_low = max_low_pfn; - phys_addr_t range_start, range_end; - u64 i; - - /* set highmem page free */ - for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, - &range_start, &range_end, NULL) { - unsigned long start = PFN_UP(range_start); - unsigned long end = PFN_DOWN(range_end); - - /* Ignore complete lowmem entries */ - if (end <= max_low) - continue; - - /* Truncate partial highmem entries */ - if (start < max_low) - start = max_low; - - for (; start < end; start++) - free_highmem_page(pfn_to_page(start)); - } -#endif -} - /* * mem_init() marks the free areas in the mem_map and tells us how much * memory is free. This is done after various parts of the system have @@ -283,8 +256,6 @@ void __init mem_init(void) /* this will put all unused low memory onto the freelists */ memblock_free_all(); - free_highpages(); - /* * Check boundaries twice: Some fundamental inconsistencies can * be detected at build time already. diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index a22801aa503a..3914c2b873da 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -44,21 +44,7 @@ EXPORT_SYMBOL(empty_zero_page); void __init mem_init(void) { -#ifdef CONFIG_HIGHMEM - unsigned long tmp; -#endif - memblock_free_all(); - -#ifdef CONFIG_HIGHMEM - for (tmp = highstart_pfn; tmp < highend_pfn; tmp++) { - struct page *page = pfn_to_page(tmp); - - /* FIXME not sure about */ - if (!memblock_is_reserved(tmp << PAGE_SHIFT)) - free_highmem_page(page); - } -#endif } void free_initmem(void) diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 7e2e342e84c5..3e664e0efc33 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -52,19 +52,6 @@ static void __init highmem_init(void) map_page(PKMAP_BASE, 0, 0); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); } - -static void __meminit highmem_setup(void) -{ - unsigned long pfn; - - for (pfn = max_low_pfn; pfn < max_pfn; ++pfn) { - struct page *page = pfn_to_page(pfn); - - /* FIXME not sure about */ - if (!memblock_is_reserved(pfn << PAGE_SHIFT)) - free_highmem_page(page); - } -} #endif /* CONFIG_HIGHMEM */ /* @@ -122,9 +109,6 @@ void __init mem_init(void) { /* this will put all memory onto the freelists */ memblock_free_all(); -#ifdef CONFIG_HIGHMEM - highmem_setup(); -#endif mem_init_done = 1; } diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index ed9dde6a00f7..075177e817ac 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -425,25 +425,6 @@ void __init paging_init(void) static struct kcore_list kcore_kseg0; #endif -static inline void __init mem_init_free_highmem(void) -{ -#ifdef CONFIG_HIGHMEM - unsigned long tmp; - - if (cpu_has_dc_aliases) - return; - - for (tmp = highstart_pfn; tmp < highend_pfn; tmp++) { - struct page *page = pfn_to_page(tmp); - - if (!memblock_is_memory(PFN_PHYS(tmp))) - SetPageReserved(page); - else - free_highmem_page(page); - } -#endif -} - void __init mem_init(void) { /* @@ -454,7 +435,6 @@ void __init mem_init(void) maar_init(); setup_zero_pages(); /* Setup zeroed pages. */ - mem_init_free_highmem(); memblock_free_all(); #ifdef CONFIG_64BIT diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index c7708c8fad29..1bc94bca9944 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -297,20 +297,6 @@ void __init mem_init(void) memblock_free_all(); -#ifdef CONFIG_HIGHMEM - { - unsigned long pfn, highmem_mapnr; - - highmem_mapnr = lowmem_end_addr >> PAGE_SHIFT; - for (pfn = highmem_mapnr; pfn < max_mapnr; ++pfn) { - phys_addr_t paddr = (phys_addr_t)pfn << PAGE_SHIFT; - struct page *page = pfn_to_page(pfn); - if (memblock_is_memory(paddr) && !memblock_is_reserved(paddr)) - free_highmem_page(page); - } - } -#endif /* CONFIG_HIGHMEM */ - #if defined(CONFIG_PPC_E500) && !defined(CONFIG_SMP) /* * If smp is enabled, next_tlbcam_idx is initialized in the cpu up diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 81a468a9c223..043e9b6fadd0 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -232,18 +232,6 @@ static void __init taint_real_pages(void) } } -static void map_high_region(unsigned long start_pfn, unsigned long end_pfn) -{ - unsigned long tmp; - -#ifdef CONFIG_DEBUG_HIGHMEM - printk("mapping high region %08lx - %08lx\n", start_pfn, end_pfn); -#endif - - for (tmp = start_pfn; tmp < end_pfn; tmp++) - free_highmem_page(pfn_to_page(tmp)); -} - void __init mem_init(void) { int i; @@ -276,19 +264,6 @@ void __init mem_init(void) taint_real_pages(); memblock_free_all(); - - for (i = 0; sp_banks[i].num_bytes != 0; i++) { - unsigned long start_pfn = sp_banks[i].base_addr >> PAGE_SHIFT; - unsigned long end_pfn = (sp_banks[i].base_addr + sp_banks[i].num_bytes) >> PAGE_SHIFT; - - if (end_pfn <= highstart_pfn) - continue; - - if (start_pfn < highstart_pfn) - start_pfn = highstart_pfn; - - map_high_region(start_pfn, end_pfn); - } } void sparc_flush_page_to_ram(struct page *page) diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 731ee7cc40a5..585bdadba47d 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -69,9 +69,6 @@ extern unsigned long highstart_pfn, highend_pfn; arch_flush_lazy_mmu_mode(); \ } while (0) -extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn); - #endif /* __KERNEL__ */ #endif /* _ASM_X86_HIGHMEM_H */ diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 5469d7a7c40f..53ba39ce010c 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -41,10 +41,6 @@ static inline int numa_cpu_node(int cpu) } #endif /* CONFIG_NUMA */ -#ifdef CONFIG_X86_32 -# include -#endif - #ifdef CONFIG_NUMA extern void numa_set_node(int cpu, int node); extern void numa_clear_node(int cpu); diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h deleted file mode 100644 index 9c8e9e85be77..000000000000 --- a/arch/x86/include/asm/numa_32.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_NUMA_32_H -#define _ASM_X86_NUMA_32_H - -#ifdef CONFIG_HIGHMEM -extern void set_highmem_pages_init(void); -#else -static inline void set_highmem_pages_init(void) -{ -} -#endif - -#endif /* _ASM_X86_NUMA_32_H */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index e0c99a8760ca..32035d5be5a0 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -42,8 +42,6 @@ obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP) += dump_pagetables.o obj-$(CONFIG_PTDUMP_DEBUGFS) += debug_pagetables.o -obj-$(CONFIG_HIGHMEM) += highmem_32.o - KASAN_SANITIZE_kasan_init_$(BITS).o := n obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c deleted file mode 100644 index d9efa35711ee..000000000000 --- a/arch/x86/mm/highmem_32.c +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include -#include -#include /* for totalram_pages */ -#include -#include - -void __init set_highmem_pages_init(void) -{ - struct zone *zone; - int nid; - - /* - * Explicitly reset zone->managed_pages because set_highmem_pages_init() - * is invoked before memblock_free_all() - */ - reset_all_zones_managed_pages(); - for_each_zone(zone) { - unsigned long zone_start_pfn, zone_end_pfn; - - if (!is_highmem(zone)) - continue; - - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone_start_pfn + zone->spanned_pages; - - nid = zone_to_nid(zone); - printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, nid, zone_start_pfn, zone_end_pfn); - - add_highpages_with_active_regions(nid, zone_start_pfn, - zone_end_pfn); - } -} diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 801b659ead0c..9ee8ec2bc5d1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -394,23 +394,6 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) pkmap_page_table = virt_to_kpte(vaddr); } - -void __init add_highpages_with_active_regions(int nid, - unsigned long start_pfn, unsigned long end_pfn) -{ - phys_addr_t start, end; - u64 i; - - for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &start, &end, NULL) { - unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), - start_pfn, end_pfn); - unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), - start_pfn, end_pfn); - for ( ; pfn < e_pfn; pfn++) - if (pfn_valid(pfn)) - free_highmem_page(pfn_to_page(pfn)); - } -} #else static inline void permanent_kmaps_init(pgd_t *pgd_base) { @@ -715,17 +698,6 @@ void __init mem_init(void) #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif - /* - * With CONFIG_DEBUG_PAGEALLOC initialization of highmem pages has to - * be done before memblock_free_all(). Memblock use free low memory for - * temporary data (see find_range_array()) and for this purpose can use - * pages that was already passed to the buddy allocator, hence marked as - * not accessible in the page tables when compiled with - * CONFIG_DEBUG_PAGEALLOC. Otherwise order of initialization is not - * important here. - */ - set_highmem_pages_init(); - /* this will put all low memory onto the freelists */ memblock_free_all(); diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 9b662477b3d4..47ecbe28263e 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -129,41 +129,12 @@ void __init zones_init(void) print_vm_layout(); } -static void __init free_highpages(void) -{ -#ifdef CONFIG_HIGHMEM - unsigned long max_low = max_low_pfn; - phys_addr_t range_start, range_end; - u64 i; - - /* set highmem page free */ - for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, - &range_start, &range_end, NULL) { - unsigned long start = PFN_UP(range_start); - unsigned long end = PFN_DOWN(range_end); - - /* Ignore complete lowmem entries */ - if (end <= max_low) - continue; - - /* Truncate partial highmem entries */ - if (start < max_low) - start = max_low; - - for (; start < end; start++) - free_highmem_page(pfn_to_page(start)); - } -#endif -} - /* * Initialize memory pages. */ void __init mem_init(void) { - free_highpages(); - memblock_free_all(); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 8c4cb8c28507..6c519a5098d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3275,7 +3275,6 @@ extern void reserve_bootmem_region(phys_addr_t start, /* Free the reserved page into the buddy system, so it gets managed. */ void free_reserved_page(struct page *page); -#define free_highmem_page(page) free_reserved_page(page) static inline void mark_page_reserved(struct page *page) { diff --git a/mm/memblock.c b/mm/memblock.c index 95af35fd1389..64ae678cd1d1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2164,8 +2164,7 @@ static unsigned long __init __free_memory_core(phys_addr_t start, phys_addr_t end) { unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); + unsigned long end_pfn = PFN_DOWN(end); if (start_pfn >= end_pfn) return 0; -- 2.51.0 From 0d98484ee3330c35d1dc0da0be0871b3596cbe0d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:02 +0200 Subject: [PATCH 11/16] arch, mm: introduce arch_mm_preinit Currently, implementation of mem_init() in every architecture consists of one or more of the following: * initializations that must run before page allocator is active, for instance swiotlb_init() * a call to memblock_free_all() to release all the memory to the buddy allocator * initializations that must run after page allocator is ready and there is no arch-specific hook other than mem_init() for that, like for example register_page_bootmem_info() in x86 and sparc64 or simple setting of mem_init_done = 1 in several architectures * a bunch of semi-related stuff that apparently had no better place to live, for example a ton of BUILD_BUG_ON()s in parisc. Introduce arch_mm_preinit() that will be the first thing called from mm_core_init(). On architectures that have initializations that must happen before the page allocator is ready, move those into arch_mm_preinit() along with the code that does not depend on ordering with page allocator setup. On several architectures this results in reduction of mem_init() to a single call to memblock_free_all() that allows its consolidation next. Link: https://lkml.kernel.org/r/20250313135003.836600-13-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. Miller Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arc/mm/init.c | 13 ++++++------- arch/arm/mm/init.c | 21 ++++++++++++--------- arch/arm64/mm/init.c | 21 ++++++++++++--------- arch/mips/mm/init.c | 11 +++++++---- arch/powerpc/mm/mem.c | 9 ++++++--- arch/riscv/mm/init.c | 8 ++++++-- arch/s390/mm/init.c | 5 ++++- arch/sparc/mm/init_32.c | 5 ++++- arch/um/kernel/mem.c | 7 +++++-- arch/x86/mm/init_32.c | 6 +++++- arch/x86/mm/init_64.c | 5 ++++- include/linux/mm.h | 1 + mm/mm_init.c | 5 +++++ 13 files changed, 77 insertions(+), 40 deletions(-) diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 11ce638731c9..90715b4a0bfa 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -157,11 +157,16 @@ void __init setup_arch_memory(void) free_area_init(max_zone_pfn); } -static void __init highmem_init(void) +void __init arch_mm_preinit(void) { #ifdef CONFIG_HIGHMEM memblock_phys_free(high_mem_start, high_mem_sz); #endif + + BUILD_BUG_ON((PTRS_PER_PGD * sizeof(pgd_t)) > PAGE_SIZE); + BUILD_BUG_ON((PTRS_PER_PUD * sizeof(pud_t)) > PAGE_SIZE); + BUILD_BUG_ON((PTRS_PER_PMD * sizeof(pmd_t)) > PAGE_SIZE); + BUILD_BUG_ON((PTRS_PER_PTE * sizeof(pte_t)) > PAGE_SIZE); } /* @@ -172,13 +177,7 @@ static void __init highmem_init(void) */ void __init mem_init(void) { - highmem_init(); memblock_free_all(); - - BUILD_BUG_ON((PTRS_PER_PGD * sizeof(pgd_t)) > PAGE_SIZE); - BUILD_BUG_ON((PTRS_PER_PUD * sizeof(pud_t)) > PAGE_SIZE); - BUILD_BUG_ON((PTRS_PER_PMD * sizeof(pmd_t)) > PAGE_SIZE); - BUILD_BUG_ON((PTRS_PER_PTE * sizeof(pte_t)) > PAGE_SIZE); } #ifdef CONFIG_HIGHMEM diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 7bb5ce02b9b5..7222100b0631 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -237,12 +237,7 @@ static inline void poison_init_mem(void *s, size_t count) *p++ = 0xe7fddef0; } -/* - * mem_init() marks the free areas in the mem_map and tells us how much - * memory is free. This is done after various parts of the system have - * claimed their memory after the kernel image. - */ -void __init mem_init(void) +void __init arch_mm_preinit(void) { #ifdef CONFIG_ARM_LPAE swiotlb_init(max_pfn > arm_dma_pfn_limit, SWIOTLB_VERBOSE); @@ -253,9 +248,6 @@ void __init mem_init(void) memblock_phys_free(PHYS_OFFSET, __pa(swapper_pg_dir) - PHYS_OFFSET); #endif - /* this will put all unused low memory onto the freelists */ - memblock_free_all(); - /* * Check boundaries twice: Some fundamental inconsistencies can * be detected at build time already. @@ -271,6 +263,17 @@ void __init mem_init(void) #endif } +/* + * mem_init() marks the free areas in the mem_map and tells us how much + * memory is free. This is done after various parts of the system have + * claimed their memory after the kernel image. + */ +void __init mem_init(void) +{ + /* this will put all unused low memory onto the freelists */ + memblock_free_all(); +} + #ifdef CONFIG_STRICT_KERNEL_RWX struct section_perm { const char *name; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 53a0b105890b..2312e3812043 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -357,12 +357,7 @@ void __init bootmem_init(void) memblock_dump_all(); } -/* - * mem_init() marks the free areas in the mem_map and tells us how much memory - * is free. This is done after various parts of the system have claimed their - * memory after the kernel image. - */ -void __init mem_init(void) +void __init arch_mm_preinit(void) { unsigned int flags = SWIOTLB_VERBOSE; bool swiotlb = max_pfn > PFN_DOWN(arm64_dma_phys_limit); @@ -386,9 +381,6 @@ void __init mem_init(void) swiotlb_init(swiotlb, flags); swiotlb_update_mem_attributes(); - /* this will put all unused low memory onto the freelists */ - memblock_free_all(); - /* * Check boundaries twice: Some fundamental inconsistencies can be * detected at build time already. @@ -414,6 +406,17 @@ void __init mem_init(void) } } +/* + * mem_init() marks the free areas in the mem_map and tells us how much memory + * is free. This is done after various parts of the system have claimed their + * memory after the kernel image. + */ +void __init mem_init(void) +{ + /* this will put all unused low memory onto the freelists */ + memblock_free_all(); +} + void free_initmem(void) { void *lm_init_begin = lm_alias(__init_begin); diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 075177e817ac..eec38e7735dd 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -425,7 +425,7 @@ void __init paging_init(void) static struct kcore_list kcore_kseg0; #endif -void __init mem_init(void) +void __init arch_mm_preinit(void) { /* * When PFN_PTE_SHIFT is greater than PAGE_SHIFT we won't have enough PTE @@ -435,7 +435,6 @@ void __init mem_init(void) maar_init(); setup_zero_pages(); /* Setup zeroed pages. */ - memblock_free_all(); #ifdef CONFIG_64BIT if ((unsigned long) &_text > (unsigned long) CKSEG0) @@ -446,13 +445,17 @@ void __init mem_init(void) #endif } #else /* CONFIG_NUMA */ -void __init mem_init(void) +void __init arch_mm_preinit(void) { setup_zero_pages(); /* This comes from node 0 */ - memblock_free_all(); } #endif /* !CONFIG_NUMA */ +void __init mem_init(void) +{ + memblock_free_all(); +} + void free_init_pages(const char *what, unsigned long begin, unsigned long end) { unsigned long pfn; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 1bc94bca9944..68efdaf14e58 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -273,7 +273,7 @@ void __init paging_init(void) mark_nonram_nosave(); } -void __init mem_init(void) +void __init arch_mm_preinit(void) { /* * book3s is limited to 16 page sizes due to encoding this in @@ -295,8 +295,6 @@ void __init mem_init(void) kasan_late_init(); - memblock_free_all(); - #if defined(CONFIG_PPC_E500) && !defined(CONFIG_SMP) /* * If smp is enabled, next_tlbcam_idx is initialized in the cpu up @@ -329,6 +327,11 @@ void __init mem_init(void) #endif /* CONFIG_PPC32 */ } +void __init mem_init(void) +{ + memblock_free_all(); +} + void free_initmem(void) { ppc_md.progress = ppc_printk_progress; diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index ac6d41e86243..9efadabf6be1 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -171,7 +171,7 @@ static void __init print_vm_layout(void) static void print_vm_layout(void) { } #endif /* CONFIG_DEBUG_VM */ -void __init mem_init(void) +void __init arch_mm_preinit(void) { bool swiotlb = max_pfn > PFN_DOWN(dma32_phys_limit); #ifdef CONFIG_FLATMEM @@ -192,11 +192,15 @@ void __init mem_init(void) } swiotlb_init(swiotlb, SWIOTLB_VERBOSE); - memblock_free_all(); print_vm_layout(); } +void __init mem_init(void) +{ + memblock_free_all(); +} + /* Limit the memory size via mem. */ static phys_addr_t memory_limit; #ifdef CONFIG_XIP_KERNEL diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 4bd6f316d71f..e771b7458d8b 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -154,7 +154,7 @@ static void pv_init(void) swiotlb_update_mem_attributes(); } -void __init mem_init(void) +void __init arch_mm_preinit(void) { cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask); cpumask_set_cpu(0, mm_cpumask(&init_mm)); @@ -163,7 +163,10 @@ void __init mem_init(void) kfence_split_mapping(); setup_zero_pages(); /* Setup zeroed pages. */ +} +void __init mem_init(void) +{ /* this will put all low memory onto the freelists */ memblock_free_all(); } diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 043e9b6fadd0..e16c32c5728f 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -232,7 +232,7 @@ static void __init taint_real_pages(void) } } -void __init mem_init(void) +void __init arch_mm_preinit(void) { int i; @@ -262,7 +262,10 @@ void __init mem_init(void) memset(sparc_valid_addr_bitmap, 0, i << 2); taint_real_pages(); +} +void __init mem_init(void) +{ memblock_free_all(); } diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index befed230aac2..cce387438e60 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -54,7 +54,7 @@ int kmalloc_ok = 0; /* Used during early boot */ static unsigned long brk_end; -void __init mem_init(void) +void __init arch_mm_preinit(void) { /* clear the zero-page */ memset(empty_zero_page, 0, PAGE_SIZE); @@ -66,10 +66,13 @@ void __init mem_init(void) map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); memblock_free((void *)brk_end, uml_reserved - brk_end); uml_reserved = brk_end; + max_pfn = max_low_pfn; +} +void __init mem_init(void) +{ /* this will put all low memory onto the freelists */ memblock_free_all(); - max_pfn = max_low_pfn; kmalloc_ok = 1; } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9ee8ec2bc5d1..16664c5464b5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -691,13 +691,17 @@ static void __init test_wp_bit(void) panic("Linux doesn't support CPUs with broken WP."); } -void __init mem_init(void) +void __init arch_mm_preinit(void) { pci_iommu_alloc(); #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif +} + +void __init mem_init(void) +{ /* this will put all low memory onto the freelists */ memblock_free_all(); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6e8e4ef5312a..a88f7db8089e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1348,10 +1348,13 @@ failed: panic("Failed to pre-allocate %s pages for vmalloc area\n", lvl); } -void __init mem_init(void) +void __init arch_mm_preinit(void) { pci_iommu_alloc(); +} +void __init mem_init(void) +{ /* clear_bss() already clear the empty_zero_page */ /* this will put all memory onto the freelists */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 6c519a5098d4..c417e5634a58 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -43,6 +43,7 @@ struct folio_batch; extern int sysctl_page_lock_unfairness; +void arch_mm_preinit(void); void mm_core_init(void); void init_mm_internals(void); diff --git a/mm/mm_init.c b/mm/mm_init.c index bd7071c32a44..6844de516a50 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2734,11 +2734,16 @@ static void __init mem_init_print_info(void) ); } +void __init __weak arch_mm_preinit(void) +{ +} + /* * Set up kernel memory allocators */ void __init mm_core_init(void) { + arch_mm_preinit(); hugetlb_bootmem_alloc(); /* Initializations relying on SMP setup */ -- 2.51.0 From 8afa901c147a41f92e83943cddf154bbb7995ee6 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 13 Mar 2025 15:50:03 +0200 Subject: [PATCH 12/16] arch, mm: make releasing of memory to page allocator more explicit The point where the memory is released from memblock to the buddy allocator is hidden inside arch-specific mem_init()s and the call to memblock_free_all() is needlessly duplicated in every artiste cure and after introduction of arch_mm_preinit() hook, mem_init() implementation on many architecture only contains the call to memblock_free_all(). Pull memblock_free_all() call into mm_core_init() and drop mem_init() on relevant architectures to make it more explicit where the free memory is released from memblock to the buddy allocator and to reduce code duplication in architecture specific code. Link: https://lkml.kernel.org/r/20250313135003.836600-14-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Acked-by: Dave Hansen [x86] Acked-by: Geert Uytterhoeven [m68k] Tested-by: Mark Brown Cc: Alexander Gordeev Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Catalin Marinas Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Guo Ren (csky) Cc: Heiko Carstens Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Jiaxun Yang Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Madhavan Srinivasan Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Palmer Dabbelt Cc: Richard Weinberger Cc: Russel King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/mm/init.c | 6 ------ arch/arc/mm/init.c | 11 ----------- arch/arm/mm/init.c | 11 ----------- arch/arm64/mm/init.c | 11 ----------- arch/csky/mm/init.c | 5 ----- arch/hexagon/mm/init.c | 18 ------------------ arch/loongarch/kernel/numa.c | 5 ----- arch/loongarch/mm/init.c | 5 ----- arch/m68k/mm/init.c | 2 -- arch/microblaze/mm/init.c | 3 --- arch/mips/mm/init.c | 5 ----- arch/nios2/mm/init.c | 6 ------ arch/openrisc/mm/init.c | 3 --- arch/parisc/mm/init.c | 2 -- arch/powerpc/mm/mem.c | 5 ----- arch/riscv/mm/init.c | 5 ----- arch/s390/mm/init.c | 6 ------ arch/sh/mm/init.c | 2 -- arch/sparc/mm/init_32.c | 5 ----- arch/sparc/mm/init_64.c | 2 -- arch/um/kernel/mem.c | 2 -- arch/x86/mm/init_32.c | 3 --- arch/x86/mm/init_64.c | 2 -- arch/xtensa/mm/init.c | 9 --------- include/linux/memblock.h | 1 - mm/internal.h | 3 ++- mm/mm_init.c | 5 +++++ 27 files changed, 7 insertions(+), 136 deletions(-) diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 3ab2d2f3c917..2d491b8cdab9 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -273,12 +273,6 @@ srm_paging_stop (void) } #endif -void __init -mem_init(void) -{ - memblock_free_all(); -} - static const pgprot_t protection_map[16] = { [VM_NONE] = _PAGE_P(_PAGE_FOE | _PAGE_FOW | _PAGE_FOR), diff --git a/arch/arc/mm/init.c b/arch/arc/mm/init.c index 90715b4a0bfa..a73cc94f806e 100644 --- a/arch/arc/mm/init.c +++ b/arch/arc/mm/init.c @@ -169,17 +169,6 @@ void __init arch_mm_preinit(void) BUILD_BUG_ON((PTRS_PER_PTE * sizeof(pte_t)) > PAGE_SIZE); } -/* - * mem_init - initializes memory - * - * Frees up bootmem - * Calculates and displays memory available/used - */ -void __init mem_init(void) -{ - memblock_free_all(); -} - #ifdef CONFIG_HIGHMEM int pfn_valid(unsigned long pfn) { diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 7222100b0631..54bdca025c9f 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -263,17 +263,6 @@ void __init arch_mm_preinit(void) #endif } -/* - * mem_init() marks the free areas in the mem_map and tells us how much - * memory is free. This is done after various parts of the system have - * claimed their memory after the kernel image. - */ -void __init mem_init(void) -{ - /* this will put all unused low memory onto the freelists */ - memblock_free_all(); -} - #ifdef CONFIG_STRICT_KERNEL_RWX struct section_perm { const char *name; diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 2312e3812043..4b966d5709d2 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -406,17 +406,6 @@ void __init arch_mm_preinit(void) } } -/* - * mem_init() marks the free areas in the mem_map and tells us how much memory - * is free. This is done after various parts of the system have claimed their - * memory after the kernel image. - */ -void __init mem_init(void) -{ - /* this will put all unused low memory onto the freelists */ - memblock_free_all(); -} - void free_initmem(void) { void *lm_init_begin = lm_alias(__init_begin); diff --git a/arch/csky/mm/init.c b/arch/csky/mm/init.c index 3914c2b873da..573da66b2543 100644 --- a/arch/csky/mm/init.c +++ b/arch/csky/mm/init.c @@ -42,11 +42,6 @@ unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); -void __init mem_init(void) -{ - memblock_free_all(); -} - void free_initmem(void) { free_initmem_default(-1); diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index d412c2314509..34eb9d424b96 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -43,24 +43,6 @@ DEFINE_SPINLOCK(kmap_gen_lock); /* checkpatch says don't init this to 0. */ unsigned long long kmap_generation; -/* - * mem_init - initializes memory - * - * Frees up bootmem - * Fixes up more stuff for HIGHMEM - * Calculates and displays memory available/used - */ -void __init mem_init(void) -{ - /* No idea where this is actually declared. Seems to evade LXR. */ - memblock_free_all(); - - /* - * To-Do: someone somewhere should wipe out the bootmem map - * after we're done? - */ -} - void sync_icache_dcache(pte_t pte) { unsigned long addr; diff --git a/arch/loongarch/kernel/numa.c b/arch/loongarch/kernel/numa.c index 8eb489725b1a..30a72fd528c0 100644 --- a/arch/loongarch/kernel/numa.c +++ b/arch/loongarch/kernel/numa.c @@ -387,11 +387,6 @@ void __init paging_init(void) free_area_init(zones_size); } -void __init mem_init(void) -{ - memblock_free_all(); -} - int pcibus_to_node(struct pci_bus *bus) { return dev_to_node(&bus->dev); diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 6affa3609188..fdb7f73ad160 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -75,11 +75,6 @@ void __init paging_init(void) free_area_init(max_zone_pfns); } - -void __init mem_init(void) -{ - memblock_free_all(); -} #endif /* !CONFIG_NUMA */ void __ref free_initmem(void) diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 8b11d0d545aa..488411af1b3f 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -121,7 +121,5 @@ static inline void init_pointer_tables(void) void __init mem_init(void) { - /* this will put all memory onto the freelists */ - memblock_free_all(); init_pointer_tables(); } diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 3e664e0efc33..65f0d1fb8a2a 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -107,9 +107,6 @@ void __init setup_memory(void) void __init mem_init(void) { - /* this will put all memory onto the freelists */ - memblock_free_all(); - mem_init_done = 1; } diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index eec38e7735dd..a673d3d68254 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -451,11 +451,6 @@ void __init arch_mm_preinit(void) } #endif /* !CONFIG_NUMA */ -void __init mem_init(void) -{ - memblock_free_all(); -} - void free_init_pages(const char *what, unsigned long begin, unsigned long end) { unsigned long pfn; diff --git a/arch/nios2/mm/init.c b/arch/nios2/mm/init.c index 4ba8dfa0d238..94efa3de3933 100644 --- a/arch/nios2/mm/init.c +++ b/arch/nios2/mm/init.c @@ -60,12 +60,6 @@ void __init paging_init(void) (unsigned long)empty_zero_page + PAGE_SIZE); } -void __init mem_init(void) -{ - /* this will put all memory onto the freelists */ - memblock_free_all(); -} - void __init mmu_init(void) { flush_tlb_all(); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 72c5952607ac..be1c2eb8bb94 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -196,9 +196,6 @@ void __init mem_init(void) /* clear the zero-page */ memset((void *)empty_zero_page, 0, PAGE_SIZE); - /* this will put all low memory onto the freelists */ - memblock_free_all(); - printk("mem_init_done ...........................................\n"); mem_init_done = 1; return; diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 4fbe354dc9b4..14270715d754 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -562,8 +562,6 @@ void __init mem_init(void) BUILD_BUG_ON(TMPALIAS_MAP_START >= 0x80000000); #endif - memblock_free_all(); - #ifdef CONFIG_PA11 if (boot_cpu_data.cpu_type == pcxl2 || boot_cpu_data.cpu_type == pcxl) { pcxl_dma_start = (unsigned long)SET_MAP_OFFSET(MAP_START); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 68efdaf14e58..d8fe11b64259 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -327,11 +327,6 @@ void __init arch_mm_preinit(void) #endif /* CONFIG_PPC32 */ } -void __init mem_init(void) -{ - memblock_free_all(); -} - void free_initmem(void) { ppc_md.progress = ppc_printk_progress; diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 9efadabf6be1..79b649f6de72 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -196,11 +196,6 @@ void __init arch_mm_preinit(void) print_vm_layout(); } -void __init mem_init(void) -{ - memblock_free_all(); -} - /* Limit the memory size via mem. */ static phys_addr_t memory_limit; #ifdef CONFIG_XIP_KERNEL diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index e771b7458d8b..5b7b7b281334 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -165,12 +165,6 @@ void __init arch_mm_preinit(void) setup_zero_pages(); /* Setup zeroed pages. */ } -void __init mem_init(void) -{ - /* this will put all low memory onto the freelists */ - memblock_free_all(); -} - unsigned long memory_block_size_bytes(void) { /* diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 6d459ffba4bc..99e302eeeec1 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -330,8 +330,6 @@ unsigned int mem_init_done = 0; void __init mem_init(void) { - memblock_free_all(); - /* Set this up early, so we can take care of the zero page */ cpu_cache_init(); diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index e16c32c5728f..fdc93dd12c3e 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -264,11 +264,6 @@ void __init arch_mm_preinit(void) taint_real_pages(); } -void __init mem_init(void) -{ - memblock_free_all(); -} - void sparc_flush_page_to_ram(struct page *page) { unsigned long vaddr = (unsigned long)page_address(page); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 34d46adb9571..760818950464 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2505,8 +2505,6 @@ static void __init register_page_bootmem_info(void) } void __init mem_init(void) { - memblock_free_all(); - /* * Must be done after boot memory is put on freelist, because here we * might set fields in deferred struct pages that have not yet been diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index cce387438e60..379f33a1babf 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -71,8 +71,6 @@ void __init arch_mm_preinit(void) void __init mem_init(void) { - /* this will put all low memory onto the freelists */ - memblock_free_all(); kmalloc_ok = 1; } diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 16664c5464b5..95b2758b4e4d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -702,9 +702,6 @@ void __init arch_mm_preinit(void) void __init mem_init(void) { - /* this will put all low memory onto the freelists */ - memblock_free_all(); - after_bootmem = 1; x86_init.hyper.init_after_bootmem(); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a88f7db8089e..d67f15386ea2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1357,8 +1357,6 @@ void __init mem_init(void) { /* clear_bss() already clear the empty_zero_page */ - /* this will put all memory onto the freelists */ - memblock_free_all(); after_bootmem = 1; x86_init.hyper.init_after_bootmem(); diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 47ecbe28263e..cc52733a0649 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -129,15 +129,6 @@ void __init zones_init(void) print_vm_layout(); } -/* - * Initialize memory pages. - */ - -void __init mem_init(void) -{ - memblock_free_all(); -} - static void __init parse_memmap_one(char *p) { char *oldp; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e79eb6ac516f..ef5a1ecc6e59 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -133,7 +133,6 @@ int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); int memblock_reserved_mark_noinit(phys_addr_t base, phys_addr_t size); -void memblock_free_all(void); void memblock_free(void *ptr, size_t size); void reset_all_zones_managed_pages(void); diff --git a/mm/internal.h b/mm/internal.h index 558c8e2a3d94..2f52a65272c1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1475,7 +1475,8 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, } extern bool mirrored_kernelcore; -extern bool memblock_has_mirror(void); +bool memblock_has_mirror(void); +void memblock_free_all(void); static __always_inline void vma_set_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, diff --git a/mm/mm_init.c b/mm/mm_init.c index 6844de516a50..c82b0162f1cb 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2738,6 +2738,10 @@ void __init __weak arch_mm_preinit(void) { } +void __init __weak mem_init(void) +{ +} + /* * Set up kernel memory allocators */ @@ -2761,6 +2765,7 @@ void __init mm_core_init(void) report_meminit(); kmsan_init_shadow(); stack_depot_early_init(); + memblock_free_all(); mem_init(); kmem_cache_init(); /* -- 2.51.0 From 4c9ea539ad59ec60676930dacee02b7adde2e0c0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:58:56 -0800 Subject: [PATCH 13/16] mm/damon/sysfs: validate user inputs from damon_sysfs_commit_input() Patch series "mm/damon/sysfs: commit parameters online via damon_call()". Due to the lack of ways to synchronously access DAMON internal data, DAMON sysfs interface is using damon_callback hooks with its own synchronization mechanism. The mechanism is built on top of damon_callback hooks in an ineifficient and complicated way. Patch series "mm/damon: replace most damon_callback usages in sysfs with new core functions", which starts with commit e035320fd38e ("mm/damon/sysfs-schemes: remove unnecessary schemes existence check in damon_sysfs_schemes_clear_regions()") introduced two new DAMON kernel API functions that providing the synchronous access, replaced most damon_callback hooks usage in DAMON sysfs interface, and cleaned up unnecessary code. Continue the replacement and cleanup works. Update the last DAMON sysfs' usage of its own synchronization mechanism, namely online DAMON parameters commit, to use damon_call() instead of the damon_callback hooks and the hard-to-maintain core-external synchronization mechanism. Then remove the no more be used code due to the change, and more unused code that just not yet cleaned up. The first four patches (patches 1-4) of this series makes DAMON sysfs interface's online parameters commit to use damon_call(). Then, following three patches (patches 5-7) remove the DAMON sysfs interface's own synchronization mechanism and its usages, which is no more be used by anyone due to the first four patches. Finally, six patches (8-13) do more cleanup of outdated comment and unused code. This patch (of 13): Online DAMON parameters commit via DAMON sysfs interface can make kdamond stop. This behavior was made because it can make the implementation simpler. The implementation tries committing the parameter without validation. If it finds something wrong in the middle of the parameters update, it returns error without reverting the partially committed parameters back. It is safe though, since it immediately breaks kdamond main loop in the case of the error return. Users can make the wrong parameters by mistake, though. Stopping kdamond in the case is not very useful behavior. Also this makes it difficult to utilize damon_call() instead of damon_callback hook for online parameters update, since damon_call() cannot immediately break kdamond main loop in the middle. Validate the input parameters and return error when it fails before starting parameters updates. In case of mistakenly wrong parameters, kdamond can continue running with the old and valid parameters. Link: https://lkml.kernel.org/r/20250306175908.66300-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250306175908.66300-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index ccd435d234b9..87e4c6e3614e 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1449,11 +1449,11 @@ static struct damon_ctx *damon_sysfs_build_ctx( * damon_sysfs_commit_input() - Commit user inputs to a running kdamond. * @kdamond: The kobject wrapper for the associated kdamond. * - * If the sysfs input is wrong, the kdamond will be terminated. + * Returns error if the sysfs input is wrong. */ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) { - struct damon_ctx *param_ctx; + struct damon_ctx *param_ctx, *test_ctx; int err; if (!damon_sysfs_kdamond_running(kdamond)) @@ -1465,7 +1465,15 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) param_ctx = damon_sysfs_build_ctx(kdamond->contexts->contexts_arr[0]); if (IS_ERR(param_ctx)) return PTR_ERR(param_ctx); + test_ctx = damon_new_ctx(); + err = damon_commit_ctx(test_ctx, param_ctx); + if (err) { + damon_sysfs_destroy_targets(test_ctx); + damon_destroy_ctx(test_ctx); + goto out; + } err = damon_commit_ctx(kdamond->damon_ctx, param_ctx); +out: damon_sysfs_destroy_targets(param_ctx); damon_destroy_ctx(param_ctx); return err; -- 2.51.0 From bf74bdfd2edb60ab44b48a6ef705207dc889dc3d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:58:57 -0800 Subject: [PATCH 14/16] mm/damon/core: invoke kdamond_call() after merging is done if possible kdamond_call() callers may iterate the regions, so better to call it when the number of regions is as small as possible. It is when kdamond_merge_regions() is finished. Invoke it on the point. This change is also aimed to make future changes for carrying online parameters commit with damon_call() easier. The commit operation should be able to make sequence between other aggregation interval based operations including regioins merging and aggregation reset. Placing damon_call() invocation after the regions merging makes the sequence handling simpler. Link: https://lkml.kernel.org/r/20250306175908.66300-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index ebbb22840435..b9a9db1a90b4 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2421,7 +2421,6 @@ static int kdamond_fn(void *data) if (ctx->callback.after_sampling && ctx->callback.after_sampling(ctx)) break; - kdamond_call(ctx, false); kdamond_usleep(sample_interval); ctx->passed_sample_intervals++; @@ -2439,9 +2438,10 @@ static int kdamond_fn(void *data) } /* - * do kdamond_apply_schemes() after kdamond_merge_regions() if - * possible, to reduce overhead + * do kdamond_call() and kdamond_apply_schemes() after + * kdamond_merge_regions() if possible, to reduce overhead */ + kdamond_call(ctx, false); if (!list_empty(&ctx->schemes)) kdamond_apply_schemes(ctx); else -- 2.51.0 From 258d941e5877f5fd40d5e636540c0a00458b8825 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:58:58 -0800 Subject: [PATCH 15/16] mm/damon/core: make damon_set_attrs() be safe to be called from damon_call() Currently all DAMON kernel API callers do online DAMON parameters commit from damon_callback->after_aggregation because only those are safe place to call the DAMON monitoring attributes update function, namely damon_set_attrs(). Because damon_callback hooks provide no synchronization, the callers work in asynchronous ways or implement their own inefficient and complicated synchronization mechanisms. It also means online DAMON parameters commit can take up to one aggregation interval. On large systems having long aggregation intervals, that can be too slow. The synchronization can be done in more efficient and simple way while removing the latency constraint if it can be done using damon_call(). The fact that damon_call() can be executed in the middle of the aggregation makes damon_set_attrs() unsafe to be called from it, though. Two real problems can occur in the case. First, converting the not yet completely aggregated nr_accesses for new user-set intervals can arguably degrade the accuracy or at least make the logic complicated. Second, kdamond_reset_aggregated() will not be called after the monitoring results update, so next aggregation starts from unclean state. This can result in inconsistent and unexpected nr_accesses_bp. Make it safe as follows. Catch the middle-of-the-aggregation case from damon_set_attrs() by checking the passed_sample_intervals and next_aggregationsis of the context. And pass the information to nr_accesses conversion logic. The logic works as before if it is not the case (called after the current aggregation is completed). If it is the case (committing parameters in the middle of the aggregation), it drops the nr_accesses information that so far aggregated, and make the status same to the beginning of this aggregation, but as if the last aggregation was started with the updated sampling/aggregation intervals. The middle-of-aggregastion check introduce yet another edge case, though. This happens because kdamond_tune_intervals() can also call damon_set_attrs() with the middle-of-aggregation check. Consider damon_call() for parameters commit and kdamond_tune_intervals() are called in same iteration of kdamond main loop. Because kdamond_tune_interval() is called for aggregation intervals, it should be the end of the aggregation. The first damon_set_attrs() call from kdamond_call() understands it is the end of the aggregation and correctly handle it. But, because the damon_set_attrs() updated next_aggregation_sis of the context. Hence, the second damon_set_attrs() invocation from kdamond_tune_interval() believes it is called in the middle of the aggregation. It therefore resets aggregated information so far. After that, kdamond_reset_interval() is called and double-reset the aggregated information. Avoid this case, too, by setting the next_aggregation_sis before kdamond_tune_intervals() is invoked. Link: https://lkml.kernel.org/r/20250306175908.66300-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 56 +++++++++++++++++++++++++++++-------- mm/damon/tests/core-kunit.h | 6 ++-- 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index b9a9db1a90b4..2be4099e0666 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -603,11 +603,25 @@ static unsigned int damon_nr_accesses_for_new_attrs(unsigned int nr_accesses, } static void damon_update_monitoring_result(struct damon_region *r, - struct damon_attrs *old_attrs, struct damon_attrs *new_attrs) + struct damon_attrs *old_attrs, struct damon_attrs *new_attrs, + bool aggregating) { - r->nr_accesses = damon_nr_accesses_for_new_attrs(r->nr_accesses, - old_attrs, new_attrs); - r->nr_accesses_bp = r->nr_accesses * 10000; + if (!aggregating) { + r->nr_accesses = damon_nr_accesses_for_new_attrs( + r->nr_accesses, old_attrs, new_attrs); + r->nr_accesses_bp = r->nr_accesses * 10000; + } else { + /* + * if this is called in the middle of the aggregation, reset + * the aggregations we made so far for this aggregation + * interval. In other words, make the status like + * kdamond_reset_aggregated() is called. + */ + r->last_nr_accesses = damon_nr_accesses_for_new_attrs( + r->last_nr_accesses, old_attrs, new_attrs); + r->nr_accesses_bp = r->last_nr_accesses * 10000; + r->nr_accesses = 0; + } r->age = damon_age_for_new_attrs(r->age, old_attrs, new_attrs); } @@ -620,7 +634,7 @@ static void damon_update_monitoring_result(struct damon_region *r, * ->nr_accesses and ->age of given damon_ctx's regions for new damon_attrs. */ static void damon_update_monitoring_results(struct damon_ctx *ctx, - struct damon_attrs *new_attrs) + struct damon_attrs *new_attrs, bool aggregating) { struct damon_attrs *old_attrs = &ctx->attrs; struct damon_target *t; @@ -635,7 +649,7 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx, damon_for_each_target(t, ctx) damon_for_each_region(r, t) damon_update_monitoring_result( - r, old_attrs, new_attrs); + r, old_attrs, new_attrs, aggregating); } /* @@ -662,10 +676,10 @@ static bool damon_valid_intervals_goal(struct damon_attrs *attrs) * @ctx: monitoring context * @attrs: monitoring attributes * - * This function should be called while the kdamond is not running, or an - * access check results aggregation is not ongoing (e.g., from - * &struct damon_callback->after_aggregation or - * &struct damon_callback->after_wmarks_check callbacks). + * This function should be called while the kdamond is not running, an access + * check results aggregation is not ongoing (e.g., from &struct + * damon_callback->after_aggregation or &struct + * damon_callback->after_wmarks_check callbacks), or from damon_call(). * * Every time interval is in micro-seconds. * @@ -676,6 +690,8 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) unsigned long sample_interval = attrs->sample_interval ? attrs->sample_interval : 1; struct damos *s; + bool aggregating = ctx->passed_sample_intervals < + ctx->next_aggregation_sis; if (!damon_valid_intervals_goal(attrs)) return -EINVAL; @@ -696,7 +712,7 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) ctx->next_ops_update_sis = ctx->passed_sample_intervals + attrs->ops_update_interval / sample_interval; - damon_update_monitoring_results(ctx, attrs); + damon_update_monitoring_results(ctx, attrs, aggregating); ctx->attrs = *attrs; damon_for_each_scheme(s, ctx) @@ -2453,6 +2469,24 @@ static int kdamond_fn(void *data) if (ctx->attrs.intervals_goal.aggrs && ctx->passed_sample_intervals >= ctx->next_intervals_tune_sis) { + /* + * ctx->next_aggregation_sis might be updated + * from kdamond_call(). In the case, + * damon_set_attrs() which will be called from + * kdamond_tune_interval() may wrongly think + * this is in the middle of the current + * aggregation, and make aggregation + * information reset for all regions. Then, + * following kdamond_reset_aggregated() call + * will make the region information invalid, + * particularly for ->nr_accesses_bp. + * + * Reset ->next_aggregation_sis to avoid that. + * It will anyway correctly updated after this + * if caluse. + */ + ctx->next_aggregation_sis = + next_aggregation_sis; ctx->next_intervals_tune_sis += ctx->attrs.aggr_samples * ctx->attrs.intervals_goal.aggrs; diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index 532c6a6f21f9..be0fea9ee5fc 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -348,19 +348,19 @@ static void damon_test_update_monitoring_result(struct kunit *test) new_attrs = (struct damon_attrs){ .sample_interval = 100, .aggr_interval = 10000,}; - damon_update_monitoring_result(r, &old_attrs, &new_attrs); + damon_update_monitoring_result(r, &old_attrs, &new_attrs, false); KUNIT_EXPECT_EQ(test, r->nr_accesses, 15); KUNIT_EXPECT_EQ(test, r->age, 2); new_attrs = (struct damon_attrs){ .sample_interval = 1, .aggr_interval = 1000}; - damon_update_monitoring_result(r, &old_attrs, &new_attrs); + damon_update_monitoring_result(r, &old_attrs, &new_attrs, false); KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); KUNIT_EXPECT_EQ(test, r->age, 2); new_attrs = (struct damon_attrs){ .sample_interval = 1, .aggr_interval = 100}; - damon_update_monitoring_result(r, &old_attrs, &new_attrs); + damon_update_monitoring_result(r, &old_attrs, &new_attrs, false); KUNIT_EXPECT_EQ(test, r->nr_accesses, 150); KUNIT_EXPECT_EQ(test, r->age, 20); -- 2.51.0 From 3301f1861d34f53911a30a8f5f41b9141bd8ed39 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 6 Mar 2025 09:58:59 -0800 Subject: [PATCH 16/16] mm/damon/sysfs: handle commit command using damon_call() DAMON sysfs interface is using damon_callback->after_aggregation hook with its self-implemented synchronization mechanism for the hook. It is inefficient, complicated, and take up to one aggregation interval to complete, which can be long on some configs. Use damon_call() instead. It provides a synchronization mechanism that built inside DAMON's core layer, so more efficient than DAMON sysfs interface's own one. Also it isolates the implementation inside the core layer, and hence it makes the code easier to maintain. Finally, it takes up to one sampling interval, which is much shorter than the aggregation interval in common setups. Link: https://lkml.kernel.org/r/20250306175908.66300-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 87e4c6e3614e..c55a2cee4b74 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1451,8 +1451,9 @@ static struct damon_ctx *damon_sysfs_build_ctx( * * Returns error if the sysfs input is wrong. */ -static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) +static int damon_sysfs_commit_input(void *data) { + struct damon_sysfs_kdamond *kdamond = data; struct damon_ctx *param_ctx, *test_ctx; int err; @@ -1550,11 +1551,6 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c, bool active, if (!kdamond || kdamond->damon_ctx != c) goto out; switch (damon_sysfs_cmd_request.cmd) { - case DAMON_SYSFS_CMD_COMMIT: - if (!after_aggregation) - goto out; - err = damon_sysfs_commit_input(kdamond); - break; default: break; } @@ -1712,11 +1708,7 @@ static int damon_sysfs_update_schemes_tried_regions( * @cmd: The command to handle. * @kdamond: The kobject wrapper for the associated kdamond. * - * This function handles a DAMON sysfs command for a kdamond. For commands - * that need to access running DAMON context-internal data, it requests - * handling of the command to the DAMON callback - * (@damon_sysfs_cmd_request_callback()) and wait until it is properly handled, - * or the context is completed. + * This function handles a DAMON sysfs command for a kdamond. * * Return: 0 on success, negative error code otherwise. */ @@ -1730,6 +1722,9 @@ static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, return damon_sysfs_turn_damon_on(kdamond); case DAMON_SYSFS_CMD_OFF: return damon_sysfs_turn_damon_off(kdamond); + case DAMON_SYSFS_CMD_COMMIT: + return damon_sysfs_damon_call( + damon_sysfs_commit_input, kdamond); case DAMON_SYSFS_CMD_COMMIT_SCHEMES_QUOTA_GOALS: return damon_sysfs_damon_call( damon_sysfs_commit_schemes_quota_goals, -- 2.51.0