From 7c8c76e446ca0079692fad44a3993cb1d7666c21 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:54 -0700 Subject: [PATCH 01/16] maple_tree: add mas_for_each_rev() helper Patch series "page allocation tag compression", v4. This patchset implements several improvements: 1. Gracefully handles module unloading while there are used allocations allocated from that module; 2. Provides an option to store page allocation tag references in the page flags, removing dependency on page extensions and eliminating the memory overhead from storing page allocation references (~0.2% of total system memory). This also improves page allocation performance when CONFIG_MEM_ALLOC_PROFILING is enabled by eliminating page extension lookup. Page allocation performance overhead is reduced from 41% to 5.5%. Patch #1 introduces mas_for_each_rev() helper function. Patch #2 introduces shutdown_mem_profiling() helper function to be used when disabling memory allocation profiling. Patch #3 copies module tags into virtually contiguous memory which serves two purposes: - Lets us deal with the situation when module is unloaded while there are still live allocations from that module. Since we are using a copy version of the tags we can safely unload the module. Space and gaps in this contiguous memory are managed using a maple tree. - Enables simple indexing of the tags in the later patches. Patch #4 changes the way we allocate virtually contiguous memory for module tags to reserve only vitrual area and populate physical pages only as needed at module load time. Patch #5 abstracts page allocation tag reference to simplify later changes. Patch #6 adds compression option to the sysctl.vm.mem_profiling boot parameter for storing page allocation tag references inside page flags if they fit. If the number of available page flag bits is insufficient to address all kernel allocations, memory allocation profiling gets disabled with an appropriate warning. This patch (of 6): Add mas_for_each_rev() function to iterate maple tree nodes in reverse order. Link: https://lkml.kernel.org/r/20241023170759.999909-1-surenb@google.com Link: https://lkml.kernel.org/r/20241023170759.999909-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Liam R. Howlett Reviewed-by: Liam R. Howlett Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 61c236850ca8..cbbcd18d4186 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -592,6 +592,20 @@ static __always_inline void mas_reset(struct ma_state *mas) #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) +/** + * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order. + * @__mas: Maple Tree operation state (maple_state) + * @__entry: Entry retrieved from the tree + * @__min: minimum index to retrieve from the tree + * + * When returned, mas->index and mas->last will hold the entire range for the + * entry. + * + * Note: may return the zero entry. + */ +#define mas_for_each_rev(__mas, __entry, __min) \ + while (((__entry) = mas_find_rev((__mas), (__min))) != NULL) + #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { mt_dump_dec, -- 2.51.0 From 3e09c500bb5b0606282d04438404f67df132835a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:55 -0700 Subject: [PATCH 02/16] alloc_tag: introduce shutdown_mem_profiling helper function Implement a helper function to disable memory allocation profiling and use it when creation of /proc/allocinfo fails. Ensure /proc/allocinfo does not get created when memory allocation profiling is disabled. Link: https://lkml.kernel.org/r/20241023170759.999909-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 81e5f9a70f22..435aa837e550 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -8,6 +8,14 @@ #include #include +#define ALLOCINFO_FILE_NAME "allocinfo" + +#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT +static bool mem_profiling_support __meminitdata = true; +#else +static bool mem_profiling_support __meminitdata; +#endif + static struct codetag_type *alloc_tag_cttype; DEFINE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); @@ -144,9 +152,26 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl return nr; } +static void __init shutdown_mem_profiling(void) +{ + if (mem_alloc_profiling_enabled()) + static_branch_disable(&mem_alloc_profiling_key); + + if (!mem_profiling_support) + return; + + mem_profiling_support = false; +} + static void __init procfs_init(void) { - proc_create_seq("allocinfo", 0400, NULL, &allocinfo_seq_op); + if (!mem_profiling_support) + return; + + if (!proc_create_seq(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op)) { + pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME); + shutdown_mem_profiling(); + } } static bool alloc_tag_module_unload(struct codetag_type *cttype, @@ -174,12 +199,6 @@ static bool alloc_tag_module_unload(struct codetag_type *cttype, return module_unused; } -#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT -static bool mem_profiling_support __meminitdata = true; -#else -static bool mem_profiling_support __meminitdata; -#endif - static int __init setup_early_mem_profiling(char *str) { bool enable; -- 2.51.0 From 0db6f8d7820a4b788565dac8eed52bfc2c3216da Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:56 -0700 Subject: [PATCH 03/16] alloc_tag: load module tags into separate contiguous memory When a module gets unloaded there is a possibility that some of the allocations it made are still used and therefore the allocation tags corresponding to these allocations are still referenced. As such, the memory for these tags can't be freed. This is currently handled as an abnormal situation and module's data section is not being unloaded. To handle this situation without keeping module's data in memory, allow codetags with longer lifespan than the module to be loaded into their own separate memory. The in-use memory areas and gaps after module unloading in this separate memory are tracked using maple trees. Allocation tags arrange their separate memory so that it is virtually contiguous and that will allow simple allocation tag indexing later on in this patchset. The size of this virtually contiguous memory is set to store up to 100000 allocation tags. [surenb@google.com: fix empty codetag module section handling] Link: https://lkml.kernel.org/r/20241101000017.3856204-1-surenb@google.com [akpm@linux-foundation.org: update comment, per Dan] Link: https://lkml.kernel.org/r/20241023170759.999909-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/asm-generic/codetag.lds.h | 19 +++ include/linux/alloc_tag.h | 13 +- include/linux/codetag.h | 37 ++++- kernel/module/main.c | 84 ++++++---- lib/alloc_tag.c | 249 +++++++++++++++++++++++++++--- lib/codetag.c | 100 +++++++++++- scripts/module.lds.S | 5 +- 7 files changed, 445 insertions(+), 62 deletions(-) diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h index 64f536b80380..372c320c5043 100644 --- a/include/asm-generic/codetag.lds.h +++ b/include/asm-generic/codetag.lds.h @@ -11,4 +11,23 @@ #define CODETAG_SECTIONS() \ SECTION_WITH_BOUNDARIES(alloc_tags) +/* + * Module codetags which aren't used after module unload, therefore have the + * same lifespan as the module and can be safely unloaded with the module. + */ +#define MOD_CODETAG_SECTIONS() + +#define MOD_SEPARATE_CODETAG_SECTION(_name) \ + .codetag.##_name : { \ + SECTION_WITH_BOUNDARIES(_name) \ + } + +/* + * For codetags which might be used after module unload, therefore might stay + * longer in memory. Each such codetag type has its own section so that we can + * unload them individually once unused. + */ +#define MOD_SEPARATE_CODETAG_SECTIONS() \ + MOD_SEPARATE_CODETAG_SECTION(alloc_tags) + #endif /* __ASM_GENERIC_CODETAG_LDS_H */ diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 941deffc590d..55d30543c4c7 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -30,6 +30,13 @@ struct alloc_tag { struct alloc_tag_counters __percpu *counters; } __aligned(8); +struct alloc_tag_module_section { + unsigned long start_addr; + unsigned long end_addr; + /* used size */ + unsigned long size; +}; + #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG #define CODETAG_EMPTY ((void *)1) @@ -54,6 +61,8 @@ static inline void set_codetag_empty(union codetag_ref *ref) {} #ifdef CONFIG_MEM_ALLOC_PROFILING +#define ALLOC_TAG_SECTION_NAME "alloc_tags" + struct codetag_bytes { struct codetag *ct; s64 bytes; @@ -76,7 +85,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); #define DEFINE_ALLOC_TAG(_alloc_tag) \ static struct alloc_tag _alloc_tag __used __aligned(8) \ - __section("alloc_tags") = { \ + __section(ALLOC_TAG_SECTION_NAME) = { \ .ct = CODE_TAG_INIT, \ .counters = &_shared_alloc_tag }; @@ -85,7 +94,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); #define DEFINE_ALLOC_TAG(_alloc_tag) \ static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \ static struct alloc_tag _alloc_tag __used __aligned(8) \ - __section("alloc_tags") = { \ + __section(ALLOC_TAG_SECTION_NAME) = { \ .ct = CODE_TAG_INIT, \ .counters = &_alloc_tag_cntr }; diff --git a/include/linux/codetag.h b/include/linux/codetag.h index c2a579ccd455..d10bd9810d32 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -35,8 +35,15 @@ struct codetag_type_desc { size_t tag_size; void (*module_load)(struct codetag_type *cttype, struct codetag_module *cmod); - bool (*module_unload)(struct codetag_type *cttype, + void (*module_unload)(struct codetag_type *cttype, struct codetag_module *cmod); +#ifdef CONFIG_MODULES + void (*module_replaced)(struct module *mod, struct module *new_mod); + bool (*needs_section_mem)(struct module *mod, unsigned long size); + void *(*alloc_section_mem)(struct module *mod, unsigned long size, + unsigned int prepend, unsigned long align); + void (*free_section_mem)(struct module *mod, bool used); +#endif }; struct codetag_iterator { @@ -71,11 +78,31 @@ struct codetag_type * codetag_register_type(const struct codetag_type_desc *desc); #if defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) + +bool codetag_needs_module_section(struct module *mod, const char *name, + unsigned long size); +void *codetag_alloc_module_section(struct module *mod, const char *name, + unsigned long size, unsigned int prepend, + unsigned long align); +void codetag_free_module_sections(struct module *mod); +void codetag_module_replaced(struct module *mod, struct module *new_mod); void codetag_load_module(struct module *mod); -bool codetag_unload_module(struct module *mod); -#else +void codetag_unload_module(struct module *mod); + +#else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */ + +static inline bool +codetag_needs_module_section(struct module *mod, const char *name, + unsigned long size) { return false; } +static inline void * +codetag_alloc_module_section(struct module *mod, const char *name, + unsigned long size, unsigned int prepend, + unsigned long align) { return NULL; } +static inline void codetag_free_module_sections(struct module *mod) {} +static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) {} static inline void codetag_load_module(struct module *mod) {} -static inline bool codetag_unload_module(struct module *mod) { return true; } -#endif +static inline void codetag_unload_module(struct module *mod) {} + +#endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */ #endif /* _LINUX_CODETAG_H */ diff --git a/kernel/module/main.c b/kernel/module/main.c index 73b588fe98d4..00c16f5c5568 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1251,22 +1251,17 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) return 0; } -static void module_memory_free(struct module *mod, enum mod_mem_type type, - bool unload_codetags) +static void module_memory_free(struct module *mod, enum mod_mem_type type) { struct module_memory *mem = &mod->mem[type]; - void *ptr = mem->base; if (mem->is_rox) vfree(mem->rw_copy); - if (!unload_codetags && mod_mem_type_is_core_data(type)) - return; - - execmem_free(ptr); + execmem_free(mem->base); } -static void free_mod_mem(struct module *mod, bool unload_codetags) +static void free_mod_mem(struct module *mod) { for_each_mod_mem_type(type) { struct module_memory *mod_mem = &mod->mem[type]; @@ -1277,25 +1272,20 @@ static void free_mod_mem(struct module *mod, bool unload_codetags) /* Free lock-classes; relies on the preceding sync_rcu(). */ lockdep_free_key_range(mod_mem->base, mod_mem->size); if (mod_mem->size) - module_memory_free(mod, type, unload_codetags); + module_memory_free(mod, type); } /* MOD_DATA hosts mod, so free it at last */ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); - module_memory_free(mod, MOD_DATA, unload_codetags); + module_memory_free(mod, MOD_DATA); } /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { - bool unload_codetags; - trace_module_free(mod); - unload_codetags = codetag_unload_module(mod); - if (!unload_codetags) - pr_warn("%s: memory allocation(s) from the module still alive, cannot unload cleanly\n", - mod->name); + codetag_unload_module(mod); mod_sysfs_teardown(mod); @@ -1338,7 +1328,7 @@ static void free_module(struct module *mod) kfree(mod->args); percpu_modfree(mod); - free_mod_mem(mod, unload_codetags); + free_mod_mem(mod); } void *__symbol_get(const char *symbol) @@ -1603,6 +1593,20 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i if (WARN_ON_ONCE(type == MOD_INVALID)) continue; + /* + * Do not allocate codetag memory as we load it into + * preallocated contiguous memory. + */ + if (codetag_needs_module_section(mod, sname, s->sh_size)) { + /* + * s->sh_entsize won't be used but populate the + * type field to avoid confusion. + */ + s->sh_entsize = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK) + << SH_ENTSIZE_TYPE_SHIFT; + continue; + } + s->sh_entsize = module_get_offset_and_type(mod, type, s, i); pr_debug("\t%s\n", sname); } @@ -2277,6 +2281,7 @@ static int move_module(struct module *mod, struct load_info *info) int i; enum mod_mem_type t = 0; int ret = -ENOMEM; + bool codetag_section_found = false; for_each_mod_mem_type(type) { if (!mod->mem[type].size) { @@ -2288,7 +2293,7 @@ static int move_module(struct module *mod, struct load_info *info) ret = module_memory_alloc(mod, type); if (ret) { t = type; - goto out_enomem; + goto out_err; } } @@ -2297,15 +2302,37 @@ static int move_module(struct module *mod, struct load_info *info) for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; - enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; - unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; + const char *sname; unsigned long addr; if (!(shdr->sh_flags & SHF_ALLOC)) continue; - addr = (unsigned long)mod->mem[type].base + offset; - dest = mod->mem[type].rw_copy + offset; + sname = info->secstrings + shdr->sh_name; + /* + * Load codetag sections separately as they might still be used + * after module unload. + */ + if (codetag_needs_module_section(mod, sname, shdr->sh_size)) { + dest = codetag_alloc_module_section(mod, sname, shdr->sh_size, + arch_mod_section_prepend(mod, i), shdr->sh_addralign); + if (WARN_ON(!dest)) { + ret = -EINVAL; + goto out_err; + } + if (IS_ERR(dest)) { + ret = PTR_ERR(dest); + goto out_err; + } + addr = (unsigned long)dest; + codetag_section_found = true; + } else { + enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; + unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; + + addr = (unsigned long)mod->mem[type].base + offset; + dest = mod->mem[type].rw_copy + offset; + } if (shdr->sh_type != SHT_NOBITS) { /* @@ -2317,7 +2344,7 @@ static int move_module(struct module *mod, struct load_info *info) if (i == info->index.mod && (WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) { ret = -ENOEXEC; - goto out_enomem; + goto out_err; } memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); } @@ -2333,9 +2360,12 @@ static int move_module(struct module *mod, struct load_info *info) } return 0; -out_enomem: +out_err: for (t--; t >= 0; t--) - module_memory_free(mod, t, true); + module_memory_free(mod, t); + if (codetag_section_found) + codetag_free_module_sections(mod); + return ret; } @@ -2456,6 +2486,8 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* Module has been copied to its final place now: return it. */ mod = (void *)info->sechdrs[info->index.mod].sh_addr; kmemleak_load_module(mod, info); + codetag_module_replaced(info->mod, mod); + return mod; } @@ -2465,7 +2497,7 @@ static void module_deallocate(struct module *mod, struct load_info *info) percpu_modfree(mod); module_arch_freeing_init(mod); - free_mod_mem(mod, true); + free_mod_mem(mod); } int __weak module_finalize(const Elf_Ehdr *hdr, diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 435aa837e550..5f9cd1642d58 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include #include #include @@ -9,6 +10,7 @@ #include #define ALLOCINFO_FILE_NAME "allocinfo" +#define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT static bool mem_profiling_support __meminitdata = true; @@ -174,31 +176,226 @@ static void __init procfs_init(void) } } -static bool alloc_tag_module_unload(struct codetag_type *cttype, - struct codetag_module *cmod) +#ifdef CONFIG_MODULES + +static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE); +/* A dummy object used to indicate an unloaded module */ +static struct module unloaded_mod; +/* A dummy object used to indicate a module prepended area */ +static struct module prepend_mod; + +static struct alloc_tag_module_section module_tags; + +static bool needs_section_mem(struct module *mod, unsigned long size) { - struct codetag_iterator iter = codetag_get_ct_iter(cttype); - struct alloc_tag_counters counter; - bool module_unused = true; - struct alloc_tag *tag; - struct codetag *ct; + return size >= sizeof(struct alloc_tag); +} + +static struct alloc_tag *find_used_tag(struct alloc_tag *from, struct alloc_tag *to) +{ + while (from <= to) { + struct alloc_tag_counters counter; - for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { - if (iter.cmod != cmod) + counter = alloc_tag_read(from); + if (counter.bytes) + return from; + from++; + } + + return NULL; +} + +/* Called with mod_area_mt locked */ +static void clean_unused_module_areas_locked(void) +{ + MA_STATE(mas, &mod_area_mt, 0, module_tags.size); + struct module *val; + + mas_for_each(&mas, val, module_tags.size) { + if (val != &unloaded_mod) continue; - tag = ct_to_alloc_tag(ct); - counter = alloc_tag_read(tag); + /* Release area if all tags are unused */ + if (!find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index), + (struct alloc_tag *)(module_tags.start_addr + mas.last))) + mas_erase(&mas); + } +} + +/* Called with mod_area_mt locked */ +static bool find_aligned_area(struct ma_state *mas, unsigned long section_size, + unsigned long size, unsigned int prepend, unsigned long align) +{ + bool cleanup_done = false; + +repeat: + /* Try finding exact size and hope the start is aligned */ + if (!mas_empty_area(mas, 0, section_size - 1, prepend + size)) { + if (IS_ALIGNED(mas->index + prepend, align)) + return true; + + /* Try finding larger area to align later */ + mas_reset(mas); + if (!mas_empty_area(mas, 0, section_size - 1, + size + prepend + align - 1)) + return true; + } + + /* No free area, try cleanup stale data and repeat the search once */ + if (!cleanup_done) { + clean_unused_module_areas_locked(); + cleanup_done = true; + mas_reset(mas); + goto repeat; + } + + return false; +} + +static void *reserve_module_tags(struct module *mod, unsigned long size, + unsigned int prepend, unsigned long align) +{ + unsigned long section_size = module_tags.end_addr - module_tags.start_addr; + MA_STATE(mas, &mod_area_mt, 0, section_size - 1); + unsigned long offset; + void *ret = NULL; + + /* If no tags return error */ + if (size < sizeof(struct alloc_tag)) + return ERR_PTR(-EINVAL); + + /* + * align is always power of 2, so we can use IS_ALIGNED and ALIGN. + * align 0 or 1 means no alignment, to simplify set to 1. + */ + if (!align) + align = 1; + + mas_lock(&mas); + if (!find_aligned_area(&mas, section_size, size, prepend, align)) { + ret = ERR_PTR(-ENOMEM); + goto unlock; + } + + /* Mark found area as reserved */ + offset = mas.index; + offset += prepend; + offset = ALIGN(offset, align); + if (offset != mas.index) { + unsigned long pad_start = mas.index; + + mas.last = offset - 1; + mas_store(&mas, &prepend_mod); + if (mas_is_err(&mas)) { + ret = ERR_PTR(xa_err(mas.node)); + goto unlock; + } + mas.index = offset; + mas.last = offset + size - 1; + mas_store(&mas, mod); + if (mas_is_err(&mas)) { + mas.index = pad_start; + mas_erase(&mas); + ret = ERR_PTR(xa_err(mas.node)); + } + } else { + mas.last = offset + size - 1; + mas_store(&mas, mod); + if (mas_is_err(&mas)) + ret = ERR_PTR(xa_err(mas.node)); + } +unlock: + mas_unlock(&mas); + + if (IS_ERR(ret)) + return ret; - if (WARN(counter.bytes, - "%s:%u module %s func:%s has %llu allocated at module unload", - ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) - module_unused = false; + if (module_tags.size < offset + size) + module_tags.size = offset + size; + + return (struct alloc_tag *)(module_tags.start_addr + offset); +} + +static void release_module_tags(struct module *mod, bool used) +{ + MA_STATE(mas, &mod_area_mt, module_tags.size, module_tags.size); + struct alloc_tag *tag; + struct module *val; + + mas_lock(&mas); + mas_for_each_rev(&mas, val, 0) + if (val == mod) + break; + + if (!val) /* module not found */ + goto out; + + if (!used) + goto release_area; + + /* Find out if the area is used */ + tag = find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index), + (struct alloc_tag *)(module_tags.start_addr + mas.last)); + if (tag) { + struct alloc_tag_counters counter = alloc_tag_read(tag); + + pr_info("%s:%u module %s func:%s has %llu allocated at module unload\n", + tag->ct.filename, tag->ct.lineno, tag->ct.modname, + tag->ct.function, counter.bytes); + } else { + used = false; + } +release_area: + mas_store(&mas, used ? &unloaded_mod : NULL); + val = mas_prev_range(&mas, 0); + if (val == &prepend_mod) + mas_store(&mas, NULL); +out: + mas_unlock(&mas); +} + +static void replace_module(struct module *mod, struct module *new_mod) +{ + MA_STATE(mas, &mod_area_mt, 0, module_tags.size); + struct module *val; + + mas_lock(&mas); + mas_for_each(&mas, val, module_tags.size) { + if (val != mod) + continue; + + mas_store_gfp(&mas, new_mod, GFP_KERNEL); + break; } + mas_unlock(&mas); +} + +static int __init alloc_mod_tags_mem(void) +{ + /* Allocate space to copy allocation tags */ + module_tags.start_addr = (unsigned long)execmem_alloc(EXECMEM_MODULE_DATA, + MODULE_ALLOC_TAG_VMAP_SIZE); + if (!module_tags.start_addr) + return -ENOMEM; + + module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE; + + return 0; +} - return module_unused; +static void __init free_mod_tags_mem(void) +{ + execmem_free((void *)module_tags.start_addr); + module_tags.start_addr = 0; } +#else /* CONFIG_MODULES */ + +static inline int alloc_mod_tags_mem(void) { return 0; } +static inline void free_mod_tags_mem(void) {} + +#endif /* CONFIG_MODULES */ + static int __init setup_early_mem_profiling(char *str) { bool enable; @@ -274,14 +471,26 @@ static inline void sysctl_init(void) {} static int __init alloc_tag_init(void) { const struct codetag_type_desc desc = { - .section = "alloc_tags", - .tag_size = sizeof(struct alloc_tag), - .module_unload = alloc_tag_module_unload, + .section = ALLOC_TAG_SECTION_NAME, + .tag_size = sizeof(struct alloc_tag), +#ifdef CONFIG_MODULES + .needs_section_mem = needs_section_mem, + .alloc_section_mem = reserve_module_tags, + .free_section_mem = release_module_tags, + .module_replaced = replace_module, +#endif }; + int res; + + res = alloc_mod_tags_mem(); + if (res) + return res; alloc_tag_cttype = codetag_register_type(&desc); - if (IS_ERR(alloc_tag_cttype)) + if (IS_ERR(alloc_tag_cttype)) { + free_mod_tags_mem(); return PTR_ERR(alloc_tag_cttype); + } sysctl_init(); procfs_init(); diff --git a/lib/codetag.c b/lib/codetag.c index d1fbbb7c2ec3..7455b966cae4 100644 --- a/lib/codetag.c +++ b/lib/codetag.c @@ -207,6 +207,94 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) } #ifdef CONFIG_MODULES +#define CODETAG_SECTION_PREFIX ".codetag." + +/* Some codetag types need a separate module section */ +bool codetag_needs_module_section(struct module *mod, const char *name, + unsigned long size) +{ + const char *type_name; + struct codetag_type *cttype; + bool ret = false; + + if (strncmp(name, CODETAG_SECTION_PREFIX, strlen(CODETAG_SECTION_PREFIX))) + return false; + + type_name = name + strlen(CODETAG_SECTION_PREFIX); + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + if (strcmp(type_name, cttype->desc.section) == 0) { + if (!cttype->desc.needs_section_mem) + break; + + down_write(&cttype->mod_lock); + ret = cttype->desc.needs_section_mem(mod, size); + up_write(&cttype->mod_lock); + break; + } + } + mutex_unlock(&codetag_lock); + + return ret; +} + +void *codetag_alloc_module_section(struct module *mod, const char *name, + unsigned long size, unsigned int prepend, + unsigned long align) +{ + const char *type_name = name + strlen(CODETAG_SECTION_PREFIX); + struct codetag_type *cttype; + void *ret = ERR_PTR(-EINVAL); + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + if (strcmp(type_name, cttype->desc.section) == 0) { + if (WARN_ON(!cttype->desc.alloc_section_mem)) + break; + + down_write(&cttype->mod_lock); + ret = cttype->desc.alloc_section_mem(mod, size, prepend, align); + up_write(&cttype->mod_lock); + break; + } + } + mutex_unlock(&codetag_lock); + + return ret; +} + +void codetag_free_module_sections(struct module *mod) +{ + struct codetag_type *cttype; + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + if (!cttype->desc.free_section_mem) + continue; + + down_write(&cttype->mod_lock); + cttype->desc.free_section_mem(mod, false); + up_write(&cttype->mod_lock); + } + mutex_unlock(&codetag_lock); +} + +void codetag_module_replaced(struct module *mod, struct module *new_mod) +{ + struct codetag_type *cttype; + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + if (!cttype->desc.module_replaced) + continue; + + down_write(&cttype->mod_lock); + cttype->desc.module_replaced(mod, new_mod); + up_write(&cttype->mod_lock); + } + mutex_unlock(&codetag_lock); +} + void codetag_load_module(struct module *mod) { struct codetag_type *cttype; @@ -220,13 +308,12 @@ void codetag_load_module(struct module *mod) mutex_unlock(&codetag_lock); } -bool codetag_unload_module(struct module *mod) +void codetag_unload_module(struct module *mod) { struct codetag_type *cttype; - bool unload_ok = true; if (!mod) - return true; + return; /* await any module's kfree_rcu() operations to complete */ kvfree_rcu_barrier(); @@ -246,18 +333,17 @@ bool codetag_unload_module(struct module *mod) } if (found) { if (cttype->desc.module_unload) - if (!cttype->desc.module_unload(cttype, cmod)) - unload_ok = false; + cttype->desc.module_unload(cttype, cmod); cttype->count -= range_size(cttype, &cmod->range); idr_remove(&cttype->mod_idr, mod_id); kfree(cmod); } up_write(&cttype->mod_lock); + if (found && cttype->desc.free_section_mem) + cttype->desc.free_section_mem(mod, true); } mutex_unlock(&codetag_lock); - - return unload_ok; } #endif /* CONFIG_MODULES */ diff --git a/scripts/module.lds.S b/scripts/module.lds.S index 3f43edef813c..711c6e029936 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -50,7 +50,7 @@ SECTIONS { .data : { *(.data .data.[0-9a-zA-Z_]*) *(.data..L*) - CODETAG_SECTIONS() + MOD_CODETAG_SECTIONS() } .rodata : { @@ -59,9 +59,10 @@ SECTIONS { } #else .data : { - CODETAG_SECTIONS() + MOD_CODETAG_SECTIONS() } #endif + MOD_SEPARATE_CODETAG_SECTIONS() } /* bring in arch-specific sections */ -- 2.51.0 From 0f9b685626daa2f8e19a9788625c9b624c223e45 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:57 -0700 Subject: [PATCH 04/16] alloc_tag: populate memory for module tags as needed The memory reserved for module tags does not need to be backed by physical pages until there are tags to store there. Change the way we reserve this memory to allocate only virtual area for the tags and populate it with physical pages as needed when we load a module. [surenb@google.com: avoid execmem_vmap() when !MMU] Link: https://lkml.kernel.org/r/20241031233611.3833002-1-surenb@google.com Link: https://lkml.kernel.org/r/20241023170759.999909-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/execmem.h | 12 +++++++ include/linux/vmalloc.h | 3 ++ lib/Kconfig.debug | 1 + lib/alloc_tag.c | 73 ++++++++++++++++++++++++++++++++++++----- mm/execmem.c | 16 +++++++++ mm/internal.h | 6 ++++ mm/vmalloc.c | 4 +-- 7 files changed, 104 insertions(+), 11 deletions(-) diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 1517fa196bf7..64130ae19690 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -139,6 +139,18 @@ void *execmem_alloc(enum execmem_type type, size_t size); */ void execmem_free(void *ptr); +#ifdef CONFIG_MMU +/** + * execmem_vmap - create virtual mapping for EXECMEM_MODULE_DATA memory + * @size: size of the virtual mapping in bytes + * + * Maps virtually contiguous area in the range suitable for EXECMEM_MODULE_DATA. + * + * Return: the area descriptor on success or %NULL on failure. + */ +struct vm_struct *execmem_vmap(size_t size); +#endif + /** * execmem_update_copy - copy an update to executable memory * @dst: destination address to update diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 27408f21e501..31e9ffd936e3 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -202,6 +202,9 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); +int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, + struct page **pages, unsigned int page_shift); + /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7312ae7c3cc5..6798bbbcbd32 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -993,6 +993,7 @@ config CODE_TAGGING config MEM_ALLOC_PROFILING bool "Enable memory allocation profiling" default n + depends on MMU depends on PROC_FS depends on !DEBUG_FORCE_WEAK_PER_CPU select CODE_TAGGING diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 5f9cd1642d58..4a7fc081b789 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -8,14 +8,15 @@ #include #include #include +#include #define ALLOCINFO_FILE_NAME "allocinfo" #define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT -static bool mem_profiling_support __meminitdata = true; +static bool mem_profiling_support = true; #else -static bool mem_profiling_support __meminitdata; +static bool mem_profiling_support; #endif static struct codetag_type *alloc_tag_cttype; @@ -154,7 +155,7 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl return nr; } -static void __init shutdown_mem_profiling(void) +static void shutdown_mem_profiling(void) { if (mem_alloc_profiling_enabled()) static_branch_disable(&mem_alloc_profiling_key); @@ -179,6 +180,7 @@ static void __init procfs_init(void) #ifdef CONFIG_MODULES static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE); +static struct vm_struct *vm_module_tags; /* A dummy object used to indicate an unloaded module */ static struct module unloaded_mod; /* A dummy object used to indicate a module prepended area */ @@ -252,6 +254,33 @@ repeat: return false; } +static int vm_module_tags_populate(void) +{ + unsigned long phys_size = vm_module_tags->nr_pages << PAGE_SHIFT; + + if (phys_size < module_tags.size) { + struct page **next_page = vm_module_tags->pages + vm_module_tags->nr_pages; + unsigned long addr = module_tags.start_addr + phys_size; + unsigned long more_pages; + unsigned long nr; + + more_pages = ALIGN(module_tags.size - phys_size, PAGE_SIZE) >> PAGE_SHIFT; + nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN, + NUMA_NO_NODE, more_pages, next_page); + if (nr < more_pages || + vmap_pages_range(addr, addr + (nr << PAGE_SHIFT), PAGE_KERNEL, + next_page, PAGE_SHIFT) < 0) { + /* Clean up and error out */ + for (int i = 0; i < nr; i++) + __free_page(next_page[i]); + return -ENOMEM; + } + vm_module_tags->nr_pages += nr; + } + + return 0; +} + static void *reserve_module_tags(struct module *mod, unsigned long size, unsigned int prepend, unsigned long align) { @@ -310,8 +339,18 @@ unlock: if (IS_ERR(ret)) return ret; - if (module_tags.size < offset + size) + if (module_tags.size < offset + size) { + int grow_res; + module_tags.size = offset + size; + grow_res = vm_module_tags_populate(); + if (grow_res) { + shutdown_mem_profiling(); + pr_err("Failed to allocate memory for allocation tags in the module %s. Memory allocation profiling is disabled!\n", + mod->name); + return ERR_PTR(grow_res); + } + } return (struct alloc_tag *)(module_tags.start_addr + offset); } @@ -372,12 +411,23 @@ static void replace_module(struct module *mod, struct module *new_mod) static int __init alloc_mod_tags_mem(void) { - /* Allocate space to copy allocation tags */ - module_tags.start_addr = (unsigned long)execmem_alloc(EXECMEM_MODULE_DATA, - MODULE_ALLOC_TAG_VMAP_SIZE); - if (!module_tags.start_addr) + /* Map space to copy allocation tags */ + vm_module_tags = execmem_vmap(MODULE_ALLOC_TAG_VMAP_SIZE); + if (!vm_module_tags) { + pr_err("Failed to map %lu bytes for module allocation tags\n", + MODULE_ALLOC_TAG_VMAP_SIZE); + module_tags.start_addr = 0; return -ENOMEM; + } + vm_module_tags->pages = kmalloc_array(get_vm_area_size(vm_module_tags) >> PAGE_SHIFT, + sizeof(struct page *), GFP_KERNEL | __GFP_ZERO); + if (!vm_module_tags->pages) { + free_vm_area(vm_module_tags); + return -ENOMEM; + } + + module_tags.start_addr = (unsigned long)vm_module_tags->addr; module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE; return 0; @@ -385,8 +435,13 @@ static int __init alloc_mod_tags_mem(void) static void __init free_mod_tags_mem(void) { - execmem_free((void *)module_tags.start_addr); + int i; + module_tags.start_addr = 0; + for (i = 0; i < vm_module_tags->nr_pages; i++) + __free_page(vm_module_tags->pages[i]); + kfree(vm_module_tags->pages); + free_vm_area(vm_module_tags); } #else /* CONFIG_MODULES */ diff --git a/mm/execmem.c b/mm/execmem.c index 576a57e2161f..317b6a8d35be 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -64,6 +64,22 @@ static void *execmem_vmalloc(struct execmem_range *range, size_t size, return p; } + +struct vm_struct *execmem_vmap(size_t size) +{ + struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA]; + struct vm_struct *area; + + area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC, + range->start, range->end, NUMA_NO_NODE, + GFP_KERNEL, __builtin_return_address(0)); + if (!area && range->fallback_start) + area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC, + range->fallback_start, range->fallback_end, + NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); + + return area; +} #else static void *execmem_vmalloc(struct execmem_range *range, size_t size, pgprot_t pgprot, unsigned long vm_flags) diff --git a/mm/internal.h b/mm/internal.h index 3dc745ba76dd..cd96848be245 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1263,6 +1263,12 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_folio(struct folio *folio); +struct vm_struct *__get_vm_area_node(unsigned long size, + unsigned long align, unsigned long shift, + unsigned long flags, unsigned long start, + unsigned long end, int node, gfp_t gfp_mask, + const void *caller); + /* * mm/gup.c */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 74c0a5eae210..7ed39d104201 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -653,7 +653,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, * RETURNS: * 0 on success, -errno on failure. */ -static int vmap_pages_range(unsigned long addr, unsigned long end, +int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { int err; @@ -3106,7 +3106,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm) vm->flags &= ~VM_UNINITIALIZED; } -static struct vm_struct *__get_vm_area_node(unsigned long size, +struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller) -- 2.51.0 From 42895a86124418d8dd29a93812bc282e569ccfee Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:58 -0700 Subject: [PATCH 05/16] alloc_tag: introduce pgtag_ref_handle to abstract page tag references To simplify later changes to page tag references, introduce new pgtag_ref_handle type. This allows easy replacement of page_ext as a storage of page allocation tags. Link: https://lkml.kernel.org/r/20241023170759.999909-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 25 +++++----- include/linux/pgalloc_tag.h | 92 ++++++++++++++++++++++--------------- 2 files changed, 67 insertions(+), 50 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index eb070c14e309..f9120ac6d901 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4181,37 +4181,38 @@ static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new return; for (i = nr_pages; i < (1 << old_order); i += nr_pages) { - union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i)); + union pgtag_ref_handle handle; + union codetag_ref ref; - if (ref) { + if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) { /* Set new reference to point to the original tag */ - alloc_tag_ref_set(ref, tag); - put_page_tag_ref(ref); + alloc_tag_ref_set(&ref, tag); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } } } static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) { + union pgtag_ref_handle handle; + union codetag_ref ref; struct alloc_tag *tag; - union codetag_ref *ref; tag = pgalloc_tag_get(&old->page); if (!tag) return; - ref = get_page_tag_ref(&new->page); - if (!ref) + if (!get_page_tag_ref(&new->page, &ref, &handle)) return; /* Clear the old ref to the original allocation tag. */ clear_page_tag_ref(&old->page); /* Decrement the counters of the tag on get_new_folio. */ - alloc_tag_sub(ref, folio_size(new)); - - __alloc_tag_ref_set(ref, tag); - - put_page_tag_ref(ref); + alloc_tag_sub(&ref, folio_size(new)); + __alloc_tag_ref_set(&ref, tag); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } #else /* !CONFIG_MEM_ALLOC_PROFILING */ static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 59a3deb792a8..b13cd3313a88 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -11,46 +11,59 @@ #include +union pgtag_ref_handle { + union codetag_ref *ref; /* reference in page extension */ +}; + extern struct page_ext_operations page_alloc_tagging_ops; -static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext) +/* Should be called only if mem_alloc_profiling_enabled() */ +static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref, + union pgtag_ref_handle *handle) { - return (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); -} + struct page_ext *page_ext; + union codetag_ref *tmp; -static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref) -{ - return (void *)ref - page_alloc_tagging_ops.offset; + if (!page) + return false; + + page_ext = page_ext_get(page); + if (!page_ext) + return false; + + tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); + ref->ct = tmp->ct; + handle->ref = tmp; + return true; } -/* Should be called only if mem_alloc_profiling_enabled() */ -static inline union codetag_ref *get_page_tag_ref(struct page *page) +static inline void put_page_tag_ref(union pgtag_ref_handle handle) { - if (page) { - struct page_ext *page_ext = page_ext_get(page); + if (WARN_ON(!handle.ref)) + return; - if (page_ext) - return codetag_ref_from_page_ext(page_ext); - } - return NULL; + page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset); } -static inline void put_page_tag_ref(union codetag_ref *ref) +static inline void update_page_tag_ref(union pgtag_ref_handle handle, + union codetag_ref *ref) { - if (WARN_ON(!ref)) + if (WARN_ON(!handle.ref || !ref)) return; - page_ext_put(page_ext_from_codetag_ref(ref)); + handle.ref->ct = ref->ct; } static inline void clear_page_tag_ref(struct page *page) { if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); + union pgtag_ref_handle handle; + union codetag_ref ref; - if (ref) { - set_codetag_empty(ref); - put_page_tag_ref(ref); + if (get_page_tag_ref(page, &ref, &handle)) { + set_codetag_empty(&ref); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } } } @@ -59,11 +72,13 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) { if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); + union pgtag_ref_handle handle; + union codetag_ref ref; - if (ref) { - alloc_tag_add(ref, task->alloc_tag, PAGE_SIZE * nr); - put_page_tag_ref(ref); + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } } } @@ -71,11 +86,13 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) { if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); + union pgtag_ref_handle handle; + union codetag_ref ref; - if (ref) { - alloc_tag_sub(ref, PAGE_SIZE * nr); - put_page_tag_ref(ref); + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub(&ref, PAGE_SIZE * nr); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } } } @@ -85,13 +102,14 @@ static inline struct alloc_tag *pgalloc_tag_get(struct page *page) struct alloc_tag *tag = NULL; if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); - - alloc_tag_sub_check(ref); - if (ref) { - if (ref->ct) - tag = ct_to_alloc_tag(ref->ct); - put_page_tag_ref(ref); + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub_check(&ref); + if (ref.ct) + tag = ct_to_alloc_tag(ref.ct); + put_page_tag_ref(handle); } } @@ -106,8 +124,6 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) #else /* CONFIG_MEM_ALLOC_PROFILING */ -static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } -static inline void put_page_tag_ref(union codetag_ref *ref) {} static inline void clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} -- 2.51.0 From 4835f747d3ed181bf2c67930fe06b2c01a5d2323 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:59 -0700 Subject: [PATCH 06/16] alloc_tag: support for page allocation tag compression Implement support for storing page allocation tag references directly in the page flags instead of page extensions. sysctl.vm.mem_profiling boot parameter it extended to provide a way for a user to request this mode. Enabling compression eliminates memory overhead caused by page_ext and results in better performance for page allocations. However this mode will not work if the number of available page flag bits is insufficient to address all kernel allocations. Such condition can happen during boot or when loading a module. If this condition is detected, memory allocation profiling gets disabled with an appropriate warning. By default compression mode is disabled. Link: https://lkml.kernel.org/r/20241023170759.999909-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/mm/allocation-profiling.rst | 7 +- include/linux/alloc_tag.h | 10 +- include/linux/codetag.h | 3 + include/linux/page-flags-layout.h | 7 ++ include/linux/pgalloc_tag.h | 145 +++++++++++++++++++--- lib/alloc_tag.c | 142 +++++++++++++++++++-- lib/codetag.c | 4 +- mm/mm_init.c | 5 +- 8 files changed, 290 insertions(+), 33 deletions(-) diff --git a/Documentation/mm/allocation-profiling.rst b/Documentation/mm/allocation-profiling.rst index ffd6655b7be2..316311240e6a 100644 --- a/Documentation/mm/allocation-profiling.rst +++ b/Documentation/mm/allocation-profiling.rst @@ -18,12 +18,17 @@ kconfig options: missing annotation Boot parameter: - sysctl.vm.mem_profiling=0|1|never + sysctl.vm.mem_profiling={0|1|never}[,compressed] When set to "never", memory allocation profiling overhead is minimized and it cannot be enabled at runtime (sysctl becomes read-only). When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y, default value is "1". When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, default value is "never". + "compressed" optional parameter will try to store page tag references in a + compact format, avoiding page extensions. This results in improved performance + and memory consumption, however it might fail depending on system configuration. + If compression fails, a warning is issued and memory allocation profiling gets + disabled. sysctl: /proc/sys/vm/mem_profiling diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 55d30543c4c7..7c0786bdf9af 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -30,8 +30,16 @@ struct alloc_tag { struct alloc_tag_counters __percpu *counters; } __aligned(8); +struct alloc_tag_kernel_section { + struct alloc_tag *first_tag; + unsigned long count; +}; + struct alloc_tag_module_section { - unsigned long start_addr; + union { + unsigned long start_addr; + struct alloc_tag *first_tag; + }; unsigned long end_addr; /* used size */ unsigned long size; diff --git a/include/linux/codetag.h b/include/linux/codetag.h index d10bd9810d32..d14dbd26b370 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -13,6 +13,9 @@ struct codetag_module; struct seq_buf; struct module; +#define CODETAG_SECTION_START_PREFIX "__start_" +#define CODETAG_SECTION_STOP_PREFIX "__stop_" + /* * An instance of this structure is created in a special ELF section at every * code location being tagged. At runtime, the special section is treated as diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 7d79818dc065..4f5c9e979bb9 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -111,5 +111,12 @@ ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \ NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH) +#define NR_NON_PAGEFLAG_BITS (SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH + \ + LAST_CPUPID_SHIFT + KASAN_TAG_WIDTH + \ + LRU_GEN_WIDTH + LRU_REFS_WIDTH) + +#define NR_UNUSED_PAGEFLAG_BITS (BITS_PER_LONG - \ + (NR_NON_PAGEFLAG_BITS + NR_PAGEFLAGS)) + #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index b13cd3313a88..1fe63b52e5e5 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -11,29 +11,118 @@ #include +extern struct page_ext_operations page_alloc_tagging_ops; +extern unsigned long alloc_tag_ref_mask; +extern int alloc_tag_ref_offs; +extern struct alloc_tag_kernel_section kernel_tags; + +DECLARE_STATIC_KEY_FALSE(mem_profiling_compressed); + +typedef u16 pgalloc_tag_idx; + union pgtag_ref_handle { union codetag_ref *ref; /* reference in page extension */ + struct page *page; /* reference in page flags */ }; -extern struct page_ext_operations page_alloc_tagging_ops; +/* Reserved indexes */ +#define CODETAG_ID_NULL 0 +#define CODETAG_ID_EMPTY 1 +#define CODETAG_ID_FIRST 2 + +#ifdef CONFIG_MODULES + +extern struct alloc_tag_module_section module_tags; + +static inline struct alloc_tag *module_idx_to_tag(pgalloc_tag_idx idx) +{ + return &module_tags.first_tag[idx - kernel_tags.count]; +} + +static inline pgalloc_tag_idx module_tag_to_idx(struct alloc_tag *tag) +{ + return CODETAG_ID_FIRST + kernel_tags.count + (tag - module_tags.first_tag); +} + +#else /* CONFIG_MODULES */ + +static inline struct alloc_tag *module_idx_to_tag(pgalloc_tag_idx idx) +{ + pr_warn("invalid page tag reference %lu\n", (unsigned long)idx); + return NULL; +} + +static inline pgalloc_tag_idx module_tag_to_idx(struct alloc_tag *tag) +{ + pr_warn("invalid page tag 0x%lx\n", (unsigned long)tag); + return CODETAG_ID_NULL; +} + +#endif /* CONFIG_MODULES */ + +static inline void idx_to_ref(pgalloc_tag_idx idx, union codetag_ref *ref) +{ + switch (idx) { + case (CODETAG_ID_NULL): + ref->ct = NULL; + break; + case (CODETAG_ID_EMPTY): + set_codetag_empty(ref); + break; + default: + idx -= CODETAG_ID_FIRST; + ref->ct = idx < kernel_tags.count ? + &kernel_tags.first_tag[idx].ct : + &module_idx_to_tag(idx)->ct; + break; + } +} + +static inline pgalloc_tag_idx ref_to_idx(union codetag_ref *ref) +{ + struct alloc_tag *tag; + + if (!ref->ct) + return CODETAG_ID_NULL; + + if (is_codetag_empty(ref)) + return CODETAG_ID_EMPTY; + + tag = ct_to_alloc_tag(ref->ct); + if (tag >= kernel_tags.first_tag && tag < kernel_tags.first_tag + kernel_tags.count) + return CODETAG_ID_FIRST + (tag - kernel_tags.first_tag); + + return module_tag_to_idx(tag); +} + + /* Should be called only if mem_alloc_profiling_enabled() */ static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref, union pgtag_ref_handle *handle) { - struct page_ext *page_ext; - union codetag_ref *tmp; - if (!page) return false; - page_ext = page_ext_get(page); - if (!page_ext) - return false; + if (static_key_enabled(&mem_profiling_compressed)) { + pgalloc_tag_idx idx; + + idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask; + idx_to_ref(idx, ref); + handle->page = page; + } else { + struct page_ext *page_ext; + union codetag_ref *tmp; + + page_ext = page_ext_get(page); + if (!page_ext) + return false; + + tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); + ref->ct = tmp->ct; + handle->ref = tmp; + } - tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); - ref->ct = tmp->ct; - handle->ref = tmp; return true; } @@ -42,16 +131,35 @@ static inline void put_page_tag_ref(union pgtag_ref_handle handle) if (WARN_ON(!handle.ref)) return; - page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset); + if (!static_key_enabled(&mem_profiling_compressed)) + page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset); } -static inline void update_page_tag_ref(union pgtag_ref_handle handle, - union codetag_ref *ref) +static inline void update_page_tag_ref(union pgtag_ref_handle handle, union codetag_ref *ref) { - if (WARN_ON(!handle.ref || !ref)) - return; - - handle.ref->ct = ref->ct; + if (static_key_enabled(&mem_profiling_compressed)) { + struct page *page = handle.page; + unsigned long old_flags; + unsigned long flags; + unsigned long idx; + + if (WARN_ON(!page || !ref)) + return; + + idx = (unsigned long)ref_to_idx(ref); + idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs; + do { + old_flags = READ_ONCE(page->flags); + flags = old_flags; + flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs); + flags |= idx; + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } else { + if (WARN_ON(!handle.ref || !ref)) + return; + + handle.ref->ct = ref->ct; + } } static inline void clear_page_tag_ref(struct page *page) @@ -122,6 +230,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } +void __init alloc_tag_sec_init(void); + #else /* CONFIG_MEM_ALLOC_PROFILING */ static inline void clear_page_tag_ref(struct page *page) {} @@ -130,6 +240,7 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} +static inline void alloc_tag_sec_init(void) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 4a7fc081b789..d38a4b2a551d 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -12,6 +13,8 @@ #define ALLOCINFO_FILE_NAME "allocinfo" #define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) +#define SECTION_START(NAME) (CODETAG_SECTION_START_PREFIX NAME) +#define SECTION_STOP(NAME) (CODETAG_SECTION_STOP_PREFIX NAME) #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT static bool mem_profiling_support = true; @@ -26,6 +29,11 @@ EXPORT_SYMBOL(_shared_alloc_tag); DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, mem_alloc_profiling_key); +DEFINE_STATIC_KEY_FALSE(mem_profiling_compressed); + +struct alloc_tag_kernel_section kernel_tags = { NULL, 0 }; +unsigned long alloc_tag_ref_mask; +int alloc_tag_ref_offs; struct allocinfo_private { struct codetag_iterator iter; @@ -155,7 +163,7 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl return nr; } -static void shutdown_mem_profiling(void) +static void shutdown_mem_profiling(bool remove_file) { if (mem_alloc_profiling_enabled()) static_branch_disable(&mem_alloc_profiling_key); @@ -163,6 +171,8 @@ static void shutdown_mem_profiling(void) if (!mem_profiling_support) return; + if (remove_file) + remove_proc_entry(ALLOCINFO_FILE_NAME, NULL); mem_profiling_support = false; } @@ -173,10 +183,40 @@ static void __init procfs_init(void) if (!proc_create_seq(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op)) { pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME); - shutdown_mem_profiling(); + shutdown_mem_profiling(false); } } +void __init alloc_tag_sec_init(void) +{ + struct alloc_tag *last_codetag; + + if (!mem_profiling_support) + return; + + if (!static_key_enabled(&mem_profiling_compressed)) + return; + + kernel_tags.first_tag = (struct alloc_tag *)kallsyms_lookup_name( + SECTION_START(ALLOC_TAG_SECTION_NAME)); + last_codetag = (struct alloc_tag *)kallsyms_lookup_name( + SECTION_STOP(ALLOC_TAG_SECTION_NAME)); + kernel_tags.count = last_codetag - kernel_tags.first_tag; + + /* Check if kernel tags fit into page flags */ + if (kernel_tags.count > (1UL << NR_UNUSED_PAGEFLAG_BITS)) { + shutdown_mem_profiling(false); /* allocinfo file does not exist yet */ + pr_err("%lu allocation tags cannot be references using %d available page flag bits. Memory allocation profiling is disabled!\n", + kernel_tags.count, NR_UNUSED_PAGEFLAG_BITS); + return; + } + + alloc_tag_ref_offs = (LRU_REFS_PGOFF - NR_UNUSED_PAGEFLAG_BITS); + alloc_tag_ref_mask = ((1UL << NR_UNUSED_PAGEFLAG_BITS) - 1); + pr_debug("Memory allocation profiling compression is using %d page flag bits!\n", + NR_UNUSED_PAGEFLAG_BITS); +} + #ifdef CONFIG_MODULES static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE); @@ -186,10 +226,59 @@ static struct module unloaded_mod; /* A dummy object used to indicate a module prepended area */ static struct module prepend_mod; -static struct alloc_tag_module_section module_tags; +struct alloc_tag_module_section module_tags; + +static inline unsigned long alloc_tag_align(unsigned long val) +{ + if (!static_key_enabled(&mem_profiling_compressed)) { + /* No alignment requirements when we are not indexing the tags */ + return val; + } + + if (val % sizeof(struct alloc_tag) == 0) + return val; + return ((val / sizeof(struct alloc_tag)) + 1) * sizeof(struct alloc_tag); +} + +static bool ensure_alignment(unsigned long align, unsigned int *prepend) +{ + if (!static_key_enabled(&mem_profiling_compressed)) { + /* No alignment requirements when we are not indexing the tags */ + return true; + } + + /* + * If alloc_tag size is not a multiple of required alignment, tag + * indexing does not work. + */ + if (!IS_ALIGNED(sizeof(struct alloc_tag), align)) + return false; + + /* Ensure prepend consumes multiple of alloc_tag-sized blocks */ + if (*prepend) + *prepend = alloc_tag_align(*prepend); + + return true; +} + +static inline bool tags_addressable(void) +{ + unsigned long tag_idx_count; + + if (!static_key_enabled(&mem_profiling_compressed)) + return true; /* with page_ext tags are always addressable */ + + tag_idx_count = CODETAG_ID_FIRST + kernel_tags.count + + module_tags.size / sizeof(struct alloc_tag); + + return tag_idx_count < (1UL << NR_UNUSED_PAGEFLAG_BITS); +} static bool needs_section_mem(struct module *mod, unsigned long size) { + if (!mem_profiling_support) + return false; + return size >= sizeof(struct alloc_tag); } @@ -300,6 +389,13 @@ static void *reserve_module_tags(struct module *mod, unsigned long size, if (!align) align = 1; + if (!ensure_alignment(align, &prepend)) { + shutdown_mem_profiling(true); + pr_err("%s: alignment %lu is incompatible with allocation tag indexing. Memory allocation profiling is disabled!\n", + mod->name, align); + return ERR_PTR(-EINVAL); + } + mas_lock(&mas); if (!find_aligned_area(&mas, section_size, size, prepend, align)) { ret = ERR_PTR(-ENOMEM); @@ -343,9 +439,15 @@ unlock: int grow_res; module_tags.size = offset + size; + if (mem_alloc_profiling_enabled() && !tags_addressable()) { + shutdown_mem_profiling(true); + pr_warn("With module %s there are too many tags to fit in %d page flag bits. Memory allocation profiling is disabled!\n", + mod->name, NR_UNUSED_PAGEFLAG_BITS); + } + grow_res = vm_module_tags_populate(); if (grow_res) { - shutdown_mem_profiling(); + shutdown_mem_profiling(true); pr_err("Failed to allocate memory for allocation tags in the module %s. Memory allocation profiling is disabled!\n", mod->name); return ERR_PTR(grow_res); @@ -429,6 +531,8 @@ static int __init alloc_mod_tags_mem(void) module_tags.start_addr = (unsigned long)vm_module_tags->addr; module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE; + /* Ensure the base is alloc_tag aligned when required for indexing */ + module_tags.start_addr = alloc_tag_align(module_tags.start_addr); return 0; } @@ -451,8 +555,10 @@ static inline void free_mod_tags_mem(void) {} #endif /* CONFIG_MODULES */ +/* See: Documentation/mm/allocation-profiling.rst */ static int __init setup_early_mem_profiling(char *str) { + bool compressed = false; bool enable; if (!str || !str[0]) @@ -461,22 +567,37 @@ static int __init setup_early_mem_profiling(char *str) if (!strncmp(str, "never", 5)) { enable = false; mem_profiling_support = false; + pr_info("Memory allocation profiling is disabled!\n"); } else { - int res; + char *token = strsep(&str, ","); + + if (kstrtobool(token, &enable)) + return -EINVAL; - res = kstrtobool(str, &enable); - if (res) - return res; + if (str) { + if (strcmp(str, "compressed")) + return -EINVAL; + + compressed = true; + } mem_profiling_support = true; + pr_info("Memory allocation profiling is enabled %s compression and is turned %s!\n", + compressed ? "with" : "without", enable ? "on" : "off"); } - if (enable != static_key_enabled(&mem_alloc_profiling_key)) { + if (enable != mem_alloc_profiling_enabled()) { if (enable) static_branch_enable(&mem_alloc_profiling_key); else static_branch_disable(&mem_alloc_profiling_key); } + if (compressed != static_key_enabled(&mem_profiling_compressed)) { + if (compressed) + static_branch_enable(&mem_profiling_compressed); + else + static_branch_disable(&mem_profiling_compressed); + } return 0; } @@ -484,6 +605,9 @@ early_param("sysctl.vm.mem_profiling", setup_early_mem_profiling); static __init bool need_page_alloc_tagging(void) { + if (static_key_enabled(&mem_profiling_compressed)) + return false; + return mem_profiling_support; } diff --git a/lib/codetag.c b/lib/codetag.c index 7455b966cae4..42aadd6c1454 100644 --- a/lib/codetag.c +++ b/lib/codetag.c @@ -149,8 +149,8 @@ static struct codetag_range get_section_range(struct module *mod, const char *section) { return (struct codetag_range) { - get_symbol(mod, "__start_", section), - get_symbol(mod, "__stop_", section), + get_symbol(mod, CODETAG_SECTION_START_PREFIX, section), + get_symbol(mod, CODETAG_SECTION_STOP_PREFIX, section), }; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 4ba5607aaf19..1c205b0a86ed 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -83,8 +83,7 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = BITS_PER_LONG; - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; + width = shift - NR_NON_PAGEFLAG_BITS; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", SECTIONS_WIDTH, @@ -2639,7 +2638,7 @@ void __init mm_core_init(void) BUILD_BUG_ON(MAX_ZONELISTS > 2); build_all_zonelists(NULL); page_alloc_init_cpuhp(); - + alloc_tag_sec_init(); /* * page_ext requires contiguous pages, * bigger than MAX_PAGE_ORDER unless SPARSEMEM. -- 2.51.0 From b7fc16a16b0850e41b88eae0edfa4c085c012347 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 24 Oct 2024 09:23:18 -0700 Subject: [PATCH 07/16] mm/codetag: uninline and move pgalloc_tag_copy and pgalloc_tag_split pgalloc_tag_copy() and pgalloc_tag_split() are sizable and outside of any performance-critical paths, so it should be fine to uninline them. Also move their declarations into pgalloc_tag.h which seems like a more appropriate place for them. No functional changes other than uninlining. Link: https://lkml.kernel.org/r/20241024162318.1640781-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Andrew Morton Acked-by: Yu Zhao Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Signed-off-by: Andrew Morton --- include/linux/mm.h | 58 ------------------------------------- include/linux/pgalloc_tag.h | 5 ++++ lib/alloc_tag.c | 48 ++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 58 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f9120ac6d901..08e487e5850a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4166,62 +4166,4 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla } #endif -#ifdef CONFIG_MEM_ALLOC_PROFILING -static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) -{ - int i; - struct alloc_tag *tag; - unsigned int nr_pages = 1 << new_order; - - if (!mem_alloc_profiling_enabled()) - return; - - tag = pgalloc_tag_get(&folio->page); - if (!tag) - return; - - for (i = nr_pages; i < (1 << old_order); i += nr_pages) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) { - /* Set new reference to point to the original tag */ - alloc_tag_ref_set(&ref, tag); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} - -static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) -{ - union pgtag_ref_handle handle; - union codetag_ref ref; - struct alloc_tag *tag; - - tag = pgalloc_tag_get(&old->page); - if (!tag) - return; - - if (!get_page_tag_ref(&new->page, &ref, &handle)) - return; - - /* Clear the old ref to the original allocation tag. */ - clear_page_tag_ref(&old->page); - /* Decrement the counters of the tag on get_new_folio. */ - alloc_tag_sub(&ref, folio_size(new)); - __alloc_tag_ref_set(&ref, tag); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); -} -#else /* !CONFIG_MEM_ALLOC_PROFILING */ -static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) -{ -} - -static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) -{ -} -#endif /* CONFIG_MEM_ALLOC_PROFILING */ - #endif /* _LINUX_MM_H */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 1fe63b52e5e5..0e43ab653ab6 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -230,6 +230,9 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } +void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); +void pgalloc_tag_copy(struct folio *new, struct folio *old); + void __init alloc_tag_sec_init(void); #else /* CONFIG_MEM_ALLOC_PROFILING */ @@ -241,6 +244,8 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} +static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} +static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index d38a4b2a551d..2414a7ee7ec7 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -163,6 +163,54 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl return nr; } +void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) +{ + int i; + struct alloc_tag *tag; + unsigned int nr_pages = 1 << new_order; + + if (!mem_alloc_profiling_enabled()) + return; + + tag = pgalloc_tag_get(&folio->page); + if (!tag) + return; + + for (i = nr_pages; i < (1 << old_order); i += nr_pages) { + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) { + /* Set new reference to point to the original tag */ + alloc_tag_ref_set(&ref, tag); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); + } + } +} + +void pgalloc_tag_copy(struct folio *new, struct folio *old) +{ + union pgtag_ref_handle handle; + union codetag_ref ref; + struct alloc_tag *tag; + + tag = pgalloc_tag_get(&old->page); + if (!tag) + return; + + if (!get_page_tag_ref(&new->page, &ref, &handle)) + return; + + /* Clear the old ref to the original allocation tag. */ + clear_page_tag_ref(&old->page); + /* Decrement the counters of the tag on get_new_folio. */ + alloc_tag_sub(&ref, folio_size(new)); + __alloc_tag_ref_set(&ref, tag); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); +} + static void shutdown_mem_profiling(bool remove_file) { if (mem_alloc_profiling_enabled()) -- 2.51.0 From 91d0ec834786a4c9e1e0c55f0fffc8c82cd66cd7 Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Tue, 1 Oct 2024 23:23:58 +0530 Subject: [PATCH 08/16] zsmalloc: replace kmap_atomic with kmap_local_page The use of kmap_atomic/kunmap_atomic is deprecated. Replace it will kmap_local_page/kunmap_local all over the place. Also fix SPDX missing license header. WARNING: Missing or malformed SPDX-License-Identifier tag in line 1 WARNING: Deprecated use of 'kmap_atomic', prefer 'kmap_local_page' instead + vaddr = kmap_atomic(page); Link: https://lkml.kernel.org/r/20241001175358.12970-1-quic_pintu@quicinc.com Signed-off-by: Pintu Kumar Cc: Joe Perches Cc: Minchan Kim Cc: Pintu Agarwal Cc: Sergey Senozhatsky Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 66 ++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 16a07def09c9..d3ff10160a5f 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + /* * zsmalloc memory allocator * @@ -898,7 +900,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) set_first_obj_offset(page, off); - vaddr = kmap_atomic(page); + vaddr = kmap_local_page(page); link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { @@ -921,7 +923,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) */ link->next = -1UL << OBJ_TAG_BITS; } - kunmap_atomic(vaddr); + kunmap_local(vaddr); page = next_page; off %= PAGE_SIZE; } @@ -1059,12 +1061,12 @@ static void *__zs_map_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy object to per-cpu buffer */ - addr = kmap_atomic(pages[0]); + addr = kmap_local_page(pages[0]); memcpy(buf, addr + off, sizes[0]); - kunmap_atomic(addr); - addr = kmap_atomic(pages[1]); + kunmap_local(addr); + addr = kmap_local_page(pages[1]); memcpy(buf + sizes[0], addr, sizes[1]); - kunmap_atomic(addr); + kunmap_local(addr); out: return area->vm_buf; } @@ -1089,12 +1091,12 @@ static void __zs_unmap_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy per-cpu buffer to object */ - addr = kmap_atomic(pages[0]); + addr = kmap_local_page(pages[0]); memcpy(addr + off, buf, sizes[0]); - kunmap_atomic(addr); - addr = kmap_atomic(pages[1]); + kunmap_local(addr); + addr = kmap_local_page(pages[1]); memcpy(addr, buf + sizes[0], sizes[1]); - kunmap_atomic(addr); + kunmap_local(addr); out: /* enable page faults to match kunmap_atomic() return conditions */ @@ -1223,7 +1225,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ - area->vm_addr = kmap_atomic(page); + area->vm_addr = kmap_local_page(page); ret = area->vm_addr + off; goto out; } @@ -1260,7 +1262,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) - kunmap_atomic(area->vm_addr); + kunmap_local(area->vm_addr); else { struct page *pages[2]; @@ -1318,7 +1320,7 @@ static unsigned long obj_malloc(struct zs_pool *pool, for (i = 0; i < nr_page; i++) m_page = get_next_page(m_page); - vaddr = kmap_atomic(m_page); + vaddr = kmap_local_page(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); if (likely(!ZsHugePage(zspage))) @@ -1328,7 +1330,7 @@ static unsigned long obj_malloc(struct zs_pool *pool, /* record handle to page->index */ zspage->first_page->index = handle | OBJ_ALLOCATED_TAG; - kunmap_atomic(vaddr); + kunmap_local(vaddr); mod_zspage_inuse(zspage, 1); obj = location_to_obj(m_page, obj); @@ -1419,7 +1421,7 @@ static void obj_free(int class_size, unsigned long obj) f_offset = offset_in_page(class_size * f_objidx); zspage = get_zspage(f_page); - vaddr = kmap_atomic(f_page); + vaddr = kmap_local_page(f_page); link = (struct link_free *)(vaddr + f_offset); /* Insert this object in containing zspage's freelist */ @@ -1429,7 +1431,7 @@ static void obj_free(int class_size, unsigned long obj) f_page->index = 0; set_freeobj(zspage, f_objidx); - kunmap_atomic(vaddr); + kunmap_local(vaddr); mod_zspage_inuse(zspage, -1); } @@ -1492,8 +1494,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, if (d_off + class->size > PAGE_SIZE) d_size = PAGE_SIZE - d_off; - s_addr = kmap_atomic(s_page); - d_addr = kmap_atomic(d_page); + s_addr = kmap_local_page(s_page); + d_addr = kmap_local_page(d_page); while (1) { size = min(s_size, d_size); @@ -1516,26 +1518,26 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, * Documentation/mm/highmem.rst. */ if (s_off >= PAGE_SIZE) { - kunmap_atomic(d_addr); - kunmap_atomic(s_addr); + kunmap_local(d_addr); + kunmap_local(s_addr); s_page = get_next_page(s_page); - s_addr = kmap_atomic(s_page); - d_addr = kmap_atomic(d_page); + s_addr = kmap_local_page(s_page); + d_addr = kmap_local_page(d_page); s_size = class->size - written; s_off = 0; } if (d_off >= PAGE_SIZE) { - kunmap_atomic(d_addr); + kunmap_local(d_addr); d_page = get_next_page(d_page); - d_addr = kmap_atomic(d_page); + d_addr = kmap_local_page(d_page); d_size = class->size - written; d_off = 0; } } - kunmap_atomic(d_addr); - kunmap_atomic(s_addr); + kunmap_local(d_addr); + kunmap_local(s_addr); } /* @@ -1548,7 +1550,7 @@ static unsigned long find_alloced_obj(struct size_class *class, unsigned int offset; int index = *obj_idx; unsigned long handle = 0; - void *addr = kmap_atomic(page); + void *addr = kmap_local_page(page); offset = get_first_obj_offset(page); offset += class->size * index; @@ -1561,7 +1563,7 @@ static unsigned long find_alloced_obj(struct size_class *class, index++; } - kunmap_atomic(addr); + kunmap_local(addr); *obj_idx = index; @@ -1798,14 +1800,14 @@ static int zs_page_migrate(struct page *newpage, struct page *page, migrate_write_lock(zspage); offset = get_first_obj_offset(page); - s_addr = kmap_atomic(page); + s_addr = kmap_local_page(page); /* * Here, any user cannot access all objects in the zspage so let's move. */ - d_addr = kmap_atomic(newpage); + d_addr = kmap_local_page(newpage); copy_page(d_addr, s_addr); - kunmap_atomic(d_addr); + kunmap_local(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { @@ -1818,7 +1820,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, record_obj(handle, new_obj); } } - kunmap_atomic(s_addr); + kunmap_local(s_addr); replace_sub_page(class, zspage, newpage, page); /* -- 2.51.0 From e664c2cd98cbf5e6fcc3ee92b5f3fc8f7f35e11e Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Thu, 10 Oct 2024 23:21:43 +0530 Subject: [PATCH 09/16] mm/zsmalloc: use memcpy_from/to_page whereever possible As part of "zsmalloc: replace kmap_atomic with kmap_local_page" [1] we replaced kmap/kunmap_atomic() with kmap_local_page()/kunmap_local(). But later it was found that some of the code could be replaced with already available apis in highmem.h, such as memcpy_from_page()/memcpy_to_page(). Also, update the comments with correct api naming. [1] https://lkml.kernel.org/r/20241001175358.12970-1-quic_pintu@quicinc.com Link: https://lkml.kernel.org/r/20241010175143.27262-1-quic_pintu@quicinc.com Signed-off-by: Pintu Kumar Suggested-by: Matthew Wilcox Suggested-by: Sergey Senozhatsky Cc: Joe Perches Cc: Minchan Kim Cc: Pintu Agarwal Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d3ff10160a5f..64b66a4d3e6e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -263,7 +263,7 @@ struct zspage { struct mapping_area { local_lock_t lock; char *vm_buf; /* copy buffer for objects that span pages */ - char *vm_addr; /* address of kmap_atomic()'ed pages */ + char *vm_addr; /* address of kmap_local_page()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ }; @@ -1046,11 +1046,10 @@ static inline void __zs_cpu_down(struct mapping_area *area) static void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - int sizes[2]; - void *addr; + size_t sizes[2]; char *buf = area->vm_buf; - /* disable page faults to match kmap_atomic() return conditions */ + /* disable page faults to match kmap_local_page() return conditions */ pagefault_disable(); /* no read fastpath */ @@ -1061,12 +1060,8 @@ static void *__zs_map_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy object to per-cpu buffer */ - addr = kmap_local_page(pages[0]); - memcpy(buf, addr + off, sizes[0]); - kunmap_local(addr); - addr = kmap_local_page(pages[1]); - memcpy(buf + sizes[0], addr, sizes[1]); - kunmap_local(addr); + memcpy_from_page(buf, pages[0], off, sizes[0]); + memcpy_from_page(buf + sizes[0], pages[1], 0, sizes[1]); out: return area->vm_buf; } @@ -1074,8 +1069,7 @@ out: static void __zs_unmap_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - int sizes[2]; - void *addr; + size_t sizes[2]; char *buf; /* no write fastpath */ @@ -1091,15 +1085,11 @@ static void __zs_unmap_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy per-cpu buffer to object */ - addr = kmap_local_page(pages[0]); - memcpy(addr + off, buf, sizes[0]); - kunmap_local(addr); - addr = kmap_local_page(pages[1]); - memcpy(addr, buf + sizes[0], sizes[1]); - kunmap_local(addr); + memcpy_to_page(pages[0], off, buf, sizes[0]); + memcpy_to_page(pages[1], 0, buf + sizes[0], sizes[1]); out: - /* enable page faults to match kunmap_atomic() return conditions */ + /* enable page faults to match kunmap_local() return conditions */ pagefault_enable(); } @@ -1511,10 +1501,10 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, d_size -= size; /* - * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic() - * calls must occurs in reverse order of calls to kmap_atomic(). - * So, to call kunmap_atomic(s_addr) we should first call - * kunmap_atomic(d_addr). For more details see + * Calling kunmap_local(d_addr) is necessary. kunmap_local() + * calls must occurs in reverse order of calls to kmap_local_page(). + * So, to call kunmap_local(s_addr) we should first call + * kunmap_local(d_addr). For more details see * Documentation/mm/highmem.rst. */ if (s_off >= PAGE_SIZE) { -- 2.51.0 From f7470591f8db1a72ce9f7ab49cb13c2b21b92002 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:12 +0100 Subject: [PATCH 10/16] mm: convert page_to_pgoff() to page_pgoff() Patch series "page->index removals in mm", v2. As part of shrinking struct page, we need to stop using page->index. This patchset gets rid of most of the remaining references to page->index in mm, as well as increasing the number of functions which take a const folio/page pointer. It shrinks the text segment of mm by a few hundred bytes in my test config, probably mostly from removing calls to compound_head() in page_to_pgoff(). This patch (of 7): Change the function signature to pass in the folio as all three callers have it. This removes a reference to page->index, which we're trying to get rid of. And add kernel-doc. Link: https://lkml.kernel.org/r/20241005200121.3231142-1-willy@infradead.org Link: https://lkml.kernel.org/r/20241005200121.3231142-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- include/linux/pagemap.h | 31 +++++++++++++++++-------------- mm/memory-failure.c | 4 ++-- mm/rmap.c | 2 +- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 08e487e5850a..78848fbefe94 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1895,7 +1895,7 @@ static inline unsigned long page_to_section(const struct page *page) * * Return: The Page Frame Number of the first page in the folio. */ -static inline unsigned long folio_pfn(struct folio *folio) +static inline unsigned long folio_pfn(const struct folio *folio) { return page_to_pfn(&folio->page); } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 68a5f1ff3301..bcf0865a38ae 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1011,22 +1011,25 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping, return read_cache_folio(mapping, index, NULL, file); } -/* - * Get the offset in PAGE_SIZE (even for hugetlb pages). +/** + * page_pgoff - Calculate the logical page offset of this page. + * @folio: The folio containing this page. + * @page: The page which we need the offset of. + * + * For file pages, this is the offset from the beginning of the file + * in units of PAGE_SIZE. For anonymous pages, this is the offset from + * the beginning of the anon_vma in units of PAGE_SIZE. This will + * return nonsense for KSM pages. + * + * Context: Caller must have a reference on the folio or otherwise + * prevent it from being split or freed. + * + * Return: The offset in units of PAGE_SIZE. */ -static inline pgoff_t page_to_pgoff(struct page *page) +static inline pgoff_t page_pgoff(const struct folio *folio, + const struct page *page) { - struct page *head; - - if (likely(!PageTransTail(page))) - return page->index; - - head = compound_head(page); - /* - * We don't initialize ->index for tail pages: calculate based on - * head page - */ - return head->index + page - head; + return folio->index + folio_page_idx(folio, page); } /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 96ce31e5a203..58a3d80961a4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -617,7 +617,7 @@ static void collect_procs_anon(struct folio *folio, struct page *page, if (av == NULL) /* Not actually mapped anymore */ return; - pgoff = page_to_pgoff(page); + pgoff = page_pgoff(folio, page); rcu_read_lock(); for_each_process(tsk) { struct vm_area_struct *vma; @@ -653,7 +653,7 @@ static void collect_procs_file(struct folio *folio, struct page *page, i_mmap_lock_read(mapping); rcu_read_lock(); - pgoff = page_to_pgoff(page); + pgoff = page_pgoff(folio, page); for_each_process(tsk) { struct task_struct *t = task_early_kill(tsk, force_early); unsigned long addr; diff --git a/mm/rmap.c b/mm/rmap.c index 4d75433330f9..77c6b27cb0ee 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1276,7 +1276,7 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page, */ VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, folio); - VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), + VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address), page); } -- 2.51.0 From 7d3e93eca3ca28bb5927b09b9b603c0c995bcd24 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:13 +0100 Subject: [PATCH 11/16] mm: use page_pgoff() in more places There are several places which currently open-code page_pgoff(), convert them to call it. Link: https://lkml.kernel.org/r/20241005200121.3231142-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- kernel/futex/core.c | 2 +- mm/page_vma_mapped.c | 3 +-- mm/rmap.c | 4 +--- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 136768ae2637..342dc4dd328b 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -399,7 +399,7 @@ again: key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); - key->shared.pgoff = folio->index + folio_page_idx(folio, page); + key->shared.pgoff = page_pgoff(folio, page); rcu_read_unlock(); } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index ab1671e71cb2..6b356853c04e 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -340,7 +340,6 @@ next_pte: unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { struct folio *folio = page_folio(page); - pgoff_t pgoff = folio->index + folio_page_idx(folio, page); struct page_vma_mapped_walk pvmw = { .pfn = page_to_pfn(page), .nr_pages = 1, @@ -348,7 +347,7 @@ unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) .flags = PVMW_SYNC, }; - pvmw.address = vma_address(vma, pgoff, 1); + pvmw.address = vma_address(vma, page_pgoff(folio, page), 1); if (pvmw.address == -EFAULT) goto out; if (!page_vma_mapped_walk(&pvmw)) diff --git a/mm/rmap.c b/mm/rmap.c index 77c6b27cb0ee..e5ec8304a193 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -774,7 +774,6 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { struct folio *folio = page_folio(page); - pgoff_t pgoff; if (folio_test_anon(folio)) { struct anon_vma *page__anon_vma = folio_anon_vma(folio); @@ -792,8 +791,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) } /* The !page__anon_vma above handles KSM folios */ - pgoff = folio->index + folio_page_idx(folio, page); - return vma_address(vma, pgoff, 1); + return vma_address(vma, page_pgoff(folio, page), 1); } /* -- 2.51.0 From 713da0b33b3e9d16272b57f4c44dee5c052be9b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:14 +0100 Subject: [PATCH 12/16] mm: renovate page_address_in_vma() This function doesn't modify any of its arguments, so if we make a few other functions take const pointers, we can make page_address_in_vma() take const pointers too. All of its callers have the containing folio already, so pass that in as an argument instead of recalculating it. Also add kernel-doc Link: https://lkml.kernel.org/r/20241005200121.3231142-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 7 ++----- mm/internal.h | 4 ++-- mm/ksm.c | 7 +++---- mm/memory-failure.c | 2 +- mm/mempolicy.c | 2 +- mm/rmap.c | 27 ++++++++++++++++++++------- mm/util.c | 2 +- 7 files changed, 30 insertions(+), 21 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d5e93e44322e..78923015a2e8 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -728,11 +728,8 @@ page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw) } bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); - -/* - * Used by swapoff to help locate where page is expected in vma. - */ -unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); +unsigned long page_address_in_vma(const struct folio *folio, + const struct page *, const struct vm_area_struct *); /* * Cleans the PTEs of shared mappings. diff --git a/mm/internal.h b/mm/internal.h index cd96848be245..8674f677304a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -841,7 +841,7 @@ static inline bool free_area_empty(struct free_area *area, int migratetype) } /* mm/util.c */ -struct anon_vma *folio_anon_vma(struct folio *folio); +struct anon_vma *folio_anon_vma(const struct folio *folio); #ifdef CONFIG_MMU void unmap_mapping_folio(struct folio *folio); @@ -959,7 +959,7 @@ extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); * If any page in this range is mapped by this VMA, return the first address * where any of these pages appear. Otherwise, return -EFAULT. */ -static inline unsigned long vma_address(struct vm_area_struct *vma, +static inline unsigned long vma_address(const struct vm_area_struct *vma, pgoff_t pgoff, unsigned long nr_pages) { unsigned long address; diff --git a/mm/ksm.c b/mm/ksm.c index e596bc1b5fa7..b813225a806d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1256,7 +1256,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct folio *folio, if (WARN_ON_ONCE(folio_test_large(folio))) return err; - pvmw.address = page_address_in_vma(&folio->page, vma); + pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma); if (pvmw.address == -EFAULT) goto out; @@ -1340,7 +1340,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, { struct folio *kfolio = page_folio(kpage); struct mm_struct *mm = vma->vm_mm; - struct folio *folio; + struct folio *folio = page_folio(page); pmd_t *pmd; pmd_t pmde; pte_t *ptep; @@ -1350,7 +1350,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, int err = -EFAULT; struct mmu_notifier_range range; - addr = page_address_in_vma(page, vma); + addr = page_address_in_vma(folio, page, vma); if (addr == -EFAULT) goto out; @@ -1416,7 +1416,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at(mm, addr, ptep, newpte); - folio = page_folio(page); folio_remove_rmap_pte(folio, page, vma); if (!folio_mapped(folio)) folio_free_swap(folio); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 58a3d80961a4..ea9d883c01c1 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -671,7 +671,7 @@ static void collect_procs_file(struct folio *folio, struct page *page, */ if (vma->vm_mm != t->mm) continue; - addr = page_address_in_vma(page, vma); + addr = page_address_in_vma(folio, page, vma); add_to_kill_anon_file(t, page, vma, to_kill, addr); } } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a29eff5d0585..bb37cd1a51d8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1367,7 +1367,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_entry_is_head(folio, &pagelist, lru)) { vma_iter_init(&vmi, mm, start); for_each_vma_range(vmi, vma, end) { - addr = page_address_in_vma( + addr = page_address_in_vma(folio, folio_page(folio, 0), vma); if (addr != -EFAULT) break; diff --git a/mm/rmap.c b/mm/rmap.c index e5ec8304a193..d4e5fe94fa92 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -767,14 +767,27 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ -/* - * At what user virtual address is page expected in vma? - * Caller should check the page is actually part of the vma. +/** + * page_address_in_vma - The virtual address of a page in this VMA. + * @folio: The folio containing the page. + * @page: The page within the folio. + * @vma: The VMA we need to know the address in. + * + * Calculates the user virtual address of this page in the specified VMA. + * It is the caller's responsibililty to check the page is actually + * within the VMA. There may not currently be a PTE pointing at this + * page, but if a page fault occurs at this address, this is the page + * which will be accessed. + * + * Context: Caller should hold a reference to the folio. Caller should + * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the + * VMA from being altered. + * + * Return: The virtual address corresponding to this page in the VMA. */ -unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) +unsigned long page_address_in_vma(const struct folio *folio, + const struct page *page, const struct vm_area_struct *vma) { - struct folio *folio = page_folio(page); - if (folio_test_anon(folio)) { struct anon_vma *page__anon_vma = folio_anon_vma(folio); /* @@ -790,7 +803,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } - /* The !page__anon_vma above handles KSM folios */ + /* KSM folios don't reach here because of the !page__anon_vma check */ return vma_address(vma, page_pgoff(folio, page), 1); } diff --git a/mm/util.c b/mm/util.c index 4f1275023eb7..60017d2a9e48 100644 --- a/mm/util.c +++ b/mm/util.c @@ -820,7 +820,7 @@ void *vcalloc_noprof(size_t n, size_t size) } EXPORT_SYMBOL(vcalloc_noprof); -struct anon_vma *folio_anon_vma(struct folio *folio) +struct anon_vma *folio_anon_vma(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; -- 2.51.0 From 68158bfa3dbd4af8461ef75a91ffc03be942c8fe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:15 +0100 Subject: [PATCH 13/16] mm: mass constification of folio/page pointers Now that page_pgoff() takes const pointers, we can constify the pointers to a lot of functions. Link: https://lkml.kernel.org/r/20241005200121.3231142-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/ksm.h | 7 ++++--- include/linux/rmap.h | 10 +++++----- mm/internal.h | 5 +++-- mm/ksm.c | 5 +++-- mm/memory-failure.c | 24 +++++++++++++----------- mm/page_vma_mapped.c | 5 +++-- mm/rmap.c | 11 ++++++----- 7 files changed, 37 insertions(+), 30 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 29022e71a074..6a53ac4885bb 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -90,7 +90,7 @@ struct folio *ksm_might_need_to_copy(struct folio *folio, void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); -void collect_procs_ksm(struct folio *folio, struct page *page, +void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); @@ -122,8 +122,9 @@ static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } -static inline void collect_procs_ksm(struct folio *folio, struct page *page, - struct list_head *to_kill, int force_early) +static inline void collect_procs_ksm(const struct folio *folio, + const struct page *page, struct list_head *to_kill, + int force_early) { } diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 78923015a2e8..683a04088f3f 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -171,7 +171,7 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, unlink_anon_vmas(next); } -struct anon_vma *folio_get_anon_vma(struct folio *folio); +struct anon_vma *folio_get_anon_vma(const struct folio *folio); /* RMAP flags, currently only relevant for some anon rmap operations. */ typedef int __bitwise rmap_t; @@ -194,8 +194,8 @@ enum rmap_level { RMAP_LEVEL_PMD, }; -static inline void __folio_rmap_sanity_checks(struct folio *folio, - struct page *page, int nr_pages, enum rmap_level level) +static inline void __folio_rmap_sanity_checks(const struct folio *folio, + const struct page *page, int nr_pages, enum rmap_level level) { /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); @@ -771,14 +771,14 @@ struct rmap_walk_control { bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct folio *folio); - struct anon_vma *(*anon_lock)(struct folio *folio, + struct anon_vma *(*anon_lock)(const struct folio *folio, struct rmap_walk_control *rwc); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); -struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, +struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ diff --git a/mm/internal.h b/mm/internal.h index 8674f677304a..fd6373cb1c66 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1117,10 +1117,11 @@ void ClearPageHWPoisonTakenOff(struct page *page); bool take_page_off_buddy(struct page *page); bool put_page_back_buddy(struct page *page); struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); -void add_to_kill_ksm(struct task_struct *tsk, struct page *p, +void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long ksm_addr); -unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); +unsigned long page_mapped_in_vma(const struct page *page, + struct vm_area_struct *vma); #else static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) diff --git a/mm/ksm.c b/mm/ksm.c index b813225a806d..7ac59cde626c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1051,7 +1051,8 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } -static inline struct ksm_stable_node *folio_stable_node(struct folio *folio) +static inline +struct ksm_stable_node *folio_stable_node(const struct folio *folio) { return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } @@ -3067,7 +3068,7 @@ again: /* * Collect processes when the error hit an ksm page. */ -void collect_procs_ksm(struct folio *folio, struct page *page, +void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ea9d883c01c1..7ce7ba8586f5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -445,7 +445,7 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. */ -static void __add_to_kill(struct task_struct *tsk, struct page *p, +static void __add_to_kill(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long addr) { @@ -461,7 +461,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, if (is_zone_device_page(p)) tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); else - tk->size_shift = page_shift(compound_head(p)); + tk->size_shift = folio_shift(page_folio(p)); /* * Send SIGKILL if "tk->addr == -EFAULT". Also, as @@ -486,7 +486,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, list_add_tail(&tk->nd, to_kill); } -static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p, +static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long addr) { @@ -509,7 +509,7 @@ static bool task_in_to_kill_list(struct list_head *to_kill, return false; } -void add_to_kill_ksm(struct task_struct *tsk, struct page *p, +void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long addr) { @@ -606,8 +606,9 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) /* * Collect processes when the error hit an anonymous page. */ -static void collect_procs_anon(struct folio *folio, struct page *page, - struct list_head *to_kill, int force_early) +static void collect_procs_anon(const struct folio *folio, + const struct page *page, struct list_head *to_kill, + int force_early) { struct task_struct *tsk; struct anon_vma *av; @@ -643,8 +644,9 @@ static void collect_procs_anon(struct folio *folio, struct page *page, /* * Collect processes when the error hit a file mapped page. */ -static void collect_procs_file(struct folio *folio, struct page *page, - struct list_head *to_kill, int force_early) +static void collect_procs_file(const struct folio *folio, + const struct page *page, struct list_head *to_kill, + int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -680,7 +682,7 @@ static void collect_procs_file(struct folio *folio, struct page *page, } #ifdef CONFIG_FS_DAX -static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, +static void add_to_kill_fsdax(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, pgoff_t pgoff) { @@ -691,7 +693,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, /* * Collect processes when the error hit a fsdax page. */ -static void collect_procs_fsdax(struct page *page, +static void collect_procs_fsdax(const struct page *page, struct address_space *mapping, pgoff_t pgoff, struct list_head *to_kill, bool pre_remove) { @@ -725,7 +727,7 @@ static void collect_procs_fsdax(struct page *page, /* * Collect the processes who have the corrupted page mapped to kill. */ -static void collect_procs(struct folio *folio, struct page *page, +static void collect_procs(const struct folio *folio, const struct page *page, struct list_head *tokill, int force_early) { if (!folio->mapping) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 6b356853c04e..81839a9e74f1 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -337,9 +337,10 @@ next_pte: * outside the VMA or not present, returns -EFAULT. * Only valid for normal file or anonymous VMAs. */ -unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +unsigned long page_mapped_in_vma(const struct page *page, + struct vm_area_struct *vma) { - struct folio *folio = page_folio(page); + const struct folio *folio = page_folio(page); struct page_vma_mapped_walk pvmw = { .pfn = page_to_pfn(page), .nr_pages = 1, diff --git a/mm/rmap.c b/mm/rmap.c index d4e5fe94fa92..c6c4d4ea29a7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -496,7 +496,7 @@ void __init anon_vma_init(void) * concurrently without folio lock protection). See folio_lock_anon_vma_read() * which has already covered that, and comment above remap_pages(). */ -struct anon_vma *folio_get_anon_vma(struct folio *folio) +struct anon_vma *folio_get_anon_vma(const struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; @@ -540,7 +540,7 @@ out: * reference like with folio_get_anon_vma() and then block on the mutex * on !rwc->try_lock case. */ -struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, +struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma = NULL; @@ -1271,8 +1271,9 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped */ -static void __page_check_anon_rmap(struct folio *folio, struct page *page, - struct vm_area_struct *vma, unsigned long address) +static void __page_check_anon_rmap(const struct folio *folio, + const struct page *page, struct vm_area_struct *vma, + unsigned long address) { /* * The page's anon-rmap details (mapping and index) are guaranteed to @@ -2569,7 +2570,7 @@ void __put_anon_vma(struct anon_vma *anon_vma) anon_vma_free(root); } -static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, +static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; -- 2.51.0 From 0386aaa6e9c826bc494169a914e01a86befe6edf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:16 +0100 Subject: [PATCH 14/16] bootmem: stop using page->index Encode the type into the bottom four bits of page->private and the info into the remaining bits. Also turn the bootmem type into a named enum. [arnd@arndb.de: bootmem: add bootmem_type stub function] Link: https://lkml.kernel.org/r/20241015143802.577613-1-arnd@kernel.org [akpm@linux-foundation.org: fix build with !CONFIG_HAVE_BOOTMEM_INFO_NODE] Link: https://lore.kernel.org/oe-kbuild-all/202410090311.eaqcL7IZ-lkp@intel.com/ Link: https://lkml.kernel.org/r/20241005200121.3231142-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Arnd Bergmann Cc: kernel test robot Signed-off-by: Andrew Morton --- arch/x86/mm/init_64.c | 28 +++++++++++++++++++--------- include/linux/bootmem_info.h | 35 +++++++++++++++++++++++++++-------- mm/bootmem_info.c | 11 ++++++----- mm/sparse.c | 8 ++++---- 4 files changed, 56 insertions(+), 26 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5a564130b9d0..01ea7c6df303 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -985,22 +985,32 @@ int arch_add_memory(int nid, u64 start, u64 size, return add_pages(nid, start_pfn, nr_pages, params); } -static void __meminit free_pagetable(struct page *page, int order) +static void free_reserved_pages(struct page *page, unsigned long nr_pages) { - unsigned long magic; - unsigned int nr_pages = 1 << order; + while (nr_pages--) + free_reserved_page(page++); +} +static void __meminit free_pagetable(struct page *page, int order) +{ /* bootmem page has reserved flag */ if (PageReserved(page)) { - magic = page->index; - if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { + unsigned long nr_pages = 1 << order; +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE + enum bootmem_type type = bootmem_type(page); + + if (type == SECTION_INFO || type == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); - } else - while (nr_pages--) - free_reserved_page(page++); - } else + } else { + free_reserved_pages(page, nr_pages); + } +#else + free_reserved_pages(page, nr_pages); +#endif + } else { free_pages((unsigned long)page_address(page), order); + } } static void __meminit free_hugepage_table(struct page *page, diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index cffa38a73618..d8a8d245824a 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -6,11 +6,10 @@ #include /* - * Types for free bootmem stored in page->lru.next. These have to be in - * some random range in unsigned long space for debugging purposes. + * Types for free bootmem stored in the low bits of page->private. */ -enum { - MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12, +enum bootmem_type { + MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 1, SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, MIX_SECTION_INFO, NODE_INFO, @@ -21,9 +20,19 @@ enum { void __init register_page_bootmem_info_node(struct pglist_data *pgdat); void get_page_bootmem(unsigned long info, struct page *page, - unsigned long type); + enum bootmem_type type); void put_page_bootmem(struct page *page); +static inline enum bootmem_type bootmem_type(const struct page *page) +{ + return (unsigned long)page->private & 0xf; +} + +static inline unsigned long bootmem_info(const struct page *page) +{ + return (unsigned long)page->private >> 4; +} + /* * Any memory allocated via the memblock allocator and not via the * buddy will be marked reserved already in the memmap. For those @@ -31,7 +40,7 @@ void put_page_bootmem(struct page *page); */ static inline void free_bootmem_page(struct page *page) { - unsigned long magic = page->index; + enum bootmem_type type = bootmem_type(page); /* * The reserve_bootmem_region sets the reserved flag on bootmem @@ -39,7 +48,7 @@ static inline void free_bootmem_page(struct page *page) */ VM_BUG_ON_PAGE(page_ref_count(page) != 2, page); - if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) + if (type == SECTION_INFO || type == MIX_SECTION_INFO) put_page_bootmem(page); else VM_BUG_ON_PAGE(1, page); @@ -53,8 +62,18 @@ static inline void put_page_bootmem(struct page *page) { } +static inline enum bootmem_type bootmem_type(const struct page *page) +{ + return SECTION_INFO; +} + +static inline unsigned long bootmem_info(const struct page *page) +{ + return 0; +} + static inline void get_page_bootmem(unsigned long info, struct page *page, - unsigned long type) + enum bootmem_type type) { } diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index fa7cb0c87c03..95f288169a38 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -14,23 +14,24 @@ #include #include -void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) +void get_page_bootmem(unsigned long info, struct page *page, + enum bootmem_type type) { - page->index = type; + BUG_ON(type > 0xf); + BUG_ON(info > (ULONG_MAX >> 4)); SetPagePrivate(page); - set_page_private(page, info); + set_page_private(page, info << 4 | type); page_ref_inc(page); } void put_page_bootmem(struct page *page) { - unsigned long type = page->index; + enum bootmem_type type = bootmem_type(page); BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (page_ref_dec_return(page) == 1) { - page->index = 0; ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); diff --git a/mm/sparse.c b/mm/sparse.c index 4cb9793f0b52..13b6624d3562 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -720,19 +720,19 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, static void free_map_bootmem(struct page *memmap) { unsigned long maps_section_nr, removing_section_nr, i; - unsigned long magic, nr_pages; + unsigned long type, nr_pages; struct page *page = virt_to_page(memmap); nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) >> PAGE_SHIFT; for (i = 0; i < nr_pages; i++, page++) { - magic = page->index; + type = bootmem_type(page); - BUG_ON(magic == NODE_INFO); + BUG_ON(type == NODE_INFO); maps_section_nr = pfn_to_section_nr(page_to_pfn(page)); - removing_section_nr = page_private(page); + removing_section_nr = bootmem_info(page); /* * When this function is called, the removing section is -- 2.51.0 From 544ec0ed376486fae387c023390add32e68b58dd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:17 +0100 Subject: [PATCH 15/16] mm: remove references to page->index in huge_memory.c We already have folios in all these places; it's just a matter of using them instead of the pages. Link: https://lkml.kernel.org/r/20241005200121.3231142-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 492c16eaf147..61fc407330f2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3199,8 +3199,8 @@ static void __split_huge_page_tail(struct folio *folio, int tail, /* ->mapping in first and second tail page is replaced by other uses */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, page_tail); - page_tail->mapping = head->mapping; - page_tail->index = head->index + tail; + new_folio->mapping = folio->mapping; + new_folio->index = folio->index + tail; /* * page->private should not be set in tail pages. Fix up and warn once @@ -3276,11 +3276,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageHasHWPoisoned(head); for (i = nr - new_nr; i >= new_nr; i -= new_nr) { + struct folio *tail; __split_huge_page_tail(folio, i, lruvec, list, new_order); + tail = page_folio(head + i); /* Some pages can be beyond EOF: drop them from page cache */ - if (head[i].index >= end) { - struct folio *tail = page_folio(head + i); - + if (tail->index >= end) { if (shmem_mapping(folio->mapping)) nr_dropped++; else if (folio_test_clear_dirty(tail)) @@ -3288,12 +3288,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, inode_to_wb(folio->mapping->host)); __filemap_remove_folio(tail, NULL); folio_put(tail); - } else if (!PageAnon(page)) { - __xa_store(&folio->mapping->i_pages, head[i].index, - head + i, 0); + } else if (!folio_test_anon(folio)) { + __xa_store(&folio->mapping->i_pages, tail->index, + tail, 0); } else if (swap_cache) { __xa_store(&swap_cache->i_pages, offset + i, - head + i, 0); + tail, 0); } } -- 2.51.0 From 33d7f15f916ea50e9d29b805fcfdbb9e930a742a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:18 +0100 Subject: [PATCH 16/16] mm: use page->private instead of page->index in percpu The percpu allocator only uses one field in struct page, just change it from page->index to page->private. Link: https://lkml.kernel.org/r/20241005200121.3231142-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/percpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index d1a73cf65c53..d8dd31a2e407 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -253,13 +253,13 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) /* set the pointer to a chunk in a page struct */ static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) { - page->index = (unsigned long)pcpu; + page->private = (unsigned long)pcpu; } /* obtain pointer to a chunk from a page struct */ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) { - return (struct pcpu_chunk *)page->index; + return (struct pcpu_chunk *)page->private; } static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) -- 2.51.0