From 4f79384a25d57a59e142009e52f40ae1f25102fe Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Tue, 14 Jan 2025 01:57:32 +0800
Subject: [PATCH 01/16] mm, swap_slots: remove slot cache for freeing path

The slot cache for the freeing path is mostly for reducing the overhead of si->lock. As we have basically eliminated the si->lock usage for the freeing path, it can be removed. This helps simplify the code, and avoids swap entries being held in the cache upon freeing. The delayed freeing of entries has been causing trouble for further zswap optimizations [1] and in theory will also cause more fragmentation and extra overhead.

Tests with kernel builds showed both performance and fragmentation are better without the cache:

time make -j96 / 768M memcg, 4K pages, 10G ZRAM, avg of 4 test runs:
Before: Sys time: 36047.78, Real time: 472.43
After: (-7.6% sys time, -7.3% real time) Sys time: 33314.76, Real time: 437.67

time make -j96 / 1152M memcg, 64K mTHP, 10G ZRAM, avg of 4 test runs:
Before: Sys time: 46859.04, Real time: 562.63 hugepages-64kB/stats/swpout: 1783392 hugepages-64kB/stats/swpout_fallback: 240875
After: (-23.3% sys time, -21.3% real time) Sys time: 35958.87, Real time: 442.69 hugepages-64kB/stats/swpout: 1866267 hugepages-64kB/stats/swpout_fallback: 158330

Sequential SWAP should also be slightly faster; tests didn't show a measurable difference, but at least there was no regression:

Swapin 4G zero page on ZRAM (time in us):
Before (avg. 1923756): 1912391 1927023 1927957 1916527 1918263 1914284 1934753 1940813 1921791
After (avg. 1922290): 1919101 1925743 1916810 1917007 1923930 1935152 1917403 1923549 1921913

Link: https://lore.kernel.org/all/CAMgjq7ACohT_uerSz8E_994ZZCv709Zor+43hdmesW_59W1BWw@mail.gmail.com/ [1]
Link: https://lkml.kernel.org/r/20250113175732.48099-14-ryncsn@gmail.com
Signed-off-by: Kairui Song
Suggested-by: Chris Li
Cc: Baoquan He
Cc: Barry Song
Cc: "Huang, Ying"
Cc: Hugh Dickens
Cc: Johannes Weiner
Cc: Kalesh Singh
Cc: Nhat Pham
Cc: Ryan Roberts
Cc: Yosry Ahmed
Signed-off-by: Andrew Morton
---
 include/linux/swap_slots.h | 3 --
 mm/swap_slots.c | 78 +++++----------------------------
 mm/swapfile.c | 89 +++++++++++++++-----------------
 3 files changed, 44 insertions(+), 126 deletions(-)

diff --git a/include/linux/swap_slots.h b/include/linux/swap_slots.h
index 15adfb8c813a..840aec3523b2 100644
--- a/include/linux/swap_slots.h
+++ b/include/linux/swap_slots.h
@@ -16,15 +16,12 @@ struct swap_slots_cache { swp_entry_t *slots; int nr; int cur; - spinlock_t free_lock; /* protects slots_ret, n_ret */ - swp_entry_t *slots_ret; int n_ret; }; void disable_swap_slots_cache_lock(void); void reenable_swap_slots_cache_unlock(void); void enable_swap_slots_cache(void); -void free_swap_slot(swp_entry_t entry); extern bool swap_slot_cache_enabled;

diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 13ab3b771409..9c7c171df7ba 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -43,17 +43,15 @@ static DEFINE_MUTEX(swap_slots_cache_mutex); /* Serialize swap slots cache enable/disable operations */ static DEFINE_MUTEX(swap_slots_cache_enable_mutex); -static void __drain_swap_slots_cache(unsigned int type); +static void __drain_swap_slots_cache(void); #define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled) -#define SLOTS_CACHE 0x1 -#define SLOTS_CACHE_RET 0x2 static void deactivate_swap_slots_cache(void) { mutex_lock(&swap_slots_cache_mutex); swap_slot_cache_active = false; - __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + __drain_swap_slots_cache();
mutex_unlock(&swap_slots_cache_mutex); } @@ -72,7 +70,7 @@ void disable_swap_slots_cache_lock(void) if (swap_slot_cache_initialized) { /* serialize with cpu hotplug operations */ cpus_read_lock(); - __drain_swap_slots_cache(SLOTS_CACHE|SLOTS_CACHE_RET); + __drain_swap_slots_cache(); cpus_read_unlock(); } } @@ -113,7 +111,7 @@ out: static int alloc_swap_slot_cache(unsigned int cpu) { struct swap_slots_cache *cache; - swp_entry_t *slots, *slots_ret; + swp_entry_t *slots; /* * Do allocation outside swap_slots_cache_mutex @@ -125,28 +123,19 @@ static int alloc_swap_slot_cache(unsigned int cpu) if (!slots) return -ENOMEM; - slots_ret = kvcalloc(SWAP_SLOTS_CACHE_SIZE, sizeof(swp_entry_t), - GFP_KERNEL); - if (!slots_ret) { - kvfree(slots); - return -ENOMEM; - } - mutex_lock(&swap_slots_cache_mutex); cache = &per_cpu(swp_slots, cpu); - if (cache->slots || cache->slots_ret) { + if (cache->slots) { /* cache already allocated */ mutex_unlock(&swap_slots_cache_mutex); kvfree(slots); - kvfree(slots_ret); return 0; } if (!cache->lock_initialized) { mutex_init(&cache->alloc_lock); - spin_lock_init(&cache->free_lock); cache->lock_initialized = true; } cache->nr = 0; @@ -160,19 +149,16 @@ static int alloc_swap_slot_cache(unsigned int cpu) */ mb(); cache->slots = slots; - cache->slots_ret = slots_ret; mutex_unlock(&swap_slots_cache_mutex); return 0; } -static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, - bool free_slots) +static void drain_slots_cache_cpu(unsigned int cpu, bool free_slots) { struct swap_slots_cache *cache; - swp_entry_t *slots = NULL; cache = &per_cpu(swp_slots, cpu); - if ((type & SLOTS_CACHE) && cache->slots) { + if (cache->slots) { mutex_lock(&cache->alloc_lock); swapcache_free_entries(cache->slots + cache->cur, cache->nr); cache->cur = 0; @@ -183,20 +169,9 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, } mutex_unlock(&cache->alloc_lock); } - if ((type & SLOTS_CACHE_RET) && cache->slots_ret) { - spin_lock_irq(&cache->free_lock); - swapcache_free_entries(cache->slots_ret, cache->n_ret); - cache->n_ret = 0; - if (free_slots && cache->slots_ret) { - slots = cache->slots_ret; - cache->slots_ret = NULL; - } - spin_unlock_irq(&cache->free_lock); - kvfree(slots); - } } -static void __drain_swap_slots_cache(unsigned int type) +static void __drain_swap_slots_cache(void) { unsigned int cpu; @@ -224,13 +199,13 @@ static void __drain_swap_slots_cache(unsigned int type) * There are no slots on such cpu that need to be drained. */ for_each_online_cpu(cpu) - drain_slots_cache_cpu(cpu, type, false); + drain_slots_cache_cpu(cpu, false); } static int free_slot_cache(unsigned int cpu) { mutex_lock(&swap_slots_cache_mutex); - drain_slots_cache_cpu(cpu, SLOTS_CACHE | SLOTS_CACHE_RET, true); + drain_slots_cache_cpu(cpu, true); mutex_unlock(&swap_slots_cache_mutex); return 0; } @@ -269,39 +244,6 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) return cache->nr; } -void free_swap_slot(swp_entry_t entry) -{ - struct swap_slots_cache *cache; - - /* Large folio swap slot is not covered. */ - zswap_invalidate(entry); - - cache = raw_cpu_ptr(&swp_slots); - if (likely(use_swap_slot_cache && cache->slots_ret)) { - spin_lock_irq(&cache->free_lock); - /* Swap slots cache may be deactivated before acquiring lock */ - if (!use_swap_slot_cache || !cache->slots_ret) { - spin_unlock_irq(&cache->free_lock); - goto direct_free; - } - if (cache->n_ret >= SWAP_SLOTS_CACHE_SIZE) { - /* - * Return slots to global pool. 
- * The current swap_map value is SWAP_HAS_CACHE. - * Set it to 0 to indicate it is available for - * allocation in global pool - */ - swapcache_free_entries(cache->slots_ret, cache->n_ret); - cache->n_ret = 0; - } - cache->slots_ret[cache->n_ret++] = entry; - spin_unlock_irq(&cache->free_lock); - } else { -direct_free: - swapcache_free_entries(&entry, 1); - } -} - swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; diff --git a/mm/swapfile.c b/mm/swapfile.c index adf97c9ccb96..6e867c16ea93 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -53,14 +53,15 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, - unsigned int nr_pages); +static void swap_entry_range_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset); -static void unlock_cluster(struct swap_cluster_info *ci); +static inline void unlock_cluster(struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -261,10 +262,9 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, folio_ref_sub(folio, nr_pages); folio_set_dirty(folio); - /* Only sinple page folio can be backed by zswap */ - if (nr_pages == 1) - zswap_invalidate(entry); - swap_entry_range_free(si, entry, nr_pages); + ci = lock_cluster(si, offset); + swap_entry_range_free(si, ci, entry, nr_pages); + unlock_cluster(ci); ret = nr_pages; out_unlock: folio_unlock(folio); @@ -1128,8 +1128,10 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, * Use atomic clear_bit operations only on zeromap instead of non-atomic * bitmap_clear to prevent adjacent bits corruption due to simultaneous writes. */ - for (i = 0; i < nr_entries; i++) + for (i = 0; i < nr_entries; i++) { clear_bit(offset + i, si->zeromap); + zswap_invalidate(swp_entry(si->type, offset + i)); + } if (si->flags & SWP_BLKDEV) swap_slot_free_notify = @@ -1434,9 +1436,9 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si, ci = lock_cluster(si, offset); usage = __swap_entry_free_locked(si, offset, 1); - unlock_cluster(ci); if (!usage) - free_swap_slot(entry); + swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + unlock_cluster(ci); return usage; } @@ -1464,13 +1466,10 @@ static bool __swap_entries_free(struct swap_info_struct *si, } for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); + if (!has_cache) + swap_entry_range_free(si, ci, entry, nr); unlock_cluster(ci); - if (!has_cache) { - for (i = 0; i < nr; i++) - zswap_invalidate(swp_entry(si->type, offset + i)); - swap_entry_range_free(si, entry, nr); - } return has_cache; fallback: @@ -1490,15 +1489,13 @@ fallback: * Drop the last HAS_CACHE flag of swap entries, caller have to * ensure all entries belong to the same cgroup. 
*/ -static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry, - unsigned int nr_pages) +static void swap_entry_range_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; unsigned char *map_end = map + nr_pages; - struct swap_cluster_info *ci; - - ci = lock_cluster(si, offset); /* It should never free entries across different clusters */ VM_BUG_ON(ci != offset_to_cluster(si, offset + nr_pages - 1)); @@ -1518,7 +1515,6 @@ static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry free_cluster(si, ci); else partial_free_cluster(si, ci); - unlock_cluster(ci); } static void cluster_swap_free_nr(struct swap_info_struct *si, @@ -1526,28 +1522,13 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, unsigned char usage) { struct swap_cluster_info *ci; - DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; - int i, nr; + unsigned long end = offset + nr_pages; ci = lock_cluster(si, offset); - while (nr_pages) { - nr = min(BITS_PER_LONG, nr_pages); - for (i = 0; i < nr; i++) { - if (!__swap_entry_free_locked(si, offset + i, usage)) - bitmap_set(to_free, i, 1); - } - if (!bitmap_empty(to_free, BITS_PER_LONG)) { - unlock_cluster(ci); - for_each_set_bit(i, to_free, BITS_PER_LONG) - free_swap_slot(swp_entry(si->type, offset + i)); - if (nr == nr_pages) - return; - bitmap_clear(to_free, 0, BITS_PER_LONG); - ci = lock_cluster(si, offset); - } - offset += nr; - nr_pages -= nr; - } + do { + if (!__swap_entry_free_locked(si, offset, usage)) + swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + } while (++offset < end); unlock_cluster(ci); } @@ -1588,18 +1569,12 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) return; ci = lock_cluster(si, offset); - if (size > 1 && swap_is_has_cache(si, offset, size)) { - unlock_cluster(ci); - swap_entry_range_free(si, entry, size); - return; - } - for (int i = 0; i < size; i++, entry.val++) { - if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { - unlock_cluster(ci); - free_swap_slot(entry); - if (i == size - 1) - return; - lock_cluster(si, offset); + if (swap_is_has_cache(si, offset, size)) + swap_entry_range_free(si, ci, entry, size); + else { + for (int i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) + swap_entry_range_free(si, ci, entry, 1); } } unlock_cluster(ci); @@ -1608,6 +1583,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) void swapcache_free_entries(swp_entry_t *entries, int n) { int i; + struct swap_cluster_info *ci; struct swap_info_struct *si = NULL; if (n <= 0) @@ -1615,8 +1591,11 @@ void swapcache_free_entries(swp_entry_t *entries, int n) for (i = 0; i < n; ++i) { si = _swap_info_get(entries[i]); - if (si) - swap_entry_range_free(si, entries[i], 1); + if (si) { + ci = lock_cluster(si, swp_offset(entries[i])); + swap_entry_range_free(si, ci, entries[i], 1); + unlock_cluster(ci); + } } } -- 2.50.1 From c4f1b56f3f78d22d8f73bd9264a7c2e6d74432ff Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 2 Jan 2025 12:10:51 +0000 Subject: [PATCH 02/16] mips: vdso: prefer do_mmap() to mmap_region() Patch series "mm: update mips to use do_mmap(), make mmap_region() internal". Currently the only user of mmap_region() outside of the memory management code is the MIPS VDSO implementation. 
This uses mmap_region() to map a 'delay slot emulation page' at the top of the stack which is read-only and executable. This mapping requires that an already-acquired mmap write lock is utilised and that uffd and populate logic is ignored. This rules out vm_mmap(), however do_mmap() fits the bill. Adapt this code to use do_mmap() and then once done, make mmap_region() internal and userland testable, and avoid any other uses of mmap_region(), which is absolutely and strictly an internal mm function which bypasses a great number of checks and logic. This patch (of 2): mmap_region() is an internal memory management implementation detail that is not intended to be used outside of the memory management subsystem. Map the delay slot emulation page using do_mmap() which makes use of the already-held mmap write lock and bypasses unneeded populate and userfaultfd logic. This should have the precise same behaviour as the existing logic. Link: https://lkml.kernel.org/r/cover.1735819274.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/ef076e381570f709e5c2c142dc030ec5b3309a0e.1735819274.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Jann Horn Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/mips/kernel/vdso.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index 4c8e3c0aa210..75c9d3618f58 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -97,11 +98,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return -EINTR; if (IS_ENABLED(CONFIG_MIPS_FP_SUPPORT)) { + unsigned long unused; + /* Map delay slot emulation page */ - base = mmap_region(NULL, STACK_TOP, PAGE_SIZE, - VM_READ | VM_EXEC | - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, - 0, NULL); + base = do_mmap(NULL, STACK_TOP, PAGE_SIZE, PROT_READ | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_FIXED, 0, 0, &unused, + NULL); if (IS_ERR_VALUE(base)) { ret = base; goto out; -- 2.50.1 From f8d4a6cabb74f82c37ccb7c5e9dc3fdad50393d4 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 2 Jan 2025 12:10:52 +0000 Subject: [PATCH 03/16] mm: make mmap_region() internal Now that we have removed the one user of mmap_region() outside of mm, make it internal and add it to vma.c so it can be userland tested. This ensures that all external memory mappings are performed using the appropriate interfaces and allows us to modify memory mapping logic as we see fit. Additionally expand test stubs to allow for the mmap_region() code to compile and be userland testable. Link: https://lkml.kernel.org/r/de5a3c574d35c26237edf20a1d8652d7305709c9.1735819274.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. 
Howlett Cc: Jann Horn Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 -- mm/mmap.c | 59 ----------------------------- mm/vma.c | 61 +++++++++++++++++++++++++++++- mm/vma.h | 2 +- tools/testing/vma/vma_internal.h | 65 ++++++++++++++++++++++++++++++++ 5 files changed, 126 insertions(+), 64 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3550cbeed488..8483e09aeb2c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3363,9 +3363,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return __get_unmapped_area(file, addr, len, pgoff, flags, 0); } -extern unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, diff --git a/mm/mmap.c b/mm/mmap.c index 3cc8de07411d..cda01071c7b1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1072,65 +1072,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return do_vmi_munmap(&vmi, mm, start, len, uf, false); } -/** - * mmap_region() - Actually perform the userland mapping of a VMA into - * current->mm with known, aligned and overflow-checked @addr and @len, and - * correctly determined VMA flags @vm_flags and page offset @pgoff. - * - * This is an internal memory management function, and should not be used - * directly. - * - * The caller must write-lock current->mm->mmap_lock. - * - * @file: If a file-backed mapping, a pointer to the struct file describing the - * file to be mapped, otherwise NULL. - * @addr: The page-aligned address at which to perform the mapping. - * @len: The page-aligned, non-zero, length of the mapping. - * @vm_flags: The VMA flags which should be applied to the mapping. - * @pgoff: If @file is specified, the page offset into the file, if not then - * the virtual page offset in memory of the anonymous mapping. - * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap - * events. - * - * Returns: Either an error, or the address at which the requested mapping has - * been performed. - */ -unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) -{ - unsigned long ret; - bool writable_file_mapping = false; - - mmap_assert_write_locked(current->mm); - - /* Check to see if MDWE is applicable. */ - if (map_deny_write_exec(vm_flags, vm_flags)) - return -EACCES; - - /* Allow architectures to sanity-check the vm_flags. */ - if (!arch_validate_flags(vm_flags)) - return -EINVAL; - - /* Map writable and ensure this isn't a sealed memfd. */ - if (file && is_shared_maywrite(vm_flags)) { - int error = mapping_map_writable(file->f_mapping); - - if (error) - return error; - writable_file_mapping = true; - } - - ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); - - /* Clear our write mapping regardless of error. 
*/ - if (writable_file_mapping) - mapping_unmap_writable(file->f_mapping); - - validate_mm(current->mm); - return ret; -} - int vm_munmap(unsigned long start, size_t len) { return __vm_munmap(start, len, false); diff --git a/mm/vma.c b/mm/vma.c index 5ef2ea843a8a..af1d549b179c 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -2431,7 +2431,7 @@ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) vma_set_page_prot(vma); } -unsigned long __mmap_region(struct file *file, unsigned long addr, +static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { @@ -2483,6 +2483,65 @@ abort_munmap: return error; } +/** + * mmap_region() - Actually perform the userland mapping of a VMA into + * current->mm with known, aligned and overflow-checked @addr and @len, and + * correctly determined VMA flags @vm_flags and page offset @pgoff. + * + * This is an internal memory management function, and should not be used + * directly. + * + * The caller must write-lock current->mm->mmap_lock. + * + * @file: If a file-backed mapping, a pointer to the struct file describing the + * file to be mapped, otherwise NULL. + * @addr: The page-aligned address at which to perform the mapping. + * @len: The page-aligned, non-zero, length of the mapping. + * @vm_flags: The VMA flags which should be applied to the mapping. + * @pgoff: If @file is specified, the page offset into the file, if not then + * the virtual page offset in memory of the anonymous mapping. + * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap + * events. + * + * Returns: Either an error, or the address at which the requested mapping has + * been performed. + */ +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + unsigned long ret; + bool writable_file_mapping = false; + + mmap_assert_write_locked(current->mm); + + /* Check to see if MDWE is applicable. */ + if (map_deny_write_exec(vm_flags, vm_flags)) + return -EACCES; + + /* Allow architectures to sanity-check the vm_flags. */ + if (!arch_validate_flags(vm_flags)) + return -EINVAL; + + /* Map writable and ensure this isn't a sealed memfd. */ + if (file && is_shared_maywrite(vm_flags)) { + int error = mapping_map_writable(file->f_mapping); + + if (error) + return error; + writable_file_mapping = true; + } + + ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); + + /* Clear our write mapping regardless of error. */ + if (writable_file_mapping) + mapping_unmap_writable(file->f_mapping); + + validate_mm(current->mm); + return ret; +} + /* * do_brk_flags() - Increase the brk vma if the flags match. 
* @vmi: The vma iterator diff --git a/mm/vma.h b/mm/vma.h index 61ed044b6145..a2e8710b8c47 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -241,7 +241,7 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); int mm_take_all_locks(struct mm_struct *mm); void mm_drop_all_locks(struct mm_struct *mm); -unsigned long __mmap_region(struct file *file, unsigned long addr, +unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index ae635eecbfa8..2404347fa2c7 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -41,6 +41,8 @@ extern unsigned long dac_mmap_min_addr; #define VM_BUG_ON(_expr) (BUG_ON(_expr)) #define VM_BUG_ON_VMA(_expr, _vma) (BUG_ON(_expr)) +#define MMF_HAS_MDWE 28 + #define VM_NONE 0x00000000 #define VM_READ 0x00000001 #define VM_WRITE 0x00000002 @@ -226,6 +228,8 @@ struct mm_struct { unsigned long stack_vm; /* VM_STACK */ unsigned long def_flags; + + unsigned long flags; /* Must use atomic bitops to access */ }; struct vma_lock { @@ -1185,4 +1189,65 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } +/* + * Denies creating a writable executable mapping or gaining executable permissions. + * + * This denies the following: + * + * a) mmap(PROT_WRITE | PROT_EXEC) + * + * b) mmap(PROT_WRITE) + * mprotect(PROT_EXEC) + * + * c) mmap(PROT_WRITE) + * mprotect(PROT_READ) + * mprotect(PROT_EXEC) + * + * But allows the following: + * + * d) mmap(PROT_READ | PROT_EXEC) + * mmap(PROT_READ | PROT_EXEC | PROT_BTI) + * + * This is only applicable if the user has set the Memory-Deny-Write-Execute + * (MDWE) protection mask for the current process. + * + * @old specifies the VMA flags the VMA originally possessed, and @new the ones + * we propose to set. + * + * Return: false if proposed change is OK, true if not ok and should be denied. + */ +static inline bool map_deny_write_exec(unsigned long old, unsigned long new) +{ + /* If MDWE is disabled, we have nothing to deny. */ + if (!test_bit(MMF_HAS_MDWE, ¤t->mm->flags)) + return false; + + /* If the new VMA is not executable, we have nothing to deny. */ + if (!(new & VM_EXEC)) + return false; + + /* Under MDWE we do not accept newly writably executable VMAs... */ + if (new & VM_WRITE) + return true; + + /* ...nor previously non-executable VMAs becoming executable. */ + if (!(old & VM_EXEC)) + return true; + + return false; +} + +static inline int mapping_map_writable(struct address_space *mapping) +{ + int c = atomic_read(&mapping->i_mmap_writable); + + /* Derived from the raw_atomic_inc_unless_negative() implementation. */ + do { + if (c < 0) + return -EPERM; + } while (!__sync_bool_compare_and_swap(&mapping->i_mmap_writable, c, c+1)); + + return 0; +} + #endif /* __MM_VMA_INTERNAL_H */ -- 2.50.1 From c6f239796b55dbc4225a6fca9f96232092b9df83 Mon Sep 17 00:00:00 2001 From: Guo Weikang Date: Thu, 2 Jan 2025 15:25:28 +0800 Subject: [PATCH 04/16] mm/memblock: add memblock_alloc_or_panic interface Before SLUB initialization, various subsystems used memblock_alloc to allocate memory. In most cases, when memory allocation fails, an immediate panic is required. To simplify this behavior and reduce repetitive checks, introduce `memblock_alloc_or_panic`. 
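(The diffstat below lists the new helper in include/linux/memblock.h and mm/memblock.c, but those hunks fall beyond this excerpt. A minimal sketch of the intended shape, inferred from the converted call sites that follow; the out-of-line helper name and the exact panic message are assumptions:)

    /* Same arguments as memblock_alloc(); a macro so __func__ names the caller. */
    #define memblock_alloc_or_panic(size, align) \
            __memblock_alloc_or_panic(size, align, __func__)

    void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align,
                                           const char *func)
    {
            void *addr = memblock_alloc(size, align);

            if (unlikely(!addr))
                    panic("%s: Failed to allocate %pa bytes\n", func, &size);
            return addr;
    }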
This function ensures that memory allocation failures result in a panic automatically, improving code readability and consistency across subsystems that require this behavior. [guoweikang.kernel@gmail.com: arch/s390: save_area_alloc default failure behavior changed to panic] Link: https://lkml.kernel.org/r/20250109033136.2845676-1-guoweikang.kernel@gmail.com Link: https://lore.kernel.org/lkml/Z2fknmnNtiZbCc7x@kernel.org/ Link: https://lkml.kernel.org/r/20250102072528.650926-1-guoweikang.kernel@gmail.com Signed-off-by: Guo Weikang Acked-by: Geert Uytterhoeven [m68k] Reviewed-by: Alexander Gordeev [s390] Acked-by: Mike Rapoport (Microsoft) Cc: Alexander Gordeev Signed-off-by: Andrew Morton --- arch/alpha/kernel/core_cia.c | 5 +- arch/alpha/kernel/core_marvel.c | 10 +--- arch/alpha/kernel/pci.c | 13 +---- arch/alpha/kernel/pci_iommu.c | 10 +--- arch/arm/kernel/setup.c | 10 +--- arch/arm/mm/mmu.c | 17 ++---- arch/arm/mm/nommu.c | 5 +- arch/arm64/kernel/setup.c | 4 +- arch/loongarch/kernel/setup.c | 2 +- arch/loongarch/mm/init.c | 13 ++--- arch/m68k/mm/init.c | 5 +- arch/m68k/mm/mcfmmu.c | 10 +--- arch/m68k/mm/motorola.c | 5 +- arch/m68k/mm/sun3mmu.c | 10 +--- arch/m68k/sun3/sun3dvma.c | 6 +-- arch/mips/kernel/setup.c | 5 +- arch/openrisc/mm/ioremap.c | 5 +- arch/parisc/mm/init.c | 20 ++----- arch/powerpc/kernel/dt_cpu_ftrs.c | 10 ++-- arch/powerpc/kernel/pci_32.c | 5 +- arch/powerpc/kernel/setup-common.c | 5 +- arch/powerpc/kernel/setup_32.c | 8 +-- arch/powerpc/mm/book3s32/mmu.c | 5 +- arch/powerpc/mm/book3s64/pgtable.c | 6 +-- arch/powerpc/mm/kasan/init_book3e_64.c | 8 +-- arch/powerpc/mm/kasan/init_book3s_64.c | 2 +- arch/powerpc/mm/nohash/mmu_context.c | 16 ++---- arch/powerpc/mm/pgtable_32.c | 7 +-- arch/powerpc/platforms/powermac/nvram.c | 5 +- arch/powerpc/platforms/powernv/opal.c | 5 +- arch/powerpc/platforms/ps3/setup.c | 5 +- arch/powerpc/sysdev/msi_bitmap.c | 5 +- arch/riscv/kernel/setup.c | 4 +- arch/riscv/mm/kasan_init.c | 14 ++--- arch/s390/kernel/crash_dump.c | 4 +- arch/s390/kernel/numa.c | 8 +-- arch/s390/kernel/setup.c | 20 ++----- arch/s390/kernel/smp.c | 11 +--- arch/s390/kernel/topology.c | 10 +--- arch/sh/mm/init.c | 10 +--- arch/sparc/kernel/prom_32.c | 4 +- arch/sparc/mm/srmmu.c | 14 ++--- arch/um/drivers/net_kern.c | 5 +- arch/um/drivers/vector_kern.c | 5 +- arch/um/kernel/load_file.c | 4 +- arch/x86/coco/sev/core.c | 4 +- arch/x86/kernel/acpi/boot.c | 5 +- arch/x86/kernel/apic/io_apic.c | 9 +--- arch/x86/kernel/e820.c | 5 +- arch/x86/platform/olpc/olpc_dt.c | 6 +-- arch/x86/xen/p2m.c | 8 +-- arch/xtensa/mm/kasan_init.c | 6 +-- drivers/clk/ti/clk.c | 5 +- drivers/macintosh/smu.c | 6 +-- drivers/of/fdt.c | 8 +-- drivers/of/unittest.c | 8 +-- include/linux/memblock.h | 6 +++ init/main.c | 18 ++----- kernel/power/snapshot.c | 5 +- lib/cpumask.c | 5 +- mm/kmsan/shadow.c | 8 +-- mm/memblock.c | 20 +++++++ mm/numa.c | 8 +-- mm/percpu.c | 70 +++++-------------------- mm/sparse.c | 5 +- 65 files changed, 144 insertions(+), 431 deletions(-) diff --git a/arch/alpha/kernel/core_cia.c b/arch/alpha/kernel/core_cia.c index ca3d9c732b61..6e577228e175 100644 --- a/arch/alpha/kernel/core_cia.c +++ b/arch/alpha/kernel/core_cia.c @@ -331,10 +331,7 @@ cia_prepare_tbia_workaround(int window) long i; /* Use minimal 1K map. 
*/ - ppte = memblock_alloc(CIA_BROKEN_TBIA_SIZE, 32768); - if (!ppte) - panic("%s: Failed to allocate %u bytes align=0x%x\n", - __func__, CIA_BROKEN_TBIA_SIZE, 32768); + ppte = memblock_alloc_or_panic(CIA_BROKEN_TBIA_SIZE, 32768); pte = (virt_to_phys(ppte) >> (PAGE_SHIFT - 1)) | 1; for (i = 0; i < CIA_BROKEN_TBIA_SIZE / sizeof(unsigned long); ++i) diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index b22248044bf0..b1bfbd11980d 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -81,10 +81,7 @@ mk_resource_name(int pe, int port, char *str) char *name; sprintf(tmp, "PCI %s PE %d PORT %d", str, pe, port); - name = memblock_alloc(strlen(tmp) + 1, SMP_CACHE_BYTES); - if (!name) - panic("%s: Failed to allocate %zu bytes\n", __func__, - strlen(tmp) + 1); + name = memblock_alloc_or_panic(strlen(tmp) + 1, SMP_CACHE_BYTES); strcpy(name, tmp); return name; @@ -119,10 +116,7 @@ alloc_io7(unsigned int pe) return NULL; } - io7 = memblock_alloc(sizeof(*io7), SMP_CACHE_BYTES); - if (!io7) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*io7)); + io7 = memblock_alloc_or_panic(sizeof(*io7), SMP_CACHE_BYTES); io7->pe = pe; raw_spin_lock_init(&io7->irq_lock); diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c index 4458eb7f44f0..8e9b4ac86b7e 100644 --- a/arch/alpha/kernel/pci.c +++ b/arch/alpha/kernel/pci.c @@ -391,10 +391,7 @@ alloc_pci_controller(void) { struct pci_controller *hose; - hose = memblock_alloc(sizeof(*hose), SMP_CACHE_BYTES); - if (!hose) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*hose)); + hose = memblock_alloc_or_panic(sizeof(*hose), SMP_CACHE_BYTES); *hose_tail = hose; hose_tail = &hose->next; @@ -405,13 +402,7 @@ alloc_pci_controller(void) struct resource * __init alloc_resource(void) { - void *ptr = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); - - if (!ptr) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct resource)); - - return ptr; + return memblock_alloc_or_panic(sizeof(struct resource), SMP_CACHE_BYTES); } diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 7fcf3e9b7103..681f56089d9c 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -71,14 +71,8 @@ iommu_arena_new_node(int nid, struct pci_controller *hose, dma_addr_t base, if (align < mem_size) align = mem_size; - arena = memblock_alloc(sizeof(*arena), SMP_CACHE_BYTES); - if (!arena) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*arena)); - arena->ptes = memblock_alloc(mem_size, align); - if (!arena->ptes) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, mem_size, align); + arena = memblock_alloc_or_panic(sizeof(*arena), SMP_CACHE_BYTES); + arena->ptes = memblock_alloc_or_panic(mem_size, align); spin_lock_init(&arena->lock); arena->hose = hose; diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index e6a857bf0ce6..a41c93988d2c 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -880,10 +880,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) */ boot_alias_start = phys_to_idmap(start); if (arm_has_idmap_alias() && boot_alias_start != IDMAP_INVALID_ADDR) { - res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", - __func__, sizeof(*res)); + res = memblock_alloc_or_panic(sizeof(*res), SMP_CACHE_BYTES); res->name = "System RAM (boot alias)"; res->start = boot_alias_start; 
res->end = phys_to_idmap(res_end); @@ -891,10 +888,7 @@ static void __init request_standard_resources(const struct machine_desc *mdesc) request_resource(&iomem_resource, res); } - res = memblock_alloc(sizeof(*res), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*res)); + res = memblock_alloc_or_panic(sizeof(*res), SMP_CACHE_BYTES); res->name = "System RAM"; res->start = start; res->end = res_end; diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index f5b7a16c5803..f02f872ea8a9 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -726,13 +726,8 @@ EXPORT_SYMBOL(phys_mem_access_prot); static void __init *early_alloc(unsigned long sz) { - void *ptr = memblock_alloc(sz, sz); + return memblock_alloc_or_panic(sz, sz); - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, sz, sz); - - return ptr; } static void *__init late_alloc(unsigned long sz) @@ -1027,10 +1022,7 @@ void __init iotable_init(struct map_desc *io_desc, int nr) if (!nr) return; - svm = memblock_alloc(sizeof(*svm) * nr, __alignof__(*svm)); - if (!svm) - panic("%s: Failed to allocate %zu bytes align=0x%zx\n", - __func__, sizeof(*svm) * nr, __alignof__(*svm)); + svm = memblock_alloc_or_panic(sizeof(*svm) * nr, __alignof__(*svm)); for (md = io_desc; nr; md++, nr--) { create_mapping(md); @@ -1052,10 +1044,7 @@ void __init vm_reserve_area_early(unsigned long addr, unsigned long size, struct vm_struct *vm; struct static_vm *svm; - svm = memblock_alloc(sizeof(*svm), __alignof__(*svm)); - if (!svm) - panic("%s: Failed to allocate %zu bytes align=0x%zx\n", - __func__, sizeof(*svm), __alignof__(*svm)); + svm = memblock_alloc_or_panic(sizeof(*svm), __alignof__(*svm)); vm = &svm->vm; vm->addr = (void *)addr; diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index c415f3859b20..1a8f6914ee59 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -162,10 +162,7 @@ void __init paging_init(const struct machine_desc *mdesc) mpu_setup(); /* allocate the zero page. 
*/ - zero_page = (void *)memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + zero_page = (void *)memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); bootmem_init(); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 4f613e8e0745..85104587f849 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -223,9 +223,7 @@ static void __init request_standard_resources(void) num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); - standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES); - if (!standard_resources) - panic("%s: Failed to allocate %zu bytes\n", __func__, res_size); + standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); for_each_mem_region(region) { res = &standard_resources[i++]; diff --git a/arch/loongarch/kernel/setup.c b/arch/loongarch/kernel/setup.c index 56934fe58170..edcfdfcad7d2 100644 --- a/arch/loongarch/kernel/setup.c +++ b/arch/loongarch/kernel/setup.c @@ -431,7 +431,7 @@ static void __init resource_init(void) num_standard_resources = memblock.memory.cnt; res_size = num_standard_resources * sizeof(*standard_resources); - standard_resources = memblock_alloc(res_size, SMP_CACHE_BYTES); + standard_resources = memblock_alloc_or_panic(res_size, SMP_CACHE_BYTES); for_each_mem_region(region) { res = &standard_resources[i++]; diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index 188b52bbb254..ca5aa5f46a9f 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -174,9 +174,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr) pmd_t *pmd; if (p4d_none(p4dp_get(p4d))) { - pud = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pud) - panic("%s: Failed to allocate memory\n", __func__); + pud = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); p4d_populate(&init_mm, p4d, pud); #ifndef __PAGETABLE_PUD_FOLDED pud_init(pud); @@ -185,9 +183,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr) pud = pud_offset(p4d, addr); if (pud_none(pudp_get(pud))) { - pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pmd) - panic("%s: Failed to allocate memory\n", __func__); + pmd = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pud_populate(&init_mm, pud, pmd); #ifndef __PAGETABLE_PMD_FOLDED pmd_init(pmd); @@ -198,10 +194,7 @@ pte_t * __init populate_kernel_pte(unsigned long addr) if (!pmd_present(pmdp_get(pmd))) { pte_t *pte; - pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("%s: Failed to allocate memory\n", __func__); - + pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pmd_populate_kernel(&init_mm, pmd, pte); kernel_pte_init(pte); } diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index 1b47bec15832..8b11d0d545aa 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -68,10 +68,7 @@ void __init paging_init(void) high_memory = (void *) end_mem; - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); max_zone_pfn[ZONE_DMA] = end_mem >> PAGE_SHIFT; free_area_init(max_zone_pfn); } diff --git a/arch/m68k/mm/mcfmmu.c b/arch/m68k/mm/mcfmmu.c index 9a6fa342e872..19a75029036c 100644 --- a/arch/m68k/mm/mcfmmu.c +++ b/arch/m68k/mm/mcfmmu.c @@ -42,20 +42,14 @@ void __init paging_init(void) unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; int i; - 
empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pg_dir = swapper_pg_dir; memset(swapper_pg_dir, 0, sizeof(swapper_pg_dir)); size = num_pages * sizeof(pte_t); size = (size + PAGE_SIZE) & ~(PAGE_SIZE-1); - next_pgtable = (unsigned long) memblock_alloc(size, PAGE_SIZE); - if (!next_pgtable) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, PAGE_SIZE); + next_pgtable = (unsigned long) memblock_alloc_or_panic(size, PAGE_SIZE); pg_dir += PAGE_OFFSET >> PGDIR_SHIFT; diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 0fb8b098d926..73651e093c4d 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -500,10 +500,7 @@ void __init paging_init(void) * initialize the bad page table and bad page to point * to a couple of allocated pages */ - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); /* * Set up SFC/DFC registers diff --git a/arch/m68k/mm/sun3mmu.c b/arch/m68k/mm/sun3mmu.c index 494739c1783e..1ecf6bdd08bf 100644 --- a/arch/m68k/mm/sun3mmu.c +++ b/arch/m68k/mm/sun3mmu.c @@ -44,10 +44,7 @@ void __init paging_init(void) unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0, }; unsigned long size; - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); address = PAGE_OFFSET; pg_dir = swapper_pg_dir; @@ -57,10 +54,7 @@ void __init paging_init(void) size = num_pages * sizeof(pte_t); size = (size + PAGE_SIZE) & ~(PAGE_SIZE-1); - next_pgtable = (unsigned long)memblock_alloc(size, PAGE_SIZE); - if (!next_pgtable) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, PAGE_SIZE); + next_pgtable = (unsigned long)memblock_alloc_or_panic(size, PAGE_SIZE); bootmem_end = (next_pgtable + size + PAGE_SIZE) & PAGE_MASK; /* Map whole memory from PAGE_OFFSET (0x0E000000) */ diff --git a/arch/m68k/sun3/sun3dvma.c b/arch/m68k/sun3/sun3dvma.c index 6ebf52740ad7..225fc735e466 100644 --- a/arch/m68k/sun3/sun3dvma.c +++ b/arch/m68k/sun3/sun3dvma.c @@ -252,12 +252,8 @@ void __init dvma_init(void) list_add(&(hole->list), &hole_list); - iommu_use = memblock_alloc(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long), + iommu_use = memblock_alloc_or_panic(IOMMU_TOTAL_ENTRIES * sizeof(unsigned long), SMP_CACHE_BYTES); - if (!iommu_use) - panic("%s: Failed to allocate %zu bytes\n", __func__, - IOMMU_TOTAL_ENTRIES * sizeof(unsigned long)); - dvma_unmap_iommu(DVMA_START, DVMA_SIZE); sun3_dvma_init(); diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 12a1a4ffb602..fbfe0771317e 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -704,10 +704,7 @@ static void __init resource_init(void) for_each_mem_range(i, &start, &end) { struct resource *res; - res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct resource)); + res = memblock_alloc_or_panic(sizeof(struct resource), SMP_CACHE_BYTES); res->start = start; /* diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index 
f59ea4c10b0f..8e63e86251ca 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -38,10 +38,7 @@ pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm) if (likely(mem_init_done)) { pte = (pte_t *)get_zeroed_page(GFP_KERNEL); } else { - pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } return pte; diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 96970fa75e4a..61c0a2477072 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -377,10 +377,8 @@ static void __ref map_pages(unsigned long start_vaddr, #if CONFIG_PGTABLE_LEVELS == 3 if (pud_none(*pud)) { - pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER, + pmd = memblock_alloc_or_panic(PAGE_SIZE << PMD_TABLE_ORDER, PAGE_SIZE << PMD_TABLE_ORDER); - if (!pmd) - panic("pmd allocation failed.\n"); pud_populate(NULL, pud, pmd); } #endif @@ -388,9 +386,7 @@ static void __ref map_pages(unsigned long start_vaddr, pmd = pmd_offset(pud, vaddr); for (tmp1 = start_pmd; tmp1 < PTRS_PER_PMD; tmp1++, pmd++) { if (pmd_none(*pmd)) { - pg_table = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pg_table) - panic("page table allocation failed\n"); + pg_table = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pmd_populate_kernel(NULL, pmd, pg_table); } @@ -648,9 +644,7 @@ static void __init pagetable_init(void) } #endif - empty_zero_page = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!empty_zero_page) - panic("zero page allocation failed.\n"); + empty_zero_page = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } @@ -687,19 +681,15 @@ static void __init fixmap_init(void) #if CONFIG_PGTABLE_LEVELS == 3 if (pud_none(*pud)) { - pmd = memblock_alloc(PAGE_SIZE << PMD_TABLE_ORDER, + pmd = memblock_alloc_or_panic(PAGE_SIZE << PMD_TABLE_ORDER, PAGE_SIZE << PMD_TABLE_ORDER); - if (!pmd) - panic("fixmap: pmd allocation failed.\n"); pud_populate(NULL, pud, pmd); } #endif pmd = pmd_offset(pud, addr); do { - pte_t *pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("fixmap: pte allocation failed.\n"); + pte_t *pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pmd_populate_kernel(&init_mm, pmd, pte); diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c b/arch/powerpc/kernel/dt_cpu_ftrs.c index 1bee15c013e7..3af6c06af02f 100644 --- a/arch/powerpc/kernel/dt_cpu_ftrs.c +++ b/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -1087,12 +1087,10 @@ static int __init dt_cpu_ftrs_scan_callback(unsigned long node, const char /* Count and allocate space for cpu features */ of_scan_flat_dt_subnodes(node, count_cpufeatures_subnodes, &nr_dt_cpu_features); - dt_cpu_features = memblock_alloc(sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, PAGE_SIZE); - if (!dt_cpu_features) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, - sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, - PAGE_SIZE); + dt_cpu_features = + memblock_alloc_or_panic( + sizeof(struct dt_cpu_feature) * nr_dt_cpu_features, + PAGE_SIZE); cpufeatures_setup_start(isa); diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index ce0c8623e563..f8a3bd8cfae4 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -213,11 +213,8 @@ pci_create_OF_bus_map(void) struct property* of_prop; struct device_node *dn; - of_prop = memblock_alloc(sizeof(struct property) + 256, + of_prop = memblock_alloc_or_panic(sizeof(struct property) + 256, SMP_CACHE_BYTES); - if (!of_prop) - panic("%s: 
Failed to allocate %zu bytes\n", __func__, - sizeof(struct property) + 256); dn = of_find_node_by_path("/"); if (dn) { memset(of_prop, -1, sizeof(struct property) + 256); diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 6fa179448c33..f3ea1329c566 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -458,11 +458,8 @@ void __init smp_setup_cpu_maps(void) DBG("smp_setup_cpu_maps()\n"); - cpu_to_phys_id = memblock_alloc(nr_cpu_ids * sizeof(u32), + cpu_to_phys_id = memblock_alloc_or_panic(nr_cpu_ids * sizeof(u32), __alignof__(u32)); - if (!cpu_to_phys_id) - panic("%s: Failed to allocate %zu bytes align=0x%zx\n", - __func__, nr_cpu_ids * sizeof(u32), __alignof__(u32)); for_each_node_by_type(dn, "cpu") { const __be32 *intserv; diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 75dbf3e0d9c4..5a1bf501fbe1 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -140,13 +140,7 @@ arch_initcall(ppc_init); static void *__init alloc_stack(void) { - void *ptr = memblock_alloc(THREAD_SIZE, THREAD_ALIGN); - - if (!ptr) - panic("cannot allocate %d bytes for stack at %pS\n", - THREAD_SIZE, (void *)_RET_IP_); - - return ptr; + return memblock_alloc_or_panic(THREAD_SIZE, THREAD_ALIGN); } void __init irqstack_early_init(void) diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 6978344edcb4..be9c4106e22f 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -377,10 +377,7 @@ void __init MMU_init_hw(void) * Find some memory for the hash table. */ if ( ppc_md.progress ) ppc_md.progress("hash:find piece", 0x322); - Hash = memblock_alloc(Hash_size, Hash_size); - if (!Hash) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, Hash_size, Hash_size); + Hash = memblock_alloc_or_panic(Hash_size, Hash_size); _SDR1 = __pa(Hash) | SDR1_LOW_BITS; pr_info("Total memory = %lldMB; using %ldkB for hash table\n", diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 3f28e4acd920..ce64abea9e3e 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -330,11 +330,7 @@ void __init mmu_partition_table_init(void) unsigned long ptcr; /* Initialize the Partition Table with no entries */ - partition_tb = memblock_alloc(patb_size, patb_size); - if (!partition_tb) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, patb_size, patb_size); - + partition_tb = memblock_alloc_or_panic(patb_size, patb_size); ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); set_ptcr_when_no_uv(ptcr); powernv_set_nmmu_ptcr(ptcr); diff --git a/arch/powerpc/mm/kasan/init_book3e_64.c b/arch/powerpc/mm/kasan/init_book3e_64.c index 43c03b84ff32..60c78aac0f63 100644 --- a/arch/powerpc/mm/kasan/init_book3e_64.c +++ b/arch/powerpc/mm/kasan/init_book3e_64.c @@ -40,19 +40,19 @@ static int __init kasan_map_kernel_page(unsigned long ea, unsigned long pa, pgpr pgdp = pgd_offset_k(ea); p4dp = p4d_offset(pgdp, ea); if (kasan_pud_table(*p4dp)) { - pudp = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + pudp = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); memcpy(pudp, kasan_early_shadow_pud, PUD_TABLE_SIZE); p4d_populate(&init_mm, p4dp, pudp); } pudp = pud_offset(p4dp, ea); if (kasan_pmd_table(*pudp)) { - pmdp = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + pmdp = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE); memcpy(pmdp, kasan_early_shadow_pmd, 
PMD_TABLE_SIZE); pud_populate(&init_mm, pudp, pmdp); } pmdp = pmd_offset(pudp, ea); if (kasan_pte_table(*pmdp)) { - ptep = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + ptep = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE); memcpy(ptep, kasan_early_shadow_pte, PTE_TABLE_SIZE); pmd_populate_kernel(&init_mm, pmdp, ptep); } @@ -74,7 +74,7 @@ static void __init kasan_init_phys_region(void *start, void *end) k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); - va = memblock_alloc(k_end - k_start, PAGE_SIZE); + va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE); for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) kasan_map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); } diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c index 3fb5ce4f48f4..7d959544c077 100644 --- a/arch/powerpc/mm/kasan/init_book3s_64.c +++ b/arch/powerpc/mm/kasan/init_book3s_64.c @@ -32,7 +32,7 @@ static void __init kasan_init_phys_region(void *start, void *end) k_start = ALIGN_DOWN((unsigned long)kasan_mem_to_shadow(start), PAGE_SIZE); k_end = ALIGN((unsigned long)kasan_mem_to_shadow(end), PAGE_SIZE); - va = memblock_alloc(k_end - k_start, PAGE_SIZE); + va = memblock_alloc_or_panic(k_end - k_start, PAGE_SIZE); for (k_cur = k_start; k_cur < k_end; k_cur += PAGE_SIZE, va += PAGE_SIZE) map_kernel_page(k_cur, __pa(va), PAGE_KERNEL); } diff --git a/arch/powerpc/mm/nohash/mmu_context.c b/arch/powerpc/mm/nohash/mmu_context.c index 0b181da40ddb..a1a4e697251a 100644 --- a/arch/powerpc/mm/nohash/mmu_context.c +++ b/arch/powerpc/mm/nohash/mmu_context.c @@ -385,21 +385,11 @@ void __init mmu_context_init(void) /* * Allocate the maps used by context management */ - context_map = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); - if (!context_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - CTX_MAP_SIZE); - context_mm = memblock_alloc(sizeof(void *) * (LAST_CONTEXT + 1), + context_map = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES); + context_mm = memblock_alloc_or_panic(sizeof(void *) * (LAST_CONTEXT + 1), SMP_CACHE_BYTES); - if (!context_mm) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(void *) * (LAST_CONTEXT + 1)); if (IS_ENABLED(CONFIG_SMP)) { - stale_map[boot_cpuid] = memblock_alloc(CTX_MAP_SIZE, SMP_CACHE_BYTES); - if (!stale_map[boot_cpuid]) - panic("%s: Failed to allocate %zu bytes\n", __func__, - CTX_MAP_SIZE); - + stale_map[boot_cpuid] = memblock_alloc_or_panic(CTX_MAP_SIZE, SMP_CACHE_BYTES); cpuhp_setup_state_nocalls(CPUHP_POWERPC_MMU_CTX_PREPARE, "powerpc/mmu/ctx:prepare", mmu_ctx_cpu_prepare, mmu_ctx_cpu_dead); diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index 787b22206386..15276068f657 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -50,13 +50,8 @@ notrace void __init early_ioremap_init(void) void __init *early_alloc_pgtable(unsigned long size) { - void *ptr = memblock_alloc(size, size); + return memblock_alloc_or_panic(size, size); - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, size); - - return ptr; } pte_t __init *early_pte_alloc_kernel(pmd_t *pmdp, unsigned long va) diff --git a/arch/powerpc/platforms/powermac/nvram.c b/arch/powerpc/platforms/powermac/nvram.c index fe2e0249cbc2..a112d26185a0 100644 --- a/arch/powerpc/platforms/powermac/nvram.c +++ b/arch/powerpc/platforms/powermac/nvram.c @@ -514,10 +514,7 @@ 
static int __init core99_nvram_setup(struct device_node *dp, unsigned long addr) printk(KERN_ERR "nvram: no address\n"); return -EINVAL; } - nvram_image = memblock_alloc(NVRAM_SIZE, SMP_CACHE_BYTES); - if (!nvram_image) - panic("%s: Failed to allocate %u bytes\n", __func__, - NVRAM_SIZE); + nvram_image = memblock_alloc_or_panic(NVRAM_SIZE, SMP_CACHE_BYTES); nvram_data = ioremap(addr, NVRAM_SIZE*2); nvram_naddrs = 1; /* Make sure we get the correct case */ diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 5d0f35bb917e..09bd93464b4f 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -180,10 +180,7 @@ int __init early_init_dt_scan_recoverable_ranges(unsigned long node, /* * Allocate a buffer to hold the MC recoverable ranges. */ - mc_recoverable_range = memblock_alloc(size, __alignof__(u64)); - if (!mc_recoverable_range) - panic("%s: Failed to allocate %u bytes align=0x%lx\n", - __func__, size, __alignof__(u64)); + mc_recoverable_range = memblock_alloc_or_panic(size, __alignof__(u64)); for (i = 0; i < mc_recoverable_range_len; i++) { mc_recoverable_range[i].start_addr = diff --git a/arch/powerpc/platforms/ps3/setup.c b/arch/powerpc/platforms/ps3/setup.c index 5144f11359f7..150c09b58ae8 100644 --- a/arch/powerpc/platforms/ps3/setup.c +++ b/arch/powerpc/platforms/ps3/setup.c @@ -115,10 +115,7 @@ static void __init prealloc(struct ps3_prealloc *p) if (!p->size) return; - p->address = memblock_alloc(p->size, p->align); - if (!p->address) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, p->size, p->align); + p->address = memblock_alloc_or_panic(p->size, p->align); printk(KERN_INFO "%s: %lu bytes at %p\n", p->name, p->size, p->address); diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index 0b6e37f3ffb8..456a4f64ae0a 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -124,10 +124,7 @@ int __ref msi_bitmap_alloc(struct msi_bitmap *bmp, unsigned int irq_count, if (bmp->bitmap_from_slab) bmp->bitmap = kzalloc(size, GFP_KERNEL); else { - bmp->bitmap = memblock_alloc(size, SMP_CACHE_BYTES); - if (!bmp->bitmap) - panic("%s: Failed to allocate %u bytes\n", __func__, - size); + bmp->bitmap = memblock_alloc_or_panic(size, SMP_CACHE_BYTES); /* the bitmap won't be freed from memblock allocator */ kmemleak_not_leak(bmp->bitmap); } diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 45010e71df86..f1793630fc51 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -147,9 +147,7 @@ static void __init init_resources(void) res_idx = num_resources - 1; mem_res_sz = num_resources * sizeof(*mem_res); - mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES); - if (!mem_res) - panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz); + mem_res = memblock_alloc_or_panic(mem_res_sz, SMP_CACHE_BYTES); /* * Start by adding the reserved regions, if they overlap diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index c301c8d291d2..41c635d6aca4 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -32,7 +32,7 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned pte_t *ptep, *p; if (pmd_none(pmdp_get(pmd))) { - p = memblock_alloc(PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE); + p = memblock_alloc_or_panic(PTRS_PER_PTE * sizeof(pte_t), PAGE_SIZE); set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -54,7 +54,7 @@ static void 
__init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned unsigned long next; if (pud_none(pudp_get(pud))) { - p = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); + p = memblock_alloc_or_panic(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE); set_pud(pud, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -85,7 +85,7 @@ static void __init kasan_populate_pud(p4d_t *p4d, unsigned long next; if (p4d_none(p4dp_get(p4d))) { - p = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); + p = memblock_alloc_or_panic(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE); set_p4d(p4d, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -116,7 +116,7 @@ static void __init kasan_populate_p4d(pgd_t *pgd, unsigned long next; if (pgd_none(pgdp_get(pgd))) { - p = memblock_alloc(PTRS_PER_P4D * sizeof(p4d_t), PAGE_SIZE); + p = memblock_alloc_or_panic(PTRS_PER_P4D * sizeof(p4d_t), PAGE_SIZE); set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); } @@ -385,7 +385,7 @@ static void __init kasan_shallow_populate_pud(p4d_t *p4d, next = pud_addr_end(vaddr, end); if (pud_none(pudp_get(pud_k))) { - p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + p = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); set_pud(pud_k, pfn_pud(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; } @@ -405,7 +405,7 @@ static void __init kasan_shallow_populate_p4d(pgd_t *pgd, next = p4d_addr_end(vaddr, end); if (p4d_none(p4dp_get(p4d_k))) { - p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + p = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); set_p4d(p4d_k, pfn_p4d(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; } @@ -424,7 +424,7 @@ static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long next = pgd_addr_end(vaddr, end); if (pgd_none(pgdp_get(pgd_k))) { - p = memblock_alloc(PAGE_SIZE, PAGE_SIZE); + p = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE)); continue; } diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index cd0c93a8fb8b..dc7328fd2ec4 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -63,9 +63,7 @@ struct save_area * __init save_area_alloc(bool is_boot_cpu) { struct save_area *sa; - sa = memblock_alloc(sizeof(*sa), 8); - if (!sa) - return NULL; + sa = memblock_alloc_or_panic(sizeof(*sa), 8); if (is_boot_cpu) list_add(&sa->list, &dump_save_areas); diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index ddc1448ea2e1..2fc40f97c0ad 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -21,12 +21,8 @@ void __init numa_setup(void) nodes_clear(node_possible_map); node_set(0, node_possible_map); node_set_online(0); - for (nid = 0; nid < MAX_NUMNODES; nid++) { - NODE_DATA(nid) = memblock_alloc(sizeof(pg_data_t), 8); - if (!NODE_DATA(nid)) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(pg_data_t), 8); - } + for (nid = 0; nid < MAX_NUMNODES; nid++) + NODE_DATA(nid) = memblock_alloc_or_panic(sizeof(pg_data_t), 8); NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; NODE_DATA(0)->node_id = 0; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index a3fea683b227..f873535eddd2 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -384,11 +384,7 @@ static unsigned long __init stack_alloc_early(void) { unsigned long stack; - stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE); - if (!stack) { - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, THREAD_SIZE, THREAD_SIZE); - } + stack = (unsigned 
long)memblock_alloc_or_panic(THREAD_SIZE, THREAD_SIZE); return stack; } @@ -512,10 +508,7 @@ static void __init setup_resources(void) bss_resource.end = __pa_symbol(__bss_stop) - 1; for_each_mem_range(i, &start, &end) { - res = memblock_alloc(sizeof(*res), 8); - if (!res) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*res), 8); + res = memblock_alloc_or_panic(sizeof(*res), 8); res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; @@ -534,10 +527,7 @@ static void __init setup_resources(void) std_res->start > res->end) continue; if (std_res->end > res->end) { - sub_res = memblock_alloc(sizeof(*sub_res), 8); - if (!sub_res) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*sub_res), 8); + sub_res = memblock_alloc_or_panic(sizeof(*sub_res), 8); *sub_res = *std_res; sub_res->end = res->end; std_res->start = res->end + 1; @@ -824,9 +814,7 @@ static void __init setup_randomness(void) { struct sysinfo_3_2_2 *vmms; - vmms = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!vmms) - panic("Failed to allocate memory for sysinfo structure\n"); + vmms = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); if (stsi(vmms, 3, 2, 2) == 0 && vmms->count) add_device_randomness(&vmms->vm, sizeof(vmms->vm[0]) * vmms->count); memblock_free(vmms, PAGE_SIZE); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 822d8e6f8717..7b08399b0846 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -611,9 +611,7 @@ void __init smp_save_dump_ipl_cpu(void) if (!dump_available()) return; sa = save_area_alloc(true); - regs = memblock_alloc(512, 8); - if (!sa || !regs) - panic("could not allocate memory for boot CPU save area\n"); + regs = memblock_alloc_or_panic(512, 8); copy_oldmem_kernel(regs, __LC_FPREGS_SAVE_AREA, 512); save_area_add_regs(sa, regs); memblock_free(regs, 512); @@ -646,8 +644,6 @@ void __init smp_save_dump_secondary_cpus(void) SIGP_CC_NOT_OPERATIONAL) continue; sa = save_area_alloc(false); - if (!sa) - panic("could not allocate memory for save area\n"); __pcpu_sigp_relax(addr, SIGP_STORE_STATUS_AT_ADDRESS, __pa(page)); save_area_add_regs(sa, page); if (cpu_has_vx()) { @@ -792,10 +788,7 @@ void __init smp_detect_cpus(void) u16 address; /* Get CPU information */ - info = memblock_alloc(sizeof(*info), 8); - if (!info) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*info), 8); + info = memblock_alloc_or_panic(sizeof(*info), 8); smp_get_core_info(info, 1); /* Find boot CPU type */ if (sclp.has_core_type) { diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 4f9c301a705b..45e220bfce75 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -548,10 +548,7 @@ static void __init alloc_masks(struct sysinfo_15_1_x *info, nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i]; nr_masks = max(nr_masks, 1); for (i = 0; i < nr_masks; i++) { - mask->next = memblock_alloc(sizeof(*mask->next), 8); - if (!mask->next) - panic("%s: Failed to allocate %zu bytes align=0x%x\n", - __func__, sizeof(*mask->next), 8); + mask->next = memblock_alloc_or_panic(sizeof(*mask->next), 8); mask = mask->next; } } @@ -569,10 +566,7 @@ void __init topology_init_early(void) } if (!MACHINE_HAS_TOPOLOGY) goto out; - tl_info = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!tl_info) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + tl_info = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); info = tl_info; store_topology(info); 
pr_info("The CPU configuration topology of the machine is: %d %d %d %d %d %d / %d\n", diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 2a88b0c9e70f..289a2fecebef 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -137,10 +137,7 @@ static pmd_t * __init one_md_table_init(pud_t *pud) if (pud_none(*pud)) { pmd_t *pmd; - pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pmd) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + pmd = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pud_populate(&init_mm, pud, pmd); BUG_ON(pmd != pmd_offset(pud, 0)); } @@ -153,10 +150,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) if (pmd_none(*pmd)) { pte_t *pte; - pte = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!pte) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); + pte = memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); pmd_populate_kernel(&init_mm, pmd, pte); BUG_ON(pte != pte_offset_kernel(pmd, 0)); } diff --git a/arch/sparc/kernel/prom_32.c b/arch/sparc/kernel/prom_32.c index 3df960c137f7..a67dd67f10c8 100644 --- a/arch/sparc/kernel/prom_32.c +++ b/arch/sparc/kernel/prom_32.c @@ -28,9 +28,7 @@ void * __init prom_early_alloc(unsigned long size) { void *ret; - ret = memblock_alloc(size, SMP_CACHE_BYTES); - if (!ret) - panic("%s: Failed to allocate %lu bytes\n", __func__, size); + ret = memblock_alloc_or_panic(size, SMP_CACHE_BYTES); prom_early_allocated += size; diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index e3a72c884b86..dd32711022f5 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -277,19 +277,13 @@ static void __init srmmu_nocache_init(void) bitmap_bits = srmmu_nocache_size >> SRMMU_NOCACHE_BITMAP_SHIFT; - srmmu_nocache_pool = memblock_alloc(srmmu_nocache_size, + srmmu_nocache_pool = memblock_alloc_or_panic(srmmu_nocache_size, SRMMU_NOCACHE_ALIGN_MAX); - if (!srmmu_nocache_pool) - panic("%s: Failed to allocate %lu bytes align=0x%x\n", - __func__, srmmu_nocache_size, SRMMU_NOCACHE_ALIGN_MAX); memset(srmmu_nocache_pool, 0, srmmu_nocache_size); srmmu_nocache_bitmap = - memblock_alloc(BITS_TO_LONGS(bitmap_bits) * sizeof(long), + memblock_alloc_or_panic(BITS_TO_LONGS(bitmap_bits) * sizeof(long), SMP_CACHE_BYTES); - if (!srmmu_nocache_bitmap) - panic("%s: Failed to allocate %zu bytes\n", __func__, - BITS_TO_LONGS(bitmap_bits) * sizeof(long)); bit_map_init(&srmmu_nocache_map, srmmu_nocache_bitmap, bitmap_bits); srmmu_swapper_pg_dir = __srmmu_get_nocache(SRMMU_PGD_TABLE_SIZE, SRMMU_PGD_TABLE_SIZE); @@ -452,9 +446,7 @@ static void __init sparc_context_init(int numctx) unsigned long size; size = numctx * sizeof(struct ctx_list); - ctx_list_pool = memblock_alloc(size, SMP_CACHE_BYTES); - if (!ctx_list_pool) - panic("%s: Failed to allocate %lu bytes\n", __func__, size); + ctx_list_pool = memblock_alloc_or_panic(size, SMP_CACHE_BYTES); for (ctx = 0; ctx < numctx; ctx++) { struct ctx_list *clist; diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 75d04fb4994a..d5a9c5aabaec 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -636,10 +636,7 @@ static int __init eth_setup(char *str) return 1; } - new = memblock_alloc(sizeof(*new), SMP_CACHE_BYTES); - if (!new) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*new)); + new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); INIT_LIST_HEAD(&new->list); new->index = n; diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c index 
64c09db392c1..85b129e2b70b 100644 --- a/arch/um/drivers/vector_kern.c +++ b/arch/um/drivers/vector_kern.c @@ -1694,10 +1694,7 @@ static int __init vector_setup(char *str) str, error); return 1; } - new = memblock_alloc(sizeof(*new), SMP_CACHE_BYTES); - if (!new) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*new)); + new = memblock_alloc_or_panic(sizeof(*new), SMP_CACHE_BYTES); INIT_LIST_HEAD(&new->list); new->unit = n; new->arguments = str; diff --git a/arch/um/kernel/load_file.c b/arch/um/kernel/load_file.c index 5cecd0e291fb..cb9d178ab7d8 100644 --- a/arch/um/kernel/load_file.c +++ b/arch/um/kernel/load_file.c @@ -48,9 +48,7 @@ void *uml_load_file(const char *filename, unsigned long long *size) return NULL; } - area = memblock_alloc(*size, SMP_CACHE_BYTES); - if (!area) - panic("%s: Failed to allocate %llu bytes\n", __func__, *size); + area = memblock_alloc_or_panic(*size, SMP_CACHE_BYTES); if (__uml_load_file(filename, area, *size)) { memblock_free(area, *size); diff --git a/arch/x86/coco/sev/core.c b/arch/x86/coco/sev/core.c index c5b0148b8c0a..a3c9b7c67640 100644 --- a/arch/x86/coco/sev/core.c +++ b/arch/x86/coco/sev/core.c @@ -1572,9 +1572,7 @@ static void __init alloc_runtime_data(int cpu) struct svsm_ca *caa; /* Allocate the SVSM CA page if an SVSM is present */ - caa = memblock_alloc(sizeof(*caa), PAGE_SIZE); - if (!caa) - panic("Can't allocate SVSM CA page\n"); + caa = memblock_alloc_or_panic(sizeof(*caa), PAGE_SIZE); per_cpu(svsm_caa, cpu) = caa; per_cpu(svsm_caa_pa, cpu) = __pa(caa); diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 3a44a9dc3fb7..7c15d6e83c37 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -911,11 +911,8 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table) * the resource tree during the lateinit timeframe. 
*/ #define HPET_RESOURCE_NAME_SIZE 9 - hpet_res = memblock_alloc(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, + hpet_res = memblock_alloc_or_panic(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE, SMP_CACHE_BYTES); - if (!hpet_res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); hpet_res->name = (void *)&hpet_res[1]; hpet_res->flags = IORESOURCE_MEM; diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 1029ea4ac8ba..a57d3fa7c6b6 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2503,9 +2503,7 @@ static struct resource * __init ioapic_setup_resources(void) n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); n *= nr_ioapics; - mem = memblock_alloc(n, SMP_CACHE_BYTES); - if (!mem) - panic("%s: Failed to allocate %lu bytes\n", __func__, n); + mem = memblock_alloc_or_panic(n, SMP_CACHE_BYTES); res = (void *)mem; mem += sizeof(struct resource) * nr_ioapics; @@ -2564,11 +2562,8 @@ void __init io_apic_init_mappings(void) #ifdef CONFIG_X86_32 fake_ioapic_page: #endif - ioapic_phys = (unsigned long)memblock_alloc(PAGE_SIZE, + ioapic_phys = (unsigned long)memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); - if (!ioapic_phys) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } io_apic_set_fixmap(idx, ioapic_phys); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 4893d30ce438..82b96ed9890a 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1146,11 +1146,8 @@ void __init e820__reserve_resources(void) struct resource *res; u64 end; - res = memblock_alloc(sizeof(*res) * e820_table->nr_entries, + res = memblock_alloc_or_panic(sizeof(*res) * e820_table->nr_entries, SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*res) * e820_table->nr_entries); e820_res = res; for (i = 0; i < e820_table->nr_entries; i++) { diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c index 74ebd6882690..cf5dca2dbb91 100644 --- a/arch/x86/platform/olpc/olpc_dt.c +++ b/arch/x86/platform/olpc/olpc_dt.c @@ -136,11 +136,7 @@ void * __init prom_early_alloc(unsigned long size) * fast enough on the platforms we care about while minimizing * wasted bootmem) and hand off chunks of it to callers. 
*/ - res = memblock_alloc(chunk_size, SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - chunk_size); - BUG_ON(!res); + res = memblock_alloc_or_panic(chunk_size, SMP_CACHE_BYTES); prom_early_allocated += chunk_size; memset(res, 0, chunk_size); free_mem = chunk_size; diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index b52d3e17e2c1..56914e21e303 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -178,13 +178,7 @@ static void p2m_init_identity(unsigned long *p2m, unsigned long pfn) static void * __ref alloc_p2m_page(void) { if (unlikely(!slab_is_available())) { - void *ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - - if (!ptr) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE); - - return ptr; + return memblock_alloc_or_panic(PAGE_SIZE, PAGE_SIZE); } return (void *)__get_free_page(GFP_KERNEL); diff --git a/arch/xtensa/mm/kasan_init.c b/arch/xtensa/mm/kasan_init.c index f00d122aa806..f39c4d83173a 100644 --- a/arch/xtensa/mm/kasan_init.c +++ b/arch/xtensa/mm/kasan_init.c @@ -39,11 +39,7 @@ static void __init populate(void *start, void *end) unsigned long i, j; unsigned long vaddr = (unsigned long)start; pmd_t *pmd = pmd_off_k(vaddr); - pte_t *pte = memblock_alloc(n_pages * sizeof(pte_t), PAGE_SIZE); - - if (!pte) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, n_pages * sizeof(pte_t), PAGE_SIZE); + pte_t *pte = memblock_alloc_or_panic(n_pages * sizeof(pte_t), PAGE_SIZE); pr_debug("%s: %p - %p\n", __func__, start, end); diff --git a/drivers/clk/ti/clk.c b/drivers/clk/ti/clk.c index f2117fef7c7d..9c75dcc9a534 100644 --- a/drivers/clk/ti/clk.c +++ b/drivers/clk/ti/clk.c @@ -449,10 +449,7 @@ void __init omap2_clk_legacy_provider_init(int index, void __iomem *mem) { struct clk_iomap *io; - io = memblock_alloc(sizeof(*io), SMP_CACHE_BYTES); - if (!io) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(*io)); + io = memblock_alloc_or_panic(sizeof(*io), SMP_CACHE_BYTES); io->mem = mem; diff --git a/drivers/macintosh/smu.c b/drivers/macintosh/smu.c index a01bc5090cdf..a1534cc6c641 100644 --- a/drivers/macintosh/smu.c +++ b/drivers/macintosh/smu.c @@ -492,11 +492,7 @@ int __init smu_init (void) goto fail_np; } - smu = memblock_alloc(sizeof(struct smu_device), SMP_CACHE_BYTES); - if (!smu) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct smu_device)); - + smu = memblock_alloc_or_panic(sizeof(struct smu_device), SMP_CACHE_BYTES); spin_lock_init(&smu->lock); INIT_LIST_HEAD(&smu->cmd_list); INIT_LIST_HEAD(&smu->cmd_i2c_list); diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index 0121100372b4..2eb718fbeffd 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1126,13 +1126,7 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) static void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align) { - void *ptr = memblock_alloc(size, align); - - if (!ptr) - panic("%s: Failed to allocate %llu bytes align=0x%llx\n", - __func__, size, align); - - return ptr; + return memblock_alloc_or_panic(size, align); } bool __init early_init_dt_verify(void *dt_virt, phys_addr_t dt_phys) diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index 438fd70fa995..6e8561dba537 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -3666,13 +3666,7 @@ static struct device_node *overlay_base_root; static void * __init dt_alloc_memory(u64 size, u64 align) { - void *ptr = memblock_alloc(size, align); - - if (!ptr) - panic("%s: Failed to 
allocate %llu bytes align=0x%llx\n", - __func__, size, align); - - return ptr; + return memblock_alloc_or_panic(size, align); } /* diff --git a/include/linux/memblock.h b/include/linux/memblock.h index d48b56c1e558..e79eb6ac516f 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -421,6 +421,12 @@ static __always_inline void *memblock_alloc(phys_addr_t size, phys_addr_t align) MEMBLOCK_ALLOC_ACCESSIBLE, NUMA_NO_NODE); } +void *__memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, + const char *func); + +#define memblock_alloc_or_panic(size, align) \ + __memblock_alloc_or_panic(size, align, __func__) + static inline void *memblock_alloc_raw(phys_addr_t size, phys_addr_t align) { diff --git a/init/main.c b/init/main.c index 00fac1170294..4bae539ebc05 100644 --- a/init/main.c +++ b/init/main.c @@ -640,15 +640,11 @@ static void __init setup_command_line(char *command_line) len = xlen + strlen(boot_command_line) + ilen + 1; - saved_command_line = memblock_alloc(len, SMP_CACHE_BYTES); - if (!saved_command_line) - panic("%s: Failed to allocate %zu bytes\n", __func__, len); + saved_command_line = memblock_alloc_or_panic(len, SMP_CACHE_BYTES); len = xlen + strlen(command_line) + 1; - static_command_line = memblock_alloc(len, SMP_CACHE_BYTES); - if (!static_command_line) - panic("%s: Failed to allocate %zu bytes\n", __func__, len); + static_command_line = memblock_alloc_or_panic(len, SMP_CACHE_BYTES); if (xlen) { /* @@ -1145,16 +1141,10 @@ static int __init initcall_blacklist(char *str) str_entry = strsep(&str, ","); if (str_entry) { pr_debug("blacklisting initcall %s\n", str_entry); - entry = memblock_alloc(sizeof(*entry), + entry = memblock_alloc_or_panic(sizeof(*entry), SMP_CACHE_BYTES); - if (!entry) - panic("%s: Failed to allocate %zu bytes\n", - __func__, sizeof(*entry)); - entry->buf = memblock_alloc(strlen(str_entry) + 1, + entry->buf = memblock_alloc_or_panic(strlen(str_entry) + 1, SMP_CACHE_BYTES); - if (!entry->buf) - panic("%s: Failed to allocate %zu bytes\n", - __func__, strlen(str_entry) + 1); strcpy(entry->buf, str_entry); list_add(&entry->next, &blacklisted_initcalls); } diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 30894d8f0a78..c9fb559a6399 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1011,11 +1011,8 @@ void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pf } } /* This allocation cannot fail */ - region = memblock_alloc(sizeof(struct nosave_region), + region = memblock_alloc_or_panic(sizeof(struct nosave_region), SMP_CACHE_BYTES); - if (!region) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); diff --git a/lib/cpumask.c b/lib/cpumask.c index e77ee9d46f71..57274ba8b6d9 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -83,10 +83,7 @@ EXPORT_SYMBOL(alloc_cpumask_var_node); */ void __init alloc_bootmem_cpumask_var(cpumask_var_t *mask) { - *mask = memblock_alloc(cpumask_size(), SMP_CACHE_BYTES); - if (!*mask) - panic("%s: Failed to allocate %u bytes\n", __func__, - cpumask_size()); + *mask = memblock_alloc_or_panic(cpumask_size(), SMP_CACHE_BYTES); } /** diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 9c58f081d84f..1bb505a08415 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -280,12 +280,8 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end) start = (void *)PAGE_ALIGN_DOWN((u64)start); size = 
PAGE_ALIGN((u64)end - (u64)start); - shadow = memblock_alloc(size, PAGE_SIZE); - origin = memblock_alloc(size, PAGE_SIZE); - - if (!shadow || !origin) - panic("%s: Failed to allocate metadata memory for early boot range of size %llu", - __func__, size); + shadow = memblock_alloc_or_panic(size, PAGE_SIZE); + origin = memblock_alloc_or_panic(size, PAGE_SIZE); for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { page = virt_to_page_or_null((char *)start + addr); diff --git a/mm/memblock.c b/mm/memblock.c index 095c18b5c430..95af35fd1389 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1691,6 +1691,26 @@ void * __init memblock_alloc_try_nid( return ptr; } +/** + * __memblock_alloc_or_panic - Try to allocate memory and panic on failure + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @func: caller func name + * + * This function attempts to allocate memory using memblock_alloc, + * and in case of failure, it calls panic with the formatted message. + * This function should not be used directly, please use the macro memblock_alloc_or_panic. + */ +void *__init __memblock_alloc_or_panic(phys_addr_t size, phys_addr_t align, + const char *func) +{ + void *addr = memblock_alloc(size, align); + + if (unlikely(!addr)) + panic("%s: Failed to allocate %pap bytes\n", func, &size); + return addr; +} + /** * memblock_free_late - free pages directly to buddy allocator * @base: phys starting address of the boot memory block diff --git a/mm/numa.c b/mm/numa.c index e2eec07707d1..f1787d7713a6 100644 --- a/mm/numa.c +++ b/mm/numa.c @@ -37,13 +37,7 @@ void __init alloc_node_data(int nid) void __init alloc_offline_node_data(int nid) { pg_data_t *pgdat; - - pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); - if (!pgdat) - panic("Cannot allocate %zuB for node %d.\n", - sizeof(*pgdat), nid); - - node_data[nid] = pgdat; + node_data[nid] = memblock_alloc_or_panic(sizeof(*pgdat), SMP_CACHE_BYTES); } /* Stub functions: */ diff --git a/mm/percpu.c b/mm/percpu.c index d8dd31a2e407..ac61e3fc5f15 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1359,10 +1359,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, /* allocate chunk */ alloc_size = struct_size(chunk, populated, BITS_TO_LONGS(region_size >> PAGE_SHIFT)); - chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); INIT_LIST_HEAD(&chunk->list); @@ -1374,24 +1371,14 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, region_bits = pcpu_chunk_map_bits(chunk); alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); - chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->alloc_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk->alloc_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); - chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->bound_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + chunk->bound_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); - chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!chunk->md_blocks) - panic("%s: Failed to allocate %zu bytes\n", __func__, - 
alloc_size); - + chunk->md_blocks = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); #ifdef NEED_PCPUOBJ_EXT /* first chunk is free to use */ chunk->obj_exts = NULL; @@ -2595,28 +2582,16 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* process group information and build config tables accordingly */ alloc_size = ai->nr_groups * sizeof(group_offsets[0]); - group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!group_offsets) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + group_offsets = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = ai->nr_groups * sizeof(group_sizes[0]); - group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!group_sizes) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + group_sizes = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_map[0]); - unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!unit_map) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + unit_map = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); alloc_size = nr_cpu_ids * sizeof(unit_off[0]); - unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES); - if (!unit_off) - panic("%s: Failed to allocate %zu bytes\n", __func__, - alloc_size); + unit_off = memblock_alloc_or_panic(alloc_size, SMP_CACHE_BYTES); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; @@ -2685,12 +2660,9 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_free_slot = pcpu_sidelined_slot + 1; pcpu_to_depopulate_slot = pcpu_free_slot + 1; pcpu_nr_slots = pcpu_to_depopulate_slot + 1; - pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots * + pcpu_chunk_lists = memblock_alloc_or_panic(pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]), SMP_CACHE_BYTES); - if (!pcpu_chunk_lists) - panic("%s: Failed to allocate %zu bytes\n", __func__, - pcpu_nr_slots * sizeof(pcpu_chunk_lists[0])); for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_chunk_lists[i]); @@ -3155,25 +3127,19 @@ void __init __weak pcpu_populate_pte(unsigned long addr) pmd_t *pmd; if (pgd_none(*pgd)) { - p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); - if (!p4d) - goto err_alloc; + p4d = memblock_alloc_or_panic(P4D_TABLE_SIZE, P4D_TABLE_SIZE); pgd_populate(&init_mm, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { - pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); - if (!pud) - goto err_alloc; + pud = memblock_alloc_or_panic(PUD_TABLE_SIZE, PUD_TABLE_SIZE); p4d_populate(&init_mm, p4d, pud); } pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); - if (!pmd) - goto err_alloc; + pmd = memblock_alloc_or_panic(PMD_TABLE_SIZE, PMD_TABLE_SIZE); pud_populate(&init_mm, pud, pmd); } @@ -3181,16 +3147,11 @@ void __init __weak pcpu_populate_pte(unsigned long addr) if (!pmd_present(*pmd)) { pte_t *new; - new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); - if (!new) - goto err_alloc; + new = memblock_alloc_or_panic(PTE_TABLE_SIZE, PTE_TABLE_SIZE); pmd_populate_kernel(&init_mm, pmd, new); } return; - -err_alloc: - panic("%s: Failed to allocate memory\n", __func__); } /** @@ -3237,10 +3198,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); - pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); - if (!pages) - 
panic("%s: Failed to allocate %zu bytes\n", __func__, - pages_size); + pages = memblock_alloc_or_panic(pages_size, SMP_CACHE_BYTES); /* allocate pages */ j = 0; diff --git a/mm/sparse.c b/mm/sparse.c index 13b6624d3562..133b033d0cba 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -257,10 +257,7 @@ static void __init memblocks_present(void) size = sizeof(struct mem_section *) * NR_SECTION_ROOTS; align = 1 << (INTERNODE_CACHE_SHIFT); - mem_section = memblock_alloc(size, align); - if (!mem_section) - panic("%s: Failed to allocate %lu bytes align=0x%lx\n", - __func__, size, align); + mem_section = memblock_alloc_or_panic(size, align); } #endif -- 2.50.1 From 9cbfd1c3c83b62c42ff46e17227e996cc9b2b336 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:32 -0700 Subject: [PATCH 05/16] mm/mglru: clean up workingset Patch series "mm/mglru: performance optimizations", v4. This series improves performance for some previously reported test cases. Most of the code changes gathered here has been floating on the mailing list [1][2]. They are now properly organized and have gone through various benchmarks on client and server devices, including Android, FIO, memcached, multiple VMs and MongoDB. In addition to the syzbot regressions fixed in v2 [3] and v3 [4], this version fixes two more regressions: one reported by Oliver Sang [5] and the other by Barry Song. [1] https://lore.kernel.org/CAOUHufahuWcKf5f1Sg3emnqX+cODuR=2TQo7T4Gr-QYLujn4RA@mail.gmail.com/ [2] https://lore.kernel.org/CAOUHufawNerxqLm7L9Yywp3HJFiYVrYO26ePUb1jH-qxNGWzyA@mail.gmail.com/ [3] https://lore.kernel.org/67294349.050a0220.701a.0010.GAE@google.com/ [4] https://lore.kernel.org/67549eca.050a0220.2477f.001b.GAE@google.com/ [5] https://lore.kernel.org/202412231601.f1eb8f84-lkp@intel.com/ This patch (of 7): Move VM_BUG_ON_FOLIO() to cover both the default and MGLRU paths. Also use a pair of rcu_read_lock() and rcu_read_unlock() within each path, to improve readability. This change should not have any side effects. Link: https://lkml.kernel.org/r/20241231043538.4075764-1-yuzhao@google.com Link: https://lkml.kernel.org/r/20241231043538.4075764-2-yuzhao@google.com Signed-off-by: Yu Zhao Tested-by: Kalesh Singh Cc: Barry Song Cc: Bharata B Rao Cc: David Stevens Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/workingset.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/workingset.c b/mm/workingset.c index a4705e196545..ad181d1b8cf1 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -428,17 +428,17 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, struct pglist_data *pgdat; unsigned long eviction; - rcu_read_lock(); - if (lru_gen_enabled()) { - bool recent = lru_gen_test_recent(shadow, file, - &eviction_lruvec, &eviction, workingset); + bool recent; + rcu_read_lock(); + recent = lru_gen_test_recent(shadow, file, &eviction_lruvec, + &eviction, workingset); rcu_read_unlock(); return recent; } - + rcu_read_lock(); unpack_shadow(shadow, &memcgid, &pgdat, &eviction, workingset); eviction <<= bucket_order; @@ -459,14 +459,12 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, * configurations instead. 
*/ eviction_memcg = mem_cgroup_from_id(memcgid); - if (!mem_cgroup_disabled() && - (!eviction_memcg || !mem_cgroup_tryget(eviction_memcg))) { - rcu_read_unlock(); - return false; - } - + if (!mem_cgroup_tryget(eviction_memcg)) + eviction_memcg = NULL; rcu_read_unlock(); + if (!mem_cgroup_disabled() && !eviction_memcg) + return false; /* * Flush stats (and potentially sleep) outside the RCU read section. * @@ -544,6 +542,8 @@ void workingset_refault(struct folio *folio, void *shadow) bool workingset; long nr; + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + if (lru_gen_enabled()) { lru_gen_refault(folio, shadow); return; @@ -558,7 +558,6 @@ void workingset_refault(struct folio *folio, void *shadow) * is actually experiencing the refault event. Make sure the folio is * locked to guarantee folio_memcg() stability throughout. */ - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); nr = folio_nr_pages(folio); memcg = folio_memcg(folio); pgdat = folio_pgdat(folio); -- 2.50.1 From cc8ec7be78ffaa29f6e0781fd12602974970eb53 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:33 -0700 Subject: [PATCH 06/16] mm/mglru: optimize deactivation Do not shuffle a folio in the deactivation paths if it is already in the oldest generation. This reduces the LRU lock contention. Before this patch, the contention is reproducible by FIO, e.g., fio -filename=/dev/nvme1n1p2 -direct=0 -thread -size=1024G \ -rwmixwrite=30 --norandommap --randrepeat=0 -ioengine=sync \ -bs=4k -numjobs=400 -runtime=25000 --time_based \ -group_reporting -name=mglru 98.96%--_raw_spin_lock_irqsave folio_lruvec_lock_irqsave | --98.78%--folio_batch_move_lru | --98.63%--deactivate_file_folio mapping_try_invalidate invalidate_mapping_pages invalidate_bdev blkdev_common_ioctl blkdev_ioctl After this patch, deactivate_file_folio() bails out early without taking the LRU lock. A side effect is that a folio can be left at the head of the oldest generation, rather than the tail. If reclaim happens at the same time, it cannot reclaim this folio immediately. Since there is no known correlation between truncation and reclaim, this side effect is considered insignificant. 
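For reference, a condensed sketch of the resulting fast path (abridged from the mm/swap.c hunk further down in this patch; the comment wording here is editorial, not from the kernel sources, and the snippet is not meant to build standalone):

	void deactivate_file_folio(struct folio *folio)
	{
		if (folio_test_unevictable(folio))
			return;

		/*
		 * With MGLRU enabled, clear the reference bits; if the folio is
		 * already in the oldest generation, no shuffling is needed, so
		 * skip the folio batch and thus the LRU lock entirely.
		 */
		if (lru_gen_enabled() && lru_gen_clear_refs(folio))
			return;

		folio_batch_add_and_move(folio, lru_deactivate_file, true);
	}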
Link: https://lkml.kernel.org/r/20241231043538.4075764-3-yuzhao@google.com Reported-by: Bharata B Rao Closes: https://lore.kernel.org/CAOUHufawNerxqLm7L9Yywp3HJFiYVrYO26ePUb1jH-qxNGWzyA@mail.gmail.com/ Signed-off-by: Yu Zhao Tested-by: Kalesh Singh Cc: Barry Song Cc: David Stevens Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swap.c | 48 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 3a01acfd5a89..649ef7f2b74b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -379,7 +379,8 @@ static void __lru_cache_activate_folio(struct folio *folio) } #ifdef CONFIG_LRU_GEN -static void folio_inc_refs(struct folio *folio) + +static void lru_gen_inc_refs(struct folio *folio) { unsigned long new_flags, old_flags = READ_ONCE(folio->flags); @@ -406,10 +407,34 @@ static void folio_inc_refs(struct folio *folio) new_flags |= old_flags & ~LRU_REFS_MASK; } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } -#else -static void folio_inc_refs(struct folio *folio) + +static bool lru_gen_clear_refs(struct folio *folio) +{ + struct lru_gen_folio *lrugen; + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + + if (gen < 0) + return true; + + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + + lrugen = &folio_lruvec(folio)->lrugen; + /* whether can do without shuffling under the LRU lock */ + return gen == lru_gen_from_seq(READ_ONCE(lrugen->min_seq[type])); +} + +#else /* !CONFIG_LRU_GEN */ + +static void lru_gen_inc_refs(struct folio *folio) { } + +static bool lru_gen_clear_refs(struct folio *folio) +{ + return false; +} + #endif /* CONFIG_LRU_GEN */ /** @@ -428,7 +453,7 @@ static void folio_inc_refs(struct folio *folio) void folio_mark_accessed(struct folio *folio) { if (lru_gen_enabled()) { - folio_inc_refs(folio); + lru_gen_inc_refs(folio); return; } @@ -524,7 +549,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) */ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) { - bool active = folio_test_active(folio); + bool active = folio_test_active(folio) || lru_gen_enabled(); long nr_pages = folio_nr_pages(folio); if (folio_test_unevictable(folio)) @@ -589,7 +614,10 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) lruvec_del_folio(lruvec, folio); folio_clear_active(folio); - folio_clear_referenced(folio); + if (lru_gen_enabled()) + lru_gen_clear_refs(folio); + else + folio_clear_referenced(folio); /* * Lazyfree folios are clean anonymous folios. They have * the swapbacked flag cleared, to distinguish them from normal @@ -657,6 +685,9 @@ void deactivate_file_folio(struct folio *folio) if (folio_test_unevictable(folio)) return; + if (lru_gen_enabled() && lru_gen_clear_refs(folio)) + return; + folio_batch_add_and_move(folio, lru_deactivate_file, true); } @@ -670,7 +701,10 @@ void deactivate_file_folio(struct folio *folio) */ void folio_deactivate(struct folio *folio) { - if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled())) + if (folio_test_unevictable(folio)) + return; + + if (lru_gen_enabled() ? 
lru_gen_clear_refs(folio) : !folio_test_active(folio)) return; folio_batch_add_and_move(folio, lru_deactivate, true); -- 2.50.1 From 798c0330c2ca078cc3e155e567c77c4d61345a38 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:34 -0700 Subject: [PATCH 07/16] mm/mglru: rework aging feedback The aging feedback is based on both the number of generations and the distribution of folios in each generation. The number of generations is currently the distance between max_seq and anon min_seq. This is because anon min_seq is not allowed to move past file min_seq. The rationale for that is that file is always evictable whereas anon is not. However, for use cases where anon is a lot cheaper than file: 1. Anon in the second oldest generation can be a better choice than file in the oldest generation. 2. A large amount of file in the oldest generation can skew the distribution, making should_run_aging() return a false negative. Allow anon and file min_seq to move independently, and use solely the number of generations as the feedback for aging. Specifically, when both anon and file are evictable, anon min_seq can now be greater than file min_seq, and therefore the number of generations becomes the distance between max_seq and min(min_seq[0],min_seq[1]). And should_run_aging() returns true if and only if the number of generations is less than MAX_NR_GENS. As the first step to the final optimization, this change by itself should not have userspace-visible effects beyond performance. The next two patches will take advantage of this change; the last patch in this series will better distribute folios across MAX_NR_GENS. [yuzhao@google.com: restore behaviour for systems with swappiness == 200] Link: https://lkml.kernel.org/r/Z4S3-aJy5dj9tBTk@google.com Link: https://lkml.kernel.org/r/20241231043538.4075764-4-yuzhao@google.com Signed-off-by: Yu Zhao Reported-by: David Stevens Tested-by: Kalesh Singh Cc: Barry Song Cc: Bharata B Rao Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 17 ++-- mm/vmscan.c | 206 ++++++++++++++++++----------------- 2 files changed, 99 insertions(+), 124 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b36124145a16..8245ecb0400b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -421,12 +421,11 @@ enum { /* * The youngest generation number is stored in max_seq for both anon and file * types as they are aged on an equal footing. The oldest generation numbers are - * stored in min_seq[] separately for anon and file types as clean file pages - * can be evicted regardless of swap constraints. - * - * Normally anon and file min_seq are in sync. But if swapping is constrained, - * e.g., out of swap space, file min_seq is allowed to advance and leave anon - * min_seq behind. + * stored in min_seq[] separately for anon and file types so that they can be + * incremented independently. Ideally min_seq[] are kept in sync when both anon + * and file types are evictable. However, to adapt to situations like extreme + * swappiness, they are allowed to be out of sync by at most + * MAX_NR_GENS-MIN_NR_GENS-1. * * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending.
@@ -446,8 +445,8 @@ struct lru_gen_folio { unsigned long avg_refaulted[ANON_AND_FILE][MAX_NR_TIERS]; /* the exponential moving average of evicted+protected */ unsigned long avg_total[ANON_AND_FILE][MAX_NR_TIERS]; - /* the first tier doesn't need protection, hence the minus one */ - unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS - 1]; + /* can only be modified under the LRU lock */ + unsigned long protected[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; /* can be modified without holding the LRU lock */ atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS]; @@ -498,7 +497,7 @@ struct lru_gen_mm_walk { int mm_stats[NR_MM_STATS]; /* total batched items */ int batched; - bool can_swap; + int swappiness; bool force_scan; }; diff --git a/mm/vmscan.c b/mm/vmscan.c index b1ec5ece067e..168ca5d10a4d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2619,11 +2619,17 @@ static bool should_clear_pmd_young(void) READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ } +#define evictable_min_seq(min_seq, swappiness) \ + min((min_seq)[!(swappiness)], (min_seq)[(swappiness) <= MAX_SWAPPINESS]) + #define for_each_gen_type_zone(gen, type, zone) \ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++) +#define for_each_evictable_type(type, swappiness) \ + for ((type) = !(swappiness); (type) <= ((swappiness) <= MAX_SWAPPINESS); (type)++) + #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS) #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS) @@ -2669,10 +2675,16 @@ static int get_nr_gens(struct lruvec *lruvec, int type) static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) { - /* see the comment on lru_gen_folio */ - return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && - get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && - get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; + int type; + + for (type = 0; type < ANON_AND_FILE; type++) { + int n = get_nr_gens(lruvec, type); + + if (n < MIN_NR_GENS || n > MAX_NR_GENS) + return false; + } + + return true; } /****************************************************************************** @@ -3079,9 +3091,8 @@ static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, pos->refaulted = lrugen->avg_refaulted[type][tier] + atomic_long_read(&lrugen->refaulted[hist][type][tier]); pos->total = lrugen->avg_total[type][tier] + + lrugen->protected[hist][type][tier] + atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - pos->total += lrugen->protected[hist][type][tier - 1]; pos->gain = gain; } @@ -3108,17 +3119,15 @@ static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); sum = lrugen->avg_total[type][tier] + + lrugen->protected[hist][type][tier] + atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - sum += lrugen->protected[hist][type][tier - 1]; WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); } if (clear) { atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); atomic_long_set(&lrugen->evicted[hist][type][tier], 0); - if (tier) - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + WRITE_ONCE(lrugen->protected[hist][type][tier], 0); } } } @@ -3253,7 +3262,7 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (vma_is_anonymous(vma)) - return !walk->can_swap; + return 
!walk->swappiness; if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping)) return true; @@ -3263,7 +3272,10 @@ static int should_skip_vma(unsigned long start, unsigned long end, struct mm_wal return true; if (shmem_mapping(mapping)) - return !walk->can_swap; + return !walk->swappiness; + + if (walk->swappiness > MAX_SWAPPINESS) + return true; /* to exclude special mappings like dax, etc. */ return !mapping->a_ops->read_folio; @@ -3351,7 +3363,7 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned } static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, - struct pglist_data *pgdat, bool can_swap) + struct pglist_data *pgdat) { struct folio *folio; @@ -3362,10 +3374,6 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, if (folio_memcg(folio) != memcg) return NULL; - /* file VMAs can contain anon pages from COW */ - if (!folio_is_file_lru(folio) && !can_swap) - return NULL; - return folio; } @@ -3421,7 +3429,7 @@ restart: if (pfn == -1) continue; - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; @@ -3506,7 +3514,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if (pfn == -1) goto next; - folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) goto next; @@ -3718,22 +3726,26 @@ static void clear_mm_walk(void) kfree(walk); } -static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) +static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) { int zone; int remaining = MAX_LRU_BATCH; struct lru_gen_folio *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); - if (type == LRU_GEN_ANON && !can_swap) + if (type ? 
swappiness > MAX_SWAPPINESS : !swappiness) goto done; - /* prevent cold/hot inversion if force_scan is true */ + /* prevent cold/hot inversion if the type is evictable */ for (zone = 0; zone < MAX_NR_ZONES; zone++) { struct list_head *head = &lrugen->folios[old_gen][type][zone]; while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); + int refs = folio_lru_refs(folio); + int tier = lru_tier_from_refs(refs); + int delta = folio_nr_pages(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); @@ -3743,6 +3755,9 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap) new_gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + if (!--remaining) return false; } @@ -3754,7 +3769,7 @@ done: return true; } -static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +static bool try_to_inc_min_seq(struct lruvec *lruvec, int swappiness) { int gen, type, zone; bool success = false; @@ -3764,7 +3779,7 @@ static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); /* find the oldest populated generation */ - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { gen = lru_gen_from_seq(min_seq[type]); @@ -3780,13 +3795,17 @@ next: } /* see the comment on lru_gen_folio */ - if (can_swap) { - min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); - min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); + if (swappiness && swappiness <= MAX_SWAPPINESS) { + unsigned long seq = lrugen->max_seq - MIN_NR_GENS; + + if (min_seq[LRU_GEN_ANON] > seq && min_seq[LRU_GEN_FILE] < seq) + min_seq[LRU_GEN_ANON] = seq; + else if (min_seq[LRU_GEN_FILE] > seq && min_seq[LRU_GEN_ANON] < seq) + min_seq[LRU_GEN_FILE] = seq; } - for (type = !can_swap; type < ANON_AND_FILE; type++) { - if (min_seq[type] == lrugen->min_seq[type]) + for_each_evictable_type(type, swappiness) { + if (min_seq[type] <= lrugen->min_seq[type]) continue; reset_ctrl_pos(lruvec, type, true); @@ -3797,8 +3816,7 @@ next: return success; } -static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) +static bool inc_max_seq(struct lruvec *lruvec, unsigned long seq, int swappiness) { bool success; int prev, next; @@ -3816,13 +3834,11 @@ restart: if (!success) goto unlock; - for (type = ANON_AND_FILE - 1; type >= 0; type--) { + for (type = 0; type < ANON_AND_FILE; type++) { if (get_nr_gens(lruvec, type) != MAX_NR_GENS) continue; - VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); - - if (inc_min_seq(lruvec, type, can_swap)) + if (inc_min_seq(lruvec, type, swappiness)) continue; spin_unlock_irq(&lruvec->lru_lock); @@ -3866,7 +3882,7 @@ unlock: } static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) + int swappiness, bool force_scan) { bool success; struct lru_gen_mm_walk *walk; @@ -3877,7 +3893,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, VM_WARN_ON_ONCE(seq > READ_ONCE(lrugen->max_seq)); if (!mm_state) - return inc_max_seq(lruvec, seq, can_swap, force_scan); + return inc_max_seq(lruvec, seq, swappiness); /* see the comment in iterate_mm_list() */ if (seq <= 
READ_ONCE(mm_state->seq)) @@ -3902,7 +3918,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, walk->lruvec = lruvec; walk->seq = seq; - walk->can_swap = can_swap; + walk->swappiness = swappiness; walk->force_scan = force_scan; do { @@ -3912,7 +3928,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long seq, } while (mm); done: if (success) { - success = inc_max_seq(lruvec, seq, can_swap, force_scan); + success = inc_max_seq(lruvec, seq, swappiness); WARN_ON_ONCE(!success); } @@ -3953,13 +3969,13 @@ static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc) { int gen, type, zone; unsigned long total = 0; - bool can_swap = get_swappiness(lruvec, sc); + int swappiness = get_swappiness(lruvec, sc); struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { @@ -3979,6 +3995,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc { int gen; unsigned long birth; + int swappiness = get_swappiness(lruvec, sc); struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MIN_SEQ(lruvec); @@ -3988,8 +4005,7 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc if (!lruvec_is_sizable(lruvec, sc)) return false; - /* see the comment on lru_gen_folio */ - gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]); + gen = lru_gen_from_seq(evictable_min_seq(min_seq, swappiness)); birth = READ_ONCE(lruvec->lrugen.timestamps[gen]); return time_is_before_jiffies(birth + min_ttl); @@ -4056,7 +4072,6 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) unsigned long addr = pvmw->address; struct vm_area_struct *vma = pvmw->vma; struct folio *folio = pfn_folio(pvmw->pfn); - bool can_swap = !folio_is_file_lru(folio); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); @@ -4109,7 +4124,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (pfn == -1) continue; - folio = get_pfn_folio(pfn, memcg, pgdat, can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat); if (!folio) continue; @@ -4325,8 +4340,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); - WRITE_ONCE(lrugen->protected[hist][type][tier - 1], - lrugen->protected[hist][type][tier - 1] + delta); + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); return true; } @@ -4525,7 +4540,6 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw { int i; int type; - int scanned; int tier = -1; DEFINE_MIN_SEQ(lruvec); @@ -4537,34 +4551,36 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw * 2. If !__GFP_IO, file first since clean pagecache is more likely to * exist than clean swapcache. 
*/ - if (!swappiness) + if (swappiness <= MIN_SWAPPINESS + 1) type = LRU_GEN_FILE; else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) type = LRU_GEN_ANON; else if (swappiness == 1) type = LRU_GEN_FILE; - else if (swappiness == MAX_SWAPPINESS) + if (swappiness >= MAX_SWAPPINESS) type = LRU_GEN_ANON; else if (!(sc->gfp_mask & __GFP_IO)) type = LRU_GEN_FILE; else type = get_type_to_scan(lruvec, swappiness, &tier); - for (i = !swappiness; i < ANON_AND_FILE; i++) { + for_each_evictable_type(i, swappiness) { + int scanned; + if (tier < 0) tier = get_tier_idx(lruvec, type); + *type_scanned = type; + scanned = scan_folios(lruvec, sc, type, tier, list); if (scanned) - break; + return scanned; type = !type; tier = -1; } - *type_scanned = type; - - return scanned; + return 0; } static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) @@ -4580,6 +4596,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap struct reclaim_stat stat; struct lru_gen_mm_walk *walk; bool skip_retry = false; + struct lru_gen_folio *lrugen = &lruvec->lrugen; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -4589,7 +4606,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap scanned += try_to_inc_min_seq(lruvec, swappiness); - if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) + if (evictable_min_seq(lrugen->min_seq, swappiness) + MIN_NR_GENS > lrugen->max_seq) scanned = 0; spin_unlock_irq(&lruvec->lru_lock); @@ -4664,63 +4681,32 @@ retry: } static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, - bool can_swap, unsigned long *nr_to_scan) + int swappiness, unsigned long *nr_to_scan) { int gen, type, zone; - unsigned long old = 0; - unsigned long young = 0; - unsigned long total = 0; + unsigned long size = 0; struct lru_gen_folio *lrugen = &lruvec->lrugen; DEFINE_MIN_SEQ(lruvec); - /* whether this lruvec is completely out of cold folios */ - if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) { - *nr_to_scan = 0; + *nr_to_scan = 0; + /* have to run aging, since eviction is not possible anymore */ + if (evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS > max_seq) return true; - } - for (type = !can_swap; type < ANON_AND_FILE; type++) { + for_each_evictable_type(type, swappiness) { unsigned long seq; for (seq = min_seq[type]; seq <= max_seq; seq++) { - unsigned long size = 0; - gen = lru_gen_from_seq(seq); for (zone = 0; zone < MAX_NR_ZONES; zone++) size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); - - total += size; - if (seq == max_seq) - young += size; - else if (seq + MIN_NR_GENS == max_seq) - old += size; } } - *nr_to_scan = total; - - /* - * The aging tries to be lazy to reduce the overhead, while the eviction - * stalls when the number of generations reaches MIN_NR_GENS. Hence, the - * ideal number of generations is MIN_NR_GENS+1. - */ - if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) - return false; - - /* - * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) - * of the total number of pages for each generation. A reasonable range - * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The - * aging cares about the upper bound of hot pages, while the eviction - * cares about the lower bound of cold pages. 
- */ - if (young * MIN_NR_GENS > total) - return true; - if (old * (MIN_NR_GENS + 2) < total) - return true; - - return false; + *nr_to_scan = size; + /* better to run aging even though eviction is still possible */ + return evictable_min_seq(min_seq, swappiness) + MIN_NR_GENS == max_seq; } /* @@ -4728,7 +4714,7 @@ static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg * reclaim. */ -static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap) +static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) { bool success; unsigned long nr_to_scan; @@ -4738,7 +4724,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg)) return -1; - success = should_run_aging(lruvec, max_seq, can_swap, &nr_to_scan); + success = should_run_aging(lruvec, max_seq, swappiness, &nr_to_scan); /* try to scrape all its memory if this memcg was deleted */ if (nr_to_scan && !mem_cgroup_online(memcg)) @@ -4749,7 +4735,7 @@ static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool return nr_to_scan >> sc->priority; /* stop scanning this lruvec as it's low on cold folios */ - return try_to_inc_max_seq(lruvec, max_seq, can_swap, false) ? -1 : 0; + return try_to_inc_max_seq(lruvec, max_seq, swappiness, false) ? -1 : 0; } static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc) @@ -5293,8 +5279,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, s = "rep"; n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]); n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]); - if (tier) - n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]); + n[2] = READ_ONCE(lrugen->protected[hist][type][tier]); } for (i = 0; i < 3; i++) @@ -5349,7 +5334,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) seq_printf(m, " node %5d\n", nid); if (!full) - seq = min_seq[LRU_GEN_ANON]; + seq = evictable_min_seq(min_seq, MAX_SWAPPINESS / 2); else if (max_seq >= MAX_NR_GENS) seq = max_seq - MAX_NR_GENS + 1; else @@ -5389,23 +5374,14 @@ static const struct seq_operations lru_gen_seq_ops = { }; static int run_aging(struct lruvec *lruvec, unsigned long seq, - bool can_swap, bool force_scan) + int swappiness, bool force_scan) { DEFINE_MAX_SEQ(lruvec); - DEFINE_MIN_SEQ(lruvec); - - if (seq < max_seq) - return 0; if (seq > max_seq) return -EINVAL; - if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq) - return -ERANGE; - - try_to_inc_max_seq(lruvec, max_seq, can_swap, force_scan); - - return 0; + return try_to_inc_max_seq(lruvec, max_seq, swappiness, force_scan) ? 
0 : -EEXIST; } static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc, @@ -5421,7 +5397,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co while (!signal_pending(current)) { DEFINE_MIN_SEQ(lruvec); - if (seq < min_seq[!swappiness]) + if (seq < evictable_min_seq(min_seq, swappiness)) return 0; if (sc->nr_reclaimed >= nr_to_reclaim) @@ -5466,7 +5442,7 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (swappiness < MIN_SWAPPINESS) swappiness = get_swappiness(lruvec, sc); - else if (swappiness > MAX_SWAPPINESS) + else if (swappiness > MAX_SWAPPINESS + 1) goto done; switch (cmd) { -- 2.50.1 From 37a260870f2ce41a649b14f74f10224046bcaeb1 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:35 -0700 Subject: [PATCH 08/16] mm/mglru: rework type selection With anon and file min_seq being able to move independently, rework type selection so that it is based on the total refaults from all tiers of each type. Also allow a type to be selected until that type reaches MIN_NR_GENS, regardless of whether that type has a larger min_seq or not, to accommodate extreme swappiness. Since some tiers of a selected type can have higher refaults than the first tier of the other type, use a smaller gain factor (2:3 instead of 1:2), in order for those tiers in the selected type to be better protected. As an intermediate step to the final optimization, this change by itself should not have userspace-visible effects beyond performance. Link: https://lkml.kernel.org/r/20241231043538.4075764-5-yuzhao@google.com Signed-off-by: Yu Zhao Reported-by: David Stevens Tested-by: Kalesh Singh Cc: Barry Song Cc: Bharata B Rao Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/vmscan.c | 82 +++++++++++++++++------------------------------------ 1 file changed, 26 insertions(+), 56 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 168ca5d10a4d..761298d9c09a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3085,15 +3085,20 @@ struct ctrl_pos { static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) { + int i; struct lru_gen_folio *lrugen = &lruvec->lrugen; int hist = lru_hist_from_seq(lrugen->min_seq[type]); - pos->refaulted = lrugen->avg_refaulted[type][tier] + - atomic_long_read(&lrugen->refaulted[hist][type][tier]); - pos->total = lrugen->avg_total[type][tier] + - lrugen->protected[hist][type][tier] + - atomic_long_read(&lrugen->evicted[hist][type][tier]); pos->gain = gain; + pos->refaulted = pos->total = 0; + + for (i = tier % MAX_NR_TIERS; i <= min(tier, MAX_NR_TIERS - 1); i++) { + pos->refaulted += lrugen->avg_refaulted[type][i] + + atomic_long_read(&lrugen->refaulted[hist][type][i]); + pos->total += lrugen->avg_total[type][i] + + lrugen->protected[hist][type][i] + + atomic_long_read(&lrugen->evicted[hist][type][i]); + } } static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) @@ -4493,13 +4498,13 @@ static int get_tier_idx(struct lruvec *lruvec, int type) struct ctrl_pos sp, pv; /* - * To leave a margin for fluctuations, use a larger gain factor (1:2). + * To leave a margin for fluctuations, use a larger gain factor (2:3). * This value is chosen because any other tier would have at least twice * as many refaults as the first tier.
*/ - read_ctrl_pos(lruvec, type, 0, 1, &sp); + read_ctrl_pos(lruvec, type, 0, 2, &sp); for (tier = 1; tier < MAX_NR_TIERS; tier++) { - read_ctrl_pos(lruvec, type, tier, 2, &pv); + read_ctrl_pos(lruvec, type, tier, 3, &pv); if (!positive_ctrl_err(&sp, &pv)) break; } @@ -4507,68 +4512,34 @@ static int get_tier_idx(struct lruvec *lruvec, int type) return tier - 1; } -static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +static int get_type_to_scan(struct lruvec *lruvec, int swappiness) { - int type, tier; struct ctrl_pos sp, pv; - int gain[ANON_AND_FILE] = { swappiness, MAX_SWAPPINESS - swappiness }; + if (swappiness <= MIN_SWAPPINESS + 1) + return LRU_GEN_FILE; + + if (swappiness >= MAX_SWAPPINESS) + return LRU_GEN_ANON; /* - * Compare the first tier of anon with that of file to determine which - * type to scan. Also need to compare other tiers of the selected type - * with the first tier of the other type to determine the last tier (of - * the selected type) to evict. + * Compare the sum of all tiers of anon with that of file to determine + * which type to scan. */ - read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); - read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); - type = positive_ctrl_err(&sp, &pv); - - read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); - for (tier = 1; tier < MAX_NR_TIERS; tier++) { - read_ctrl_pos(lruvec, type, tier, gain[type], &pv); - if (!positive_ctrl_err(&sp, &pv)) - break; - } - - *tier_idx = tier - 1; + read_ctrl_pos(lruvec, LRU_GEN_ANON, MAX_NR_TIERS, swappiness, &sp); + read_ctrl_pos(lruvec, LRU_GEN_FILE, MAX_NR_TIERS, MAX_SWAPPINESS - swappiness, &pv); - return type; + return positive_ctrl_err(&sp, &pv); } static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, int *type_scanned, struct list_head *list) { int i; - int type; - int tier = -1; - DEFINE_MIN_SEQ(lruvec); - - /* - * Try to make the obvious choice first, and if anon and file are both - * available from the same generation, - * 1. Interpret swappiness 1 as file first and MAX_SWAPPINESS as anon - * first. - * 2. If !__GFP_IO, file first since clean pagecache is more likely to - * exist than clean swapcache. - */ - if (swappiness <= MIN_SWAPPINESS + 1) - type = LRU_GEN_FILE; - else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) - type = LRU_GEN_ANON; - else if (swappiness == 1) - type = LRU_GEN_FILE; - if (swappiness >= MAX_SWAPPINESS) - type = LRU_GEN_ANON; - else if (!(sc->gfp_mask & __GFP_IO)) - type = LRU_GEN_FILE; - else - type = get_type_to_scan(lruvec, swappiness, &tier); + int type = get_type_to_scan(lruvec, swappiness); for_each_evictable_type(i, swappiness) { int scanned; - - if (tier < 0) - tier = get_tier_idx(lruvec, type); + int tier = get_tier_idx(lruvec, type); *type_scanned = type; @@ -4577,7 +4548,6 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw return scanned; type = !type; - tier = -1; } return 0; -- 2.50.1 From b1a71694fb00c9a7bad788a0b49198eab20621b3 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:36 -0700 Subject: [PATCH 09/16] mm/mglru: rework refault detection With anon and file min_seq being able to move independently, rework workingset protection as well so that the comparison of refaults between anon and file is always on an equal footing. Specifically, make lru_gen_test_recent() return true for refaults happening within the distance of MAX_NR_GENS. 
For example, if min_seq of a type is max_seq-MIN_NR_GENS, refaults from min_seq-1, i.e., max_seq-MIN_NR_GENS-1, are also considered recent, since the distance max_seq-(max_seq-MIN_NR_GENS-1), i.e., MIN_NR_GENS+1 is less than MAX_NR_GENS. As an intermediate step to the final optimization, this change by itself should not have userspace-visiable effects beyond performance. Link: https://lkml.kernel.org/r/20241231043538.4075764-6-yuzhao@google.com Signed-off-by: Yu Zhao Reported-by: Kairui Song Closes: https://lore.kernel.org/CAOUHufahuWcKf5f1Sg3emnqX+cODuR=2TQo7T4Gr-QYLujn4RA@mail.gmail.com/ Tested-by: Kalesh Singh Cc: Barry Song Cc: Bharata B Rao Cc: David Stevens Signed-off-by: Andrew Morton --- mm/workingset.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/workingset.c b/mm/workingset.c index ad181d1b8cf1..2c310c29f51e 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -260,11 +260,11 @@ static void *lru_gen_eviction(struct folio *folio) * Tests if the shadow entry is for a folio that was recently evicted. * Fills in @lruvec, @token, @workingset with the values unpacked from shadow. */ -static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, +static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { int memcg_id; - unsigned long min_seq; + unsigned long max_seq; struct mem_cgroup *memcg; struct pglist_data *pgdat; @@ -273,8 +273,10 @@ static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, memcg = mem_cgroup_from_id(memcg_id); *lruvec = mem_cgroup_lruvec(memcg, pgdat); - min_seq = READ_ONCE((*lruvec)->lrugen.min_seq[file]); - return (*token >> LRU_REFS_WIDTH) == (min_seq & (EVICTION_MASK >> LRU_REFS_WIDTH)); + max_seq = READ_ONCE((*lruvec)->lrugen.max_seq); + max_seq &= EVICTION_MASK >> LRU_REFS_WIDTH; + + return abs_diff(max_seq, *token >> LRU_REFS_WIDTH) < MAX_NR_GENS; } static void lru_gen_refault(struct folio *folio, void *shadow) @@ -290,7 +292,7 @@ static void lru_gen_refault(struct folio *folio, void *shadow) rcu_read_lock(); - recent = lru_gen_test_recent(shadow, type, &lruvec, &token, &workingset); + recent = lru_gen_test_recent(shadow, &lruvec, &token, &workingset); if (lruvec != folio_lruvec(folio)) goto unlock; @@ -331,7 +333,7 @@ static void *lru_gen_eviction(struct folio *folio) return NULL; } -static bool lru_gen_test_recent(void *shadow, bool file, struct lruvec **lruvec, +static bool lru_gen_test_recent(void *shadow, struct lruvec **lruvec, unsigned long *token, bool *workingset) { return false; @@ -432,8 +434,7 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset, bool recent; rcu_read_lock(); - recent = lru_gen_test_recent(shadow, file, &eviction_lruvec, - &eviction, workingset); + recent = lru_gen_test_recent(shadow, &eviction_lruvec, &eviction, workingset); rcu_read_unlock(); return recent; } -- 2.50.1 From 4d5d14a01e2c9091b128fb46e1d07475e9a7bb72 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:37 -0700 Subject: [PATCH 10/16] mm/mglru: rework workingset protection With the aging feedback no longer considering the distribution of folios in each generation, rework workingset protection to better distribute folios across MAX_NR_GENS. This is achieved by reusing PG_workingset and PG_referenced/LRU_REFS_FLAGS in a slightly different way. 
For folios accessed multiple times through file descriptors, make lru_gen_inc_refs() set additional bits of LRU_REFS_WIDTH in folio->flags after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily promoted into the second oldest generation in the eviction path. And when folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that lru_gen_inc_refs() can start over. For this case, LRU_REFS_MASK is only valid when PG_referenced is set. For folios accessed multiple times through page tables, folio_update_gen() from a page table walk or lru_gen_set_refs() from a rmap walk sets PG_referenced after the accessed bit is cleared for the first time. Thereafter, those two paths set PG_workingset and promote folios to the youngest generation. Like folio_inc_gen(), when folio_update_gen() does that, it also clears PG_referenced. For this case, LRU_REFS_MASK is not used. For both of the cases, after PG_workingset is set on a folio, it remains until this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It can be set again if lru_gen_test_recent() returns true upon a refault. When adding folios to the LRU lists, lru_gen_folio_seq() distributes them as follows: +---------------------------------+---------------------------------+ | Accessed thru page tables | Accessed thru file descriptors | +---------------------------------+---------------------------------+ | PG_active (set while isolated) | | +----------------+----------------+----------------+----------------+ | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | +---------------------------------+---------------------------------+ |<--------- MIN_NR_GENS --------->| | |<-------------------------- MAX_NR_GENS -------------------------->| After this patch, some typical client and server workloads showed improvements under heavy memory pressure. 
For example, Python TPC-C, which was used to benchmark a different approach [1] to better detect refault distances, showed a significant decrease in total refaults: Before After Change Time (seconds) 10801 10801 0% Executed (transactions) 41472 43663 +5% workingset_nodes 109070 120244 +10% workingset_refault_anon 5019627 7281831 +45% workingset_refault_file 1294678786 554855564 -57% workingset_refault_total 1299698413 562137395 -57% [1] https://lore.kernel.org/20230920190244.16839-1-ryncsn@gmail.com/ Link: https://lkml.kernel.org/r/20241231043538.4075764-7-yuzhao@google.com Signed-off-by: Yu Zhao Reported-by: Kairui Song Closes: https://lore.kernel.org/CAOUHufahuWcKf5f1Sg3emnqX+cODuR=2TQo7T4Gr-QYLujn4RA@mail.gmail.com/ Tested-by: Kalesh Singh Cc: Barry Song Cc: Bharata B Rao Cc: David Stevens Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 88 ++++++++++++----------- include/linux/mmzone.h | 82 +++++++++++++--------- mm/swap.c | 24 +++---- mm/vmscan.c | 143 ++++++++++++++++++++++---------------- mm/workingset.c | 29 ++++---- 5 files changed, 202 insertions(+), 164 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 34e5097182a0..f9157a0c42a5 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -133,31 +133,25 @@ static inline int lru_hist_from_seq(unsigned long seq) return seq % NR_HIST_GENS; } -static inline int lru_tier_from_refs(int refs) +static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); - /* see the comment in folio_lru_refs() */ - return order_base_2(refs + 1); + /* see the comment on MAX_NR_TIERS */ + return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); - bool workingset = flags & BIT(PG_workingset); + if (!(flags & BIT(PG_referenced))) + return 0; /* - * Return the number of accesses beyond PG_referenced, i.e., N-1 if the - * total number of accesses is N>1, since N=0,1 both map to the first - * tier. lru_tier_from_refs() will account for this off-by-one. Also see - * the comment on MAX_NR_TIERS. + * Return the total number of accesses including PG_referenced. Also see + * the comment on LRU_REFS_FLAGS. 
*/ - return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; -} - -static inline void folio_clear_lru_refs(struct folio *folio) -{ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(struct folio *folio) @@ -223,11 +217,43 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } +static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, + bool reclaiming) +{ + int gen; + int type = folio_is_file_lru(folio); + struct lru_gen_folio *lrugen = &lruvec->lrugen; + + /* + * +-----------------------------------+-----------------------------------+ + * | Accessed through page tables and | Accessed through file descriptors | + * | promoted by folio_update_gen() | and protected by folio_inc_gen() | + * +-----------------------------------+-----------------------------------+ + * | PG_active (set while isolated) | | + * +-----------------+-----------------+-----------------+-----------------+ + * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | + * +-----------------------------------+-----------------------------------+ + * |<---------- MIN_NR_GENS ---------->| | + * |<---------------------------- MAX_NR_GENS ---------------------------->| + */ + if (folio_test_active(folio)) + gen = MIN_NR_GENS - folio_test_workingset(folio); + else if (reclaiming) + gen = MAX_NR_GENS; + else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || + (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio)))) + gen = MIN_NR_GENS; + else + gen = MAX_NR_GENS - folio_test_workingset(folio); + + return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); +} + static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; - unsigned long mask; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -237,40 +263,12 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, if (folio_test_unevictable(folio) || !lrugen->enabled) return false; - /* - * There are four common cases for this page: - * 1. If it's hot, i.e., freshly faulted in, add it to the youngest - * generation, and it's protected over the rest below. - * 2. If it can't be evicted immediately, i.e., a dirty page pending - * writeback, add it to the second youngest generation. - * 3. If it should be evicted first, e.g., cold and clean from - * folio_rotate_reclaimable(), add it to the oldest generation. - * 4. Everything else falls between 2 & 3 above and is added to the - * second oldest generation if it's considered inactive, or the - * oldest generation otherwise. See lru_gen_is_active(). 
- */ - if (folio_test_active(folio)) - seq = lrugen->max_seq; - else if ((type == LRU_GEN_ANON && !folio_test_swapcache(folio)) || - (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio)))) - seq = lrugen->max_seq - 1; - else if (reclaiming || lrugen->min_seq[type] + MIN_NR_GENS >= lrugen->max_seq) - seq = lrugen->min_seq[type]; - else - seq = lrugen->min_seq[type] + 1; + seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - mask = LRU_GEN_MASK; - /* - * Don't clear PG_workingset here because it can affect PSI accounting - * if the activation is due to workingset refault. - */ - if (folio_test_active(folio)) - mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active); - set_mask_bits(&folio->flags, mask, flags); + set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8245ecb0400b..9540b41894da 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -332,66 +332,88 @@ enum lruvec_flags { #endif /* !__GENERATING_BOUNDS_H */ /* - * Evictable pages are divided into multiple generations. The youngest and the + * Evictable folios are divided into multiple generations. The youngest and the * oldest generation numbers, max_seq and min_seq, are monotonically increasing. * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while - * a page is on one of lrugen->folios[]. Otherwise it stores 0. + * a folio is on one of lrugen->folios[]. Otherwise it stores 0. * - * A page is added to the youngest generation on faulting. The aging needs to - * check the accessed bit at least twice before handing this page over to the - * eviction. The first check takes care of the accessed bit set on the initial - * fault; the second check makes sure this page hasn't been used since then. - * This process, AKA second chance, requires a minimum of two generations, - * hence MIN_NR_GENS. And to maintain ABI compatibility with the active/inactive - * LRU, e.g., /proc/vmstat, these two generations are considered active; the - * rest of generations, if they exist, are considered inactive. See - * lru_gen_is_active(). + * After a folio is faulted in, the aging needs to check the accessed bit at + * least twice before handing this folio over to the eviction. The first check + * clears the accessed bit from the initial fault; the second check makes sure + * this folio hasn't been used since then. This process, AKA second chance, + * requires a minimum of two generations, hence MIN_NR_GENS. And to maintain ABI + * compatibility with the active/inactive LRU, e.g., /proc/vmstat, these two + * generations are considered active; the rest of generations, if they exist, + * are considered inactive. See lru_gen_is_active(). * - * PG_active is always cleared while a page is on one of lrugen->folios[] so - * that the aging needs not to worry about it. And it's set again when a page - * considered active is isolated for non-reclaiming purposes, e.g., migration. - * See lru_gen_add_folio() and lru_gen_del_folio(). + * PG_active is always cleared while a folio is on one of lrugen->folios[] so + * that the sliding window needs not to worry about it. 
And it's set again when + * a folio considered active is isolated for non-reclaiming purposes, e.g., + * migration. See lru_gen_add_folio() and lru_gen_del_folio(). * * MAX_NR_GENS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through page tables. This requires order_base_2(MAX_NR_GENS+1) bits - * in folio->flags. + * in folio->flags, masked by LRU_GEN_MASK. */ #define MIN_NR_GENS 2U #define MAX_NR_GENS 4U /* - * Each generation is divided into multiple tiers. A page accessed N times - * through file descriptors is in tier order_base_2(N). A page in the first tier - * (N=0,1) is marked by PG_referenced unless it was faulted in through page - * tables or read ahead. A page in any other tier (N>1) is marked by - * PG_referenced and PG_workingset. This implies a minimum of two tiers is - * supported without using additional bits in folio->flags. + * Each generation is divided into multiple tiers. A folio accessed N times + * through file descriptors is in tier order_base_2(N). A folio in the first + * tier (N=0,1) is marked by PG_referenced unless it was faulted in through page + * tables or read ahead. A folio in the last tier (MAX_NR_TIERS-1) is marked by + * PG_workingset. A folio in any other tier (1flags. * * In contrast to moving across generations which requires the LRU lock, moving * across tiers only involves atomic operations on folio->flags and therefore * has a negligible cost in the buffered access path. In the eviction path, - * comparisons of refaulted/(evicted+protected) from the first tier and the - * rest infer whether pages accessed multiple times through file descriptors - * are statistically hot and thus worth protecting. + * comparisons of refaulted/(evicted+protected) from the first tier and the rest + * infer whether folios accessed multiple times through file descriptors are + * statistically hot and thus worth protecting. * * MAX_NR_TIERS is set to 4 so that the multi-gen LRU can support twice the * number of categories of the active/inactive LRU when keeping track of * accesses through file descriptors. This uses MAX_NR_TIERS-2 spare bits in - * folio->flags. + * folio->flags, masked by LRU_REFS_MASK. */ #define MAX_NR_TIERS 4U #ifndef __GENERATING_BOUNDS_H -struct lruvec; -struct page_vma_mapped_walk; - #define LRU_GEN_MASK ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF) #define LRU_REFS_MASK ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF) +/* + * For folios accessed multiple times through file descriptors, + * lru_gen_inc_refs() sets additional bits of LRU_REFS_WIDTH in folio->flags + * after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all its + * bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is lazily + * promoted into the second oldest generation in the eviction path. And when + * folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that + * lru_gen_inc_refs() can start over. Note that for this case, LRU_REFS_MASK is + * only valid when PG_referenced is set. + * + * For folios accessed multiple times through page tables, folio_update_gen() + * from a page table walk or lru_gen_set_refs() from a rmap walk sets + * PG_referenced after the accessed bit is cleared for the first time. + * Thereafter, those two paths set PG_workingset and promote folios to the + * youngest generation. Like folio_inc_gen(), folio_update_gen() also clears + * PG_referenced. Note that for this case, LRU_REFS_MASK is not used. 
+ * + * For both cases above, after PG_workingset is set on a folio, it remains until + * this folio is either reclaimed, or "deactivated" by lru_gen_clear_refs(). It + * can be set again if lru_gen_test_recent() returns true upon a refault. + */ +#define LRU_REFS_FLAGS (LRU_REFS_MASK | BIT(PG_referenced)) + +struct lruvec; +struct page_vma_mapped_walk; + #ifdef CONFIG_LRU_GEN enum { @@ -406,8 +428,6 @@ enum { NR_LRU_GEN_CAPS }; -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) - #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) diff --git a/mm/swap.c b/mm/swap.c index 649ef7f2b74b..746a5ceba42c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -387,24 +387,20 @@ static void lru_gen_inc_refs(struct folio *folio) if (folio_test_unevictable(folio)) return; + /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) { - folio_set_referenced(folio); - return; - } - - if (!folio_test_workingset(folio)) { - folio_set_workingset(folio); + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); return; } - /* see the comment on MAX_NR_TIERS */ do { - new_flags = old_flags & LRU_REFS_MASK; - if (new_flags == LRU_REFS_MASK) - break; + if ((old_flags & LRU_REFS_MASK) == LRU_REFS_MASK) { + if (!folio_test_workingset(folio)) + folio_set_workingset(folio); + return; + } - new_flags += BIT(LRU_REFS_PGOFF); - new_flags |= old_flags & ~LRU_REFS_MASK; + new_flags = old_flags + BIT(LRU_REFS_PGOFF); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); } @@ -417,7 +413,7 @@ static bool lru_gen_clear_refs(struct folio *folio) if (gen < 0) return true; - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + set_mask_bits(&folio->flags, LRU_REFS_FLAGS | BIT(PG_workingset), 0); lrugen = &folio_lruvec(folio)->lrugen; /* whether can do without shuffling under the LRU lock */ @@ -499,7 +495,7 @@ void folio_add_lru(struct folio *folio) folio_test_unevictable(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - /* see the comment in lru_gen_add_folio() */ + /* see the comment in lru_gen_folio_seq() */ if (lru_gen_enabled() && !folio_test_unevictable(folio) && lru_gen_in_fault() && !(current->flags & PF_MEMALLOC)) folio_set_active(folio); diff --git a/mm/vmscan.c b/mm/vmscan.c index 761298d9c09a..9b8e0a9fc9d5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -862,6 +862,31 @@ enum folio_references { FOLIOREF_ACTIVATE, }; +#ifdef CONFIG_LRU_GEN +/* + * Only used on a mapped folio in the eviction (rmap walk) path, where promotion + * needs to be done by taking the folio off the LRU list and then adding it back + * with PG_active set. In contrast, the aging (page table walk) path uses + * folio_update_gen(). 
+ */ +static bool lru_gen_set_refs(struct folio *folio) +{ + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + return false; + } + + set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset)); + return true; +} +#else +static bool lru_gen_set_refs(struct folio *folio) +{ + return false; +} +#endif /* CONFIG_LRU_GEN */ + static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { @@ -870,7 +895,6 @@ static enum folio_references folio_check_references(struct folio *folio, referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); - referenced_folio = folio_test_clear_referenced(folio); /* * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. @@ -888,6 +912,15 @@ static enum folio_references folio_check_references(struct folio *folio, if (referenced_ptes == -1) return FOLIOREF_KEEP; + if (lru_gen_enabled()) { + if (!referenced_ptes) + return FOLIOREF_RECLAIM; + + return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP; + } + + referenced_folio = folio_test_clear_referenced(folio); + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -1092,11 +1125,6 @@ retry: if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; - /* folio_update_gen() tried to promote this page? */ - if (lru_gen_enabled() && !ignore_references && - folio_mapped(folio) && folio_test_referenced(folio)) - goto keep_locked; - /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing @@ -3159,16 +3187,19 @@ static int folio_update_gen(struct folio *folio, int gen) VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + return -1; + } + do { /* lru_gen_del_folio() has isolated this page? 
*/ - if (!(old_flags & LRU_GEN_MASK)) { - /* for shrink_folio_list() */ - new_flags = old_flags | BIT(PG_referenced); - continue; - } + if (!(old_flags & LRU_GEN_MASK)) + return -1; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); - new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); + new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; @@ -3192,7 +3223,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai new_gen = (old_gen + 1) % MAX_NR_GENS; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; /* for folio_end_writeback() */ if (reclaiming) @@ -3370,9 +3401,11 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, struct pglist_data *pgdat) { - struct folio *folio; + struct folio *folio = pfn_folio(pfn); + + if (folio_lru_gen(folio) < 0) + return NULL; - folio = pfn_folio(pfn); if (folio_nid(folio) != pgdat->node_id) return NULL; @@ -3749,8 +3782,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); - int delta = folio_nr_pages(folio); + bool workingset = folio_test_workingset(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); @@ -3760,8 +3792,14 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) new_gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); - WRITE_ONCE(lrugen->protected[hist][type][tier], - lrugen->protected[hist][type][tier] + delta); + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int tier = lru_tier_from_refs(refs, workingset); + int delta = folio_nr_pages(folio); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } if (!--remaining) return false; @@ -4147,16 +4185,10 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) old_gen = folio_update_gen(folio, new_gen); if (old_gen >= 0 && old_gen != new_gen) update_batch_size(walk, folio, old_gen, new_gen); - - continue; - } - - old_gen = folio_lru_gen(folio); - if (old_gen < 0) - folio_set_referenced(folio); - else if (old_gen != new_gen) { - folio_clear_lru_refs(folio); - folio_activate(folio); + } else if (lru_gen_set_refs(folio)) { + old_gen = folio_lru_gen(folio); + if (old_gen >= 0 && old_gen != new_gen) + folio_activate(folio); } } @@ -4317,7 +4349,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); @@ -4339,14 +4372,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* protected */ - if (tier > 
tier_idx || refs == BIT(LRU_REFS_WIDTH)) { - int hist = lru_hist_from_seq(lrugen->min_seq[type]); - + if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) { gen = folio_inc_gen(lruvec, folio, false); - list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); - WRITE_ONCE(lrugen->protected[hist][type][tier], - lrugen->protected[hist][type][tier] + delta); + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } return true; } @@ -4366,8 +4402,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* waiting for writeback */ - if (folio_test_locked(folio) || writeback || - (type == LRU_GEN_FILE && dirty)) { + if (writeback || (type == LRU_GEN_FILE && dirty)) { gen = folio_inc_gen(lruvec, folio, true); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; @@ -4396,13 +4431,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca return false; } - /* see the comment on MAX_NR_TIERS */ + /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) - folio_clear_lru_refs(folio); + set_mask_bits(&folio->flags, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); - folio_clear_referenced(folio); success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); @@ -4592,31 +4626,24 @@ retry: type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { + DEFINE_MIN_SEQ(lruvec); + if (!folio_evictable(folio)) { list_del(&folio->lru); folio_putback_lru(folio); continue; } - if (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio))) { - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ - if (folio_test_workingset(folio)) - folio_set_referenced(folio); - continue; - } - - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || - folio_mapped(folio) || folio_test_locked(folio) || - folio_test_dirty(folio) || folio_test_writeback(folio)) { - /* don't add rejected folios to the oldest generation */ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, - BIT(PG_active)); + /* retry folios that may have missed folio_rotate_reclaimable() */ + if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) && + !folio_test_dirty(folio) && !folio_test_writeback(folio)) { + list_move(&folio->lru, &clean); continue; } - /* retry folios that may have missed folio_rotate_reclaimable() */ - list_move(&folio->lru, &clean); + /* don't add rejected folios to the oldest generation */ + if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) + set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); diff --git a/mm/workingset.c b/mm/workingset.c index 2c310c29f51e..4841ae8af411 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -239,7 +239,8 @@ static void *lru_gen_eviction(struct folio *folio) int type = folio_is_file_lru(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = 
folio_pgdat(folio); @@ -253,7 +254,7 @@ static void *lru_gen_eviction(struct folio *folio) hist = lru_hist_from_seq(min_seq); atomic_long_add(delta, &lrugen->evicted[hist][type][tier]); - return pack_shadow(mem_cgroup_id(memcg), pgdat, token, refs); + return pack_shadow(mem_cgroup_id(memcg), pgdat, token, workingset); } /* @@ -304,24 +305,20 @@ static void lru_gen_refault(struct folio *folio, void *shadow) lrugen = &lruvec->lrugen; hist = lru_hist_from_seq(READ_ONCE(lrugen->min_seq[type])); - /* see the comment in folio_lru_refs() */ - refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + workingset; - tier = lru_tier_from_refs(refs); + refs = (token & (BIT(LRU_REFS_WIDTH) - 1)) + 1; + tier = lru_tier_from_refs(refs, workingset); atomic_long_add(delta, &lrugen->refaulted[hist][type][tier]); - mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); - /* - * Count the following two cases as stalls: - * 1. For pages accessed through page tables, hotter pages pushed out - * hot pages which refaulted immediately. - * 2. For pages accessed multiple times through file descriptors, - * they would have been protected by sort_folio(). - */ - if (lru_gen_in_fault() || refs >= BIT(LRU_REFS_WIDTH) - 1) { - set_mask_bits(&folio->flags, 0, LRU_REFS_MASK | BIT(PG_workingset)); + /* see folio_add_lru() where folio_set_active() will be called */ + if (lru_gen_in_fault()) + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); + + if (workingset) { + folio_set_workingset(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + type, delta); - } + } else + set_mask_bits(&folio->flags, LRU_REFS_MASK, (refs - 1UL) << LRU_REFS_PGOFF); unlock: rcu_read_unlock(); } -- 2.50.1 From a52dcec56c5b96250f15efbd7de3d3ea6ce863d9 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Mon, 30 Dec 2024 21:35:38 -0700 Subject: [PATCH 11/16] mm/mglru: fix PTE-mapped large folios Count the accessed bits from PTEs mapping the same large folio as one access rather than multiple accesses. The last patch changed how folios accessed through page tables are promoted: rather than getting promoted after the accessed bit is cleared for the first time, a folio only gets promoted thereafter. Counting the accessed bits from the same large folio as multiple accesses can cause that folio to be promoted prematurely, which in turn can cause overprotection of single-use large folios. This patch reduced the sys time of the kernel compilation by 95% CI [2, 5]% on Altra M128-30 with 3GB DRAM, 12GB zram, 16KB THPs and -j32. 
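The per-folio coalescing can be sketched outside the kernel.  The following
minimal C program is illustrative only: the structure and function names are
invented for the example and are not kernel APIs.  It folds consecutive young
PTE records that point at the same folio into a single access, mirroring how
the patch batches per-folio updates through a "last seen folio" pointer:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy stand-in for one PTE: which folio it maps and whether it was young. */
struct pte_ref {
    int folio_id;   /* PTEs mapping the same large folio show up adjacently */
    bool young;
};

/*
 * Count accesses the way the patch does: consecutive young PTEs that map the
 * same folio contribute one access, tracked by remembering the last folio.
 */
static int count_folio_accesses(const struct pte_ref *ptes, size_t nr)
{
    int accesses = 0;
    int last = -1;  /* no folio seen yet */

    for (size_t i = 0; i < nr; i++) {
        if (!ptes[i].young)
            continue;
        if (ptes[i].folio_id != last) {
            accesses++;  /* first young PTE of this folio */
            last = ptes[i].folio_id;
        }
    }
    return accesses;
}

int main(void)
{
    /* A 64KB folio mapped by 16 young PTEs, then a young 4KB folio. */
    struct pte_ref ptes[17];

    for (int i = 0; i < 16; i++)
        ptes[i] = (struct pte_ref){ .folio_id = 1, .young = true };
    ptes[16] = (struct pte_ref){ .folio_id = 2, .young = true };

    /* Prints 2: the large folio counts once, not 16 times. */
    printf("%d\n", count_folio_accesses(ptes, 17));
    return 0;
}

A naive per-PTE count would report 16 accesses for the 64KB folio above; the
coalesced count reports one, which is what keeps single-use large folios from
being promoted prematurely.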
Link: https://lkml.kernel.org/r/20241231043538.4075764-8-yuzhao@google.com Signed-off-by: Yu Zhao Reported-by: Barry Song Tested-by: Kalesh Singh Cc: Bharata B Rao Cc: David Stevens Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/vmscan.c | 108 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 37 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9b8e0a9fc9d5..f5970fdd759c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3423,29 +3423,55 @@ static bool suitable_to_scan(int total, int young) return young * n >= total; } +static void walk_update_folio(struct lru_gen_mm_walk *walk, struct folio *folio, + int new_gen, bool dirty) +{ + int old_gen; + + if (!folio) + return; + + if (dirty && !folio_test_dirty(folio) && + !(folio_test_anon(folio) && folio_test_swapbacked(folio) && + !folio_test_swapcache(folio))) + folio_mark_dirty(folio); + + if (walk) { + old_gen = folio_update_gen(folio, new_gen); + if (old_gen >= 0 && old_gen != new_gen) + update_batch_size(walk, folio, old_gen, new_gen); + } else if (lru_gen_set_refs(folio)) { + old_gen = folio_lru_gen(folio); + if (old_gen >= 0 && old_gen != new_gen) + folio_activate(folio); + } +} + static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct mm_walk *args) { int i; + bool dirty; pte_t *pte; spinlock_t *ptl; unsigned long addr; int total = 0; int young = 0; + struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); pmd_t pmdval; - pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, - &ptl); + pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, &ptl); if (!pte) return false; + if (!spin_trylock(ptl)) { pte_unmap(pte); - return false; + return true; } if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { @@ -3474,19 +3500,23 @@ restart: if (!ptep_clear_young_notify(args->vma, addr, pte + i)) continue; - young++; - walk->mm_stats[MM_LEAF_YOUNG]++; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); - if (pte_dirty(ptent) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); + last = folio; + dirty = false; + } - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); + if (pte_dirty(ptent)) + dirty = true; + + young++; + walk->mm_stats[MM_LEAF_YOUNG]++; } + walk_update_folio(walk, last, gen, dirty); + last = NULL; + if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end)) goto restart; @@ -3500,13 +3530,15 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area struct mm_walk *args, unsigned long *bitmap, unsigned long *first) { int i; + bool dirty; pmd_t *pmd; spinlock_t *ptl; + struct folio *last = NULL; struct lru_gen_mm_walk *walk = args->private; struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec); struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); VM_WARN_ON_ONCE(pud_leaf(*pud)); @@ -3559,20 +3591,23 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area if 
(!pmdp_clear_young_notify(vma, addr, pmd + i)) goto next; - walk->mm_stats[MM_LEAF_YOUNG]++; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); - if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); + last = folio; + dirty = false; + } - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); + if (pmd_dirty(pmd[i])) + dirty = true; + + walk->mm_stats[MM_LEAF_YOUNG]++; next: i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1; } while (i <= MIN_LRU_BATCH); + walk_update_folio(walk, last, gen, dirty); + arch_leave_lazy_mmu_mode(); spin_unlock(ptl); done: @@ -4107,9 +4142,11 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) { int i; + bool dirty; unsigned long start; unsigned long end; struct lru_gen_mm_walk *walk; + struct folio *last = NULL; int young = 1; pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; @@ -4120,7 +4157,7 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); struct lru_gen_mm_state *mm_state = get_mm_state(lruvec); DEFINE_MAX_SEQ(lruvec); - int old_gen, new_gen = lru_gen_from_seq(max_seq); + int gen = lru_gen_from_seq(max_seq); lockdep_assert_held(pvmw->ptl); VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio); @@ -4174,24 +4211,21 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (!ptep_clear_young_notify(vma, addr, pte + i)) continue; - young++; + if (last != folio) { + walk_update_folio(walk, last, gen, dirty); - if (pte_dirty(ptent) && !folio_test_dirty(folio) && - !(folio_test_anon(folio) && folio_test_swapbacked(folio) && - !folio_test_swapcache(folio))) - folio_mark_dirty(folio); - - if (walk) { - old_gen = folio_update_gen(folio, new_gen); - if (old_gen >= 0 && old_gen != new_gen) - update_batch_size(walk, folio, old_gen, new_gen); - } else if (lru_gen_set_refs(folio)) { - old_gen = folio_lru_gen(folio); - if (old_gen >= 0 && old_gen != new_gen) - folio_activate(folio); + last = folio; + dirty = false; } + + if (pte_dirty(ptent)) + dirty = true; + + young++; } + walk_update_folio(walk, last, gen, dirty); + arch_leave_lazy_mmu_mode(); /* feedback from rmap walkers to page table walkers */ -- 2.50.1 From d670c8e5302af8ccdf5f12242e58816420738bb5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 9 Jan 2025 15:22:21 +0000 Subject: [PATCH 12/16] mm: remove PageTransTail() The last caller was removed in October. Also remove the FALSE definition of PageTransCompoundMap(); the normal definition was removed a few years ago. 
Link: https://lkml.kernel.org/r/20250109152245.1591914-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Acked-by: Zi Yan Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 691506bdf2c5..330929b6e062 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -894,21 +894,9 @@ static inline int PageTransCompound(const struct page *page) { return PageCompound(page); } - -/* - * PageTransTail returns true for both transparent huge pages - * and hugetlbfs pages, so it should only be called when it's known - * that hugetlbfs pages aren't involved. - */ -static inline int PageTransTail(const struct page *page) -{ - return PageTail(page); -} #else TESTPAGEFLAG_FALSE(TransHuge, transhuge) TESTPAGEFLAG_FALSE(TransCompound, transcompound) -TESTPAGEFLAG_FALSE(TransCompoundMap, transcompoundmap) -TESTPAGEFLAG_FALSE(TransTail, transtail) #endif #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_TRANSPARENT_HUGEPAGE) -- 2.50.1 From 82047ae18446c2c5e97a9bf114c5c871212a656e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Jan 2025 10:52:28 -0800 Subject: [PATCH 13/16] Docs/mm/damon/design: add monitoring parameters tuning guide Patch series "Docs/mm/damon: add tuning guide and misc updates". Add DAMON monitoring parameters tuning guide (patches 1 and 2), with misc documentation fixes (patch 3), updates (patch 4) and clarifications (patch 5). This patch (of 5): DAMON monitoring parameters including sampling and aggregation intervals should be tuned for given workloads. However, the fact is not explicitly documented. Also there is no official guide to help the tuning. This apparently confused a number of people[1] at best, or made people forgive DAMON without tuning. Add a guide on the design document. [1] https://lore.kernel.org/20241202175459.2005526-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250110185232.54907-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250110185232.54907-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Honggyu Kim Cc: Yunjeong Mun Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 48 +++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 667775bab86c..dd7e0f63a69a 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -203,6 +203,8 @@ This scheme, however, cannot preserve the quality of the output if the assumption is not guaranteed. +.. _damon_design_adaptive_regions_adjustment: + Adaptive Regions Adjustment ~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -264,6 +266,52 @@ tracepoints. For more details, please refer to the documentations for respectively. +Monitoring Parameters Tuning Guide +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In short, set ``aggregation interval`` to capture meaningful amount of accesses +for the purpose. The amount of accesses can be measured using ``nr_accesses`` +and ``age`` of regions in the aggregated monitoring results snapshot. The +default value of the interval, ``100ms``, turns out to be too short in many +cases. Set ``sampling interval`` proportional to ``aggregation interval``. By +default, ``1/20`` is recommended as the ratio. + +``Aggregation interval`` should be set as the time interval that the workload +can make an amount of accesses for the monitoring purpose, within the interval. 
+If the interval is too short, only a small number of accesses are captured.
+As a result, the monitoring results make it look as if everything is accessed
+equally rarely.  For many purposes, that would be useless.  If the interval is
+too long, however, the time to converge regions with the :ref:`regions
+adjustment mechanism <damon_design_adaptive_regions_adjustment>` can be too
+long, depending on the time scale of the given purpose.  This could happen if
+the workload actually makes only rare accesses but the user has set the target
+amount of accesses for the monitoring purpose too high.  For such cases, the
+target amount of access to capture per ``aggregation interval`` should be
+carefully reconsidered.  Also, note that the captured amount of accesses is
+represented not only with ``nr_accesses``, but also with ``age``.  For example,
+even if every region in the monitoring results shows zero ``nr_accesses``,
+regions could still be distinguished using ``age`` values as the recency
+information.
+
+Hence the optimum value of ``aggregation interval`` depends on the access
+intensiveness of the workload.  The user should tune the interval based on the
+amount of access captured in each aggregated snapshot of the monitoring
+results.
+
+Note that the default value of the interval is 100 milliseconds, which is too
+short in many cases, especially on large systems.
+
+``Sampling interval`` defines the resolution of each aggregation.  If it is set
+too large, monitoring results will look like every region was equally rarely
+accessed, or equally frequently accessed.  That is, regions become
+indistinguishable based on access pattern, and therefore the results will be
+useless in many use cases.  If ``sampling interval`` is too small, it will not
+degrade the resolution, but will increase the monitoring overhead.  If it is
+appropriate enough to provide a resolution of the monitoring results that is
+sufficient for the given purpose, it shouldn't be unnecessarily further
+lowered.  It is recommended to be set proportional to ``aggregation interval``.
+By default, the ratio is set as ``1/20``, and it is still recommended.
+
+
 .. _damon_design_damos:
 
 Operation Schemes
-- 
2.50.1

From a96f9e2773630967c7c5b3d364263784a67b5daf Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Fri, 10 Jan 2025 10:52:29 -0800
Subject: [PATCH 14/16] Docs/mm/damon: add an example monitoring intervals tuning

Add a DAMON monitoring intervals tuning example that contains output from
a demonstration of the guide on a real server workload system.  The
example with real-world numbers will help users better understand the
guide instructions and what outputs they can expect and verify.  That
will in turn help find room for improvement in the guide.

Link: https://lkml.kernel.org/r/20250110185232.54907-3-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Honggyu Kim
Cc: Jonathan Corbet
Cc: Yunjeong Mun
Signed-off-by: Andrew Morton
---
 Documentation/mm/damon/design.rst             |   9 +
 .../monitoring_intervals_tuning_example.rst   | 247 ++++++++++++++++++
 2 files changed, 256 insertions(+)
 create mode 100644 Documentation/mm/damon/monitoring_intervals_tuning_example.rst

diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index dd7e0f63a69a..e28c6a1b40ae 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -266,6 +266,8 @@ tracepoints.  For more details, please refer to the documentations for
 respectively.
 
 
+.. _damon_design_monitoring_params_tuning_guide:
+
 Monitoring Parameters Tuning Guide
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -311,6 +313,13 @@ sufficient for the given purpose, it shouldn't be unnecessarily further
 lowered.  It is recommended to be set proportional to ``aggregation interval``.
 By default, the ratio is set as ``1/20``, and it is still recommended.
 
+Refer to the below document for an example tuning based on the above guide.
+
+.. toctree::
+   :maxdepth: 1
+
+   monitoring_intervals_tuning_example
+
 
 .. _damon_design_damos:

diff --git a/Documentation/mm/damon/monitoring_intervals_tuning_example.rst b/Documentation/mm/damon/monitoring_intervals_tuning_example.rst
new file mode 100644
index 000000000000..334a854efb40
--- /dev/null
+++ b/Documentation/mm/damon/monitoring_intervals_tuning_example.rst
@@ -0,0 +1,247 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+====================================================
+DAMON Monitoring Interval Parameters Tuning Example
+====================================================
+
+DAMON's monitoring parameters need tuning based on the given workload and the
+monitoring purpose.  There is a :ref:`tuning guide
+<damon_design_monitoring_params_tuning_guide>` for that.  This document
+provides an example tuning based on the guide.
+
+Setup
+=====
+
+For the below example, DAMON of Linux kernel v6.11 and `damo
+`_ (DAMON user-space tool) v2.5.9 were used to
+monitor and visualize access patterns on the physical address space of a system
+running a real-world server workload.
+
+5ms/100ms intervals: Too Short Interval
+=======================================
+
+Let's start by capturing the access pattern snapshot on the physical address
+space of the system using DAMON, with the default interval parameters (5
+milliseconds and 100 milliseconds for the sampling and the aggregation
+intervals, respectively).  Wait ten minutes between the start of DAMON and
+the capturing of the snapshot, to show meaningful time-wise access patterns.
+::
+
+    # damo start
+    # sleep 600
+    # damo record --snapshot 0 1
+    # damo stop
+
+Then, list the DAMON-found regions of different access patterns, sorted by the
+"access temperature".  "Access temperature" is a metric representing the
+access-hotness of a region.  It is calculated as a weighted sum of the access
+frequency and the age of the region.  If the access frequency is 0 %, the
+temperature is multiplied by minus one.  That is, if a region is not accessed,
+it gets a minus temperature, which gets lower the longer the region stays
+unaccessed.  The sorting is in temperature-ascending order, so the region at
+the top of the list is the coldest, and the one at the bottom is the hottest
+one. ::
+
+    # damo report access --sort_regions_by temperature
+    0 addr 16.052 GiB size 5.985 GiB access 0 % age 5.900 s # coldest
+    1 addr 22.037 GiB size 6.029 GiB access 0 % age 5.300 s
+    2 addr 28.065 GiB size 6.045 GiB access 0 % age 5.200 s
+    3 addr 10.069 GiB size 5.983 GiB access 0 % age 4.500 s
+    4 addr 4.000 GiB size 6.069 GiB access 0 % age 4.400 s
+    5 addr 62.008 GiB size 3.992 GiB access 0 % age 3.700 s
+    6 addr 56.795 GiB size 5.213 GiB access 0 % age 3.300 s
+    7 addr 39.393 GiB size 6.096 GiB access 0 % age 2.800 s
+    8 addr 50.782 GiB size 6.012 GiB access 0 % age 2.800 s
+    9 addr 34.111 GiB size 5.282 GiB access 0 % age 2.300 s
+    10 addr 45.489 GiB size 5.293 GiB access 0 % age 1.800 s # hottest
+    total size: 62.000 GiB
+
+The list shows no seemingly hot regions, and only minimal access pattern
+diversity.  Every region has zero access frequency.
+The number of regions is 10, which is the default ``min_nr_regions`` value.
+Size of each region is also nearly identical.  We can suspect this is because
+the “adaptive regions adjustment” mechanism was not working well.  As the
+guide suggested, we can get the relative hotness of regions using ``age`` as
+the recency information.  That would be better than nothing, but given the
+fact that the longest age is only about 6 seconds while we waited about ten
+minutes, it is unclear how useful this will be.
+
+The histogram visualization of the results, which maps temperature ranges to
+the total size of the regions in each range, also shows no interesting
+distribution pattern. ::
+
+    # damo report access --style temperature-sz-hist
+
+    [-,590,000,000, -,549,000,000) 5.985 GiB |********** |
+    [-,549,000,000, -,508,000,000) 12.074 GiB |********************|
+    [-,508,000,000, -,467,000,000) 0 B | |
+    [-,467,000,000, -,426,000,000) 12.052 GiB |********************|
+    [-,426,000,000, -,385,000,000) 0 B | |
+    [-,385,000,000, -,344,000,000) 3.992 GiB |******* |
+    [-,344,000,000, -,303,000,000) 5.213 GiB |********* |
+    [-,303,000,000, -,262,000,000) 12.109 GiB |********************|
+    [-,262,000,000, -,221,000,000) 5.282 GiB |********* |
+    [-,221,000,000, -,180,000,000) 0 B | |
+    [-,180,000,000, -,139,000,000) 5.293 GiB |********* |
+    total size: 62.000 GiB
+
+In short, the parameters provide poor-quality monitoring results for hot
+regions detection.  According to the :ref:`guide
+<damon_design_monitoring_params_tuning_guide>`, this is due to the too short
+aggregation interval.
+
+100ms/2s intervals: Starts Showing Small Hot Regions
+====================================================
+
+Following the guide, increase the intervals 20 times (100 milliseconds and 2
+seconds for sampling and aggregation intervals, respectively). ::
+
+    # damo start -s 100ms -a 2s
+    # sleep 600
+    # damo record --snapshot 0 1
+    # damo stop
+    # damo report access --sort_regions_by temperature
+    0 addr 10.180 GiB size 6.117 GiB access 0 % age 7 m 8 s # coldest
+    1 addr 49.275 GiB size 6.195 GiB access 0 % age 6 m 14 s
+    2 addr 62.421 GiB size 3.579 GiB access 0 % age 6 m 4 s
+    3 addr 40.154 GiB size 6.127 GiB access 0 % age 5 m 40 s
+    4 addr 16.296 GiB size 6.182 GiB access 0 % age 5 m 32 s
+    5 addr 34.254 GiB size 5.899 GiB access 0 % age 5 m 24 s
+    6 addr 46.281 GiB size 2.995 GiB access 0 % age 5 m 20 s
+    7 addr 28.420 GiB size 5.835 GiB access 0 % age 5 m 6 s
+    8 addr 4.000 GiB size 6.180 GiB access 0 % age 4 m 16 s
+    9 addr 22.478 GiB size 5.942 GiB access 0 % age 3 m 58 s
+    10 addr 55.470 GiB size 915.645 MiB access 0 % age 3 m 6 s
+    11 addr 56.364 GiB size 6.056 GiB access 0 % age 2 m 8 s
+    12 addr 56.364 GiB size 4.000 KiB access 95 % age 16 s
+    13 addr 49.275 GiB size 4.000 KiB access 100 % age 8 m 24 s # hottest
+    total size: 62.000 GiB
+    # damo report access --style temperature-sz-hist
+
+    [-42,800,000,000, -33,479,999,000) 22.018 GiB |***************** |
+    [-33,479,999,000, -24,159,998,000) 27.090 GiB |********************|
+    [-24,159,998,000, -14,839,997,000) 6.836 GiB |****** |
+    [-14,839,997,000, -5,519,996,000) 6.056 GiB |***** |
+    [-5,519,996,000, 3,800,005,000) 4.000 KiB |* |
+    [3,800,005,000, 13,120,006,000) 0 B | |
+    [13,120,006,000, 22,440,007,000) 0 B | |
+    [22,440,007,000, 31,760,008,000) 0 B | |
+    [31,760,008,000, 41,080,009,000) 0 B | |
+    [41,080,009,000, 50,400,010,000) 0 B | |
+    [50,400,010,000, 59,720,011,000) 4.000 KiB |* |
+    total size: 62.000 GiB
+
+DAMON found two distinct 4 KiB regions that are pretty hot.  The regions are
+also well aged.
The hottest 4 KiB region was keeping the access frequency for about +8 minutes, and the coldest region was keeping no access for about 7 minutes. +The distribution on the histogram also looks like having a pattern. + +Especially, the finding of the 4 KiB regions among the 62 GiB total memory +shows DAMON’s adaptive regions adjustment is working as designed. + +Still the number of regions is close to the ``min_nr_regions``, and sizes of +cold regions are similar, though. Apparently it is improved, but it still has +rooms to improve. + +400ms/8s intervals: Pretty Improved Results +=========================================== + +Increase the intervals four times (400 milliseconds and 8 seconds +for sampling and aggregation intervals, respectively). :: + + # damo start -s 400ms -a 8s + # sleep 600 + # damo record --snapshot 0 1 + # damo stop + # damo report access --sort_regions_by temperature + 0 addr 64.492 GiB size 1.508 GiB access 0 % age 6 m 48 s # coldest + 1 addr 21.749 GiB size 5.674 GiB access 0 % age 6 m 8 s + 2 addr 27.422 GiB size 5.801 GiB access 0 % age 6 m + 3 addr 49.431 GiB size 8.675 GiB access 0 % age 5 m 28 s + 4 addr 33.223 GiB size 5.645 GiB access 0 % age 5 m 12 s + 5 addr 58.321 GiB size 6.170 GiB access 0 % age 5 m 4 s + [...] + 25 addr 6.615 GiB size 297.531 MiB access 15 % age 0 ns + 26 addr 9.513 GiB size 12.000 KiB access 20 % age 0 ns + 27 addr 9.511 GiB size 108.000 KiB access 25 % age 0 ns + 28 addr 9.513 GiB size 20.000 KiB access 25 % age 0 ns + 29 addr 9.511 GiB size 12.000 KiB access 30 % age 0 ns + 30 addr 9.520 GiB size 4.000 KiB access 40 % age 0 ns + [...] + 41 addr 9.520 GiB size 4.000 KiB access 80 % age 56 s + 42 addr 9.511 GiB size 12.000 KiB access 100 % age 6 m 16 s + 43 addr 58.321 GiB size 4.000 KiB access 100 % age 6 m 24 s + 44 addr 9.512 GiB size 4.000 KiB access 100 % age 6 m 48 s + 45 addr 58.106 GiB size 4.000 KiB access 100 % age 6 m 48 s # hottest + total size: 62.000 GiB + # damo report access --style temperature-sz-hist + + [-40,800,000,000, -32,639,999,000) 21.657 GiB |********************| + [-32,639,999,000, -24,479,998,000) 17.938 GiB |***************** | + [-24,479,998,000, -16,319,997,000) 16.885 GiB |**************** | + [-16,319,997,000, -8,159,996,000) 586.879 MiB |* | + [-8,159,996,000, 5,000) 4.946 GiB |***** | + [5,000, 8,160,006,000) 260.000 KiB |* | + [8,160,006,000, 16,320,007,000) 0 B | | + [16,320,007,000, 24,480,008,000) 0 B | | + [24,480,008,000, 32,640,009,000) 0 B | | + [32,640,009,000, 40,800,010,000) 16.000 KiB |* | + [40,800,010,000, 48,960,011,000) 8.000 KiB |* | + total size: 62.000 GiB + +The number of regions having different access patterns has significantly +increased. Size of each region is also more varied. Total size of non-zero +access frequency regions is also significantly increased. Maybe this is already +good enough to make some meaningful memory management efficieny changes. + +800ms/16s intervals: Another bias +================================= + +Further double the intervals (800 milliseconds and 16 seconds for sampling +and aggregation intervals, respectively). The results is more improved for the +hot regions detection, but starts looking degrading cold regions detection. 
:: + + # damo start -s 800ms -a 16s + # sleep 600 + # damo record --snapshot 0 1 + # damo stop + # damo report access --sort_regions_by temperature + 0 addr 64.781 GiB size 1.219 GiB access 0 % age 4 m 48 s + 1 addr 24.505 GiB size 2.475 GiB access 0 % age 4 m 16 s + 2 addr 26.980 GiB size 504.273 MiB access 0 % age 4 m + 3 addr 29.443 GiB size 2.462 GiB access 0 % age 4 m + 4 addr 37.264 GiB size 5.645 GiB access 0 % age 4 m + 5 addr 31.905 GiB size 5.359 GiB access 0 % age 3 m 44 s + [...] + 20 addr 8.711 GiB size 40.000 KiB access 5 % age 2 m 40 s + 21 addr 27.473 GiB size 1.970 GiB access 5 % age 4 m + 22 addr 48.185 GiB size 4.625 GiB access 5 % age 4 m + 23 addr 47.304 GiB size 902.117 MiB access 10 % age 4 m + 24 addr 8.711 GiB size 4.000 KiB access 100 % age 4 m + 25 addr 20.793 GiB size 3.713 GiB access 5 % age 4 m 16 s + 26 addr 8.773 GiB size 4.000 KiB access 100 % age 4 m 16 s + total size: 62.000 GiB + # damo report access --style temperature-sz-hist + + [-28,800,000,000, -23,359,999,000) 12.294 GiB |***************** | + [-23,359,999,000, -17,919,998,000) 9.753 GiB |************* | + [-17,919,998,000, -12,479,997,000) 15.131 GiB |********************| + [-12,479,997,000, -7,039,996,000) 0 B | | + [-7,039,996,000, -1,599,995,000) 7.506 GiB |********** | + [-1,599,995,000, 3,840,006,000) 6.127 GiB |********* | + [3,840,006,000, 9,280,007,000) 0 B | | + [9,280,007,000, 14,720,008,000) 136.000 KiB |* | + [14,720,008,000, 20,160,009,000) 40.000 KiB |* | + [20,160,009,000, 25,600,010,000) 11.188 GiB |*************** | + [25,600,010,000, 31,040,011,000) 4.000 KiB |* | + total size: 62.000 GiB + +It found more non-zero access frequency regions. The number of regions is still +much higher than the ``min_nr_regions``, but it is reduced from that of the +previous setup. And apparently the distribution seems bit biased to hot +regions. + +Conclusion +========== + +With the above experimental tuning results, we can conclude the theory and the +guide makes sense to at least this workload, and could be applied to similar +cases. -- 2.50.1 From eb14b12c8c22fb8bcc72b8f02889a02cf3577590 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 10 Jan 2025 10:52:30 -0800 Subject: [PATCH 15/16] Docs/admin-guide/mm/damon/usage: fix and add missing DAMOS filter sysfs files on files hierarchy DAMOS filter directory part of DAMON sysfs files hierarchy on the usage document is wrong. 'memcg_path' file under the directory is wrongly written as 'memcg_id'. Also the directory has 'addr_start', 'addr_end', and 'target_idx' files, but the list is missing those. Fix the wrong name and add missing files. Link: https://lkml.kernel.org/r/20250110185232.54907-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Honggyu Kim Cc: Jonathan Corbet Cc: Yunjeong Mun Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index f0d0c20711d6..47a44bd348ab 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -83,7 +83,7 @@ comma (","). 
 │ │ │ │ │ │ │ │ │ 0/target_metric,target_value,current_value
 │ │ │ │ │ │ │ :ref:`watermarks `/metric,interval_us,high,mid,low
 │ │ │ │ │ │ │ :ref:`filters `/nr_filters
-│ │ │ │ │ │ │ │ 0/type,matching,memcg_id,allow
+│ │ │ │ │ │ │ │ 0/type,matching,allow,memcg_path,addr_start,addr_end,target_idx
 │ │ │ │ │ │ │ :ref:`stats `/nr_tried,sz_tried,nr_applied,sz_applied,sz_ops_filter_passed,qt_exceeds
 │ │ │ │ │ │ │ :ref:`tried_regions `/total_bytes
 │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age,sz_filter_passed
-- 
2.50.1


From d24393450b8300ca6ec55db9cf4e498123e55b4c Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Fri, 10 Jan 2025 10:52:31 -0800
Subject: [PATCH 16/16] Docs/admin-guide/mm/damon/start: update snapshot example

Two of the DAMON user-space tool (damo) commands that are used for examples
in the DAMON getting started document, namely 'damo show' and 'damo report
heats', are deprecated [1,2] and replaced by new commands that provide the
same functions with unified and simplified user interfaces.  Also, the
example output of 'damo show' is outdated.  The 'damo schemes' command is not
deprecated, but users are recommended to use 'damo start' or 'damo tune'
instead.

Update the examples to use the replacements, recommendations, and up-to-date
output formats.

[1] https://git.kernel.org/sj/damo/c/3272e0ac94ecc5e1
[2] https://git.kernel.org/sj/damo/c/da3ec66bbdd9e87d

Link: https://lkml.kernel.org/r/20250110185232.54907-5-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Honggyu Kim
Cc: Jonathan Corbet
Cc: Yunjeong Mun
Signed-off-by: Andrew Morton
---
 Documentation/admin-guide/mm/damon/start.rst | 67 ++++++++++++--------
 1 file changed, 40 insertions(+), 27 deletions(-)

diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst
index c4dddf6733cd..ede14b679d02 100644
--- a/Documentation/admin-guide/mm/damon/start.rst
+++ b/Documentation/admin-guide/mm/damon/start.rst
@@ -42,32 +42,45 @@ the execution.
 ::

     $ git clone https://github.com/sjp38/masim; cd masim; make
     $ sudo damo start "./masim ./configs/stairs.cfg --quiet"
-    $ sudo ./damo show
-    0   addr [85.541 TiB , 85.541 TiB )  (57.707 MiB )  access 0 %   age 10.400 s
-    1   addr [85.541 TiB , 85.542 TiB )  (413.285 MiB)  access 0 %   age 11.400 s
-    2   addr [127.649 TiB , 127.649 TiB) (57.500 MiB )  access 0 %   age 1.600 s
-    3   addr [127.649 TiB , 127.649 TiB) (32.500 MiB )  access 0 %   age 500 ms
-    4   addr [127.649 TiB , 127.649 TiB) (9.535 MiB )   access 100 % age 300 ms
-    5   addr [127.649 TiB , 127.649 TiB) (8.000 KiB )   access 60 %  age 0 ns
-    6   addr [127.649 TiB , 127.649 TiB) (6.926 MiB )   access 0 %   age 1 s
-    7   addr [127.998 TiB , 127.998 TiB) (120.000 KiB)  access 0 %   age 11.100 s
-    8   addr [127.998 TiB , 127.998 TiB) (8.000 KiB )   access 40 %  age 100 ms
-    9   addr [127.998 TiB , 127.998 TiB) (4.000 KiB )   access 0 %   age 11 s
-    total size: 577.590 MiB
-    $ sudo ./damo stop
+    $ sudo damo report access
+    heatmap: 641111111000000000000000000000000000000000000000000000[...]33333333333333335557984444[...]7
+    # min/max temperatures: -1,840,000,000, 370,010,000, column size: 3.925 MiB
+    0   addr 86.182 TiB   size 8.000 KiB    access 0 %   age 14.900 s
+    1   addr 86.182 TiB   size 8.000 KiB    access 60 %  age 0 ns
+    2   addr 86.182 TiB   size 3.422 MiB    access 0 %   age 4.100 s
+    3   addr 86.182 TiB   size 2.004 MiB    access 95 %  age 2.200 s
+    4   addr 86.182 TiB   size 29.688 MiB   access 0 %   age 14.100 s
+    5   addr 86.182 TiB   size 29.516 MiB   access 0 %   age 16.700 s
+    6   addr 86.182 TiB   size 29.633 MiB   access 0 %   age 17.900 s
+    7   addr 86.182 TiB   size 117.652 MiB  access 0 %   age 18.400 s
+    8   addr 126.990 TiB  size 62.332 MiB   access 0 %   age 9.500 s
+    9   addr 126.990 TiB  size 13.980 MiB   access 0 %   age 5.200 s
+    10  addr 126.990 TiB  size 9.539 MiB    access 100 % age 3.700 s
+    11  addr 126.990 TiB  size 16.098 MiB   access 0 %   age 6.400 s
+    12  addr 127.987 TiB  size 132.000 KiB  access 0 %   age 2.900 s
+    total size: 314.008 MiB
+    $ sudo damo stop

 The first command of the above example downloads and builds an artificial
 memory access generator program called ``masim``. The second command asks DAMO
-to execute the artificial generator process start via the given command and
-make DAMON monitors the generator process. The third command retrieves the
-current snapshot of the monitored access pattern of the process from DAMON and
-shows the pattern in a human readable format.
-
-Each line of the output shows which virtual address range (``addr [XX, XX)``)
-of the process is how frequently (``access XX %``) accessed for how long time
-(``age XX``). For example, the fifth region of ~9 MiB size is being most
-frequently accessed for last 300 milliseconds. Finally, the fourth command
-stops DAMON.
+to start the program via the given command and make DAMON monitor the newly
+started process. The third command retrieves the current snapshot of the
+monitored access pattern of the process from DAMON and shows the pattern in a
+human readable format.
+
+The first line of the output shows the relative access temperature (hotness)
+of the regions in a single-row heatmap format. Each column on the heatmap
+represents regions of the same size on the monitored virtual address space.
+The position of the column on the row and the number on the column represent
+the relative location and the access temperature of the region. ``[...]``
+means unmapped huge regions on the virtual address spaces. The second line
+shows additional information for better understanding of the heatmap.
+
+Each line of the output from the third line shows which virtual address range
+(``addr XX size XX``) of the process is how frequently (``access XX %``)
+accessed for how long (``age XX``). For example, the eleventh region of
+~9.5 MiB size is being most frequently accessed for the last 3.7 seconds.
+Finally, the fourth command stops DAMON.

 Note that DAMON can monitor not only virtual address spaces but multiple types
 of address spaces including the physical address space.
@@ -95,7 +108,7 @@ Visualizing Recorded Patterns
 You can visualize the pattern in a heatmap, showing which memory region
 (x-axis) got accessed when (y-axis) and how frequently (number).::

-    $ sudo damo report heats --heatmap stdout
+    $ sudo damo report heatmap
     22222222222222222222222222222222222222211111111111111111111111111111111111111100
     44444444444444444444444444444444444444434444444444444444444444444444444444443200
     44444444444444444444444444444444444444433444444444444444444444444444444444444200
     [...]
@@ -160,6 +173,6 @@ Data Access Pattern Aware Memory Management
 Below command makes every memory region of size >=4K that has not accessed for
 >=60 seconds in your workload to be swapped out. ::

-    $ sudo damo schemes --damos_access_rate 0 0 --damos_sz_region 4K max \
-        --damos_age 60s max --damos_action pageout \
-
+    $ sudo damo start --damos_access_rate 0 0 --damos_sz_region 4K max \
+        --damos_age 60s max --damos_action pageout \
+
-- 
2.50.1
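
Editor's note: as a quick way to try the above pieces together, the sketch
below combines the 400ms/8s intervals from the tuning example with the DAMOS
pageout options from the getting-started document. It only reuses flags that
appear in the patches above; combining them in a single 'damo start'
invocation, and the 10-minute wait, are assumptions of this sketch rather
than something the patches state. ::

    # Assumed combination: tuned intervals plus the pageout scheme in one run.
    $ sudo damo start -s 400ms -a 8s \
          --damos_access_rate 0 0 --damos_sz_region 4K max \
          --damos_age 60s max --damos_action pageout
    $ sleep 600
    # Take a snapshot and inspect the resulting access pattern.
    $ sudo damo record --snapshot 0 1
    $ sudo damo report access --sort_regions_by temperature
    $ sudo damo report access --style temperature-sz-hist
    $ sudo damo stop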