From f8780515fe914ac03189213c7e485264d65e2ece Mon Sep 17 00:00:00 2001
From: MengEn Sun
Date: Thu, 10 Oct 2024 20:09:36 +0800
Subject: [PATCH 01/16] mm: add pcp high_min high_max to proc zoneinfo

When percpu_pagelist_high_fraction is not set, the kernel computes the
pcp high_min/max by itself, which makes it hard to determine the current
high_min/max values. So output the pcp high_min/max values to
/proc/zoneinfo.

Link: https://lkml.kernel.org/r/20241010120935.656619-1-mengensun@tencent.com
Signed-off-by: MengEn Sun
Reviewed-by: Jinliang Zheng
Signed-off-by: Andrew Morton
---
 mm/vmstat.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index b5a4cea423e1..1917c034c045 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1791,13 +1791,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
 		seq_printf(m,
 			   "\n  cpu: %i"
-			   "\n    count: %i"
-			   "\n    high:  %i"
-			   "\n    batch: %i",
+			   "\n    count:    %i"
+			   "\n    high:     %i"
+			   "\n    batch:    %i"
+			   "\n    high_min: %i"
+			   "\n    high_max: %i",
 			   i,
 			   pcp->count,
 			   pcp->high,
-			   pcp->batch);
+			   pcp->batch,
+			   pcp->high_min,
+			   pcp->high_max);
 #ifdef CONFIG_SMP
 		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
 		seq_printf(m, "\n  vm stats threshold: %d",
--
2.50.1


From 6359c39c9de66dede8ff5ff257c9e117483dbc7c Mon Sep 17 00:00:00 2001
From: Kefeng Wang
Date: Thu, 10 Oct 2024 14:15:56 +0800
Subject: [PATCH 02/16] mm: remove unused hugepage for vma_alloc_folio()

The hugepage parameter has been deprecated since commit ddc1a5cbc05d
("mempolicy: alloc_pages_mpol() for NUMA policy without vma"); for
PMD-sized THP, vma_alloc_folio() still tries only the preferred node if
possible, by checking the order of the folio allocation.

Link: https://lkml.kernel.org/r/20241010061556.1846751-1-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang
Acked-by: David Hildenbrand
Reviewed-by: Zi Yan
Reviewed-by: Barry Song
Cc: Hugh Dickins
Cc: Matthew Wilcox
Cc: Ryan Roberts
Signed-off-by: Andrew Morton
---
 arch/alpha/include/asm/page.h   |  2 +-
 arch/arm64/mm/fault.c           |  2 +-
 arch/m68k/include/asm/page_no.h |  2 +-
 arch/s390/include/asm/page.h    |  2 +-
 arch/x86/include/asm/page.h     |  2 +-
 include/linux/gfp.h             |  6 +++---
 include/linux/highmem.h         |  2 +-
 mm/huge_memory.c                |  2 +-
 mm/ksm.c                        |  2 +-
 mm/memory.c                     | 10 ++++------
 mm/mempolicy.c                  |  3 +--
 mm/userfaultfd.c                |  2 +-
 12 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h
index 70419e6be1a3..3dffa2a461d7 100644
--- a/arch/alpha/include/asm/page.h
+++ b/arch/alpha/include/asm/page.h
@@ -18,7 +18,7 @@ extern void clear_page(void *page);
 #define clear_user_page(page, vaddr, pg)	clear_page(page)

 #define vma_alloc_zeroed_movable_folio(vma, vaddr) \
-	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
+	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)

 extern void copy_page(void * _to, void * _from);
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 8b281cf308b3..d95dca561f7a 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -983,7 +983,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
 	if (vma->vm_flags & VM_MTE)
 		flags |= __GFP_ZEROTAGS;

-	return vma_alloc_folio(flags, 0, vma, vaddr, false);
+	return vma_alloc_folio(flags, 0, vma, vaddr);
 }

 void tag_clear_highpage(struct page *page)
diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h
index af3a10973233..63c0e706084b 100644
--- a/arch/m68k/include/asm/page_no.h
+++ b/arch/m68k/include/asm/page_no.h
@@ -14,7 +14,7 @@ extern unsigned long memory_end;
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)

 #define vma_alloc_zeroed_movable_folio(vma, vaddr) \
-	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
+	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)

 #define __pa(vaddr)	((unsigned long)(vaddr))
 #define __va(paddr)	((void *)((unsigned long)(paddr)))

diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 73e1e03317b4..d02058f96bcf 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -74,7 +74,7 @@ static inline void copy_page(void *to, void *from)
 #define copy_user_page(to, from, vaddr, pg)	copy_page(to, from)

 #define vma_alloc_zeroed_movable_folio(vma, vaddr) \
-	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
+	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)

 /*
  * These are used to make use of C type-checking..

diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
index 1b93ff80b43b..c9fe207916f4 100644
--- a/arch/x86/include/asm/page.h
+++ b/arch/x86/include/asm/page.h
@@ -35,7 +35,7 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
 }

 #define vma_alloc_zeroed_movable_folio(vma, vaddr) \
-	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false)
+	vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr)

 #ifndef __pa
 #define __pa(x)	__phys_addr((unsigned long)(x))

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index a951de920e20..b65724c3427d 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -306,7 +306,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order);
 struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
 		struct mempolicy *mpol, pgoff_t ilx, int nid);
 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, bool hugepage);
+		unsigned long addr);
 #else
 static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order)
 {
@@ -326,7 +326,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde
 {
 	return folio_alloc_noprof(gfp, order);
 }
-#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \
+#define vma_alloc_folio_noprof(gfp, order, vma, addr) \
 	folio_alloc_noprof(gfp, order)
 #endif

@@ -341,7 +341,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde
 static inline struct page *alloc_page_vma_noprof(gfp_t gfp,
 		struct vm_area_struct *vma, unsigned long addr)
 {
-	struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false);
+	struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr);

 	return &folio->page;
 }

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 930a591b9b61..bec9bd715acf 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -226,7 +226,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
 {
 	struct folio *folio;

-	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr, false);
+	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr);
 	if (folio)
 		clear_user_highpage(&folio->page, vaddr);

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c674afd16245..387c046a389e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1143,7 +1143,7 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
 	const int order = HPAGE_PMD_ORDER;
 	struct folio *folio;

-	folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true);
+	folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK);

 	if (unlikely(!folio)) {
 		count_vm_event(THP_FAULT_FALLBACK);

diff --git a/mm/ksm.c b/mm/ksm.c
index 556b8a8f37d0..e596bc1b5fa7 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2971,7 +2971,7 @@ struct folio *ksm_might_need_to_copy(struct folio *folio,
 		if (!folio_test_uptodate(folio))
 			return folio;	/* let do_swap_page report the error */

-		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false);
+		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);
 		if (new_folio &&
 		    mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) {
 			folio_put(new_folio);

diff --git a/mm/memory.c b/mm/memory.c
index 5e9d6a22eb08..c51bc45a7009 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1059,8 +1059,7 @@ static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
 	if (need_zero)
 		new_folio = vma_alloc_zeroed_movable_folio(vma, addr);
 	else
-		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
-					    addr, false);
+		new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr);

 	if (!new_folio)
 		return NULL;
@@ -4017,8 +4016,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
 	struct folio *folio;
 	swp_entry_t entry;

-	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
-				vmf->address, false);
+	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address);
 	if (!folio)
 		return NULL;

@@ -4174,7 +4172,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf)
 	gfp = vma_thp_gfp_mask(vma);
 	while (orders) {
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
-		folio = vma_alloc_folio(gfp, order, vma, addr, true);
+		folio = vma_alloc_folio(gfp, order, vma, addr);
 		if (folio) {
 			if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
 							    gfp, entry))
@@ -4713,7 +4711,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 	gfp = vma_thp_gfp_mask(vma);
 	while (orders) {
 		addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
-		folio = vma_alloc_folio(gfp, order, vma, addr, true);
+		folio = vma_alloc_folio(gfp, order, vma, addr);
 		if (folio) {
 			if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
 				count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9e18a6fc3061..a29eff5d0585 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2290,7 +2290,6 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
  * @order: Order of the folio.
  * @vma: Pointer to VMA.
  * @addr: Virtual address of the allocation.  Must be inside @vma.
- * @hugepage: Unused (was: For hugepages try only preferred node if possible).
  *
  * Allocate a folio for a specific address in @vma, using the appropriate
  * NUMA policy.  The caller must hold the mmap_lock of the mm_struct of the
@@ -2301,7 +2300,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
  * Return: The folio on success or NULL if allocation fails.
  */
 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, bool hugepage)
+		unsigned long addr)
 {
 	struct mempolicy *pol;
 	pgoff_t ilx;

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 48b87c62fc3d..60a0be33766f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -251,7 +251,7 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
 	if (!*foliop) {
 		ret = -ENOMEM;
 		folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma,
-					dst_addr, false);
+					dst_addr);
 		if (!folio)
 			goto out;
--
2.50.1


From 0aa3ef3637920799f1b2f67dfff0d698127444ac Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Wed, 9 Oct 2024 17:35:50 -0700
Subject: [PATCH 03/16] memcg: add tracing for memcg stat updates

The memcg stats are maintained in the rstat infrastructure, which
provides a very fast update side and a reasonable read side. However,
memcg added a plethora of stats and made the read side, which is the
cgroup rstat flush, very slow. To solve that, a threshold was added on
the memcg stats read side, i.e., no need to flush the stats if updates
are within the threshold.

This threshold-based improvement worked for some time, but more stats
were added to memcg, and the read codepath was being triggered in
performance-sensitive paths, which made threshold-based ratelimiting
ineffective. We need more visibility into the hot and cold stats, i.e.,
stats with a lot of updates. Let's add tracing to get that visibility.

[shakeel.butt@linux.dev: use unsigned long type for memcg_rstat_events, per Yosry]
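Once the events are in place, they can be enabled from tracefs like any
other tracepoint group; a usage sketch (assuming tracefs is mounted at
/sys/kernel/tracing; the sample output line is made up, but its fields
follow the TP_printk format defined below):

    echo 1 > /sys/kernel/tracing/events/memcg/enable
    cat /sys/kernel/tracing/trace_pipe
    # e.g.:  mod_memcg_state: memcg_id=21 item=4 val=1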
Link: https://lkml.kernel.org/r/20241015213721.3804209-1-shakeel.butt@linux.dev
Link: https://lkml.kernel.org/r/20241010003550.3695245-1-shakeel.butt@linux.dev
Signed-off-by: Shakeel Butt
Acked-by: Roman Gushchin
Reviewed-by: Yosry Ahmed
Acked-by: Johannes Weiner
Reviewed-by: T.J. Mercier
Cc: Michal Hocko
Cc: Muchun Song
Cc: JP Kobryn
Cc: Steven Rostedt (Google)
Signed-off-by: Andrew Morton
---
 include/trace/events/memcg.h | 81 ++++++++++++++++++++++++++++++++++++
 mm/memcontrol.c              | 13 +++++-
 2 files changed, 92 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/memcg.h

diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h
new file mode 100644
index 000000000000..8667e57816d2
--- /dev/null
+++ b/include/trace/events/memcg.h
@@ -0,0 +1,81 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM memcg
+
+#if !defined(_TRACE_MEMCG_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MEMCG_H
+
+#include <linux/memcontrol.h>
+#include <linux/tracepoint.h>
+
+
+DECLARE_EVENT_CLASS(memcg_rstat_stats,
+
+	TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+
+	TP_ARGS(memcg, item, val),
+
+	TP_STRUCT__entry(
+		__field(u64, id)
+		__field(int, item)
+		__field(int, val)
+	),
+
+	TP_fast_assign(
+		__entry->id = cgroup_id(memcg->css.cgroup);
+		__entry->item = item;
+		__entry->val = val;
+	),
+
+	TP_printk("memcg_id=%llu item=%d val=%d",
+		  __entry->id, __entry->item, __entry->val)
+);
+
+DEFINE_EVENT(memcg_rstat_stats, mod_memcg_state,
+
+	TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+
+	TP_ARGS(memcg, item, val)
+);
+
+DEFINE_EVENT(memcg_rstat_stats, mod_memcg_lruvec_state,
+
+	TP_PROTO(struct mem_cgroup *memcg, int item, int val),
+
+	TP_ARGS(memcg, item, val)
+);
+
+DECLARE_EVENT_CLASS(memcg_rstat_events,
+
+	TP_PROTO(struct mem_cgroup *memcg, int item, unsigned long val),
+
+	TP_ARGS(memcg, item, val),
+
+	TP_STRUCT__entry(
+		__field(u64, id)
+		__field(int, item)
+		__field(unsigned long, val)
+	),
+
+	TP_fast_assign(
+		__entry->id = cgroup_id(memcg->css.cgroup);
+		__entry->item = item;
+		__entry->val = val;
+	),
+
+	TP_printk("memcg_id=%llu item=%d val=%lu",
+		  __entry->id, __entry->item, __entry->val)
+);
+
+DEFINE_EVENT(memcg_rstat_events, count_memcg_events,
+
+	TP_PROTO(struct mem_cgroup *memcg, int item, unsigned long val),
+
+	TP_ARGS(memcg, item, val)
+);
+
+
+#endif /* _TRACE_MEMCG_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d6159266185f..c93ecedf7a96 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -71,6 +71,10 @@
 #include
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/memcg.h>
+#undef CREATE_TRACE_POINTS
+
 #include
 
 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
@@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
 		return;

 	__this_cpu_add(memcg->vmstats_percpu->state[i], val);
-	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
+	val = memcg_state_val_in_pages(idx, val);
+	memcg_rstat_updated(memcg, val);
+	trace_mod_memcg_state(memcg, idx, val);
 }

 /* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -741,7 +747,9 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,

 	/* Update lruvec */
 	__this_cpu_add(pn->lruvec_stats_percpu->state[i], val);

-	memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
+	val = memcg_state_val_in_pages(idx, val);
+	memcg_rstat_updated(memcg, val);
+	trace_mod_memcg_lruvec_state(memcg, idx, val);
 	memcg_stats_unlock();
 }

@@ -832,6 +840,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	memcg_stats_lock();
 	__this_cpu_add(memcg->vmstats_percpu->events[i], count);
 	memcg_rstat_updated(memcg, count);
+	trace_count_memcg_events(memcg, idx, count);
 	memcg_stats_unlock();
 }
--
2.50.1


From 7e1fbaa0df1dfc7b820cd41be0b2c7535d3e983a Mon Sep 17 00:00:00 2001
From: suhua
Date: Sat, 12 Oct 2024 15:08:02 +0800
Subject: [PATCH 04/16] mm/hugetlb: perform vmemmap optimization in batches for specific node allocation

When HVO is enabled and huge page memory allocs are made, the freed
memory can be aggregated into higher-order memory in the following
paths, which facilitates further allocs for higher-order memory:

  echo 200000 > /proc/sys/vm/nr_hugepages
  echo 200000 > /sys/devices/system/node/node*/hugepages/hugepages-2048kB/nr_hugepages
  grub: default_hugepagesz=2M hugepagesz=2M hugepages=200000

There is currently no support for aggregating the freed memory into
higher order when allocating in the following way; the memory is
released at lower order instead:

  grub: default_hugepagesz=2M hugepagesz=2M hugepages=0:100000,1:100000

This patch adds support for releasing the memory freed by huge page
vmemmap optimization as higher-order aggregates in this case as well.
E.g.:

  cat /proc/cmdline
  BOOT_IMAGE=/boot/vmlinuz-xxx ... default_hugepagesz=2M hugepagesz=2M hugepages=0:100000,1:100000

Before:
Free pages count per migrate type at order      0     1     2     3     4     5     6     7     8     9    10
...
Node 0, zone Normal, type Unmovable        55282 97039 99307     0     1     1     0     1     1     1     0
Node 0, zone Normal, type Movable             25    11   345    87    48    21     2    20     9     3 75061
Node 0, zone Normal, type Reclaimable          4     2     2     4     3     0     2     1     1     1     0
Node 0, zone Normal, type HighAtomic           0     0     0     0     0     0     0     0     0     0     0
...
Free pages count per migrate type at order      0     1     2     3     4     5     6     7     8     9    10
Node 1, zone Normal, type Unmovable        98888 99650 99679     2     3     1     2     2     2     0     0
Node 1, zone Normal, type Movable              1     1     0     1     1     0     1     0     1     1 75937
Node 1, zone Normal, type Reclaimable          0     0     0     0     0     0     0     0     0     0     0
Node 1, zone Normal, type HighAtomic           0     0     0     0     0     0     0     0     0     0     0

After:
Free pages count per migrate type at order      0     1     2     3     4     5     6     7     8     9    10
...
Node 0, zone Normal, type Unmovable          152   158    37     2     2     0     3     4     2     6   717
Node 0, zone Normal, type Movable              1    37    53     3    55    49    16     6     2     1 75000
Node 0, zone Normal, type Reclaimable          1     4     3     1     2     1     1     1     1     1     0
Node 0, zone Normal, type HighAtomic           0     0     0     0     0     0     0     0     0     0     0
...
Free pages count per migrate type at order      0     1     2     3     4     5     6     7     8     9    10
Node 1, zone Normal, type Unmovable            5     3     2     1     3     4     2     2     2     0   779
Node 1, zone Normal, type Movable              1     0     1     1     1     0     1     0     1     1 75849
Node 1, zone Normal, type Reclaimable          0     0     0     0     0     0     0     0     0     0     0
Node 1, zone Normal, type HighAtomic           0     0     0     0     0     0     0     0     0     0     0

Link: https://lkml.kernel.org/r/20241012070802.1876-1-suhua1@kingsoft.com
Signed-off-by: suhua
Reviewed-by: Muchun Song
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 190fa05635f4..906294ac85dc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3301,6 +3301,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
 {
 	unsigned long i;
 	char buf[32];
+	LIST_HEAD(folio_list);

 	for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
 		if (hstate_is_gigantic(h)) {
@@ -3310,14 +3311,18 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
 			struct folio *folio;
 			gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;

-			folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
-					&node_states[N_MEMORY]);
+			folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
+					&node_states[N_MEMORY], NULL);
 			if (!folio)
 				break;
-			free_huge_folio(folio); /* free it into the hugepage allocator */
+			list_add(&folio->lru, &folio_list);
 		}
 		cond_resched();
 	}

+	if (!list_empty(&folio_list))
+		prep_and_add_allocated_folios(h, &folio_list);
+
 	if (i == h->max_huge_pages_node[nid])
 		return;
--
2.50.1


From f0c99037a0c6301ca8c3e41162dd0426b5d38abe Mon Sep 17 00:00:00 2001
From: Sidhartha Kumar
Date: Fri, 11 Oct 2024 17:44:51 -0400
Subject: [PATCH 05/16] maple_tree: refactor mas_wr_store_type()

In mas_wr_store_type(), we check if new_end < mt_slots[wr_mas->type].
If this check fails, we know that, after this, new_end is >=
mt_min_slots. Checking this again when we detect a wr_node_store later
in the function is redundant. Because this check is part of an OR
statement, the statement will always evaluate to true, so we can just
get rid of it.

We also refactor mas_wr_store_type() to return the store type rather
than set it directly, as this greatly cleans up the function.

Link: https://lkml.kernel.org/r/20241011214451.7286-2-sidhartha.kumar@oracle.com
Signed-off-by: Sidhartha Kumar
Suggested-by: Liam Howlett
Suggested-by: Wei Yang
Reviewed-by: Wei Yang
Reviewed-by: Liam Howlett
Cc: Matthew Wilcox
Signed-off-by: Andrew Morton
---
 lib/maple_tree.c | 72 +++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 47 deletions(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index b3b1d4b8126b..a5e982e482dd 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -4191,24 +4191,22 @@ static inline int mas_prealloc_calc(struct ma_state *mas, void *entry)
 }

 /*
- * mas_wr_store_type() - Set the store type for a given
+ * mas_wr_store_type() - Determine the store type for a given
  * store operation.
  * @wr_mas: The maple write state
+ *
+ * Return: the type of store needed for the operation
  */
-static inline void mas_wr_store_type(struct ma_wr_state *wr_mas)
+static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas)
 {
 	struct ma_state *mas = wr_mas->mas;
 	unsigned char new_end;

-	if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) {
-		mas->store_type = wr_store_root;
-		return;
-	}
+	if (unlikely(mas_is_none(mas) || mas_is_ptr(mas)))
+		return wr_store_root;

-	if (unlikely(!mas_wr_walk(wr_mas))) {
-		mas->store_type = wr_spanning_store;
-		return;
-	}
+	if (unlikely(!mas_wr_walk(wr_mas)))
+		return wr_spanning_store;

 	/* At this point, we are at the leaf node that needs to be altered. */
 	mas_wr_end_piv(wr_mas);
@@ -4216,50 +4214,30 @@ static inline void mas_wr_store_type(struct ma_wr_state *wr_mas)
 		mas_wr_extend_null(wr_mas);

 	new_end = mas_wr_new_end(wr_mas);
-	if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) {
-		mas->store_type = wr_exact_fit;
-		return;
-	}
+	if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last))
+		return wr_exact_fit;

-	if (unlikely(!mas->index && mas->last == ULONG_MAX)) {
-		mas->store_type = wr_new_root;
-		return;
-	}
+	if (unlikely(!mas->index && mas->last == ULONG_MAX))
+		return wr_new_root;

 	/* Potential spanning rebalance collapsing a node */
 	if (new_end < mt_min_slots[wr_mas->type]) {
-		if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) {
-			mas->store_type = wr_rebalance;
-			return;
-		}
-		mas->store_type = wr_node_store;
-		return;
+		if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK))
+			return wr_rebalance;
+		return wr_node_store;
 	}

-	if (new_end >= mt_slots[wr_mas->type]) {
-		mas->store_type = wr_split_store;
-		return;
-	}
+	if (new_end >= mt_slots[wr_mas->type])
+		return wr_split_store;

-	if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) {
-		mas->store_type = wr_append;
-		return;
-	}
+	if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end))
+		return wr_append;

 	if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) ||
-		(wr_mas->offset_end - mas->offset == 1))) {
-		mas->store_type = wr_slot_store;
-		return;
-	}
-
-	if (mte_is_root(mas->node) || (new_end >= mt_min_slots[wr_mas->type]) ||
-		(mas->mas_flags & MA_STATE_BULK)) {
-		mas->store_type = wr_node_store;
-		return;
-	}
+		(wr_mas->offset_end - mas->offset == 1)))
+		return wr_slot_store;

-	mas->store_type = wr_invalid;
-	MAS_WARN_ON(mas, 1);
+	return wr_node_store;
 }

 /**
@@ -4274,7 +4252,7 @@ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry)
 	int request;

 	mas_wr_prealloc_setup(wr_mas);
-	mas_wr_store_type(wr_mas);
+	mas->store_type = mas_wr_store_type(wr_mas);
 	request = mas_prealloc_calc(mas, entry);
 	if (!request)
 		return;
@@ -5446,7 +5424,7 @@ void *mas_store(struct ma_state *mas, void *entry)
 	 * overwrite multiple entries within a self-balancing B-Tree.
 	 */
 	mas_wr_prealloc_setup(&wr_mas);
-	mas_wr_store_type(&wr_mas);
+	mas->store_type = mas_wr_store_type(&wr_mas);
 	if (mas->mas_flags & MA_STATE_PREALLOC) {
 		mas_wr_store_entry(&wr_mas);
 		MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas));
@@ -5549,7 +5527,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp)
 	int request;

 	mas_wr_prealloc_setup(&wr_mas);
-	mas_wr_store_type(&wr_mas);
+	mas->store_type = mas_wr_store_type(&wr_mas);
 	request = mas_prealloc_calc(mas, entry);
 	if (!request)
 		return ret;
--
2.50.1


From 773ee2cda50c46e582a8ee2f8f00a5c8ac2923a7 Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Sat, 12 Oct 2024 01:19:50 +0800
Subject: [PATCH 06/16] mm/zswap: avoid touching XArray for unnecessary invalidation

zswap_invalidate() simply calls xa_erase(), which acquires the XArray
lock first and then does a lookup. This has a higher overhead even if
zswap is not used or the tree is empty.

So instead, do a very lightweight xa_empty() check first; if there is
nothing to erase, don't touch the lock or the tree.

Using xa_empty() rather than zswap_never_enabled() is more helpful, as
it covers both the case where zswap was never used and the case where
the particular range doesn't have any zswap entry. And it's safe, as
the swap slot should be currently pinned by the caller with HAS_CACHE.

Sequential swap in/out tests with zswap disabled showed a minor
performance gain, and swapin of a zero page with zswap enabled also
showed a performance gain (swapout is basically unchanged, so only one
case is tested):

Swapout of 2G zero page using brd as SWAP, zswap disabled
(total time, 4 test runs, +0.1%):
Before: 1705013 us 1703119 us 1704335 us 1705848 us.
After:  1703579 us 1710640 us 1703625 us 1708699 us.

Swapin of 2G zero page using brd as SWAP, zswap disabled
(total time, 4 test runs, -3.5%):
Before: 1912312 us 1915692 us 1905837 us 1912706 us.
After:  1845354 us 1849691 us 1845868 us 1841828 us.

Swapin of 2G zero page using brd as SWAP, zswap enabled
(total time, 4 test runs, -3.3%):
Before: 1897994 us 1894681 us 1899982 us 1898333 us
After:  1835894 us 1834113 us 1832047 us 1833125 us

Swapin of 2G random page using brd as SWAP, zswap enabled
(total time, 4 test runs, -0.1%):
Before: 4519747 us 4431078 us 4430185 us 4439999 us
After:  4492176 us 4437796 us 4434612 us 4434289 us

And the performance is very slightly better or unchanged for the build
kernel test with zswap enabled or disabled:

Build Linux kernel with defconfig and -j32 in 1G memory cgroup,
using brd as SWAP, zswap disabled (sys time in seconds, 6 test runs, -0.1%):
Before: 1648.83 1653.52 1666.34 1665.95 1663.06 1656.67
After:  1651.36 1661.89 1645.70 1657.45 1662.07 1652.83

Build Linux kernel with defconfig and -j32 in 2G memory cgroup,
using brd as SWAP, zswap enabled (sys time in seconds, 6 test runs, -0.3%):
Before: 1240.25 1254.06 1246.77 1265.92 1244.23 1227.74
After:  1226.41 1218.21 1249.12 1249.13 1244.39 1233.01
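The fast path is cheap because xa_empty() is a plain, lock-free pointer
test; as of this writing it boils down to the following (from
include/linux/xarray.h):

	static inline bool xa_empty(const struct xarray *xa)
	{
		return xa->xa_head == NULL;	/* no lock, no lookup */
	}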
Link: https://lkml.kernel.org/r/20241011171950.62684-1-ryncsn@gmail.com
Signed-off-by: Kairui Song
Acked-by: Yosry Ahmed
Cc: Barry Song
Cc: Chengming Zhou
Cc: Chris Li
Cc: Johannes Weiner
Cc: Nhat Pham
Signed-off-by: Andrew Morton
---
 mm/zswap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/zswap.c b/mm/zswap.c
index 162013952074..a9f4fa121eb2 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1594,6 +1594,9 @@ void zswap_invalidate(swp_entry_t swp)
 	struct xarray *tree = swap_zswap_tree(swp);
 	struct zswap_entry *entry;

+	if (xa_empty(tree))
+		return;
+
 	entry = xa_erase(tree, offset);
 	if (entry)
 		zswap_entry_free(entry);
--
2.50.1


From 5708d96da20b99b4665ad72395e3727016057f70 Mon Sep 17 00:00:00 2001
From: Zi Yan
Date: Fri, 11 Oct 2024 11:03:04 -0400
Subject: [PATCH 07/16] mm: avoid zeroing user movable page twice with init_on_alloc=1

Commit 6471384af2a6 ("mm: security: introduce init_on_alloc=1 and
init_on_free=1 boot options") forces the allocated page to be zeroed in
post_alloc_hook() when init_on_alloc=1.

For order-0 folios, if the arch does not define
vma_alloc_zeroed_movable_folio(), the default implementation again
zeros the page returned from the buddy allocator, so the page is zeroed
twice. Fix it by passing __GFP_ZERO instead to avoid the double page
zeroing. At the moment, s390, arm64, x86, alpha and m68k are not
impacted, since they define their own vma_alloc_zeroed_movable_folio().

For order>0 folios (mTHP and PMD THP), folio_zero_user() is called to
zero the folio again. Fix it by calling folio_zero_user() only if
init_on_alloc is set. All arches are impacted.

Add an alloc_zeroed() helper to encapsulate the init_on_alloc check.
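Condensed, the pattern the diff below introduces looks like this (a
restatement of the hunks that follow, not additional code):

	/* mm/internal.h: true when init_on_alloc already zeroes allocations */
	static inline bool alloc_zeroed(void)
	{
		return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
					   &init_on_alloc);
	}

	/* callers skip the second clear when the buddy allocator did it */
	if (!alloc_zeroed())
		folio_zero_user(folio, addr);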
[ziy@nvidia.com: comment fixes, per David]
Link: https://lkml.kernel.org/r/97DB52E1-C594-49B5-9736-89AC302FAB01@nvidia.com
Link: https://lkml.kernel.org/r/20241011150304.709590-1-ziy@nvidia.com
Signed-off-by: Zi Yan
Acked-by: Vlastimil Babka
Acked-by: David Hildenbrand
Cc: Alexander Potapenko
Cc: "Huang, Ying"
Cc: John Hubbard
Cc: Kees Cook
Cc: Kefeng Wang
Cc: Matthew Wilcox
Cc: Miaohe Lin
Cc: Ryan Roberts
Signed-off-by: Andrew Morton
---
 include/linux/highmem.h |  8 +-------
 mm/huge_memory.c        |  8 +++++++-
 mm/internal.h           |  6 ++++++
 mm/memory.c             | 10 +++++++++-
 4 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index bec9bd715acf..6e452bd8e7e3 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -224,13 +224,7 @@ static inline
 struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
 				   unsigned long vaddr)
 {
-	struct folio *folio;
-
-	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr);
-	if (folio)
-		clear_user_highpage(&folio->page, vaddr);
-
-	return folio;
+	return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr);
 }
 #endif

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 387c046a389e..73194aa0544c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1162,7 +1162,13 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
 	}
 	folio_throttle_swaprate(folio, gfp);

-	folio_zero_user(folio, addr);
+	/*
+	 * When a folio is not zeroed during allocation (__GFP_ZERO not used),
+	 * folio_zero_user() is used to make sure that the page corresponding
+	 * to the faulting address will be hot in the cache after zeroing.
+	 */
+	if (!alloc_zeroed())
+		folio_zero_user(folio, addr);
 	/*
 	 * The memory barrier inside __folio_mark_uptodate makes sure that
 	 * folio_zero_user writes become visible before the set_pmd_at()

diff --git a/mm/internal.h b/mm/internal.h
index fc2f523258a3..c743c2b21dba 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1276,6 +1276,12 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr,
 void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 	       pmd_t *pmd, bool write);

+static inline bool alloc_zeroed(void)
+{
+	return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON,
+			&init_on_alloc);
+}
+
 enum {
 	/* mark page accessed */
 	FOLL_TOUCH = 1 << 16,

diff --git a/mm/memory.c b/mm/memory.c
index c51bc45a7009..68e57b33363b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4719,7 +4719,15 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 				goto next;
 			}
 			folio_throttle_swaprate(folio, gfp);
-			folio_zero_user(folio, vmf->address);
+			/*
+			 * When a folio is not zeroed during allocation
+			 * (__GFP_ZERO not used), folio_zero_user() is used
+			 * to make sure that the page corresponding to the
+			 * faulting address will be hot in the cache after
+			 * zeroing.
+			 */
+			if (!alloc_zeroed())
+				folio_zero_user(folio, vmf->address);
 			return folio;
 		}
 next:
--
2.50.1


From 1f2d03cc535138b7cdbed0122cdc0f9e9626c6bf Mon Sep 17 00:00:00 2001
From: Jaewon Kim
Date: Fri, 11 Oct 2024 21:49:28 +0900
Subject: [PATCH 08/16] vmscan: add a vmscan event for reclaim_pages

reclaim_folio_list() uses a dummy reclaim_stat, which is never
consumed. To expose that memory stat, add a new trace event. This is
useful for seeing how many pages were not reclaimed, and why.
This is an example:

mm_vmscan_reclaim_pages: nid=0 nr_scanned=112 nr_reclaimed=112 nr_dirty=0 nr_writeback=0 nr_congested=0 nr_immediate=0 nr_activate_anon=0 nr_activate_file=0 nr_ref_keep=0 nr_unmap_fail=0

Currently, reclaim_folio_list() is only called by reclaim_pages(), and
reclaim_pages() is used by DAMON and madvise. In the latest Android,
reclaim_pages() is also used by shmem to reclaim all pages in an
address_space.

[jaewon31.kim@samsung.com: use sc.nr_scanned rather than new counting]
Link: https://lkml.kernel.org/r/20241016143227.961162-1-jaewon31.kim@samsung.com
Link: https://lkml.kernel.org/r/20241011124928.1224813-1-jaewon31.kim@samsung.com
Signed-off-by: Jaewon Kim
Acked-by: Vlastimil Babka
Cc: Jaewon Kim
Cc: Kalesh Singh
Cc: Minchan Kim
Cc: SeongJae Park
Signed-off-by: Andrew Morton
---
 include/trace/events/vmscan.h | 45 +++++++++++++++++++++++++++++++++++
 mm/vmscan.c                   |  5 ++--
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h
index 1a488c30afa5..490958fa10de 100644
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -346,6 +346,51 @@ TRACE_EVENT(mm_vmscan_write_folio,
 		show_reclaim_flags(__entry->reclaim_flags))
 );

+TRACE_EVENT(mm_vmscan_reclaim_pages,
+
+	TP_PROTO(int nid,
+		unsigned long nr_scanned, unsigned long nr_reclaimed,
+		struct reclaim_stat *stat),
+
+	TP_ARGS(nid, nr_scanned, nr_reclaimed, stat),
+
+	TP_STRUCT__entry(
+		__field(int, nid)
+		__field(unsigned long, nr_scanned)
+		__field(unsigned long, nr_reclaimed)
+		__field(unsigned long, nr_dirty)
+		__field(unsigned long, nr_writeback)
+		__field(unsigned long, nr_congested)
+		__field(unsigned long, nr_immediate)
+		__field(unsigned int, nr_activate0)
+		__field(unsigned int, nr_activate1)
+		__field(unsigned long, nr_ref_keep)
+		__field(unsigned long, nr_unmap_fail)
+	),
+
+	TP_fast_assign(
+		__entry->nid = nid;
+		__entry->nr_scanned = nr_scanned;
+		__entry->nr_reclaimed = nr_reclaimed;
+		__entry->nr_dirty = stat->nr_dirty;
+		__entry->nr_writeback = stat->nr_writeback;
+		__entry->nr_congested = stat->nr_congested;
+		__entry->nr_immediate = stat->nr_immediate;
+		__entry->nr_activate0 = stat->nr_activate[0];
+		__entry->nr_activate1 = stat->nr_activate[1];
+		__entry->nr_ref_keep = stat->nr_ref_keep;
+		__entry->nr_unmap_fail = stat->nr_unmap_fail;
+	),
+
+	TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld",
+		__entry->nid,
+		__entry->nr_scanned, __entry->nr_reclaimed,
+		__entry->nr_dirty, __entry->nr_writeback,
+		__entry->nr_congested, __entry->nr_immediate,
+		__entry->nr_activate0, __entry->nr_activate1,
+		__entry->nr_ref_keep, __entry->nr_unmap_fail)
+);
+
 TRACE_EVENT(mm_vmscan_lru_shrink_inactive,

 	TP_PROTO(int nid,

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6a3c498383fa..5bec29914f12 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2129,7 +2129,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 static unsigned int reclaim_folio_list(struct list_head *folio_list,
 				      struct pglist_data *pgdat)
 {
-	struct reclaim_stat dummy_stat;
+	struct reclaim_stat stat;
 	unsigned int nr_reclaimed;
 	struct folio *folio;
 	struct scan_control sc = {
@@ -2140,12 +2140,13 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
 		.no_demotion = 1,
 	};

-	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true);
+	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, true);
 	while (!list_empty(folio_list)) {
 		folio = lru_to_folio(folio_list);
 		list_del(&folio->lru);
 		folio_putback_lru(folio);
 	}
+	trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat);

 	return nr_reclaimed;
 }
--
2.50.1


From f69c2e4dc6840cf93a3370853966657aca9f13c6 Mon Sep 17 00:00:00 2001
From: Saurabh Sengar
Date: Sun, 11 Aug 2024 23:13:40 -0700
Subject: [PATCH 09/16] mm/vmstat: defer the refresh_zone_stat_thresholds after all CPUs bringup

The refresh_zone_stat_thresholds() function has two loops, which are
expensive for higher numbers of CPUs and NUMA nodes. Below is a rough
estimation of the total iterations done by these loops, based on the
number of NUMA nodes and CPUs:

Total number of iterations: nCPU * 2 * Numa * mCPU

Where:
nCPU = total number of CPUs
Numa = total number of NUMA nodes
mCPU = mean value of total CPUs (e.g., 512 for 1024 total CPUs)

For the system under test with 16 NUMA nodes and 1024 CPUs, this
results in a substantial increase in the number of loop iterations
during boot-up when NUMA is enabled:

No NUMA = 1024*2*1*512  =  1,048,576 : Here refresh_zone_stat_thresholds
takes around 224 ms total for all the CPUs in the system under test.
16 NUMA = 1024*2*16*512 = 16,777,216 : Here refresh_zone_stat_thresholds
takes around 4.5 seconds total for all the CPUs in the system under test.

Calling this for each CPU is expensive when there is a large number of
CPUs along with multiple NUMA nodes. Fix this by deferring
refresh_zone_stat_thresholds() until all the secondary CPUs are up.
Also, register the DYN hooks to keep the existing hotplug functionality
intact.

Link: https://lkml.kernel.org/r/1723443220-20623-1-git-send-email-ssengar@linux.microsoft.com
Signed-off-by: Saurabh Sengar
Acked-by: Christoph Lameter
Reviewed-by: Srivatsa S. Bhat (Microsoft)
Cc: Saurabh Singh Sengar
Cc: Wei Liu
Cc: Mel Gorman
Cc: Anshuman Khandual
Signed-off-by: Andrew Morton
---
 mm/vmstat.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1917c034c045..7b62bfb19afa 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1933,6 +1933,7 @@ static const struct seq_operations vmstat_op = {
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
+static int vmstat_late_init_done;

 #ifdef CONFIG_PROC_FS
 static void refresh_vm_stats(struct work_struct *work)
@@ -2135,7 +2136,8 @@ static void __init init_cpu_node_state(void)

 static int vmstat_cpu_online(unsigned int cpu)
 {
-	refresh_zone_stat_thresholds();
+	if (vmstat_late_init_done)
+		refresh_zone_stat_thresholds();

 	if (!node_state(cpu_to_node(cpu), N_CPU)) {
 		node_set_state(cpu_to_node(cpu), N_CPU);
@@ -2167,6 +2169,14 @@ static int vmstat_cpu_dead(unsigned int cpu)
 	return 0;
 }

+static int __init vmstat_late_init(void)
+{
+	refresh_zone_stat_thresholds();
+	vmstat_late_init_done = 1;
+
+	return 0;
+}
+late_initcall(vmstat_late_init);
 #endif

 struct workqueue_struct *mm_percpu_wq;
--
2.50.1


From 5b2100f723bd5c2b5552b27208f3e7d7447910d3 Mon Sep 17 00:00:00 2001
From: Jiazi Li
Date: Wed, 26 Jun 2024 12:06:30 -0400
Subject: [PATCH 10/16] maple_tree: fix alloc node fail issue

In the following code, the second call to mas_node_count() will return
-ENOMEM:

	mas_node_count(mas, MAPLE_ALLOC_SLOTS + 1);
	mas_node_count(mas, MAPLE_ALLOC_SLOTS * 2 + 2);

This is because there may be a full maple_alloc node in the current
maple state. Using a full maple_alloc node makes max_req equal to 0,
which in turn makes mt_alloc_bulk() return 0. As a result,
mas_node_count() sets mas.node to MA_ERROR(-ENOMEM).
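A sketch of the failing sequence, in the style of the maple tree test
suite (illustrative only; the test added in patch 11 below exercises
exactly this pattern):

	mas.node = MA_ERROR(-ENOMEM);
	mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1);	/* request */
	mas_nomem(&mas, GFP_KERNEL);		/* fill: head node becomes full */

	mas.node = MA_ERROR(-ENOMEM);
	mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 2);
	/* pre-fix: the walk lands on the full head node, max_req becomes 0,
	 * mt_alloc_bulk() returns 0 and mas ends up at MA_ERROR(-ENOMEM) */
	mas_nomem(&mas, GFP_KERNEL);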
Fix this by finding a non-full maple_alloc node and, if necessary,
using that node in the next while loop.

Link: https://lkml.kernel.org/r/20240626160631.3636515-1-Liam.Howlett@oracle.com
Fixes: 54a611b60590 ("Maple Tree: add new data structure")
Signed-off-by: Jiazi Li
Signed-off-by: Liam R. Howlett
Suggested-by: Liam R. Howlett
Reviewed-by: Wei Yang
Signed-off-by: Andrew Morton
---
 lib/maple_tree.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index a5e982e482dd..cdac15168405 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -1285,7 +1285,10 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)

 		node->node_count += count;
 		allocated += count;
-		node = node->slot[0];
+		/* find a non-full node */
+		do {
+			node = node->slot[0];
+		} while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS));
 		requested -= count;
 	}
 	mas->alloc->total = allocated;
--
2.50.1


From 0f85eb3395c74d7cc823169bbacc670c6645ae80 Mon Sep 17 00:00:00 2001
From: Jiazi Li
Date: Wed, 26 Jun 2024 12:06:31 -0400
Subject: [PATCH 11/16] maple_tree: add some alloc node test cases

Add some maple_tree alloc node test cases.

Link: https://lkml.kernel.org/r/20240626160631.3636515-2-Liam.Howlett@oracle.com
Signed-off-by: Jiazi Li
Signed-off-by: Liam R. Howlett
Suggested-by: Liam R. Howlett
Cc: Wei Yang
Signed-off-by: Andrew Morton
---
 tools/testing/radix-tree/maple.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c
index 551ae6898c1d..bc30050227fd 100644
--- a/tools/testing/radix-tree/maple.c
+++ b/tools/testing/radix-tree/maple.c
@@ -462,6 +462,28 @@ static noinline void __init check_new_node(struct maple_tree *mt)
 	MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1);
 	mas_destroy(&mas);

+	mas.node = MA_ERROR(-ENOMEM);
+	mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */
+	mas_nomem(&mas, GFP_KERNEL); /* Fill request */
+	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1);
+	mas.node = MA_ERROR(-ENOMEM);
+	mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 2); /* Request */
+	mas_nomem(&mas, GFP_KERNEL); /* Fill request */
+	mas.status = ma_start;
+	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 2);
+	mas_destroy(&mas);
+
+	mas.node = MA_ERROR(-ENOMEM);
+	mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 1); /* Request */
+	mas_nomem(&mas, GFP_KERNEL); /* Fill request */
+	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 1);
+	mas.node = MA_ERROR(-ENOMEM);
+	mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 3 + 2); /* Request */
+	mas_nomem(&mas, GFP_KERNEL); /* Fill request */
+	mas.status = ma_start;
+	MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 3 + 2);
+	mas_destroy(&mas);
+
 	mtree_unlock(mt);
 }
--
2.50.1


From 0cc8d68abe2fdcb7039ece95f784698c0b0dc51e Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Fri, 13 Sep 2024 06:31:28 +0000
Subject: [PATCH 12/16] maple_tree: root node could be handled by !p_slot too

For a root node, mte_parent_slot() returns 0, which exactly fits the
following !p_slot check. So we can remove the special handling for the
root node.

Link: https://lkml.kernel.org/r/20240913063128.27391-1-richard.weiyang@gmail.com
Signed-off-by: Wei Yang
Reviewed-by: Liam R. Howlett
Signed-off-by: Andrew Morton
---
 lib/maple_tree.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index cdac15168405..c2d3c8d27358 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -2155,9 +2155,7 @@ static inline bool mas_prev_sibling(struct ma_state *mas)
 {
 	unsigned int p_slot = mte_parent_slot(mas->node);

-	if (mte_is_root(mas->node))
-		return false;
-
+	/* For root node, p_slot is set to 0 by mte_parent_slot(). */
 	if (!p_slot)
 		return false;
--
2.50.1


From e852cb1d00ceb4b0156832c13ba3daf7ed93ac17 Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Tue, 15 Oct 2024 12:07:44 +0000
Subject: [PATCH 13/16] maple_tree: clear request_count for newly allocated one

Patch series "maple_tree: simplify mas_push_node()", v2.

When count is not 0, we know head is valid. So we can put the
assignment inside if (count) instead of checking the head pointer
again. Also, count represents the current total, so we can assign the
new total by increasing count by one.

This patch (of 3):

If this is not a newly allocated one, the request_count has already
been cleared in mas_set_alloc_req().

Link: https://lkml.kernel.org/r/20241015120746.15850-1-richard.weiyang@gmail.com
Link: https://lkml.kernel.org/r/20241015120746.15850-2-richard.weiyang@gmail.com
Signed-off-by: Wei Yang
Reviewed-by: Liam R. Howlett
Cc: Sidhartha Kumar
Cc: Lorenzo Stoakes
Signed-off-by: Andrew Morton
---
 lib/maple_tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index c2d3c8d27358..ee7922b19f0a 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -1265,11 +1265,11 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)

 		mas->alloc = node;
 		node->total = ++allocated;
+		node->request_count = 0;
 		requested--;
 	}

 	node = mas->alloc;
-	node->request_count = 0;
 	while (requested) {
 		max_req = MAPLE_ALLOC_SLOTS - node->node_count;
 		slots = (void **)&node->slot[node->node_count];
--
2.50.1


From 4223dd93bfc976debededffc0b03cc63d9b73d14 Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Tue, 15 Oct 2024 12:07:45 +0000
Subject: [PATCH 14/16] maple_tree: total is not changed for nomem_one case

If it jumps to nomem_one, the total allocated number is not changed, so
we don't need to adjust it. For the nomem_bulk case, we know there is a
valid mas->alloc, so we don't need to do the check.

Link: https://lkml.kernel.org/r/20241015120746.15850-3-richard.weiyang@gmail.com
Signed-off-by: Wei Yang
Reviewed-by: Liam R. Howlett
Cc: Sidhartha Kumar
Cc: Lorenzo Stoakes
Signed-off-by: Andrew Morton
---
 lib/maple_tree.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index ee7922b19f0a..1b80201865d8 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -1297,10 +1297,9 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp)
 nomem_bulk:
 	/* Clean up potential freed allocations on bulk failure */
 	memset(slots, 0, max_req * sizeof(unsigned long));
+	mas->alloc->total = allocated;
 nomem_one:
 	mas_set_alloc_req(mas, requested);
-	if (mas->alloc && !(((unsigned long)mas->alloc & 0x1)))
-		mas->alloc->total = allocated;
 	mas_set_err(mas, -ENOMEM);
 }
--
2.50.1


From 908378a30b0972e5bf8fae3cf38affc162fe8e3b Mon Sep 17 00:00:00 2001
From: Wei Yang
Date: Tue, 15 Oct 2024 12:07:46 +0000
Subject: [PATCH 15/16] maple_tree: simplify mas_push_node()

When count is not 0, we know head is valid. So we can put the
assignment inside if (count) instead of checking the head pointer
again.
Also, count represents the current total, so we can assign the new
total by increasing count by one.

Link: https://lkml.kernel.org/r/20241015120746.15850-4-richard.weiyang@gmail.com
Signed-off-by: Wei Yang
Reviewed-by: Liam R. Howlett
Cc: Sidhartha Kumar
Cc: Lorenzo Stoakes
Signed-off-by: Andrew Morton
---
 lib/maple_tree.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/lib/maple_tree.c b/lib/maple_tree.c
index 1b80201865d8..667120a44570 100644
--- a/lib/maple_tree.c
+++ b/lib/maple_tree.c
@@ -1207,19 +1207,17 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used)

 	reuse->request_count = 0;
 	reuse->node_count = 0;
-	if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) {
-		head->slot[head->node_count++] = reuse;
-		head->total++;
-		goto done;
-	}
-
-	reuse->total = 1;
-	if ((head) && !((unsigned long)head & 0x1)) {
+	if (count) {
+		if (head->node_count < MAPLE_ALLOC_SLOTS) {
+			head->slot[head->node_count++] = reuse;
+			head->total++;
+			goto done;
+		}
 		reuse->slot[0] = head;
 		reuse->node_count = 1;
-		reuse->total += head->total;
 	}

+	reuse->total = count + 1;
 	mas->alloc = reuse;
 done:
 	if (requested > 1)
--
2.50.1


From e4137f08816bbf91fe76d1b60fa16862a4827ac1 Mon Sep 17 00:00:00 2001
From: Sabyrzhan Tasbolatov
Date: Fri, 11 Oct 2024 08:53:10 +0500
Subject: [PATCH 16/16] mm, kasan, kmsan: instrument copy_from/to_kernel_nofault

Instrument copy_from_kernel_nofault() with KMSAN to check for
uninitialized kernel memory, and copy_to_kernel_nofault() with KASAN
and KCSAN to detect memory corruption.

syzbot reported that the bpf_probe_read_kernel() kernel helper
triggered a KASAN report via kasan_check_range(), which is not the
expected behaviour, as copy_from_kernel_nofault() is meant to be a
non-faulting helper.

The solution, suggested by Marco Elver, is to replace the KASAN and
KCSAN checks in copy_from_kernel_nofault() with KMSAN detection of
copying uninitialized kernel memory. In copy_to_kernel_nofault() we can
retain instrument_write() explicitly for the memory corruption
instrumentation.

copy_to_kernel_nofault() is tested on x86_64 and arm64 with
CONFIG_KASAN_SW_TAGS. On arm64 with CONFIG_KASAN_HW_TAGS, the KUnit
test currently fails; this needs more clarification.
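In short, the two helpers end up with asymmetric instrumentation; a
usage sketch (illustrative only, not taken from the patch --
unsafe_src and unsafe_dst are hypothetical pointers):

	char buf[8];

	/* Non-faulting read: must not trip KASAN/KCSAN on a possibly wild
	 * source address; KMSAN instead reports if uninitialized kernel
	 * bytes are copied out. */
	if (copy_from_kernel_nofault(buf, unsafe_src, sizeof(buf)))
		return -EFAULT;

	/* Non-faulting write: instrument_write() lets KASAN/KCSAN catch a
	 * corrupted destination. */
	if (copy_to_kernel_nofault(unsafe_dst, buf, sizeof(buf)))
		return -EFAULT;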
[akpm@linux-foundation.org: fix comment layout, per checkpatch]
Link: https://lore.kernel.org/linux-mm/CANpmjNMAVFzqnCZhEity9cjiqQ9CVN1X7qeeeAp_6yKjwKo8iw@mail.gmail.com/
Link: https://lkml.kernel.org/r/20241011035310.2982017-1-snovitoll@gmail.com
Signed-off-by: Sabyrzhan Tasbolatov
Reviewed-by: Marco Elver
Reported-by: syzbot+61123a5daeb9f7454599@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=61123a5daeb9f7454599
Reported-by: Andrey Konovalov
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=210505
Reviewed-by: Andrey Konovalov [KASAN]
Tested-by: Andrey Konovalov [KASAN]
Cc: Alexander Potapenko
Cc: Andrey Ryabinin
Cc: Dmitry Vyukov
Cc: Vincenzo Frascino
Signed-off-by: Andrew Morton
---
 mm/kasan/kasan_test_c.c | 36 ++++++++++++++++++++++++++++++++++++
 mm/kmsan/kmsan_test.c   | 17 +++++++++++++++++
 mm/maccess.c            | 10 ++++++++--
 3 files changed, 61 insertions(+), 2 deletions(-)

diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index d8fb281e439d..fe132ce3c2b3 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -1928,6 +1928,41 @@ static void rust_uaf(struct kunit *test)
 	KUNIT_EXPECT_KASAN_FAIL(test, kasan_test_rust_uaf());
 }

+static void copy_to_kernel_nofault_oob(struct kunit *test)
+{
+	char *ptr;
+	char buf[128];
+	size_t size = sizeof(buf);
+
+	/*
+	 * This test currently fails with the HW_TAGS mode. The reason is
+	 * unknown and needs to be investigated.
+	 */
+	KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS);
+
+	ptr = kmalloc(size - KASAN_GRANULE_SIZE, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+	OPTIMIZER_HIDE_VAR(ptr);
+
+	/*
+	 * We test copy_to_kernel_nofault() to detect corrupted memory that is
+	 * being written into the kernel. In contrast,
+	 * copy_from_kernel_nofault() is primarily used in kernel helper
+	 * functions where the source address might be random or uninitialized.
+	 * Applying KASAN instrumentation to copy_from_kernel_nofault() could
+	 * lead to false positives. By focusing KASAN checks only on
+	 * copy_to_kernel_nofault(), we ensure that only valid memory is
+	 * written to the kernel, minimizing the risk of kernel corruption
+	 * while avoiding false positives in the reverse case.
+	 */
+	KUNIT_EXPECT_KASAN_FAIL(test,
+		copy_to_kernel_nofault(&buf[0], ptr, size));
+	KUNIT_EXPECT_KASAN_FAIL(test,
+		copy_to_kernel_nofault(ptr, &buf[0], size));
+
+	kfree(ptr);
+}
+
 static struct kunit_case kasan_kunit_test_cases[] = {
 	KUNIT_CASE(kmalloc_oob_right),
 	KUNIT_CASE(kmalloc_oob_left),
@@ -2000,6 +2035,7 @@ static struct kunit_case kasan_kunit_test_cases[] = {
 	KUNIT_CASE(match_all_not_assigned),
 	KUNIT_CASE(match_all_ptr_tag),
 	KUNIT_CASE(match_all_mem_tag),
+	KUNIT_CASE(copy_to_kernel_nofault_oob),
 	KUNIT_CASE(rust_uaf),
 	{}
 };

diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c
index 13236d579eba..9733a22c46c1 100644
--- a/mm/kmsan/kmsan_test.c
+++ b/mm/kmsan/kmsan_test.c
@@ -640,6 +640,22 @@ static void test_unpoison_memory(struct kunit *test)
 	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
 }

+static void test_copy_from_kernel_nofault(struct kunit *test)
+{
+	long ret;
+	char buf[4], src[4];
+	size_t size = sizeof(buf);
+
+	EXPECTATION_UNINIT_VALUE_FN(expect, "copy_from_kernel_nofault");
+	kunit_info(
+		test,
+		"testing copy_from_kernel_nofault with uninitialized memory\n");
+
+	ret = copy_from_kernel_nofault((char *)&buf[0], (char *)&src[0], size);
+	USE(ret);
+	KUNIT_EXPECT_TRUE(test, report_matches(&expect));
+}
+
 static struct kunit_case kmsan_test_cases[] = {
 	KUNIT_CASE(test_uninit_kmalloc),
 	KUNIT_CASE(test_init_kmalloc),
@@ -664,6 +680,7 @@ static struct kunit_case kmsan_test_cases[] = {
 	KUNIT_CASE(test_long_origin_chain),
 	KUNIT_CASE(test_stackdepot_roundtrip),
 	KUNIT_CASE(test_unpoison_memory),
+	KUNIT_CASE(test_copy_from_kernel_nofault),
 	{},
 };

diff --git a/mm/maccess.c b/mm/maccess.c
index 518a25667323..3ca55ec63a6a 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -13,9 +13,14 @@ bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
 	return true;
 }

+/*
+ * The below only uses kmsan_check_memory() to ensure uninitialized kernel
+ * memory isn't leaked.
+ */
 #define copy_from_kernel_nofault_loop(dst, src, len, type, err_label)	\
 	while (len >= sizeof(type)) {					\
-		__get_kernel_nofault(dst, src, type, err_label);	\
+		__get_kernel_nofault(dst, src, type, err_label);	\
+		kmsan_check_memory(src, sizeof(type));			\
 		dst += sizeof(type);					\
 		src += sizeof(type);					\
 		len -= sizeof(type);					\
@@ -49,7 +54,8 @@ EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);

 #define copy_to_kernel_nofault_loop(dst, src, len, type, err_label)	\
 	while (len >= sizeof(type)) {					\
-		__put_kernel_nofault(dst, src, type, err_label);	\
+		__put_kernel_nofault(dst, src, type, err_label);	\
+		instrument_write(dst, sizeof(type));			\
 		dst += sizeof(type);					\
 		src += sizeof(type);					\
 		len -= sizeof(type);					\
--
2.50.1