From 0e5ffe9b2bd6d9ab7bf45f512c016e4710bf6d5d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 31 Jan 2025 12:31:52 +0000 Subject: [PATCH 01/16] mm: make vmg->target consistent and further simplify commit_merge() It is confusing for vmg->target to sometimes be the target merged VMA and in one case not. Fix this by having commit_merge() use its awareness of the vmg->_adjust_next_start case to know that it is manipulating a separate vma, abstracted in the 'vma' local variable. Place removal and adjust VMA determination logic into init_multi_vma_prep(), as the flags give us enough information to do so, and since this is the function that sets up the vma_prepare struct it makes sense to do so here. Doing this significantly simplifies commit_merge(), allowing us to eliminate the 'merge_target' handling, initialise the VMA iterator in a more sensible place and simply return vmg->target consistently. This also allows us to simplify setting vmg->target in vma_merge_existing_range() since we are then left only with two cases - merge left (or both) where the target is vmg->prev or merge right in which the target is vmg->next. This makes it easy for somebody reading the code to know what VMA will actually be the one returned and merged into and removes a great deal of the confusing 'adjust' nonsense. This patch has no change in functional behaviour. Link: https://lkml.kernel.org/r/50f96e31ab1980eaaf1006e34a4f6e6dad9320b8.1738326519.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Jann Horn Cc: Liam Howlett Signed-off-by: Andrew Morton --- mm/vma.c | 119 ++++++++++++++++++++++++++++--------------------------- mm/vma.h | 6 +-- 2 files changed, 62 insertions(+), 63 deletions(-) diff --git a/mm/vma.c b/mm/vma.c index bac0e72ccb62..b10625e8fc99 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -106,24 +106,40 @@ static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1, * init_multi_vma_prep() - Initializer for struct vma_prepare * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked - * @next: The next vma if it is to be adjusted - * @remove: The first vma to be removed - * @remove2: The second vma to be removed + * @vmg: The merge state that will be used to determine adjustment and VMA + * removal. */ static void init_multi_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma, - struct vm_area_struct *next, - struct vm_area_struct *remove, - struct vm_area_struct *remove2) + struct vma_merge_struct *vmg) { + struct vm_area_struct *adjust; + struct vm_area_struct **remove = &vp->remove; + memset(vp, 0, sizeof(struct vma_prepare)); vp->vma = vma; vp->anon_vma = vma->anon_vma; - vp->remove = remove ? remove : remove2; - vp->remove2 = remove ? 
remove2 : NULL; - vp->adj_next = next; - if (!vp->anon_vma && next) - vp->anon_vma = next->anon_vma; + + if (vmg && vmg->__remove_middle) { + *remove = vmg->middle; + remove = &vp->remove2; + } + if (vmg && vmg->__remove_next) + *remove = vmg->next; + + if (vmg && vmg->__adjust_middle_start) + adjust = vmg->middle; + else if (vmg && vmg->__adjust_next_start) + adjust = vmg->next; + else + adjust = NULL; + + vp->adj_next = adjust; + if (!vp->anon_vma && adjust) + vp->anon_vma = adjust->anon_vma; + + VM_WARN_ON(vp->anon_vma && adjust && adjust->anon_vma && + vp->anon_vma != adjust->anon_vma); vp->file = vma->vm_file; if (vp->file) @@ -360,7 +376,7 @@ again: */ static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) { - init_multi_vma_prep(vp, vma, NULL, NULL, NULL); + init_multi_vma_prep(vp, vma, NULL); } /* @@ -634,76 +650,63 @@ void validate_mm(struct mm_struct *mm) */ static struct vm_area_struct *commit_merge(struct vma_merge_struct *vmg) { - struct vm_area_struct *remove = NULL; - struct vm_area_struct *remove2 = NULL; + struct vm_area_struct *vma; struct vma_prepare vp; - struct vm_area_struct *adjust = NULL; + struct vm_area_struct *adjust; long adj_start; - bool merge_target; /* * If modifying an existing VMA and we don't remove vmg->middle, then we * shrink the adjacent VMA. */ if (vmg->__adjust_middle_start) { + vma = vmg->target; adjust = vmg->middle; /* The POSITIVE value by which we offset vmg->middle->vm_start. */ adj_start = vmg->end - vmg->middle->vm_start; - merge_target = true; + + /* Note: vma iterator must be pointing to 'start'. */ + vma_iter_config(vmg->vmi, vmg->start, vmg->end); } else if (vmg->__adjust_next_start) { + /* + * In this case alone, the VMA we manipulate is vmg->middle, but + * we ultimately return vmg->next. + */ + vma = vmg->middle; adjust = vmg->next; /* The NEGATIVE value by which we offset vmg->next->vm_start. */ adj_start = -(vmg->middle->vm_end - vmg->end); - /* - * In all cases but this - merge right, shrink next - we write - * vmg->target to the maple tree and return this as the merged VMA. - */ - merge_target = false; + + vma_iter_config(vmg->vmi, vmg->next->vm_start + adj_start, + vmg->next->vm_end); } else { + vma = vmg->target; adjust = NULL; adj_start = 0; - merge_target = true; - } - - if (vmg->__remove_middle) - remove = vmg->middle; - if (vmg->__remove_next) - remove2 = vmg->next; - - init_multi_vma_prep(&vp, vmg->target, adjust, remove, remove2); - VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && - vp.anon_vma != adjust->anon_vma); - - if (merge_target) { - /* Note: vma iterator must be pointing to 'start'. */ + /* Note: vma iterator must be pointing to 'start'. 
*/ vma_iter_config(vmg->vmi, vmg->start, vmg->end); - } else { - vma_iter_config(vmg->vmi, adjust->vm_start + adj_start, - adjust->vm_end); } - if (vma_iter_prealloc(vmg->vmi, vmg->target)) + init_multi_vma_prep(&vp, vma, vmg); + + if (vma_iter_prealloc(vmg->vmi, vma)) return NULL; vma_prepare(&vp); - vma_adjust_trans_huge(vmg->target, vmg->start, vmg->end, adj_start); - vma_set_range(vmg->target, vmg->start, vmg->end, vmg->pgoff); - - if (merge_target) - vma_iter_store(vmg->vmi, vmg->target); + vma_adjust_trans_huge(vma, vmg->start, vmg->end, adj_start); + vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff); if (adj_start) { adjust->vm_start += adj_start; adjust->vm_pgoff += PHYS_PFN(adj_start); - - if (!merge_target) - vma_iter_store(vmg->vmi, adjust); } - vma_complete(&vp, vmg->vmi, vmg->target->vm_mm); + vma_iter_store(vmg->vmi, vmg->target); + + vma_complete(&vp, vmg->vmi, vma->vm_mm); - return merge_target ? vmg->target : vmg->next; + return vmg->target; } /* We can only remove VMAs when merging if they do not have a close hook. */ @@ -833,11 +836,15 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( /* No matter what happens, we will be adjusting middle. */ vma_start_write(middle); - if (merge_left) - vma_start_write(prev); - - if (merge_right) + if (merge_right) { vma_start_write(next); + vmg->target = next; + } + + if (merge_left) { + vma_start_write(prev); + vmg->target = prev; + } if (merge_both) { /* @@ -847,7 +854,6 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( * extend delete delete */ - vmg->target = prev; vmg->start = prev->vm_start; vmg->end = next->vm_end; vmg->pgoff = prev->vm_pgoff; @@ -868,7 +874,6 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( * extend shrink/delete */ - vmg->target = prev; vmg->start = prev->vm_start; vmg->pgoff = prev->vm_pgoff; @@ -892,7 +897,6 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( VM_WARN_ON_VMG(vmg->start > middle->vm_start && prev && middle != prev, vmg); if (vmg->__remove_middle) { - vmg->target = next; vmg->end = next->vm_end; vmg->pgoff = next->vm_pgoff - pglen; } else { @@ -903,7 +907,6 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( * merged VMA is NOT vmg->target, but rather vmg->next. */ vmg->__adjust_next_start = true; - vmg->target = middle; vmg->start = middle->vm_start; vmg->end = start; vmg->pgoff = middle->vm_pgoff; diff --git a/mm/vma.h b/mm/vma.h index e18487797fa4..e55e68abfbe3 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -82,11 +82,7 @@ struct vma_merge_struct { struct vm_area_struct *prev; struct vm_area_struct *middle; struct vm_area_struct *next; - /* - * This is the VMA we ultimately target to become the merged VMA, except - * for the one exception of merge right, shrink next (for details of - * this scenario see vma_merge_existing_range()). - */ + /* This is the VMA we ultimately target to become the merged VMA. */ struct vm_area_struct *target; /* * Initially, the start, end, pgoff fields are provided by the caller -- 2.50.1 From c372473a545edff2fdbc002fc67c181e17c7557b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 31 Jan 2025 12:31:53 +0000 Subject: [PATCH 02/16] mm: completely abstract unnecessary adj_start calculation The adj_start calculation has been a constant source of confusion in the VMA merge code. There are two cases to consider, one where we adjust the start of the vmg->middle VMA (i.e. 
the vmg->__adjust_middle_start merge flag is set), in which case adj_start is calculated as: (1) adj_start = vmg->end - vmg->middle->vm_start And the case where we adjust the start of the vmg->next VMA (i.e. the vmg->__adjust_next_start merge flag is set), in which case adj_start is calculated as: (2) adj_start = -(vmg->middle->vm_end - vmg->end) We apply (1) thusly: vmg->middle->vm_start = vmg->middle->vm_start + vmg->end - vmg->middle->vm_start Which simplifies to: vmg->middle->vm_start = vmg->end Similarly, we apply (2) as: vmg->next->vm_start = vmg->next->vm_start + -(vmg->middle->vm_end - vmg->end) Noting that for these VMAs to be mergeable vmg->middle->vm_end == vmg->next->vm_start and so this simplifies to: vmg->next->vm_start = vmg->next->vm_start + -(vmg->next->vm_start - vmg->end) Which simplifies to: vmg->next->vm_start = vmg->end Therefore in each case, we simply need to adjust the start of the VMA to vmg->end (!) and can do away with this adj_start calculation. The only caveat is that we must ensure we update the vm_pgoff field correctly. We therefore abstract this entire calculation to a new function vmg_adjust_set_range() which performs this calculation and sets the adjusted VMA's new range using the general vma_set_range() function. We also must update vma_adjust_trans_huge() which expects the now-abstracted adj_start parameter. It turns out this is wholly unnecessary. In vma_adjust_trans_huge() the relevant code is: if (adjust_next > 0) { struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); unsigned long nstart = next->vm_start; nstart += adjust_next; split_huge_pmd_if_needed(next, nstart); } The only case where this is relevant is when vmg->__adjust_middle_start is specified (in which case adj_next would have been positive), i.e. the one in which the vma specified is vmg->prev and this the sought 'next' VMA would be vmg->middle. We can therefore eliminate the find_vma() invocation altogether and simply provide the vmg->middle VMA in this instance, or NULL otherwise. Again we have an adj_next offset calculation: next->vm_start + vmg->end - vmg->middle->vm_start Where next == vmg->middle this simplifies to vmg->end as previously demonstrated. Therefore nstart is equal to vmg->end, which is already passed to vma_adjust_trans_huge() via the 'end' parameter and so this code (rather delightfully) simplifies to: if (next) split_huge_pmd_if_needed(next, end); With these changes in place, it becomes silly for commit_merge() to return vmg->target, as it is always the same and threaded through vmg, so we finally change commit_merge() to return an error value once again. This patch has no change in functional behaviour. 
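To make the algebra above concrete, here is a minimal standalone userspace sketch (illustrative only - the addresses are made up and all kernel structs and plumbing are omitted) showing that both adjustment cases collapse to setting the adjusted VMA's vm_start to vmg->end:

  #include <assert.h>
  #include <stdio.h>

  int main(void)
  {
          /*
           * Illustrative addresses; the only precondition used is the
           * merge requirement that middle's end equals next's start.
           */
          unsigned long middle_start = 0x3000, middle_end = 0x6000;
          unsigned long next_start   = 0x6000;    /* == middle_end */
          unsigned long end          = 0x5000;    /* vmg->end */

          /* (1) __adjust_middle_start: positive offset applied to middle. */
          long adj_middle = (long)(end - middle_start);
          assert(middle_start + adj_middle == end);

          /* (2) __adjust_next_start: negative offset applied to next. */
          long adj_next = -(long)(middle_end - end);
          assert((long)next_start + adj_next == (long)end);

          printf("both cases set the adjusted vm_start to %#lx (vmg->end)\n", end);
          return 0;
  }

The only remaining bookkeeping is the matching vm_pgoff update, which is exactly what vmg_adjust_set_range() encapsulates.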
Link: https://lkml.kernel.org/r/7bce2cd4b5afb56211822835d145471280c3dccc.1738326519.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Jann Horn Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 +- mm/huge_memory.c | 19 ++---- mm/vma.c | 99 +++++++++++++++----------------- tools/testing/vma/vma_internal.h | 4 +- 4 files changed, 57 insertions(+), 69 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 93e509b6c00e..e1bea54820ff 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -404,7 +404,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, - unsigned long end, long adjust_next); + unsigned long end, struct vm_area_struct *next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); @@ -571,7 +571,7 @@ static inline int madvise_collapse(struct vm_area_struct *vma, static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, - long adjust_next) + struct vm_area_struct *next) { } static inline int is_swap_pmd(pmd_t pmd) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e33da765c428..e7ac4f0dc21d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3017,9 +3017,9 @@ static inline void split_huge_pmd_if_needed(struct vm_area_struct *vma, unsigned } void vma_adjust_trans_huge(struct vm_area_struct *vma, - unsigned long start, - unsigned long end, - long adjust_next) + unsigned long start, + unsigned long end, + struct vm_area_struct *next) { /* Check if we need to split start first. */ split_huge_pmd_if_needed(vma, start); @@ -3027,16 +3027,9 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, /* Check if we need to split end next. */ split_huge_pmd_if_needed(vma, end); - /* - * If we're also updating the next vma vm_start, - * check if we need to split it. - */ - if (adjust_next > 0) { - struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); - unsigned long nstart = next->vm_start; - nstart += adjust_next; - split_huge_pmd_if_needed(next, nstart); - } + /* If we're incrementing next->vm_start, we might need to split it. */ + if (next) + split_huge_pmd_if_needed(next, end); } static void unmap_folio(struct folio *folio) diff --git a/mm/vma.c b/mm/vma.c index b10625e8fc99..603e538a093f 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -513,7 +513,7 @@ __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); - vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + vma_adjust_trans_huge(vma, vma->vm_start, addr, NULL); if (new_below) { vma->vm_start = addr; @@ -644,46 +644,44 @@ void validate_mm(struct mm_struct *mm) #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* - * Actually perform the VMA merge operation. - * - * On success, returns the merged VMA. Otherwise returns NULL. + * Based on the vmg flag indicating whether we need to adjust the vm_start field + * for the middle or next VMA, we calculate what the range of the newly adjusted + * VMA ought to be, and set the VMA's range accordingly. 
*/ -static struct vm_area_struct *commit_merge(struct vma_merge_struct *vmg) +static void vmg_adjust_set_range(struct vma_merge_struct *vmg) { - struct vm_area_struct *vma; - struct vma_prepare vp; struct vm_area_struct *adjust; - long adj_start; + pgoff_t pgoff; - /* - * If modifying an existing VMA and we don't remove vmg->middle, then we - * shrink the adjacent VMA. - */ if (vmg->__adjust_middle_start) { - vma = vmg->target; adjust = vmg->middle; - /* The POSITIVE value by which we offset vmg->middle->vm_start. */ - adj_start = vmg->end - vmg->middle->vm_start; - - /* Note: vma iterator must be pointing to 'start'. */ - vma_iter_config(vmg->vmi, vmg->start, vmg->end); + pgoff = adjust->vm_pgoff + PHYS_PFN(vmg->end - adjust->vm_start); } else if (vmg->__adjust_next_start) { - /* - * In this case alone, the VMA we manipulate is vmg->middle, but - * we ultimately return vmg->next. - */ - vma = vmg->middle; adjust = vmg->next; - /* The NEGATIVE value by which we offset vmg->next->vm_start. */ - adj_start = -(vmg->middle->vm_end - vmg->end); + pgoff = adjust->vm_pgoff - PHYS_PFN(adjust->vm_start - vmg->end); + } else { + return; + } + + vma_set_range(adjust, vmg->end, adjust->vm_end, pgoff); +} + +/* + * Actually perform the VMA merge operation. + * + * Returns 0 on success, or an error value on failure. + */ +static int commit_merge(struct vma_merge_struct *vmg) +{ + struct vm_area_struct *vma; + struct vma_prepare vp; - vma_iter_config(vmg->vmi, vmg->next->vm_start + adj_start, - vmg->next->vm_end); + if (vmg->__adjust_next_start) { + /* We manipulate middle and adjust next, which is the target. */ + vma = vmg->middle; + vma_iter_config(vmg->vmi, vmg->end, vmg->next->vm_end); } else { vma = vmg->target; - adjust = NULL; - adj_start = 0; - /* Note: vma iterator must be pointing to 'start'. */ vma_iter_config(vmg->vmi, vmg->start, vmg->end); } @@ -691,22 +689,22 @@ static struct vm_area_struct *commit_merge(struct vma_merge_struct *vmg) init_multi_vma_prep(&vp, vma, vmg); if (vma_iter_prealloc(vmg->vmi, vma)) - return NULL; + return -ENOMEM; vma_prepare(&vp); - vma_adjust_trans_huge(vma, vmg->start, vmg->end, adj_start); + /* + * THP pages may need to do additional splits if we increase + * middle->vm_start. + */ + vma_adjust_trans_huge(vma, vmg->start, vmg->end, + vmg->__adjust_middle_start ? vmg->middle : NULL); vma_set_range(vma, vmg->start, vmg->end, vmg->pgoff); - - if (adj_start) { - adjust->vm_start += adj_start; - adjust->vm_pgoff += PHYS_PFN(adj_start); - } - + vmg_adjust_set_range(vmg); vma_iter_store(vmg->vmi, vmg->target); vma_complete(&vp, vmg->vmi, vma->vm_mm); - return vmg->target; + return 0; } /* We can only remove VMAs when merging if they do not have a close hook. */ @@ -749,7 +747,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( { struct vm_area_struct *middle = vmg->middle; struct vm_area_struct *prev = vmg->prev; - struct vm_area_struct *next, *res; + struct vm_area_struct *next; struct vm_area_struct *anon_dup = NULL; unsigned long start = vmg->start; unsigned long end = vmg->end; @@ -900,12 +898,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( vmg->end = next->vm_end; vmg->pgoff = next->vm_pgoff - pglen; } else { - /* - * We shrink middle and expand next. - * - * IMPORTANT: This is the ONLY case where the final - * merged VMA is NOT vmg->target, but rather vmg->next. - */ + /* We shrink middle and expand next. 
*/ vmg->__adjust_next_start = true; vmg->start = middle->vm_start; vmg->end = start; @@ -918,8 +911,10 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( if (err) goto abort; - res = commit_merge(vmg); - if (!res) { + err = commit_merge(vmg); + if (err) { + VM_WARN_ON(err != -ENOMEM); + if (anon_dup) unlink_anon_vmas(anon_dup); @@ -927,9 +922,9 @@ static __must_check struct vm_area_struct *vma_merge_existing_range( return NULL; } - khugepaged_enter_vma(res, vmg->flags); + khugepaged_enter_vma(vmg->target, vmg->flags); vmg->state = VMA_MERGE_SUCCESS; - return res; + return vmg->target; abort: vma_iter_set(vmg->vmi, start); @@ -1092,7 +1087,7 @@ int vma_expand(struct vma_merge_struct *vmg) if (remove_next) vmg->__remove_next = true; - if (!commit_merge(vmg)) + if (commit_merge(vmg)) goto nomem; return 0; @@ -1132,7 +1127,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, init_vma_prep(&vp, vma); vma_prepare(&vp); - vma_adjust_trans_huge(vma, start, end, 0); + vma_adjust_trans_huge(vma, start, end, NULL); vma_iter_clear(vmi); vma_set_range(vma, start, end, pgoff); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 1eae23039854..bb273927af0f 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -796,12 +796,12 @@ static inline void vma_start_write(struct vm_area_struct *vma) static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, - long adjust_next) + struct vm_area_struct *next) { (void)vma; (void)start; (void)end; - (void)adjust_next; + (void)next; } static inline void vma_iter_free(struct vma_iterator *vmi) -- 2.50.1 From 51ff4d7486f0c0b4110a6da4af805b179dd7b11e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Feb 2025 15:18:00 -0800 Subject: [PATCH 03/16] mm: avoid extra mem_alloc_profiling_enabled() checks Refactor code to avoid extra mem_alloc_profiling_enabled() checks inside pgalloc_tag_get() function which is often called after that check was already done. 
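The shape of the refactor is the usual "check once at the boundary" pattern: the caller-facing helper performs the mem_alloc_profiling_enabled() test, and the double-underscore helper documents that it assumes the test already passed. A minimal userspace sketch of the pattern (names and sizes here are illustrative, not the kernel implementation):

  #include <stdbool.h>
  #include <stdio.h>

  static bool profiling_enabled;  /* stands in for mem_alloc_profiling_enabled() */

  struct tag { long bytes; };

  /* Should be called only if profiling_enabled - no redundant check inside. */
  static struct tag *__tag_get(struct tag *candidate)
  {
          return candidate;       /* stub: the kernel helper looks the tag up from the page */
  }

  static void tag_sub_pages(struct tag *candidate, unsigned int nr)
  {
          struct tag *tag;

          if (!profiling_enabled) /* the single check, done at the entry point */
                  return;

          tag = __tag_get(candidate);
          if (tag)
                  tag->bytes -= 4096L * nr;
  }

  int main(void)
  {
          struct tag t = { .bytes = 4096L * 8 };

          profiling_enabled = true;
          tag_sub_pages(&t, 3);
          printf("bytes now %ld\n", t.bytes);     /* 4096 * 5 = 20480 */
          return 0;
  }

Callers that already perform the check, such as pgalloc_tag_split() and pgalloc_tag_swap() below, can then call the __pgalloc_tag_get() variant directly.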
Link: https://lkml.kernel.org/r/20250201231803.2661189-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Cc: David Wang <00107082@163.com> Cc: Steven Rostedt Cc: Kent Overstreet Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Vlastimil Babka Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 35 +++++++++++++++++++---------------- lib/alloc_tag.c | 6 +++--- mm/page_alloc.c | 3 +-- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 3469c4b20105..4a82b6b4820e 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -205,28 +205,32 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) } } -static inline struct alloc_tag *pgalloc_tag_get(struct page *page) +/* Should be called only if mem_alloc_profiling_enabled() */ +static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) { struct alloc_tag *tag = NULL; - - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_sub_check(&ref); - if (ref.ct) - tag = ct_to_alloc_tag(ref.ct); - put_page_tag_ref(handle); - } + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub_check(&ref); + if (ref.ct) + tag = ct_to_alloc_tag(ref.ct); + put_page_tag_ref(handle); } return tag; } -static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) +static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) { - if (mem_alloc_profiling_enabled() && tag) + struct alloc_tag *tag; + + if (!mem_alloc_profiling_enabled()) + return; + + tag = __pgalloc_tag_get(page); + if (tag) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } @@ -241,8 +245,7 @@ static inline void clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } -static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} +static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 19b45617bdcf..1d893e313614 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -174,7 +174,7 @@ void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) if (!mem_alloc_profiling_enabled()) return; - tag = pgalloc_tag_get(&folio->page); + tag = __pgalloc_tag_get(&folio->page); if (!tag) return; @@ -200,10 +200,10 @@ void pgalloc_tag_swap(struct folio *new, struct folio *old) if (!mem_alloc_profiling_enabled()) return; - tag_old = pgalloc_tag_get(&old->page); + tag_old = __pgalloc_tag_get(&old->page); if (!tag_old) return; - tag_new = pgalloc_tag_get(&new->page); + tag_new = __pgalloc_tag_get(&new->page); if (!tag_new) return; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 542d25f77be8..38c2b2d20b1d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4833,12 +4833,11 @@ void __free_pages(struct page *page, unsigned int order) { /* 
get PageHead before we drop reference */ int head = PageHead(page); - struct alloc_tag *tag = pgalloc_tag_get(page); if (put_page_testzero(page)) free_frozen_pages(page, order); else if (!head) { - pgalloc_tag_sub_pages(tag, (1 << order) - 1); + pgalloc_tag_sub_pages(page, (1 << order) - 1); while (order-- > 0) free_frozen_pages(page + (1 << order), order); } -- 2.50.1 From a642b27b991fd663de1676bf91583d1e2397d93d Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Feb 2025 15:18:01 -0800 Subject: [PATCH 04/16] alloc_tag: uninline code gated by mem_alloc_profiling_key in slab allocator When a sizable code section is protected by a disabled static key, that code gets into the instruction cache even though it's not executed and consumes the cache, increasing cache misses. This can be remedied by moving such code into a separate uninlined function. On a Pixel6 phone, slab allocation profiling overhead measured with CONFIG_MEM_ALLOC_PROFILING=y and profiling disabled is: baseline modified Big core 3.31% 0.17% Medium core 3.79% 0.57% Little core 6.68% 1.28% This improvement comes at the expense of the configuration when profiling gets enabled, since there is now an additional function call. The overhead from this additional call on Pixel6 is: Big core 0.66% Middle core 1.23% Little core 2.42% However this is negligible when compared with the overall overhead of the memory allocation profiling when it is enabled. On x86 this patch does not make noticeable difference because the overhead with mem_alloc_profiling_key disabled is much lower (under 1%) to start with, so any improvement is less visible and hard to distinguish from the noise. The overhead from additional call when profiling is enabled is also within noise levels. Link: https://lkml.kernel.org/r/20250201231803.2661189-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Vlastimil Babka Reviewed-by: Shakeel Butt Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Steven Rostedt Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- mm/slub.c | 51 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 19 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 1f50129dcfb3..184fd2b14758 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2000,7 +2000,8 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, return 0; } -static inline void free_slab_obj_exts(struct slab *slab) +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline void free_slab_obj_exts(struct slab *slab) { struct slabobj_ext *obj_exts; @@ -2077,33 +2078,37 @@ prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) return slab_obj_exts(slab) + obj_to_index(s, slab, p); } -static inline void -alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline void +__alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) { - if (need_slab_obj_ext()) { - struct slabobj_ext *obj_exts; + struct slabobj_ext *obj_exts; - obj_exts = prepare_slab_obj_exts_hook(s, flags, object); - /* - * Currently obj_exts is used only for allocation profiling. - * If other users appear then mem_alloc_profiling_enabled() - * check should be added before alloc_tag_add(). 
- */ - if (likely(obj_exts)) - alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); - } + obj_exts = prepare_slab_obj_exts_hook(s, flags, object); + /* + * Currently obj_exts is used only for allocation profiling. + * If other users appear then mem_alloc_profiling_enabled() + * check should be added before alloc_tag_add(). + */ + if (likely(obj_exts)) + alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); } static inline void -alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, - int objects) +alloc_tagging_slab_alloc_hook(struct kmem_cache *s, void *object, gfp_t flags) +{ + if (need_slab_obj_ext()) + __alloc_tagging_slab_alloc_hook(s, object, flags); +} + +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline void +__alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) { struct slabobj_ext *obj_exts; int i; - if (!mem_alloc_profiling_enabled()) - return; - /* slab->obj_exts might not be NULL if it was created for MEMCG accounting. */ if (s->flags & (SLAB_NO_OBJ_EXT | SLAB_NOLEAKTRACE)) return; @@ -2119,6 +2124,14 @@ alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, } } +static inline void +alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, + int objects) +{ + if (mem_alloc_profiling_enabled()) + __alloc_tagging_slab_free_hook(s, slab, p, objects); +} + #else /* CONFIG_MEM_ALLOC_PROFILING */ static inline void -- 2.50.1 From 93d5440ece3c0aa341fb02e3a44a1b7ab44304c8 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Sat, 1 Feb 2025 15:18:02 -0800 Subject: [PATCH 05/16] alloc_tag: uninline code gated by mem_alloc_profiling_key in page allocator When a sizable code section is protected by a disabled static key, that code gets into the instruction cache even though it's not executed and consumes the cache, increasing cache misses. This can be remedied by moving such code into a separate uninlined function. On a Pixel6 phone, page allocation profiling overhead measured with CONFIG_MEM_ALLOC_PROFILING=y and profiling disabled is: baseline modified Big core 4.93% 1.53% Medium core 4.39% 1.41% Little core 1.02% 0.36% This improvement comes at the expense of the configuration when profiling gets enabled, since there is now an additional function call. The overhead from this additional call on Pixel6 is: Big core 0.24% Middle core 0.63% Little core 1.1% However this is negligible when compared with the overall overhead of the memory allocation profiling when it is enabled. On x86 this patch does not make noticeable difference because the overhead with mem_alloc_profiling_key disabled is much lower (under 1%) to start with, so any improvement is less visible and hard to distinguish from the noise. The overhead from additional call when profiling is enabled is also within noise levels. 
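As a rough standalone illustration of the uninlining pattern (a plain bool stands in for the static key, and every name below is made up rather than taken from the kernel):

  #include <stdbool.h>
  #include <stdio.h>

  static bool profiling_enabled;  /* stand-in for mem_alloc_profiling_key */

  /* Cold body: noinline keeps it out of callers and out of their icache. */
  static __attribute__((noinline)) void __account_alloc(unsigned long bytes)
  {
          printf("accounted %lu bytes\n", bytes); /* imagine the real bookkeeping */
  }

  /* Hot wrapper: the only thing inlined into callers is the cheap test. */
  static inline void account_alloc(unsigned long bytes)
  {
          if (profiling_enabled)
                  __account_alloc(bytes);
  }

  int main(void)
  {
          account_alloc(64);              /* profiling off: near-zero cost */
          profiling_enabled = true;
          account_alloc(128);             /* profiling on: one extra call */
          return 0;
  }

The extra call taken when the wrapper's test passes is precisely the enabled-case overhead quoted in the numbers above.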
Link: https://lkml.kernel.org/r/20250201231803.2661189-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Zijlstra (Intel) Cc: Sourav Panda Cc: Steven Rostedt Cc: Vlastimil Babka Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- include/linux/pgalloc_tag.h | 60 +++------------------------- mm/page_alloc.c | 78 +++++++++++++++++++++++++++++++++++++ 2 files changed, 83 insertions(+), 55 deletions(-) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 4a82b6b4820e..c74077977830 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -162,47 +162,13 @@ static inline void update_page_tag_ref(union pgtag_ref_handle handle, union code } } -static inline void clear_page_tag_ref(struct page *page) -{ - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - set_codetag_empty(&ref); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} - -static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, - unsigned int nr) -{ - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} +/* Should be called only if mem_alloc_profiling_enabled() */ +void __clear_page_tag_ref(struct page *page); -static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) +static inline void clear_page_tag_ref(struct page *page) { - if (mem_alloc_profiling_enabled()) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(page, &ref, &handle)) { - alloc_tag_sub(&ref, PAGE_SIZE * nr); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } + if (mem_alloc_profiling_enabled()) + __clear_page_tag_ref(page); } /* Should be called only if mem_alloc_profiling_enabled() */ @@ -222,18 +188,6 @@ static inline struct alloc_tag *__pgalloc_tag_get(struct page *page) return tag; } -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) -{ - struct alloc_tag *tag; - - if (!mem_alloc_profiling_enabled()) - return; - - tag = __pgalloc_tag_get(page); - if (tag) - this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); -} - void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); void pgalloc_tag_swap(struct folio *new, struct folio *old); @@ -242,10 +196,6 @@ void __init alloc_tag_sec_init(void); #else /* CONFIG_MEM_ALLOC_PROFILING */ static inline void clear_page_tag_ref(struct page *page) {} -static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, - unsigned int nr) {} -static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} -static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} static inline void pgalloc_tag_swap(struct folio *new, struct folio *old) {} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 38c2b2d20b1d..d875f055aa53 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1041,6 +1041,84 @@ static void kernel_init_pages(struct page *page, int numpages) kasan_enable_current(); } +#ifdef 
CONFIG_MEM_ALLOC_PROFILING + +/* Should be called only if mem_alloc_profiling_enabled() */ +void __clear_page_tag_ref(struct page *page) +{ + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + set_codetag_empty(&ref); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); + } +} + +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline +void __pgalloc_tag_add(struct page *page, struct task_struct *task, + unsigned int nr) +{ + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); + } +} + +static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, + unsigned int nr) +{ + if (mem_alloc_profiling_enabled()) + __pgalloc_tag_add(page, task, nr); +} + +/* Should be called only if mem_alloc_profiling_enabled() */ +static noinline +void __pgalloc_tag_sub(struct page *page, unsigned int nr) +{ + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub(&ref, PAGE_SIZE * nr); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); + } +} + +static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) +{ + if (mem_alloc_profiling_enabled()) + __pgalloc_tag_sub(page, nr); +} + +static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) +{ + struct alloc_tag *tag; + + if (!mem_alloc_profiling_enabled()) + return; + + tag = __pgalloc_tag_get(page); + if (tag) + this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); +} + +#else /* CONFIG_MEM_ALLOC_PROFILING */ + +static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, + unsigned int nr) {} +static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} +static inline void pgalloc_tag_sub_pages(struct page *page, unsigned int nr) {} + +#endif /* CONFIG_MEM_ALLOC_PROFILING */ + __always_inline bool free_pages_prepare(struct page *page, unsigned int order) { -- 2.50.1 From 023fff71d893d9aa7814078f24d730afccbab9b9 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 4 Feb 2025 22:53:43 +0000 Subject: [PATCH 06/16] selftests/mm: fix thuge-gen test name uniqueness The thuge-gen test_mmap() and test_shmget() tests are repeatedly run for a variety of sizes but always report the result of their test with the same name, meaning that automated sysetms running the tests are unable to distinguish between the various tests. Add the supplied sizes to the logged test names to distinguish between runs. My test automation was getting pretty confused about what was going on - the test names are a pretty important external interface. 
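For reference, a tiny sketch of the reporting pattern (this is not the kselftest harness; the output format and sizes are approximations for illustration only):

  #include <stdio.h>

  int main(void)
  {
          unsigned long sizes[] = { 4096UL, 2UL << 20, 1UL << 30 };
          int num = 0;

          for (int i = 0; i < 3; i++)
                  /* Folding the size into the name makes every TAP result
                   * line unique, so automation can track each run. */
                  printf("ok %d test_mmap mmap %lu\n", ++num, sizes[i]);
          return 0;
  }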
Link: https://lkml.kernel.org/r/20250204-kselftest-mm-fix-dups-v1-1-6afe417ef4bb@kernel.org Fixes: b38bd9b2c448 ("selftests/mm: thuge-gen: conform to TAP format output") Signed-off-by: Mark Brown Reviewed-by: Dev Jain Cc: Muhammad Usama Anjum Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/thuge-gen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index e4370b79b62f..cd5174d735be 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -127,7 +127,7 @@ void test_mmap(unsigned long size, unsigned flags) show(size); ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, - "%s mmap\n", __func__); + "%s mmap %lu\n", __func__, size); if (munmap(map, size * NUM_PAGES)) ksft_exit_fail_msg("%s: unmap %s\n", __func__, strerror(errno)); @@ -165,7 +165,7 @@ void test_shmget(unsigned long size, unsigned flags) show(size); ksft_test_result(size == getpagesize() || (before - after) == NUM_PAGES, - "%s: mmap\n", __func__); + "%s: mmap %lu\n", __func__, size); if (shmdt(map)) ksft_exit_fail_msg("%s: shmdt: %s\n", __func__, strerror(errno)); } -- 2.50.1 From 4cc39f91ef6c6f876651eb231974a59ffbcb3a21 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 5 Feb 2025 22:15:14 -0800 Subject: [PATCH 07/16] mm/madvise: split out mmap locking operations for madvise() Patch series "mm/madvise: remove redundant mmap_lock operations from process_madvise()". process_madvise() calls do_madvise() for each address range. Then, each do_madvise() invocation holds and releases same mmap_lock. Optimize the redundant lock operations by splitting do_madvise() internal logic including the mmap_lock operations, and calling the small logic directly from process_madvise() in a sequence that removes the redundant locking. As a result of this change, process_madvise() becomes more efficient and less racy in terms of its results and latency. Note that the potential downside of this series is that other mmap_lock holders may take more time due to the increased length of mmap_lock critical section for process_madvise() calls. But there is maximum limit in the kernel space (IOV_MAX), and userspace can control the critical section length by setting the request size. Hence, the downside would be limited and controllable. Evaluation ========== I measured the time to apply MADV_DONTNEED advice to 256 MiB memory using multiple madvise() calls, 4 KiB per each call. I also do the same with process_madvise(), but with varying batch size (vlen) from 1 to 1024. The source code for the measurement is available at GitHub[1]. Because the microbenchmark result is not that stable, I ran each configuration five times and use the average. The measurement results are as below. 'sz_batches' column shows the batch size of process_madvise() calls. '0' batch size is for madvise() calls case. 'before' and 'after' columns are the measured time to apply MADV_DONTNEED to the 256 MiB memory buffer in nanoseconds, on kernels that built without and with the last patch of this series, respectively. So lower value means better efficiency. 'after/before' column is the ratio of 'after' to 'before'. 
sz_batches before after after/before 0 146294215.2 121280536.2 0.829017989769427 1 165851018.8 136305598.2 0.821855658085351 2 129469321.2 103740383.6 0.801273866569094 4 110369232.4 87835896.2 0.795836795182785 8 102906232.4 77420920.2 0.752344327397609 16 97551017.4 74959714.4 0.768415506038587 32 94809848.2 71200848.4 0.750985786305689 64 96087575.6 72593180 0.755489765942227 128 96154163.8 68517055.4 0.712575022154163 256 92901257.6 69054216.6 0.743307662177439 512 93646170.8 67053296.2 0.716028168874151 1024 92663219.2 70168196.8 0.75723892830177 Despite the unstable nature of the test program, the trend is as we expect. The measurement shows this patchset reduces the process_madvise() latency, proportional to the batching size. The latency gain was about 20% with the batch size 2, and it has increased to about 28% with the batch size 512, since more number of mmap locking is reduced with larger batch size. Note that the standard devitation of the measurements for each sz_batches configuration ranged from 1.9% to 7.2%. That is, this result is not very stable. The average of the standard deviations for different batch sizes were 4.62% and 4.70% for the 'before' and 'after' kernel measurements. Also note that this patch has somehow decreased latencies of madvise() and single batch size process_madvise(). Seems this code path is small enough to significantly be affected by compiler optimizations including inlining of split-out functions. Please focus on only the improvement amount that changed by the batch size. [1] https://github.com/sjp38/eval_proc_madvise This patch (of 4): Split out the madvise behavior-dependent mmap_lock operations from do_madvise(), for easier reuse of the logic in an upcoming change. [lorenzo.stoakes@oracle.com: fix madvise_[un]lock() issue] Link: https://lkml.kernel.org/r/2f448f7b-1da7-4099-aa9e-0179d47fde40@lucifer.local [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20250206061517.2958-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250206061517.2958-2-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Reviewed-by: Davidlohr Bueso Reviewed-by: Liam R. Howlett Cc: David Hildenbrand Cc: SeongJae Park Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/madvise.c | 62 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 08b207f8e61e..fa5dae5a7723 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1574,6 +1574,50 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ + +#ifdef CONFIG_MEMORY_FAILURE +static bool is_memory_failure(int behavior) +{ + switch (behavior) { + case MADV_HWPOISON: + case MADV_SOFT_OFFLINE: + return true; + default: + return false; + } +} +#else +static bool is_memory_failure(int behavior) +{ + return false; +} +#endif + +static int madvise_lock(struct mm_struct *mm, int behavior) +{ + if (is_memory_failure(behavior)) + return 0; + + if (madvise_need_mmap_write(behavior)) { + if (mmap_write_lock_killable(mm)) + return -EINTR; + } else { + mmap_read_lock(mm); + } + return 0; +} + +static void madvise_unlock(struct mm_struct *mm, int behavior) +{ + if (is_memory_failure(behavior)) + return; + + if (madvise_need_mmap_write(behavior)) + mmap_write_unlock(mm); + else + mmap_read_unlock(mm); +} + /* * The madvise(2) system call. 
* @@ -1650,7 +1694,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh { unsigned long end; int error; - int write; size_t len; struct blk_plug plug; @@ -1672,19 +1715,15 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh if (end == start) return 0; + error = madvise_lock(mm, behavior); + if (error) + return error; + #ifdef CONFIG_MEMORY_FAILURE if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) return madvise_inject_error(behavior, start, start + len_in); #endif - write = madvise_need_mmap_write(behavior); - if (write) { - if (mmap_write_lock_killable(mm)) - return -EINTR; - } else { - mmap_read_lock(mm); - } - start = untagged_addr_remote(mm, start); end = start + len; @@ -1701,10 +1740,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh } blk_finish_plug(&plug); - if (write) - mmap_write_unlock(mm); - else - mmap_read_unlock(mm); + madvise_unlock(mm, behavior); return error; } -- 2.50.1 From dbb0020bbc2c9f563d68564b36d6e8d32f82008b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 5 Feb 2025 22:15:15 -0800 Subject: [PATCH 08/16] mm/madvise: split out madvise input validity check Split out the madvise parameters validation logic from do_madvise(), for easy reuse of the logic from a future change. Link: https://lkml.kernel.org/r/20250206061517.2958-3-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Reviewed-by: Davidlohr Bueso Reviewed-by: Liam R. Howlett Cc: David Hildenbrand Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/madvise.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index fa5dae5a7723..ca858b8a837b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1618,6 +1618,27 @@ static void madvise_unlock(struct mm_struct *mm, int behavior) mmap_read_unlock(mm); } +static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) +{ + size_t len; + + if (!madvise_behavior_valid(behavior)) + return false; + + if (!PAGE_ALIGNED(start)) + return false; + len = PAGE_ALIGN(len_in); + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return false; + + if (start + len < start) + return false; + + return true; +} + /* * The madvise(2) system call. * @@ -1697,20 +1718,11 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh size_t len; struct blk_plug plug; - if (!madvise_behavior_valid(behavior)) + if (!is_valid_madvise(start, len_in, behavior)) return -EINVAL; - if (!PAGE_ALIGNED(start)) - return -EINVAL; len = PAGE_ALIGN(len_in); - - /* Check to see whether len was rounded up from small -ve to zero */ - if (len_in && !len) - return -EINVAL; - end = start + len; - if (end < start) - return -EINVAL; if (end == start) return 0; -- 2.50.1 From 457753da6462024ad821bcb4df2d828cf2ef18be Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 5 Feb 2025 22:15:16 -0800 Subject: [PATCH 09/16] mm/madvise: split out madvise() behavior execution Split out the madvise behavior applying logic from do_madvise() to make it easier to reuse from the following change. Link: https://lkml.kernel.org/r/20250206061517.2958-4-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. 
Howlett Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/madvise.c | 53 +++++++++++++++++++++++++++++----------------------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index ca858b8a837b..6e31e3202d71 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1639,6 +1639,35 @@ static bool is_valid_madvise(unsigned long start, size_t len_in, int behavior) return true; } +static int madvise_do_behavior(struct mm_struct *mm, + unsigned long start, size_t len_in, size_t len, int behavior) +{ + struct blk_plug plug; + unsigned long end; + int error; + +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) + return madvise_inject_error(behavior, start, start + len_in); +#endif + start = untagged_addr_remote(mm, start); + end = start + len; + + blk_start_plug(&plug); + switch (behavior) { + case MADV_POPULATE_READ: + case MADV_POPULATE_WRITE: + error = madvise_populate(mm, start, end, behavior); + break; + default: + error = madvise_walk_vmas(mm, start, end, behavior, + madvise_vma_behavior); + break; + } + blk_finish_plug(&plug); + return error; +} + /* * The madvise(2) system call. * @@ -1716,7 +1745,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh unsigned long end; int error; size_t len; - struct blk_plug plug; if (!is_valid_madvise(start, len_in, behavior)) return -EINVAL; @@ -1730,28 +1758,7 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh error = madvise_lock(mm, behavior); if (error) return error; - -#ifdef CONFIG_MEMORY_FAILURE - if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) - return madvise_inject_error(behavior, start, start + len_in); -#endif - - start = untagged_addr_remote(mm, start); - end = start + len; - - blk_start_plug(&plug); - switch (behavior) { - case MADV_POPULATE_READ: - case MADV_POPULATE_WRITE: - error = madvise_populate(mm, start, end, behavior); - break; - default: - error = madvise_walk_vmas(mm, start, end, behavior, - madvise_vma_behavior); - break; - } - blk_finish_plug(&plug); - + error = madvise_do_behavior(mm, start, len_in, len, behavior); madvise_unlock(mm, behavior); return error; -- 2.50.1 From 4000e3d0a367c5ff2035a0394b01b93974be6cb1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 5 Feb 2025 22:15:17 -0800 Subject: [PATCH 10/16] mm/madvise: remove redundant mmap_lock operations from process_madvise() Optimize redundant mmap lock operations from process_madvise() by directly doing the mmap locking first, and then the remaining works for all ranges in the loop. [akpm@linux-foundation.org: update comment, per Lorenzo] Link: https://lkml.kernel.org/r/20250206061517.2958-5-sj@kernel.org Signed-off-by: SeongJae Park Reviewed-by: Shakeel Butt Reviewed-by: Liam R. 
Howlett Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/madvise.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 6e31e3202d71..6ecead476a80 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1778,16 +1778,33 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, total_len = iov_iter_count(iter); + ret = madvise_lock(mm, behavior); + if (ret) + return ret; + while (iov_iter_count(iter)) { - ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), - iter_iov_len(iter), behavior); + unsigned long start = (unsigned long)iter_iov_addr(iter); + size_t len_in = iter_iov_len(iter); + size_t len; + + if (!is_valid_madvise(start, len_in, behavior)) { + ret = -EINVAL; + break; + } + + len = PAGE_ALIGN(len_in); + if (start + len == start) + ret = 0; + else + ret = madvise_do_behavior(mm, start, len_in, len, + behavior); /* * An madvise operation is attempting to restart the syscall, * but we cannot proceed as it would not be correct to repeat * the operation in aggregate, and would be surprising to the * user. * - * As we have already dropped locks, it is safe to just loop and + * We drop and reacquire locks so it is safe to just loop and * try again. We check for fatal signals in case we need exit * early anyway. */ @@ -1796,12 +1813,17 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, ret = -EINTR; break; } + + /* Drop and reacquire lock to unwind race. */ + madvise_unlock(mm, behavior); + madvise_lock(mm, behavior); continue; } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); } + madvise_unlock(mm, behavior); ret = (total_len - iov_iter_count(iter)) ? : ret; -- 2.50.1 From 33c9b01ed2fcbc101cdfeb497f4581e981e7c1e7 Mon Sep 17 00:00:00 2001 From: Liu Ye Date: Thu, 6 Feb 2025 14:09:58 +0800 Subject: [PATCH 11/16] mm/memfd: fix spelling and grammatical issues The comment "If a private mapping then writability is irrelevant" contains a typo. It should be "If a private mapping then writability is irrelevant". The comment "SEAL_EXEC implys SEAL_WRITE, making W^X from the start." contains a typo. It should be "SEAL_EXEC implies SEAL_WRITE, making W^X from the start." Link: https://lkml.kernel.org/r/20250206060958.98010-1-liuye@kylinos.cn Signed-off-by: Liu Ye Signed-off-by: Andrew Morton --- mm/memfd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index 37f7be57c2f5..c64df1343059 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -259,7 +259,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals) } /* - * SEAL_EXEC implys SEAL_WRITE, making W^X from the start. + * SEAL_EXEC implies SEAL_WRITE, making W^X from the start. */ if (seals & F_SEAL_EXEC && inode->i_mode & 0111) seals |= F_SEAL_SHRINK|F_SEAL_GROW|F_SEAL_WRITE|F_SEAL_FUTURE_WRITE; @@ -337,7 +337,7 @@ static int check_write_seal(unsigned long *vm_flags_ptr) unsigned long vm_flags = *vm_flags_ptr; unsigned long mask = vm_flags & (VM_SHARED | VM_WRITE); - /* If a private matting then writability is irrelevant. */ + /* If a private mapping then writability is irrelevant. */ if (!(mask & VM_SHARED)) return 0; -- 2.50.1 From 81fe88a946051f1dceef72fb3f87bb0880392464 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:10 +0800 Subject: [PATCH 12/16] mm/swap_state.c: fix the obsolete code comment Patch series "Tiny cleanup and improvements about SWAP code". 
These are all made during review and from reading the patchset "[PATCH v3 00/13] mm, swap: rework of swap allocator locks" from Kairui. This patch (of 12): Since commit 85a1333417a7 ("mm/swap: use dedicated entry for swap in folio"), there's a dedicated field in folio for swap entry. Let's update the code comment above add_to_swap_cache() accordingly. Link: https://lkml.kernel.org/r/20250205092721.9395-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20250205092721.9395-2-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Kairui Song Cc: Baoquan he Cc: Chris Li (Google) Signed-off-by: Andrew Morton --- mm/swap_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 2e1acb210e57..aabde86d1f47 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -85,7 +85,7 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) /* * add_to_swap_cache resembles filemap_add_folio on swapper_space, - * but sets SwapCache flag and private instead of mapping and index. + * but sets SwapCache flag and 'swap' instead of mapping and index. */ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) -- 2.50.1 From cd57a3fb37f91ef6106bc970d2b5c5878d3b5627 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:11 +0800 Subject: [PATCH 13/16] mm/swap_state.c: optimize the code in clear_shadow_from_swap_cache() Use ALIGN to achieve the same effect and simplify the code. Link: https://lkml.kernel.org/r/20250205092721.9395-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swap_state.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index aabde86d1f47..8a84371980e9 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -270,9 +270,7 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ - curr >>= SWAP_ADDRESS_SPACE_SHIFT; - curr++; - curr <<= SWAP_ADDRESS_SPACE_SHIFT; + curr = ALIGN((curr + 1), SWAP_ADDRESS_SPACE_PAGES); if (curr > end) break; } -- 2.50.1 From 0eb7d2c337f9df3b6adbcbdce213595d94781e05 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:12 +0800 Subject: [PATCH 14/16] mm/swap: remove SWAP_FLAG_PRIO_SHIFT It doesn't make sense to have a zero value of shift. Remove it to avoid confusion. 
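A trivial standalone check of the identity this relies on (the two flag values are copied from the header; the rest is illustrative):

  #include <assert.h>
  #include <stdio.h>

  #define SWAP_FLAG_PREFER        0x8000
  #define SWAP_FLAG_PRIO_MASK     0x7fff

  int main(void)
  {
          int swap_flags = SWAP_FLAG_PREFER | 123;        /* priority 123 requested */
          int old_prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> 0;  /* old shift of 0 */
          int new_prio = swap_flags & SWAP_FLAG_PRIO_MASK;

          assert(old_prio == new_prio);
          printf("priority = %d either way\n", new_prio);
          return 0;
  }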
Link: https://lkml.kernel.org/r/20250205092721.9395-4-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton --- include/linux/swap.h | 1 - mm/swapfile.c | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 9a48e79a0a52..bbd06cbd1f2b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -24,7 +24,6 @@ struct pagevec; #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ #define SWAP_FLAG_PRIO_MASK 0x7fff -#define SWAP_FLAG_PRIO_SHIFT 0 #define SWAP_FLAG_DISCARD 0x10000 /* enable discard for swap */ #define SWAP_FLAG_DISCARD_ONCE 0x20000 /* discard swap area at swapon-time */ #define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */ diff --git a/mm/swapfile.c b/mm/swapfile.c index df7c4e8b089c..3a17e40f4c95 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3453,8 +3453,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) - prio = - (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; + prio = swap_flags & SWAP_FLAG_PRIO_MASK; enable_swap_info(si, prio, swap_map, cluster_info, zeromap); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", -- 2.50.1 From 9b9cba7289ba7e5076fbfe913860306b4672631c Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:13 +0800 Subject: [PATCH 15/16] mm/swap: skip scanning cluster range if it's empty cluster Since ci->lock has been taken when isolating cluster from si->free_clusters or taking si->percpu_cluster->next[order], it's unnecessary to scan and check the cluster range availability if i'ts empty cluster, and this can accelerate the huge page swapping. Link: https://lkml.kernel.org/r/20250205092721.9395-5-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index 3a17e40f4c95..0a0c4118759e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -730,6 +730,9 @@ static bool cluster_scan_range(struct swap_info_struct *si, unsigned long offset, end = start + nr_pages; unsigned char *map = si->swap_map; + if (cluster_is_empty(ci)) + return true; + for (offset = start; offset < end; offset++) { switch (READ_ONCE(map[offset])) { case 0: -- 2.50.1 From b4735d94c29f6a65362d3cce0d55aa65e0e319e3 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:14 +0800 Subject: [PATCH 16/16] mm/swap: rename swap_is_has_cache() to swap_only_has_cache() There are two predicates in the name of swap_is_has_cache() which is confusing. Renaming it to remove the confusion and can better reflect its functionality. 
Link: https://lkml.kernel.org/r/20250205092721.9395-6-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 0a0c4118759e..0a9c1efeffd6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -161,7 +161,7 @@ static long swap_usage_in_pages(struct swap_info_struct *si) /* Reclaim directly, bypass the slot cache and don't touch device lock */ #define TTRS_DIRECT 0x8 -static bool swap_is_has_cache(struct swap_info_struct *si, +static bool swap_only_has_cache(struct swap_info_struct *si, unsigned long offset, int nr_pages) { unsigned char *map = si->swap_map + offset; @@ -243,7 +243,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * reference or pending writeback, and can't be allocated to others. */ ci = lock_cluster(si, offset); - need_reclaim = swap_is_has_cache(si, offset, nr_pages); + need_reclaim = swap_only_has_cache(si, offset, nr_pages); unlock_cluster(ci); if (!need_reclaim) goto out_unlock; @@ -1577,7 +1577,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) return; ci = lock_cluster(si, offset); - if (swap_is_has_cache(si, offset, size)) + if (swap_only_has_cache(si, offset, size)) swap_entry_range_free(si, ci, entry, size); else { for (int i = 0; i < size; i++, entry.val++) { -- 2.50.1