From e218d9fedd056a0b17468d6e19fddaf2c3550f97 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:52 -0800 Subject: [PATCH 01/16] mm: remove extra vma_numab_state_init() call vma_init() already memset's the whole vm_area_struct to 0, so there is no need for an additional vma_numab_state_init(). Link: https://lkml.kernel.org/r/20250213224655.1680278-16-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 06f179c844c3..aad932c4bcf0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -956,7 +956,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_numab_state_init(vma); vma_lock_init(vma, false); } -- 2.51.0 From e49510bf00de4f832eaaebcfc113795f127ad519 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:53 -0800 Subject: [PATCH 02/16] mm: prepare lock_vma_under_rcu() for vma reuse possibility Once we make the vma cache SLAB_TYPESAFE_BY_RCU, it will be possible for a vma to be reused and attached to another mm after lock_vma_under_rcu() locks the vma. lock_vma_under_rcu() should ensure that vma_start_read() is using the original mm, and after locking the vma it should ensure that vma->vm_mm has not changed from under us. Link: https://lkml.kernel.org/r/20250213224655.1680278-17-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++++---- mm/memory.c | 7 ++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index aad932c4bcf0..e3f962de1677 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -739,10 +739,13 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result.
+ * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. */ -static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) { int oldcnt; @@ -753,7 +756,7 @@ static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) return NULL; /* @@ -780,7 +783,7 @@ static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). */ - if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { vma_refcount_put(vma); return NULL; } @@ -914,7 +917,8 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} diff --git a/mm/memory.c b/mm/memory.c index 51f233404b02..39bceed7448f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6452,7 +6452,7 @@ retry: if (!vma) goto inval; - vma = vma_start_read(vma); + vma = vma_start_read(mm, vma); if (IS_ERR_OR_NULL(vma)) { /* Check if the VMA got isolated after we found it */ if (PTR_ERR(vma) == -EAGAIN) { @@ -6471,8 +6471,9 @@ retry: * fields are accessible for RCU readers. */ - /* Check since vm_start/vm_end might change before we lock the VMA */ - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + /* Check if the vma we locked is the right one. */ + if (unlikely(vma->vm_mm != mm || + address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; rcu_read_unlock(); -- 2.51.0 From 3104138517fc66aad21f4a2487bb572e9fc2e3ec Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:54 -0800 Subject: [PATCH 03/16] mm: make vma cache SLAB_TYPESAFE_BY_RCU To enable SLAB_TYPESAFE_BY_RCU for vma cache we need to ensure that object reuse before RCU grace period is over will be detected by lock_vma_under_rcu(). Current checks are sufficient as long as vma is detached before it is freed. The only place this is not currently happening is in exit_mmap(). Add the missing vma_mark_detached() in exit_mmap(). Another issue which might trick lock_vma_under_rcu() during vma reuse is vm_area_dup(), which copies the entire content of the vma into a new one, overriding new vma's vm_refcnt and temporarily making it appear as attached. This might trick a racing lock_vma_under_rcu() to operate on a reused vma if it found the vma before it got reused. To prevent this situation, we should ensure that vm_refcnt stays at detached state (0) when it is copied and advances to attached state only after it is added into the vma tree. 
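For orientation, the reader-side sequence that these rules protect can be sketched as follows. This is a hedged, heavily condensed approximation of the lock_vma_under_rcu() changes in this series: find_vma_under_rcu() is a hypothetical stand-in for the real maple-tree walk, and the -EAGAIN retry path is omitted.

	struct vm_area_struct *lock_vma_under_rcu_sketch(struct mm_struct *mm,
							 unsigned long address)
	{
		struct vm_area_struct *vma;

		rcu_read_lock();
		/* With SLAB_TYPESAFE_BY_RCU, this may return a freed, reused object. */
		vma = find_vma_under_rcu(mm, address);
		if (vma) {
			/* Fails while vm_refcnt is 0, i.e. while the vma is detached. */
			vma = vma_start_read(mm, vma);
			if (IS_ERR_OR_NULL(vma)) {
				vma = NULL;
			} else if (unlikely(vma->vm_mm != mm ||
					    address < vma->vm_start ||
					    address >= vma->vm_end)) {
				/* Locked it, but it was reused from under us. */
				vma_end_read(vma);
				vma = NULL;
			}
		}
		rcu_read_unlock();
		return vma;
	}

Both checks are needed: the refcount acquisition rejects objects that are currently detached, while the vm_mm/range re-check after locking rejects objects that were freed and reused for a different mm or address range in the meantime.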
Introduce vm_area_init_from() which preserves new vma's vm_refcnt and use it in vm_area_dup(). Since all vmas are in detached state with no current readers when they are freed, lock_vma_under_rcu() will not be able to take vm_refcnt after vma got detached even if vma is reused. vma_mark_attached() is modified to include a release fence to ensure all stores to the vma happen before vm_refcnt gets initialized. Finally, make vm_area_cachep SLAB_TYPESAFE_BY_RCU. This will facilitate vm_area_struct reuse and will minimize the number of call_rcu() calls. [surenb@google.com: remove atomic_set_release() usage in tools/] Link: https://lkml.kernel.org/r/20250217054351.2973666-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-18-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 +- include/linux/mm_types.h | 13 ++++-- include/linux/slab.h | 6 --- kernel/fork.c | 73 ++++++++++++++++++++------------ mm/mmap.c | 3 +- mm/vma.c | 11 ++--- mm/vma.h | 2 +- tools/include/linux/refcount.h | 5 +++ tools/testing/vma/vma_internal.h | 9 +--- 9 files changed, 70 insertions(+), 56 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e3f962de1677..14115c9949d8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -258,8 +258,6 @@ void setup_initial_init_mm(void *start_code, void *end_code, struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); -/* Use only if VMA has no other users */ -void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; @@ -890,7 +888,7 @@ static inline void vma_mark_attached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_detached(vma); - refcount_set(&vma->vm_refcnt, 1); + refcount_set_release(&vma->vm_refcnt, 1); } void vma_mark_detached(struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3aa0e20be41..6a93abb4452b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -574,6 +574,12 @@ static inline void *folio_get_private(struct folio *folio) typedef unsigned long vm_flags_t; +/* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. These are held in a global tree and are pinned by the VMAs that @@ -677,6 +683,9 @@ struct vma_numab_state { * * Only explicitly marked struct members may be accessed by RCU readers before * getting a stable reference.
+ * + * WARNING: when adding new members, please update vm_area_init_from() to copy + * them during vm_area_struct content duplication. */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -687,9 +696,7 @@ struct vm_area_struct { unsigned long vm_start; unsigned long vm_end; }; -#ifdef CONFIG_PER_VMA_LOCK - struct rcu_head vm_rcu; /* Used for deferred freeing. */ -#endif + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ }; /* diff --git a/include/linux/slab.h b/include/linux/slab.h index ad902a2d692b..f8924fd6ea26 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -243,12 +243,6 @@ enum _slab_flag_bits { #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif -/* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * diff --git a/kernel/fork.c b/kernel/fork.c index 48a0038f606f..364b2d4fd3ef 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) return vma; } +static void vm_area_init_from(const struct vm_area_struct *src, + struct vm_area_struct *dest) +{ + dest->vm_mm = src->vm_mm; + dest->vm_ops = src->vm_ops; + dest->vm_start = src->vm_start; + dest->vm_end = src->vm_end; + dest->anon_vma = src->anon_vma; + dest->vm_pgoff = src->vm_pgoff; + dest->vm_file = src->vm_file; + dest->vm_private_data = src->vm_private_data; + vm_flags_init(dest, src->vm_flags); + memcpy(&dest->vm_page_prot, &src->vm_page_prot, + sizeof(dest->vm_page_prot)); + /* + * src->shared.rb may be modified concurrently when called from + * dup_mmap(), but the clone will reinitialize it. + */ + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, + sizeof(dest->vm_userfaultfd_ctx)); +#ifdef CONFIG_ANON_VMA_NAME + dest->anon_name = src->anon_name; +#endif +#ifdef CONFIG_SWAP + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, + sizeof(dest->swap_readahead_info)); +#endif +#ifndef CONFIG_MMU + dest->vm_region = src->vm_region; +#endif +#ifdef CONFIG_NUMA + dest->vm_policy = src->vm_policy; +#endif +} + struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); @@ -458,11 +494,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - /* - * orig->shared.rb may be modified concurrently, but the clone - * will be reinitialized. - */ - data_race(memcpy(new, orig, sizeof(*new))); + vm_area_init_from(orig, new); vma_lock_init(new, true); INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); @@ -471,7 +503,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return new; } -void __vm_area_free(struct vm_area_struct *vma) +void vm_area_free(struct vm_area_struct *vma) { /* The vma should be detached while being destroyed. 
*/ vma_assert_detached(vma); @@ -480,25 +512,6 @@ void __vm_area_free(struct vm_area_struct *vma) kmem_cache_free(vm_area_cachep, vma); } -#ifdef CONFIG_PER_VMA_LOCK -static void vm_area_free_rcu_cb(struct rcu_head *head) -{ - struct vm_area_struct *vma = container_of(head, struct vm_area_struct, - vm_rcu); - - __vm_area_free(vma); -} -#endif - -void vm_area_free(struct vm_area_struct *vma) -{ -#ifdef CONFIG_PER_VMA_LOCK - call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); -#else - __vm_area_free(vma); -#endif -} - static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -3156,6 +3169,11 @@ void __init mm_cache_init(void) void __init proc_caches_init(void) { + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + }; + sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3172,8 +3190,9 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, - SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC| + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), &args, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); diff --git a/mm/mmap.c b/mm/mmap.c index 6401a1d73f4a..15d6cd7cc845 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1305,7 +1305,8 @@ void exit_mmap(struct mm_struct *mm) do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - remove_vma(vma, /* unreachable = */ true); + vma_mark_detached(vma); + remove_vma(vma); count++; cond_resched(); vma = vma_next(&vmi); diff --git a/mm/vma.c b/mm/vma.c index 53f4d0efce4d..5cdc5612bfc1 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -420,19 +420,14 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg, /* * Close a vm structure and free it. 
*/ -void remove_vma(struct vm_area_struct *vma, bool unreachable) +void remove_vma(struct vm_area_struct *vma) { might_sleep(); vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - if (unreachable) { - vma_mark_detached(vma); - __vm_area_free(vma); - } else { - vm_area_free(vma); - } + vm_area_free(vma); } /* @@ -1218,7 +1213,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) - remove_vma(vma, /* unreachable = */ false); + remove_vma(vma); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); diff --git a/mm/vma.h b/mm/vma.h index 55be77ff042f..7356ca5a22d3 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -218,7 +218,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); -void remove_vma(struct vm_area_struct *vma, bool unreachable); +void remove_vma(struct vm_area_struct *vma); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); diff --git a/tools/include/linux/refcount.h b/tools/include/linux/refcount.h index 36cb29bc57c2..1f30956e070d 100644 --- a/tools/include/linux/refcount.h +++ b/tools/include/linux/refcount.h @@ -60,6 +60,11 @@ static inline void refcount_set(refcount_t *r, unsigned int n) atomic_set(&r->refs, n); } +static inline void refcount_set_release(refcount_t *r, unsigned int n) +{ + atomic_set(&r->refs, n); +} + static inline unsigned int refcount_read(const refcount_t *r) { return atomic_read(&r->refs); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index b385170fbb8f..572ab2cea763 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -476,7 +476,7 @@ static inline void vma_mark_attached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_detached(vma); - refcount_set(&vma->vm_refcnt, 1); + refcount_set_release(&vma->vm_refcnt, 1); } static inline void vma_mark_detached(struct vm_area_struct *vma) @@ -696,14 +696,9 @@ static inline void mpol_put(struct mempolicy *) { } -static inline void __vm_area_free(struct vm_area_struct *vma) -{ - free(vma); -} - static inline void vm_area_free(struct vm_area_struct *vma) { - __vm_area_free(vma); + free(vma); } static inline void lru_add_drain(void) -- 2.51.0 From 795f29616e85aff32248e695c9cc1fbc8b4c9632 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:55 -0800 Subject: [PATCH 04/16] docs/mm: document latest changes to vm_lock Change the documentation to reflect that vm_lock is integrated into vma and replaced with vm_refcnt. Document newly introduced vma_start_read_locked{_nested} functions. Link: https://lkml.kernel.org/r/20250213224655.1680278-19-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . 
McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Suren Baghdasaryan Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- Documentation/mm/process_addrs.rst | 44 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst index 81417fa2ed20..e6756e78b476 100644 --- a/Documentation/mm/process_addrs.rst +++ b/Documentation/mm/process_addrs.rst @@ -716,9 +716,14 @@ calls :c:func:`!rcu_read_lock` to ensure that the VMA is looked up in an RCU critical section, then attempts to VMA lock it via :c:func:`!vma_start_read`, before releasing the RCU lock via :c:func:`!rcu_read_unlock`. -VMA read locks hold the read lock on the :c:member:`!vma->vm_lock` semaphore for -their duration and the caller of :c:func:`!lock_vma_under_rcu` must release it -via :c:func:`!vma_end_read`. +In cases when the user already holds mmap read lock, :c:func:`!vma_start_read_locked` +and :c:func:`!vma_start_read_locked_nested` can be used. These functions do not +fail due to lock contention but the caller should still check their return values +in case they fail for other reasons. + +VMA read locks increment :c:member:`!vma.vm_refcnt` reference counter for their +duration and the caller of :c:func:`!lock_vma_under_rcu` must drop it via +:c:func:`!vma_end_read`. VMA **write** locks are acquired via :c:func:`!vma_start_write` in instances where a VMA is about to be modified, unlike :c:func:`!vma_start_read` the lock is always @@ -726,9 +731,9 @@ acquired. An mmap write lock **must** be held for the duration of the VMA write lock, releasing or downgrading the mmap write lock also releases the VMA write lock so there is no :c:func:`!vma_end_write` function. -Note that a semaphore write lock is not held across a VMA lock. Rather, a -sequence number is used for serialisation, and the write semaphore is only -acquired at the point of write lock to update this. +Note that when write-locking a VMA lock, the :c:member:`!vma.vm_refcnt` is temporarily +modified so that readers can detect the presense of a writer. The reference counter is +restored once the vma sequence number used for serialisation is updated. This ensures the semantics we require - VMA write locks provide exclusive write access to the VMA. @@ -738,7 +743,7 @@ Implementation details The VMA lock mechanism is designed to be a lightweight means of avoiding the use of the heavily contended mmap lock. It is implemented using a combination of a -read/write semaphore and sequence numbers belonging to the containing +reference counter and sequence numbers belonging to the containing :c:struct:`!struct mm_struct` and the VMA. Read locks are acquired via :c:func:`!vma_start_read`, which is an optimistic @@ -779,28 +784,31 @@ release of any VMA locks on its release makes sense, as you would never want to keep VMAs locked across entirely separate write operations. It also maintains correct lock ordering. -Each time a VMA read lock is acquired, we acquire a read lock on the -:c:member:`!vma->vm_lock` read/write semaphore and hold it, while checking that -the sequence count of the VMA does not match that of the mm. +Each time a VMA read lock is acquired, we increment :c:member:`!vma.vm_refcnt` +reference counter and check that the sequence count of the VMA does not match +that of the mm. -If it does, the read lock fails. 
If it does not, we hold the lock, excluding -writers, but permitting other readers, who will also obtain this lock under RCU. +If it does, the read lock fails and :c:member:`!vma.vm_refcnt` is dropped. +If it does not, we keep the reference counter raised, excluding writers, but +permitting other readers, who can also obtain this lock under RCU. Importantly, maple tree operations performed in :c:func:`!lock_vma_under_rcu` are also RCU safe, so the whole read lock operation is guaranteed to function correctly. -On the write side, we acquire a write lock on the :c:member:`!vma->vm_lock` -read/write semaphore, before setting the VMA's sequence number under this lock, -also simultaneously holding the mmap write lock. +On the write side, we set a bit in :c:member:`!vma.vm_refcnt` which can't be +modified by readers and wait for all readers to drop their reference count. +Once there are no readers, the VMA's sequence number is set to match that of +the mm. During this entire operation mmap write lock is held. This way, if any read locks are in effect, :c:func:`!vma_start_write` will sleep until these are finished and mutual exclusion is achieved. -After setting the VMA's sequence number, the lock is released, avoiding -complexity with a long-term held write lock. +After setting the VMA's sequence number, the bit in :c:member:`!vma.vm_refcnt` +indicating a writer is cleared. From this point on, VMA's sequence number will +indicate VMA's write-locked state until mmap write lock is dropped or downgraded. -This clever combination of a read/write semaphore and sequence count allows for +This clever combination of a reference counter and sequence count allows for fast RCU-based per-VMA lock acquisition (especially on page fault, though utilised elsewhere) with minimal complexity around lock ordering. -- 2.51.0 From fcd807a03b864e2c7b2aa5eaade185127c4e2414 Mon Sep 17 00:00:00 2001 From: Marcelo Moreira Date: Mon, 17 Feb 2025 18:54:31 -0300 Subject: [PATCH 05/16] Docs/mm/damon: fix spelling and grammar in monitoring_intervals_tuning_example.rst This patch fixes some spelling and grammar mistakes in the documentation, improving the readability. - multipled -> multiplied - idential -> identical - minuts -> minutes - efficieny -> efficiency Link: https://lkml.kernel.org/r/20250217215512.12833-1-marcelomoreira1905@gmail.com Signed-off-by: Marcelo Moreira Reviewed-by: SeongJae Park Cc: Shuah khan Signed-off-by: Andrew Morton --- .../mm/damon/monitoring_intervals_tuning_example.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/mm/damon/monitoring_intervals_tuning_example.rst b/Documentation/mm/damon/monitoring_intervals_tuning_example.rst index 334a854efb40..7207cbed591f 100644 --- a/Documentation/mm/damon/monitoring_intervals_tuning_example.rst +++ b/Documentation/mm/damon/monitoring_intervals_tuning_example.rst @@ -36,7 +36,7 @@ Then, list the DAMON-found regions of different access patterns, sorted by the "access temperature". "Access temperature" is a metric representing the access-hotness of a region. It is calculated as a weighted sum of the access frequency and the age of the region. If the access frequency is 0 %, the -temperature is multipled by minus one. That is, if a region is not accessed, +temperature is multiplied by minus one. That is, if a region is not accessed, it gets minus temperature and it gets lower as not accessed for longer time. 
The sorting is in temperature-ascending order, so the region at the top of the list is the coldest, and the one at the bottom is the hottest one. :: @@ -58,11 +58,11 @@ list is the coldest, and the one at the bottom is the hottest one. :: The list shows not seemingly hot regions, and only minimum access pattern diversity. Every region has zero access frequency. The number of region is 10, which is the default ``min_nr_regions value``. Size of each region is also -nearly idential. We can suspect this is because “adaptive regions adjustment” +nearly identical. We can suspect this is because “adaptive regions adjustment” mechanism was not well working. As the guide suggested, we can get relative hotness of regions using ``age`` as the recency information. That would be better than nothing, but given the fact that the longest age is only about 6 -seconds while we waited about ten minuts, it is unclear how useful this will +seconds while we waited about ten minutes, it is unclear how useful this will be. The temperature ranges to total size of regions of each range histogram @@ -190,7 +190,7 @@ for sampling and aggregation intervals, respectively). :: The number of regions having different access patterns has significantly increased. Size of each region is also more varied. Total size of non-zero access frequency regions is also significantly increased. Maybe this is already -good enough to make some meaningful memory management efficieny changes. +good enough to make some meaningful memory management efficiency changes. 800ms/16s intervals: Another bias ================================= -- 2.51.0 From 63a23847dc47113b879a5f53cc0ca5cedc881ffd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:06 +0000 Subject: [PATCH 06/16] fs: convert block_commit_write() to take a folio All callers now have a folio, so pass it in instead of converting folio->page->folio.
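For illustration, a hedged before/after sketch of a typical call site (the pattern matches the ext4, ocfs2 and udf hunks below; surrounding function context omitted):

	/* Before: the caller had a folio, converted it to a page, and
	 * block_commit_write() immediately converted it back internally. */
	block_commit_write(&folio->page, from, to);

	/* After: the folio is passed straight through. */
	block_commit_write(folio, from, to);

Dropping the round trip also lets the page-based wrapper that performed the page_folio() conversion disappear entirely, as the fs/buffer.c hunk below shows.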
Link: https://lkml.kernel.org/r/20250217192009.437916-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/buffer.c | 14 ++++---------- fs/ext4/inline.c | 2 +- fs/ext4/move_extent.c | 2 +- fs/iomap/buffered-io.c | 2 +- fs/ocfs2/aops.c | 4 ++-- fs/ocfs2/file.c | 2 +- fs/udf/file.c | 2 +- include/linux/buffer_head.h | 2 +- 8 files changed, 12 insertions(+), 18 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index cc8452f60251..c66a59bb068b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2166,7 +2166,7 @@ int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, } EXPORT_SYMBOL(__block_write_begin); -static void __block_commit_write(struct folio *folio, size_t from, size_t to) +void block_commit_write(struct folio *folio, size_t from, size_t to) { size_t block_start, block_end; bool partial = false; @@ -2204,6 +2204,7 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to) if (!partial) folio_mark_uptodate(folio); } +EXPORT_SYMBOL(block_commit_write); /* * block_write_begin takes care of the basic task of block allocation and @@ -2262,7 +2263,7 @@ int block_write_end(struct file *file, struct address_space *mapping, flush_dcache_folio(folio); /* This could be a short (even 0-length) commit */ - __block_commit_write(folio, start, start + copied); + block_commit_write(folio, start, start + copied); return copied; } @@ -2578,13 +2579,6 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(cont_write_begin); -void block_commit_write(struct page *page, unsigned from, unsigned to) -{ - struct folio *folio = page_folio(page); - __block_commit_write(folio, from, to); -} -EXPORT_SYMBOL(block_commit_write); - /* * block_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. 
Hence we must @@ -2630,7 +2624,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (unlikely(ret)) goto out_unlock; - __block_commit_write(folio, 0, end); + block_commit_write(folio, 0, end); folio_mark_dirty(folio); folio_wait_stable(folio); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3536ca7e4fcc..0af474c8b260 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -637,7 +637,7 @@ retry: goto retry; if (folio) - block_commit_write(&folio->page, from, to); + block_commit_write(folio, from, to); out: if (folio) { folio_unlock(folio); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 898443e98efc..48649be64d6a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -399,7 +399,7 @@ data_copy: bh = bh->b_this_page; } - block_commit_write(&folio[0]->page, from, from + replaced_size); + block_commit_write(folio[0], from, from + replaced_size); /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d303e6c8900c..f3904d13cda4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1484,7 +1484,7 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, &iter->iomap); if (ret) return ret; - block_commit_write(&folio->page, 0, length); + block_commit_write(folio, 0, length); } else { WARN_ON_ONCE(!folio_test_uptodate(folio)); folio_mark_dirty(folio); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 5bbeb6fbb1ac..ee1d92ed950f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -920,7 +920,7 @@ static void ocfs2_write_failure(struct inode *inode, ocfs2_jbd2_inode_add_write(wc->w_handle, inode, user_pos, user_len); - block_commit_write(&folio->page, from, to); + block_commit_write(folio, from, to); } } } @@ -2012,7 +2012,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); } - block_commit_write(&folio->page, from, to); + block_commit_write(folio, from, to); } } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e54f2c4b5a90..2056cf08ac1e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -813,7 +813,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* must not update i_size! 
*/ - block_commit_write(&folio->page, block_start + 1, block_start + 1); + block_commit_write(folio, block_start + 1, block_start + 1); } /* diff --git a/fs/udf/file.c b/fs/udf/file.c index 412fe7c4d348..0d76c4f37b3e 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -69,7 +69,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) goto out_unlock; } - block_commit_write(&folio->page, 0, end); + block_commit_write(folio, 0, end); out_dirty: folio_mark_dirty(folio); folio_wait_stable(folio); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 932139c5d46f..6672e1a5031c 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -271,7 +271,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct folio **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); -void block_commit_write(struct page *page, unsigned int from, unsigned int to); +void block_commit_write(struct folio *folio, size_t from, size_t to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); -- 2.51.0 From 52d671a1a36a16f3a0dd9a2beff964e75bce9787 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:07 +0000 Subject: [PATCH 07/16] fs: remove page_file_mapping() This wrapper has no more callers. Delete it. Link: https://lkml.kernel.org/r/20250217192009.437916-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..975c56fb4f85 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -575,11 +575,6 @@ static inline struct address_space *folio_flush_mapping(struct folio *folio) return folio_mapping(folio); } -static inline struct address_space *page_file_mapping(struct page *page) -{ - return folio_file_mapping(page_folio(page)); -} - /** * folio_inode - Get the host inode for this folio. * @folio: The folio. -- 2.51.0 From 0d40cfe63a2f19b9d375382e6d90b9ebd412901e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:08 +0000 Subject: [PATCH 08/16] fs: remove folio_file_mapping() No callers of this function remain as filesystems no longer see swapfile pages through their normal read/write paths. Link: https://lkml.kernel.org/r/20250217192009.437916-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 975c56fb4f85..ad7c0f615e9b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -535,26 +535,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); -/** - * folio_file_mapping - Find the mapping this folio belongs to. - * @folio: The folio. - * - * For folios which are in the page cache, return the mapping that this - * page belongs to. Folios in the swap cache return the mapping of the - * swap file or swap device where the data is stored. This is different - * from the mapping returned by folio_mapping(). The only reason to - * use it is if, like NFS, you return 0 from ->activate_swapfile. 
- * - * Do not call this for folios which aren't in the page cache or swap cache. - */ -static inline struct address_space *folio_file_mapping(struct folio *folio) -{ - if (unlikely(folio_test_swapcache(folio))) - return swapcache_mapping(folio); - - return folio->mapping; -} - /** * folio_flush_mapping - Find the file mapping this folio belongs to. * @folio: The folio. -- 2.51.0 From 8e4909d693224134fb14ae082c379168b43112c9 Mon Sep 17 00:00:00 2001 From: Suchit K Date: Sat, 15 Feb 2025 22:30:42 +0530 Subject: [PATCH 09/16] Documentation/mm: fix spelling mistake The word watermark was misspelled as "watemark". Link: https://lkml.kernel.org/r/CAO9wTFhe4sf1eVVgijt2cdLPPsUHBj7B=HN-380_JSpve5KbvQ@mail.gmail.com Signed-off-by: Suchit Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/mm/balance.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/mm/balance.rst b/Documentation/mm/balance.rst index abaa78561c31..c4962c89a7f5 100644 --- a/Documentation/mm/balance.rst +++ b/Documentation/mm/balance.rst @@ -81,7 +81,7 @@ Page stealing from process memory and shm is done if stealing the page would alleviate memory pressure on any zone in the page's node that has fallen below its watermark. -watemark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These +watermark[WMARK_MIN/WMARK_LOW/WMARK_HIGH]/low_on_memory/zone_wake_kswapd: These are per-zone fields, used to determine when a zone needs to be balanced. When the number of pages falls below watermark[WMARK_MIN], the hysteric field low_on_memory gets set. This stays set till the number of free pages becomes -- 2.51.0 From af3b45aac5c9d0bdaebf4e93e3fecddc3f363857 Mon Sep 17 00:00:00 2001 From: Ujwal Kundur Date: Sat, 15 Feb 2025 13:48:03 +0530 Subject: [PATCH 10/16] selftests/mm: fix spelling Fix misspelling flagged by codespell. Link: https://lkml.kernel.org/r/20250215081803.1793-1-ujwal.kundur@gmail.com Signed-off-by: Ujwal Kundur Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 31e0c8a3110d..5457a078690d 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -348,7 +348,7 @@ int uffd_test_ctx_init(uint64_t features, const char **errmsg) /* * After initialization of area_src, we must explicitly release pages * for area_dst to make sure it's fully empty. Otherwise we could have - * some area_dst pages be errornously initialized with zero pages, + * some area_dst pages be erroneously initialized with zero pages, * hence we could hit memory corruption later in the test. * * One example is when THP is globally enabled, above allocate_area() -- 2.51.0 From 86758b504864913233f6a16076184ba784cd4466 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Tue, 18 Feb 2025 15:49:54 +0530 Subject: [PATCH 11/16] mm/ioremap: pass pgprot_t to ioremap_prot() instead of unsigned long ioremap_prot() currently accepts its pgprot_val parameter as an unsigned long, thus implicitly assuming that pgprot_val and pgprot_t can never be bigger than unsigned long. But this assumption will soon no longer hold on arm64 when using D128 pgtables: in the 128-bit page table configuration, unsigned long is 64 bits wide, but pgprot_t is 128 bits. Passing the platform-abstracted pgprot_t is better than relying on a fixed-size scalar type.
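To see why the unsigned long parameter is a trap, here is a hedged userspace sketch of the truncation. It uses a hypothetical 128-bit pgprot-like type, not the real arm64 D128 encoding, and assumes GCC/Clang's unsigned __int128 on a 64-bit target:

	#include <stdio.h>

	/* Hypothetical pgprot-style wrapper wider than unsigned long. */
	typedef struct { unsigned __int128 val; } pgprot128_t;

	static void takes_unsigned_long(unsigned long prot_val)
	{
		/* Only the low 64 bits ever arrive here. */
		printf("received: %#lx\n", prot_val);
	}

	int main(void)
	{
		/* Attribute bits both below and above bit 63. */
		pgprot128_t prot = { ((unsigned __int128)0xa5 << 64) | 0x703 };

		/* The conversion compiles silently and drops the top half. */
		takes_unsigned_long((unsigned long)prot.val);
		printf("lost high bits: %#llx\n",
		       (unsigned long long)(prot.val >> 64));
		return 0;
	}

With ioremap_prot() taking pgprot_t itself, the full value is preserved end to end and no such narrowing conversion can occur.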
Let's change the parameter to directly pass pgprot_t like another similar helper generic_ioremap_prot(). Without this change in place, D128 configuration does not work on arm64 as the top 64 bits get silently stripped when passing the protection value to this function. Link: https://lkml.kernel.org/r/20250218101954.415331-1-anshuman.khandual@arm.com Signed-off-by: Ryan Roberts Co-developed-by: Anshuman Khandual Signed-off-by: Anshuman Khandual Acked-by: Catalin Marinas [arm64] Signed-off-by: Andrew Morton --- arch/arc/mm/ioremap.c | 6 ++---- arch/arm64/include/asm/io.h | 6 +++--- arch/arm64/kernel/acpi.c | 2 +- arch/arm64/mm/ioremap.c | 3 +-- arch/csky/include/asm/io.h | 2 +- arch/loongarch/include/asm/io.h | 10 +++++----- arch/mips/include/asm/io.h | 8 ++++---- arch/mips/mm/ioremap.c | 4 ++-- arch/mips/mm/ioremap64.c | 4 ++-- arch/parisc/include/asm/io.h | 2 +- arch/parisc/mm/ioremap.c | 4 ++-- arch/powerpc/include/asm/io.h | 2 +- arch/powerpc/mm/ioremap.c | 4 ++-- arch/powerpc/platforms/ps3/spu.c | 4 ++-- arch/riscv/include/asm/io.h | 2 +- arch/riscv/kernel/acpi.c | 2 +- arch/s390/include/asm/io.h | 4 ++-- arch/s390/pci/pci.c | 4 ++-- arch/sh/boards/mach-landisk/setup.c | 2 +- arch/sh/boards/mach-lboxre2/setup.c | 2 +- arch/sh/boards/mach-sh03/setup.c | 2 +- arch/sh/include/asm/io.h | 2 +- arch/sh/mm/ioremap.c | 3 +-- arch/x86/include/asm/io.h | 2 +- arch/x86/mm/ioremap.c | 4 ++-- arch/xtensa/include/asm/io.h | 6 +++--- arch/xtensa/mm/ioremap.c | 4 ++-- include/asm-generic/io.h | 4 ++-- mm/ioremap.c | 4 ++-- mm/memory.c | 6 +++--- 30 files changed, 55 insertions(+), 59 deletions(-) diff --git a/arch/arc/mm/ioremap.c b/arch/arc/mm/ioremap.c index b07004d53267..fd8897a0e52c 100644 --- a/arch/arc/mm/ioremap.c +++ b/arch/arc/mm/ioremap.c @@ -32,7 +32,7 @@ void __iomem *ioremap(phys_addr_t paddr, unsigned long size) return (void __iomem *)(u32)paddr; return ioremap_prot(paddr, size, - pgprot_val(pgprot_noncached(PAGE_KERNEL))); + pgprot_noncached(PAGE_KERNEL)); } EXPORT_SYMBOL(ioremap); @@ -44,10 +44,8 @@ EXPORT_SYMBOL(ioremap); * might need finer access control (R/W/X) */ void __iomem *ioremap_prot(phys_addr_t paddr, size_t size, - unsigned long flags) + pgprot_t prot) { - pgprot_t prot = __pgprot(flags); - /* force uncached */ return generic_ioremap_prot(paddr, size, pgprot_noncached(prot)); } diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 76ebbdc6ffdd..9b96840fb979 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -270,9 +270,9 @@ int arm64_ioremap_prot_hook_register(const ioremap_prot_hook_t hook); #define _PAGE_IOREMAP PROT_DEVICE_nGnRE #define ioremap_wc(addr, size) \ - ioremap_prot((addr), (size), PROT_NORMAL_NC) + ioremap_prot((addr), (size), __pgprot(PROT_NORMAL_NC)) #define ioremap_np(addr, size) \ - ioremap_prot((addr), (size), PROT_DEVICE_nGnRnE) + ioremap_prot((addr), (size), __pgprot(PROT_DEVICE_nGnRnE)) /* * io{read,write}{16,32,64}be() macros @@ -293,7 +293,7 @@ static inline void __iomem *ioremap_cache(phys_addr_t addr, size_t size) if (pfn_is_map_memory(__phys_to_pfn(addr))) return (void __iomem *)__phys_to_virt(addr); - return ioremap_prot(addr, size, PROT_NORMAL); + return ioremap_prot(addr, size, __pgprot(PROT_NORMAL)); } /* diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index e6f66491fbe9..b9a66fc146c9 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -379,7 +379,7 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) prot =
__acpi_get_writethrough_mem_attribute(); } } - return ioremap_prot(phys, size, pgprot_val(prot)); + return ioremap_prot(phys, size, prot); } /* diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index 6cc0b7e7eb03..10e246f11271 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -15,10 +15,9 @@ int arm64_ioremap_prot_hook_register(ioremap_prot_hook_t hook) } void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot) + pgprot_t pgprot) { unsigned long last_addr = phys_addr + size - 1; - pgprot_t pgprot = __pgprot(prot); /* Don't allow outside PHYS_MASK */ if (last_addr & ~PHYS_MASK) diff --git a/arch/csky/include/asm/io.h b/arch/csky/include/asm/io.h index ed53f0b47388..536d3bf32ff1 100644 --- a/arch/csky/include/asm/io.h +++ b/arch/csky/include/asm/io.h @@ -36,7 +36,7 @@ */ #define ioremap_wc(addr, size) \ ioremap_prot((addr), (size), \ - (_PAGE_IOREMAP & ~_CACHE_MASK) | _CACHE_UNCACHED) + __pgprot((_PAGE_IOREMAP & ~_CACHE_MASK) | _CACHE_UNCACHED)) #include diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index e77a56eaf906..eaff72b38dc8 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -23,9 +23,9 @@ extern void __init early_iounmap(void __iomem *addr, unsigned long size); #ifdef CONFIG_ARCH_IOREMAP static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, - unsigned long prot_val) + pgprot_t prot) { - switch (prot_val & _CACHE_MASK) { + switch (pgprot_val(prot) & _CACHE_MASK) { case _CACHE_CC: return (void __iomem *)(unsigned long)(CACHE_BASE + offset); case _CACHE_SUC: @@ -38,7 +38,7 @@ static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, } #define ioremap(offset, size) \ - ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL_SUC)) + ioremap_prot((offset), (size), PAGE_KERNEL_SUC) #define iounmap(addr) ((void)(addr)) @@ -55,10 +55,10 @@ static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, */ #define ioremap_wc(offset, size) \ ioremap_prot((offset), (size), \ - pgprot_val(wc_enabled ? PAGE_KERNEL_WUC : PAGE_KERNEL_SUC)) + wc_enabled ? PAGE_KERNEL_WUC : PAGE_KERNEL_SUC) #define ioremap_cache(offset, size) \ - ioremap_prot((offset), (size), pgprot_val(PAGE_KERNEL)) + ioremap_prot((offset), (size), PAGE_KERNEL) #define mmiowb() wmb() diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h index 0bddb568af7c..4dacf40ebefd 100644 --- a/arch/mips/include/asm/io.h +++ b/arch/mips/include/asm/io.h @@ -126,7 +126,7 @@ static inline unsigned long isa_virt_to_bus(volatile void *address) } void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, - unsigned long prot_val); + pgprot_t prot); void iounmap(const volatile void __iomem *addr); /* @@ -141,7 +141,7 @@ void iounmap(const volatile void __iomem *addr); * address. */ #define ioremap(offset, size) \ - ioremap_prot((offset), (size), _CACHE_UNCACHED) + ioremap_prot((offset), (size), __pgprot(_CACHE_UNCACHED)) /* * ioremap_cache - map bus memory into CPU space @@ -159,7 +159,7 @@ void iounmap(const volatile void __iomem *addr); * memory-like regions on I/O busses. */ #define ioremap_cache(offset, size) \ - ioremap_prot((offset), (size), _page_cachable_default) + ioremap_prot((offset), (size), __pgprot(_page_cachable_default)) /* * ioremap_wc - map bus memory into CPU space @@ -180,7 +180,7 @@ void iounmap(const volatile void __iomem *addr); * _CACHE_UNCACHED option (see cpu_probe() method). 
*/ #define ioremap_wc(offset, size) \ - ioremap_prot((offset), (size), boot_cpu_data.writecombine) + ioremap_prot((offset), (size), __pgprot(boot_cpu_data.writecombine)) #if defined(CONFIG_CPU_CAVIUM_OCTEON) #define war_io_reorder_wmb() wmb() diff --git a/arch/mips/mm/ioremap.c b/arch/mips/mm/ioremap.c index d8243d61ef32..c6c4576cd4a8 100644 --- a/arch/mips/mm/ioremap.c +++ b/arch/mips/mm/ioremap.c @@ -44,9 +44,9 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages, * ioremap_prot gives the caller control over cache coherency attributes (CCA) */ void __iomem *ioremap_prot(phys_addr_t phys_addr, unsigned long size, - unsigned long prot_val) + pgprot_t prot) { - unsigned long flags = prot_val & _CACHE_MASK; + unsigned long flags = pgprot_val(prot) & _CACHE_MASK; unsigned long offset, pfn, last_pfn; struct vm_struct *area; phys_addr_t last_addr; diff --git a/arch/mips/mm/ioremap64.c b/arch/mips/mm/ioremap64.c index 15e7820d6a5f..acc03ba20098 100644 --- a/arch/mips/mm/ioremap64.c +++ b/arch/mips/mm/ioremap64.c @@ -3,9 +3,9 @@ #include void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, - unsigned long prot_val) + pgprot_t prot) { - unsigned long flags = prot_val & _CACHE_MASK; + unsigned long flags = pgprot_val(prot) & _CACHE_MASK; u64 base = (flags == _CACHE_UNCACHED ? IO_BASE : UNCAC_BASE); void __iomem *addr; diff --git a/arch/parisc/include/asm/io.h b/arch/parisc/include/asm/io.h index 3143cf29ce27..04b783e2a6d1 100644 --- a/arch/parisc/include/asm/io.h +++ b/arch/parisc/include/asm/io.h @@ -131,7 +131,7 @@ static inline void gsc_writeq(unsigned long long val, unsigned long addr) _PAGE_ACCESSED | _PAGE_NO_CACHE) #define ioremap_wc(addr, size) \ - ioremap_prot((addr), (size), _PAGE_IOREMAP) + ioremap_prot((addr), (size), __pgprot(_PAGE_IOREMAP)) #define pci_iounmap pci_iounmap diff --git a/arch/parisc/mm/ioremap.c b/arch/parisc/mm/ioremap.c index fd996472dfe7..0b65c4b3baee 100644 --- a/arch/parisc/mm/ioremap.c +++ b/arch/parisc/mm/ioremap.c @@ -14,7 +14,7 @@ #include void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot) + pgprot_t prot) { #ifdef CONFIG_EISA unsigned long end = phys_addr + size - 1; @@ -41,6 +41,6 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, } } - return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); + return generic_ioremap_prot(phys_addr, size, prot); } EXPORT_SYMBOL(ioremap_prot); diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index fd92ac450169..0436cdc7cfcc 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -895,7 +895,7 @@ void __iomem *ioremap_wt(phys_addr_t address, unsigned long size); void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); #define ioremap_cache(addr, size) \ - ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL)) + ioremap_prot((addr), (size), PAGE_KERNEL) #define iounmap iounmap diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 7b0afcabd89f..0d6615620ada 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -41,9 +41,9 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) return __ioremap_caller(addr, size, prot, caller); } -void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags) +void __iomem *ioremap_prot(phys_addr_t addr, size_t size, pgprot_t prot) { - pte_t pte = __pte(flags); + pte_t pte = __pte(pgprot_val(prot)); void *caller = __builtin_return_address(0); /* writeable implies 
dirty for kernel addresses */ diff --git a/arch/powerpc/platforms/ps3/spu.c b/arch/powerpc/platforms/ps3/spu.c index 4a2520ec6d7f..61b37c9400b2 100644 --- a/arch/powerpc/platforms/ps3/spu.c +++ b/arch/powerpc/platforms/ps3/spu.c @@ -190,10 +190,10 @@ static void spu_unmap(struct spu *spu) static int __init setup_areas(struct spu *spu) { struct table {char* name; unsigned long addr; unsigned long size;}; - unsigned long shadow_flags = pgprot_val(pgprot_noncached_wc(PAGE_KERNEL_RO)); spu_pdata(spu)->shadow = ioremap_prot(spu_pdata(spu)->shadow_addr, - sizeof(struct spe_shadow), shadow_flags); + sizeof(struct spe_shadow), + pgprot_noncached_wc(PAGE_KERNEL_RO)); if (!spu_pdata(spu)->shadow) { pr_debug("%s:%d: ioremap shadow failed\n", __func__, __LINE__); goto fail_ioremap; diff --git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index 1c5c641075d2..0536846db9b6 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -137,7 +137,7 @@ __io_writes_outs(outs, u64, q, __io_pbr(), __io_paw()) #ifdef CONFIG_MMU #define arch_memremap_wb(addr, size) \ - ((__force void *)ioremap_prot((addr), (size), _PAGE_KERNEL)) + ((__force void *)ioremap_prot((addr), (size), __pgprot(_PAGE_KERNEL))) #endif #endif /* _ASM_RISCV_IO_H */ diff --git a/arch/riscv/kernel/acpi.c b/arch/riscv/kernel/acpi.c index 2fd29695a788..3f6d5a6789e8 100644 --- a/arch/riscv/kernel/acpi.c +++ b/arch/riscv/kernel/acpi.c @@ -305,7 +305,7 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) } } - return ioremap_prot(phys, size, pgprot_val(prot)); + return ioremap_prot(phys, size, prot); } #ifdef CONFIG_PCI diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index fc9933a743d6..82f1043a4fc3 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -33,9 +33,9 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL) #define ioremap_wc(addr, size) \ - ioremap_prot((addr), (size), pgprot_val(pgprot_writecombine(PAGE_KERNEL))) + ioremap_prot((addr), (size), pgprot_writecombine(PAGE_KERNEL)) #define ioremap_wt(addr, size) \ - ioremap_prot((addr), (size), pgprot_val(pgprot_writethrough(PAGE_KERNEL))) + ioremap_prot((addr), (size), pgprot_writethrough(PAGE_KERNEL)) static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 88f72745fa59..9fdcd733d40e 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -255,7 +255,7 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, } void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot) + pgprot_t prot) { /* * When PCI MIO instructions are unavailable the "physical" address @@ -265,7 +265,7 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, if (!static_branch_unlikely(&have_mio)) return (void __iomem *)phys_addr; - return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); + return generic_ioremap_prot(phys_addr, size, prot); } EXPORT_SYMBOL(ioremap_prot); diff --git a/arch/sh/boards/mach-landisk/setup.c b/arch/sh/boards/mach-landisk/setup.c index 2c44b94f82fb..1b3f43c3ac46 100644 --- a/arch/sh/boards/mach-landisk/setup.c +++ b/arch/sh/boards/mach-landisk/setup.c @@ -58,7 +58,7 @@ static int __init landisk_devices_setup(void) /* open I/O area window */ paddrbase = virt_to_phys((void *)PA_AREA5_IO); prot = PAGE_KERNEL_PCC(1, _PAGE_PCC_IO16); - cf_ide_base = ioremap_prot(paddrbase, PAGE_SIZE, 
pgprot_val(prot)); + cf_ide_base = ioremap_prot(paddrbase, PAGE_SIZE, prot); if (!cf_ide_base) { printk("allocate_cf_area : can't open CF I/O window!\n"); return -ENOMEM; diff --git a/arch/sh/boards/mach-lboxre2/setup.c b/arch/sh/boards/mach-lboxre2/setup.c index 20d01b430f2a..e95bde207adb 100644 --- a/arch/sh/boards/mach-lboxre2/setup.c +++ b/arch/sh/boards/mach-lboxre2/setup.c @@ -53,7 +53,7 @@ static int __init lboxre2_devices_setup(void) paddrbase = virt_to_phys((void*)PA_AREA5_IO); psize = PAGE_SIZE; prot = PAGE_KERNEL_PCC(1, _PAGE_PCC_IO16); - cf0_io_base = (u32)ioremap_prot(paddrbase, psize, pgprot_val(prot)); + cf0_io_base = (u32)ioremap_prot(paddrbase, psize, prot); if (!cf0_io_base) { printk(KERN_ERR "%s : can't open CF I/O window!\n" , __func__ ); return -ENOMEM; diff --git a/arch/sh/boards/mach-sh03/setup.c b/arch/sh/boards/mach-sh03/setup.c index 3901b6031ad5..5c9312f334d3 100644 --- a/arch/sh/boards/mach-sh03/setup.c +++ b/arch/sh/boards/mach-sh03/setup.c @@ -75,7 +75,7 @@ static int __init sh03_devices_setup(void) /* open I/O area window */ paddrbase = virt_to_phys((void *)PA_AREA5_IO); prot = PAGE_KERNEL_PCC(1, _PAGE_PCC_IO16); - cf_ide_base = ioremap_prot(paddrbase, PAGE_SIZE, pgprot_val(prot)); + cf_ide_base = ioremap_prot(paddrbase, PAGE_SIZE, prot); if (!cf_ide_base) { printk("allocate_cf_area : can't open CF I/O window!\n"); return -ENOMEM; diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index cf5eab840d57..531ec49b878d 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -299,7 +299,7 @@ unsigned long long poke_real_address_q(unsigned long long addr, #define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL_NOCACHE) #define ioremap_cache(addr, size) \ - ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL)) + ioremap_prot((addr), (size), PAGE_KERNEL) #endif /* CONFIG_MMU */ #include diff --git a/arch/sh/mm/ioremap.c b/arch/sh/mm/ioremap.c index 33d20f34560f..5bbde53fb32d 100644 --- a/arch/sh/mm/ioremap.c +++ b/arch/sh/mm/ioremap.c @@ -73,10 +73,9 @@ __ioremap_29bit(phys_addr_t offset, unsigned long size, pgprot_t prot) #endif /* CONFIG_29BIT */ void __iomem __ref *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot) + pgprot_t pgprot) { void __iomem *mapped; - pgprot_t pgprot = __pgprot(prot); mapped = __ioremap_trapped(phys_addr, size); if (mapped) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index ed580c7f9d0a..0794936ec187 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -170,7 +170,7 @@ extern void __iomem *ioremap_uc(resource_size_t offset, unsigned long size); #define ioremap_uc ioremap_uc extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size); #define ioremap_cache ioremap_cache -extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, unsigned long prot_val); +extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size, pgprot_t prot); #define ioremap_prot ioremap_prot extern void __iomem *ioremap_encrypted(resource_size_t phys_addr, unsigned long size); #define ioremap_encrypted ioremap_encrypted diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 38ff7791a9c7..d501f0871aa5 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -440,10 +440,10 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size) EXPORT_SYMBOL(ioremap_cache); void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, - unsigned long prot_val) + pgprot_t prot) { return 
__ioremap_caller(phys_addr, size, - pgprot2cachemode(__pgprot(prot_val)), + pgprot2cachemode(prot), __builtin_return_address(0), false); } EXPORT_SYMBOL(ioremap_prot); diff --git a/arch/xtensa/include/asm/io.h b/arch/xtensa/include/asm/io.h index 934e58399c8c..7cdcc2deab3e 100644 --- a/arch/xtensa/include/asm/io.h +++ b/arch/xtensa/include/asm/io.h @@ -29,7 +29,7 @@ * I/O memory mapping functions. */ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot); + pgprot_t prot); #define ioremap_prot ioremap_prot #define iounmap iounmap @@ -40,7 +40,7 @@ static inline void __iomem *ioremap(unsigned long offset, unsigned long size) return (void*)(offset-XCHAL_KIO_PADDR+XCHAL_KIO_BYPASS_VADDR); else return ioremap_prot(offset, size, - pgprot_val(pgprot_noncached(PAGE_KERNEL))); + pgprot_noncached(PAGE_KERNEL)); } #define ioremap ioremap @@ -51,7 +51,7 @@ static inline void __iomem *ioremap_cache(unsigned long offset, && offset - XCHAL_KIO_PADDR < XCHAL_KIO_SIZE) return (void*)(offset-XCHAL_KIO_PADDR+XCHAL_KIO_CACHED_VADDR); else - return ioremap_prot(offset, size, pgprot_val(PAGE_KERNEL)); + return ioremap_prot(offset, size, PAGE_KERNEL); } #define ioremap_cache ioremap_cache diff --git a/arch/xtensa/mm/ioremap.c b/arch/xtensa/mm/ioremap.c index 8ca660b7ab49..26f238fa9d0d 100644 --- a/arch/xtensa/mm/ioremap.c +++ b/arch/xtensa/mm/ioremap.c @@ -11,12 +11,12 @@ #include void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot) + pgprot_t prot) { unsigned long pfn = __phys_to_pfn((phys_addr)); WARN_ON(pfn_valid(pfn)); - return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); + return generic_ioremap_prot(phys_addr, size, prot); } EXPORT_SYMBOL(ioremap_prot); diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index a5cbbf3e26ec..402020b23423 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1111,7 +1111,7 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot); void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot); + pgprot_t prot); void iounmap(volatile void __iomem *addr); void generic_iounmap(volatile void __iomem *addr); @@ -1120,7 +1120,7 @@ void generic_iounmap(volatile void __iomem *addr); static inline void __iomem *ioremap(phys_addr_t addr, size_t size) { /* _PAGE_IOREMAP needs to be supplied by the architecture */ - return ioremap_prot(addr, size, _PAGE_IOREMAP); + return ioremap_prot(addr, size, __pgprot(_PAGE_IOREMAP)); } #endif #endif /* !CONFIG_MMU || CONFIG_GENERIC_IOREMAP */ diff --git a/mm/ioremap.c b/mm/ioremap.c index 3e049dfb28bd..c36dd9f62fd5 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -50,9 +50,9 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, #ifndef ioremap_prot void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot) + pgprot_t prot) { - return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); + return generic_ioremap_prot(phys_addr, size, prot); } EXPORT_SYMBOL(ioremap_prot); #endif diff --git a/mm/memory.c b/mm/memory.c index 39bceed7448f..270c9357475d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6727,7 +6727,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { resource_size_t phys_addr; - unsigned long prot = 0; + pgprot_t prot = __pgprot(0); void __iomem *maddr; int offset = offset_in_page(addr); int ret = -EINVAL; @@ -6737,7 +6737,7 @@ int generic_access_phys(struct vm_area_struct *vma, 
From 381ff0341ac63d245035f274cacd3f1b8068d388 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Tue, 18 Feb 2025 14:37:04 -0800
Subject: [PATCH 12/16] Docs/mm/damon/design: fix typo on DAMOS filters usage doc link

Patch series "Docs/mm/damon: misc DAMOS filters documentation fixes and
improvements".

Fix and improve the DAMOS filters documentation by fixing a copy-paste
typo, adding hugepage_size filter documentation to the design doc, moving
logic details from usage to design, clarifying the DAMOS filters handling
sequence based on the handling layer, and re-organizing the filter types
list for easier understanding of the handling sequence.

This patch (of 5):

The link from the DAMOS filters design doc to the usage doc has a typo
that refers to the filters as watermarks.  Fix it.

Link: https://lkml.kernel.org/r/20250218223708.53437-1-sj@kernel.org
Link: https://lkml.kernel.org/r/20250218223708.53437-2-sj@kernel.org
Fixes: d31f5626a0e1 ("Docs/mm/damon/design: add links to sections of DAMON sysfs interface usage doc")
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/damon/design.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index e28c6a1b40ae..12ae7e1209c8 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -617,7 +617,7 @@ Below ``type`` of filters are currently supported.
   - Applied to pages that belonging to a given DAMON monitoring target.
   - Handled by the core logic.

-To know how user-space can set the watermarks via :ref:`DAMON sysfs interface
+To know how user-space can set the filters via :ref:`DAMON sysfs interface
 <sysfs_interface>`, refer to :ref:`filters <sysfs_filters>` part of the
 documentation.
-- 
2.51.0

From e52a942b47c8b32072e7f342aca600016bcfca33 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Tue, 18 Feb 2025 14:37:05 -0800
Subject: [PATCH 13/16] Docs/mm/damon/design: document hugepage_size filter

The 'hugepage_size' DAMOS filter type is not documented in the design
doc.  Add a description of the type.

Link: https://lkml.kernel.org/r/20250218223708.53437-3-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/damon/design.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index 12ae7e1209c8..a959c081bc59 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -610,6 +610,9 @@ Below ``type`` of filters are currently supported.
   - Applied to pages that are accessed after the last access check from the
     scheme.
   - Handled by operations set layer.  Supported by only ``paddr`` set.
+- pages that managed in a given size range
+  - Applied to pages that managed in a given size range.
+  - Handled by operations set layer.  Supported by only ``paddr`` set.
 - address range
   - Applied to pages that belonging to a given address range.
   - Handled by the core logic.
-- 
2.51.0
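For concreteness, here is a user-space sketch of configuring the
hugepage_size filter documented above, through the per-filter sysfs files
that the next patch describes in the usage doc.  The kdamond/context/
scheme/filter indexes, and the assumption that the sysfs hierarchy was
already set up, are hypothetical:

#include <stdio.h>
#include <stdlib.h>

static void write_file(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	fprintf(f, "%s", val);
	fclose(f);
}

int main(void)
{
	/* Assumed filter directory: index 0 of an already-created scheme. */
	const char *d =
		"/sys/kernel/mm/damon/admin/kdamonds/0/contexts/0/schemes/0/filters/0";
	char path[256];

	snprintf(path, sizeof(path), "%s/type", d);
	write_file(path, "hugepage_size");
	/* Closed interval [min, max]: match only 2MiB folios here. */
	snprintf(path, sizeof(path), "%s/min", d);
	write_file(path, "2097152");
	snprintf(path, sizeof(path), "%s/max", d);
	write_file(path, "2097152");
	snprintf(path, sizeof(path), "%s/matching", d);
	write_file(path, "Y");
	snprintf(path, sizeof(path), "%s/allow", d);
	write_file(path, "Y");
	return 0;
}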
From 0f28583b28d84a5eaaf299e01b7301922ae8da46 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Tue, 18 Feb 2025 14:37:06 -0800
Subject: [PATCH 14/16] Docs/damon: move DAMOS filter type names and meaning to design doc

The DAMON sysfs usage doc briefly describes the DAMOS filter type names
and their meanings.  The design doc also provides the short meanings as
well as the detailed descriptions.  This is unnecessary duplication, and
it confuses where new DAMOS filter types and features should be
documented.  Move the details from the usage doc to the design doc.

Link: https://lkml.kernel.org/r/20250218223708.53437-4-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/admin-guide/mm/damon/usage.rst | 28 +++++++++-----------
 Documentation/mm/damon/design.rst            | 12 ++++-----
 2 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index 51af66c208c5..dc37bba96273 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -408,21 +408,19 @@ in the numeric order.

 Each filter directory contains nine files, namely ``type``, ``matching``,
 ``allow``, ``memcg_path``, ``addr_start``, ``addr_end``, ``min``, ``max``
-and ``target_idx``.  To ``type`` file, you can write one of six special
-keywords: ``anon`` for anonymous pages, ``memcg`` for specific memory cgroup,
-``young`` for young pages, ``addr`` for specific address range (an open-ended
-interval), ``hugepage_size`` for large folios of a specific size range [``min``,
-``max``] or ``target`` for specific DAMON monitoring target filtering.  Meaning
-of the types are same to the description on the :ref:`design doc
-<damon_design_damos_filters>`.
-
-In case of the memory cgroup filtering, you can specify the memory cgroup of
-the interest by writing the path of the memory cgroup from the cgroups mount
-point to ``memcg_path`` file.  In case of the address range filtering, you can
-specify the start and end address of the range to ``addr_start`` and
-``addr_end`` files, respectively.  For the DAMON monitoring target filtering,
-you can specify the index of the target between the list of the DAMON context's
-monitoring targets list to ``target_idx`` file.
+and ``target_idx``.  To ``type`` file, you can write the type of the filter.
+Refer to :ref:`the design doc <damon_design_damos_filters>` for available type
+names and their meanings.
+
+For ``memcg`` type, you can specify the memory cgroup of the interest by
+writing the path of the memory cgroup from the cgroups mount point to
+``memcg_path`` file.  For ``addr`` type, you can specify the start and end
+address of the range (open-ended interval) to ``addr_start`` and ``addr_end``
+files, respectively.  For ``hugepage_size`` type, you can specify the minimum
+and maximum size of the range (closed interval) to ``min`` and ``max`` files,
+respectively.  For ``target`` type, you can specify the index of the target
+between the list of the DAMON context's monitoring targets list to
+``target_idx`` file.

 You can write ``Y`` or ``N`` to ``matching`` file to specify whether the
 filter is for memory that matches the ``type``.  You can write ``Y`` or ``N`` to
diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index a959c081bc59..7360e5ac0d06 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -600,23 +600,23 @@ counted as the scheme has tried.  This difference affects the statistics.

 Below ``type`` of filters are currently supported.

-- anonymous page
+- anon
   - Applied to pages that containing data that not stored in files.
   - Handled by operations set layer.  Supported by only ``paddr`` set.
-- memory cgroup
+- memcg
   - Applied to pages that belonging to a given cgroup.
   - Handled by operations set layer.  Supported by only ``paddr`` set.
-- young page
+- young
   - Applied to pages that are accessed after the last access check from the
     scheme.
   - Handled by operations set layer.  Supported by only ``paddr`` set.
-- pages that managed in a given size range
+- hugepage_size
   - Applied to pages that managed in a given size range.
   - Handled by operations set layer.  Supported by only ``paddr`` set.
-- address range
+- addr
   - Applied to pages that belonging to a given address range.
   - Handled by the core logic.
-- DAMON monitoring target
+- target
   - Applied to pages that belonging to a given DAMON monitoring target.
   - Handled by the core logic.
-- 
2.51.0

From 4a4d8e792506432270e27516cf03a8208cbbec8b Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Tue, 18 Feb 2025 14:37:07 -0800
Subject: [PATCH 15/16] Docs/mm/damon/design: clarify handling layer based filters evaluation sequence

If an element of memory matches a DAMOS filter, the filters that are
installed after it get no chance to have any effect on the element.
Hence the order in which DAMOS filters are handled is important when
allow filters and reject filters are used together.  The ordering is
affected by both the installation order and the layer in which each
filter is handled.  The design document does not clearly document the
latter part.  Clarify it in the design doc.

Link: https://lkml.kernel.org/r/20250218223708.53437-5-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/damon/design.rst | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index 7360e5ac0d06..8b9727d91434 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -569,11 +569,21 @@ number of filters for each scheme.  Each filter specifies
 - whether it is to allow (include) or reject (exclude) applying the scheme's
   action to the memory (``allow``).

-When multiple filters are installed, each filter is evaluated in the installed
-order.  If a part of memory is matched to one of the filter, next filters are
-ignored.  If the memory passes through the filters evaluation stage because it
-is not matched to any of the filters, applying the scheme's action to it is
-allowed, same to the behavior when no filter exists.
+For efficient handling of filters, some types of filters are handled by the
+core layer, while others are handled by operations set.  In the latter case,
+hence, support of the filter types depends on the DAMON operations set.  In
+case of the core layer-handled filters, the memory regions that excluded by the
+filter are not counted as the scheme has tried to the region.  In contrast, if
+a memory regions is filtered by an operations set layer-handled filter, it is
+counted as the scheme has tried.  This difference affects the statistics.
+
+When multiple filters are installed, the group of filters that handled by the
+core layer are evaluated first.  After that, the group of filters that handled
+by the operations layer are evaluated.  Filters in each of the groups are
+evaluated in the installed order.  If a part of memory is matched to one of the
+filter, next filters are ignored.  If the memory passes through the filters
+evaluation stage because it is not matched to any of the filters, applying the
+scheme's action to it is allowed, same to the behavior when no filter exists.

 For example, let's assume 1) a filter for allowing anonymous pages and 2)
 another filter for rejecting young pages are installed in the order.  If a page
@@ -590,14 +590,6 @@ filter-allowed or filters evaluation stage passed.  It means that installing
 allow-filters at the end of the list makes no practical change but only
 filters-checking overhead.

-For efficient handling of filters, some types of filters are handled by the
-core layer, while others are handled by operations set.  In the latter case,
-hence, support of the filter types depends on the DAMON operations set.  In
-case of the core layer-handled filters, the memory regions that excluded by the
-filter are not counted as the scheme has tried to the region.  In contrast, if
-a memory regions is filtered by an operations set layer-handled filter, it is
-counted as the scheme has tried.  This difference affects the statistics.
-
 Below ``type`` of filters are currently supported.
-- 
2.51.0

From edab6ffd792a7774e7d5fd7eb1d6251e452010f5 Mon Sep 17 00:00:00 2001
From: SeongJae Park <sj@kernel.org>
Date: Tue, 18 Feb 2025 14:37:08 -0800
Subject: [PATCH 16/16] Docs/mm/damon/design: categorize DAMOS filter types based on handling layer

The DAMON layer in which a DAMOS filter is handled determines the order
in which the filters will be evaluated.  Re-organize the DAMOS filter
types list in the design doc to categorize the types by handling layer,
to let users more easily understand the evaluation order.

Link: https://lkml.kernel.org/r/20250218223708.53437-6-sj@kernel.org
Signed-off-by: SeongJae Park <sj@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 Documentation/mm/damon/design.rst | 34 ++++++++++++++-----------------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index 8b9727d91434..6a66aa0833fd 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -602,25 +602,21 @@ filters-checking overhead.

 Below ``type`` of filters are currently supported.

-- anon
-  - Applied to pages that containing data that not stored in files.
-  - Handled by operations set layer.  Supported by only ``paddr`` set.
-- memcg
-  - Applied to pages that belonging to a given cgroup.
-  - Handled by operations set layer.  Supported by only ``paddr`` set.
-- young
-  - Applied to pages that are accessed after the last access check from the
-    scheme.
-  - Handled by operations set layer.  Supported by only ``paddr`` set.
-- hugepage_size
-  - Applied to pages that managed in a given size range.
-  - Handled by operations set layer.  Supported by only ``paddr`` set.
-- addr
-  - Applied to pages that belonging to a given address range.
-  - Handled by the core logic.
-- target
-  - Applied to pages that belonging to a given DAMON monitoring target.
-  - Handled by the core logic.
+- Core layer handled
+  - addr
+    - Applied to pages that belonging to a given address range.
+  - target
+    - Applied to pages that belonging to a given DAMON monitoring target.
+- Operations layer handled, supported by only ``paddr`` operations set.
+  - anon
+    - Applied to pages that containing data that not stored in files.
+  - memcg
+    - Applied to pages that belonging to a given cgroup.
+  - young
+    - Applied to pages that are accessed after the last access check from the
+      scheme.
+  - hugepage_size
+    - Applied to pages that managed in a given size range.

 To know how user-space can set the filters via :ref:`DAMON sysfs interface
 <sysfs_interface>`, refer to :ref:`filters <sysfs_filters>` part of the
-- 
2.51.0
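As a closing aid to the two documentation patches above, here is a small
user-space sketch of the evaluation order they describe: core
layer-handled filters are evaluated before operations layer-handled ones,
filters within each group run in installation order, the first matching
filter decides, and memory that matches no filter is allowed.  All types
and helpers here are invented for illustration; this is not kernel code:

#include <stdbool.h>
#include <stddef.h>

enum layer { CORE, OPS };

struct filter {
	enum layer layer;
	bool allow;                     /* allow or reject on match */
	bool (*match)(const void *mem); /* does this filter match @mem? */
};

static bool group_verdict(const struct filter *f, size_t n, enum layer l,
			  const void *mem, bool *decided)
{
	for (size_t i = 0; i < n; i++) {
		if (f[i].layer != l || !f[i].match(mem))
			continue;
		*decided = true;	/* later filters are ignored */
		return f[i].allow;
	}
	return false;			/* no filter in this group matched */
}

/* Returns whether the scheme's action may be applied to @mem. */
static bool damos_filters_allow(const struct filter *f, size_t n,
				const void *mem)
{
	bool decided = false;
	bool allow;

	allow = group_verdict(f, n, CORE, mem, &decided);
	if (decided)
		return allow;
	allow = group_verdict(f, n, OPS, mem, &decided);
	if (decided)
		return allow;
	return true;	/* matched no filter: action is allowed */
}

static bool is_anon(const void *mem) { (void)mem; return true; }
static bool is_young(const void *mem) { (void)mem; return false; }

int main(void)
{
	/* The design doc's example: 1) allow anon, 2) reject young. */
	const struct filter filters[] = {
		{ OPS, true,  is_anon  },
		{ OPS, false, is_young },
	};

	/* An anon page matches filter 1 first, so it is allowed. */
	return damos_filters_allow(filters, 2, NULL) ? 0 : 1;
}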