From 7440adb405dfc4abe21bc95abf3481e6a6649c05 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:44 -0800 Subject: [PATCH 01/16] mm: allow vma_start_read_locked/vma_start_read_locked_nested to fail With upcoming replacement of vm_lock with vm_refcnt, we need to handle a possibility of vma_start_read_locked/vma_start_read_locked_nested failing due to refcount overflow. Prepare for such possibility by changing these APIs and adjusting their users. Link: https://lkml.kernel.org/r/20250213224655.1680278-8-surenb@google.com Signed-off-by: Suren Baghdasaryan Cc: Lokesh Gidra Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 ++++-- mm/userfaultfd.c | 30 +++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 003c3e5c0a96..09b48af68699 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -747,10 +747,11 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. */ -static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) +static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { mmap_assert_locked(vma->vm_mm); down_read_nested(&vma->vm_lock.lock, subclass); + return true; } /* @@ -759,10 +760,11 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int * not be used in such cases because it might fail due to mm_lock_seq overflow. * This functionality is used to obtain vma read lock and drop the mmap read lock. 
*/ -static inline void vma_start_read_locked(struct vm_area_struct *vma) +static inline bool vma_start_read_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); down_read(&vma->vm_lock.lock); + return true; } static inline void vma_end_read(struct vm_area_struct *vma) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 48ac81bbfee6..fbf2cf62ab9f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -85,8 +85,12 @@ static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm, mmap_read_lock(mm); vma = find_vma_and_prepare_anon(mm, address); - if (!IS_ERR(vma)) - vma_start_read_locked(vma); + if (!IS_ERR(vma)) { + bool locked = vma_start_read_locked(vma); + + if (!locked) + vma = ERR_PTR(-EAGAIN); + } mmap_read_unlock(mm); return vma; @@ -1555,12 +1559,24 @@ static int uffd_move_lock(struct mm_struct *mm, mmap_read_lock(mm); err = find_vmas_mm_locked(mm, dst_start, src_start, dst_vmap, src_vmap); - if (!err) { - vma_start_read_locked(*dst_vmap); - if (*dst_vmap != *src_vmap) - vma_start_read_locked_nested(*src_vmap, - SINGLE_DEPTH_NESTING); + if (err) + goto out; + + if (!vma_start_read_locked(*dst_vmap)) { + err = -EAGAIN; + goto out; } + + /* Nothing further to do if both vmas are locked. */ + if (*dst_vmap == *src_vmap) + goto out; + + if (!vma_start_read_locked_nested(*src_vmap, SINGLE_DEPTH_NESTING)) { + /* Undo dst_vmap locking if src_vmap failed to lock */ + vma_end_read(*dst_vmap); + err = -EAGAIN; + } +out: mmap_read_unlock(mm); return err; } -- 2.50.1 From ce0853966085dd8eab7153ce0b815c4a07d86698 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:45 -0800 Subject: [PATCH 02/16] mm: move mmap_init_lock() out of the header file mmap_init_lock() is used only from mm_init() in fork.c, therefore it does not have to reside in the header file. This move lets us avoid including additional headers in mmap_lock.h later, when mmap_init_lock() needs to initialize rcuwait object. Link: https://lkml.kernel.org/r/20250213224655.1680278-9-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . 
McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mmap_lock.h | 6 ------ kernel/fork.c | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 45a21faa3ff6..4706c6769902 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -122,12 +122,6 @@ static inline bool mmap_lock_speculate_retry(struct mm_struct *mm, unsigned int #endif /* CONFIG_PER_VMA_LOCK */ -static inline void mmap_init_lock(struct mm_struct *mm) -{ - init_rwsem(&mm->mmap_lock); - mm_lock_seqcount_init(mm); -} - static inline void mmap_write_lock(struct mm_struct *mm) { __mmap_lock_trace_start_locking(mm, true); diff --git a/kernel/fork.c b/kernel/fork.c index 5bf3e407c795..f1af413e5aa4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1230,6 +1230,12 @@ static void mm_init_uprobes_state(struct mm_struct *mm) #endif } +static void mmap_init_lock(struct mm_struct *mm) +{ + init_rwsem(&mm->mmap_lock); + mm_lock_seqcount_init(mm); +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { -- 2.50.1 From 45ad9f5290dc4bb2249e951d4b3756d3ebda2d66 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:46 -0800 Subject: [PATCH 03/16] mm: uninline the main body of vma_start_write() vma_start_write() is used in many places and will grow in size very soon. It is not used in performance critical paths and uninlining it should limit the future code size growth. No functional changes. Link: https://lkml.kernel.org/r/20250213224655.1680278-10-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 +++--------- mm/memory.c | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 09b48af68699..c24c521e38a2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -787,6 +787,8 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, unsigned int *mm_l return (vma->vm_lock_seq == *mm_lock_seq); } +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq); + /* * Begin writing to a VMA. * Exclude concurrent readers under the per-VMA lock until the currently @@ -799,15 +801,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - down_write(&vma->vm_lock.lock); - /* - * We should use WRITE_ONCE() here because we can have concurrent reads - * from the early lockless pessimistic check in vma_start_read(). - * We don't really care about the correctness of that early check, but - * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. 
- */ - WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock.lock); + __vma_start_write(vma, mm_lock_seq); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) diff --git a/mm/memory.c b/mm/memory.c index 6ef014220e09..f2f7dc215b6b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6353,6 +6353,20 @@ fail: #endif #ifdef CONFIG_PER_VMA_LOCK +void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) +{ + down_write(&vma->vm_lock.lock); + /* + * We should use WRITE_ONCE() here because we can have concurrent reads + * from the early lockless pessimistic check in vma_start_read(). + * We don't really care about the correctness of that early check, but + * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. + */ + WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); + up_write(&vma->vm_lock.lock); +} +EXPORT_SYMBOL_GPL(__vma_start_write); + /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. If the VMA is not found or is being modified the -- 2.50.1 From 7f8ceea0c58039dcea3d31b8d5da58aa5f6e12bf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:47 -0800 Subject: [PATCH 04/16] refcount: provide ops for cases when object's memory can be reused For speculative lookups where a successful inc_not_zero() pins the object, but where we still need to double check if the object acquired is indeed the one we set out to acquire (identity check), needs this validation to happen *after* the increment. Similarly, when a new object is initialized and its memory might have been previously occupied by another object, all stores to initialize the object should happen *before* refcount initialization. Notably SLAB_TYPESAFE_BY_RCU is one such an example when this ordering is required for reference counting. Add refcount_{add|inc}_not_zero_acquire() to guarantee the proper ordering between acquiring a reference count on an object and performing the identity check for that object. Add refcount_set_release() to guarantee proper ordering between stores initializing object attributes and the store initializing the refcount. refcount_set_release() should be done after all other object attributes are initialized. Once refcount_set_release() is called, the object should be considered visible to other tasks even if it was not yet added into an object collection normally used to discover it. This is because other tasks might have discovered the object previously occupying the same memory and after memory reuse they can succeed in taking refcount for the new object and start using it. 
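As an illustration only (not part of this patch), a SLAB_TYPESAFE_BY_RCU
user of the new helpers could look roughly like this in C; struct foo,
foo_cache and foo_find_rcu() are hypothetical names:

	/* Sketch only: foo_cache is created with SLAB_TYPESAFE_BY_RCU. */
	static struct kmem_cache *foo_cache;

	struct foo {
		refcount_t ref;
		unsigned long key;
	};

	static struct foo *foo_lookup(unsigned long key)
	{
		struct foo *foo;

		rcu_read_lock();
		foo = foo_find_rcu(key);	/* hypothetical RCU lookup */
		/* Acquire orders the identity check after the increment. */
		if (foo && !refcount_inc_not_zero_acquire(&foo->ref))
			foo = NULL;
		rcu_read_unlock();

		if (foo && READ_ONCE(foo->key) != key) {
			/* Memory was reused for a different object; back off. */
			if (refcount_dec_and_test(&foo->ref))
				kmem_cache_free(foo_cache, foo);
			foo = NULL;
		}
		return foo;
	}

	static struct foo *foo_alloc(unsigned long key)
	{
		struct foo *foo = kmem_cache_alloc(foo_cache, GFP_KERNEL);

		if (!foo)
			return NULL;
		foo->key = key;				/* initialize first ...   */
		refcount_set_release(&foo->ref, 1);	/* ... then publish count */
		return foo;
	}

The generic reuse scenario this guards against is spelled out below.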
Object reuse example to consider: consumer: obj = lookup(collection, key); if (!refcount_inc_not_zero_acquire(&obj->ref)) return; if (READ_ONCE(obj->key) != key) { /* identity check */ put_ref(obj); return; } use(obj->value); producer: remove(collection, obj->key); if (!refcount_dec_and_test(&obj->ref)) return; obj->key = KEY_INVALID; free(obj); obj = malloc(); /* obj is reused */ obj->key = new_key; obj->value = new_value; refcount_set_release(obj->ref, 1); add(collection, new_key, obj); refcount_{add|inc}_not_zero_acquire() is required to prevent the following reordering when refcount_inc_not_zero() is used instead: consumer: obj = lookup(collection, key); if (READ_ONCE(obj->key) != key) { /* reordered identity check */ put_ref(obj); return; } producer: remove(collection, obj->key); if (!refcount_dec_and_test(&obj->ref)) return; obj->key = KEY_INVALID; free(obj); obj = malloc(); /* obj is reused */ obj->key = new_key; obj->value = new_value; refcount_set_release(obj->ref, 1); add(collection, new_key, obj); if (!refcount_inc_not_zero(&obj->ref)) return; use(obj->value); /* USING WRONG OBJECT */ refcount_set_release() is required to prevent the following reordering when refcount_set() is used instead: consumer: obj = lookup(collection, key); producer: remove(collection, obj->key); if (!refcount_dec_and_test(&obj->ref)) return; obj->key = KEY_INVALID; free(obj); obj = malloc(); /* obj is reused */ obj->key = new_key; /* new_key == old_key */ refcount_set(obj->ref, 1); if (!refcount_inc_not_zero_acquire(&obj->ref)) return; if (READ_ONCE(obj->key) != key) { /* pass since new_key == old_key */ put_ref(obj); return; } use(obj->value); /* USING STALE obj->value */ obj->value = new_value; /* reordered store */ add(collection, key, obj); [surenb@google.com: fix title underlines in refcount-vs-atomic.rst] Link: https://lkml.kernel.org/r/20250217161645.3137927-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-11-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Vlastimil Babka [slab] Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Peter Zijlstra Cc: Will Deacon Cc: Paul E. McKenney Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- Documentation/RCU/whatisRCU.rst | 10 ++ Documentation/core-api/refcount-vs-atomic.rst | 37 +++++- include/linux/refcount.h | 106 ++++++++++++++++++ include/linux/slab.h | 9 ++ 4 files changed, 156 insertions(+), 6 deletions(-) diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 1ef5784c1b84..53faeed7c190 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -971,6 +971,16 @@ unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be initialized after each and every call to kmem_cache_alloc(), which renders reference-free spinlock acquisition completely unsafe. Therefore, when using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter. 
+If using refcount_t, the specialized refcount_{add|inc}_not_zero_acquire() +and refcount_set_release() APIs should be used to ensure correct operation +ordering when verifying object identity and when initializing newly +allocated objects. Acquire fence in refcount_{add|inc}_not_zero_acquire() +ensures that identity checks happen *after* reference count is taken. +refcount_set_release() should be called after a newly allocated object is +fully initialized and release fence ensures that new values are visible +*before* refcount can be successfully taken by other users. Once +refcount_set_release() is called, the object should be considered visible +by other tasks. (Those willing to initialize their locks in a kmem_cache constructor may also use locking, including cache-friendly sequence locking.) diff --git a/Documentation/core-api/refcount-vs-atomic.rst b/Documentation/core-api/refcount-vs-atomic.rst index 79a009ce11df..94e628c1eb49 100644 --- a/Documentation/core-api/refcount-vs-atomic.rst +++ b/Documentation/core-api/refcount-vs-atomic.rst @@ -86,7 +86,19 @@ Memory ordering guarantee changes: * none (both fully unordered) -case 2) - increment-based ops that return no value +case 2) - non-"Read/Modify/Write" (RMW) ops with release ordering +----------------------------------------------------------------- + +Function changes: + + * atomic_set_release() --> refcount_set_release() + +Memory ordering guarantee changes: + + * none (both provide RELEASE ordering) + + +case 3) - increment-based ops that return no value -------------------------------------------------- Function changes: @@ -98,7 +110,7 @@ Memory ordering guarantee changes: * none (both fully unordered) -case 3) - decrement-based RMW ops that return no value +case 4) - decrement-based RMW ops that return no value ------------------------------------------------------ Function changes: @@ -110,7 +122,7 @@ Memory ordering guarantee changes: * fully unordered --> RELEASE ordering -case 4) - increment-based RMW ops that return a value +case 5) - increment-based RMW ops that return a value ----------------------------------------------------- Function changes: @@ -126,7 +138,20 @@ Memory ordering guarantees changes: result of obtaining pointer to the object! -case 5) - generic dec/sub decrement-based RMW ops that return a value +case 6) - increment-based RMW ops with acquire ordering that return a value +--------------------------------------------------------------------------- + +Function changes: + + * atomic_inc_not_zero() --> refcount_inc_not_zero_acquire() + * no atomic counterpart --> refcount_add_not_zero_acquire() + +Memory ordering guarantees changes: + + * fully ordered --> ACQUIRE ordering on success + + +case 7) - generic dec/sub decrement-based RMW ops that return a value --------------------------------------------------------------------- Function changes: @@ -139,7 +164,7 @@ Memory ordering guarantees changes: * fully ordered --> RELEASE ordering + ACQUIRE ordering on success -case 6) other decrement-based RMW ops that return a value +case 8) other decrement-based RMW ops that return a value --------------------------------------------------------- Function changes: @@ -154,7 +179,7 @@ Memory ordering guarantees changes: .. note:: atomic_add_unless() only provides full order on success. 
-case 7) - lock-based RMW +case 9) - lock-based RMW ------------------------ Function changes: diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 35f039ecb272..4589d2e7bfea 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -87,6 +87,15 @@ * The decrements dec_and_test() and sub_and_test() also provide acquire * ordering on success. * + * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() provide + * acquire and release ordering for cases when the memory occupied by the + * object might be reused to store another object. This is important for the + * cases where secondary validation is required to detect such reuse, e.g. + * SLAB_TYPESAFE_BY_RCU. The secondary validation checks have to happen after + * the refcount is taken, hence acquire order is necessary. Similarly, when the + * object is initialized, all stores to its attributes should be visible before + * the refcount is set, otherwise a stale attribute value might be used by + * another task which succeeds in taking a refcount to the new object. */ #ifndef _LINUX_REFCOUNT_H @@ -125,6 +134,31 @@ static inline void refcount_set(refcount_t *r, int n) atomic_set(&r->refs, n); } +/** + * refcount_set_release - set a refcount's value with release ordering + * @r: the refcount + * @n: value to which the refcount will be set + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides release memory ordering which will order previous memory operations + * against this store. This ensures all updates to this object are visible + * once the refcount is set and stale values from the object previously + * occupying this memory are overwritten with new ones. + * + * This function should be called only after new object is fully initialized. + * After this call the object should be considered visible to other tasks even + * if it was not yet added into an object collection normally used to discover + * it. This is because other tasks might have discovered the object previously + * occupying the same memory and after memory reuse they can succeed in taking + * refcount to the new object and start using it. + */ +static inline void refcount_set_release(refcount_t *r, int n) +{ + atomic_set_release(&r->refs, n); +} + /** * refcount_read - get a refcount's value * @r: the refcount @@ -178,6 +212,52 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) return __refcount_add_not_zero(i, r, NULL); } +static inline __must_check __signed_wrap +bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +{ + int old = refcount_read(r); + + do { + if (!old) + break; + } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); + + if (oldp) + *oldp = old; + + if (unlikely(old < 0 || old + i < 0)) + refcount_warn_saturate(r, REFCOUNT_ADD_NOT_ZERO_OVF); + + return old; +} + +/** + * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0 + * + * @i: the value to add to the refcount + * @r: the refcount + * + * Will saturate at REFCOUNT_SATURATED and WARN. + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides acquire memory ordering on success, it is assumed the caller has + * guaranteed the object memory to be stable (RCU, etc.). It does provide a + * control dependency and thereby orders future stores. 
See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc_not_zero_acquire() should instead be used to increment a + * reference count. + * + * Return: false if the passed refcount is 0, true otherwise + */ +static inline __must_check bool refcount_add_not_zero_acquire(int i, refcount_t *r) +{ + return __refcount_add_not_zero_acquire(i, r, NULL); +} + static inline __signed_wrap void __refcount_add(int i, refcount_t *r, int *oldp) { @@ -236,6 +316,32 @@ static inline __must_check bool refcount_inc_not_zero(refcount_t *r) return __refcount_inc_not_zero(r, NULL); } +static inline __must_check bool __refcount_inc_not_zero_acquire(refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero_acquire(1, r, oldp); +} + +/** + * refcount_inc_not_zero_acquire - increment a refcount with acquire ordering unless it is 0 + * @r: the refcount to increment + * + * Similar to refcount_inc_not_zero(), but provides acquire memory ordering on + * success. + * + * This function should be used when memory occupied by the object might be + * reused to store another object -- consider SLAB_TYPESAFE_BY_RCU. + * + * Provides acquire memory ordering on success, it is assumed the caller has + * guaranteed the object memory to be stable (RCU, etc.). It does provide a + * control dependency and thereby orders future stores. See the comment on top. + * + * Return: true if the increment was successful, false otherwise + */ +static inline __must_check bool refcount_inc_not_zero_acquire(refcount_t *r) +{ + return __refcount_inc_not_zero_acquire(r, NULL); +} + static inline void __refcount_inc(refcount_t *r, int *oldp) { __refcount_add(1, r, oldp); diff --git a/include/linux/slab.h b/include/linux/slab.h index 09eedaecf120..ad902a2d692b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -136,6 +136,15 @@ enum _slab_flag_bits { * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. * + * Note that object identity check has to be done *after* acquiring a + * reference, therefore user has to ensure proper ordering for loads. + * Similarly, when initializing objects allocated with SLAB_TYPESAFE_BY_RCU, + * the newly allocated object has to be fully initialized *before* its + * refcount gets initialized and proper ordering for stores is required. + * refcount_{add|inc}_not_zero_acquire() and refcount_set_release() are + * designed with the proper fences required for reference counting objects + * allocated with SLAB_TYPESAFE_BY_RCU. + * * Note that it is not possible to acquire a lock within a structure * allocated with SLAB_TYPESAFE_BY_RCU without first acquiring a reference * as described above. The reason is that SLAB_TYPESAFE_BY_RCU pages -- 2.50.1 From 4e0dbe105d5088c77eb09de6f049aaf44711a2ec Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:48 -0800 Subject: [PATCH 05/16] refcount: introduce __refcount_{add|inc}_not_zero_limited_acquire Introduce functions to increase refcount but with a top limit above which they will fail to increase (the limit is inclusive). Setting the limit to INT_MAX indicates no limit. 
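As an illustration only (not part of this patch), a caller that wants to
cap the number of concurrent references at a hypothetical OBJ_REF_LIMIT
could use the new helper roughly as follows:

	#define OBJ_REF_LIMIT	127	/* illustrative, caller-chosen */

	static bool obj_tryget(struct obj *obj)
	{
		int oldcnt;

		/*
		 * Fails if the count is 0 (object is dying) or if the
		 * increment would raise it above OBJ_REF_LIMIT.
		 */
		return __refcount_inc_not_zero_limited_acquire(&obj->ref,
							       &oldcnt,
							       OBJ_REF_LIMIT);
	}

__refcount_add_not_zero_acquire() is now implemented on top of the limited
variant with a limit of INT_MAX, so its behavior is unchanged.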
Link: https://lkml.kernel.org/r/20250213224655.1680278-12-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Vlastimil Babka Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/refcount.h | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 4589d2e7bfea..80dc023ac2bf 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -213,13 +213,20 @@ static inline __must_check bool refcount_add_not_zero(int i, refcount_t *r) } static inline __must_check __signed_wrap -bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +bool __refcount_add_not_zero_limited_acquire(int i, refcount_t *r, int *oldp, + int limit) { int old = refcount_read(r); do { if (!old) break; + + if (i > limit - old) { + if (oldp) + *oldp = old; + return false; + } } while (!atomic_try_cmpxchg_acquire(&r->refs, &old, old + i)); if (oldp) @@ -231,6 +238,18 @@ bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) return old; } +static inline __must_check bool +__refcount_inc_not_zero_limited_acquire(refcount_t *r, int *oldp, int limit) +{ + return __refcount_add_not_zero_limited_acquire(1, r, oldp, limit); +} + +static inline __must_check __signed_wrap +bool __refcount_add_not_zero_acquire(int i, refcount_t *r, int *oldp) +{ + return __refcount_add_not_zero_limited_acquire(i, r, oldp, INT_MAX); +} + /** * refcount_add_not_zero_acquire - add a value to a refcount with acquire ordering unless it is 0 * -- 2.50.1 From f35ab95ca0af7a27feab57b9d7e906405bddb093 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:49 -0800 Subject: [PATCH 06/16] mm: replace vm_lock and detached flag with a reference count rw_semaphore is a sizable structure of 40 bytes and consumes considerable space for each vm_area_struct. However vma_lock has two important specifics which can be used to replace rw_semaphore with a simpler structure: 1. Readers never wait. They try to take the vma_lock and fall back to mmap_lock if that fails. 2. Only one writer at a time will ever try to write-lock a vma_lock because writers first take mmap_lock in write mode. Because of these requirements, full rw_semaphore functionality is not needed and we can replace rw_semaphore and the vma->detached flag with a refcount (vm_refcnt). When vma is in detached state, vm_refcnt is 0 and only a call to vma_mark_attached() can take it out of this state. Note that unlike before, now we enforce both vma_mark_attached() and vma_mark_detached() to be done only after vma has been write-locked. vma_mark_attached() changes vm_refcnt to 1 to indicate that it has been attached to the vma tree. When a reader takes read lock, it increments vm_refcnt, unless the top usable bit of vm_refcnt (0x40000000) is set, indicating presence of a writer. When writer takes write lock, it sets the top usable bit to indicate its presence. 
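The resulting vm_refcnt encodings (a summary derived from the
VMA_LOCK_OFFSET/VMA_REF_LIMIT definitions in this patch) are:

	vm_refcnt                 state
	---------                 -----
	0                         detached
	1                         attached, no readers
	1 + N (<= VMA_REF_LIMIT)  attached, N readers
	VMA_LOCK_OFFSET           writer detaching, readers drained
	VMA_LOCK_OFFSET + 1       writer holds an attached vma, no readers
	VMA_LOCK_OFFSET + 1 + N   writer waiting for N readers to drain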
If there are readers, writer will wait using newly introduced mm->vma_writer_wait. Since all writers take mmap_lock in write mode first, there can be only one writer at a time. The last reader to release the lock will signal the writer to wake up. refcount might overflow if there are many competing readers, in which case read-locking will fail. Readers are expected to handle such failures. In summary: 1. all readers increment the vm_refcnt; 2. writer sets top usable (writer) bit of vm_refcnt; 3. readers cannot increment the vm_refcnt if the writer bit is set; 4. in the presence of readers, writer must wait for the vm_refcnt to drop to 1 (plus the VMA_LOCK_OFFSET writer bit), indicating an attached vma with no readers; 5. vm_refcnt overflow is handled by the readers. While this vm_lock replacement does not yet result in a smaller vm_area_struct (it stays at 256 bytes due to cacheline alignment), it allows for further size optimization by structure member regrouping to bring the size of vm_area_struct below 192 bytes. [surenb@google.com: fix a crash due to vma_end_read() that should have been removed] Link: https://lkml.kernel.org/r/20250220200208.323769-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-13-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Peter Zijlstra Suggested-by: Matthew Wilcox Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 128 ++++++++++++++++++++----------- include/linux/mm_types.h | 22 +++--- kernel/fork.c | 13 ++-- mm/init-mm.c | 1 + mm/memory.c | 90 +++++++++++++++++++--- tools/testing/vma/linux/atomic.h | 5 ++ tools/testing/vma/vma_internal.h | 63 ++++++++------- 7 files changed, 217 insertions(+), 105 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c24c521e38a2..06f179c844c3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -32,6 +32,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -697,19 +698,54 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK -static inline void vma_lock_init(struct vm_area_struct *vma) +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) { - init_rwsem(&vma->vm_lock.lock); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + static struct lock_class_key lockdep_key; + + lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); +#endif + if (reset_refcnt) + refcount_set(&vma->vm_refcnt, 0); vma->vm_lock_seq = UINT_MAX; } +static inline bool is_vma_writer_only(int refcnt) +{ + /* + * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma + * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on + * a detached vma happens only in vma_mark_detached() and is a rare + * case, therefore most of the time there will be no unnecessary wakeup. 
+ */ + return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; +} + +static inline void vma_refcount_put(struct vm_area_struct *vma) +{ + /* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ + struct mm_struct *mm = vma->vm_mm; + int oldcnt; + + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); + if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { + + if (is_vma_writer_only(oldcnt - 1)) + rcuwait_wake_up(&mm->vma_writer_wait); + } +} + /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. + * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got + * detached. */ -static inline bool vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) { + int oldcnt; + /* * Check before locking. A race might cause false locked result. * We can use READ_ONCE() for the mm_lock_seq here, and don't need @@ -718,15 +754,25 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * need ordering is below. */ if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) - return false; + return NULL; - if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0)) - return false; + /* + * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() + * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. + * Acquire fence is required here to avoid reordering against later + * vm_lock_seq check and checks inside lock_vma_under_rcu(). + */ + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) { + /* return EAGAIN if vma got detached from under us */ + return oldcnt ? NULL : ERR_PTR(-EAGAIN); + } + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); /* - * Overflow might produce false locked result. + * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. * False unlocked result is impossible because we modify and check - * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq + * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq * modification invalidates all existing locks. * * We must use ACQUIRE semantics for the mm_lock_seq so that if we are @@ -735,10 +781,11 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * This pairs with RELEASE semantics in vma_end_write_all(). 
*/ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { - up_read(&vma->vm_lock.lock); - return false; + vma_refcount_put(vma); + return NULL; } - return true; + + return vma; } /* @@ -749,8 +796,14 @@ static inline bool vma_start_read(struct vm_area_struct *vma) */ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { + int oldcnt; + mmap_assert_locked(vma->vm_mm); - down_read_nested(&vma->vm_lock.lock, subclass); + if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, + VMA_REF_LIMIT))) + return false; + + rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); return true; } @@ -762,16 +815,12 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int */ static inline bool vma_start_read_locked(struct vm_area_struct *vma) { - mmap_assert_locked(vma->vm_mm); - down_read(&vma->vm_lock.lock); - return true; + return vma_start_read_locked_nested(vma, 0); } static inline void vma_end_read(struct vm_area_struct *vma) { - rcu_read_lock(); /* keeps vma alive till the end of up_read */ - up_read(&vma->vm_lock.lock); - rcu_read_unlock(); + vma_refcount_put(vma); } /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ @@ -813,38 +862,35 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) static inline void vma_assert_locked(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_lock.lock)) - vma_assert_write_locked(vma); + unsigned int mm_lock_seq; + + VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && + !__is_vma_write_locked(vma, &mm_lock_seq), vma); } +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. 
+ */ static inline void vma_assert_attached(struct vm_area_struct *vma) { - WARN_ON_ONCE(vma->detached); + WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); } static inline void vma_assert_detached(struct vm_area_struct *vma) { - WARN_ON_ONCE(!vma->detached); + WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); } static inline void vma_mark_attached(struct vm_area_struct *vma) { - vma_assert_detached(vma); - vma->detached = false; -} - -static inline void vma_mark_detached(struct vm_area_struct *vma) -{ - /* When detaching vma should be write-locked */ vma_assert_write_locked(vma); - vma_assert_attached(vma); - vma->detached = true; + vma_assert_detached(vma); + refcount_set(&vma->vm_refcnt, 1); } -static inline bool is_vma_detached(struct vm_area_struct *vma) -{ - return vma->detached; -} +void vma_mark_detached(struct vm_area_struct *vma); static inline void release_fault_lock(struct vm_fault *vmf) { @@ -867,9 +913,9 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ -static inline void vma_lock_init(struct vm_area_struct *vma) {} -static inline bool vma_start_read(struct vm_area_struct *vma) - { return false; } +static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} +static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) + { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -910,12 +956,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); -#ifdef CONFIG_PER_VMA_LOCK - /* vma is not locked, can't use vma_mark_detached() */ - vma->detached = true; -#endif vma_numab_state_init(vma); - vma_lock_init(vma); + vma_lock_init(vma, false); } /* Use when VMA is not part of the VMA tree and needs no locking */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36dea20cd101..9de0a6cb3c2d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -629,9 +630,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) } #endif -struct vma_lock { - struct rw_semaphore lock; -}; +#define VMA_LOCK_OFFSET 0x40000000 +#define VMA_REF_LIMIT (VMA_LOCK_OFFSET - 1) struct vma_numab_state { /* @@ -709,19 +709,13 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK - /* - * Flag to indicate areas detached from the mm->mm_mt tree. - * Unstable RCU readers are allowed to read this. - */ - bool detached; - /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_lock->lock (in write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_lock->lock (in read or write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * @@ -784,7 +778,10 @@ struct vm_area_struct { struct vm_userfaultfd_ctx vm_userfaultfd_ctx; #ifdef CONFIG_PER_VMA_LOCK /* Unstable RCU readers are allowed to read this. 
*/ - struct vma_lock vm_lock ____cacheline_aligned_in_smp; + refcount_t vm_refcnt ____cacheline_aligned_in_smp; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map vmlock_dep_map; +#endif #endif } __randomize_layout; @@ -920,6 +917,7 @@ struct mm_struct { * by mmlist_lock */ #ifdef CONFIG_PER_VMA_LOCK + struct rcuwait vma_writer_wait; /* * This field has lock-like semantics, meaning it is sometimes * accessed with ACQUIRE/RELEASE semantics. diff --git a/kernel/fork.c b/kernel/fork.c index f1af413e5aa4..48a0038f606f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -463,12 +463,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) * will be reinitialized. */ data_race(memcpy(new, orig, sizeof(*new))); - vma_lock_init(new); + vma_lock_init(new, true); INIT_LIST_HEAD(&new->anon_vma_chain); -#ifdef CONFIG_PER_VMA_LOCK - /* vma is not locked, can't use vma_mark_detached() */ - new->detached = true; -#endif vma_numab_state_init(new); dup_anon_vma_name(orig, new); @@ -477,6 +473,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) void __vm_area_free(struct vm_area_struct *vma) { + /* The vma should be detached while being destroyed. */ + vma_assert_detached(vma); vma_numab_state_free(vma); free_anon_vma_name(vma); kmem_cache_free(vm_area_cachep, vma); @@ -488,8 +486,6 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) struct vm_area_struct *vma = container_of(head, struct vm_area_struct, vm_rcu); - /* The vma should not be locked while being destroyed. */ - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma); __vm_area_free(vma); } #endif @@ -1234,6 +1230,9 @@ static void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); mm_lock_seqcount_init(mm); +#ifdef CONFIG_PER_VMA_LOCK + rcuwait_init(&mm->vma_writer_wait); +#endif } static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, diff --git a/mm/init-mm.c b/mm/init-mm.c index 6af3ad675930..4600e7605cab 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -40,6 +40,7 @@ struct mm_struct init_mm = { .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), #ifdef CONFIG_PER_VMA_LOCK + .vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait), .mm_lock_seq = SEQCNT_ZERO(init_mm.mm_lock_seq), #endif .user_ns = &init_user_ns, diff --git a/mm/memory.c b/mm/memory.c index f2f7dc215b6b..51f233404b02 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6353,9 +6353,47 @@ fail: #endif #ifdef CONFIG_PER_VMA_LOCK +static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) +{ + unsigned int tgt_refcnt = VMA_LOCK_OFFSET; + + /* Additional refcnt if the vma is attached. */ + if (!detaching) + tgt_refcnt++; + + /* + * If vma is detached then only vma_mark_attached() can raise the + * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). 
+ */ + if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) + return false; + + rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); + rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, + refcount_read(&vma->vm_refcnt) == tgt_refcnt, + TASK_UNINTERRUPTIBLE); + lock_acquired(&vma->vmlock_dep_map, _RET_IP_); + + return true; +} + +static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) +{ + *detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); + rwsem_release(&vma->vmlock_dep_map, _RET_IP_); +} + void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) { - down_write(&vma->vm_lock.lock); + bool locked; + + /* + * __vma_enter_locked() returns false immediately if the vma is not + * attached, otherwise it waits until refcnt is indicating that vma + * is attached with no readers. + */ + locked = __vma_enter_locked(vma, false); + /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). @@ -6363,10 +6401,40 @@ void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. */ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock.lock); + + if (locked) { + bool detached; + + __vma_exit_locked(vma, &detached); + WARN_ON_ONCE(detached); /* vma should remain attached */ + } } EXPORT_SYMBOL_GPL(__vma_start_write); +void vma_mark_detached(struct vm_area_struct *vma) +{ + vma_assert_write_locked(vma); + vma_assert_attached(vma); + + /* + * We are the only writer, so no need to use vma_refcount_put(). + * The condition below is unlikely because the vma has been already + * write-locked and readers can increment vm_refcnt only temporarily + * before they check vm_lock_seq, realize the vma is locked and drop + * back the vm_refcnt. That is a narrow window for observing a raised + * vm_refcnt. + */ + if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + /* Wait until vma is detached with no readers. */ + if (__vma_enter_locked(vma, true)) { + bool detached; + + __vma_exit_locked(vma, &detached); + WARN_ON_ONCE(!detached); + } + } +} + /* * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be * stable and not isolated. 
If the VMA is not found or is being modified the @@ -6384,15 +6452,17 @@ retry: if (!vma) goto inval; - if (!vma_start_read(vma)) - goto inval; + vma = vma_start_read(vma); + if (IS_ERR_OR_NULL(vma)) { + /* Check if the VMA got isolated after we found it */ + if (PTR_ERR(vma) == -EAGAIN) { + count_vm_vma_lock_event(VMA_LOCK_MISS); + /* The area was replaced with another one */ + goto retry; + } - /* Check if the VMA got isolated after we found it */ - if (is_vma_detached(vma)) { - vma_end_read(vma); - count_vm_vma_lock_event(VMA_LOCK_MISS); - /* The area was replaced with another one */ - goto retry; + /* Failed to lock the VMA */ + goto inval; } /* * At this point, we have a stable reference to a VMA: The VMA is diff --git a/tools/testing/vma/linux/atomic.h b/tools/testing/vma/linux/atomic.h index 3e1b6adc027b..788c597c4fde 100644 --- a/tools/testing/vma/linux/atomic.h +++ b/tools/testing/vma/linux/atomic.h @@ -9,4 +9,9 @@ #define atomic_set(x, y) uatomic_set(x, y) #define U8_MAX UCHAR_MAX +#ifndef atomic_cmpxchg_relaxed +#define atomic_cmpxchg_relaxed uatomic_cmpxchg +#define atomic_cmpxchg_release uatomic_cmpxchg +#endif /* atomic_cmpxchg_relaxed */ + #endif /* _LINUX_ATOMIC_H */ diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 34277842156c..ba838097d3f6 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include extern unsigned long stack_guard_gap; #ifdef CONFIG_MMU @@ -135,10 +135,6 @@ typedef __bitwise unsigned int vm_fault_t; */ #define pr_warn_once pr_err -typedef struct refcount_struct { - atomic_t refs; -} refcount_t; - struct kref { refcount_t refcount; }; @@ -233,15 +229,12 @@ struct mm_struct { unsigned long flags; /* Must use atomic bitops to access */ }; -struct vma_lock { - struct rw_semaphore lock; -}; - - struct file { struct address_space *f_mapping; }; +#define VMA_LOCK_OFFSET 0x40000000 + struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -269,16 +262,13 @@ struct vm_area_struct { }; #ifdef CONFIG_PER_VMA_LOCK - /* Flag to indicate areas detached from the mm->mm_mt tree */ - bool detached; - /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_lock.lock (in write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_lock.lock (in read or write mode) + * - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * @@ -287,7 +277,6 @@ struct vm_area_struct { * slowpath. */ unsigned int vm_lock_seq; - struct vma_lock vm_lock; #endif /* @@ -340,6 +329,10 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_PER_VMA_LOCK + /* Unstable RCU readers are allowed to read this. 
*/ + refcount_t vm_refcnt; +#endif } __randomize_layout; struct vm_fault {}; @@ -464,33 +457,40 @@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) return mas_find(&vmi->mas, ULONG_MAX); } -static inline void vma_lock_init(struct vm_area_struct *vma) -{ - init_rwsem(&vma->vm_lock.lock); - vma->vm_lock_seq = UINT_MAX; -} - +/* + * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these + * assertions should be made either under mmap_write_lock or when the object + * has been isolated under mmap_write_lock, ensuring no competing writers. + */ static inline void vma_assert_attached(struct vm_area_struct *vma) { - WARN_ON_ONCE(vma->detached); + WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); } static inline void vma_assert_detached(struct vm_area_struct *vma) { - WARN_ON_ONCE(!vma->detached); + WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); } static inline void vma_assert_write_locked(struct vm_area_struct *); static inline void vma_mark_attached(struct vm_area_struct *vma) { - vma->detached = false; + vma_assert_write_locked(vma); + vma_assert_detached(vma); + refcount_set(&vma->vm_refcnt, 1); } static inline void vma_mark_detached(struct vm_area_struct *vma) { - /* When detaching vma should be write-locked */ vma_assert_write_locked(vma); - vma->detached = true; + vma_assert_attached(vma); + /* We are the only writer, so no need to use vma_refcount_put(). */ + if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { + /* + * Reader must have temporarily raised vm_refcnt but it will + * drop it without using the vma since vma is write-locked. + */ + } } extern const struct vm_operations_struct vma_dummy_vm_ops; @@ -503,9 +503,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - /* vma is not locked, can't use vma_mark_detached() */ - vma->detached = true; - vma_lock_init(vma); + vma->vm_lock_seq = UINT_MAX; } static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) @@ -528,10 +526,9 @@ static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return NULL; memcpy(new, orig, sizeof(*new)); - vma_lock_init(new); + refcount_set(&new->vm_refcnt, 0); + new->vm_lock_seq = UINT_MAX; INIT_LIST_HEAD(&new->anon_vma_chain); - /* vma is not locked, can't use vma_mark_detached() */ - new->detached = true; return new; } -- 2.50.1 From 6bef4c2f97221f3b595d08c8656eb5845ef80fe9 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:50 -0800 Subject: [PATCH 07/16] mm: move lesser used vma_area_struct members into the last cacheline Move several vma_area_struct members which are rarely or never used during page fault handling into the last cacheline to better pack vm_area_struct. As a result vm_area_struct will fit into 3 as opposed to 4 cachelines. 
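The layout below can be reproduced with pahole (for example,
pahole -C vm_area_struct vmlinux); exact sizes and offsets depend on the
architecture and the config options enabled.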
New typical vm_area_struct layout: struct vm_area_struct { union { struct { long unsigned int vm_start; /* 0 8 */ long unsigned int vm_end; /* 8 8 */ }; /* 0 16 */ freeptr_t vm_freeptr; /* 0 8 */ }; /* 0 16 */ struct mm_struct * vm_mm; /* 16 8 */ pgprot_t vm_page_prot; /* 24 8 */ union { const vm_flags_t vm_flags; /* 32 8 */ vm_flags_t __vm_flags; /* 32 8 */ }; /* 32 8 */ unsigned int vm_lock_seq; /* 40 4 */ /* XXX 4 bytes hole, try to pack */ struct list_head anon_vma_chain; /* 48 16 */ /* --- cacheline 1 boundary (64 bytes) --- */ struct anon_vma * anon_vma; /* 64 8 */ const struct vm_operations_struct * vm_ops; /* 72 8 */ long unsigned int vm_pgoff; /* 80 8 */ struct file * vm_file; /* 88 8 */ void * vm_private_data; /* 96 8 */ atomic_long_t swap_readahead_info; /* 104 8 */ struct mempolicy * vm_policy; /* 112 8 */ struct vma_numab_state * numab_state; /* 120 8 */ /* --- cacheline 2 boundary (128 bytes) --- */ refcount_t vm_refcnt (__aligned__(64)); /* 128 4 */ /* XXX 4 bytes hole, try to pack */ struct { struct rb_node rb (__aligned__(8)); /* 136 24 */ long unsigned int rb_subtree_last; /* 160 8 */ } __attribute__((__aligned__(8))) shared; /* 136 32 */ struct anon_vma_name * anon_name; /* 168 8 */ struct vm_userfaultfd_ctx vm_userfaultfd_ctx; /* 176 8 */ /* size: 192, cachelines: 3, members: 18 */ /* sum members: 176, holes: 2, sum holes: 8 */ /* padding: 8 */ /* forced alignments: 2, forced holes: 1, sum forced holes: 4 */ } __attribute__((__aligned__(64))); Memory consumption per 1000 VMAs becomes 48 pages: slabinfo after vm_area_struct changes: ... : ... vm_area_struct ... 192 42 2 : ... Link: https://lkml.kernel.org/r/20250213224655.1680278-14-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 38 +++++++++++++++----------------- tools/testing/vma/vma_internal.h | 37 +++++++++++++++---------------- 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9de0a6cb3c2d..c3aa0e20be41 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -725,17 +725,6 @@ struct vm_area_struct { */ unsigned int vm_lock_seq; #endif - - /* - * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree. - * - */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma @@ -755,14 +744,6 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ -#ifdef CONFIG_ANON_VMA_NAME - /* - * For private and shared anonymous mappings, a pointer to a null - * terminated string containing the name given to the vma, or NULL if - * unnamed. 
Serialized by mmap_lock. Use anon_vma_name to access. - */ - struct anon_vma_name *anon_name; -#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif @@ -775,7 +756,6 @@ struct vm_area_struct { #ifdef CONFIG_NUMA_BALANCING struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif - struct vm_userfaultfd_ctx vm_userfaultfd_ctx; #ifdef CONFIG_PER_VMA_LOCK /* Unstable RCU readers are allowed to read this. */ refcount_t vm_refcnt ____cacheline_aligned_in_smp; @@ -783,6 +763,24 @@ struct vm_area_struct { struct lockdep_map vmlock_dep_map; #endif #endif + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. + * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. + */ + struct anon_vma_name *anon_name; +#endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; #ifdef CONFIG_NUMA diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index ba838097d3f6..b385170fbb8f 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -279,16 +279,6 @@ struct vm_area_struct { unsigned int vm_lock_seq; #endif - /* - * For areas with an address space and backing store, - * linkage into the address_space->i_mmap interval tree. - * - */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; - /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma * list, after a COW of one of the file pages. A MAP_SHARED vma @@ -308,14 +298,6 @@ struct vm_area_struct { struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ -#ifdef CONFIG_ANON_VMA_NAME - /* - * For private and shared anonymous mappings, a pointer to a null - * terminated string containing the name given to the vma, or NULL if - * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. - */ - struct anon_vma_name *anon_name; -#endif #ifdef CONFIG_SWAP atomic_long_t swap_readahead_info; #endif @@ -328,11 +310,28 @@ struct vm_area_struct { #ifdef CONFIG_NUMA_BALANCING struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif - struct vm_userfaultfd_ctx vm_userfaultfd_ctx; #ifdef CONFIG_PER_VMA_LOCK /* Unstable RCU readers are allowed to read this. */ refcount_t vm_refcnt; #endif + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. + * + */ + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; +#ifdef CONFIG_ANON_VMA_NAME + /* + * For private and shared anonymous mappings, a pointer to a null + * terminated string containing the name given to the vma, or NULL if + * unnamed. Serialized by mmap_lock. Use anon_vma_name to access. 
+ */ + struct anon_vma_name *anon_name; +#endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; } __randomize_layout; struct vm_fault {}; -- 2.50.1 From 3dd98c5c442358c5aefa13e2c91ee9dee32d776e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:51 -0800 Subject: [PATCH 08/16] mm/debug: print vm_refcnt state when dumping the vma vm_refcnt encodes a number of useful states: - whether vma is attached or detached - the number of current vma readers - presence of a vma writer Let's include it in the vma dump. Link: https://lkml.kernel.org/r/20250213224655.1680278-15-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Vlastimil Babka Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- mm/debug.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/debug.c b/mm/debug.c index e1282b85a877..2d1bd67d957b 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -181,11 +181,17 @@ void dump_vma(const struct vm_area_struct *vma) pr_emerg("vma %px start %px end %px mm %px\n" "prot %lx anon_vma %px vm_ops %px\n" "pgoff %lx file %px private_data %px\n" +#ifdef CONFIG_PER_VMA_LOCK + "refcnt %x\n" +#endif "flags: %#lx(%pGv)\n", vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm, (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data, +#ifdef CONFIG_PER_VMA_LOCK + refcount_read(&vma->vm_refcnt), +#endif vma->vm_flags, &vma->vm_flags); } EXPORT_SYMBOL(dump_vma); -- 2.50.1 From e218d9fedd056a0b17468d6e19fddaf2c3550f97 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:52 -0800 Subject: [PATCH 09/16] mm: remove extra vma_numab_state_init() call vma_init() already memset's the whole vm_area_struct to 0, so there is no need to an additional vma_numab_state_init(). Link: https://lkml.kernel.org/r/20250213224655.1680278-16-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . 
McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 06f179c844c3..aad932c4bcf0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -956,7 +956,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_mm = mm; vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); - vma_numab_state_init(vma); vma_lock_init(vma, false); } -- 2.50.1 From e49510bf00de4f832eaaebcfc113795f127ad519 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:53 -0800 Subject: [PATCH 10/16] mm: prepare lock_vma_under_rcu() for vma reuse possibility Once we make vma cache SLAB_TYPESAFE_BY_RCU, it will be possible for a vma to be reused and attached to another mm after lock_vma_under_rcu() locks the vma. lock_vma_under_rcu() should ensure that vma_start_read() is using the original mm and after locking the vma it should ensure that vma->vm_mm has not changed from under us. Link: https://lkml.kernel.org/r/20250213224655.1680278-17-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++++---- mm/memory.c | 7 ++++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index aad932c4bcf0..e3f962de1677 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -739,10 +739,13 @@ static inline void vma_refcount_put(struct vm_area_struct *vma) * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to * using mmap_lock. The function should never yield false unlocked result. + * False locked result is possible if mm_lock_seq overflows or if vma gets + * reused and attached to a different mm before we lock it. * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got * detached. */ -static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) { int oldcnt; @@ -753,7 +756,7 @@ static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) * we don't rely on for anything - the mm_lock_seq read against which we * need ordering is below. */ - if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) + if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) return NULL; /* @@ -780,7 +783,7 @@ static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) * after it has been unlocked. * This pairs with RELEASE semantics in vma_end_write_all(). 
*/ - if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { + if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) { vma_refcount_put(vma); return NULL; } @@ -914,7 +917,8 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} -static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) +static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm, + struct vm_area_struct *vma) { return NULL; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} diff --git a/mm/memory.c b/mm/memory.c index 51f233404b02..39bceed7448f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6452,7 +6452,7 @@ retry: if (!vma) goto inval; - vma = vma_start_read(vma); + vma = vma_start_read(mm, vma); if (IS_ERR_OR_NULL(vma)) { /* Check if the VMA got isolated after we found it */ if (PTR_ERR(vma) == -EAGAIN) { @@ -6471,8 +6471,9 @@ retry: * fields are accessible for RCU readers. */ - /* Check since vm_start/vm_end might change before we lock the VMA */ - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + /* Check if the vma we locked is the right one. */ + if (unlikely(vma->vm_mm != mm || + address < vma->vm_start || address >= vma->vm_end)) goto inval_end_read; rcu_read_unlock(); -- 2.50.1 From 3104138517fc66aad21f4a2487bb572e9fc2e3ec Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:54 -0800 Subject: [PATCH 11/16] mm: make vma cache SLAB_TYPESAFE_BY_RCU To enable SLAB_TYPESAFE_BY_RCU for vma cache we need to ensure that object reuse before RCU grace period is over will be detected by lock_vma_under_rcu(). Current checks are sufficient as long as vma is detached before it is freed. The only place this is not currently happening is in exit_mmap(). Add the missing vma_mark_detached() in exit_mmap(). Another issue which might trick lock_vma_under_rcu() during vma reuse is vm_area_dup(), which copies the entire content of the vma into a new one, overriding new vma's vm_refcnt and temporarily making it appear as attached. This might trick a racing lock_vma_under_rcu() to operate on a reused vma if it found the vma before it got reused. To prevent this situation, we should ensure that vm_refcnt stays at detached state (0) when it is copied and advances to attached state only after it is added into the vma tree. Introduce vm_area_init_from() which preserves new vma's vm_refcnt and use it in vm_area_dup(). Since all vmas are in detached state with no current readers when they are freed, lock_vma_under_rcu() will not be able to take vm_refcnt after vma got detached even if vma is reused. vma_mark_attached() in modified to include a release fence to ensure all stores to the vma happen before vm_refcnt gets initialized. Finally, make vm_area_cachep SLAB_TYPESAFE_BY_RCU. This will facilitate vm_area_struct reuse and will minimize the number of call_rcu() calls. 
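As a hedged illustration of the SLAB_TYPESAFE_BY_RCU reader discipline this commit relies on (not code introduced by the series): an RCU reader can find an object that has already been freed and reused, so it must take a reference that is allowed to fail (detached objects sit at refcount 0) and then re-validate the object's identity once the reference is held. Every name in the sketch below (struct item, item_find(), item_put()) is hypothetical; lock_vma_under_rcu() applies the same idea using vm_refcnt and the vma->vm_mm / address-range re-checks added in the previous patch.

/*
 * Illustrative sketch only -- hypothetical types and helpers.
 * Shows the tryget-then-revalidate pattern required once a slab
 * cache is SLAB_TYPESAFE_BY_RCU.
 */
#include <linux/rcupdate.h>
#include <linux/refcount.h>

struct mm_struct;

struct item {
        struct mm_struct *owner;   /* identity to re-check, like vma->vm_mm */
        refcount_t refcnt;         /* 0 == detached, like vm_refcnt */
};

/* hypothetical helpers: lookup in an RCU-safe index, and a reference drop */
struct item *item_find(struct mm_struct *mm, unsigned long key);
void item_put(struct item *it);

static struct item *item_get_rcu(struct mm_struct *mm, unsigned long key)
{
        struct item *it;

        rcu_read_lock();
        it = item_find(mm, key);
        if (!it)
                goto fail;

        /*
         * The slab may already have reused this object; a detached
         * object has refcnt == 0, so the tryget fails rather than
         * resurrecting it.
         */
        if (!refcount_inc_not_zero(&it->refcnt))
                goto fail;

        /*
         * Re-validate identity after the reference is held: if the
         * object was recycled for a different owner, drop it.  This
         * mirrors the vma->vm_mm != mm check in lock_vma_under_rcu().
         */
        if (READ_ONCE(it->owner) != mm) {
                item_put(it);
                goto fail;
        }

        rcu_read_unlock();
        return it;

fail:
        rcu_read_unlock();
        return NULL;
}

SLAB_TYPESAFE_BY_RCU only guarantees that the memory stays type-stable for RCU readers across free and reuse; it does not prevent reuse itself, which is why the tryget-plus-revalidate step is mandatory and why this patch keeps vm_refcnt at 0 (detached) while vm_area_init_from() copies the other fields.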
[surenb@google.com: remove atomic_set_release() usage in tools/] Link: https://lkml.kernel.org/r/20250217054351.2973666-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-18-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Liam R. Howlett Cc: Lokesh Gidra Cc: Lorenzo Stoakes Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 +- include/linux/mm_types.h | 13 ++++-- include/linux/slab.h | 6 --- kernel/fork.c | 73 ++++++++++++++++++++------------ mm/mmap.c | 3 +- mm/vma.c | 11 ++--- mm/vma.h | 2 +- tools/include/linux/refcount.h | 5 +++ tools/testing/vma/vma_internal.h | 9 +--- 9 files changed, 70 insertions(+), 56 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e3f962de1677..14115c9949d8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -258,8 +258,6 @@ void setup_initial_init_mm(void *start_code, void *end_code, struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); -/* Use only if VMA has no other users */ -void __vm_area_free(struct vm_area_struct *vma); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; @@ -890,7 +888,7 @@ static inline void vma_mark_attached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_detached(vma); - refcount_set(&vma->vm_refcnt, 1); + refcount_set_release(&vma->vm_refcnt, 1); } void vma_mark_detached(struct vm_area_struct *vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c3aa0e20be41..6a93abb4452b 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -574,6 +574,12 @@ static inline void *folio_get_private(struct folio *folio) typedef unsigned long vm_flags_t; +/* + * freeptr_t represents a SLUB freelist pointer, which might be encoded + * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. + */ +typedef struct { unsigned long v; } freeptr_t; + /* * A region containing a mapping of a non-memory backed file under NOMMU * conditions. These are held in a global tree and are pinned by the VMAs that @@ -677,6 +683,9 @@ struct vma_numab_state { * * Only explicitly marked struct members may be accessed by RCU readers before * getting a stable reference. + * + * WARNING: when adding new members, please update vm_area_init_from() to copy + * them during vm_area_struct content duplication. */ struct vm_area_struct { /* The first cache line has the info for VMA tree walking. */ @@ -687,9 +696,7 @@ struct vm_area_struct { unsigned long vm_start; unsigned long vm_end; }; -#ifdef CONFIG_PER_VMA_LOCK - struct rcu_head vm_rcu; /* Used for deferred freeing. 
*/ -#endif + freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */ }; /* diff --git a/include/linux/slab.h b/include/linux/slab.h index ad902a2d692b..f8924fd6ea26 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -243,12 +243,6 @@ enum _slab_flag_bits { #define SLAB_NO_OBJ_EXT __SLAB_FLAG_UNUSED #endif -/* - * freeptr_t represents a SLUB freelist pointer, which might be encoded - * and not dereferenceable if CONFIG_SLAB_FREELIST_HARDENED is enabled. - */ -typedef struct { unsigned long v; } freeptr_t; - /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * diff --git a/kernel/fork.c b/kernel/fork.c index 48a0038f606f..364b2d4fd3ef 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -449,6 +449,42 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) return vma; } +static void vm_area_init_from(const struct vm_area_struct *src, + struct vm_area_struct *dest) +{ + dest->vm_mm = src->vm_mm; + dest->vm_ops = src->vm_ops; + dest->vm_start = src->vm_start; + dest->vm_end = src->vm_end; + dest->anon_vma = src->anon_vma; + dest->vm_pgoff = src->vm_pgoff; + dest->vm_file = src->vm_file; + dest->vm_private_data = src->vm_private_data; + vm_flags_init(dest, src->vm_flags); + memcpy(&dest->vm_page_prot, &src->vm_page_prot, + sizeof(dest->vm_page_prot)); + /* + * src->shared.rb may be modified concurrently when called from + * dup_mmap(), but the clone will reinitialize it. + */ + data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); + memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, + sizeof(dest->vm_userfaultfd_ctx)); +#ifdef CONFIG_ANON_VMA_NAME + dest->anon_name = src->anon_name; +#endif +#ifdef CONFIG_SWAP + memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, + sizeof(dest->swap_readahead_info)); +#endif +#ifndef CONFIG_MMU + dest->vm_region = src->vm_region; +#endif +#ifdef CONFIG_NUMA + dest->vm_policy = src->vm_policy; +#endif +} + struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); @@ -458,11 +494,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ASSERT_EXCLUSIVE_WRITER(orig->vm_file); - /* - * orig->shared.rb may be modified concurrently, but the clone - * will be reinitialized. - */ - data_race(memcpy(new, orig, sizeof(*new))); + vm_area_init_from(orig, new); vma_lock_init(new, true); INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); @@ -471,7 +503,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return new; } -void __vm_area_free(struct vm_area_struct *vma) +void vm_area_free(struct vm_area_struct *vma) { /* The vma should be detached while being destroyed. 
*/ vma_assert_detached(vma); @@ -480,25 +512,6 @@ void __vm_area_free(struct vm_area_struct *vma) kmem_cache_free(vm_area_cachep, vma); } -#ifdef CONFIG_PER_VMA_LOCK -static void vm_area_free_rcu_cb(struct rcu_head *head) -{ - struct vm_area_struct *vma = container_of(head, struct vm_area_struct, - vm_rcu); - - __vm_area_free(vma); -} -#endif - -void vm_area_free(struct vm_area_struct *vma) -{ -#ifdef CONFIG_PER_VMA_LOCK - call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); -#else - __vm_area_free(vma); -#endif -} - static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { @@ -3156,6 +3169,11 @@ void __init mm_cache_init(void) void __init proc_caches_init(void) { + struct kmem_cache_args args = { + .use_freeptr_offset = true, + .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), + }; + sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| @@ -3172,8 +3190,9 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, - SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC| + vm_area_cachep = kmem_cache_create("vm_area_struct", + sizeof(struct vm_area_struct), &args, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); diff --git a/mm/mmap.c b/mm/mmap.c index 6401a1d73f4a..15d6cd7cc845 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1305,7 +1305,8 @@ void exit_mmap(struct mm_struct *mm) do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - remove_vma(vma, /* unreachable = */ true); + vma_mark_detached(vma); + remove_vma(vma); count++; cond_resched(); vma = vma_next(&vmi); diff --git a/mm/vma.c b/mm/vma.c index 53f4d0efce4d..5cdc5612bfc1 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -420,19 +420,14 @@ static bool can_vma_merge_right(struct vma_merge_struct *vmg, /* * Close a vm structure and free it. 
*/ -void remove_vma(struct vm_area_struct *vma, bool unreachable) +void remove_vma(struct vm_area_struct *vma) { might_sleep(); vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); - if (unreachable) { - vma_mark_detached(vma); - __vm_area_free(vma); - } else { - vm_area_free(vma); - } + vm_area_free(vma); } /* @@ -1218,7 +1213,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) - remove_vma(vma, /* unreachable = */ false); + remove_vma(vma); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); diff --git a/mm/vma.h b/mm/vma.h index 55be77ff042f..7356ca5a22d3 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -218,7 +218,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); -void remove_vma(struct vm_area_struct *vma, bool unreachable); +void remove_vma(struct vm_area_struct *vma); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); diff --git a/tools/include/linux/refcount.h b/tools/include/linux/refcount.h index 36cb29bc57c2..1f30956e070d 100644 --- a/tools/include/linux/refcount.h +++ b/tools/include/linux/refcount.h @@ -60,6 +60,11 @@ static inline void refcount_set(refcount_t *r, unsigned int n) atomic_set(&r->refs, n); } +static inline void refcount_set_release(refcount_t *r, unsigned int n) +{ + atomic_set(&r->refs, n); +} + static inline unsigned int refcount_read(const refcount_t *r) { return atomic_read(&r->refs); diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index b385170fbb8f..572ab2cea763 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -476,7 +476,7 @@ static inline void vma_mark_attached(struct vm_area_struct *vma) { vma_assert_write_locked(vma); vma_assert_detached(vma); - refcount_set(&vma->vm_refcnt, 1); + refcount_set_release(&vma->vm_refcnt, 1); } static inline void vma_mark_detached(struct vm_area_struct *vma) @@ -696,14 +696,9 @@ static inline void mpol_put(struct mempolicy *) { } -static inline void __vm_area_free(struct vm_area_struct *vma) -{ - free(vma); -} - static inline void vm_area_free(struct vm_area_struct *vma) { - __vm_area_free(vma); + free(vma); } static inline void lru_add_drain(void) -- 2.50.1 From 795f29616e85aff32248e695c9cc1fbc8b4c9632 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 13 Feb 2025 14:46:55 -0800 Subject: [PATCH 12/16] docs/mm: document latest changes to vm_lock Change the documentation to reflect that vm_lock is integrated into vma and replaced with vm_refcnt. Document newly introduced vma_start_read_locked{_nested} functions. Link: https://lkml.kernel.org/r/20250213224655.1680278-19-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Reviewed-by: Lorenzo Stoakes Tested-by: Shivank Garg Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Jann Horn Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Klara Modin Cc: Lokesh Gidra Cc: Mateusz Guzik Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Oleg Nesterov Cc: Pasha Tatashin Cc: "Paul E . 
McKenney" Cc: Peter Xu Cc: Peter Zijlstra (Intel) Cc: Shakeel Butt Cc: Sourav Panda Cc: Suren Baghdasaryan Cc: Wei Yang Cc: Will Deacon Cc: Heiko Carstens Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- Documentation/mm/process_addrs.rst | 44 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/Documentation/mm/process_addrs.rst b/Documentation/mm/process_addrs.rst index 81417fa2ed20..e6756e78b476 100644 --- a/Documentation/mm/process_addrs.rst +++ b/Documentation/mm/process_addrs.rst @@ -716,9 +716,14 @@ calls :c:func:`!rcu_read_lock` to ensure that the VMA is looked up in an RCU critical section, then attempts to VMA lock it via :c:func:`!vma_start_read`, before releasing the RCU lock via :c:func:`!rcu_read_unlock`. -VMA read locks hold the read lock on the :c:member:`!vma->vm_lock` semaphore for -their duration and the caller of :c:func:`!lock_vma_under_rcu` must release it -via :c:func:`!vma_end_read`. +In cases when the user already holds mmap read lock, :c:func:`!vma_start_read_locked` +and :c:func:`!vma_start_read_locked_nested` can be used. These functions do not +fail due to lock contention but the caller should still check their return values +in case they fail for other reasons. + +VMA read locks increment :c:member:`!vma.vm_refcnt` reference counter for their +duration and the caller of :c:func:`!lock_vma_under_rcu` must drop it via +:c:func:`!vma_end_read`. VMA **write** locks are acquired via :c:func:`!vma_start_write` in instances where a VMA is about to be modified, unlike :c:func:`!vma_start_read` the lock is always @@ -726,9 +731,9 @@ acquired. An mmap write lock **must** be held for the duration of the VMA write lock, releasing or downgrading the mmap write lock also releases the VMA write lock so there is no :c:func:`!vma_end_write` function. -Note that a semaphore write lock is not held across a VMA lock. Rather, a -sequence number is used for serialisation, and the write semaphore is only -acquired at the point of write lock to update this. +Note that when write-locking a VMA lock, the :c:member:`!vma.vm_refcnt` is temporarily +modified so that readers can detect the presense of a writer. The reference counter is +restored once the vma sequence number used for serialisation is updated. This ensures the semantics we require - VMA write locks provide exclusive write access to the VMA. @@ -738,7 +743,7 @@ Implementation details The VMA lock mechanism is designed to be a lightweight means of avoiding the use of the heavily contended mmap lock. It is implemented using a combination of a -read/write semaphore and sequence numbers belonging to the containing +reference counter and sequence numbers belonging to the containing :c:struct:`!struct mm_struct` and the VMA. Read locks are acquired via :c:func:`!vma_start_read`, which is an optimistic @@ -779,28 +784,31 @@ release of any VMA locks on its release makes sense, as you would never want to keep VMAs locked across entirely separate write operations. It also maintains correct lock ordering. -Each time a VMA read lock is acquired, we acquire a read lock on the -:c:member:`!vma->vm_lock` read/write semaphore and hold it, while checking that -the sequence count of the VMA does not match that of the mm. +Each time a VMA read lock is acquired, we increment :c:member:`!vma.vm_refcnt` +reference counter and check that the sequence count of the VMA does not match +that of the mm. -If it does, the read lock fails. 
If it does not, we hold the lock, excluding -writers, but permitting other readers, who will also obtain this lock under RCU. +If it does, the read lock fails and :c:member:`!vma.vm_refcnt` is dropped. +If it does not, we keep the reference counter raised, excluding writers, but +permitting other readers, who can also obtain this lock under RCU. Importantly, maple tree operations performed in :c:func:`!lock_vma_under_rcu` are also RCU safe, so the whole read lock operation is guaranteed to function correctly. -On the write side, we acquire a write lock on the :c:member:`!vma->vm_lock` -read/write semaphore, before setting the VMA's sequence number under this lock, -also simultaneously holding the mmap write lock. +On the write side, we set a bit in :c:member:`!vma.vm_refcnt` which can't be +modified by readers and wait for all readers to drop their reference count. +Once there are no readers, the VMA's sequence number is set to match that of +the mm. During this entire operation mmap write lock is held. This way, if any read locks are in effect, :c:func:`!vma_start_write` will sleep until these are finished and mutual exclusion is achieved. -After setting the VMA's sequence number, the lock is released, avoiding -complexity with a long-term held write lock. +After setting the VMA's sequence number, the bit in :c:member:`!vma.vm_refcnt` +indicating a writer is cleared. From this point on, VMA's sequence number will +indicate VMA's write-locked state until mmap write lock is dropped or downgraded. -This clever combination of a read/write semaphore and sequence count allows for +This clever combination of a reference counter and sequence count allows for fast RCU-based per-VMA lock acquisition (especially on page fault, though utilised elsewhere) with minimal complexity around lock ordering. -- 2.50.1 From fcd807a03b864e2c7b2aa5eaade185127c4e2414 Mon Sep 17 00:00:00 2001 From: Marcelo Moreira Date: Mon, 17 Feb 2025 18:54:31 -0300 Subject: [PATCH 13/16] Docs/mm/damon: fix spelling and grammar in monitoring_intervals_tuning_example.rst This patch fixes some spelling and grammar mistakes in the documentation, improving the readability. - multipled -> multiplied - idential -> identical - minuts -> minutes - efficieny -> efficiency Link: https://lkml.kernel.org/r/20250217215512.12833-1-marcelomoreira1905@gmail.com Signed-off-by: Marcelo Moreira Reviewed-by: SeongJae Park Cc: Shuah khan Signed-off-by: Andrew Morton --- .../mm/damon/monitoring_intervals_tuning_example.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Documentation/mm/damon/monitoring_intervals_tuning_example.rst b/Documentation/mm/damon/monitoring_intervals_tuning_example.rst index 334a854efb40..7207cbed591f 100644 --- a/Documentation/mm/damon/monitoring_intervals_tuning_example.rst +++ b/Documentation/mm/damon/monitoring_intervals_tuning_example.rst @@ -36,7 +36,7 @@ Then, list the DAMON-found regions of different access patterns, sorted by the "access temperature". "Access temperature" is a metric representing the access-hotness of a region. It is calculated as a weighted sum of the access frequency and the age of the region. If the access frequency is 0 %, the -temperature is multipled by minus one. That is, if a region is not accessed, +temperature is multiplied by minus one. That is, if a region is not accessed, it gets minus temperature and it gets lower as not accessed for longer time. 
The sorting is in temperature-ascendint order, so the region at the top of the list is the coldest, and the one at the bottom is the hottest one. :: @@ -58,11 +58,11 @@ list is the coldest, and the one at the bottom is the hottest one. :: The list shows not seemingly hot regions, and only minimum access pattern diversity. Every region has zero access frequency. The number of region is 10, which is the default ``min_nr_regions value``. Size of each region is also -nearly idential. We can suspect this is because “adaptive regions adjustment” +nearly identical. We can suspect this is because “adaptive regions adjustment” mechanism was not well working. As the guide suggested, we can get relative hotness of regions using ``age`` as the recency information. That would be better than nothing, but given the fact that the longest age is only about 6 -seconds while we waited about ten minuts, it is unclear how useful this will +seconds while we waited about ten minutes, it is unclear how useful this will be. The temperature ranges to total size of regions of each range histogram @@ -190,7 +190,7 @@ for sampling and aggregation intervals, respectively). :: The number of regions having different access patterns has significantly increased. Size of each region is also more varied. Total size of non-zero access frequency regions is also significantly increased. Maybe this is already -good enough to make some meaningful memory management efficieny changes. +good enough to make some meaningful memory management efficiency changes. 800ms/16s intervals: Another bias ================================= -- 2.50.1 From 63a23847dc47113b879a5f53cc0ca5cedc881ffd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:06 +0000 Subject: [PATCH 14/16] fs: convert block_commit_write() to take a folio All callers now have a folio, so pass it in instead of converting folio->page->folio. 
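Since the commit message is terse, here is a hedged sketch of what the conversion means at a call site; the helper name below is hypothetical and its body is condensed from the block_page_mkwrite() and udf_page_mkwrite() hunks in this patch. The prototype moves from (struct page *, unsigned, unsigned) to (struct folio *, size_t, size_t), so callers that already hold a folio no longer go through &folio->page.

#include <linux/buffer_head.h>
#include <linux/pagemap.h>
#include <linux/mm_types.h>

/* hypothetical caller, mirroring the ->page_mkwrite() hunks below */
static vm_fault_t example_commit_mkwrite(struct folio *folio, size_t end)
{
        block_commit_write(folio, 0, end); /* was: block_commit_write(&folio->page, 0, end) */
        folio_mark_dirty(folio);
        folio_wait_stable(folio);
        return VM_FAULT_LOCKED;
}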
Link: https://lkml.kernel.org/r/20250217192009.437916-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/buffer.c | 14 ++++---------- fs/ext4/inline.c | 2 +- fs/ext4/move_extent.c | 2 +- fs/iomap/buffered-io.c | 2 +- fs/ocfs2/aops.c | 4 ++-- fs/ocfs2/file.c | 2 +- fs/udf/file.c | 2 +- include/linux/buffer_head.h | 2 +- 8 files changed, 12 insertions(+), 18 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index cc8452f60251..c66a59bb068b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2166,7 +2166,7 @@ int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, } EXPORT_SYMBOL(__block_write_begin); -static void __block_commit_write(struct folio *folio, size_t from, size_t to) +void block_commit_write(struct folio *folio, size_t from, size_t to) { size_t block_start, block_end; bool partial = false; @@ -2204,6 +2204,7 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to) if (!partial) folio_mark_uptodate(folio); } +EXPORT_SYMBOL(block_commit_write); /* * block_write_begin takes care of the basic task of block allocation and @@ -2262,7 +2263,7 @@ int block_write_end(struct file *file, struct address_space *mapping, flush_dcache_folio(folio); /* This could be a short (even 0-length) commit */ - __block_commit_write(folio, start, start + copied); + block_commit_write(folio, start, start + copied); return copied; } @@ -2578,13 +2579,6 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(cont_write_begin); -void block_commit_write(struct page *page, unsigned from, unsigned to) -{ - struct folio *folio = page_folio(page); - __block_commit_write(folio, from, to); -} -EXPORT_SYMBOL(block_commit_write); - /* * block_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. 
Hence we must @@ -2630,7 +2624,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (unlikely(ret)) goto out_unlock; - __block_commit_write(folio, 0, end); + block_commit_write(folio, 0, end); folio_mark_dirty(folio); folio_wait_stable(folio); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3536ca7e4fcc..0af474c8b260 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -637,7 +637,7 @@ retry: goto retry; if (folio) - block_commit_write(&folio->page, from, to); + block_commit_write(folio, from, to); out: if (folio) { folio_unlock(folio); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 898443e98efc..48649be64d6a 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -399,7 +399,7 @@ data_copy: bh = bh->b_this_page; } - block_commit_write(&folio[0]->page, from, from + replaced_size); + block_commit_write(folio[0], from, from + replaced_size); /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d303e6c8900c..f3904d13cda4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1484,7 +1484,7 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, &iter->iomap); if (ret) return ret; - block_commit_write(&folio->page, 0, length); + block_commit_write(folio, 0, length); } else { WARN_ON_ONCE(!folio_test_uptodate(folio)); folio_mark_dirty(folio); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 5bbeb6fbb1ac..ee1d92ed950f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -920,7 +920,7 @@ static void ocfs2_write_failure(struct inode *inode, ocfs2_jbd2_inode_add_write(wc->w_handle, inode, user_pos, user_len); - block_commit_write(&folio->page, from, to); + block_commit_write(folio, from, to); } } } @@ -2012,7 +2012,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); } - block_commit_write(&folio->page, from, to); + block_commit_write(folio, from, to); } } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e54f2c4b5a90..2056cf08ac1e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -813,7 +813,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* must not update i_size! 
*/ - block_commit_write(&folio->page, block_start + 1, block_start + 1); + block_commit_write(folio, block_start + 1, block_start + 1); } /* diff --git a/fs/udf/file.c b/fs/udf/file.c index 412fe7c4d348..0d76c4f37b3e 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -69,7 +69,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) goto out_unlock; } - block_commit_write(&folio->page, 0, end); + block_commit_write(folio, 0, end); out_dirty: folio_mark_dirty(folio); folio_wait_stable(folio); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 932139c5d46f..6672e1a5031c 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -271,7 +271,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct folio **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); -void block_commit_write(struct page *page, unsigned int from, unsigned int to); +void block_commit_write(struct folio *folio, size_t from, size_t to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); -- 2.50.1 From 52d671a1a36a16f3a0dd9a2beff964e75bce9787 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:07 +0000 Subject: [PATCH 15/16] fs: remove page_file_mapping() This wrapper has no more callers. Delete it. Link: https://lkml.kernel.org/r/20250217192009.437916-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..975c56fb4f85 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -575,11 +575,6 @@ static inline struct address_space *folio_flush_mapping(struct folio *folio) return folio_mapping(folio); } -static inline struct address_space *page_file_mapping(struct page *page) -{ - return folio_file_mapping(page_folio(page)); -} - /** * folio_inode - Get the host inode for this folio. * @folio: The folio. -- 2.50.1 From 0d40cfe63a2f19b9d375382e6d90b9ebd412901e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 17 Feb 2025 19:20:08 +0000 Subject: [PATCH 16/16] fs: remove folio_file_mapping() No callers of this function remain as filesystems no longer see swapfile pages through their normal read/write paths. Link: https://lkml.kernel.org/r/20250217192009.437916-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 975c56fb4f85..ad7c0f615e9b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -535,26 +535,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); -/** - * folio_file_mapping - Find the mapping this folio belongs to. - * @folio: The folio. - * - * For folios which are in the page cache, return the mapping that this - * page belongs to. Folios in the swap cache return the mapping of the - * swap file or swap device where the data is stored. This is different - * from the mapping returned by folio_mapping(). The only reason to - * use it is if, like NFS, you return 0 from ->activate_swapfile. 
- * - * Do not call this for folios which aren't in the page cache or swap cache. - */ -static inline struct address_space *folio_file_mapping(struct folio *folio) -{ - if (unlikely(folio_test_swapcache(folio))) - return swapcache_mapping(folio); - - return folio->mapping; -} - /** * folio_flush_mapping - Find the file mapping this folio belongs to. * @folio: The folio. -- 2.50.1
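For reference, a hedged sketch that open-codes the behaviour of the just-removed folio_file_mapping(), should a reader or an out-of-tree caller still need the swap-cache-aware lookup: the function name below is hypothetical, and the body simply restates the deleted helper using swapcache_mapping(), which remains declared in pagemap.h. In-tree filesystems should not need it, per the commit message above.

#include <linux/pagemap.h>

/* hypothetical name; equivalent to the removed folio_file_mapping() */
static inline struct address_space *example_file_mapping(struct folio *folio)
{
        if (unlikely(folio_test_swapcache(folio)))
                return swapcache_mapping(folio);

        return folio->mapping;
}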