mm: add per-VMA lock and helper functions to control it
author Suren Baghdasaryan <surenb@google.com>
Wed, 11 May 2022 23:32:37 +0000 (16:32 -0700)
committer Liam R. Howlett <Liam.Howlett@oracle.com>
Wed, 4 Jan 2023 20:59:24 +0000 (15:59 -0500)
Introduce a per-VMA rw_semaphore to be used during page fault handling
instead of mmap_lock. Because there are cases when multiple VMAs need
to be exclusively locked during VMA tree modifications, instead of the
usual lock/unlock pattern we mark a VMA as locked by taking the per-VMA
lock exclusively and setting vma->vm_lock_seq to the current
mm->mm_lock_seq. When the mmap_write_lock holder is done with all
modifications and drops mmap_lock, it increments mm->mm_lock_seq,
effectively unlocking all VMAs marked as locked.
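
To illustrate the intended usage (not part of this patch), here is a rough
sketch of how a fault path could try the per-VMA lock and fall back to
mmap_lock. The lockless lookup helper find_vma_nolock() and the fault helper
handle_fault_locked() are hypothetical placeholders; this patch only adds the
locking primitives themselves:

	struct vm_area_struct *vma;
	vm_fault_t ret;

	/* Hypothetical lockless lookup; this patch does not add one. */
	vma = find_vma_nolock(mm, address);
	if (vma && vma_read_trylock(vma)) {
		/* Fault handled under the per-VMA lock only. */
		ret = handle_fault_locked(vma, address);
		vma_read_unlock(vma);
	} else {
		/* Fall back to the usual mmap_lock-protected path. */
		mmap_read_lock(mm);
		vma = find_vma(mm, address);
		ret = handle_fault_locked(vma, address);
		mmap_read_unlock(mm);
	}

	/* Writer side: mark each modified VMA, then one unlock for all. */
	mmap_write_lock(mm);
	vma_write_lock(vma);	/* sets vma->vm_lock_seq = mm->mm_lock_seq */
	/* ... modify the VMA / VMA tree ... */
	mmap_write_unlock(mm);	/* bumps mm->mm_lock_seq, unlocking all marked VMAs */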
The VMA lock is placed on a cache line boundary so that its 'count' field
falls into the first cache line while the rest of the fields fall into
the second cache line. This lets the 'count' field be cached with other
frequently accessed fields and used quickly in the uncontended case,
while 'owner' and the other fields used in the contended case will not
invalidate the first cache line while waiting on the lock.
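
For reference, the resulting field offsets and cache line boundaries of
vm_area_struct can be inspected with pahole (assuming a vmlinux built with
debug info), e.g.:

	pahole -C vm_area_struct vmlinux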

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmap_lock.h
kernel/fork.c
mm/init-mm.c

index 8bbcccbc55654341c4d2b361549e1da4da6d65cb..8d5bb4e1b4e6a7c8218549c8d69f91552bad35b1 100644 (file)
@@ -611,6 +611,84 @@ struct vm_operations_struct {
                                          unsigned long addr);
 };
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+       init_rwsem(&vma->lock);
+       vma->vm_lock_seq = -1;
+}
+
+static inline void vma_write_lock(struct vm_area_struct *vma)
+{
+       int mm_lock_seq;
+
+       mmap_assert_write_locked(vma->vm_mm);
+
+       /*
+        * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+        * mm->mm_lock_seq can't be concurrently modified.
+        */
+       mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+       if (vma->vm_lock_seq == mm_lock_seq)
+               return;
+
+       down_write(&vma->lock);
+       vma->vm_lock_seq = mm_lock_seq;
+       up_write(&vma->lock);
+}
+
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+{
+       if (unlikely(down_read_trylock(&vma->lock) == 0))
+               return false;
+
+       /*
+        * Overflow might produce false locked result but it's not critical
+        * because we just fall back to using mmap_lock in such case.
+        * False unlocked result is critical but is impossible because we
+        * modify and check vma->vm_lock_seq under vma->lock protection and
+        * mm->mm_lock_seq modification invalidates all existing locks.
+        */
+       if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) {
+               up_read(&vma->lock);
+               return false;
+       }
+       return true;
+}
+
+static inline void vma_read_unlock(struct vm_area_struct *vma)
+{
+       up_read(&vma->lock);
+}
+
+static inline void vma_assert_locked(struct vm_area_struct *vma)
+{
+       lockdep_assert_held(&vma->lock);
+       VM_BUG_ON_VMA(!rwsem_is_locked(&vma->lock), vma);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+       mmap_assert_write_locked(vma->vm_mm);
+       /*
+        * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+        * mm->mm_lock_seq can't be concurrently modified.
+        */
+       VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline void vma_write_lock(struct vm_area_struct *vma) {}
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+               { return false; }
+static inline void vma_read_unlock(struct vm_area_struct *vma) {}
+static inline void vma_assert_locked(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
        static const struct vm_operations_struct dummy_vm_ops = {};
@@ -619,6 +697,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
        vma->vm_mm = mm;
        vma->vm_ops = &dummy_vm_ops;
        INIT_LIST_HEAD(&vma->anon_vma_chain);
+       vma_init_lock(vma);
 }
 
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
index 2b618d73ee262a5bd625991030564de620e1a82d..b73ea2a37d5c599c8a88c33a3298cb7b01f8f2e9 100644 (file)
@@ -467,6 +467,21 @@ struct vm_area_struct {
         */
        pgprot_t vm_page_prot;
 
+#ifdef CONFIG_PER_VMA_LOCK
+       long vm_lock_seq;
+
+       /*
+        * With default kernel config, lock's offset inside vm_area_struct is
+        * at 56, which causes its two hot fields 'count' and 'owner' to be in
+        * 2 different cachelines. When the lock is contended, both of the
+        * 2 fields will be accessed frequently and current layout will help
+        * to reduce cache bouncing.
+        *
+        * So please be careful with adding new fields before lock, which can
+        * push the 2 fields into one cacheline.
+        */
+       struct rw_semaphore lock;
+#endif
        /*
         * For areas with an address space and backing store,
         * linkage into the address_space->i_mmap interval tree.
@@ -593,6 +608,9 @@ struct mm_struct {
                                          * init_mm.mmlist, and are protected
                                          * by mmlist_lock
                                          */
+#ifdef CONFIG_PER_VMA_LOCK
+               int mm_lock_seq;
+#endif
 
 
                unsigned long hiwater_rss; /* High-watermark of RSS usage */
index e49ba91bb1f0f8a96278cca16107bb98a0a9850d..40facd4c398b542d5a23686f5f9502658db83259 100644 (file)
@@ -72,6 +72,17 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm)
        VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
 }
 
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_write_unlock_mm(struct mm_struct *mm)
+{
+       mmap_assert_write_locked(mm);
+       /* No races during update due to exclusive mmap_lock being held */
+       WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_write_unlock_mm(struct mm_struct *mm) {}
+#endif
+
 static inline void mmap_init_lock(struct mm_struct *mm)
 {
        init_rwsem(&mm->mmap_lock);
@@ -114,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
 static inline void mmap_write_unlock(struct mm_struct *mm)
 {
        __mmap_lock_trace_released(mm, true);
+       vma_write_unlock_mm(mm);
        up_write(&mm->mmap_lock);
 }
 
 static inline void mmap_write_downgrade(struct mm_struct *mm)
 {
        __mmap_lock_trace_acquire_returned(mm, false, true);
+       vma_write_unlock_mm(mm);
        downgrade_write(&mm->mmap_lock);
 }
 
index 0d353bcaef5ec4c5e8c28089747fff39ec8a3a72..e6d3c557d55f8433177e2517f1c099b3fc833f1e 100644 (file)
@@ -474,6 +474,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
                 */
                *new = data_race(*orig);
                INIT_LIST_HEAD(&new->anon_vma_chain);
+               vma_init_lock(new);
                dup_anon_vma_name(orig, new);
        }
        return new;
@@ -1130,6 +1131,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        seqcount_init(&mm->write_protect_seq);
        mmap_init_lock(mm);
        INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+       WRITE_ONCE(mm->mm_lock_seq, 0);
+#endif
        mm_pgtables_bytes_init(mm);
        mm->map_count = 0;
        mm->locked_vm = 0;
index c9327abb771c54be8ca69273d81cef0b5755e8c7..33269314e06017509dd39255ea910118bdc7e0a2 100644 (file)
@@ -37,6 +37,9 @@ struct mm_struct init_mm = {
        .page_table_lock =  __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
        .arg_lock       =  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
        .mmlist         = LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+       .mm_lock_seq    = 0,
+#endif
        .user_ns        = &init_user_ns,
        .cpu_bitmap     = CPU_BITS_NONE,
 #ifdef CONFIG_IOMMU_SVA