unsigned long addr);
};
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_init_lock(struct vm_area_struct *vma)
+{
+ init_rwsem(&vma->lock);
+ vma->vm_lock_seq = -1;
+}
+
+static inline void vma_write_lock(struct vm_area_struct *vma)
+{
+ int mm_lock_seq;
+
+ mmap_assert_write_locked(vma->vm_mm);
+
+ /*
+ * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+ * mm->mm_lock_seq can't be concurrently modified.
+ */
+ mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);
+ if (vma->vm_lock_seq == mm_lock_seq)
+ return;
+
+ down_write(&vma->lock);
+ vma->vm_lock_seq = mm_lock_seq;
+ up_write(&vma->lock);
+}
+
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+{
+ if (unlikely(down_read_trylock(&vma->lock) == 0))
+ return false;
+
+ /*
+ * Overflow might produce false locked result but it's not critical
+ * because we just fall back to using mmap_lock in such case.
+ * False unlocked result is critical but is impossible because we
+ * modify and check vma->vm_lock_seq under vma->lock protection and
+ * mm->mm_lock_seq modification invalidates all existing locks.
+ */
+ if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) {
+ up_read(&vma->lock);
+ return false;
+ }
+ return true;
+}
+
+static inline void vma_read_unlock(struct vm_area_struct *vma)
+{
+ up_read(&vma->lock);
+}
+
+static inline void vma_assert_locked(struct vm_area_struct *vma)
+{
+ lockdep_assert_held(&vma->lock);
+ VM_BUG_ON_VMA(!rwsem_is_locked(&vma->lock), vma);
+}
+
+static inline void vma_assert_write_locked(struct vm_area_struct *vma)
+{
+ mmap_assert_write_locked(vma->vm_mm);
+ /*
+ * current task is holding mmap_write_lock, both vma->vm_lock_seq and
+ * mm->mm_lock_seq can't be concurrently modified.
+ */
+ VM_BUG_ON_VMA(vma->vm_lock_seq != READ_ONCE(vma->vm_mm->mm_lock_seq), vma);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+static inline void vma_init_lock(struct vm_area_struct *vma) {}
+static inline void vma_write_lock(struct vm_area_struct *vma) {}
+static inline bool vma_read_trylock(struct vm_area_struct *vma)
+ { return false; }
+static inline void vma_read_unlock(struct vm_area_struct *vma) {}
+static inline void vma_assert_locked(struct vm_area_struct *vma) {}
+static inline void vma_assert_write_locked(struct vm_area_struct *vma) {}
+
+#endif /* CONFIG_PER_VMA_LOCK */
+
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{
static const struct vm_operations_struct dummy_vm_ops = {};
vma->vm_mm = mm;
vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain);
+ vma_init_lock(vma);
}
static inline void vma_set_anonymous(struct vm_area_struct *vma)
*/
pgprot_t vm_page_prot;
+#ifdef CONFIG_PER_VMA_LOCK
+ long vm_lock_seq;
+
+ /*
+ * With default kernel config, lock's offset inside vm_area_struct is
+ * at 56, which causes its two hot fields 'count' and 'owner' to be in
+ * 2 different cachelines. When the lock is contended, both of the
+ * 2 fields will be accessed frequently and current layout will help
+ * to reduce cache bouncing.
+ *
+ * So please be careful with adding new fields before lock, which can
+ * push the 2 fields into one cacheline.
+ */
+ struct rw_semaphore lock;
+#endif
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
* init_mm.mmlist, and are protected
* by mmlist_lock
*/
+#ifdef CONFIG_PER_VMA_LOCK
+ int mm_lock_seq;
+#endif
unsigned long hiwater_rss; /* High-watermark of RSS usage */
VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm);
}
+#ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_write_unlock_mm(struct mm_struct *mm)
+{
+ mmap_assert_write_locked(mm);
+ /* No races during update due to exclusive mmap_lock being held */
+ WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
+}
+#else
+static inline void vma_write_unlock_mm(struct mm_struct *mm) {}
+#endif
+
static inline void mmap_init_lock(struct mm_struct *mm)
{
init_rwsem(&mm->mmap_lock);
static inline void mmap_write_unlock(struct mm_struct *mm)
{
__mmap_lock_trace_released(mm, true);
+ vma_write_unlock_mm(mm);
up_write(&mm->mmap_lock);
}
static inline void mmap_write_downgrade(struct mm_struct *mm)
{
__mmap_lock_trace_acquire_returned(mm, false, true);
+ vma_write_unlock_mm(mm);
downgrade_write(&mm->mmap_lock);
}
*/
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
+ vma_init_lock(new);
dup_anon_vma_name(orig, new);
}
return new;
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
+#ifdef CONFIG_PER_VMA_LOCK
+ WRITE_ONCE(mm->mm_lock_seq, 0);
+#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
+#ifdef CONFIG_PER_VMA_LOCK
+ .mm_lock_seq = 0,
+#endif
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
#ifdef CONFIG_IOMMU_SVA