From: Liam R. Howlett
Date: Fri, 20 Nov 2020 02:59:10 +0000 (-0500)
Subject: mm/mmap: Rewrite __do_munmap() to be more maple tree friendly
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=5cd3c54236e95b702e54021e7b5313dc76c96aad;p=users%2Fjedix%2Flinux-maple.git

mm/mmap: Rewrite __do_munmap() to be more maple tree friendly

Before going full-blown ma_state on __do_munmap(), do better on splits:
instead of cloning a temporary VMA, allocate one on the stack and
discard it once the unmap is done.  When the start and end adjustments
land on the same VMA, a duplication is unavoidable and is handled in
the slow path as before.

Signed-off-by: Liam R. Howlett
---

diff --git a/mm/mmap.c b/mm/mmap.c
index ca0f9ea5e73e..07dc215e2c04 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2562,44 +2562,6 @@ static void unmap_region(struct mm_struct *mm,
 	tlb_finish_mmu(&tlb, start, end);
 }
 
-/*
- * Create a list of vma's touched by the unmap, removing them from the mm's
- * vma list as we go..
- */
-static bool
-detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
-	struct vm_area_struct *prev, unsigned long end)
-{
-	struct vm_area_struct **insertion_point;
-	struct vm_area_struct *tail_vma = NULL;
-
-	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
-	vma->vm_prev = NULL;
-	vma_mt_szero(mm, vma->vm_start, end);
-	do {
-		mm->map_count--;
-		tail_vma = vma;
-		vma = vma->vm_next;
-	} while (vma && vma->vm_start < end);
-	*insertion_point = vma;
-	if (vma)
-		vma->vm_prev = prev;
-	else
-		mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
-	tail_vma->vm_next = NULL;
-
-	/*
-	 * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
-	 * VM_GROWSUP VMA. Such VMAs can change their size under
-	 * down_read(mmap_lock) and collide with the VMA we are about to unmap.
-	 */
-	if (vma && (vma->vm_flags & VM_GROWSDOWN))
-		return false;
-	if (prev && (prev->vm_flags & VM_GROWSUP))
-		return false;
-	return true;
-}
-
 /*
  * __split_vma() bypasses sysctl_max_map_count checking.  We use this where it
  * has already been checked or doesn't make sense to fail.
@@ -2693,6 +2655,80 @@ static inline void unlock_range(struct vm_area_struct *start, unsigned long limi
 		tmp = tmp->vm_next;
 	}
 }
+
+void vma_shorten(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, struct vm_area_struct *unmap)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long old_start = vma->vm_start;
+	unsigned long old_end = vma->vm_end;
+//	unsigned long old_pgoff = vma->vm_pgoff;
+	struct vm_area_struct *next = vma->vm_next;
+	struct address_space *mapping = NULL;
+	struct rb_root_cached *root = NULL;
+	struct anon_vma *anon_vma = NULL;
+	struct file *file = vma->vm_file;
+
+
+	vma_init(unmap, mm);
+	if (end != old_end) {	/* unmap the tail of the vma */
+		unmap->vm_start = end;
+		unmap->vm_end = old_end;
+		unmap->vm_next = vma->vm_next;
+		unmap->vm_prev = vma;
+	} else {		/* unmap the head of the vma */
+		unmap->vm_start = old_start;
+		unmap->vm_end = start;
+		unmap->vm_next = vma;
+		unmap->vm_prev = vma->vm_prev;
+	}
+
+	vma_adjust_trans_huge(vma, vma->vm_start, end, 0);
+
+	if (file) {
+		mapping = file->f_mapping;
+		root = &mapping->i_mmap;
+		uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+
+		i_mmap_lock_write(mapping);
+	}
+
+	anon_vma = vma->anon_vma;
+	if (anon_vma) {
+		anon_vma_lock_write(anon_vma);
+		anon_vma_interval_tree_pre_update_vma(vma);
+	}
+
+	if (file) {
+		flush_dcache_mmap_lock(mapping);
+		vma_interval_tree_remove(vma, root);
+	}
+
+	if (end == old_end)
+		vma->vm_pgoff += (old_start - start) >> PAGE_SHIFT;
+
+	vma->vm_end = end;
+	vma->vm_start = start;
+
+	if (file) {
+		vma_interval_tree_insert(vma, root);
+		flush_dcache_mmap_unlock(mapping);
+	}
+
+	if (!next) {
+		mm->highest_vm_end = vm_end_gap(vma);
+	}
+
+	if (anon_vma) {
+		anon_vma_interval_tree_post_update_vma(vma);
+		anon_vma_unlock_write(anon_vma);
+	}
+
+	if (file) {
+		i_mmap_unlock_write(mapping);
+		uprobe_mmap(vma);
+	}
+}
 /* Munmap is split into 2 main parts -- this part which finds
  * what needs doing, and the areas themselves, which do the
  * work.  This now handles partial unmappings.
@@ -2702,7 +2738,11 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
 		struct list_head *uf, bool downgrade)
 {
 	unsigned long end;
-	struct vm_area_struct *vma, *prev, *last;
+	struct vm_area_struct *vma, *next, *prev, *last;
+	struct vm_area_struct start_split, end_split;
+	int map_count = 0;
+	//MA_STATE(mas, &mm->mm_mt, start, start);
+
 	if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
 		return -EINVAL;
@@ -2720,80 +2760,123 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
 	if (!vma)
 		return 0;
 
-	/* we have start < vma->vm_end */
-
-	/*
-	 * If we need to split any vma, do it now to save pain later.
-	 *
-	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
-	 * unmapped vm_area_struct will remain in use: so lower split_vma
-	 * places tmp vma above, and higher split_vma places tmp vma below.
-	 */
-	if (start > vma->vm_start) {
-		int error;
-		/*
-		 * Make sure that map_count on return from munmap() will
-		 * not exceed its limit; but let map_count go just above
-		 * its limit temporarily, to help free resources as expected.
-		 */
-		if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
-			return -ENOMEM;
-
-		error = __split_vma(mm, vma, start, 0);
+	if (unlikely(uf)) {
+		int error = userfaultfd_unmap_prep(vma, start, end, uf);
 		if (error)
 			return error;
-		prev = vma;
-		vma = vma_next(mm, prev);
+	}
+
+	if (start > vma->vm_start) {
+		if (unlikely(vma->vm_end > end)) {
+			/* Adjusting the same vma twice requires a split. */
+			int error = __split_vma(mm, vma, start, 0);
+			if (error)
+				return error;
+			prev = vma;
+			vma = vma_next(mm, prev);
+		} else {
+			vma_shorten(vma, vma->vm_start, start, &start_split);
+			prev = vma;
+			vma = &start_split;
+			map_count--;
+		}
 	} else {
 		prev = vma->vm_prev;
 	}
 
-	if (vma->vm_end >= end)
+	if (vma->vm_end >= end)	/* almost always the case */
 		last = vma;
 	else
 		last = find_vma_intersection(mm, end - 1, end);
 
 	/* Does it split the last one? */
 	if (last && end < last->vm_end) {
-		int error = __split_vma(mm, last, end, 1);
-		if (error)
-			return error;
-		vma = vma_next(mm, prev);
+		vma_shorten(last, end, last->vm_end, &end_split);
+		if (last == vma)
+			vma = &end_split;
+
+		/* map_count will count the existing vma in this case. */
+		map_count--;
+		last = &end_split;
 	}
 
-	if (unlikely(uf)) {
-		/*
-		 * If userfaultfd_unmap_prep returns an error the vmas
-		 * will remain splitted, but userland will get a
-		 * highly unexpected error anyway. This is no
-		 * different than the case where the first of the two
-		 * __split_vma fails, but we don't undo the first
-		 * split, despite we could. This is unlikely enough
-		 * failure that it's not worth optimizing it for.
-		 */
-		int error = userfaultfd_unmap_prep(vma, start, end, uf);
-		if (error)
-			return error;
-	}
-
 	/*
 	 * unlock any mlock()ed ranges before detaching vmas
 	 */
-	if (mm->locked_vm)
-		unlock_range(vma, end);
+	next = vma;
+	while (next && next->vm_start < end) {
+		map_count++;
+		if (next->vm_flags & VM_LOCKED) {
+			mm->locked_vm -= vma_pages(next);
+			munlock_vma_pages_all(next);
+		}
+
+		next = next->vm_next;
+	}
+	//printk("tmp %px, map count is %d\n", tmp, map_count);
+
 	/* Detach vmas from the MM linked list and remove from the mm tree*/
-	if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
+	vma->vm_prev = NULL;
+	if (prev)
+		prev->vm_next = next;
+	else
+		mm->mmap = next;
+
+	if (next)
+		next->vm_prev = prev;
+	else
+		mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
+
+	if (!last)
+		last = vma;
+
+	last->vm_next = NULL;
+	mm->map_count -= map_count;
+	vma_mt_szero(mm, start, end);
+
+	if (next && (next->vm_flags & VM_GROWSDOWN))
 		downgrade = false;
+	else if (prev && (prev->vm_flags & VM_GROWSUP))
+		downgrade = false;
+
 	if (downgrade)
 		mmap_write_downgrade(mm);
 
+	/* vma -> last is a separate linked list.  Add start_split and end_split
+	 * if necessary. */
 	unmap_region(mm, vma, prev, start, end);
 
+	if (vma == &start_split) {
+		if (vma->vm_flags & VM_ACCOUNT) {
+			long nrpages = vma_pages(vma);
+
+			vm_stat_account(mm, vma->vm_flags, -nrpages);
+			vm_unacct_memory(nrpages);
+		}
+		vma = vma->vm_next;
+	}
+
+	/* Clean up accounting. */
+	if (last == &end_split) {
+		if (last->vm_flags & VM_ACCOUNT) {
+			long nrpages = vma_pages(last);
+			vm_stat_account(mm, last->vm_flags, -nrpages);
+			vm_unacct_memory(nrpages);
+		}
+
+		if (last->vm_prev)
+			last->vm_prev->vm_next = NULL;
+		if (vma == last)
+			vma = NULL;
+	}
+
 	/* Fix up all other VM information */
-	remove_vma_list(mm, vma);
+	if (vma)
+		remove_vma_list(mm, vma);
 
 	return downgrade ? 1 : 0;
 }
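
For readers who want the idea behind vma_shorten() in isolation, here is a
minimal userspace sketch (illustration only, not part of the patch): a range
descriptor is trimmed in place and the removed piece is described by a
caller-provided, stack-allocated struct, instead of allocating a clone the
way __split_vma() does.  The struct region type and region_shorten() helper
below are hypothetical stand-ins for vm_area_struct and vma_shorten().

#include <assert.h>
#include <stdio.h>

struct region {				/* hypothetical stand-in for vm_area_struct */
	unsigned long start;
	unsigned long end;
};

/*
 * Trim @r to [start, end) and fill @removed with the piece that was cut
 * off, mirroring how vma_shorten() fills the caller's on-stack "unmap" vma.
 */
static void region_shorten(struct region *r, unsigned long start,
			   unsigned long end, struct region *removed)
{
	if (end != r->end) {		/* keep the head, cut off the tail */
		removed->start = end;
		removed->end = r->end;
	} else {			/* keep the tail, cut off the head */
		removed->start = r->start;
		removed->end = start;
	}
	r->start = start;
	r->end = end;
}

int main(void)
{
	struct region vma = { 0x1000, 0x9000 };
	struct region removed;		/* lives on the stack, like start_split */

	/* Unmap the tail [0x5000, 0x9000): no allocation is needed. */
	region_shorten(&vma, vma.start, 0x5000, &removed);

	printf("kept    [%#lx, %#lx)\n", vma.start, vma.end);
	printf("removed [%#lx, %#lx)\n", removed.start, removed.end);
	assert(vma.end == removed.start);
	return 0;
}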