mm: batch unlink_file_vma calls in free_pgd_range
author    Mateusz Guzik <mjguzik@gmail.com>
          Tue, 21 May 2024 23:43:21 +0000 (01:43 +0200)
committer Andrew Morton <akpm@linux-foundation.org>
          Thu, 4 Jul 2024 02:29:58 +0000 (19:29 -0700)
Execs of dynamically linked binaries at 20-ish cores are bottlenecked
on the i_mmap_rwsem semaphore; the biggest single contributor is
free_pgd_range, which acquires the lock back-to-back for all
consecutive mappings of a given file.

Tracing the count of said acquires while building the kernel shows:
[1, 2)     799579 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[2, 3)          0 |                                                    |
[3, 4)       3009 |                                                    |
[4, 5)       3009 |                                                    |
[5, 6)     326442 |@@@@@@@@@@@@@@@@@@@@@                               |

So in particular there were 326442 opportunities to coalesce 5 acquires
into 1.

Doing so increases execs per second by 4% (~50k to ~52k) when running
the benchmark linked below.

The lock remains the main bottleneck; I have not looked at other spots
yet.
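
To make the coalescing concrete, here is a hypothetical userspace
analogue of the scheme (pthread rwlocks stand in for i_mmap_rwsem and
every name is invented for illustration; this is a sketch of the idea,
not the patch itself):

#include <pthread.h>

/* "item" stands in for a file-backed vma, its lock for the
 * i_mmap_rwsem of the file's mapping. */
struct item {
	pthread_rwlock_t *lock;
};

struct batch {
	int count;		/* starts at 0 */
	struct item *items[8];	/* same fan-out as the patch below */
};

static void batch_flush(struct batch *b)
{
	/* one lock round-trip covers the whole batch */
	pthread_rwlock_wrlock(b->items[0]->lock);
	for (int i = 0; i < b->count; i++) {
		/* per-item work that previously re-took the lock each time */
	}
	pthread_rwlock_unlock(b->items[0]->lock);
	b->count = 0;
}

static void batch_add(struct batch *b, struct item *it)
{
	/* flush early if the lock changes or the array fills up */
	if ((b->count > 0 && b->items[0]->lock != it->lock) ||
	    b->count == 8)
		batch_flush(b);
	b->items[b->count++] = it;
}

static void batch_final(struct batch *b)
{
	if (b->count > 0)
		batch_flush(b);
}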

Bench can be found here:
http://apollo.backplane.com/DFlyMisc/doexec.c

$ cc -O2 -o shared-doexec doexec.c
$ ./shared-doexec $(nproc)

Note this particular test makes sure the binaries are separate while
the loader is shared.

Stats were collected on the patched kernel (with
unlink_file_vma_batch_process marked "noinline" so the kprobe can
attach) using:
bpftrace -e 'kprobe:unlink_file_vma_batch_process
{ @ = lhist(((struct unlink_vma_file_batch *)arg0)->count, 0, 8, 1); }'

Link: https://lkml.kernel.org/r/20240521234321.359501-1-mjguzik@gmail.com
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
mm/internal.h
mm/memory.c
mm/mmap.c

diff --git a/mm/internal.h b/mm/internal.h
index 6902b7dd85091c7f804a4b675cd4075f3d7f955d..774bf23b030c99a399a6a6ecc41cd19686c7dc3c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1515,4 +1515,13 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
 void workingset_update_node(struct xa_node *node);
 extern struct list_lru shadow_nodes;
 
+struct unlink_vma_file_batch {
+       int count;
+       struct vm_area_struct *vmas[8];
+};
+
+void unlink_file_vma_batch_init(struct unlink_vma_file_batch *);
+void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *);
+void unlink_file_vma_batch_final(struct unlink_vma_file_batch *);
+
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/memory.c b/mm/memory.c
index edf2004fcdf97dd3b22d73f592e37abf81f65dcf..1411edbb55d052e7a8afa810afe66fe4978f4086 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -365,6 +365,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                   struct vm_area_struct *vma, unsigned long floor,
                   unsigned long ceiling, bool mm_wr_locked)
 {
+       struct unlink_vma_file_batch vb;
+
        do {
                unsigned long addr = vma->vm_start;
                struct vm_area_struct *next;
@@ -384,12 +386,15 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                if (mm_wr_locked)
                        vma_start_write(vma);
                unlink_anon_vmas(vma);
-               unlink_file_vma(vma);
 
                if (is_vm_hugetlb_page(vma)) {
+                       unlink_file_vma(vma);
                        hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
                                floor, next ? next->vm_start : ceiling);
                } else {
+                       unlink_file_vma_batch_init(&vb);
+                       unlink_file_vma_batch_add(&vb, vma);
+
                        /*
                         * Optimization: gather nearby vmas into one call down
                         */
@@ -402,8 +407,9 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
                                if (mm_wr_locked)
                                        vma_start_write(vma);
                                unlink_anon_vmas(vma);
-                               unlink_file_vma(vma);
+                               unlink_file_vma_batch_add(&vb, vma);
                        }
+                       unlink_file_vma_batch_final(&vb);
                        free_pgd_range(tlb, addr, vma->vm_end,
                                floor, next ? next->vm_start : ceiling);
                }
diff --git a/mm/mmap.c b/mm/mmap.c
index 83b4682ec85cfa0398ab0e50f9c4c8f35f993d90..e42d89f98071ea82602ef38736537038b97399ba 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -131,6 +131,47 @@ void unlink_file_vma(struct vm_area_struct *vma)
        }
 }
 
+void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
+{
+       vb->count = 0;
+}
+
+static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
+{
+       struct address_space *mapping;
+       int i;
+
+       mapping = vb->vmas[0]->vm_file->f_mapping;
+       i_mmap_lock_write(mapping);
+       for (i = 0; i < vb->count; i++) {
+               VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
+               __remove_shared_vm_struct(vb->vmas[i], mapping);
+       }
+       i_mmap_unlock_write(mapping);
+
+       unlink_file_vma_batch_init(vb);
+}
+
+void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
+                              struct vm_area_struct *vma)
+{
+       if (vma->vm_file == NULL)
+               return;
+
+       if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
+           vb->count == ARRAY_SIZE(vb->vmas))
+               unlink_file_vma_batch_process(vb);
+
+       vb->vmas[vb->count] = vma;
+       vb->count++;
+}
+
+void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
+{
+       if (vb->count > 0)
+               unlink_file_vma_batch_process(vb);
+}
+
 /*
 * Close a vm structure and free it.
 */
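
The intended call pattern, mirroring the free_pgtables() hunk above:
init once, add each vma in the walk, then final to flush the remainder.
A batch also flushes itself whenever vm_file changes or the 8-slot
vmas[] array fills, so callers never track file boundaries themselves.
A minimal caller sketch (the vma walk is only illustrative):

struct unlink_vma_file_batch vb;

unlink_file_vma_batch_init(&vb);
for_each_vma(vmi, vma)			/* any walk over a range of vmas */
	unlink_file_vma_batch_add(&vb, vma);	/* NULL vm_file is skipped internally */
unlink_file_vma_batch_final(&vb);	/* flush whatever is still queued */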
  */