Patch series "shoot lazy tlbs", v4.
On a 16-socket 192-core POWER8 system, a context switching benchmark with
as many software threads as CPUs (so each switch will go in and out of
idle), upstream can achieve a rate of about 1 million context switches per
second. After this series it goes up to 118 million.
This patch (of 4):
Add explicit _lazy_tlb annotated functions for lazy mm refcounting. This
makes lazy mm references more obvious, and allows the explicit refcounting
to be removed later on configurations that do not need it.
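For orientation, a condensed sketch (mine, not part of the diff) of the
call-site convention this introduces; with this patch the helpers simply
forward to mmgrab()/mmdrop(), the point is the annotation:

	/* sketch: kernel thread keeping a user mm alive only via ->active_mm */
	tsk->active_mm = mm;		/* tsk->mm stays NULL */
	mmgrab_lazy_tlb(mm);		/* pin mm_count for the lazy tlb reference */

	/* ... later, when the lazy tlb mm is switched away ... */
	mmdrop_lazy_tlb(mm);		/* release the lazy tlb reference */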
If a kernel thread's current lazy tlb mm happens to be the one it wants to
use, kthread_use_mm() cleverly transfers the mm refcount from the lazy tlb
reference to the normal reference it would otherwise have to take. Once a
lazy tlb reference is no longer identical to a normal reference, that trick
does not work, so kthread_use_mm() and kthread_unuse_mm() are changed to
account for the two kinds of reference explicitly.
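A condensed sketch (mine; locking, membarrier and architecture hooks are
elided) of how the two references are handled after this change:

	void kthread_use_mm(struct mm_struct *mm)
	{
		struct mm_struct *active_mm;

		mmgrab(mm);			/* normal reference for tsk->mm */

		active_mm = current->active_mm;
		if (active_mm != mm)
			current->active_mm = mm;
		current->mm = mm;
		switch_mm_irqs_off(active_mm, mm, current);

		if (active_mm != mm)
			mmdrop_lazy_tlb(active_mm);	/* drop the old lazy tlb reference */
		else
			smp_mb();			/* ordering otherwise provided by the drop */
	}

	void kthread_unuse_mm(struct mm_struct *mm)
	{
		current->mm = NULL;
		mmgrab_lazy_tlb(mm);		/* mm stays around as the lazy tlb mm */
		enter_lazy_tlb(mm, current);

		mmdrop(mm);			/* drop the normal reference from kthread_use_mm() */
	}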
Link: https://lkml.kernel.org/r/20210605014216.446867-1-npiggin@gmail.com
Link: https://lkml.kernel.org/r/20210605014216.446867-2-npiggin@gmail.com
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@ozlabs.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Anton Blanchard <anton@ozlabs.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
current->mm = mm;
current->active_mm = mm;
activate_mm(active_mm, mm);
- mmdrop(active_mm);
+ mmdrop_lazy_tlb(active_mm);
ecard_init_pgtables(mm);
return 0;
}
if (IS_ENABLED(CONFIG_PPC32))
setup_kup();
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
smp_store_cpu_info(cpu);
if (current->active_mm == mm) {
WARN_ON_ONCE(current->mm != NULL);
/* Is a kernel thread and is using mm as the lazy tlb */
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
current->active_mm = &init_mm;
switch_mm_irqs_off(mm, &init_mm, current);
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
}
/*
setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm);
mm_update_next_owner(old_mm);
mmput(old_mm);
- return 0;
+ } else {
+ mmdrop_lazy_tlb(active_mm);
}
- mmdrop(active_mm);
return 0;
}
__mmdrop(mm);
}
+/* Helpers for lazy TLB mm refcounting */
+static inline void mmgrab_lazy_tlb(struct mm_struct *mm)
+{
+ mmgrab(mm);
+}
+
+static inline void mmdrop_lazy_tlb(struct mm_struct *mm)
+{
+ mmdrop(mm);
+}
+
/**
* mmget() - Pin the address space associated with a &struct mm_struct.
* @mm: The address space to pin.
*/
if (mm != &init_mm)
idle->active_mm = &init_mm;
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
return 0;
}
__set_current_state(TASK_RUNNING);
mmap_read_lock(mm);
}
- mmgrab(mm);
+ mmgrab_lazy_tlb(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD));
WARN_ON_ONCE(tsk->mm);
+ mmgrab(mm);
+
task_lock(tsk);
/* Hold off tlb flush IPIs while switching mm's */
local_irq_disable();
active_mm = tsk->active_mm;
- if (active_mm != mm) {
- mmgrab(mm);
+ if (active_mm != mm)
tsk->active_mm = mm;
- }
tsk->mm = mm;
membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
- * mmdrop(), or explicitly with smp_mb().
+ * mmdrop_lazy_tlb(), or explicitly with smp_mb().
*/
if (active_mm != mm)
- mmdrop(active_mm);
+ mmdrop_lazy_tlb(active_mm);
else
smp_mb();
local_irq_disable();
tsk->mm = NULL;
membarrier_update_current_mm(NULL);
+ mmgrab_lazy_tlb(mm);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
local_irq_enable();
task_unlock(tsk);
+
+ mmdrop(mm);
}
EXPORT_SYMBOL_GPL(kthread_unuse_mm);
* rq->curr, before returning to userspace, so provide them here:
*
* - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
- * provided by mmdrop(),
+ * provided by mmdrop_lazy_tlb(),
* - a sync_core for SYNC_CORE.
*/
if (mm) {
membarrier_mm_sync_core_before_usermode(mm);
- mmdrop(mm);
+ mmdrop_lazy_tlb(mm);
}
+
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
/*
* kernel -> kernel lazy + transfer active
- * user -> kernel lazy + mmgrab() active
+ * user -> kernel lazy + mmgrab_lazy_tlb() active
*
- * kernel -> user switch + mmdrop() active
+ * kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
*/
if (!next->mm) { // to kernel
next->active_mm = prev->active_mm;
if (prev->mm) // from user
- mmgrab(prev->active_mm);
+ mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
} else { // to user
switch_mm_irqs_off(prev->active_mm, next->mm, next);
if (!prev->mm) { // from kernel
- /* will mmdrop() in finish_task_switch(). */
+ /* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
/*
* The boot idle thread does lazy MMU switching as well:
*/
- mmgrab(&init_mm);
+ mmgrab_lazy_tlb(&init_mm);
enter_lazy_tlb(&init_mm, current);
/*