From 4b39c4548be44c57e6c80eaca7b43d2a662390c1 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti
Date: Mon, 20 Mar 2023 15:03:45 -0300
Subject: [PATCH] vmstat: add pcp remote node draining via cpu_vm_stats_fold

Large NUMA systems might have significant portions of system memory
trapped in pcp queues.  The number of pcps is determined by the number
of processors and nodes in a system.  A system with 4 processors and
2 nodes has 8 pcps, which is okay.  But a system with 1024 processors
and 512 nodes has 512k pcps, with a high potential for a large amount
of memory being caught in them.

Enable remote node draining for the CONFIG_HAVE_CMPXCHG_LOCAL case,
where vmstat_shepherd will perform the aging and draining via
cpu_vm_stats_fold.

Link: https://lkml.kernel.org/r/20230320180745.858515310@redhat.com
Signed-off-by: Marcelo Tosatti
Suggested-by: Vlastimil Babka
Cc: Aaron Tomlin
Cc: Christoph Lameter
Cc: Frederic Weisbecker
Cc: Heiko Carstens
Cc: Huacai Chen
Cc: Michal Hocko
Cc: Peter Xu
Cc: "Russell King (Oracle)"
Signed-off-by: Andrew Morton
---
 include/linux/vmstat.h |  2 +-
 mm/page_alloc.c        |  2 +-
 mm/vmstat.c            | 44 ++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index fed855bae6d8..9e36a34cb52f 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -297,7 +297,7 @@ extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_node_state(struct pglist_data *, enum node_stat_item);
 
 void quiet_vmstat(void);
-void cpu_vm_stats_fold(int cpu);
+void cpu_vm_stats_fold(int cpu, bool do_pagesets);
 void refresh_zone_stat_thresholds(void);
 
 struct ctl_table;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e365cefc6c56..7e99271e31dc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8563,7 +8563,7 @@ static int page_alloc_cpu_dead(unsigned int cpu)
 	 * Zero the differential counters of the dead processor
 	 * so that the vm statistics are consistent.
 	 */
-	cpu_vm_stats_fold(cpu);
+	cpu_vm_stats_fold(cpu, false);
 
 	for_each_populated_zone(zone)
 		zone_pcp_update(zone, 0);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9f1a8a162ff1..0a6d742322db 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -928,7 +928,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
  * There cannot be any access by the offline cpu and therefore
  * synchronization is simplified.
  */
-void cpu_vm_stats_fold(int cpu)
+void cpu_vm_stats_fold(int cpu, bool do_pagesets)
 {
 	struct pglist_data *pgdat;
 	struct zone *zone;
@@ -938,6 +938,9 @@ void cpu_vm_stats_fold(int cpu)
 
 	for_each_populated_zone(zone) {
 		struct per_cpu_zonestat *pzstats;
+#ifdef CONFIG_NUMA
+		struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+#endif
 
 		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
 
@@ -948,6 +951,11 @@ void cpu_vm_stats_fold(int cpu)
 				v = xchg(&pzstats->vm_stat_diff[i], 0);
 				atomic_long_add(v, &zone->vm_stat[i]);
 				global_zone_diff[i] += v;
+#ifdef CONFIG_NUMA
+				/* 3 seconds idle till flush */
+				if (do_pagesets)
+					pcp->expire = 3;
+#endif
 			}
 		}
 #ifdef CONFIG_NUMA
@@ -959,6 +967,38 @@ void cpu_vm_stats_fold(int cpu)
 			if (pzstats->vm_numa_event[i]) {
 				v = xchg(&pzstats->vm_numa_event[i], 0);
 				zone_numa_event_add(v, zone, i);
 			}
 		}
+
+		if (do_pagesets) {
+			cond_resched();
+			/*
+			 * Deal with draining the remote pageset of a
+			 * processor
+			 *
+			 * Check if there are pages remaining in this pageset
+			 * if not then there is nothing to expire.
+			 */
+			if (!pcp->expire || !pcp->count)
+				continue;
+
+			/*
+			 * We never drain zones local to this processor.
+			 */
+			if (zone_to_nid(zone) == cpu_to_node(cpu)) {
+				pcp->expire = 0;
+				continue;
+			}
+
+			WARN_ON(pcp->expire < 0);
+			/*
+			 * pcp->expire is only accessed from vmstat_shepherd context,
+			 * therefore no locking is required.
+			 */
+			if (--pcp->expire)
+				continue;
+
+			if (pcp->count)
+				drain_zone_pages(zone, pcp);
+		}
 #endif
 	}
 
@@ -2066,7 +2106,7 @@ static int refresh_all_vm_stats(void)
 
 	cpus_read_lock();
 	for_each_online_cpu(cpu) {
-		cpu_vm_stats_fold(cpu);
+		cpu_vm_stats_fold(cpu, true);
 		cond_resched();
 	}
 	cpus_read_unlock();
-- 
2.50.1
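
A note on the expire aging protocol above: vmstat_shepherd runs roughly
once per second (sysctl_stat_interval defaults to HZ), so setting
pcp->expire to 3 yields the "3 seconds idle till flush" behavior noted
in the comment. Below is a minimal userspace sketch of that protocol,
not kernel code: struct pcp_sim and shepherd_round() are hypothetical
stand-ins for struct per_cpu_pages and the shepherd's call into
cpu_vm_stats_fold(), and the local-node check and the actual
drain_zone_pages() call are elided.

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-in for struct per_cpu_pages, keeping only the
 * two fields the draining logic above cares about. */
struct pcp_sim {
	int count;	/* pages queued in the remote pcp */
	int expire;	/* shepherd rounds left until a drain */
};

/*
 * One shepherd pass over one remote pcp, mirroring the order of
 * operations in cpu_vm_stats_fold(): recent stat activity re-arms
 * the timer to 3, then the same pass ages it by one.
 */
static void shepherd_round(struct pcp_sim *pcp, bool had_stat_activity)
{
	if (had_stat_activity)
		pcp->expire = 3;	/* "3 seconds idle till flush" */

	if (!pcp->expire || !pcp->count)
		return;			/* nothing queued or nothing armed */

	if (--pcp->expire)
		return;			/* still aging; retry next pass */

	printf("drain %d pages\n", pcp->count);	/* drain_zone_pages() here */
	pcp->count = 0;
}

int main(void)
{
	struct pcp_sim pcp = { .count = 42, .expire = 0 };

	shepherd_round(&pcp, true);	/* activity: armed to 3, aged to 2 */
	shepherd_round(&pcp, false);	/* idle pass: aged to 1 */
	shepherd_round(&pcp, false);	/* idle pass: aged to 0 -> drain */
	return 0;
}

The net effect, as in the patch, is that a remote pcp is drained only
after three consecutive shepherd passes in which its zone saw no
counter updates; pagesets that stay busy are left alone.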