vmstat: add pcp remote node draining via cpu_vm_stats_fold
author     Marcelo Tosatti <mtosatti@redhat.com>
           Mon, 20 Mar 2023 18:03:45 +0000 (15:03 -0300)
committer  Andrew Morton <akpm@linux-foundation.org>
           Wed, 5 Apr 2023 23:02:32 +0000 (16:02 -0700)
Large NUMA systems might have significant portions of system memory
trapped in pcp queues.  The number of pcps is determined by the number
of processors and nodes in a system.  A system with 4 processors and 2
nodes has 8 pcps, which is okay.  But a system with 1024 processors and
512 nodes has 512k pcps, with a high potential for a large amount of
memory being caught in them.
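
For scale, a back-of-the-envelope bound (illustrative only; the pcp_high
value below is hypothetical, not taken from the patch): the worst case
grows with the product of CPUs, populated zones and the per-pcp high
watermark.

	/*
	 * Illustrative bound, not part of the patch.  Assumes one pcp
	 * per (cpu, populated zone) pair and a hypothetical pcp->high
	 * of 512 pages of 4 KiB each.
	 */
	static unsigned long max_pcp_pages(unsigned long nr_cpus,
					   unsigned long nr_zones,
					   unsigned long pcp_high)
	{
		return nr_cpus * nr_zones * pcp_high;
	}

	/* 1024 cpus * 512 zones * 512 pages * 4 KiB/page ~= 1 TiB worst case */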

Enable remote node draining for the CONFIG_HAVE_CMPXCHG_LOCAL case, where
vmstat_shepherd will perform the aging and draining via cpu_vm_stats_fold.
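
Folding a remote CPU's counters is only safe because, elsewhere in this
series (an assumption from its context, not visible in this diff), local
updaters move to cmpxchg-based updates of the per-CPU diffs under
CONFIG_HAVE_CMPXCHG_LOCAL, so the shepherd's xchg() cannot lose a
concurrent local update.  A minimal sketch of the folding step for one
counter:

	/*
	 * Sketch only: remote fold of one per-CPU counter.  Relies on
	 * local updates being done with this_cpu_cmpxchg() so that a
	 * remote xchg() is race-free.
	 */
	static void fold_one(struct zone *zone,
			     struct per_cpu_zonestat *pzstats,
			     enum zone_stat_item i)
	{
		int v = xchg(&pzstats->vm_stat_diff[i], 0);

		if (v)
			atomic_long_add(v, &zone->vm_stat[i]);
	}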

Link: https://lkml.kernel.org/r/20230320180745.858515310@redhat.com
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Aaron Tomlin <atomlin@atomlin.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: "Russell King (Oracle)" <linux@armlinux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/vmstat.h
mm/page_alloc.c
mm/vmstat.c

include/linux/vmstat.h
index fed855bae6d8e88956064f85f1260bca522bb5fb..9e36a34cb52ffd40fc1a75278f0d9191c2d65544 100644
@@ -297,7 +297,7 @@ extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_node_state(struct pglist_data *, enum node_stat_item);
 
 void quiet_vmstat(void);
-void cpu_vm_stats_fold(int cpu);
+void cpu_vm_stats_fold(int cpu, bool do_pagesets);
 void refresh_zone_stat_thresholds(void);
 
 struct ctl_table;
mm/page_alloc.c
index e365cefc6c56cf148a5601a51b1b62707ee8776c..7e99271e31dc4f4cd48f85e412989a29c41513ed 100644
@@ -8563,7 +8563,7 @@ static int page_alloc_cpu_dead(unsigned int cpu)
         * Zero the differential counters of the dead processor
         * so that the vm statistics are consistent.
         */
-       cpu_vm_stats_fold(cpu);
+       cpu_vm_stats_fold(cpu, false);
 
        for_each_populated_zone(zone)
                zone_pcp_update(zone, 0);
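
Passing false here is deliberate: in the hotplug-dead path the departing
CPU's pagesets are drained explicitly before the fold, so only the
counters need folding.  An abridged sketch of the callback (the real
function also drains LRU caches and more):

	/*
	 * Abridged sketch of page_alloc_cpu_dead(): drain_pages()
	 * (outside this hunk) has already emptied the dead CPU's
	 * pcps, so cpu_vm_stats_fold() can skip pageset handling.
	 */
	static int page_alloc_cpu_dead(unsigned int cpu)
	{
		struct zone *zone;

		drain_pages(cpu);		/* pcps emptied here */
		cpu_vm_stats_fold(cpu, false);	/* fold counters only */

		for_each_populated_zone(zone)
			zone_pcp_update(zone, 0);
		return 0;
	}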
mm/vmstat.c
index 9f1a8a162ff1ed61aa6bff5b294e080756ba3364..0a6d742322db04ba79a66870a90c9da7088a601a 100644
@@ -928,7 +928,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
  * There cannot be any access by the offline cpu and therefore
  * synchronization is simplified.
  */
-void cpu_vm_stats_fold(int cpu)
+void cpu_vm_stats_fold(int cpu, bool do_pagesets)
 {
        struct pglist_data *pgdat;
        struct zone *zone;
@@ -938,6 +938,9 @@ void cpu_vm_stats_fold(int cpu)
 
        for_each_populated_zone(zone) {
                struct per_cpu_zonestat *pzstats;
+#ifdef CONFIG_NUMA
+               struct per_cpu_pages *pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
+#endif
 
                pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
 
@@ -948,6 +951,11 @@ void cpu_vm_stats_fold(int cpu)
                                v = xchg(&pzstats->vm_stat_diff[i], 0);
                                atomic_long_add(v, &zone->vm_stat[i]);
                                global_zone_diff[i] += v;
+#ifdef CONFIG_NUMA
+                               /* 3 seconds idle till flush */
+                               if (do_pagesets)
+                                       pcp->expire = 3;
+#endif
                        }
                }
 #ifdef CONFIG_NUMA
@@ -959,6 +967,38 @@ void cpu_vm_stats_fold(int cpu)
                                zone_numa_event_add(v, zone, i);
                        }
                }
+
+               if (do_pagesets) {
+                       cond_resched();
+                       /*
+                        * Deal with draining the remote pageset of a
+                        * processor
+                        *
+                        * Check if there are pages remaining in this pageset
+                        * if not then there is nothing to expire.
+                        */
+                       if (!pcp->expire || !pcp->count)
+                               continue;
+
+                       /*
+                        * We never drain zones local to this processor.
+                        */
+                       if (zone_to_nid(zone) == cpu_to_node(cpu)) {
+                               pcp->expire = 0;
+                               continue;
+                       }
+
+                       WARN_ON(pcp->expire < 0);
+                       /*
+                        * pcp->expire is only accessed from vmstat_shepherd context,
+                        * therefore no locking is required.
+                        */
+                       if (--pcp->expire)
+                               continue;
+
+                       if (pcp->count)
+                               drain_zone_pages(zone, pcp);
+               }
 #endif
        }
 
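The expire field implements a small aging scheme: a pass that finds
counter activity re-arms it to 3, and a pass that finds the pcp idle
decrements it, so a remote drain fires only after roughly three
consecutive idle shepherd intervals.  A standalone model of that logic
(names local to this sketch):

	/*
	 * Illustrative model of the pcp->expire aging above.  Returns
	 * true when the pageset should be drained on this pass.
	 */
	static bool pcp_should_drain(int *expire, bool saw_activity,
				     int count)
	{
		if (saw_activity) {
			*expire = 3;	/* 3 idle passes until flush */
			return false;
		}
		if (!*expire || !count)
			return false;	/* nothing armed or queued */
		return --*expire == 0;	/* drain on the third idle pass */
	}
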
@@ -2066,7 +2106,7 @@ static int refresh_all_vm_stats(void)
 
        cpus_read_lock();
        for_each_online_cpu(cpu) {
-               cpu_vm_stats_fold(cpu);
+               cpu_vm_stats_fold(cpu, true);
                cond_resched();
        }
        cpus_read_unlock();
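
Note the cpus_read_lock()/cpus_read_unlock() pair around the loop: it
pins CPU hotplug so the set of online CPUs (and their pagesets) cannot
change while the shepherd walks them with do_pagesets enabled.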