mm/vmstat: switch counter modification to cmpxchg
author     Marcelo Tosatti <mtosatti@redhat.com>
           Mon, 20 Mar 2023 18:03:40 +0000 (15:03 -0300)
committer  Andrew Morton <akpm@linux-foundation.org>
           Wed, 5 Apr 2023 23:02:31 +0000 (16:02 -0700)

In preparation for switching vmstat shepherd to flush per-CPU counters
remotely, switch the __{mod,inc,dec} functions that modify the counters to
use cmpxchg.
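
At its core, the new "__" versions use a compare-and-exchange retry loop instead
of relying on interrupts or preemption being disabled: read the per-CPU
differential, compute the new value, publish it only if no concurrent update
slipped in, and fold anything beyond the stat threshold into the global counter.
A minimal, self-contained userspace sketch of that pattern (using C11 atomics
rather than the kernel's this_cpu_cmpxchg(), with made-up names such as
mod_state, cpu_diff and threshold) is:

        #include <stdatomic.h>
        #include <stdio.h>
        #include <stdlib.h>

        static _Atomic long global_count;      /* stands in for the zone/node counter */
        static _Atomic long cpu_diff;          /* stands in for the per-CPU differential */
        static const long threshold = 32;      /* stands in for pcp->stat_threshold */

        static void mod_state(long delta, int overstep_mode)
        {
                long o, n, z;

                do {
                        z = 0;                  /* overflow destined for the global counter */
                        o = atomic_load(&cpu_diff);
                        n = o + delta;
                        if (labs(n) > threshold) {
                                long os = overstep_mode * (threshold >> 1);

                                z = n + os;     /* push the excess to the global counter */
                                n = -os;
                        }
                        /* retry if another update raced in since the load */
                } while (!atomic_compare_exchange_strong(&cpu_diff, &o, n));

                if (z)
                        atomic_fetch_add(&global_count, z);
        }

        int main(void)
        {
                int i;

                for (i = 0; i < 1000; i++)
                        mod_state(1, 1);
                printf("global=%ld diff=%ld\n", atomic_load(&global_count),
                       atomic_load(&cpu_diff));
                return 0;
        }

The common case touches only the CPU-local differential; the shared counter is
written once per threshold's worth of events, which keeps the scheme cheap
despite the retry loop.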

To facilitate review, the functions are ordered in the file as follows:

__{mod,inc,dec}_{zone,node}_page_state
#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
{mod,inc,dec}_{zone,node}_page_state
#else
{mod,inc,dec}_{zone,node}_page_state
#endif

This patch defines the __ versions for the
CONFIG_HAVE_CMPXCHG_LOCAL case to be their non-"__" counterparts:

#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
{mod,inc,dec}_{zone,node}_page_state
__{mod,inc,dec}_{zone,node}_page_state = {mod,inc,dec}_{zone,node}_page_state
#else
{mod,inc,dec}_{zone,node}_page_state
__{mod,inc,dec}_{zone,node}_page_state
#endif
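
Concretely, under CONFIG_HAVE_CMPXCHG_LOCAL the "__" variant becomes an exact
duplicate of the plain one, with both forwarding to the common mod_*_state()
helper. Abridged from the mm/vmstat.c hunk below:

        #ifdef CONFIG_HAVE_CMPXCHG_LOCAL
        void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                                 long delta)
        {
                mod_zone_state(zone, item, delta, 0);
        }
        EXPORT_SYMBOL(mod_zone_page_state);

        void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
                                   long delta)
        {
                mod_zone_state(zone, item, delta, 0);
        }
        EXPORT_SYMBOL(__mod_zone_page_state);
        #endif

Since the this_cpu_cmpxchg() loop is already safe against interrupts and
preemption, nothing is lost by routing the "__" callers through it.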

To measure the performance difference, the page allocator microbenchmark
https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/mm/bench/page_bench01.c
was run with loops=1000000 on an Intel Core i7-11850H @ 2.50GHz.

The single_page_alloc_free test runs the following loop:

        /** Loop to measure **/
        for (i = 0; i < rec->loops; i++) {
                my_page = alloc_page(gfp_mask);
                if (unlikely(my_page == NULL))
                        return 0;
                __free_page(my_page);
        }

Results, in cycles:

Vanilla     Patched     Diff
115.25      117         1.4%
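
The figures come from the benchmark's own timing harness. Purely for
illustration, a stripped-down module that wraps the same loop in get_cycles()
could look like the sketch below (a hypothetical bench.c, not the page_bench01
code):

        #include <linux/module.h>
        #include <linux/gfp.h>
        #include <linux/timex.h>

        static int loops = 1000000;
        module_param(loops, int, 0444);

        static int __init bench_init(void)
        {
                struct page *my_page;
                cycles_t start, stop;
                int i;

                start = get_cycles();
                for (i = 0; i < loops; i++) {
                        my_page = alloc_page(GFP_KERNEL);
                        if (unlikely(my_page == NULL))
                                return -ENOMEM;
                        __free_page(my_page);
                }
                stop = get_cycles();

                pr_info("single_page_alloc_free: %llu cycles over %d loops\n",
                        (unsigned long long)(stop - start), loops);
                return 0;
        }

        static void __exit bench_exit(void)
        {
        }

        module_init(bench_init);
        module_exit(bench_exit);
        MODULE_LICENSE("GPL");

Loading such a module prints the total cycle count to the kernel log; dividing
by loops gives the per-loop figure reported above.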

Link: https://lkml.kernel.org/r/20230320180745.733575720@redhat.com
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Aaron Tomlin <atomlin@atomlin.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Frederic Weisbecker <frederic@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: "Russell King (Oracle)" <linux@armlinux.org.uk>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
mm/page_alloc.c
mm/vmstat.c

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index de5731b0a31c877a21ce2de106e826389ce3ce15..e365cefc6c56cf148a5601a51b1b62707ee8776c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -8562,9 +8562,6 @@ static int page_alloc_cpu_dead(unsigned int cpu)
        /*
         * Zero the differential counters of the dead processor
         * so that the vm statistics are consistent.
-        *
-        * This is only okay since the processor is dead and cannot
-        * race with what we are doing.
         */
        cpu_vm_stats_fold(cpu);
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c28046371b451e0562951e64942e27bb5268fdbe..bac2970b985ae58f57cabe5d1e589afc5bd5da2a 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -334,6 +334,188 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
        }
 }
 
+#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
+/*
+ * If we have cmpxchg_local support then we do not need to incur the overhead
+ * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
+ *
+ * mod_state() modifies the zone counter state through atomic per cpu
+ * operations.
+ *
+ * Overstep mode specifies how overstep should be handled:
+ *     0       No overstepping
+ *     1       Overstepping half of threshold
+ *     -1      Overstepping minus half of threshold
+ */
+static inline void mod_zone_state(struct zone *zone, enum zone_stat_item item,
+                                 long delta, int overstep_mode)
+{
+       struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
+       s8 __percpu *p = pcp->vm_stat_diff + item;
+       long o, n, t, z;
+
+       do {
+               z = 0;  /* overflow to zone counters */
+
+               /*
+                * The fetching of the stat_threshold is racy. We may apply
+                * a counter threshold to the wrong cpu if we get
+                * rescheduled while executing here. However, the next
+                * counter update will apply the threshold again and
+                * therefore bring the counter under the threshold again.
+                *
+                * Most of the time the thresholds are the same anyways
+                * for all cpus in a zone.
+                */
+               t = this_cpu_read(pcp->stat_threshold);
+
+               o = this_cpu_read(*p);
+               n = delta + o;
+
+               if (abs(n) > t) {
+                       int os = overstep_mode * (t >> 1);
+
+                       /* Overflow must be added to zone counters */
+                       z = n + os;
+                       n = -os;
+               }
+       } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+       if (z)
+               zone_page_state_add(z, zone, item);
+}
+
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+                        long delta)
+{
+       mod_zone_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_zone_page_state);
+
+void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+                          long delta)
+{
+       mod_zone_state(zone, item, delta, 0);
+}
+EXPORT_SYMBOL(__mod_zone_page_state);
+
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+       mod_zone_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_zone_page_state);
+
+void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+       mod_zone_state(page_zone(page), item, 1, 1);
+}
+EXPORT_SYMBOL(__inc_zone_page_state);
+
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+       mod_zone_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_zone_page_state);
+
+void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+       mod_zone_state(page_zone(page), item, -1, -1);
+}
+EXPORT_SYMBOL(__dec_zone_page_state);
+
+static inline void mod_node_state(struct pglist_data *pgdat,
+                                 enum node_stat_item item,
+                                 int delta, int overstep_mode)
+{
+       struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
+       s8 __percpu *p = pcp->vm_node_stat_diff + item;
+       long o, n, t, z;
+
+       if (vmstat_item_in_bytes(item)) {
+               /*
+                * Only cgroups use subpage accounting right now; at
+                * the global level, these items still change in
+                * multiples of whole pages. Store them as pages
+                * internally to keep the per-cpu counters compact.
+                */
+               VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+               delta >>= PAGE_SHIFT;
+       }
+
+       do {
+               z = 0;  /* overflow to node counters */
+
+               /*
+                * The fetching of the stat_threshold is racy. We may apply
+                * a counter threshold to the wrong cpu if we get
+                * rescheduled while executing here. However, the next
+                * counter update will apply the threshold again and
+                * therefore bring the counter under the threshold again.
+                *
+                * Most of the time the thresholds are the same anyways
+                * for all cpus in a node.
+                */
+               t = this_cpu_read(pcp->stat_threshold);
+
+               o = this_cpu_read(*p);
+               n = delta + o;
+
+               if (abs(n) > t) {
+                       int os = overstep_mode * (t >> 1);
+
+                       /* Overflow must be added to node counters */
+                       z = n + os;
+                       n = -os;
+               }
+       } while (this_cpu_cmpxchg(*p, o, n) != o);
+
+       if (z)
+               node_page_state_add(z, pgdat, item);
+}
+
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+                                       long delta)
+{
+       mod_node_state(pgdat, item, delta, 0);
+}
+EXPORT_SYMBOL(mod_node_page_state);
+
+void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
+                                       long delta)
+{
+       mod_node_state(pgdat, item, delta, 0);
+}
+EXPORT_SYMBOL(__mod_node_page_state);
+
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
+{
+       mod_node_state(pgdat, item, 1, 1);
+}
+
+void inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+       mod_node_state(page_pgdat(page), item, 1, 1);
+}
+EXPORT_SYMBOL(inc_node_page_state);
+
+void __inc_node_page_state(struct page *page, enum node_stat_item item)
+{
+       mod_node_state(page_pgdat(page), item, 1, 1);
+}
+EXPORT_SYMBOL(__inc_node_page_state);
+
+void dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+       mod_node_state(page_pgdat(page), item, -1, -1);
+}
+EXPORT_SYMBOL(dec_node_page_state);
+
+void __dec_node_page_state(struct page *page, enum node_stat_item item)
+{
+       mod_node_state(page_pgdat(page), item, -1, -1);
+}
+EXPORT_SYMBOL(__dec_node_page_state);
+#else
 /*
  * For use when we know that interrupts are disabled,
  * or when we know that preemption is disabled and that
@@ -541,149 +723,6 @@ void __dec_node_page_state(struct page *page, enum node_stat_item item)
 }
 EXPORT_SYMBOL(__dec_node_page_state);
 
-#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
-/*
- * If we have cmpxchg_local support then we do not need to incur the overhead
- * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
- *
- * mod_state() modifies the zone counter state through atomic per cpu
- * operations.
- *
- * Overstep mode specifies how overstep should handled:
- *     0       No overstepping
- *     1       Overstepping half of threshold
- *     -1      Overstepping minus half of threshold
-*/
-static inline void mod_zone_state(struct zone *zone,
-       enum zone_stat_item item, long delta, int overstep_mode)
-{
-       struct per_cpu_zonestat __percpu *pcp = zone->per_cpu_zonestats;
-       s8 __percpu *p = pcp->vm_stat_diff + item;
-       long o, n, t, z;
-
-       do {
-               z = 0;  /* overflow to zone counters */
-
-               /*
-                * The fetching of the stat_threshold is racy. We may apply
-                * a counter threshold to the wrong the cpu if we get
-                * rescheduled while executing here. However, the next
-                * counter update will apply the threshold again and
-                * therefore bring the counter under the threshold again.
-                *
-                * Most of the time the thresholds are the same anyways
-                * for all cpus in a zone.
-                */
-               t = this_cpu_read(pcp->stat_threshold);
-
-               o = this_cpu_read(*p);
-               n = delta + o;
-
-               if (abs(n) > t) {
-                       int os = overstep_mode * (t >> 1) ;
-
-                       /* Overflow must be added to zone counters */
-                       z = n + os;
-                       n = -os;
-               }
-       } while (this_cpu_cmpxchg(*p, o, n) != o);
-
-       if (z)
-               zone_page_state_add(z, zone, item);
-}
-
-void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
-                        long delta)
-{
-       mod_zone_state(zone, item, delta, 0);
-}
-EXPORT_SYMBOL(mod_zone_page_state);
-
-void inc_zone_page_state(struct page *page, enum zone_stat_item item)
-{
-       mod_zone_state(page_zone(page), item, 1, 1);
-}
-EXPORT_SYMBOL(inc_zone_page_state);
-
-void dec_zone_page_state(struct page *page, enum zone_stat_item item)
-{
-       mod_zone_state(page_zone(page), item, -1, -1);
-}
-EXPORT_SYMBOL(dec_zone_page_state);
-
-static inline void mod_node_state(struct pglist_data *pgdat,
-       enum node_stat_item item, int delta, int overstep_mode)
-{
-       struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
-       s8 __percpu *p = pcp->vm_node_stat_diff + item;
-       long o, n, t, z;
-
-       if (vmstat_item_in_bytes(item)) {
-               /*
-                * Only cgroups use subpage accounting right now; at
-                * the global level, these items still change in
-                * multiples of whole pages. Store them as pages
-                * internally to keep the per-cpu counters compact.
-                */
-               VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
-               delta >>= PAGE_SHIFT;
-       }
-
-       do {
-               z = 0;  /* overflow to node counters */
-
-               /*
-                * The fetching of the stat_threshold is racy. We may apply
-                * a counter threshold to the wrong the cpu if we get
-                * rescheduled while executing here. However, the next
-                * counter update will apply the threshold again and
-                * therefore bring the counter under the threshold again.
-                *
-                * Most of the time the thresholds are the same anyways
-                * for all cpus in a node.
-                */
-               t = this_cpu_read(pcp->stat_threshold);
-
-               o = this_cpu_read(*p);
-               n = delta + o;
-
-               if (abs(n) > t) {
-                       int os = overstep_mode * (t >> 1) ;
-
-                       /* Overflow must be added to node counters */
-                       z = n + os;
-                       n = -os;
-               }
-       } while (this_cpu_cmpxchg(*p, o, n) != o);
-
-       if (z)
-               node_page_state_add(z, pgdat, item);
-}
-
-void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
-                                       long delta)
-{
-       mod_node_state(pgdat, item, delta, 0);
-}
-EXPORT_SYMBOL(mod_node_page_state);
-
-void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
-{
-       mod_node_state(pgdat, item, 1, 1);
-}
-
-void inc_node_page_state(struct page *page, enum node_stat_item item)
-{
-       mod_node_state(page_pgdat(page), item, 1, 1);
-}
-EXPORT_SYMBOL(inc_node_page_state);
-
-void dec_node_page_state(struct page *page, enum node_stat_item item)
-{
-       mod_node_state(page_pgdat(page), item, -1, -1);
-}
-EXPORT_SYMBOL(dec_node_page_state);
-#else
 /*
  * Use interrupt disable to serialize counter updates
  */