www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
mm,memcg: provide per-cgroup counters for NUMA balancing operations
author: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Wed, 14 Aug 2024 17:42:27 +0000 (17:42 +0000)
committer: Andrew Morton <akpm@linux-foundation.org>
Wed, 4 Sep 2024 04:15:36 +0000 (21:15 -0700)
The ability to observe the demotion and promotion decisions made by the
kernel on a per-cgroup basis is important for monitoring and tuning
containerized workloads on machines equipped with tiered memory.

Different containers in the system may experience drastically different
memory tiering actions that cannot be distinguished from the global
counters alone.

For example, a container running a workload that has much hotter memory
accesses will likely see more promotions and fewer demotions, potentially
depriving a colocated container of top tier memory to such an extent that
its performance degrades unacceptably.

For another example, some containers may exhibit longer periods between
data reuse, causing many more numa_hint_faults than numa_pages_migrated.
In this case, tuning hot_threshold_ms may be appropriate, but the signal
can easily be lost if only global counters are available.

In the long term, we hope to introduce per-cgroup control of promotion and
demotion actions to implement memory placement policies in tiering.

This patch set adds seven counters to memory.stat in a cgroup:
numa_pages_migrated, numa_pte_updates, numa_hint_faults, pgdemote_kswapd,
pgdemote_khugepaged, pgdemote_direct and pgpromote_success.  pgdemote_*
and pgpromote_success are also available in memory.numa_stat.

count_memcg_events_mm() is added to count multiple event occurrences at
once, and get_mem_cgroup_from_folio() is added because we need to get a
reference to the memcg of a folio before it's migrated to track
numa_pages_migrated.  The accounting of PGDEMOTE_* is moved to
shrink_inactive_list() before being changed to per-cgroup.

[kaiyang2@cs.cmu.edu: add documentation of the memcg counters in cgroup-v2.rst]
Link: https://lkml.kernel.org/r/20240814235122.252309-1-kaiyang2@cs.cmu.edu
Link: https://lkml.kernel.org/r/20240814174227.30639-1-kaiyang2@cs.cmu.edu
Signed-off-by: Kaiyang Zhao <kaiyang2@cs.cmu.edu>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Wei Xu <weixugc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Documentation/admin-guide/cgroup-v2.rst
include/linux/memcontrol.h
include/linux/vmstat.h
mm/memcontrol.c
mm/memory.c
mm/mempolicy.c
mm/migrate.c
mm/vmscan.c

index f0499884124d2dacf920b0eb4d71b18e5e9df834..d3344218010cd7a8d6405ddab44ce3141a3f3cb3 100644 (file)
@@ -1617,6 +1617,25 @@ The following nested keys are defined.
                Usually because failed to allocate some continuous swap space
                for the huge page.
 
+         numa_pages_migrated (npn)
+               Number of pages migrated by NUMA balancing.
+
+         numa_pte_updates (npn)
+               Number of pages whose page table entries are modified by
+               NUMA balancing to produce NUMA hinting faults on access.
+
+         numa_hint_faults (npn)
+               Number of NUMA hinting faults.
+
+         pgdemote_kswapd
+               Number of pages demoted by kswapd.
+
+         pgdemote_direct
+               Number of pages demoted directly.
+
+         pgdemote_khugepaged
+               Number of pages demoted by khugepaged.
+
   memory.numa_stat
        A read-only nested-keyed file which exists on non-root cgroups.
 
index ed170399179acbbcaa29fb0bc8f24de35744500d..fe05fdb92779510e4661cdc605c6a926162a0d39 100644 (file)
@@ -784,6 +784,8 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
 struct mem_cgroup *get_mem_cgroup_from_current(void);
 
+struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio);
+
 struct lruvec *folio_lruvec_lock(struct folio *folio);
 struct lruvec *folio_lruvec_lock_irq(struct folio *folio);
 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
@@ -1028,8 +1030,8 @@ static inline void count_memcg_folio_events(struct folio *folio,
                count_memcg_events(memcg, idx, nr);
 }
 
-static inline void count_memcg_event_mm(struct mm_struct *mm,
-                                       enum vm_event_item idx)
+static inline void count_memcg_events_mm(struct mm_struct *mm,
+                                       enum vm_event_item idx, unsigned long count)
 {
        struct mem_cgroup *memcg;
 
@@ -1039,10 +1041,16 @@ static inline void count_memcg_event_mm(struct mm_struct *mm,
        rcu_read_lock();
        memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
        if (likely(memcg))
-               count_memcg_events(memcg, idx, 1);
+               count_memcg_events(memcg, idx, count);
        rcu_read_unlock();
 }
 
+static inline void count_memcg_event_mm(struct mm_struct *mm,
+                                       enum vm_event_item idx)
+{
+       count_memcg_events_mm(mm, idx, 1);
+}
+
 static inline void memcg_memory_event(struct mem_cgroup *memcg,
                                      enum memcg_memory_event event)
 {
@@ -1262,6 +1270,11 @@ static inline struct mem_cgroup *get_mem_cgroup_from_current(void)
        return NULL;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
+{
+       return NULL;
+}
+
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css)
 {
@@ -1484,6 +1497,11 @@ static inline void count_memcg_folio_events(struct folio *folio,
 {
 }
 
+static inline void count_memcg_events_mm(struct mm_struct *mm,
+                                       enum vm_event_item idx, unsigned long count)
+{
+}
+
 static inline
 void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx)
 {
index 9eb77c9007e6231bdc8753f944455ccf26d39c87..d2761bf8ff32c9a536a267286f0ab0f1becec151 100644 (file)
@@ -32,6 +32,7 @@ struct reclaim_stat {
        unsigned nr_ref_keep;
        unsigned nr_unmap_fail;
        unsigned nr_lazyfree_fail;
+       unsigned nr_demoted;
 };
 
 /* Stat data for system wide items */
index 1decbd38216e5f95d8b950b99d82cbf3062022f5..0370ce03ce4cc50657cced16aea8cfb80bc72f33 100644 (file)
@@ -304,6 +304,12 @@ static const unsigned int memcg_node_stat_items[] = {
 #ifdef CONFIG_SWAP
        NR_SWAPCACHE,
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+       PGPROMOTE_SUCCESS,
+#endif
+       PGDEMOTE_KSWAPD,
+       PGDEMOTE_DIRECT,
+       PGDEMOTE_KHUGEPAGED,
 };
 
 static const unsigned int memcg_stat_items[] = {
@@ -436,6 +442,11 @@ static const unsigned int memcg_vm_event_stat[] = {
        THP_SWPOUT,
        THP_SWPOUT_FALLBACK,
 #endif
+#ifdef CONFIG_NUMA_BALANCING
+       NUMA_PAGE_MIGRATE,
+       NUMA_PTE_UPDATES,
+       NUMA_HINT_FAULTS,
+#endif
 };
 
 #define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
@@ -935,6 +946,24 @@ again:
        return memcg;
 }
 
+/**
+ * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
+ * @folio: folio from which memcg should be extracted.
+ */
+struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
+{
+       struct mem_cgroup *memcg = folio_memcg(folio);
+
+       if (mem_cgroup_disabled())
+               return NULL;
+
+       rcu_read_lock();
+       if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
+               memcg = root_mem_cgroup;
+       rcu_read_unlock();
+       return memcg;
+}
+
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root
@@ -1340,6 +1369,13 @@ static const struct memory_stat memory_stats[] = {
        { "workingset_restore_anon",    WORKINGSET_RESTORE_ANON         },
        { "workingset_restore_file",    WORKINGSET_RESTORE_FILE         },
        { "workingset_nodereclaim",     WORKINGSET_NODERECLAIM          },
+
+       { "pgdemote_kswapd",            PGDEMOTE_KSWAPD         },
+       { "pgdemote_direct",            PGDEMOTE_DIRECT         },
+       { "pgdemote_khugepaged",        PGDEMOTE_KHUGEPAGED     },
+#ifdef CONFIG_NUMA_BALANCING
+       { "pgpromote_success",          PGPROMOTE_SUCCESS       },
+#endif
 };
 
 /* The actual unit of the state item, not the same as the output unit */
@@ -1364,6 +1400,9 @@ static int memcg_page_state_output_unit(int item)
        /*
         * Workingset state is actually in pages, but we export it to userspace
         * as a scalar count of events, so special case it here.
+        *
+        * Demotion and promotion activities are exported in pages, consistent
+        * with their global counterparts.
         */
        switch (item) {
        case WORKINGSET_REFAULT_ANON:
@@ -1373,6 +1412,12 @@ static int memcg_page_state_output_unit(int item)
        case WORKINGSET_RESTORE_ANON:
        case WORKINGSET_RESTORE_FILE:
        case WORKINGSET_NODERECLAIM:
+       case PGDEMOTE_KSWAPD:
+       case PGDEMOTE_DIRECT:
+       case PGDEMOTE_KHUGEPAGED:
+#ifdef CONFIG_NUMA_BALANCING
+       case PGPROMOTE_SUCCESS:
+#endif
                return 1;
        default:
                return memcg_page_state_unit(item);
index d1c741a39630f2974b28d69ec03f90f7939b98f2..c31ea300cdf6d6c2c63c6a2db87c73bebaab5dab 100644 (file)
@@ -5236,6 +5236,9 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
        vma_set_access_pid_bit(vma);
 
        count_vm_numa_event(NUMA_HINT_FAULTS);
+#ifdef CONFIG_NUMA_BALANCING
+       count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1);
+#endif
        if (folio_nid(folio) == numa_node_id()) {
                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
                *flags |= TNF_FAULT_LOCAL;
index b3b5f376471f4e3663f844c5f0b85340acfcb4b4..b646fab3e45e101fd741eaba7bbdc6614e30562b 100644 (file)
@@ -676,8 +676,10 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
        tlb_gather_mmu(&tlb, vma->vm_mm);
 
        nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
-       if (nr_updated > 0)
+       if (nr_updated > 0) {
                count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+               count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
+       }
 
        tlb_finish_mmu(&tlb);
 
index 66a5f73ebfdf56b1d1d4c4e8f7be346acdfc3341..76cfc6c42eb38a0cacfa6d89188e724bd121e050 100644 (file)
@@ -2614,6 +2614,8 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
        int nr_remaining;
        unsigned int nr_succeeded;
        LIST_HEAD(migratepages);
+       struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
+       struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
        list_add(&folio->lru, &migratepages);
        nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
@@ -2623,12 +2625,13 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
                putback_movable_pages(&migratepages);
        if (nr_succeeded) {
                count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
+               count_memcg_events(memcg, NUMA_PAGE_MIGRATE, nr_succeeded);
                if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
                    && !node_is_toptier(folio_nid(folio))
                    && node_is_toptier(node))
-                       mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
-                                           nr_succeeded);
+                       mod_lruvec_state(lruvec, PGPROMOTE_SUCCESS, nr_succeeded);
        }
+       mem_cgroup_put(memcg);
        BUG_ON(!list_empty(&migratepages));
        return nr_remaining ? -EAGAIN : 0;
 }
index 1b6542aaf81e2fc2dab5b06beeac7981c0618383..1b1fad0c1e113a90da9e23cc3e156bf3dc564f62 100644 (file)
@@ -1016,9 +1016,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
                      (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
                      &nr_succeeded);
 
-       mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(),
-                           nr_succeeded);
-
        return nr_succeeded;
 }
 
@@ -1516,7 +1513,8 @@ keep:
        /* 'folio_list' is always empty here */
 
        /* Migrate folios selected for demotion */
-       nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
+       stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
+       nr_reclaimed += stat->nr_demoted;
        /* Folios that could not be demoted are still in @demote_folios */
        if (!list_empty(&demote_folios)) {
                /* Folios which weren't demoted go back on @folio_list */
@@ -1982,6 +1980,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
        spin_lock_irq(&lruvec->lru_lock);
        move_folios_to_lru(lruvec, &folio_list);
 
+       __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+                                       stat.nr_demoted);
        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
        item = PGSTEAL_KSWAPD + reclaimer_offset();
        if (!cgroup_reclaim(sc))