From 98c9389042f4d1e6aa73fbaf79e2e962c9497fc5 Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Fri, 4 Apr 2025 13:11:03 +0200 Subject: [PATCH 01/16] mm/compaction: reduce the difference between low and high watermarks Reduce the diff between low and high watermarks when compaction proactiveness is set to high. This allows users who set the proactiveness really high to have more stable fragmentation score over time. Link: https://lkml.kernel.org/r/20250404111103.1994507-3-mclapinski@google.com Signed-off-by: Michal Clapinski Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/sysctl/vm.rst | 6 ++++++ mm/compaction.c | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 8290177b4f75..b325bfbc2611 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -131,6 +131,12 @@ to latency spikes in unsuspecting applications. The kernel employs various heuristics to avoid wasting CPU cycles if it detects that proactive compaction is not being effective. +Setting the value above 80 will, in addition to lowering the acceptable level +of fragmentation, make the compaction code more sensitive to increases in +fragmentation, i.e. compaction will trigger more often, but reduce +fragmentation by a smaller amount. +This makes the fragmentation level more stable over time. + Be careful when setting it to extreme values like 100, as that may cause excessive background compaction activity. diff --git a/mm/compaction.c b/mm/compaction.c index f9ee06d55726..fe51a73b91a7 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2249,10 +2249,11 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat) static unsigned int fragmentation_score_wmark(bool low) { - unsigned int wmark_low; + unsigned int wmark_low, leeway; wmark_low = 100U - sysctl_compaction_proactiveness; - return low ? wmark_low : min(wmark_low + 10, 100U); + leeway = min(10U, wmark_low / 2); + return low ? wmark_low : min(wmark_low + leeway, 100U); } static bool should_proactive_compact_node(pg_data_t *pgdat) -- 2.51.0 From aa8d89d1472b010cbcda7288a4da76e4852a260d Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 2 Apr 2025 22:33:26 -0700 Subject: [PATCH 02/16] memcg: vmalloc: simplify MEMCG_VMALLOC updates The vmalloc region can either be charged to a single memcg or none. At the moment kernel traverses all the pages backing the vmalloc region to update the MEMCG_VMALLOC stat. However there is no need to look at all the pages as all those pages will be charged to a single memcg or none. Simplify the MEMCG_VMALLOC update by just looking at the first page of the vmalloc region. 
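For illustration, a minimal userspace sketch of the accounting idea (the types and names below are invented stand-ins for this note, not the kernel API): because every page in the region is charged to the same memcg, a single bulk update through the first page replaces the per-page loop.

  #include <assert.h>

  struct memcg { long vmalloc_stat; };
  struct page  { struct memcg *memcg; };

  static void mod_vmalloc_stat(struct page **pages, unsigned int nr_pages,
                               long delta)
  {
          /* All pages were charged to the same memcg, so one bulk update
           * of delta * nr_pages replaces nr_pages individual updates. */
          if (nr_pages)
                  pages[0]->memcg->vmalloc_stat += delta * (long)nr_pages;
  }

  int main(void)
  {
          struct memcg cg = { 0 };
          struct page p[3] = { { &cg }, { &cg }, { &cg } };
          struct page *pages[3] = { &p[0], &p[1], &p[2] };

          mod_vmalloc_stat(pages, 3, 1);   /* charge at allocation */
          mod_vmalloc_stat(pages, 3, -1);  /* uncharge at vfree    */
          assert(cg.vmalloc_stat == 0);
          return 0;
  }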
[shakeel.butt@linux.dev: add comment] Link: https://lkml.kernel.org/r/bmlkdbqgwboyqrnxyom7n52fjmo76ux77jhqw5odc6c6dfon3h@zdylwtmlywbt Link: https://lkml.kernel.org/r/20250403053326.26860-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Yosry Ahmed Cc: Johannes Weiner Cc: Muchun Song Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/vmalloc.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2d7511654831..44b735a4573a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3371,12 +3371,13 @@ void vfree(const void *addr) if (unlikely(vm->flags & VM_FLUSH_RESET_PERMS)) vm_reset_perms(vm); + /* All pages of vm should be charged to same memcg, so use first one. */ + if (vm->nr_pages && !(vm->flags & VM_MAP_PUT_PAGES)) + mod_memcg_page_state(vm->pages[0], MEMCG_VMALLOC, -vm->nr_pages); for (i = 0; i < vm->nr_pages; i++) { struct page *page = vm->pages[i]; BUG_ON(!page); - if (!(vm->flags & VM_MAP_PUT_PAGES)) - mod_memcg_page_state(page, MEMCG_VMALLOC, -1); /* * High-order allocs for huge vmallocs are split, so * can be freed as an array of order-0 allocations @@ -3672,12 +3673,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, node, page_order, nr_small_pages, area->pages); atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (gfp_mask & __GFP_ACCOUNT) { - int i; - - for (i = 0; i < area->nr_pages; i++) - mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC, 1); - } + /* All pages of vm should be charged to same memcg, so use first one. */ + if (gfp_mask & __GFP_ACCOUNT && area->nr_pages) + mod_memcg_page_state(area->pages[0], MEMCG_VMALLOC, + area->nr_pages); /* * If not enough pages were obtained to accomplish an -- 2.51.0 From e56fa8f5e108676393a363201819a094b0991b3c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:05 -0700 Subject: [PATCH 03/16] memcg: remove root memcg check from refill_stock refill_stock can not be called with root memcg, so there is no need to check it. Instead add a warning if root is ever passed to it. Link: https://lkml.kernel.org/r/20250404013913.1663035-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20250404013913.1663035-2-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b16b5b807d7c..ae1e953cead7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1893,13 +1893,13 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { unsigned long flags; + VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg)); + if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) { /* * In case of unlikely failure to lock percpu stock_lock * uncharge memcg directly. 
*/ - if (mem_cgroup_is_root(memcg)) - return; page_counter_uncharge(&memcg->memory, nr_pages); if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); -- 2.51.0 From 65d2d15f41c603dc8f11ff0d1bcda2dd847a65b5 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:06 -0700 Subject: [PATCH 04/16] memcg: decouple drain_obj_stock from local stock Currently drain_obj_stock() can potentially call __refill_stock which accesses local cpu stock and thus requires memcg stock's local_lock. However if we look at the code paths leading to drain_obj_stock(), there is never a good reason to refill the memcg stock at all from it. At the moment, drain_obj_stock can be called from reclaim, hotplug cpu teardown, mod_objcg_state() and refill_obj_stock(). For reclaim and hotplug there is no need to refill. For the other two paths, most probably the newly switched objcg would be used in near future and thus no need to refill stock with the older objcg. In addition, __refill_stock() from drain_obj_stock() happens on rare cases, so performance is not really an issue. Let's just uncharge directly instead of refill which will also decouple drain_obj_stock from local cpu stock and local_lock requirements. Link: https://lkml.kernel.org/r/20250404013913.1663035-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ae1e953cead7..52be78515d70 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2876,7 +2876,12 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); memcg1_account_kmem(memcg, -nr_pages); - __refill_stock(memcg, nr_pages); + if (!mem_cgroup_is_root(memcg)) { + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, + nr_pages); + } css_put(&memcg->css); } -- 2.51.0 From 89f342af660332656bbade4cc8a919a40674ac41 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:07 -0700 Subject: [PATCH 05/16] memcg: introduce memcg_uncharge At multiple places in memcontrol.c, the memory and memsw page counters are being uncharged. This is error-prone. Let's move the functionality to a newly introduced memcg_uncharge and call it from all those places. Link: https://lkml.kernel.org/r/20250404013913.1663035-4-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 52be78515d70..dfb3f14c1178 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1822,6 +1822,13 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages, return ret; } +static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + page_counter_uncharge(&memcg->memory, nr_pages); + if (do_memsw_account()) + page_counter_uncharge(&memcg->memsw, nr_pages); +} + /* * Returns stocks cached in percpu and reset cached information. 
*/ @@ -1834,10 +1841,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) return; if (stock_pages) { - page_counter_uncharge(&old->memory, stock_pages); - if (do_memsw_account()) - page_counter_uncharge(&old->memsw, stock_pages); - + memcg_uncharge(old, stock_pages); WRITE_ONCE(stock->nr_pages, 0); } @@ -1900,9 +1904,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) * In case of unlikely failure to lock percpu stock_lock * uncharge memcg directly. */ - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, nr_pages); + memcg_uncharge(memcg, nr_pages); return; } __refill_stock(memcg, nr_pages); @@ -2876,12 +2878,8 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); memcg1_account_kmem(memcg, -nr_pages); - if (!mem_cgroup_is_root(memcg)) { - page_counter_uncharge(&memcg->memory, nr_pages); - if (do_memsw_account()) - page_counter_uncharge(&memcg->memsw, - nr_pages); - } + if (!mem_cgroup_is_root(memcg)) + memcg_uncharge(memcg, nr_pages); css_put(&memcg->css); } @@ -4702,9 +4700,7 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug) static void uncharge_batch(const struct uncharge_gather *ug) { if (ug->nr_memory) { - page_counter_uncharge(&ug->memcg->memory, ug->nr_memory); - if (do_memsw_account()) - page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory); + memcg_uncharge(ug->memcg, ug->nr_memory); if (ug->nr_kmem) { mod_memcg_state(ug->memcg, MEMCG_KMEM, -ug->nr_kmem); memcg1_account_kmem(ug->memcg, -ug->nr_kmem); -- 2.51.0 From cbc091441d3adce9a1fbcd0a0a84bd6abe077a5b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:08 -0700 Subject: [PATCH 06/16] memcg: manually inline __refill_stock There are no more multiple callers of __refill_stock(), so simply inline it to refill_stock(). Link: https://lkml.kernel.org/r/20250404013913.1663035-5-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dfb3f14c1178..03a2be6d4a67 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1871,14 +1871,22 @@ static void drain_local_stock(struct work_struct *dummy) obj_cgroup_put(old); } -/* - * Cache charges(val) to local per_cpu area. - * This will be consumed by consume_stock() function, later. - */ -static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; unsigned int stock_pages; + unsigned long flags; + + VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg)); + + if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) { + /* + * In case of unlikely failure to lock percpu stock_lock + * uncharge memcg directly. 
+ */ + memcg_uncharge(memcg, nr_pages); + return; + } stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ @@ -1891,23 +1899,7 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (stock_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); -} - -static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) -{ - unsigned long flags; - - VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg)); - if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) { - /* - * In case of unlikely failure to lock percpu stock_lock - * uncharge memcg directly. - */ - memcg_uncharge(memcg, nr_pages); - return; - } - __refill_stock(memcg, nr_pages); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); } -- 2.51.0 From b6d0471117daec360bad2b881c1e2d0f4d2c4d3b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:09 -0700 Subject: [PATCH 07/16] memcg: no refilling stock from obj_cgroup_release obj_cgroup_release is called when all the references to the objcg have been released i.e. no more memory objects are pointing to it. Most probably objcg->memcg will be pointing to some ancestor memcg. In obj_cgroup_release(), the kernel calls obj_cgroup_uncharge_pages() which refills the local stock. There is no need to refill the local stock with some ancestor memcg and flush the local stock. Let's decouple obj_cgroup_release() from the local stock by uncharging instead of refilling. One additional benefit of this change is that it removes the requirement to only call obj_cgroup_put() outside of local_lock. Link: https://lkml.kernel.org/r/20250404013913.1663035-6-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 03a2be6d4a67..df52084e90f4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -129,8 +129,7 @@ bool mem_cgroup_kmem_disabled(void) return cgroup_memory_nokmem; } -static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, - unsigned int nr_pages); +static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages); static void obj_cgroup_release(struct percpu_ref *ref) { @@ -163,8 +162,16 @@ static void obj_cgroup_release(struct percpu_ref *ref) WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); nr_pages = nr_bytes >> PAGE_SHIFT; - if (nr_pages) - obj_cgroup_uncharge_pages(objcg, nr_pages); + if (nr_pages) { + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + memcg1_account_kmem(memcg, -nr_pages); + if (!mem_cgroup_is_root(memcg)) + memcg_uncharge(memcg, nr_pages); + mem_cgroup_put(memcg); + } spin_lock_irqsave(&objcg_lock, flags); list_del(&objcg->list); -- 2.51.0 From ae51c775aa2b1fd4fbaf781740ec8762292bec3a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:10 -0700 Subject: [PATCH 08/16] memcg: do obj_cgroup_put inside drain_obj_stock Previously we could not call obj_cgroup_put() inside the local lock because on the put on the last reference, the release function obj_cgroup_release() may try to re-acquire the local lock. However that chain has been broken. Now simply do obj_cgroup_put() inside drain_obj_stock() instead of returning the old objcg. 
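As a rough single-threaded model of why the old ordering constraint disappears (all names below are invented; the real code uses a local_lock-protected per-CPU stock), consider:

  #include <assert.h>
  #include <stdbool.h>

  static bool stock_lock_held;    /* stands in for the per-CPU stock_lock */

  static void stock_lock(void)
  {
          assert(!stock_lock_held);       /* the lock is not re-entrant */
          stock_lock_held = true;
  }

  static void stock_unlock(void)
  {
          stock_lock_held = false;
  }

  static void objcg_release(void)
  {
          /* New behaviour: uncharge the counters directly; the old code
           * refilled the stock here, i.e. needed stock_lock() again. */
  }

  static void objcg_put(int *refcnt)
  {
          if (--(*refcnt) == 0)
                  objcg_release();
  }

  int main(void)
  {
          int cached_objcg_ref = 1;

          stock_lock();                   /* drain runs with the lock held */
          objcg_put(&cached_objcg_ref);   /* safe: release takes no lock   */
          stock_unlock();
          return 0;
  }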
Link: https://lkml.kernel.org/r/20250404013913.1663035-7-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index df52084e90f4..7988a42b29bf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1785,7 +1785,7 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { }; static DEFINE_MUTEX(percpu_charge_mutex); -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); +static void drain_obj_stock(struct memcg_stock_pcp *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); @@ -1859,7 +1859,6 @@ static void drain_stock(struct memcg_stock_pcp *stock) static void drain_local_stock(struct work_struct *dummy) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; unsigned long flags; /* @@ -1870,12 +1869,11 @@ static void drain_local_stock(struct work_struct *dummy) local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - old = drain_obj_stock(stock); + drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); } static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -1958,18 +1956,16 @@ void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old; unsigned long flags; stock = &per_cpu(memcg_stock, cpu); /* drain_obj_stock requires stock_lock */ local_lock_irqsave(&memcg_stock.stock_lock, flags); - old = drain_obj_stock(stock); + drain_obj_stock(stock); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); drain_stock(stock); - obj_cgroup_put(old); return 0; } @@ -2766,24 +2762,20 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) } /* Replace the stock objcg with objcg, return the old objcg */ -static struct obj_cgroup *replace_stock_objcg(struct memcg_stock_pcp *stock, - struct obj_cgroup *objcg) +static void replace_stock_objcg(struct memcg_stock_pcp *stock, + struct obj_cgroup *objcg) { - struct obj_cgroup *old = NULL; - - old = drain_obj_stock(stock); + drain_obj_stock(stock); obj_cgroup_get(objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; WRITE_ONCE(stock->cached_objcg, objcg); - return old; } static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; unsigned long flags; int *bytes; @@ -2796,7 +2788,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, * changes. 
*/ if (READ_ONCE(stock->cached_objcg) != objcg) { - old = replace_stock_objcg(stock, objcg); + replace_stock_objcg(stock, objcg); stock->cached_pgdat = pgdat; } else if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ @@ -2837,7 +2829,6 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, __mod_objcg_mlstate(objcg, pgdat, idx, nr); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -2859,12 +2850,12 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) return ret; } -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) +static void drain_obj_stock(struct memcg_stock_pcp *stock) { struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); if (!old) - return NULL; + return; if (stock->nr_bytes) { unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; @@ -2917,11 +2908,7 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) } WRITE_ONCE(stock->cached_objcg, NULL); - /* - * The `old' objects needs to be released by the caller via - * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. - */ - return old; + obj_cgroup_put(old); } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, @@ -2943,7 +2930,6 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, bool allow_uncharge) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; unsigned long flags; unsigned int nr_pages = 0; @@ -2951,7 +2937,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ - old = replace_stock_objcg(stock, objcg); + replace_stock_objcg(stock, objcg); allow_uncharge = true; /* Allow uncharge when objcg changes */ } stock->nr_bytes += nr_bytes; @@ -2962,7 +2948,6 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, } local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); -- 2.51.0 From 42a1910cfd23c46307a000a81127e02df247284c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:11 -0700 Subject: [PATCH 09/16] memcg: use __mod_memcg_state in drain_obj_stock For non-PREEMPT_RT kernels, drain_obj_stock() is always called with irq disabled, so we can use __mod_memcg_state() instead of mod_memcg_state(). For PREEMPT_RT, we need to add memcg_stats_[un]lock in __mod_memcg_state(). 
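To illustrate just the calling convention (a userspace sketch with invented names, not the memcg code): the double-underscore variant relies on the caller to provide the protected context, while the plain variant establishes that context itself.

  #include <assert.h>
  #include <stdbool.h>

  static bool irqs_disabled;      /* stands in for the irq/stats-lock context */
  static long stat;

  static void __mod_stat(long val)
  {
          assert(irqs_disabled);          /* caller must provide the context */
          stat += val;
  }

  static void mod_stat(long val)
  {
          irqs_disabled = true;           /* local_irq_save() stand-in    */
          __mod_stat(val);
          irqs_disabled = false;          /* local_irq_restore() stand-in */
  }

  int main(void)
  {
          mod_stat(4);                    /* generic callers               */

          irqs_disabled = true;           /* drain_obj_stock()-like caller */
          __mod_stat(-4);                 /* no redundant save/restore     */
          irqs_disabled = false;

          assert(stat == 0);
          return 0;
  }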
Link: https://lkml.kernel.org/r/20250404013913.1663035-8-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7988a42b29bf..33aeddfff0ba 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -710,10 +710,12 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; + memcg_stats_lock(); __this_cpu_add(memcg->vmstats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); memcg_rstat_updated(memcg, val); trace_mod_memcg_state(memcg, idx, val); + memcg_stats_unlock(); } #ifdef CONFIG_MEMCG_V1 @@ -2866,7 +2868,7 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock) memcg = get_mem_cgroup_from_objcg(old); - mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + __mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); memcg1_account_kmem(memcg, -nr_pages); if (!mem_cgroup_is_root(memcg)) memcg_uncharge(memcg, nr_pages); -- 2.51.0 From bc730030f956fe83e94625c546c30ce6bae32d6b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 3 Apr 2025 18:39:12 -0700 Subject: [PATCH 10/16] memcg: combine slab obj stock charging and accounting When handling slab objects, we use obj_cgroup_[un]charge() for (un)charging and mod_objcg_state() to account NR_SLAB_[UN]RECLAIMABLE_B. All these operations use the percpu stock for performance. However, with the calls being separate, the stock_lock is taken twice in each case. By refactoring the code, we can turn mod_objcg_state() into __account_obj_stock(), which is called on a stock that's already locked and validated. On the charging side we can call this function from consume_obj_stock() when it succeeds, and from refill_obj_stock() in the fallback. We just expand the parameters of these functions as necessary. The uncharge side from __memcg_slab_free_hook() is just the call to refill_obj_stock(). Other callers of obj_cgroup_[un]charge() (i.e. not slab) simply pass the extra parameters as NULL/zeroes to skip the __account_obj_stock() operation. In __memcg_slab_post_alloc_hook() we now charge each object separately, but that's not a problem as we did call mod_objcg_state() for each object separately, and most allocations are non-bulk anyway. This could be improved by batching all operations until slab_pgdat(slab) changes. Some preliminary benchmarking with a kfree(kmalloc()) loop of 10M iterations with/without __GFP_ACCOUNT: Before the patch: kmalloc/kfree !memcg: 581390144 cycles kmalloc/kfree memcg: 783689984 cycles After the patch: kmalloc/kfree memcg: 658723808 cycles More than half of the overhead of __GFP_ACCOUNT relative to the non-accounted case seems to be eliminated.
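The shape of the change, as a deliberately simplified userspace sketch (invented names, no per-CPU machinery): the charge and the vmstat update now share one lock section instead of taking the lock twice.

  #include <assert.h>

  static int lock_sections;       /* counts stock-lock round trips */
  static long charged, vmstat;

  static void stock_lock(void)   { lock_sections++; }
  static void stock_unlock(void) { }

  /* must be called with the stock lock held */
  static void __account(long nr)
  {
          vmstat += nr;
  }

  static void charge_and_account(long nr)
  {
          stock_lock();
          charged += nr;          /* the charge part                         */
          __account(nr);          /* stat update piggybacks on the same lock */
          stock_unlock();
  }

  int main(void)
  {
          charge_and_account(512);
          /* previously: one lock section for the charge plus one for the
           * stat update, i.e. two round trips per object */
          assert(lock_sections == 1 && charged == 512 && vmstat == 512);
          return 0;
  }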
Link: https://lkml.kernel.org/r/20250404013913.1663035-9-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Signed-off-by: Vlastimil Babka Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 77 +++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 33aeddfff0ba..3bb02f672e39 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2774,25 +2774,17 @@ static void replace_stock_objcg(struct memcg_stock_pcp *stock, WRITE_ONCE(stock->cached_objcg, objcg); } -static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - enum node_stat_item idx, int nr) +static void __account_obj_stock(struct obj_cgroup *objcg, + struct memcg_stock_pcp *stock, int nr, + struct pglist_data *pgdat, enum node_stat_item idx) { - struct memcg_stock_pcp *stock; - unsigned long flags; int *bytes; - local_lock_irqsave(&memcg_stock.stock_lock, flags); - stock = this_cpu_ptr(&memcg_stock); - /* * Save vmstat data in stock and skip vmstat array update unless - * accumulating over a page of vmstat data or when pgdat or idx - * changes. + * accumulating over a page of vmstat data or when pgdat changes. */ - if (READ_ONCE(stock->cached_objcg) != objcg) { - replace_stock_objcg(stock, objcg); - stock->cached_pgdat = pgdat; - } else if (stock->cached_pgdat != pgdat) { + if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ struct pglist_data *oldpg = stock->cached_pgdat; @@ -2829,11 +2821,10 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, } if (nr) __mod_objcg_mlstate(objcg, pgdat, idx, nr); - - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); } -static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + struct pglist_data *pgdat, enum node_stat_item idx) { struct memcg_stock_pcp *stock; unsigned long flags; @@ -2845,6 +2836,9 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; + + if (pgdat) + __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx); } local_unlock_irqrestore(&memcg_stock.stock_lock, flags); @@ -2929,7 +2923,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, } static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, - bool allow_uncharge) + bool allow_uncharge, int nr_acct, struct pglist_data *pgdat, + enum node_stat_item idx) { struct memcg_stock_pcp *stock; unsigned long flags; @@ -2944,6 +2939,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, } stock->nr_bytes += nr_bytes; + if (pgdat) + __account_obj_stock(objcg, stock, nr_acct, pgdat, idx); + if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { nr_pages = stock->nr_bytes >> PAGE_SHIFT; stock->nr_bytes &= (PAGE_SIZE - 1); @@ -2955,12 +2953,13 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, obj_cgroup_uncharge_pages(objcg, nr_pages); } -int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size, + struct pglist_data *pgdat, enum node_stat_item idx) { unsigned int nr_pages, nr_bytes; int ret; - if 
(consume_obj_stock(objcg, size)) + if (likely(consume_obj_stock(objcg, size, pgdat, idx))) return 0; /* @@ -2993,15 +2992,21 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) nr_pages += 1; ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); - if (!ret && nr_bytes) - refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); + if (!ret && (nr_bytes || pgdat)) + refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0, + false, size, pgdat, idx); return ret; } +int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +{ + return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0); +} + void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) { - refill_obj_stock(objcg, size, true); + refill_obj_stock(objcg, size, true, 0, NULL, 0); } static inline size_t obj_full_size(struct kmem_cache *s) @@ -3053,23 +3058,32 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, return false; } - if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s))) - return false; - for (i = 0; i < size; i++) { slab = virt_to_slab(p[i]); if (!slab_obj_exts(slab) && alloc_slab_obj_exts(slab, s, flags, false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } + /* + * if we fail and size is 1, memcg_alloc_abort_single() will + * just free the object, which is ok as we have not assigned + * objcg to its obj_ext yet + * + * for larger sizes, kmem_cache_free_bulk() will uncharge + * any objects that were already charged and obj_ext assigned + * + * TODO: we could batch this until slab_pgdat(slab) changes + * between iterations, with a more complicated undo + */ + if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s), + slab_pgdat(slab), cache_vmstat_idx(s))) + return false; + off = obj_to_index(s, slab, p[i]); obj_cgroup_get(objcg); slab_obj_exts(slab)[off].objcg = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); } return true; @@ -3078,6 +3092,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts) { + size_t obj_size = obj_full_size(s); + for (int i = 0; i < objects; i++) { struct obj_cgroup *objcg; unsigned int off; @@ -3088,9 +3104,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, continue; obj_exts[off].objcg = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); + refill_obj_stock(objcg, obj_size, true, -obj_size, + slab_pgdat(slab), cache_vmstat_idx(s)); obj_cgroup_put(objcg); } } -- 2.51.0 From ac26920d58224a6d5c78ad3ff0d58fd7faa775c7 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:13 -0700 Subject: [PATCH 11/16] memcg: manually inline replace_stock_objcg The replace_stock_objcg() is being called by only refill_obj_stock, so manually inline it. 
Link: https://lkml.kernel.org/r/20250404013913.1663035-10-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3bb02f672e39..aebb1f2c8657 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2763,17 +2763,6 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) obj_cgroup_put(objcg); } -/* Replace the stock objcg with objcg, return the old objcg */ -static void replace_stock_objcg(struct memcg_stock_pcp *stock, - struct obj_cgroup *objcg) -{ - drain_obj_stock(stock); - obj_cgroup_get(objcg); - stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) - ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; - WRITE_ONCE(stock->cached_objcg, objcg); -} - static void __account_obj_stock(struct obj_cgroup *objcg, struct memcg_stock_pcp *stock, int nr, struct pglist_data *pgdat, enum node_stat_item idx) @@ -2934,7 +2923,12 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ - replace_stock_objcg(stock, objcg); + drain_obj_stock(stock); + obj_cgroup_get(objcg); + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; + WRITE_ONCE(stock->cached_objcg, objcg); + allow_uncharge = true; /* Allow uncharge when objcg changes */ } stock->nr_bytes += nr_bytes; -- 2.51.0 From 9c1c38bcdc9215f0bd231591b662bd0fbe8b7045 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:21 +0800 Subject: [PATCH 12/16] mm: swap: rename __swap_[entry/entries]_free[_locked] to swap_[entry/entries]_put[_locked] Patch series "Minor cleanups and improvements to swap freeing code", v4. This series contains some cleanups and improvements made while learning the swapfile code. Here is a summary of the changes: 1. Function naming improvements. - Use "put" instead of "free" to name functions which only do the actual free when the count drops to zero. - Use "entry" to name functions that free only one swap slot. Use "entries" to name functions that may free multiple swap slots within one cluster. Use the "_nr" suffix to name functions that may free multiple swap slots spanning multiple clusters. 2. Eliminate the need to set a swap slot to the intermediate SWAP_HAS_CACHE value before the actual free by using swap_entry_range_free(). 3. Add helpers swap_entries_put_map() and swap_entries_put_cache() as general-purpose routines to free swap entries within a single cluster; they try batch removal first and fall back to putting each entry individually, with the cluster lock acquired/released only once. By using these helpers, we can remove repeated code, leverage batch removal in more cases and avoid acquiring/releasing the cluster lock for each single swap entry. This patch (of 8): In __swap_entry_free[_locked] and __swap_entries_free, we decrease the count first and only free the swap entry if the count drops to zero. This behavior is more akin to a put() operation than to a free() operation. Therefore, rename these functions with "put" instead of "free". Additionally, add the "_nr" suffix to swap_entries_put to indicate that the input range may span swap clusters.
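The naming rationale in miniature (illustrative userspace sketch with invented names): a put() drops one reference and frees only when the count reaches zero, whereas a free() releases unconditionally.

  #include <assert.h>
  #include <stdbool.h>

  static bool freed;

  static void entry_free(void)
  {
          freed = true;           /* the unconditional release         */
  }

  static void entry_put(unsigned char *count)
  {
          if (--(*count) == 0)
                  entry_free();   /* free only when the last ref drops */
  }

  int main(void)
  {
          unsigned char count = 2;

          entry_put(&count);
          assert(!freed);         /* still referenced, nothing freed  */
          entry_put(&count);
          assert(freed);          /* the last put does the real free  */
          return 0;
  }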
Link: https://lkml.kernel.org/r/20250325162528.68385-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20250325162528.68385-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f214843612dc..ea01cf05b0cb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1355,9 +1355,9 @@ out: return NULL; } -static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, - unsigned long offset, - unsigned char usage) +static unsigned char swap_entry_put_locked(struct swap_info_struct *si, + unsigned long offset, + unsigned char usage) { unsigned char count; unsigned char has_cache; @@ -1461,15 +1461,15 @@ put_out: return NULL; } -static unsigned char __swap_entry_free(struct swap_info_struct *si, - swp_entry_t entry) +static unsigned char swap_entry_put(struct swap_info_struct *si, + swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char usage; ci = lock_cluster(si, offset); - usage = __swap_entry_free_locked(si, offset, 1); + usage = swap_entry_put_locked(si, offset, 1); if (!usage) swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); unlock_cluster(ci); @@ -1477,8 +1477,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si, return usage; } -static bool __swap_entries_free(struct swap_info_struct *si, - swp_entry_t entry, int nr) +static bool swap_entries_put_nr(struct swap_info_struct *si, + swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); unsigned int type = swp_type(entry); @@ -1509,7 +1509,7 @@ static bool __swap_entries_free(struct swap_info_struct *si, fallback: for (i = 0; i < nr; i++) { if (data_race(si->swap_map[offset + i])) { - count = __swap_entry_free(si, swp_entry(type, offset + i)); + count = swap_entry_put(si, swp_entry(type, offset + i)); if (count == SWAP_HAS_CACHE) has_cache = true; } else { @@ -1560,7 +1560,7 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, ci = lock_cluster(si, offset); do { - if (!__swap_entry_free_locked(si, offset, usage)) + if (!swap_entry_put_locked(si, offset, usage)) swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); } while (++offset < end); unlock_cluster(ci); @@ -1607,7 +1607,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) swap_entry_range_free(si, ci, entry, size); else { for (int i = 0; i < size; i++, entry.val++) { - if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) + if (!swap_entry_put_locked(si, offset + i, SWAP_HAS_CACHE)) swap_entry_range_free(si, ci, entry, 1); } } @@ -1806,7 +1806,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. */ - any_only_cache = __swap_entries_free(si, entry, nr); + any_only_cache = swap_entries_put_nr(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their @@ -1819,7 +1819,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) * Now go back over the range trying to reclaim the swap cache. This is * more efficient for large folios because we will only try to reclaim * the swap once per folio in the common case. 
If we do - * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the + * swap_entry_put() and __try_to_reclaim_swap() in the same loop, the * latter will get a reference and lock the folio for every individual * page but will only succeed once the swap slot for every subpage is * zero. @@ -3789,7 +3789,7 @@ outer: * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of __swap_entry_free_locked() + * Called while __swap_duplicate() or caller of swap_entry_put_locked() * holds cluster lock. */ static bool swap_count_continued(struct swap_info_struct *si, -- 2.51.0 From 64944ef6a13e774546eb7635bdd26ef963335a41 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:22 +0800 Subject: [PATCH 13/16] mm: swap: enable swap_entry_range_free() to drop any kind of last ref The original VM_BUG_ON only allows swap_entry_range_free() to drop the last SWAP_HAS_CACHE ref. By allowing other kinds of last ref in the VM_BUG_ON, swap_entry_range_free() becomes a more general-purpose function able to handle any kind of last ref. Following this change, also rename swap_entry_range_free() to swap_entries_free() and update its comment accordingly. This is a preparation for using swap_entries_free() to drop kinds of last ref other than SWAP_HAS_CACHE. [shikemeng@huaweicloud.com: add __maybe_unused attribute for swap_is_last_ref() and update comment] Link: https://lkml.kernel.org/r/20250410153908.612984-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20250325162528.68385-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Tested-by: SeongJae Park Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index ea01cf05b0cb..c6b4c74622fc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -52,9 +52,9 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entry_range_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages); +static void swap_entries_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages); + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); @@ -1471,7 +1471,7 @@ static unsigned char swap_entry_put(struct swap_info_struct *si, ci = lock_cluster(si, offset); usage = swap_entry_put_locked(si, offset, 1); if (!usage) - swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + swap_entries_free(si, ci, swp_entry(si->type, offset), 1); unlock_cluster(ci); return usage; @@ -1501,7 +1501,7 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); if (!has_cache) - swap_entry_range_free(si, ci, entry, nr); + swap_entries_free(si, ci, entry, nr); unlock_cluster(ci); return has_cache; @@ -1520,12 +1520,22 @@ fallback: } /* - * Drop the last HAS_CACHE flag of swap entries, caller have to - * ensure all entries belong to the same cgroup.
+ * Check if it's the last ref of swap entry in the freeing path. + * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. */ -static void swap_entry_range_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages) +static inline bool __maybe_unused swap_is_last_ref(unsigned char count) +{ + return (count == SWAP_HAS_CACHE) || (count == 1) || + (count == SWAP_MAP_SHMEM); +} + +/* + * Drop the last ref of swap entries, caller have to ensure all entries + * belong to the same cgroup and cluster. + */ +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; @@ -1538,7 +1548,7 @@ static void swap_entry_range_free(struct swap_info_struct *si, ci->count -= nr_pages; do { - VM_BUG_ON(*map != SWAP_HAS_CACHE); + VM_BUG_ON(!swap_is_last_ref(*map)); *map = 0; } while (++map < map_end); @@ -1561,7 +1571,7 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, ci = lock_cluster(si, offset); do { if (!swap_entry_put_locked(si, offset, usage)) - swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + swap_entries_free(si, ci, swp_entry(si->type, offset), 1); } while (++offset < end); unlock_cluster(ci); } @@ -1604,11 +1614,11 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) ci = lock_cluster(si, offset); if (swap_only_has_cache(si, offset, size)) - swap_entry_range_free(si, ci, entry, size); + swap_entries_free(si, ci, entry, size); else { for (int i = 0; i < size; i++, entry.val++) { if (!swap_entry_put_locked(si, offset + i, SWAP_HAS_CACHE)) - swap_entry_range_free(si, ci, entry, 1); + swap_entries_free(si, ci, entry, 1); } } unlock_cluster(ci); -- 2.51.0 From 835b868878d0127bd29b8c0009dc424a63dadffb Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:23 +0800 Subject: [PATCH 14/16] mm: swap: use swap_entries_free() to free swap entry in swap_entry_put_locked() In swap_entry_put_locked(), we set the slot to SWAP_HAS_CACHE before using swap_entries_free() to do the actual swap entry freeing. This introduces an unnecessary intermediate state. By using swap_entries_free() in swap_entry_put_locked(), we can eliminate the need to set the slot to SWAP_HAS_CACHE. This change makes the behavior of swap_entry_put_locked() more consistent with other put() operations, which do the actual free work after putting the last reference.
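A minimal model of the difference (userspace sketch, invented names): the last put clears the slot directly instead of first parking it in an intermediate SWAP_HAS_CACHE-like state that a second step then has to free.

  #include <assert.h>

  static void entries_free(unsigned char *map)
  {
          *map = 0;                       /* the slot is really gone      */
  }

  static unsigned char entry_put_locked(unsigned char *map)
  {
          unsigned char count = *map - 1;

          if (count)
                  *map = count;           /* still referenced             */
          else
                  entries_free(map);      /* freed directly, no detour
                                             through an intermediate value */
          return count;
  }

  int main(void)
  {
          unsigned char slot = 1;         /* a single remaining map count */

          entry_put_locked(&slot);
          assert(slot == 0);              /* never parked in a placeholder
                                             state awaiting a second pass  */
          return 0;
  }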
Link: https://lkml.kernel.org/r/20250325162528.68385-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Kairui Song Reviewed-by: Baoquan He Signed-off-by: Andrew Morton --- mm/swapfile.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index c6b4c74622fc..f0ba27db9b3e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1356,9 +1356,11 @@ out: } static unsigned char swap_entry_put_locked(struct swap_info_struct *si, - unsigned long offset, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned char usage) { + unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; @@ -1390,7 +1392,7 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si, if (usage) WRITE_ONCE(si->swap_map[offset], usage); else - WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); + swap_entries_free(si, ci, entry, 1); return usage; } @@ -1469,9 +1471,7 @@ static unsigned char swap_entry_put(struct swap_info_struct *si, unsigned char usage; ci = lock_cluster(si, offset); - usage = swap_entry_put_locked(si, offset, 1); - if (!usage) - swap_entries_free(si, ci, swp_entry(si->type, offset), 1); + usage = swap_entry_put_locked(si, ci, entry, 1); unlock_cluster(ci); return usage; @@ -1570,8 +1570,8 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, ci = lock_cluster(si, offset); do { - if (!swap_entry_put_locked(si, offset, usage)) - swap_entries_free(si, ci, swp_entry(si->type, offset), 1); + swap_entry_put_locked(si, ci, swp_entry(si->type, offset), + usage); } while (++offset < end); unlock_cluster(ci); } @@ -1616,10 +1616,8 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (swap_only_has_cache(si, offset, size)) swap_entries_free(si, ci, entry, size); else { - for (int i = 0; i < size; i++, entry.val++) { - if (!swap_entry_put_locked(si, offset + i, SWAP_HAS_CACHE)) - swap_entries_free(si, ci, entry, 1); - } + for (int i = 0; i < size; i++, entry.val++) + swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); } unlock_cluster(ci); } -- 2.51.0 From 46e0ab2c62067a8d5e62ae90088b1743af83725d Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:24 +0800 Subject: [PATCH 15/16] mm: swap: use swap_entries_free() to drop last ref count in swap_entries_put_nr() Use swap_entries_free() to directly free swap entries when the entries are neither cached nor referenced, without needing to set them to the intermediate SWAP_HAS_CACHE state first.
Link: https://lkml.kernel.org/r/20250325162528.68385-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f0ba27db9b3e..f8fe507be4f3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1498,10 +1498,11 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, unlock_cluster(ci); goto fallback; } - for (i = 0; i < nr; i++) - WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); if (!has_cache) swap_entries_free(si, ci, entry, nr); + else + for (i = 0; i < nr; i++) + WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); unlock_cluster(ci); return has_cache; -- 2.51.0 From f2252acf444764e7de7380201c518ee5ab6fd9da Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:25 +0800 Subject: [PATCH 16/16] mm: swap: drop last SWAP_MAP_SHMEM flag in batch in swap_entries_put_nr() SWAP_MAP_SHMEM indicates that the last map is from shmem. Therefore, we can drop SWAP_MAP_SHMEM in batch, in the same way the last ref count is dropped in batch. Link: https://lkml.kernel.org/r/20250325162528.68385-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f8fe507be4f3..668684dc9efa 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -192,7 +192,7 @@ static bool swap_is_last_map(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; unsigned char count = *map; - if (swap_count(count) != 1) + if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM) return false; while (++map < map_end) { @@ -1487,7 +1487,10 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, unsigned char count; int i; - if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) + if (nr <= 1) + goto fallback; + count = swap_count(data_race(si->swap_map[offset])); + if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; /* cross into another cluster */ if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) -- 2.51.0