From cbc091441d3adce9a1fbcd0a0a84bd6abe077a5b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:08 -0700 Subject: [PATCH 01/16] memcg: manually inline __refill_stock There are no more multiple callers of __refill_stock(), so simply inline it to refill_stock(). Link: https://lkml.kernel.org/r/20250404013913.1663035-5-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 34 +++++++++++++--------------------- 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dfb3f14c1178..03a2be6d4a67 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1871,14 +1871,22 @@ static void drain_local_stock(struct work_struct *dummy) obj_cgroup_put(old); } -/* - * Cache charges(val) to local per_cpu area. - * This will be consumed by consume_stock() function, later. - */ -static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock; unsigned int stock_pages; + unsigned long flags; + + VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg)); + + if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) { + /* + * In case of unlikely failure to lock percpu stock_lock + * uncharge memcg directly. + */ + memcg_uncharge(memcg, nr_pages); + return; + } stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached) != memcg) { /* reset if necessary */ @@ -1891,23 +1899,7 @@ static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (stock_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); -} - -static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) -{ - unsigned long flags; - - VM_WARN_ON_ONCE(mem_cgroup_is_root(memcg)); - if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) { - /* - * In case of unlikely failure to lock percpu stock_lock - * uncharge memcg directly. - */ - memcg_uncharge(memcg, nr_pages); - return; - } - __refill_stock(memcg, nr_pages); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); } -- 2.51.0 From b6d0471117daec360bad2b881c1e2d0f4d2c4d3b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:09 -0700 Subject: [PATCH 02/16] memcg: no refilling stock from obj_cgroup_release obj_cgroup_release is called when all the references to the objcg have been released i.e. no more memory objects are pointing to it. Most probably objcg->memcg will be pointing to some ancestor memcg. In obj_cgroup_release(), the kernel calls obj_cgroup_uncharge_pages() which refills the local stock. There is no need to refill the local stock with some ancestor memcg and flush the local stock. Let's decouple obj_cgroup_release() from the local stock by uncharging instead of refilling. One additional benefit of this change is that it removes the requirement to only call obj_cgroup_put() outside of local_lock. 
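To illustrate the idea outside the kernel, here is a minimal user-space sketch (shared_charge, local_stock and the batch size are all invented for this example and are not the memcg code): a hot charge path benefits from parking pages in a local stock for reuse, while a release path that will never charge again can simply return them to the shared counter.

#include <stdio.h>
#include <stdatomic.h>

#define STOCK_BATCH 64

static atomic_long shared_charge;        /* models the shared page counter */
static _Thread_local long local_stock;   /* models the per-CPU stock       */

static void refill_stock(long nr)        /* fast path: keep pages for reuse */
{
        local_stock += nr;
        if (local_stock > STOCK_BATCH) { /* drain the excess */
                atomic_fetch_sub(&shared_charge, local_stock);
                local_stock = 0;
        }
}

static void uncharge_direct(long nr)     /* release path: no reuse expected */
{
        atomic_fetch_sub(&shared_charge, nr);
}

int main(void)
{
        atomic_fetch_add(&shared_charge, 128);
        refill_stock(32);        /* still charged, but reusable locally */
        uncharge_direct(96);     /* bypasses the stock entirely */
        printf("charged=%ld parked=%ld\n",
               atomic_load(&shared_charge), local_stock);
        return 0;
}

The objcg release path is the second case, which is why it now uncharges the (likely ancestor) memcg directly instead of refilling the stock.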
Link: https://lkml.kernel.org/r/20250404013913.1663035-6-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 03a2be6d4a67..df52084e90f4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -129,8 +129,7 @@ bool mem_cgroup_kmem_disabled(void) return cgroup_memory_nokmem; } -static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg, - unsigned int nr_pages); +static void memcg_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages); static void obj_cgroup_release(struct percpu_ref *ref) { @@ -163,8 +162,16 @@ static void obj_cgroup_release(struct percpu_ref *ref) WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1)); nr_pages = nr_bytes >> PAGE_SHIFT; - if (nr_pages) - obj_cgroup_uncharge_pages(objcg, nr_pages); + if (nr_pages) { + struct mem_cgroup *memcg; + + memcg = get_mem_cgroup_from_objcg(objcg); + mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + memcg1_account_kmem(memcg, -nr_pages); + if (!mem_cgroup_is_root(memcg)) + memcg_uncharge(memcg, nr_pages); + mem_cgroup_put(memcg); + } spin_lock_irqsave(&objcg_lock, flags); list_del(&objcg->list); -- 2.51.0 From ae51c775aa2b1fd4fbaf781740ec8762292bec3a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:10 -0700 Subject: [PATCH 03/16] memcg: do obj_cgroup_put inside drain_obj_stock Previously we could not call obj_cgroup_put() inside the local lock because on the put on the last reference, the release function obj_cgroup_release() may try to re-acquire the local lock. However that chain has been broken. Now simply do obj_cgroup_put() inside drain_obj_stock() instead of returning the old objcg. 
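A user-space sketch of the resulting pattern (the mutex and the names are stand-ins for the kernel primitives, not the actual code): once the release callback no longer needs the stock lock, the final put can safely happen while that lock is held.

#include <stdio.h>
#include <pthread.h>
#include <stdatomic.h>

struct objcg {
        atomic_int ref;
};

static pthread_mutex_t stock_lock = PTHREAD_MUTEX_INITIALIZER;

static void objcg_release(struct objcg *o)
{
        /* After the previous change, release only touches shared
         * counters directly and never takes stock_lock again. */
        printf("released %p\n", (void *)o);
}

static void objcg_put(struct objcg *o)
{
        if (atomic_fetch_sub(&o->ref, 1) == 1)
                objcg_release(o);
}

static void drain_obj_stock(struct objcg *cached)
{
        pthread_mutex_lock(&stock_lock);
        /* ... flush cached bytes and stats ... */
        objcg_put(cached);          /* safe now: release won't relock */
        pthread_mutex_unlock(&stock_lock);
}

int main(void)
{
        struct objcg o = { .ref = 1 };
        drain_obj_stock(&o);
        return 0;
}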
Link: https://lkml.kernel.org/r/20250404013913.1663035-7-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 37 +++++++++++-------------------------- 1 file changed, 11 insertions(+), 26 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index df52084e90f4..7988a42b29bf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1785,7 +1785,7 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { }; static DEFINE_MUTEX(percpu_charge_mutex); -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); +static void drain_obj_stock(struct memcg_stock_pcp *stock); static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, struct mem_cgroup *root_memcg); @@ -1859,7 +1859,6 @@ static void drain_stock(struct memcg_stock_pcp *stock) static void drain_local_stock(struct work_struct *dummy) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; unsigned long flags; /* @@ -1870,12 +1869,11 @@ static void drain_local_stock(struct work_struct *dummy) local_lock_irqsave(&memcg_stock.stock_lock, flags); stock = this_cpu_ptr(&memcg_stock); - old = drain_obj_stock(stock); + drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); } static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -1958,18 +1956,16 @@ void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old; unsigned long flags; stock = &per_cpu(memcg_stock, cpu); /* drain_obj_stock requires stock_lock */ local_lock_irqsave(&memcg_stock.stock_lock, flags); - old = drain_obj_stock(stock); + drain_obj_stock(stock); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); drain_stock(stock); - obj_cgroup_put(old); return 0; } @@ -2766,24 +2762,20 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) } /* Replace the stock objcg with objcg, return the old objcg */ -static struct obj_cgroup *replace_stock_objcg(struct memcg_stock_pcp *stock, - struct obj_cgroup *objcg) +static void replace_stock_objcg(struct memcg_stock_pcp *stock, + struct obj_cgroup *objcg) { - struct obj_cgroup *old = NULL; - - old = drain_obj_stock(stock); + drain_obj_stock(stock); obj_cgroup_get(objcg); stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; WRITE_ONCE(stock->cached_objcg, objcg); - return old; } static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; unsigned long flags; int *bytes; @@ -2796,7 +2788,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, * changes. 
*/ if (READ_ONCE(stock->cached_objcg) != objcg) { - old = replace_stock_objcg(stock, objcg); + replace_stock_objcg(stock, objcg); stock->cached_pgdat = pgdat; } else if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ @@ -2837,7 +2829,6 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, __mod_objcg_mlstate(objcg, pgdat, idx, nr); local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) @@ -2859,12 +2850,12 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) return ret; } -static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) +static void drain_obj_stock(struct memcg_stock_pcp *stock) { struct obj_cgroup *old = READ_ONCE(stock->cached_objcg); if (!old) - return NULL; + return; if (stock->nr_bytes) { unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; @@ -2917,11 +2908,7 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) } WRITE_ONCE(stock->cached_objcg, NULL); - /* - * The `old' objects needs to be released by the caller via - * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. - */ - return old; + obj_cgroup_put(old); } static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, @@ -2943,7 +2930,6 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, bool allow_uncharge) { struct memcg_stock_pcp *stock; - struct obj_cgroup *old = NULL; unsigned long flags; unsigned int nr_pages = 0; @@ -2951,7 +2937,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ - old = replace_stock_objcg(stock, objcg); + replace_stock_objcg(stock, objcg); allow_uncharge = true; /* Allow uncharge when objcg changes */ } stock->nr_bytes += nr_bytes; @@ -2962,7 +2948,6 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, } local_unlock_irqrestore(&memcg_stock.stock_lock, flags); - obj_cgroup_put(old); if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); -- 2.51.0 From 42a1910cfd23c46307a000a81127e02df247284c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:11 -0700 Subject: [PATCH 04/16] memcg: use __mod_memcg_state in drain_obj_stock For non-PREEMPT_RT kernels, drain_obj_stock() is always called with irq disabled, so we can use __mod_memcg_state() instead of mod_memcg_state(). For PREEMPT_RT, we need to add memcg_stats_[un]lock in __mod_memcg_state(). 
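The double-underscore convention this relies on, in a stand-alone user-space sketch (a mutex stands in for disabling IRQs, or for the stats lock on PREEMPT_RT; the names are invented): stat_add() manages exclusion itself, while __stat_add() assumes the caller already did.

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t stats_lock = PTHREAD_MUTEX_INITIALIZER;
static long stat_counter;

static void __stat_add(long val)   /* caller provides exclusion */
{
        stat_counter += val;
}

static void stat_add(long val)     /* standalone variant */
{
        pthread_mutex_lock(&stats_lock);
        __stat_add(val);
        pthread_mutex_unlock(&stats_lock);
}

static void drain(void)
{
        pthread_mutex_lock(&stats_lock);
        /* ... other work under the lock ... */
        __stat_add(-1);            /* no need to lock again */
        pthread_mutex_unlock(&stats_lock);
}

int main(void)
{
        stat_add(4);
        drain();
        printf("stat=%ld\n", stat_counter);
        return 0;
}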
Link: https://lkml.kernel.org/r/20250404013913.1663035-8-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Sebastian Andrzej Siewior Reviewed-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7988a42b29bf..33aeddfff0ba 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -710,10 +710,12 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; + memcg_stats_lock(); __this_cpu_add(memcg->vmstats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); memcg_rstat_updated(memcg, val); trace_mod_memcg_state(memcg, idx, val); + memcg_stats_unlock(); } #ifdef CONFIG_MEMCG_V1 @@ -2866,7 +2868,7 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock) memcg = get_mem_cgroup_from_objcg(old); - mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + __mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); memcg1_account_kmem(memcg, -nr_pages); if (!mem_cgroup_is_root(memcg)) memcg_uncharge(memcg, nr_pages); -- 2.51.0 From bc730030f956fe83e94625c546c30ce6bae32d6b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 3 Apr 2025 18:39:12 -0700 Subject: [PATCH 05/16] memcg: combine slab obj stock charging and accounting When handing slab objects, we use obj_cgroup_[un]charge() for (un)charging and mod_objcg_state() to account NR_SLAB_[UN]RECLAIMABLE_B. All these operations use the percpu stock for performance. However with the calls being separate, the stock_lock is taken twice in each case. By refactoring the code, we can turn mod_objcg_state() into __account_obj_stock() which is called on a stock that's already locked and validated. On the charging side we can call this function from consume_obj_stock() when it succeeds, and refill_obj_stock() in the fallback. We just expand parameters of these functions as necessary. The uncharge side from __memcg_slab_free_hook() is just the call to refill_obj_stock(). Other callers of obj_cgroup_[un]charge() (i.e. not slab) simply pass the extra parameters as NULL/zeroes to skip the __account_obj_stock() operation. In __memcg_slab_post_alloc_hook() we now charge each object separately, but that's not a problem as we did call mod_objcg_state() for each object separately, and most allocations are non-bulk anyway. This could be improved by batching all operations until slab_pgdat(slab) changes. Some preliminary benchmarking with a kfree(kmalloc()) loop of 10M iterations with/without __GFP_ACCOUNT: Before the patch: kmalloc/kfree !memcg: 581390144 cycles kmalloc/kfree memcg: 783689984 cycles After the patch: kmalloc/kfree memcg: 658723808 cycles More than half of the overhead of __GFP_ACCOUNT relative to non-accounted case seems eliminated. 
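A simplified model of the refactoring (invented names, a mutex instead of the per-CPU stock lock): the point is simply that one combined helper takes the lock once where two separate helpers took it twice per object.

#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t stock_lock = PTHREAD_MUTEX_INITIALIZER;
static long charged_bytes;
static long slab_stat;

/* before: two lock round-trips per object */
static void charge(long bytes)
{
        pthread_mutex_lock(&stock_lock);
        charged_bytes += bytes;
        pthread_mutex_unlock(&stock_lock);
}

static void account(long bytes)
{
        pthread_mutex_lock(&stock_lock);
        slab_stat += bytes;
        pthread_mutex_unlock(&stock_lock);
}

/* after: one lock round-trip per object */
static void charge_and_account(long bytes)
{
        pthread_mutex_lock(&stock_lock);
        charged_bytes += bytes;
        slab_stat += bytes;
        pthread_mutex_unlock(&stock_lock);
}

int main(void)
{
        charge(64);                /* old scheme: two critical sections */
        account(64);
        charge_and_account(64);    /* new scheme: one critical section */
        printf("charged=%ld stat=%ld\n", charged_bytes, slab_stat);
        return 0;
}

Halving the lock round-trips per accounted object is what the cycle numbers above reflect.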
Link: https://lkml.kernel.org/r/20250404013913.1663035-9-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Signed-off-by: Vlastimil Babka Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 77 +++++++++++++++++++++++++++++-------------------- 1 file changed, 46 insertions(+), 31 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 33aeddfff0ba..3bb02f672e39 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2774,25 +2774,17 @@ static void replace_stock_objcg(struct memcg_stock_pcp *stock, WRITE_ONCE(stock->cached_objcg, objcg); } -static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - enum node_stat_item idx, int nr) +static void __account_obj_stock(struct obj_cgroup *objcg, + struct memcg_stock_pcp *stock, int nr, + struct pglist_data *pgdat, enum node_stat_item idx) { - struct memcg_stock_pcp *stock; - unsigned long flags; int *bytes; - local_lock_irqsave(&memcg_stock.stock_lock, flags); - stock = this_cpu_ptr(&memcg_stock); - /* * Save vmstat data in stock and skip vmstat array update unless - * accumulating over a page of vmstat data or when pgdat or idx - * changes. + * accumulating over a page of vmstat data or when pgdat changes. */ - if (READ_ONCE(stock->cached_objcg) != objcg) { - replace_stock_objcg(stock, objcg); - stock->cached_pgdat = pgdat; - } else if (stock->cached_pgdat != pgdat) { + if (stock->cached_pgdat != pgdat) { /* Flush the existing cached vmstat data */ struct pglist_data *oldpg = stock->cached_pgdat; @@ -2829,11 +2821,10 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, } if (nr) __mod_objcg_mlstate(objcg, pgdat, idx, nr); - - local_unlock_irqrestore(&memcg_stock.stock_lock, flags); } -static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) +static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + struct pglist_data *pgdat, enum node_stat_item idx) { struct memcg_stock_pcp *stock; unsigned long flags; @@ -2845,6 +2836,9 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { stock->nr_bytes -= nr_bytes; ret = true; + + if (pgdat) + __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx); } local_unlock_irqrestore(&memcg_stock.stock_lock, flags); @@ -2929,7 +2923,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, } static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, - bool allow_uncharge) + bool allow_uncharge, int nr_acct, struct pglist_data *pgdat, + enum node_stat_item idx) { struct memcg_stock_pcp *stock; unsigned long flags; @@ -2944,6 +2939,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, } stock->nr_bytes += nr_bytes; + if (pgdat) + __account_obj_stock(objcg, stock, nr_acct, pgdat, idx); + if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) { nr_pages = stock->nr_bytes >> PAGE_SHIFT; stock->nr_bytes &= (PAGE_SIZE - 1); @@ -2955,12 +2953,13 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, obj_cgroup_uncharge_pages(objcg, nr_pages); } -int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +static int obj_cgroup_charge_account(struct obj_cgroup *objcg, gfp_t gfp, size_t size, + struct pglist_data *pgdat, enum node_stat_item idx) { unsigned int nr_pages, nr_bytes; int ret; - if 
(consume_obj_stock(objcg, size)) + if (likely(consume_obj_stock(objcg, size, pgdat, idx))) return 0; /* @@ -2993,15 +2992,21 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) nr_pages += 1; ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages); - if (!ret && nr_bytes) - refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false); + if (!ret && (nr_bytes || pgdat)) + refill_obj_stock(objcg, nr_bytes ? PAGE_SIZE - nr_bytes : 0, + false, size, pgdat, idx); return ret; } +int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) +{ + return obj_cgroup_charge_account(objcg, gfp, size, NULL, 0); +} + void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) { - refill_obj_stock(objcg, size, true); + refill_obj_stock(objcg, size, true, 0, NULL, 0); } static inline size_t obj_full_size(struct kmem_cache *s) @@ -3053,23 +3058,32 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, return false; } - if (obj_cgroup_charge(objcg, flags, size * obj_full_size(s))) - return false; - for (i = 0; i < size; i++) { slab = virt_to_slab(p[i]); if (!slab_obj_exts(slab) && alloc_slab_obj_exts(slab, s, flags, false)) { - obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } + /* + * if we fail and size is 1, memcg_alloc_abort_single() will + * just free the object, which is ok as we have not assigned + * objcg to its obj_ext yet + * + * for larger sizes, kmem_cache_free_bulk() will uncharge + * any objects that were already charged and obj_ext assigned + * + * TODO: we could batch this until slab_pgdat(slab) changes + * between iterations, with a more complicated undo + */ + if (obj_cgroup_charge_account(objcg, flags, obj_full_size(s), + slab_pgdat(slab), cache_vmstat_idx(s))) + return false; + off = obj_to_index(s, slab, p[i]); obj_cgroup_get(objcg); slab_obj_exts(slab)[off].objcg = objcg; - mod_objcg_state(objcg, slab_pgdat(slab), - cache_vmstat_idx(s), obj_full_size(s)); } return true; @@ -3078,6 +3092,8 @@ bool __memcg_slab_post_alloc_hook(struct kmem_cache *s, struct list_lru *lru, void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts) { + size_t obj_size = obj_full_size(s); + for (int i = 0; i < objects; i++) { struct obj_cgroup *objcg; unsigned int off; @@ -3088,9 +3104,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, continue; obj_exts[off].objcg = NULL; - obj_cgroup_uncharge(objcg, obj_full_size(s)); - mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), - -obj_full_size(s)); + refill_obj_stock(objcg, obj_size, true, -obj_size, + slab_pgdat(slab), cache_vmstat_idx(s)); obj_cgroup_put(objcg); } } -- 2.51.0 From ac26920d58224a6d5c78ad3ff0d58fd7faa775c7 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 3 Apr 2025 18:39:13 -0700 Subject: [PATCH 06/16] memcg: manually inline replace_stock_objcg The replace_stock_objcg() is being called by only refill_obj_stock, so manually inline it. 
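The one non-obvious piece of the inlined body is claiming any leftover charged bytes with an atomic exchange. A stand-alone sketch of that claim-and-reset pattern (user-space atomics, invented names, not the kernel code):

#include <stdio.h>
#include <stdatomic.h>

static atomic_long nr_charged_bytes;   /* leftover bytes from elsewhere */

static long claim_leftover(void)
{
        /* plain read first to skip the costlier RMW when empty */
        if (!atomic_load(&nr_charged_bytes))
                return 0;
        /* atomically take everything and reset to zero */
        return atomic_exchange(&nr_charged_bytes, 0);
}

int main(void)
{
        long got;

        atomic_store(&nr_charged_bytes, 300);
        got = claim_leftover();
        printf("claimed %ld, left %ld\n",
               got, atomic_load(&nr_charged_bytes));
        return 0;
}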
Link: https://lkml.kernel.org/r/20250404013913.1663035-10-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3bb02f672e39..aebb1f2c8657 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2763,17 +2763,6 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) obj_cgroup_put(objcg); } -/* Replace the stock objcg with objcg, return the old objcg */ -static void replace_stock_objcg(struct memcg_stock_pcp *stock, - struct obj_cgroup *objcg) -{ - drain_obj_stock(stock); - obj_cgroup_get(objcg); - stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) - ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; - WRITE_ONCE(stock->cached_objcg, objcg); -} - static void __account_obj_stock(struct obj_cgroup *objcg, struct memcg_stock_pcp *stock, int nr, struct pglist_data *pgdat, enum node_stat_item idx) @@ -2934,7 +2923,12 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock = this_cpu_ptr(&memcg_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ - replace_stock_objcg(stock, objcg); + drain_obj_stock(stock); + obj_cgroup_get(objcg); + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; + WRITE_ONCE(stock->cached_objcg, objcg); + allow_uncharge = true; /* Allow uncharge when objcg changes */ } stock->nr_bytes += nr_bytes; -- 2.51.0 From 9c1c38bcdc9215f0bd231591b662bd0fbe8b7045 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:21 +0800 Subject: [PATCH 07/16] mm: swap: rename __swap_[entry/entries]_free[_locked] to swap_[entry/entries]_put[_locked] Patch series "Minor cleanups and improvements to swap freeing code", v4. This series contains some cleanups and improvements made while studying the swapfile code. Here is a summary of the changes: 1. Function naming improvements. - Use "put" instead of "free" to name functions which only do the actual free when the count drops to zero. - Use "entry" to name functions that free only one swap slot. Use "entries" to name functions that may free multiple swap slots within one cluster. Use the "_nr" suffix to name functions that may free multiple swap slots spanning multiple clusters. 2. Eliminate the need to set a swap slot to the intermediate SWAP_HAS_CACHE value before the actual free by using swap_entry_range_free(). 3. Add helpers swap_entries_put_map() and swap_entries_put_cache() as general-purpose routines to free swap entries within a single cluster, which try batch-remove first and fall back to putting each entry individually with the cluster lock acquired/released only once. By using these helpers, we can remove repeated code, leverage batch-remove in more cases and avoid acquiring/releasing the cluster lock for each single swap entry. This patch (of 8): In __swap_entry_free[_locked] and __swap_entries_free, we decrease the count first and only free the swap entry if the count drops to zero. This behavior is more akin to a put() operation rather than a free() operation. Therefore, rename these functions with "put" instead of "free". Additionally, add the "_nr" suffix to swap_entries_put to indicate the input range may span swap clusters.
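The naming rule, shown on a toy reference-counted type (not the swap code): free() releases unconditionally, while put() only drops a reference and frees when the count reaches zero.

#include <stdio.h>
#include <stdlib.h>

struct ref_obj {
        int count;
};

static void obj_free(struct ref_obj *o)        /* always releases */
{
        free(o);
}

static void obj_put(struct ref_obj *o)         /* releases at count 0 */
{
        if (--o->count == 0)
                obj_free(o);
}

int main(void)
{
        struct ref_obj *o = malloc(sizeof(*o));

        o->count = 2;
        obj_put(o);        /* still alive, count is 1 */
        obj_put(o);        /* last put: actually freed */
        printf("done\n");
        return 0;
}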
Link: https://lkml.kernel.org/r/20250325162528.68385-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20250325162528.68385-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f214843612dc..ea01cf05b0cb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1355,9 +1355,9 @@ out: return NULL; } -static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, - unsigned long offset, - unsigned char usage) +static unsigned char swap_entry_put_locked(struct swap_info_struct *si, + unsigned long offset, + unsigned char usage) { unsigned char count; unsigned char has_cache; @@ -1461,15 +1461,15 @@ put_out: return NULL; } -static unsigned char __swap_entry_free(struct swap_info_struct *si, - swp_entry_t entry) +static unsigned char swap_entry_put(struct swap_info_struct *si, + swp_entry_t entry) { struct swap_cluster_info *ci; unsigned long offset = swp_offset(entry); unsigned char usage; ci = lock_cluster(si, offset); - usage = __swap_entry_free_locked(si, offset, 1); + usage = swap_entry_put_locked(si, offset, 1); if (!usage) swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); unlock_cluster(ci); @@ -1477,8 +1477,8 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si, return usage; } -static bool __swap_entries_free(struct swap_info_struct *si, - swp_entry_t entry, int nr) +static bool swap_entries_put_nr(struct swap_info_struct *si, + swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); unsigned int type = swp_type(entry); @@ -1509,7 +1509,7 @@ static bool __swap_entries_free(struct swap_info_struct *si, fallback: for (i = 0; i < nr; i++) { if (data_race(si->swap_map[offset + i])) { - count = __swap_entry_free(si, swp_entry(type, offset + i)); + count = swap_entry_put(si, swp_entry(type, offset + i)); if (count == SWAP_HAS_CACHE) has_cache = true; } else { @@ -1560,7 +1560,7 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, ci = lock_cluster(si, offset); do { - if (!__swap_entry_free_locked(si, offset, usage)) + if (!swap_entry_put_locked(si, offset, usage)) swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); } while (++offset < end); unlock_cluster(ci); @@ -1607,7 +1607,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) swap_entry_range_free(si, ci, entry, size); else { for (int i = 0; i < size; i++, entry.val++) { - if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) + if (!swap_entry_put_locked(si, offset + i, SWAP_HAS_CACHE)) swap_entry_range_free(si, ci, entry, 1); } } @@ -1806,7 +1806,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. */ - any_only_cache = __swap_entries_free(si, entry, nr); + any_only_cache = swap_entries_put_nr(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their @@ -1819,7 +1819,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) * Now go back over the range trying to reclaim the swap cache. This is * more efficient for large folios because we will only try to reclaim * the swap once per folio in the common case. 
If we do - * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the + * swap_entry_put() and __try_to_reclaim_swap() in the same loop, the * latter will get a reference and lock the folio for every individual * page but will only succeed once the swap slot for every subpage is * zero. @@ -3789,7 +3789,7 @@ outer: * into, carry if so, or else fail until a new continuation page is allocated; * when the original swap_map count is decremented from 0 with continuation, * borrow from the continuation and report whether it still holds more. - * Called while __swap_duplicate() or caller of __swap_entry_free_locked() + * Called while __swap_duplicate() or caller of swap_entry_put_locked() * holds cluster lock. */ static bool swap_count_continued(struct swap_info_struct *si, -- 2.51.0 From 64944ef6a13e774546eb7635bdd26ef963335a41 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:22 +0800 Subject: [PATCH 08/16] mm: swap: enable swap_entry_range_free() to drop any kind of last ref The original VM_BUG_ON only allows swap_entry_range_free() to drop last SWAP_HAS_CACHE ref. By allowing other kind of last ref in VM_BUG_ON, swap_entry_range_free() could be a more general-purpose function able to handle all kind of last ref. Following thi change, also rename swap_entry_range_free() to swap_entries_free() and update it's comment accordingly. This is a preparation to use swap_entries_free() to drop more kind of last ref other than SWAP_HAS_CACHE. [shikemeng@huaweicloud.com: add __maybe_unused attribute for swap_is_last_ref() and update comment] Link: https://lkml.kernel.org/r/20250410153908.612984-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20250325162528.68385-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Tested-by: SeongJae Park Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 38 ++++++++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index ea01cf05b0cb..c6b4c74622fc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -52,9 +52,9 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, unsigned char); static void free_swap_count_continuations(struct swap_info_struct *); -static void swap_entry_range_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages); +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages); static void swap_range_alloc(struct swap_info_struct *si, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); @@ -1471,7 +1471,7 @@ static unsigned char swap_entry_put(struct swap_info_struct *si, ci = lock_cluster(si, offset); usage = swap_entry_put_locked(si, offset, 1); if (!usage) - swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + swap_entries_free(si, ci, swp_entry(si->type, offset), 1); unlock_cluster(ci); return usage; @@ -1501,7 +1501,7 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); if (!has_cache) - swap_entry_range_free(si, ci, entry, nr); + swap_entries_free(si, ci, entry, nr); unlock_cluster(ci); return has_cache; @@ -1520,12 +1520,22 @@ fallback: } /* - * Drop the last HAS_CACHE flag of swap entries, caller have to - * ensure all entries belong to the same cgroup. 
+ * Check if it's the last ref of swap entry in the freeing path. + * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM. */ -static void swap_entry_range_free(struct swap_info_struct *si, - struct swap_cluster_info *ci, - swp_entry_t entry, unsigned int nr_pages) +static inline bool __maybe_unused swap_is_last_ref(unsigned char count) +{ + return (count == SWAP_HAS_CACHE) || (count == 1) || + (count == SWAP_MAP_SHMEM); +} + +/* + * Drop the last ref of swap entries, caller have to ensure all entries + * belong to the same cgroup and cluster. + */ +static void swap_entries_free(struct swap_info_struct *si, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned int nr_pages) { unsigned long offset = swp_offset(entry); unsigned char *map = si->swap_map + offset; @@ -1538,7 +1548,7 @@ static void swap_entry_range_free(struct swap_info_struct *si, ci->count -= nr_pages; do { - VM_BUG_ON(*map != SWAP_HAS_CACHE); + VM_BUG_ON(!swap_is_last_ref(*map)); *map = 0; } while (++map < map_end); @@ -1561,7 +1571,7 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, ci = lock_cluster(si, offset); do { if (!swap_entry_put_locked(si, offset, usage)) - swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1); + swap_entries_free(si, ci, swp_entry(si->type, offset), 1); } while (++offset < end); unlock_cluster(ci); } @@ -1604,11 +1614,11 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) ci = lock_cluster(si, offset); if (swap_only_has_cache(si, offset, size)) - swap_entry_range_free(si, ci, entry, size); + swap_entries_free(si, ci, entry, size); else { for (int i = 0; i < size; i++, entry.val++) { if (!swap_entry_put_locked(si, offset + i, SWAP_HAS_CACHE)) - swap_entry_range_free(si, ci, entry, 1); + swap_entries_free(si, ci, entry, 1); } } unlock_cluster(ci); -- 2.51.0 From 835b868878d0127bd29b8c0009dc424a63dadffb Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:23 +0800 Subject: [PATCH 09/16] mm: swap: use swap_entries_free() to free swap entry in swap_entry_put_locked() In swap_entry_put_locked(), we will set slot to SWAP_HAS_CACHE before using swap_entries_free() to do actual swap entry freeing. This introduce an unnecessary intermediate state. By using swap_entries_free() in swap_entry_put_locked(), we can eliminate the need to set slot to SWAP_HAS_CACHE. This change would make the behavior of swap_entry_put_locked() more consistent with other put() operations which will do actual free work after put last reference. 
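A toy sketch of the calling-convention change (plain array, invented names, not the swap map): previously the locked put left a transient value behind and every caller had to check the result and free; now the locked put frees directly when the count reaches zero.

#include <stdio.h>

#define TRANSIENT 0x40                  /* stand-in for SWAP_HAS_CACHE */

static unsigned char slot_map[2] = { 1, 1 };

static void slot_free(int i)
{
        slot_map[i] = 0;
}

/* old style: leaves a transient value, caller must finish the job */
static unsigned char slot_put_old(int i)
{
        unsigned char left = --slot_map[i];

        if (!left)
                slot_map[i] = TRANSIENT;
        return left;
}

/* new style: does the free itself, no intermediate state */
static unsigned char slot_put_new(int i)
{
        unsigned char left = --slot_map[i];

        if (!left)
                slot_free(i);
        return left;
}

int main(void)
{
        if (!slot_put_old(0))   /* caller-side cleanup required */
                slot_free(0);
        slot_put_new(1);        /* nothing more for the caller to do */
        printf("%d %d\n", slot_map[0], slot_map[1]);
        return 0;
}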
Link: https://lkml.kernel.org/r/20250325162528.68385-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Kairui Song Reviewed-by: Baoquan He Signed-off-by: Andrew Morton --- mm/swapfile.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index c6b4c74622fc..f0ba27db9b3e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1356,9 +1356,11 @@ out: } static unsigned char swap_entry_put_locked(struct swap_info_struct *si, - unsigned long offset, + struct swap_cluster_info *ci, + swp_entry_t entry, unsigned char usage) { + unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; @@ -1390,7 +1392,7 @@ static unsigned char swap_entry_put_locked(struct swap_info_struct *si, if (usage) WRITE_ONCE(si->swap_map[offset], usage); else - WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE); + swap_entries_free(si, ci, entry, 1); return usage; } @@ -1469,9 +1471,7 @@ static unsigned char swap_entry_put(struct swap_info_struct *si, unsigned char usage; ci = lock_cluster(si, offset); - usage = swap_entry_put_locked(si, offset, 1); - if (!usage) - swap_entries_free(si, ci, swp_entry(si->type, offset), 1); + usage = swap_entry_put_locked(si, ci, entry, 1); unlock_cluster(ci); return usage; @@ -1570,8 +1570,8 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, ci = lock_cluster(si, offset); do { - if (!swap_entry_put_locked(si, offset, usage)) - swap_entries_free(si, ci, swp_entry(si->type, offset), 1); + swap_entry_put_locked(si, ci, swp_entry(si->type, offset), + usage); } while (++offset < end); unlock_cluster(ci); } @@ -1616,10 +1616,8 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (swap_only_has_cache(si, offset, size)) swap_entries_free(si, ci, entry, size); else { - for (int i = 0; i < size; i++, entry.val++) { - if (!swap_entry_put_locked(si, offset + i, SWAP_HAS_CACHE)) - swap_entries_free(si, ci, entry, 1); - } + for (int i = 0; i < size; i++, entry.val++) + swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); } unlock_cluster(ci); } -- 2.51.0 From 46e0ab2c62067a8d5e62ae90088b1743af83725d Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:24 +0800 Subject: [PATCH 10/16] mm: swap: use swap_entries_free() drop last ref count in swap_entries_put_nr() Use swap_entries_free() to directly free swap entries when the swap entries are not cached and referenced, without needing to set swap entries to set intermediate SWAP_HAS_CACHE state. 
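A toy model of the batching (invented names, not the swap map): when nothing still caches the entries, the whole contiguous range is freed at once instead of writing a transitional value into every slot first.

#include <stdio.h>
#include <string.h>

#define TRANSIENT 0x40                /* stand-in for SWAP_HAS_CACHE */

static unsigned char slot_map[8];

static void put_range(unsigned int off, unsigned int nr, int has_cache)
{
        if (!has_cache) {
                /* batch free: one range operation */
                memset(slot_map + off, 0, nr);
        } else {
                /* something still caches them: mark individually */
                for (unsigned int i = 0; i < nr; i++)
                        slot_map[off + i] = TRANSIENT;
        }
}

int main(void)
{
        memset(slot_map, 1, sizeof(slot_map));
        put_range(0, 4, 0);     /* freed in one go */
        put_range(4, 4, 1);     /* kept in the transient state */
        for (int i = 0; i < 8; i++)
                printf("%02x ", (unsigned)slot_map[i]);
        printf("\n");
        return 0;
}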
Link: https://lkml.kernel.org/r/20250325162528.68385-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f0ba27db9b3e..f8fe507be4f3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1498,10 +1498,11 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, unlock_cluster(ci); goto fallback; } - for (i = 0; i < nr; i++) - WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); if (!has_cache) swap_entries_free(si, ci, entry, nr); + else + for (i = 0; i < nr; i++) + WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); unlock_cluster(ci); return has_cache; -- 2.51.0 From f2252acf444764e7de7380201c518ee5ab6fd9da Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:25 +0800 Subject: [PATCH 11/16] mm: swap: drop last SWAP_MAP_SHMEM flag in batch in swap_entries_put_nr() SWAP_MAP_SHMEM indicates that the last map is from shmem. Therefore we can drop SWAP_MAP_SHMEM in batch, in the same way the last ref count is dropped in batch. Link: https://lkml.kernel.org/r/20250325162528.68385-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f8fe507be4f3..668684dc9efa 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -192,7 +192,7 @@ static bool swap_is_last_map(struct swap_info_struct *si, unsigned char *map_end = map + nr_pages; unsigned char count = *map; - if (swap_count(count) != 1) + if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM) return false; while (++map < map_end) { @@ -1487,7 +1487,10 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, unsigned char count; int i; - if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1) + if (nr <= 1) + goto fallback; + count = swap_count(data_race(si->swap_map[offset])); + if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; /* cross into another cluster */ if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) goto fallback; -- 2.51.0 From 4d71d9062dd7b2ace56f2351b9f3f06e6c0acf81 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:26 +0800 Subject: [PATCH 12/16] mm: swap: free each cluster individually in swap_entries_put_map_nr() 1. Factor out a general swap_entries_put_map() helper to drop entries belonging to one cluster. If the entries are the last map, free them in batch; otherwise put them with the cluster lock acquired and released only once. 2. Iterate and call swap_entries_put_map() for each cluster in swap_entries_put_nr() to leverage batch-remove for the last map belonging to one cluster and reduce lock acquires/releases in the fallback case. 3. As swap_entries_put_nr() won't handle the SWAP_HAS_CACHE drop, rename it to swap_entries_put_map_nr(). 4. As we no longer drop each entry individually with swap_entry_put(), do the reclaim in free_swap_and_cache_nr(), because swap_entries_put_map_nr() is a general routine to drop references and the reclaim work should only be done in free_swap_and_cache_nr(). Remove the stale comment accordingly.
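The per-cluster walk added here, extracted into a stand-alone sketch (CLUSTER_SIZE and process_chunk() are invented for the example): the first chunk is capped by the distance to the next cluster boundary, every later chunk by the cluster size.

#include <stdio.h>

#define CLUSTER_SIZE 512

static void process_chunk(unsigned long off, int nr)
{
        printf("cluster %lu: offsets [%lu, %lu)\n",
               off / CLUSTER_SIZE, off, off + nr);
}

static void process_range(unsigned long off, int nr)
{
        int chunk, rest = CLUSTER_SIZE - off % CLUSTER_SIZE;

        while (nr) {
                chunk = nr < rest ? nr : rest;
                process_chunk(off, chunk);
                rest = CLUSTER_SIZE;    /* later chunks start aligned */
                nr -= chunk;
                off += chunk;
        }
}

int main(void)
{
        process_range(1000, 600);  /* spans the boundary at 1024 */
        return 0;
}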
Link: https://lkml.kernel.org/r/20250325162528.68385-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 70 +++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 668684dc9efa..4f4fc74239d6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1463,25 +1463,10 @@ put_out: return NULL; } -static unsigned char swap_entry_put(struct swap_info_struct *si, - swp_entry_t entry) -{ - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); - unsigned char usage; - - ci = lock_cluster(si, offset); - usage = swap_entry_put_locked(si, ci, entry, 1); - unlock_cluster(ci); - - return usage; -} - -static bool swap_entries_put_nr(struct swap_info_struct *si, - swp_entry_t entry, int nr) +static bool swap_entries_put_map(struct swap_info_struct *si, + swp_entry_t entry, int nr) { unsigned long offset = swp_offset(entry); - unsigned int type = swp_type(entry); struct swap_cluster_info *ci; bool has_cache = false; unsigned char count; @@ -1492,14 +1477,10 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, count = swap_count(data_race(si->swap_map[offset])); if (count != 1 && count != SWAP_MAP_SHMEM) goto fallback; - /* cross into another cluster */ - if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) - goto fallback; ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { - unlock_cluster(ci); - goto fallback; + goto locked_fallback; } if (!has_cache) swap_entries_free(si, ci, entry, nr); @@ -1511,15 +1492,34 @@ static bool swap_entries_put_nr(struct swap_info_struct *si, return has_cache; fallback: - for (i = 0; i < nr; i++) { - if (data_race(si->swap_map[offset + i])) { - count = swap_entry_put(si, swp_entry(type, offset + i)); - if (count == SWAP_HAS_CACHE) - has_cache = true; - } else { - WARN_ON_ONCE(1); - } + ci = lock_cluster(si, offset); +locked_fallback: + for (i = 0; i < nr; i++, entry.val++) { + count = swap_entry_put_locked(si, ci, entry, 1); + if (count == SWAP_HAS_CACHE) + has_cache = true; + } + unlock_cluster(ci); + return has_cache; + +} + +static bool swap_entries_put_map_nr(struct swap_info_struct *si, + swp_entry_t entry, int nr) +{ + int cluster_nr, cluster_rest; + unsigned long offset = swp_offset(entry); + bool has_cache = false; + + cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER; + while (nr) { + cluster_nr = min(nr, cluster_rest); + has_cache |= swap_entries_put_map(si, entry, cluster_nr); + cluster_rest = SWAPFILE_CLUSTER; + nr -= cluster_nr; + entry.val += cluster_nr; } + return has_cache; } @@ -1818,7 +1818,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) /* * First free all entries in the range. */ - any_only_cache = swap_entries_put_nr(si, entry, nr); + any_only_cache = swap_entries_put_map_nr(si, entry, nr); /* * Short-circuit the below loop if none of the entries had their @@ -1828,13 +1828,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr) goto out; /* - * Now go back over the range trying to reclaim the swap cache. This is - * more efficient for large folios because we will only try to reclaim - * the swap once per folio in the common case. If we do - * swap_entry_put() and __try_to_reclaim_swap() in the same loop, the - * latter will get a reference and lock the folio for every individual - * page but will only succeed once the swap slot for every subpage is - * zero. 
+ * Now go back over the range trying to reclaim the swap cache. */ for (offset = start_offset; offset < end_offset; offset += nr) { nr = 1; -- 2.51.0 From d4f8000bd6b0cc33c9dddd369e72a13c4c080cb1 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:27 +0800 Subject: [PATCH 13/16] mm: swap: factor out helper to drop cache of entries within a single cluster Factor out helper swap_entries_put_cache() from put_swap_folio() to serve as a general-purpose routine for dropping cache flag of entries within a single cluster. Link: https://lkml.kernel.org/r/20250325162528.68385-8-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f4fc74239d6..953dcd99006e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1463,6 +1463,22 @@ put_out: return NULL; } +static void swap_entries_put_cache(struct swap_info_struct *si, + swp_entry_t entry, int nr) +{ + unsigned long offset = swp_offset(entry); + struct swap_cluster_info *ci; + + ci = lock_cluster(si, offset); + if (swap_only_has_cache(si, offset, nr)) + swap_entries_free(si, ci, entry, nr); + else { + for (int i = 0; i < nr; i++, entry.val++) + swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); + } + unlock_cluster(ci); +} + static bool swap_entries_put_map(struct swap_info_struct *si, swp_entry_t entry, int nr) { @@ -1607,8 +1623,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) */ void put_swap_folio(struct folio *folio, swp_entry_t entry) { - unsigned long offset = swp_offset(entry); - struct swap_cluster_info *ci; struct swap_info_struct *si; int size = 1 << swap_entry_order(folio_order(folio)); @@ -1616,14 +1630,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (!si) return; - ci = lock_cluster(si, offset); - if (swap_only_has_cache(si, offset, size)) - swap_entries_free(si, ci, entry, size); - else { - for (int i = 0; i < size; i++, entry.val++) - swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE); - } - unlock_cluster(ci); + swap_entries_put_cache(si, entry, size); } int __swap_count(swp_entry_t entry) -- 2.51.0 From ec9827cd28b13b88517812eb08b13d0ed97ae8f1 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 26 Mar 2025 00:25:28 +0800 Subject: [PATCH 14/16] mm: swap: replace cluster_swap_free_nr() with swap_entries_put_[map/cache]() Replace cluster_swap_free_nr() with swap_entries_put_[map/cache]() to remove repeat code and leverage batch-remove for entries with last flag. After removing cluster_swap_free_nr, only functions with "_nr" suffix could free entries spanning cross clusters. Add corresponding description in comment of swap_entries_put_map_nr() as is first function with "_nr" suffix and have a non-suffix variant function swap_entries_put_map(). 
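A sketch of the locking shape these helpers standardize on (a mutex and invented names stand in for the kernel primitives): take the cluster lock once, handle every entry of the single-cluster range, then unlock, rather than locking per entry.

#include <stdio.h>
#include <pthread.h>

#define CLUSTER_SIZE 512

static pthread_mutex_t cluster_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned char slot_map[CLUSTER_SIZE];

static void put_cache_range(unsigned int off, int nr)
{
        pthread_mutex_lock(&cluster_lock);      /* once per range */
        for (int i = 0; i < nr; i++)
                slot_map[off + i] &= ~0x40;     /* drop the cache bit */
        pthread_mutex_unlock(&cluster_lock);
}

int main(void)
{
        for (int i = 0; i < 8; i++)
                slot_map[i] = 0x41;             /* mapped + cached */
        put_cache_range(0, 8);
        printf("slot 0 after put: %#x\n", (unsigned)slot_map[0]);
        return 0;
}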
Link: https://lkml.kernel.org/r/20250325162528.68385-9-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Tim Chen Reviewed-by: Baoquan He Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 953dcd99006e..b86637cfb17a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1520,6 +1520,11 @@ locked_fallback: } +/* + * Only functions with "_nr" suffix are able to free entries spanning + * cross multi clusters, so ensure the range is within a single cluster + * when freeing entries with functions without "_nr" suffix. + */ static bool swap_entries_put_map_nr(struct swap_info_struct *si, swp_entry_t entry, int nr) { @@ -1581,21 +1586,6 @@ static void swap_entries_free(struct swap_info_struct *si, partial_free_cluster(si, ci); } -static void cluster_swap_free_nr(struct swap_info_struct *si, - unsigned long offset, int nr_pages, - unsigned char usage) -{ - struct swap_cluster_info *ci; - unsigned long end = offset + nr_pages; - - ci = lock_cluster(si, offset); - do { - swap_entry_put_locked(si, ci, swp_entry(si->type, offset), - usage); - } while (++offset < end); - unlock_cluster(ci); -} - /* * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. @@ -1612,7 +1602,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages) while (nr_pages) { nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); - cluster_swap_free_nr(sis, offset, nr, 1); + swap_entries_put_map(sis, swp_entry(sis->type, offset), nr); offset += nr; nr_pages -= nr; } @@ -3658,11 +3648,13 @@ int swapcache_prepare(swp_entry_t entry, int nr) return __swap_duplicate(entry, SWAP_HAS_CACHE, nr); } +/* + * Caller should ensure entries belong to the same folio so + * the entries won't span cross cluster boundary. + */ void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr) { - unsigned long offset = swp_offset(entry); - - cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE); + swap_entries_put_cache(si, entry, nr); } struct swap_info_struct *swp_swap_info(swp_entry_t entry) -- 2.51.0 From f4d1c32489117c9d38206a673d880d23d7d3bb8a Mon Sep 17 00:00:00 2001 From: SoumishDas Date: Tue, 25 Mar 2025 23:43:25 +0530 Subject: [PATCH 15/16] mm: add kernel-doc comment for free_pgd_range() Provide kernel-doc for free_pgd_range() so it's easier to understand what the function does and how it is used. Link: https://lkml.kernel.org/r/20250325181325.5774-1-soumish.das@gmail.com Signed-off-by: SoumishDas Signed-off-by: Andrew Morton --- mm/memory.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 86e7e66e3c5b..f41fac7118ba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -278,8 +278,17 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, p4d_free_tlb(tlb, p4d, start); } -/* - * This function frees user-level page tables of a process. +/** + * free_pgd_range - Unmap and free page tables in the range + * @tlb: the mmu_gather containing pending TLB flush info + * @addr: virtual address start + * @end: virtual address end + * @floor: lowest address boundary + * @ceiling: highest address boundary + * + * This function tears down all user-level page tables in the + * specified virtual address range [@addr..@end). It is part of + * the memory unmap flow. 
*/ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, -- 2.51.0 From 87a929ae4fb4709c577e1d41d6fb13f567a4aa03 Mon Sep 17 00:00:00 2001 From: "Dmitry V. Levin" Date: Mon, 3 Mar 2025 13:19:53 +0200 Subject: [PATCH 16/16] hexagon: add syscall_set_return_value() Patch series "ptrace: introduce PTRACE_SET_SYSCALL_INFO API", v7. PTRACE_SET_SYSCALL_INFO is a generic ptrace API that complements PTRACE_GET_SYSCALL_INFO by letting the ptracer modify details of system calls the tracee is blocked in. This API allows ptracers to obtain and modify system call details in a straightforward and architecture-agnostic way, providing a consistent way of manipulating the system call number and arguments across architectures. As in case of PTRACE_GET_SYSCALL_INFO, PTRACE_SET_SYSCALL_INFO also does not aim to address numerous architecture-specific system call ABI peculiarities, like differences in the number of system call arguments for such system calls as pread64 and preadv. The current implementation supports changing only those bits of system call information that are used by strace system call tampering, namely, syscall number, syscall arguments, and syscall return value. Support of changing additional details returned by PTRACE_GET_SYSCALL_INFO, such as instruction pointer and stack pointer, could be added later if needed, by using struct ptrace_syscall_info.flags to specify the additional details that should be set. Currently, "flags" and "reserved" fields of struct ptrace_syscall_info must be initialized with zeroes; "arch", "instruction_pointer", and "stack_pointer" fields are currently ignored. PTRACE_SET_SYSCALL_INFO currently supports only PTRACE_SYSCALL_INFO_ENTRY, PTRACE_SYSCALL_INFO_EXIT, and PTRACE_SYSCALL_INFO_SECCOMP operations. Other operations could be added later if needed. Ideally, PTRACE_SET_SYSCALL_INFO should have been introduced along with PTRACE_GET_SYSCALL_INFO, but it didn't happen. The last straw that convinced me to implement PTRACE_SET_SYSCALL_INFO was apparent failure to provide an API of changing the first system call argument on riscv architecture [1]. ptrace(2) man page: long ptrace(enum __ptrace_request request, pid_t pid, void *addr, void *data); ... PTRACE_SET_SYSCALL_INFO Modify information about the system call that caused the stop. The "data" argument is a pointer to struct ptrace_syscall_info that specifies the system call information to be set. The "addr" argument should be set to sizeof(struct ptrace_syscall_info)). This patch (of 6): hexagon is the only architecture that provides HAVE_ARCH_TRACEHOOK but doesn't define syscall_set_return_value(). Since this function is going to be needed on all HAVE_ARCH_TRACEHOOK architectures to implement PTRACE_SET_SYSCALL_INFO API, add it on hexagon, too. Link: https://lore.kernel.org/all/59505464-c84a-403d-972f-d4b2055eeaac@gmail.com/ [1] Link: https://lkml.kernel.org/r/20250303111953.GB24170@strace.io Signed-off-by: Dmitry V. Levin Cc: Alexander Gordeev Cc: Alexey Gladkov (Intel) Cc: Andreas Larsson Cc: anton ivanov Cc: Arnd Bergmann Cc: Borislav Betkov Cc: Brian Cain Cc: Charlie Jenkins Cc: Christian Borntraeger Cc: Christian Zankel Cc: Christophe Leroy Cc: Dave Hansen Cc: Davide Berardi Cc: David S. Miller Cc: Dinh Nguyen Cc: Eugene Syromyatnikov Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Maciej W. 
Rozycki Cc: Madhavan Srinivasan Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Frysinger Cc: Naveen N Rao Cc: Nicholas Piggin Cc: Oleg Nesterov Cc: Renzo Davoi Cc: Richard Weinberger Cc: Rich Felker Cc: Russel King Cc: Shuah Khan Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Thomas Gleinxer Cc: Vasily Gorbik Cc: Vineet Gupta Cc: WANG Xuerui Cc: Will Deacon Cc: Yoshinori Sato Cc: Eugene Syromiatnikov Signed-off-by: Andrew Morton --- arch/hexagon/include/asm/syscall.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/hexagon/include/asm/syscall.h b/arch/hexagon/include/asm/syscall.h index f6e454f18038..951ca0ed8376 100644 --- a/arch/hexagon/include/asm/syscall.h +++ b/arch/hexagon/include/asm/syscall.h @@ -45,6 +45,13 @@ static inline long syscall_get_return_value(struct task_struct *task, return regs->r00; } +static inline void syscall_set_return_value(struct task_struct *task, + struct pt_regs *regs, + int error, long val) +{ + regs->r00 = (long) error ?: val; +} + static inline int syscall_get_arch(struct task_struct *task) { return AUDIT_ARCH_HEXAGON; -- 2.51.0
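A stand-alone illustration of the convention the new hexagon helper follows (this is not hexagon code, it only models the idea): the syscall result register holds either the success value or a negative errno, and the GCC/Clang "a ?: b" extension, which the kernel relies on, picks the error whenever it is non-zero.

#include <stdio.h>
#include <errno.h>

static long regs_r00;    /* stand-in for the hexagon return register */

static void set_return_value(int error, long val)
{
        regs_r00 = (long)error ?: val;   /* the error wins if non-zero */
}

int main(void)
{
        set_return_value(0, 42);         /* success: the value */
        printf("r00 = %ld\n", regs_r00);
        set_return_value(-ENOENT, 42);   /* failure: the negative errno */
        printf("r00 = %ld\n", regs_r00);
        return 0;
}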