From 8dfc8cbf7b07da172b3a4c8bea064e84fbfa5d56 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 22:06:08 +0100 Subject: [PATCH 01/16] filemap: convert __readahead_batch() to use a folio Extract folios from i_mapping, not pages. Removes a hidden call to compound_head(), a use of thp_nr_pages() and an unnecessary assertion that we didn't find a tail page in the page cache. Link: https://lkml.kernel.org/r/20250402210612.2444135-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0ddd4bd8cdf8..c5c9b3770d75 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1424,7 +1424,7 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, { unsigned int i = 0; XA_STATE(xas, &rac->mapping->i_pages, 0); - struct page *page; + struct folio *folio; BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count; @@ -1433,13 +1433,12 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, xas_set(&xas, rac->_index); rcu_read_lock(); - xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, rac->_index + rac->_nr_pages - 1) { + if (xas_retry(&xas, folio)) continue; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageTail(page), page); - array[i++] = page; - rac->_batch_count += thp_nr_pages(page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + array[i++] = folio_page(folio, 0); + rac->_batch_count += folio_nr_pages(folio); if (i == array_sz) break; } -- 2.51.0 From 41e422a898da69e903a846dfb2b0c0ff62abc3cd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 22:06:09 +0100 Subject: [PATCH 02/16] filemap: remove readahead_page_batch() This function has no more callers; delete it. Link: https://lkml.kernel.org/r/20250402210612.2444135-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c5c9b3770d75..af25fb640463 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1447,20 +1447,6 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac, return i; } -/** - * readahead_page_batch - Get a batch of pages to read. - * @rac: The current readahead request. - * @array: An array of pointers to struct page. - * - * Context: The pages are locked and have an elevated refcount. The caller - * should decreases the refcount once the page has been submitted for I/O - * and unlock the page once all I/O to that page has completed. - * Return: The number of pages placed in the array. 0 indicates the request - * is complete. - */ -#define readahead_page_batch(rac, array) \ - __readahead_batch(rac, array, ARRAY_SIZE(array)) - /** * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. -- 2.51.0 From 2355153ea8185e7deaf9ce728716dadb707ef59f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 22:06:10 +0100 Subject: [PATCH 03/16] mm: delete thp_nr_pages() All callers now use folio_nr_pages(). Delete this wrapper. 
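For illustration only (a hedged sketch, not code from this series), the caller-side conversion that made the wrapper redundant looks like this:

#include <linux/mm.h>

/*
 * Hypothetical caller: resolve the folio once with page_folio(), then ask
 * the folio directly. This drops the hidden compound_head() that
 * thp_nr_pages(page) used to perform on every call.
 */
static long example_nr_pages(struct page *page)
{
	struct folio *folio = page_folio(page);

	return folio_nr_pages(folio);	/* was: return thp_nr_pages(page); */
}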
Link: https://lkml.kernel.org/r/20250402210612.2444135-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/mm.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index ae67d3a33792..dcdb798184ef 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2223,15 +2223,6 @@ static inline long compound_nr(struct page *page) return folio_large_nr_pages(folio); } -/** - * thp_nr_pages - The number of regular pages in this huge page. - * @page: The head page of a huge page. - */ -static inline long thp_nr_pages(struct page *page) -{ - return folio_nr_pages((struct folio *)page); -} - /** * folio_next - Move to the next physical folio. * @folio: The folio we're currently operating on. -- 2.51.0 From 56e5a103a721d0ef139bba7ff3d3ada6c8217d5b Mon Sep 17 00:00:00 2001 From: Nhat Pham Date: Wed, 2 Apr 2025 13:44:16 -0700 Subject: [PATCH 04/16] zsmalloc: prefer the the original page's node for compressed data Currently, zsmalloc, zswap's and zram's backend memory allocator, does not enforce any policy for the allocation of memory for the compressed data, instead just adopting the memory policy of the task entering reclaim, or the default policy (prefer local node) if no such policy is specified. This can lead to several pathological behaviors in multi-node NUMA systems: 1. Systems with CXL-based memory tiering can encounter the following inversion with zswap/zram: the coldest pages demoted to the CXL tier can return to the high tier when they are reclaimed to compressed swap, creating memory pressure on the high tier. 2. Consider a direct reclaimer scanning nodes in order of allocation preference. If it ventures into remote nodes, the memory it compresses there should stay there. Trying to shift those contents over to the reclaiming thread's preferred node further *increases* its local pressure, and provoking more spills. The remote node is also the most likely to refault this data again. This undesirable behavior was pointed out by Johannes Weiner in [1]. 3. For zswap writeback, the zswap entries are organized in node-specific LRUs, based on the node placement of the original pages, allowing for targeted zswap writeback for specific nodes. However, the compressed data of a zswap entry can be placed on a different node from the LRU it is placed on. This means that reclaim targeted at one node might not free up memory used for zswap entries in that node, but instead reclaiming memory in a different node. All of these issues will be resolved if the compressed data go to the same node as the original page. This patch encourages this behavior by having zswap and zram pass the node of the original page to zsmalloc, and have zsmalloc prefer the specified node if we need to allocate new (zs)pages for the compressed data. Note that we are not strictly binding the allocation to the preferred node. We still allow the allocation to fall back to other nodes when the preferred node is full, or if we have zspages with slots available on a different node. This is OK, and still a strict improvement over the status quo: 1. On a system with demotion enabled, we will generally prefer demotions over compressed swapping, and only swap when pages have already gone to the lowest tier. This patch should achieve the desired effect for the most part. 2. 
If the preferred node is out of memory, letting the compressed data going to other nodes can be better than the alternative (OOMs, keeping cold memory unreclaimed, disk swapping, etc.). 3. If the allocation go to a separate node because we have a zspage with slots available, at least we're not creating extra immediate memory pressure (since the space is already allocated). 3. While there can be mixings, we generally reclaim pages in same-node batches, which encourage zspage grouping that is more likely to go to the right node. 4. A strict binding would require partitioning zsmalloc by node, which is more complicated, and more prone to regression, since it reduces the storage density of zsmalloc. We need to evaluate the tradeoff and benchmark carefully before adopting such an involved solution. [1]: https://lore.kernel.org/linux-mm/20250331165306.GC2110528@cmpxchg.org/ [senozhatsky@chromium.org: coding-style fixes] Link: https://lkml.kernel.org/r/mnvexa7kseswglcqbhlot4zg3b3la2ypv2rimdl5mh5glbmhvz@wi6bgqn47hge Link: https://lkml.kernel.org/r/20250402204416.3435994-1-nphamcs@gmail.com Signed-off-by: Nhat Pham Suggested-by: Gregory Price Acked-by: Dan Williams Reviewed-by: Chengming Zhou Acked-by: Sergey Senozhatsky [zram, zsmalloc] Acked-by: Johannes Weiner Acked-by: Yosry Ahmed [zswap/zsmalloc] Cc: "Huang, Ying" Cc: Joanthan Cameron Cc: Minchan Kim Cc: SeongJae Park Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 11 ++++++++--- include/linux/zpool.h | 4 ++-- include/linux/zsmalloc.h | 3 ++- mm/zpool.c | 8 +++++--- mm/zsmalloc.c | 20 +++++++++++--------- mm/zswap.c | 2 +- 6 files changed, 29 insertions(+), 19 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index fda7d8624889..0ba18277ed7b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1694,7 +1694,7 @@ static int write_incompressible_page(struct zram *zram, struct page *page, */ handle = zs_malloc(zram->mem_pool, PAGE_SIZE, GFP_NOIO | __GFP_NOWARN | - __GFP_HIGHMEM | __GFP_MOVABLE); + __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page)); if (IS_ERR_VALUE(handle)) return PTR_ERR((void *)handle); @@ -1761,7 +1761,7 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) handle = zs_malloc(zram->mem_pool, comp_len, GFP_NOIO | __GFP_NOWARN | - __GFP_HIGHMEM | __GFP_MOVABLE); + __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page)); if (IS_ERR_VALUE(handle)) { zcomp_stream_put(zstrm); return PTR_ERR((void *)handle); @@ -1981,10 +1981,15 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, * We are holding per-CPU stream mutex and entry lock so better * avoid direct reclaim. Allocation error is not fatal since * we still have the old object in the mem_pool. + * + * XXX: technically, the node we really want here is the node that holds + * the original compressed data. But that would require us to modify + * zsmalloc API to return this information. For now, we will make do with + * the node of the page allocated for recompression. 
*/ handle_new = zs_malloc(zram->mem_pool, comp_len_new, GFP_NOIO | __GFP_NOWARN | - __GFP_HIGHMEM | __GFP_MOVABLE); + __GFP_HIGHMEM | __GFP_MOVABLE, page_to_nid(page)); if (IS_ERR_VALUE(handle_new)) { zcomp_stream_put(zstrm); return PTR_ERR((void *)handle_new); diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 52f30e526607..369ef068fad8 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -22,7 +22,7 @@ const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, - unsigned long *handle); + unsigned long *handle, const int nid); void zpool_free(struct zpool *pool, unsigned long handle); @@ -64,7 +64,7 @@ struct zpool_driver { void (*destroy)(void *pool); int (*malloc)(void *pool, size_t size, gfp_t gfp, - unsigned long *handle); + unsigned long *handle, const int nid); void (*free)(void *pool, unsigned long handle); void *(*obj_read_begin)(void *pool, unsigned long handle, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index c26baf9fb331..13e9cc5490f7 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -26,7 +26,8 @@ struct zs_pool; struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); -unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags, + const int nid); void zs_free(struct zs_pool *pool, unsigned long obj); size_t zs_huge_class_size(struct zs_pool *pool); diff --git a/mm/zpool.c b/mm/zpool.c index 6d6d88930932..0a71d03369f1 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -226,20 +226,22 @@ const char *zpool_get_type(struct zpool *zpool) * @size: The amount of memory to allocate. * @gfp: The GFP flags to use when allocating memory. * @handle: Pointer to the handle to set + * @nid: The preferred node id. * * This allocates the requested amount of memory from the pool. * The gfp flags will be used when allocating memory, if the * implementation supports it. The provided @handle will be - * set to the allocated object handle. + * set to the allocated object handle. The allocation will + * prefer the NUMA node specified by @nid. * * Implementations must guarantee this to be thread-safe. * * Returns: 0 on success, negative value on error. 
*/ int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, - unsigned long *handle) + unsigned long *handle, const int nid) { - return zpool->driver->malloc(zpool->pool, size, gfp, handle); + return zpool->driver->malloc(zpool->pool, size, gfp, handle, nid); } /** diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d14a7e317ac8..513b08c7c941 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -243,9 +243,9 @@ static inline void zpdesc_dec_zone_page_state(struct zpdesc *zpdesc) dec_zone_page_state(zpdesc_page(zpdesc), NR_ZSPAGES); } -static inline struct zpdesc *alloc_zpdesc(gfp_t gfp) +static inline struct zpdesc *alloc_zpdesc(gfp_t gfp, const int nid) { - struct page *page = alloc_page(gfp); + struct page *page = alloc_pages_node(nid, gfp, 0); return page_zpdesc(page); } @@ -462,9 +462,9 @@ static void zs_zpool_destroy(void *pool) } static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, - unsigned long *handle) + unsigned long *handle, const int nid) { - *handle = zs_malloc(pool, size, gfp); + *handle = zs_malloc(pool, size, gfp, nid); if (IS_ERR_VALUE(*handle)) return PTR_ERR((void *)*handle); @@ -1043,8 +1043,8 @@ static void create_page_chain(struct size_class *class, struct zspage *zspage, * Allocate a zspage for the given size class */ static struct zspage *alloc_zspage(struct zs_pool *pool, - struct size_class *class, - gfp_t gfp) + struct size_class *class, + gfp_t gfp, const int nid) { int i; struct zpdesc *zpdescs[ZS_MAX_PAGES_PER_ZSPAGE]; @@ -1061,7 +1061,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, for (i = 0; i < class->pages_per_zspage; i++) { struct zpdesc *zpdesc; - zpdesc = alloc_zpdesc(gfp); + zpdesc = alloc_zpdesc(gfp, nid); if (!zpdesc) { while (--i >= 0) { zpdesc_dec_zone_page_state(zpdescs[i]); @@ -1336,12 +1336,14 @@ static unsigned long obj_malloc(struct zs_pool *pool, * @pool: pool to allocate from * @size: size of block to allocate * @gfp: gfp flags when allocating object + * @nid: The preferred node id to allocate new zspage (if needed) * * On success, handle to the allocated object is returned, * otherwise an ERR_PTR(). * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. 
*/ -unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp, + const int nid) { unsigned long handle; struct size_class *class; @@ -1376,7 +1378,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) spin_unlock(&class->lock); - zspage = alloc_zspage(pool, class, gfp); + zspage = alloc_zspage(pool, class, gfp, nid); if (!zspage) { cache_free_handle(pool, handle); return (unsigned long)ERR_PTR(-ENOMEM); diff --git a/mm/zswap.c b/mm/zswap.c index 204fb59da33c..455e9425c5f5 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -981,7 +981,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, zpool = pool->zpool; gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE; - alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle); + alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle, page_to_nid(page)); if (alloc_ret) goto unlock; -- 2.51.0 From a75ffa26122bba4a3fa9372aa13b32d7a5590425 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 2 Apr 2025 11:01:17 +0200 Subject: [PATCH 05/16] memcg, oom: do not bypass oom killer for dying tasks 7775face2079 ("memcg: killed threads should not invoke memcg OOM killer") has added a bypass of the oom killer path for dying threads because a very specific workload (described in the changelog) could hit "no killable tasks" path. This itself is not fatal condition but it could be annoying if this was a common case. On the other hand the bypass has some issues on its own. Without triggering oom killer we won't be able to trigger async oom reclaim (oom_reaper) which can operate on killed tasks as well as long as they still have their mm available. This could be the case during futex cleanup when the memory as pointed out by Johannes in [1]. The said case is still not fully understood but let's drop this bypass that was mostly driven by an artificial workload and allow dying tasks to go into oom path. This will make the code easier to reason about and also help corner cases where oom_reaper could help to release memory. Link: https://lore.kernel.org/all/20241212183012.GB1026@cmpxchg.org/T/#u [1] Link: https://lkml.kernel.org/r/20250402090117.130245-1-mhocko@kernel.org Signed-off-by: Michal Hocko Signed-off-by: Johannes Weiner Suggested-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: David Rientjes Cc: Muchun Song Cc: Rik van Riel Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c96c1f2b9cf5..b16b5b807d7c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1664,7 +1664,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, * A few threads which were not waiting at mutex_lock_killable() can * fail to bail out. Therefore, check again after holding oom_lock. */ - ret = task_is_dying() || out_of_memory(&oc); + ret = out_of_memory(&oc); unlock: mutex_unlock(&oom_lock); -- 2.51.0 From cd348c5e6af3b4220eb37a08fdaf9c924a2e1c19 Mon Sep 17 00:00:00 2001 From: Songtang Liu Date: Wed, 2 Apr 2025 03:41:16 -0400 Subject: [PATCH 06/16] mm: page_alloc: remove redundant READ_ONCE In the current code, batch is a local variable, and it cannot be concurrently modified. It's unnecessary to use READ_ONCE here, so remove it. 
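For context, a small stand-alone C sketch (user-space, with a stand-in READ_ONCE() macro; not kernel code) of why the annotation only matters for shared memory and not for a function-local snapshot like batch:

#include <stdio.h>

/* Stand-in for the kernel's READ_ONCE(): force a single, untorn load. */
#define READ_ONCE(x) (*(const volatile __typeof__(x) *)&(x))

static int shared_batch = 64;		/* may be updated by other threads */

static void commit(void)
{
	/* Snapshot the shared value exactly once ... */
	int batch = READ_ONCE(shared_batch);

	/*
	 * ... after that, 'batch' lives only on this stack frame. Nothing
	 * can change it concurrently, so wrapping later uses of it in
	 * READ_ONCE() buys nothing -- the situation this patch cleans up.
	 */
	if (batch >= 32)
		printf("using batch=%d\n", batch);
}

int main(void)
{
	commit();
	return 0;
}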
Link: https://lkml.kernel.org/r/CAA=HWd1kn01ym8YuVFuAqK2Ggq3itEGkqX8T6eCXs_C7tiv-Jw@mail.gmail.com Fixes: 51a755c56dc0 ("mm: tune PCP high automatically") Signed-off-by: Songtang Liu Reviewed-by: Qi Zheng Reviewed-by: Huang Ying Reviewed-by: Anshuman Khandual Reviewed-by: Pankaj Gupta Reviewed-by: Oscar Salvador Reviewed-by: Vishal Moola (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8258349e49ac..1ee0e8265888 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2675,7 +2675,7 @@ static void free_frozen_page_commit(struct zone *zone, free_high = (pcp->free_count >= batch && (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) && (!(pcp->flags & PCPF_FREE_HIGH_BATCH) || - pcp->count >= READ_ONCE(batch))); + pcp->count >= batch)); pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER; } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) { pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER; -- 2.51.0 From 737e9d021993c99377bb61e762c0d6945a37615b Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Mon, 27 Jan 2025 10:34:03 -0500 Subject: [PATCH 07/16] memory: implement memory_block_advise/probe_max_size Patch series "memory,x86,acpi: hotplug memory alignment advisement", v8. When physical address regions are not aligned to memory block size, the misaligned portion is lost (stranded capacity). Block size (min/max/selected) is architecture defined. Most architectures tend to use the minimum block size or some simplistic heuristic. On x86, memory block size increases up to 2GB, and is otherwise fitted to the alignment of non-hotplug (i.e. not special purpose memory). CXL exposes its memory for management through the ACPI CEDT (CXL Early Discovery Table) in a field called the CXL Fixed Memory Window. Per the CXL specification, this memory must be aligned to at least 256MB. When a CFMW aligns on a size less than the block size, this causes a loss of up to 2GB per CFMW on x86. It is not uncommon for CFMW to be allocated per-device - though this behavior is BIOS defined. This patch set provides 3 things: 1) implement advise/query functions in drivers/base/memory.c to report/query architecture agnostic hotplug block alignment advice. 2) update x86 memblock size logic to consider the hotplug advice 3) add code in acpi/numa/srat.c to report CFMW alignment advice The advisement interfaces are designed to be called during arch_init code prior to allocator and smp_init. start_kernel will call these through setup_arch() (via acpi and mm/init_64.c on x86), which occurs prior to mm_core_init and smp_init - so no need for atomics. There's an attempt to signal callers to advise() that query has already occurred, but this is predicated on the notion that query actually occurs (which presently only happens on the x86 arch). This is to assist debugging future users. Otherwise, the advise() call has been marked __init to help static discovery of bad call times. Once query is called the first time, it will always return the same value. Interfaces return -ENODEV and 0 respectively on systems without hotplug. This patch (of 3): Hotplug memory sources may have opinions on what the memblock size should be - usually for alignment purposes. For example, CXL memory extents can be 256MB with a matching alignment. If this size/alignment is smaller than the block size, it can result in stranded capacity. Implement memory_block_advise_max_size for use prior to allocator init, for software to advise the system on the max block size.
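Concretely, a user-space model of the advise/probe contract (a hedged sketch of the semantics only; the real code is the drivers/base/memory.c hunk further below):

#include <stdio.h>
#include <errno.h>
#include <stdbool.h>

static unsigned long advised_size;	/* 0 means "no advice given" */
static bool advised_size_queried;

/* Advice can only shrink the value, and is rejected once probed. */
static int advise_max_size(unsigned long size)
{
	if (!size || (size & (size - 1)))	/* must be a power of two */
		return -EINVAL;
	if (advised_size_queried)
		return -EBUSY;
	if (!advised_size || size < advised_size)
		advised_size = size;
	return 0;
}

/* The first probe latches the value; later advice can no longer change it. */
static unsigned long probe_max_size(void)
{
	advised_size_queried = true;
	return advised_size;
}

int main(void)
{
	advise_max_size(2UL << 30);	/* one source is happy with 2G ... */
	advise_max_size(256UL << 20);	/* ... a CFMWS wants 256M alignment */
	printf("advised max block size: %lu MiB\n", probe_max_size() >> 20);
	printf("late advise returns %d\n", advise_max_size(1UL << 30));
	return 0;
}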
Implement memory_block_probe_max_size for use by arch init code to calculate the best block size. Use of advice is architecture defined. The probe value can never change after first probe. Calls to advise after probe will return -EBUSY to aid debugging. On systems without hotplug, always return -ENODEV and 0 respectively. Link: https://lkml.kernel.org/r/20250127153405.3379117-1-gourry@gourry.net Link: https://lkml.kernel.org/r/20250127153405.3379117-2-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: Ira Weiny Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Acked-by: Dan Williams Tested-by: Fan Ni Reviewed-by: Ira Weiny Acked-by: Oscar Salvador Cc: Alison Schofield Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Bruno Faccini Cc: Dave Hansen Cc: Dave Jiang Cc: Greg Kroah-Hartman Cc: Haibo Xu Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joanthan Cameron Cc: Len Brown Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Robert Richter Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- drivers/base/memory.c | 51 ++++++++++++++++++++++++++++++++++++++++++ include/linux/memory.h | 10 +++++++++ 2 files changed, 61 insertions(+) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 19469e7f88c2..ed3e69dc785c 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -110,6 +110,57 @@ static void memory_block_release(struct device *dev) kfree(mem); } + +/* Max block size to be set by memory_block_advise_max_size */ +static unsigned long memory_block_advised_size; +static bool memory_block_advised_size_queried; + +/** + * memory_block_advise_max_size() - advise memory hotplug on the max suggested + * block size, usually for alignment. + * @size: suggestion for maximum block size. must be aligned on power of 2. + * + * Early boot software (pre-allocator init) may advise archs on the max block + * size. This value can only decrease after initialization, as the intent is + * to identify the largest supported alignment for all sources. + * + * Use of this value is arch-defined, as is min/max block size. + * + * Return: 0 on success + * -EINVAL if size is 0 or not pow2 aligned + * -EBUSY if value has already been probed + */ +int __init memory_block_advise_max_size(unsigned long size) +{ + if (!size || !is_power_of_2(size)) + return -EINVAL; + + if (memory_block_advised_size_queried) + return -EBUSY; + + if (memory_block_advised_size) + memory_block_advised_size = min(memory_block_advised_size, size); + else + memory_block_advised_size = size; + + return 0; +} + +/** + * memory_block_advised_max_size() - query advised max hotplug block size. + * + * After the first call, the value can never change. Callers looking for the + * actual block size should use memory_block_size_bytes. This interface is + * intended for use by arch-init when initializing the hotplug block size. + * + * Return: advised size in bytes, or 0 if never set. 
+ */ +unsigned long memory_block_advised_max_size(void) +{ + memory_block_advised_size_queried = true; + return memory_block_advised_size; +} + unsigned long __weak memory_block_size_bytes(void) { return MIN_MEMORY_BLOCK_SIZE; diff --git a/include/linux/memory.h b/include/linux/memory.h index 12daa6ec7d09..5ec4e6d209b9 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -149,6 +149,14 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) { return 0; } +static inline int memory_block_advise_max_size(unsigned long size) +{ + return -ENODEV; +} +static inline unsigned long memory_block_advised_max_size(void) +{ + return 0; +} #else /* CONFIG_MEMORY_HOTPLUG */ extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); @@ -181,6 +189,8 @@ int walk_dynamic_memory_groups(int nid, walk_memory_groups_func_t func, void memory_block_add_nid(struct memory_block *mem, int nid, enum meminit_context context); #endif /* CONFIG_NUMA */ +int memory_block_advise_max_size(unsigned long size); +unsigned long memory_block_advised_max_size(void); #endif /* CONFIG_MEMORY_HOTPLUG */ /* -- 2.51.0 From b1143537098b33affd4e50cec2c10e4b3858d93c Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Mon, 27 Jan 2025 10:34:04 -0500 Subject: [PATCH 08/16] x86: probe memory block size advisement value during mm init Systems with hotplug may provide an advisement value on what the memblock size should be. Probe this value when the rest of the configuration values are considered. The new heuristic is as follows 1) set_memory_block_size_order value if already set (cmdline param) 2) minimum block size if memory is less than large block limit 3) if no hotplug advice: Max block size if system is bare-metal, otherwise use end of memory alignment. 4) if hotplug advice: lesser of advice and end of memory alignment. Convert to cpu_feature_enabled() while at it.[1] [1] https://lore.kernel.org/all/20241031103401.GBZyNdGQ-ZyXKyzC_z@fat_crate.local/ Link: https://lkml.kernel.org/r/20250127153405.3379117-3-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: Borislav Petkov Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: Dave Hansen Acked-by: Mike Rapoport (Microsoft) Acked-by: Dan Williams Tested-by: Fan Ni Reviewed-by: Ira Weiny Acked-by: Oscar Salvador Cc: Alison Schofield Cc: Andy Lutomirski Cc: Bruno Faccini Cc: Dave Jiang Cc: Greg Kroah-Hartman Cc: Haibo Xu Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joanthan Cameron Cc: Len Brown Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Robert Richter Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- arch/x86/mm/init_64.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7c4f6f591f2b..8cea7dcf8fa0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1464,16 +1464,21 @@ static unsigned long probe_memory_block_size(void) } /* - * Use max block size to minimize overhead on bare metal, where - * alignment for memory hotplug isn't a concern. + * When hotplug alignment is not a concern, maximize blocksize + * to minimize overhead. Otherwise, align to the lesser of advice + * alignment and end of memory alignment. 
*/ - if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) { + bz = memory_block_advised_max_size(); + if (!bz) { bz = MAX_BLOCK_SIZE; - goto done; + if (!cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) + goto done; + } else { + bz = max(min(bz, MAX_BLOCK_SIZE), MIN_MEMORY_BLOCK_SIZE); } /* Find the largest allowed block size that aligns to memory end */ - for (bz = MAX_BLOCK_SIZE; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { + for (; bz > MIN_MEMORY_BLOCK_SIZE; bz >>= 1) { if (IS_ALIGNED(boot_mem_end, bz)) break; } -- 2.51.0 From 6e3d1b1813c77876e3b354ba94b0f9e1cc23e19f Mon Sep 17 00:00:00 2001 From: Gregory Price Date: Mon, 27 Jan 2025 10:34:05 -0500 Subject: [PATCH 09/16] acpi,srat: give memory block size advice based on CFMWS alignment Capacity is stranded when CFMWS regions are not aligned to block size. On x86, block size increases with capacity (2G blocks @ 64G capacity). Use CFMWS base/size to report memory block size alignment advice. Link: https://lkml.kernel.org/r/20250127153405.3379117-4-gourry@gourry.net Signed-off-by: Gregory Price Suggested-by: Dan Williams Acked-by: Mike Rapoport (Microsoft) Acked-by: David Hildenbrand Acked-by: Dan Williams Tested-by: Fan Ni Acked-by: Rafael J. Wysocki Reviewed-by: Ira Weiny Acked-by: Oscar Salvador Cc: Alison Schofield Cc: Andy Lutomirski Cc: Borislav Betkov Cc: Bruno Faccini Cc: Dave Hansen Cc: Dave Jiang Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Haibo Xu Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joanthan Cameron Cc: Len Brown Cc: Mike Rapoport Cc: Peter Zijlstra Cc: Robert Richter Cc: Thomas Gleinxer Signed-off-by: Andrew Morton --- drivers/acpi/numa/srat.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/numa/srat.c b/drivers/acpi/numa/srat.c index 0a725e46d017..5d0cbc5c88a0 100644 --- a/drivers/acpi/numa/srat.c +++ b/drivers/acpi/numa/srat.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -429,13 +430,23 @@ static int __init acpi_parse_cfmws(union acpi_subtable_headers *header, { struct acpi_cedt_cfmws *cfmws; int *fake_pxm = arg; - u64 start, end; + u64 start, end, align; int node; + int err; cfmws = (struct acpi_cedt_cfmws *)header; start = cfmws->base_hpa; end = cfmws->base_hpa + cfmws->window_size; + /* Align memblock size to CFMW regions if possible */ + align = 1UL << __ffs(start | end); + if (align >= SZ_256M) { + err = memory_block_advise_max_size(align); + if (err) + pr_warn("CFMWS: memblock size advise failed (%d)\n", err); + } else + pr_err("CFMWS: [BIOS BUG] base/size alignment violates spec\n"); + /* * The SRAT may have already described NUMA details for all, * or a portion of, this CFMWS HPA range. Extend the memblks -- 2.51.0 From b4c829fa4d56f3b566bbbb41c9a8ff0c83ae84c5 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 31 Mar 2025 19:10:25 -0700 Subject: [PATCH 10/16] mm/compaction: use folio in hugetlb pathway Use a folio in the hugetlb pathway during the compaction migrate-able pageblock scan. This removes a call to compound_head(). 
Link: https://lkml.kernel.org/r/20250401021025.637333-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Oscar Salvador Reviewed-by: Zi Yan Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/compaction.c | 8 ++++---- mm/hugetlb.c | 3 +-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 8f3ac832ee7f..a57bed83c657 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -695,7 +695,7 @@ struct huge_bootmem_page { bool hugetlb_bootmem_page_zones_valid(int nid, struct huge_bootmem_page *m); -int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list); +int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list); int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn); void wait_for_freed_hugetlb_folios(void); struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, @@ -1083,7 +1083,7 @@ static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, return NULL; } -static inline int isolate_or_dissolve_huge_page(struct page *page, +static inline int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { return -ENOMEM; diff --git a/mm/compaction.c b/mm/compaction.c index ca71fd3c3181..dd868c861774 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1001,10 +1001,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, locked = NULL; } - ret = isolate_or_dissolve_huge_page(page, &cc->migratepages); + folio = page_folio(page); + ret = isolate_or_dissolve_huge_folio(folio, &cc->migratepages); /* - * Fail isolation in case isolate_or_dissolve_huge_page() + * Fail isolation in case isolate_or_dissolve_huge_folio() * reports an error. In case of -ENOMEM, abort right away. */ if (ret < 0) { @@ -1016,12 +1017,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, goto isolate_fail; } - if (PageHuge(page)) { + if (folio_test_hugetlb(folio)) { /* * Hugepage was successfully isolated and placed * on the cc->migratepages list. */ - folio = page_folio(page); low_pfn += folio_nr_pages(folio) - 1; goto isolate_success_no_list; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a44d4b0d844c..a2c111447812 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2896,10 +2896,9 @@ free_new: return ret; } -int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) +int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list) { struct hstate *h; - struct folio *folio = page_folio(page); int ret = -EBUSY; /* -- 2.51.0 From 2e976567233228ff928c2405f7e03ebb7fb7aa50 Mon Sep 17 00:00:00 2001 From: Ignacio Encinas Date: Mon, 31 Mar 2025 21:57:05 +0200 Subject: [PATCH 11/16] mm: annotate data race in update_hiwater_rss mm_struct.hiwater_rss can be accessed concurrently without proper synchronization as reported by KCSAN. This data race is benign as it only affects accounting information. Annotate it with data_race() to make KCSAN happy. 
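A minimal user-space analogue of the benign race (illustrative names; compile with -pthread). A thread sanitizer flags it just as KCSAN flags the kernel version, which is exactly what the data_race() annotation acknowledges:

#include <pthread.h>
#include <stdio.h>

static unsigned long hiwater_rss;	/* high-water mark, accounting only */

static void update_hiwater(unsigned long cur)
{
	/*
	 * Unsynchronized read and write: an update may be lost, but the
	 * value remains a plausible peak -- nothing else depends on it.
	 */
	if (hiwater_rss < cur)
		hiwater_rss = cur;
}

static void *worker(void *arg)
{
	unsigned long base = (unsigned long)arg;

	for (unsigned long i = 0; i < 100000; i++)
		update_hiwater(base + i);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, worker, (void *)1000000UL);
	pthread_create(&b, NULL, worker, (void *)2000000UL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("approximate peak: %lu\n", hiwater_rss);
	return 0;
}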
Link: https://lkml.kernel.org/r/20250331-mm-maxrss-data-race-v2-1-cf958e6205bf@iencinas.com Signed-off-by: Ignacio Encinas Reported-by: syzbot+419c4b42acc36c420ad3@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67e3390c.050a0220.1ec46.0001.GAE@google.com/ Suggested-by: Lorenzo Stoakes Acked-by: Pedro Falcato Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index dcdb798184ef..1690f21e7808 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -2796,7 +2797,7 @@ static inline void update_hiwater_rss(struct mm_struct *mm) { unsigned long _rss = get_mm_rss(mm); - if ((mm)->hiwater_rss < _rss) + if (data_race(mm->hiwater_rss) < _rss) (mm)->hiwater_rss = _rss; } -- 2.51.0 From 26d4d18b79659054e451be9487937e31e63d0853 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Tue, 25 Mar 2025 15:38:03 +0800 Subject: [PATCH 12/16] mm/show_mem: optimize si_meminfo_node by reducing redundant code Refactors the si_meminfo_node() function by reducing redundant code and improving readability. Moved the calculation of managed_pages inside the existing loop that processes pgdat->node_zones, eliminating the need for a separate loop. Simplified the logic by removing unnecessary preprocessor conditionals. Ensured that both totalram, totalhigh, and other memory statistics are consistently set without duplication. This change results in cleaner and more efficient code without altering functionality. Link: https://lkml.kernel.org/r/20250325073803.852594-1-ye.liu@linux.dev Signed-off-by: Ye Liu Acked-by: Mike Rapoport (Microsoft) Reviewed-by: Harry Yoo Signed-off-by: Andrew Morton --- mm/show_mem.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/mm/show_mem.c b/mm/show_mem.c index 6af13bcd2ab3..ad373b4b6e39 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -94,26 +94,20 @@ void si_meminfo_node(struct sysinfo *val, int nid) unsigned long free_highpages = 0; pg_data_t *pgdat = NODE_DATA(nid); - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); - val->totalram = managed_pages; - val->sharedram = node_page_state(pgdat, NR_SHMEM); - val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); -#ifdef CONFIG_HIGHMEM for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - + managed_pages += zone_managed_pages(zone); if (is_highmem(zone)) { managed_highpages += zone_managed_pages(zone); free_highpages += zone_page_state(zone, NR_FREE_PAGES); } } + + val->totalram = managed_pages; + val->sharedram = node_page_state(pgdat, NR_SHMEM); + val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES); val->totalhigh = managed_highpages; val->freehigh = free_highpages; -#else - val->totalhigh = managed_highpages; - val->freehigh = free_highpages; -#endif val->mem_unit = PAGE_SIZE; } #endif -- 2.51.0 From 0bf19a357e0eaf03e757ac9482c45a797e40157a Mon Sep 17 00:00:00 2001 From: Siddarth G Date: Thu, 3 Apr 2025 15:43:45 +0530 Subject: [PATCH 13/16] selftests/mm: convert page_size to unsigned long Cppcheck warning: int result is assigned to long long variable. If the variable is long long to avoid loss of information, then you have loss of information. 
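A stand-alone illustration of that warning (hypothetical numbers, assuming a 64-bit long): the multiplication is performed in the type of its operands, so it is the type of page_size itself that has to be widened:

#include <stdio.h>

int main(void)
{
	unsigned int  page_size_int  = 4096;
	unsigned long page_size_long = 4096;
	int num_pages = 1200000;		/* ~4.9 GB worth of pages */

	/* 32-bit multiply wraps before the widening assignment happens. */
	long long bad  = num_pages * page_size_int;
	/* Operand promoted to 64 bits, so the product is computed correctly. */
	long long good = num_pages * page_size_long;

	printf("truncated: %lld\n", bad);	/* 620232704 */
	printf("correct:   %lld\n", good);	/* 4915200000 */
	return 0;
}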
This patch changes the type of page_size from 'unsigned int' to 'unsigned long' instead of using ULL suffixes. Changing hpage_size to 'unsigned long' was considered, but since gethugepage() expects an int, this change was avoided. Link: https://lkml.kernel.org/r/20250403101345.29226-1-siddarthsgml@gmail.com Signed-off-by: Siddarth G Reported-by: David Binderman Closes: https://lore.kernel.org/all/AS8PR02MB10217315060BBFDB21F19643E9CA62@AS8PR02MB10217.eurprd02.prod.outlook.com/ Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/pagemap_ioctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/mm/pagemap_ioctl.c b/tools/testing/selftests/mm/pagemap_ioctl.c index 57b4bba2b45f..fe5ae8b25ff6 100644 --- a/tools/testing/selftests/mm/pagemap_ioctl.c +++ b/tools/testing/selftests/mm/pagemap_ioctl.c @@ -34,7 +34,7 @@ #define PAGEMAP "/proc/self/pagemap" int pagemap_fd; int uffd; -unsigned int page_size; +unsigned long page_size; unsigned int hpage_size; const char *progname; @@ -184,7 +184,7 @@ void *gethugetlb_mem(int size, int *shmid) int userfaultfd_tests(void) { - int mem_size, vec_size, written, num_pages = 16; + long mem_size, vec_size, written, num_pages = 16; char *mem, *vec; mem_size = num_pages * page_size; @@ -213,7 +213,7 @@ int userfaultfd_tests(void) written = pagemap_ioctl(mem, mem_size, vec, 1, PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC, vec_size - 2, PAGE_IS_WRITTEN, 0, 0, PAGE_IS_WRITTEN); if (written < 0) - ksft_exit_fail_msg("error %d %d %s\n", written, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", written, errno, strerror(errno)); ksft_test_result(written == 0, "%s all new pages must not be written (dirty)\n", __func__); @@ -995,7 +995,7 @@ int unmapped_region_tests(void) { void *start = (void *)0x10000000; int written, len = 0x00040000; - int vec_size = len / page_size; + long vec_size = len / page_size; struct page_region *vec = malloc(sizeof(struct page_region) * vec_size); /* 1. Get written pages */ @@ -1051,7 +1051,7 @@ static void test_simple(void) int sanity_tests(void) { unsigned long long mem_size, vec_size; - int ret, fd, i, buf_size; + long ret, fd, i, buf_size; struct page_region *vec; char *mem, *fmem; struct stat sbuf; @@ -1160,7 +1160,7 @@ int sanity_tests(void) ret = stat(progname, &sbuf); if (ret < 0) - ksft_exit_fail_msg("error %d %d %s\n", ret, errno, strerror(errno)); + ksft_exit_fail_msg("error %ld %d %s\n", ret, errno, strerror(errno)); fmem = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); if (fmem == MAP_FAILED) -- 2.51.0 From cf42d4cccf0d01e375175393776a16dc47b9996f Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 27 Mar 2025 10:58:09 +0900 Subject: [PATCH 14/16] zram: modernize writeback interface The writeback interface supports a page_index=N parameter which performs writeback of the given page. Since we rarely need to writeback just one single page, the typical use case involves a number of writeback calls, each performing writeback of one page: echo page_index=100 > zram0/writeback ... echo page_index=200 > zram0/writeback echo page_index=500 > zram0/writeback ... echo page_index=700 > zram0/writeback One obvious downside of this is that it increases the number of syscalls. Less obvious, but a significantly more important downside, is that when given only one page to post-process zram cannot perform an optimal target selection. 
This becomes a critical limitation when writeback_limit is enabled, because under writeback_limit we want to guarantee the highest memory savings, hence we first need to writeback pages that release the highest amount of zsmalloc pool memory. This patch adds a page_indexes=LOW-HIGH parameter to the writeback interface: echo page_indexes=100-200 page_indexes=500-700 > zram0/writeback This gives zram a chance to apply an optimal target selection strategy on each iteration of the writeback loop. We also now permit multiple page_index parameters per call (previously zram would recognize only one page_index) and a mix of single pages and page ranges: echo page_index=42 page_index=99 page_indexes=100-200 \ page_indexes=500-700 > zram0/writeback Apart from that, the patch also unifies parameter passing and resembles other "modern" zram device attributes (e.g. recompression), while the old interface used a mixed scheme: value-less parameters for mode and a key=value format for page_index. We still support the "old" value-less format for compatibility reasons. [senozhatsky@chromium.org: simplify parse_page_index() range checks, per Brian] Link: https://lkml.kernel.org/r/20250404015327.2427684-1-senozhatsky@chromium.org [senozhatsky@chromium.org: fix uninitialized variable in zram_writeback_slots(), per Dan] Link: https://lkml.kernel.org/r/20250409112611.1154282-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20250327015818.4148660-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Brian Geffon Cc: Minchan Kim Cc: Richard Chang Cc: Sergey Senozhatsky Cc: Dan Carpenter Signed-off-by: Andrew Morton --- Documentation/admin-guide/blockdev/zram.rst | 17 ++ drivers/block/zram/zram_drv.c | 320 +++++++++++++------- 2 files changed, 232 insertions(+), 105 deletions(-) diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 9bdb30901a93..b8d36134a151 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -369,6 +369,23 @@ they could write a page index into the interface:: echo "page_index=1251" > /sys/block/zramX/writeback +In Linux 6.16 this interface underwent some rework. First, the interface +now supports `key=value` format for all of its parameters (`type=huge_idle`, +etc.) Second, the support for `page_indexes` was introduced, which specify +`LOW-HIGH` range (or ranges) of pages to be written-back. This reduces the +number of syscalls, but more importantly this enables optimal post-processing +target selection strategy. Usage example:: + + echo "type=idle" > /sys/block/zramX/writeback + echo "page_indexes=1-100 page_indexes=200-300" > \ + /sys/block/zramX/writeback + +We also now permit multiple page_index params per call and a mix of +single pages and page ranges:: + + echo page_index=42 page_index=99 page_indexes=100-200 \ + page_indexes=500-700 > /sys/block/zramX/writeback + If there are lots of write IO with flash device, potentially, it has flash wearout problem so that admin needs to design write limitation to guarantee storage health for entire product life.
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 0ba18277ed7b..94e6e9b80bf0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -734,114 +734,19 @@ static void read_from_bdev_async(struct zram *zram, struct page *page, submit_bio(bio); } -#define PAGE_WB_SIG "page_index=" - -#define PAGE_WRITEBACK 0 -#define HUGE_WRITEBACK (1<<0) -#define IDLE_WRITEBACK (1<<1) -#define INCOMPRESSIBLE_WRITEBACK (1<<2) - -static int scan_slots_for_writeback(struct zram *zram, u32 mode, - unsigned long nr_pages, - unsigned long index, - struct zram_pp_ctl *ctl) +static int zram_writeback_slots(struct zram *zram, struct zram_pp_ctl *ctl) { - for (; nr_pages != 0; index++, nr_pages--) { - bool ok = true; - - zram_slot_lock(zram, index); - if (!zram_allocated(zram, index)) - goto next; - - if (zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_SAME)) - goto next; - - if (mode & IDLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_IDLE)) - goto next; - if (mode & HUGE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_HUGE)) - goto next; - if (mode & INCOMPRESSIBLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) - goto next; - - ok = place_pp_slot(zram, ctl, index); -next: - zram_slot_unlock(zram, index); - if (!ok) - break; - } - - return 0; -} - -static ssize_t writeback_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t len) -{ - struct zram *zram = dev_to_zram(dev); - unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; - struct zram_pp_ctl *ctl = NULL; + unsigned long blk_idx = 0; + struct page *page = NULL; struct zram_pp_slot *pps; - unsigned long index = 0; - struct bio bio; struct bio_vec bio_vec; - struct page *page = NULL; - ssize_t ret = len; - int mode, err; - unsigned long blk_idx = 0; - - if (sysfs_streq(buf, "idle")) - mode = IDLE_WRITEBACK; - else if (sysfs_streq(buf, "huge")) - mode = HUGE_WRITEBACK; - else if (sysfs_streq(buf, "huge_idle")) - mode = IDLE_WRITEBACK | HUGE_WRITEBACK; - else if (sysfs_streq(buf, "incompressible")) - mode = INCOMPRESSIBLE_WRITEBACK; - else { - if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1)) - return -EINVAL; - - if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) || - index >= nr_pages) - return -EINVAL; - - nr_pages = 1; - mode = PAGE_WRITEBACK; - } - - down_read(&zram->init_lock); - if (!init_done(zram)) { - ret = -EINVAL; - goto release_init_lock; - } - - /* Do not permit concurrent post-processing actions. 
*/ - if (atomic_xchg(&zram->pp_in_progress, 1)) { - up_read(&zram->init_lock); - return -EAGAIN; - } - - if (!zram->backing_dev) { - ret = -ENODEV; - goto release_init_lock; - } + struct bio bio; + int ret = 0, err; + u32 index; page = alloc_page(GFP_KERNEL); - if (!page) { - ret = -ENOMEM; - goto release_init_lock; - } - - ctl = init_pp_ctl(); - if (!ctl) { - ret = -ENOMEM; - goto release_init_lock; - } - - scan_slots_for_writeback(zram, mode, nr_pages, index, ctl); + if (!page) + return -ENOMEM; while ((pps = select_pp_slot(ctl))) { spin_lock(&zram->wb_limit_lock); @@ -929,10 +834,215 @@ next: if (blk_idx) free_block_bdev(zram, blk_idx); - -release_init_lock: if (page) __free_page(page); + + return ret; +} + +#define PAGE_WRITEBACK 0 +#define HUGE_WRITEBACK (1 << 0) +#define IDLE_WRITEBACK (1 << 1) +#define INCOMPRESSIBLE_WRITEBACK (1 << 2) + +static int parse_page_index(char *val, unsigned long nr_pages, + unsigned long *lo, unsigned long *hi) +{ + int ret; + + ret = kstrtoul(val, 10, lo); + if (ret) + return ret; + if (*lo >= nr_pages) + return -ERANGE; + *hi = *lo + 1; + return 0; +} + +static int parse_page_indexes(char *val, unsigned long nr_pages, + unsigned long *lo, unsigned long *hi) +{ + char *delim; + int ret; + + delim = strchr(val, '-'); + if (!delim) + return -EINVAL; + + *delim = 0x00; + ret = kstrtoul(val, 10, lo); + if (ret) + return ret; + if (*lo >= nr_pages) + return -ERANGE; + + ret = kstrtoul(delim + 1, 10, hi); + if (ret) + return ret; + if (*hi >= nr_pages || *lo > *hi) + return -ERANGE; + *hi += 1; + return 0; +} + +static int parse_mode(char *val, u32 *mode) +{ + *mode = 0; + + if (!strcmp(val, "idle")) + *mode = IDLE_WRITEBACK; + if (!strcmp(val, "huge")) + *mode = HUGE_WRITEBACK; + if (!strcmp(val, "huge_idle")) + *mode = IDLE_WRITEBACK | HUGE_WRITEBACK; + if (!strcmp(val, "incompressible")) + *mode = INCOMPRESSIBLE_WRITEBACK; + + if (*mode == 0) + return -EINVAL; + return 0; +} + +static int scan_slots_for_writeback(struct zram *zram, u32 mode, + unsigned long lo, unsigned long hi, + struct zram_pp_ctl *ctl) +{ + u32 index = lo; + + while (index < hi) { + bool ok = true; + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + if (zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_SAME)) + goto next; + + if (mode & IDLE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_IDLE)) + goto next; + if (mode & HUGE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_HUGE)) + goto next; + if (mode & INCOMPRESSIBLE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + goto next; + + ok = place_pp_slot(zram, ctl, index); +next: + zram_slot_unlock(zram, index); + if (!ok) + break; + index++; + } + + return 0; +} + +static ssize_t writeback_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + u64 nr_pages = zram->disksize >> PAGE_SHIFT; + unsigned long lo = 0, hi = nr_pages; + struct zram_pp_ctl *ctl = NULL; + char *args, *param, *val; + ssize_t ret = len; + int err, mode = 0; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + return -EINVAL; + } + + /* Do not permit concurrent post-processing actions. 
*/ + if (atomic_xchg(&zram->pp_in_progress, 1)) { + up_read(&zram->init_lock); + return -EAGAIN; + } + + if (!zram->backing_dev) { + ret = -ENODEV; + goto release_init_lock; + } + + ctl = init_pp_ctl(); + if (!ctl) { + ret = -ENOMEM; + goto release_init_lock; + } + + args = skip_spaces(buf); + while (*args) { + args = next_arg(args, ¶m, &val); + + /* + * Workaround to support the old writeback interface. + * + * The old writeback interface has a minor inconsistency and + * requires key=value only for page_index parameter, while the + * writeback mode is a valueless parameter. + * + * This is not the case anymore and now all parameters are + * required to have values, however, we need to support the + * legacy writeback interface format so we check if we can + * recognize a valueless parameter as the (legacy) writeback + * mode. + */ + if (!val || !*val) { + err = parse_mode(param, &mode); + if (err) { + ret = err; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, lo, hi, ctl); + break; + } + + if (!strcmp(param, "type")) { + err = parse_mode(val, &mode); + if (err) { + ret = err; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, lo, hi, ctl); + break; + } + + if (!strcmp(param, "page_index")) { + err = parse_page_index(val, nr_pages, &lo, &hi); + if (err) { + ret = err; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, lo, hi, ctl); + continue; + } + + if (!strcmp(param, "page_indexes")) { + err = parse_page_indexes(val, nr_pages, &lo, &hi); + if (err) { + ret = err; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, lo, hi, ctl); + continue; + } + } + + err = zram_writeback_slots(zram, ctl); + if (err) + ret = err; + +release_init_lock: release_pp_ctl(zram, ctl); atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); -- 2.51.0 From 3a531a9939623e13cf798e9ef5e7edd3e74fe733 Mon Sep 17 00:00:00 2001 From: Ye Liu Date: Fri, 28 Mar 2025 09:20:31 +0800 Subject: [PATCH 15/16] mm/page_alloc: simplify free_page_is_bad by removing free_page_is_bad_report Refactor free_page_is_bad() to call bad_page() directly, removing the intermediate free_page_is_bad_report(). This reduces unnecessary indirection, improving code clarity and maintainability without changing functionality. Link: https://lkml.kernel.org/r/20250328012031.1204993-1-ye.liu@linux.dev Signed-off-by: Ye Liu Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1ee0e8265888..b8c15d572f2d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -934,19 +934,13 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) return bad_reason; } -static void free_page_is_bad_report(struct page *page) -{ - bad_page(page, - page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); -} - static inline bool free_page_is_bad(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return false; /* Something has gone sideways, find it */ - free_page_is_bad_report(page); + bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); return true; } -- 2.51.0 From bb317f00b9b7f59de5a2ef512a76a1ae285bd23f Mon Sep 17 00:00:00 2001 From: Michal Clapinski Date: Fri, 4 Apr 2025 13:11:02 +0200 Subject: [PATCH 16/16] mm/compaction: remove low watermark cap for proactive compaction Patch series "mm/compaction: allow more aggressive proactive compaction", v4. 
Our goal is to keep memory usage of a VM low on the host. For that reason, we use free page reporting which by default reports free pages of order 9 and larger to the host to be freed. The feature works well only if the memory in the guest is not fragmented below pages of order 9. Proactive compaction can be reused to achieve defragmentation after some parameter tweaking. When the fragmentation score (lower is better) gets larger than the high watermark, proactive compaction kicks in. Compaction stops when the score goes below the low watermark (or no progress is made and backoff kicks in). Let's define the difference between high and low watermarks as leeway. Before these changes, the minimum possible value for low watermark was 5 and the leeway was hardcoded to 10 (so minimum possible value for high watermark was 15). To test this, I created a VM with 19GB of memory and free page reporting enabled. The VM was ~idle. I measured the memory usage from inside the guest (/proc/meminfo) and from the host (provided by the hypervisor). Before: https://drive.google.com/file/d/1Xw23lRry_PgEH3f6QRnSGvoHh2u9UHyI/view?usp=sharing After: https://drive.google.com/file/d/1wMhpIzepx6t44F70yCPA50n1S5V2AT-a/view?usp=sharing This patch (of 2): Previously a min cap of 5 was set in the commit introducing proactive compaction. This was to make sure users don't hurt themselves by setting the proactiveness to 100 and making their system unresponsive. But the compaction mechanism has a backoff mechanism that will sleep for 30s if no progress is made, so I don't see a significant risk here. My system (19GB of memory) has been perfectly fine with both watermarks hardcoded to 0. Link: https://lkml.kernel.org/r/20250404111103.1994507-1-mclapinski@google.com Link: https://lkml.kernel.org/r/20250404111103.1994507-2-mclapinski@google.com Signed-off-by: Michal Clapinski Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index dd868c861774..f9ee06d55726 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2251,12 +2251,7 @@ static unsigned int fragmentation_score_wmark(bool low) { unsigned int wmark_low; - /* - * Cap the low watermark to avoid excessive compaction - * activity in case a user sets the proactiveness tunable - * close to 100 (maximum). - */ - wmark_low = max(100U - sysctl_compaction_proactiveness, 5U); + wmark_low = 100U - sysctl_compaction_proactiveness; return low ? wmark_low : min(wmark_low + 10, 100U); } -- 2.51.0
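For reference, a small user-space sketch of the watermark arithmetic changed above (the 5U cap and the +10 leeway come straight from fragmentation_score_wmark(); everything else is illustrative):

#include <stdio.h>

/* capped = 1 models the old behaviour, capped = 0 the behaviour after this patch. */
static unsigned int wmark(unsigned int proactiveness, int low, int capped)
{
	unsigned int wmark_low = 100U - proactiveness;

	if (capped && wmark_low < 5U)
		wmark_low = 5U;
	if (low)
		return wmark_low;
	return wmark_low + 10 > 100U ? 100U : wmark_low + 10;
}

int main(void)
{
	unsigned int p = 100;	/* vm.compaction_proactiveness set to maximum */

	printf("old: low=%u high=%u\n", wmark(p, 1, 1), wmark(p, 0, 1));	/* 5 / 15 */
	printf("new: low=%u high=%u\n", wmark(p, 1, 0), wmark(p, 0, 0));	/* 0 / 10 */
	return 0;
}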