From ac55b38fe2f9b486031439c5c4ed7fce07d0d838 Mon Sep 17 00:00:00 2001
From: Liu Ye
Date: Wed, 5 Mar 2025 15:17:59 +0800
Subject: [PATCH 01/16] mm/shrinker: fix name consistency issue in
 shrinker_debugfs_rename()
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The return value of debugfs_change_name() should be checked. If the call
fails, the old name must be restored and the newly allocated name freed;
otherwise shrinker->name ends up inconsistent with the name displayed in
debugfs.

Link: https://lkml.kernel.org/r/20250305071759.661055-1-liuye@kylinos.cn
Signed-off-by: Liu Ye
Reviewed-by: Muchun Song
Reviewed-by: Qi Zheng
Cc: Dave Chinner
Cc: Muchun Song
Cc: Qi Zheng
Signed-off-by: Andrew Morton
---
 mm/shrinker_debug.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 794bd433cce0..20eaee3e97f7 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -214,10 +214,14 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
 	ret = debugfs_change_name(shrinker->debugfs_entry, "%s-%d",
 				  shrinker->name, shrinker->debugfs_id);
 
+	if (ret) {
+		shrinker->name = old;
+		kfree_const(new);
+	} else {
+		kfree_const(old);
+	}
 	mutex_unlock(&shrinker_mutex);
 
-	kfree_const(old);
-
 	return ret;
 }
 EXPORT_SYMBOL(shrinker_debugfs_rename);
-- 
2.51.0

From 9bbe033c75a56d72fc35e7c8ca6f3258d9782fa5 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed
Date: Wed, 5 Mar 2025 06:11:29 +0000
Subject: [PATCH 02/16] mm: zpool: add interfaces for object read/write APIs

Patch series "Switch zswap to object read/write APIs".

This patch series updates zswap to use the new object read/write APIs
defined by zsmalloc in [1], and removes the old object mapping APIs and
the related code from zpool and zsmalloc.

This patch (of 5):

Zsmalloc introduced new APIs to read/write objects besides mapping them.
Add the necessary zpool interfaces.
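For illustration, a minimal sketch of how a caller is expected to pair
the new interfaces. example_obj_io() and bounce_buf are hypothetical,
not part of this patch; bounce_buf stands in for a preallocated buffer
of at least the object's size (zswap uses a per-CPU buffer for this):

	/* Hypothetical caller of the new zpool object I/O API. */
	static void example_obj_io(struct zpool *zpool, unsigned long handle,
				   void *src, size_t len, void *bounce_buf)
	{
		void *mem;

		/* Write: one call replaces the old map -> memcpy -> unmap. */
		zpool_obj_write(zpool, handle, src, len);

		/*
		 * Read: the object is returned in place, or copied into
		 * bounce_buf if the driver cannot expose it directly.
		 */
		mem = zpool_obj_read_begin(zpool, handle, bounce_buf);
		/* ... consume 'mem' here ... */
		zpool_obj_read_end(zpool, handle, mem);
	}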
Link: https://lkml.kernel.org/r/20250305061134.4105762-1-yosry.ahmed@linux.dev
Link: https://lkml.kernel.org/r/20250305061134.4105762-2-yosry.ahmed@linux.dev
Signed-off-by: Yosry Ahmed
Reviewed-by: Sergey Senozhatsky
Acked-by: Johannes Weiner
Acked-by: Nhat Pham
Cc: Chengming Zhou
Cc: Herbert Xu
Cc: Minchan Kim
Cc: Peter Zijlstra
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
---
 include/linux/zpool.h | 17 +++++++++++++++
 mm/zpool.c            | 48 +++++++++++++++++++++++++++++++++++++++++++
 mm/zsmalloc.c         | 21 +++++++++++++++++++
 3 files changed, 86 insertions(+)

diff --git a/include/linux/zpool.h b/include/linux/zpool.h
index 5e6dc46b8cc4..1784e735ee04 100644
--- a/include/linux/zpool.h
+++ b/include/linux/zpool.h
@@ -52,6 +52,16 @@ void *zpool_map_handle(struct zpool *pool, unsigned long handle,
 
 void zpool_unmap_handle(struct zpool *pool, unsigned long handle);
 
+
+void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle,
+			   void *local_copy);
+
+void zpool_obj_read_end(struct zpool *zpool, unsigned long handle,
+			void *handle_mem);
+
+void zpool_obj_write(struct zpool *zpool, unsigned long handle,
+		     void *handle_mem, size_t mem_len);
+
 u64 zpool_get_total_pages(struct zpool *pool);
 
@@ -90,6 +100,13 @@ struct zpool_driver {
 			enum zpool_mapmode mm);
 	void (*unmap)(void *pool, unsigned long handle);
 
+	void *(*obj_read_begin)(void *pool, unsigned long handle,
+				void *local_copy);
+	void (*obj_read_end)(void *pool, unsigned long handle,
+			     void *handle_mem);
+	void (*obj_write)(void *pool, unsigned long handle,
+			  void *handle_mem, size_t mem_len);
+
 	u64 (*total_pages)(void *pool);
 };
 
diff --git a/mm/zpool.c b/mm/zpool.c
index 4bbd12d4b659..378c2d1e5638 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -320,6 +320,54 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned long handle)
 	zpool->driver->unmap(zpool->pool, handle);
 }
 
+/**
+ * zpool_obj_read_begin() - Start reading from a previously allocated handle.
+ * @zpool:	The zpool that the handle was allocated from
+ * @handle:	The handle to read from
+ * @local_copy:	A local buffer to use if needed.
+ *
+ * This starts a read operation of a previously allocated handle. The passed
+ * @local_copy buffer may be used if needed by copying the memory into it.
+ * zpool_obj_read_end() MUST be called after the read is completed to undo any
+ * actions taken (e.g. release locks).
+ *
+ * Returns: A pointer to the handle memory to be read. If @local_copy is used,
+ * the returned pointer is @local_copy.
+ */
+void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle,
+			   void *local_copy)
+{
+	return zpool->driver->obj_read_begin(zpool->pool, handle, local_copy);
+}
+
+/**
+ * zpool_obj_read_end() - Finish reading from a previously allocated handle.
+ * @zpool:	The zpool that the handle was allocated from
+ * @handle:	The handle to read from
+ * @handle_mem: The pointer returned by zpool_obj_read_begin()
+ *
+ * Finishes a read operation previously started by zpool_obj_read_begin().
+ */
+void zpool_obj_read_end(struct zpool *zpool, unsigned long handle,
+			void *handle_mem)
+{
+	zpool->driver->obj_read_end(zpool->pool, handle, handle_mem);
+}
+
+/**
+ * zpool_obj_write() - Write to a previously allocated handle.
+ * @zpool:	The zpool that the handle was allocated from
+ * @handle:	The handle to write to
+ * @handle_mem: The memory to copy into the handle.
+ * @mem_len:	The length of memory to be written.
+ *
+ */
+void zpool_obj_write(struct zpool *zpool, unsigned long handle,
+		     void *handle_mem, size_t mem_len)
+{
+	zpool->driver->obj_write(zpool->pool, handle, handle_mem, mem_len);
+}
+
 /**
  * zpool_get_total_pages() - The total size of the pool
  * @zpool:	The zpool to check
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 63c99db71dc1..d84b300db64e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -507,6 +507,24 @@ static void zs_zpool_unmap(void *pool, unsigned long handle)
 	zs_unmap_object(pool, handle);
 }
 
+static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle,
+				     void *local_copy)
+{
+	return zs_obj_read_begin(pool, handle, local_copy);
+}
+
+static void zs_zpool_obj_read_end(void *pool, unsigned long handle,
+				  void *handle_mem)
+{
+	zs_obj_read_end(pool, handle, handle_mem);
+}
+
+static void zs_zpool_obj_write(void *pool, unsigned long handle,
+			       void *handle_mem, size_t mem_len)
+{
+	zs_obj_write(pool, handle, handle_mem, mem_len);
+}
+
 static u64 zs_zpool_total_pages(void *pool)
 {
 	return zs_get_total_pages(pool);
@@ -522,6 +540,9 @@ static struct zpool_driver zs_zpool_driver = {
 	.free =			zs_zpool_free,
 	.map =			zs_zpool_map,
 	.unmap =		zs_zpool_unmap,
+	.obj_read_begin =	zs_zpool_obj_read_begin,
+	.obj_read_end =		zs_zpool_obj_read_end,
+	.obj_write =		zs_zpool_obj_write,
 	.total_pages =		zs_zpool_total_pages,
 };
 
-- 
2.51.0

From 7d4c9629b74ff7ad3b58e57324e235d710e55c21 Mon Sep 17 00:00:00 2001
From: Yosry Ahmed
Date: Wed, 5 Mar 2025 06:11:30 +0000
Subject: [PATCH 03/16] mm: zswap: use object read/write APIs instead of
 object mapping APIs

Use the new object read/write APIs instead of mapping APIs.

On the compress side, zpool_obj_write() is more concise and provides
exactly what zswap needs to write the compressed object to the zpool,
instead of map->copy->unmap.

On the decompress side, zpool_obj_read_begin() is sleepable, which allows
avoiding the memcpy() for zsmalloc and slightly simplifying the code by:
- Avoiding checking if the zpool driver is sleepable, reducing special
  cases and shrinking the huge comment.
- Having a single zpool_obj_read_end() call rather than multiple
  conditional zpool_unmap_handle() calls.

The !virt_addr_valid() case can be removed in the future if the crypto
API supports kmap addresses or by using kmap_to_page(), completely
eliminating the memcpy() path in zswap_decompress(). This is a step
toward that. In that spirit, opportunistically make the comment more
specific about the kmap case instead of generic non-linear addresses.
This is the only case that needs to be handled in practice, and the
generic comment makes it seem like a bigger problem than it actually is.
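As a hedged sketch of that possible future (not part of this series): if
the kmap case were handed to the scatterlist directly, the remaining
memcpy() fallback in zswap_decompress() could go away. kmap_to_page() and
sg_set_page() are existing helpers, but their use here is illustrative
only:

	if (virt_addr_valid(obj)) {
		sg_init_one(&input, obj, entry->length);
	} else {
		/* highmem kmap address: point the sg entry at the page */
		sg_init_table(&input, 1);
		sg_set_page(&input, kmap_to_page(obj), entry->length,
			    offset_in_page(obj));
	}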
Link: https://lkml.kernel.org/r/20250305061134.4105762-3-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Cc: Sergey Senozhatsky Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- mm/zswap.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 8a1ded8fa973..7de54f105d04 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -930,7 +930,6 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, unsigned int dlen = PAGE_SIZE; unsigned long handle; struct zpool *zpool; - char *buf; gfp_t gfp; u8 *dst; @@ -972,10 +971,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry, if (alloc_ret) goto unlock; - buf = zpool_map_handle(zpool, handle, ZPOOL_MM_WO); - memcpy(buf, dst, dlen); - zpool_unmap_handle(zpool, handle); - + zpool_obj_write(zpool, handle, dst, dlen); entry->handle = handle; entry->length = dlen; @@ -996,24 +992,22 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) struct zpool *zpool = entry->pool->zpool; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - u8 *src; + u8 *src, *obj; acomp_ctx = acomp_ctx_get_cpu_lock(entry->pool); - src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); + obj = zpool_obj_read_begin(zpool, entry->handle, acomp_ctx->buffer); + /* - * If zpool_map_handle is atomic, we cannot reliably utilize its mapped buffer - * to do crypto_acomp_decompress() which might sleep. In such cases, we must - * resort to copying the buffer to a temporary one. - * Meanwhile, zpool_map_handle() might return a non-linearly mapped buffer, - * such as a kmap address of high memory or even ever a vmap address. - * However, sg_init_one is only equipped to handle linearly mapped low memory. - * In such cases, we also must copy the buffer to a temporary and lowmem one. + * zpool_obj_read_begin() might return a kmap address of highmem when + * acomp_ctx->buffer is not used. However, sg_init_one() does not + * handle highmem addresses, so copy the object to acomp_ctx->buffer. */ - if ((acomp_ctx->is_sleepable && !zpool_can_sleep_mapped(zpool)) || - !virt_addr_valid(src)) { - memcpy(acomp_ctx->buffer, src, entry->length); + if (virt_addr_valid(obj)) { + src = obj; + } else { + WARN_ON_ONCE(obj == acomp_ctx->buffer); + memcpy(acomp_ctx->buffer, obj, entry->length); src = acomp_ctx->buffer; - zpool_unmap_handle(zpool, entry->handle); } sg_init_one(&input, src, entry->length); @@ -1023,8 +1017,7 @@ static void zswap_decompress(struct zswap_entry *entry, struct folio *folio) BUG_ON(crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)); BUG_ON(acomp_ctx->req->dlen != PAGE_SIZE); - if (src != acomp_ctx->buffer) - zpool_unmap_handle(zpool, entry->handle); + zpool_obj_read_end(zpool, entry->handle, obj); acomp_ctx_put_unlock(acomp_ctx); } -- 2.51.0 From fcbea574754c63f7035d0c4ef7dfb161b60b5bde Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:31 +0000 Subject: [PATCH 04/16] mm: zpool: remove object mapping APIs zpool_map_handle(), zpool_unmap_handle(), and zpool_can_sleep_mapped() are no longer used. Remove them with the underlying driver callbacks. 
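For context, a hedged sketch of what a zpool backend's ops table looks
like once the mapping callbacks are gone (trimmed to the relevant fields;
the example_* names are hypothetical):

	static struct zpool_driver example_driver = {
		.type           = "example",
		.owner          = THIS_MODULE,
		.malloc         = example_malloc,
		.free           = example_free,
		/* map/unmap/sleep_mapped are gone; object I/O replaces them */
		.obj_read_begin = example_obj_read_begin,
		.obj_read_end   = example_obj_read_end,
		.obj_write      = example_obj_write,
		.total_pages    = example_total_pages,
	};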
Link: https://lkml.kernel.org/r/20250305061134.4105762-4-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/zpool.h | 30 --------------------- mm/zpool.c | 61 ------------------------------------------- mm/zsmalloc.c | 27 ------------------- 3 files changed, 118 deletions(-) diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 1784e735ee04..2c8a9d2654f6 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -13,25 +13,6 @@ struct zpool; -/* - * Control how a handle is mapped. It will be ignored if the - * implementation does not support it. Its use is optional. - * Note that this does not refer to memory protection, it - * refers to how the memory will be copied in/out if copying - * is necessary during mapping; read-write is the safest as - * it copies the existing memory in on map, and copies the - * changed memory back out on unmap. Write-only does not copy - * in the memory and should only be used for initialization. - * If in doubt, use ZPOOL_MM_DEFAULT which is read-write. - */ -enum zpool_mapmode { - ZPOOL_MM_RW, /* normal read-write mapping */ - ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */ - ZPOOL_MM_WO, /* write-only (no copy-in at map time) */ - - ZPOOL_MM_DEFAULT = ZPOOL_MM_RW -}; - bool zpool_has_pool(char *type); struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp); @@ -47,12 +28,6 @@ int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, void zpool_free(struct zpool *pool, unsigned long handle); -void *zpool_map_handle(struct zpool *pool, unsigned long handle, - enum zpool_mapmode mm); - -void zpool_unmap_handle(struct zpool *pool, unsigned long handle); - - void *zpool_obj_read_begin(struct zpool *zpool, unsigned long handle, void *local_copy); @@ -95,11 +70,6 @@ struct zpool_driver { unsigned long *handle); void (*free)(void *pool, unsigned long handle); - bool sleep_mapped; - void *(*map)(void *pool, unsigned long handle, - enum zpool_mapmode mm); - void (*unmap)(void *pool, unsigned long handle); - void *(*obj_read_begin)(void *pool, unsigned long handle, void *local_copy); void (*obj_read_end)(void *pool, unsigned long handle, diff --git a/mm/zpool.c b/mm/zpool.c index 378c2d1e5638..4fc665b42f5e 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -277,49 +277,6 @@ void zpool_free(struct zpool *zpool, unsigned long handle) zpool->driver->free(zpool->pool, handle); } -/** - * zpool_map_handle() - Map a previously allocated handle into memory - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to map - * @mapmode: How the memory should be mapped - * - * This maps a previously allocated handle into memory. The @mapmode - * param indicates to the implementation how the memory will be - * used, i.e. read-only, write-only, read-write. If the - * implementation does not support it, the memory will be treated - * as read-write. - * - * This may hold locks, disable interrupts, and/or preemption, - * and the zpool_unmap_handle() must be called to undo those - * actions. The code that uses the mapped handle should complete - * its operations on the mapped handle memory quickly and unmap - * as soon as possible. As the implementation may use per-cpu - * data, multiple handles should not be mapped concurrently on - * any cpu. - * - * Returns: A pointer to the handle's mapped memory area. 
- */ -void *zpool_map_handle(struct zpool *zpool, unsigned long handle, - enum zpool_mapmode mapmode) -{ - return zpool->driver->map(zpool->pool, handle, mapmode); -} - -/** - * zpool_unmap_handle() - Unmap a previously mapped handle - * @zpool: The zpool that the handle was allocated from - * @handle: The handle to unmap - * - * This unmaps a previously mapped handle. Any locks or other - * actions that the implementation took in zpool_map_handle() - * will be undone here. The memory area returned from - * zpool_map_handle() should no longer be used after this. - */ -void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) -{ - zpool->driver->unmap(zpool->pool, handle); -} - /** * zpool_obj_read_begin() - Start reading from a previously allocated handle. * @zpool: The zpool that the handle was allocated from @@ -381,23 +338,5 @@ u64 zpool_get_total_pages(struct zpool *zpool) return zpool->driver->total_pages(zpool->pool); } -/** - * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. - * @zpool: The zpool to test - * - * Some allocators enter non-preemptible context in ->map() callback (e.g. - * disable pagefaults) and exit that context in ->unmap(), which limits what - * we can do with the mapped object. For instance, we cannot wait for - * asynchronous crypto API to decompress such an object or take mutexes - * since those will call into the scheduler. This function tells us whether - * we use such an allocator. - * - * Returns: true if zpool can sleep; false otherwise. - */ -bool zpool_can_sleep_mapped(struct zpool *zpool) -{ - return zpool->driver->sleep_mapped; -} - MODULE_AUTHOR("Dan Streetman "); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d84b300db64e..56d6ed5c675b 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -482,31 +482,6 @@ static void zs_zpool_free(void *pool, unsigned long handle) zs_free(pool, handle); } -static void *zs_zpool_map(void *pool, unsigned long handle, - enum zpool_mapmode mm) -{ - enum zs_mapmode zs_mm; - - switch (mm) { - case ZPOOL_MM_RO: - zs_mm = ZS_MM_RO; - break; - case ZPOOL_MM_WO: - zs_mm = ZS_MM_WO; - break; - case ZPOOL_MM_RW: - default: - zs_mm = ZS_MM_RW; - break; - } - - return zs_map_object(pool, handle, zs_mm); -} -static void zs_zpool_unmap(void *pool, unsigned long handle) -{ - zs_unmap_object(pool, handle); -} - static void *zs_zpool_obj_read_begin(void *pool, unsigned long handle, void *local_copy) { @@ -538,8 +513,6 @@ static struct zpool_driver zs_zpool_driver = { .malloc_support_movable = true, .malloc = zs_zpool_malloc, .free = zs_zpool_free, - .map = zs_zpool_map, - .unmap = zs_zpool_unmap, .obj_read_begin = zs_zpool_obj_read_begin, .obj_read_end = zs_zpool_obj_read_end, .obj_write = zs_zpool_obj_write, -- 2.51.0 From 07864f1a57fb1f798a7d21f13e4929c9cb52daf7 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:32 +0000 Subject: [PATCH 05/16] mm: zsmalloc: remove object mapping APIs and per-CPU map areas zs_map_object() and zs_unmap_object() are no longer used, remove them. Since these are the only users of per-CPU mapping_areas, remove them and the associated CPU hotplug callbacks too. 
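As a rough before/after sketch of what this removal means for zsmalloc
users (illustrative only; local_copy is a caller-provided buffer, as
introduced earlier in the series):

	/* Before: map/unmap, no sleeping allowed in between. */
	ptr = zs_map_object(pool, handle, ZS_MM_RO);
	/* ... must finish quickly, may not sleep ... */
	zs_unmap_object(pool, handle);

	/* After: read via an optional local copy, sleeping allowed. */
	mem = zs_obj_read_begin(pool, handle, local_copy);
	/* ... may sleep, e.g. wait for async crypto ... */
	zs_obj_read_end(pool, handle, mem);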
[yosry.ahmed@linux.dev: update the docs] Link: https://lkml.kernel.org/r/Z8ier-ZZp8T6MOTH@google.com Link: https://lkml.kernel.org/r/20250305061134.4105762-5-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Acked-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- Documentation/mm/zsmalloc.rst | 5 +- include/linux/cpuhotplug.h | 1 - include/linux/zsmalloc.h | 21 ---- mm/zsmalloc.c | 226 +--------------------------------- 4 files changed, 3 insertions(+), 250 deletions(-) diff --git a/Documentation/mm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst index 76902835e68e..d2bbecd78e14 100644 --- a/Documentation/mm/zsmalloc.rst +++ b/Documentation/mm/zsmalloc.rst @@ -27,9 +27,8 @@ Instead, it returns an opaque handle (unsigned long) which encodes actual location of the allocated object. The reason for this indirection is that zsmalloc does not keep zspages permanently mapped since that would cause issues on 32-bit systems where the VA region for kernel space mappings -is very small. So, before using the allocating memory, the object has to -be mapped using zs_map_object() to get a usable pointer and subsequently -unmapped using zs_unmap_object(). +is very small. So, using the allocated memory should be done through the +proper handle-based APIs. stat ==== diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 6cc5e484547c..1987400000b4 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -116,7 +116,6 @@ enum cpuhp_state { CPUHP_NET_IUCV_PREPARE, CPUHP_ARM_BL_PREPARE, CPUHP_TRACE_RB_PREPARE, - CPUHP_MM_ZS_PREPARE, CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 7d70983cf398..c26baf9fb331 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -16,23 +16,6 @@ #include -/* - * zsmalloc mapping modes - * - * NOTE: These only make a difference when a mapped object spans pages. - */ -enum zs_mapmode { - ZS_MM_RW, /* normal read-write mapping */ - ZS_MM_RO, /* read-only (no copy-out at unmap time) */ - ZS_MM_WO /* write-only (no copy-in at map time) */ - /* - * NOTE: ZS_MM_WO should only be used for initializing new - * (uninitialized) allocations. Partial writes to already - * initialized allocations should use ZS_MM_RW to preserve the - * existing data. 
- */ -}; - struct zs_pool_stats { /* How many pages were migrated (freed) */ atomic_long_t pages_compacted; @@ -48,10 +31,6 @@ void zs_free(struct zs_pool *pool, unsigned long obj); size_t zs_huge_class_size(struct zs_pool *pool); -void *zs_map_object(struct zs_pool *pool, unsigned long handle, - enum zs_mapmode mm); -void zs_unmap_object(struct zs_pool *pool, unsigned long handle); - unsigned long zs_get_total_pages(struct zs_pool *pool); unsigned long zs_compact(struct zs_pool *pool); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 56d6ed5c675b..cd1c2a8ffef0 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -281,13 +281,6 @@ struct zspage { struct zspage_lock zsl; }; -struct mapping_area { - local_lock_t lock; - char *vm_buf; /* copy buffer for objects that span pages */ - char *vm_addr; /* address of kmap_local_page()'ed pages */ - enum zs_mapmode vm_mm; /* mapping mode */ -}; - static void zspage_lock_init(struct zspage *zspage) { static struct lock_class_key __key; @@ -522,11 +515,6 @@ static struct zpool_driver zs_zpool_driver = { MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ -/* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ -static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { - .lock = INIT_LOCAL_LOCK(lock), -}; - static inline bool __maybe_unused is_first_zpdesc(struct zpdesc *zpdesc) { return PagePrivate(zpdesc_page(zpdesc)); @@ -1111,93 +1099,6 @@ static struct zspage *find_get_zspage(struct size_class *class) return zspage; } -static inline int __zs_cpu_up(struct mapping_area *area) -{ - /* - * Make sure we don't leak memory if a cpu UP notification - * and zs_init() race and both call zs_cpu_up() on the same cpu - */ - if (area->vm_buf) - return 0; - area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); - if (!area->vm_buf) - return -ENOMEM; - return 0; -} - -static inline void __zs_cpu_down(struct mapping_area *area) -{ - kfree(area->vm_buf); - area->vm_buf = NULL; -} - -static void *__zs_map_object(struct mapping_area *area, - struct zpdesc *zpdescs[2], int off, int size) -{ - size_t sizes[2]; - char *buf = area->vm_buf; - - /* disable page faults to match kmap_local_page() return conditions */ - pagefault_disable(); - - /* no read fastpath */ - if (area->vm_mm == ZS_MM_WO) - goto out; - - sizes[0] = PAGE_SIZE - off; - sizes[1] = size - sizes[0]; - - /* copy object to per-cpu buffer */ - memcpy_from_page(buf, zpdesc_page(zpdescs[0]), off, sizes[0]); - memcpy_from_page(buf + sizes[0], zpdesc_page(zpdescs[1]), 0, sizes[1]); -out: - return area->vm_buf; -} - -static void __zs_unmap_object(struct mapping_area *area, - struct zpdesc *zpdescs[2], int off, int size) -{ - size_t sizes[2]; - char *buf; - - /* no write fastpath */ - if (area->vm_mm == ZS_MM_RO) - goto out; - - buf = area->vm_buf; - buf = buf + ZS_HANDLE_SIZE; - size -= ZS_HANDLE_SIZE; - off += ZS_HANDLE_SIZE; - - sizes[0] = PAGE_SIZE - off; - sizes[1] = size - sizes[0]; - - /* copy per-cpu buffer to object */ - memcpy_to_page(zpdesc_page(zpdescs[0]), off, buf, sizes[0]); - memcpy_to_page(zpdesc_page(zpdescs[1]), 0, buf + sizes[0], sizes[1]); - -out: - /* enable page faults to match kunmap_local() return conditions */ - pagefault_enable(); -} - -static int zs_cpu_prepare(unsigned int cpu) -{ - struct mapping_area *area; - - area = &per_cpu(zs_map_area, cpu); - return __zs_cpu_up(area); -} - -static int zs_cpu_dead(unsigned int cpu) -{ - struct mapping_area *area; - - area = &per_cpu(zs_map_area, cpu); - __zs_cpu_down(area); - return 0; -} - static bool can_merge(struct 
size_class *prev, int pages_per_zspage, int objs_per_zspage) { @@ -1245,117 +1146,6 @@ unsigned long zs_get_total_pages(struct zs_pool *pool) } EXPORT_SYMBOL_GPL(zs_get_total_pages); -/** - * zs_map_object - get address of allocated object from handle. - * @pool: pool from which the object was allocated - * @handle: handle returned from zs_malloc - * @mm: mapping mode to use - * - * Before using an object allocated from zs_malloc, it must be mapped using - * this function. When done with the object, it must be unmapped using - * zs_unmap_object. - * - * Only one object can be mapped per cpu at a time. There is no protection - * against nested mappings. - * - * This function returns with preemption and page faults disabled. - */ -void *zs_map_object(struct zs_pool *pool, unsigned long handle, - enum zs_mapmode mm) -{ - struct zspage *zspage; - struct zpdesc *zpdesc; - unsigned long obj, off; - unsigned int obj_idx; - - struct size_class *class; - struct mapping_area *area; - struct zpdesc *zpdescs[2]; - void *ret; - - /* - * Because we use per-cpu mapping areas shared among the - * pools/users, we can't allow mapping in interrupt context - * because it can corrupt another users mappings. - */ - BUG_ON(in_interrupt()); - - /* It guarantees it can get zspage from handle safely */ - read_lock(&pool->lock); - obj = handle_to_obj(handle); - obj_to_location(obj, &zpdesc, &obj_idx); - zspage = get_zspage(zpdesc); - - /* - * migration cannot move any zpages in this zspage. Here, class->lock - * is too heavy since callers would take some time until they calls - * zs_unmap_object API so delegate the locking from class to zspage - * which is smaller granularity. - */ - zspage_read_lock(zspage); - read_unlock(&pool->lock); - - class = zspage_class(pool, zspage); - off = offset_in_page(class->size * obj_idx); - - local_lock(&zs_map_area.lock); - area = this_cpu_ptr(&zs_map_area); - area->vm_mm = mm; - if (off + class->size <= PAGE_SIZE) { - /* this object is contained entirely within a page */ - area->vm_addr = kmap_local_zpdesc(zpdesc); - ret = area->vm_addr + off; - goto out; - } - - /* this object spans two pages */ - zpdescs[0] = zpdesc; - zpdescs[1] = get_next_zpdesc(zpdesc); - BUG_ON(!zpdescs[1]); - - ret = __zs_map_object(area, zpdescs, off, class->size); -out: - if (likely(!ZsHugePage(zspage))) - ret += ZS_HANDLE_SIZE; - - return ret; -} -EXPORT_SYMBOL_GPL(zs_map_object); - -void zs_unmap_object(struct zs_pool *pool, unsigned long handle) -{ - struct zspage *zspage; - struct zpdesc *zpdesc; - unsigned long obj, off; - unsigned int obj_idx; - - struct size_class *class; - struct mapping_area *area; - - obj = handle_to_obj(handle); - obj_to_location(obj, &zpdesc, &obj_idx); - zspage = get_zspage(zpdesc); - class = zspage_class(pool, zspage); - off = offset_in_page(class->size * obj_idx); - - area = this_cpu_ptr(&zs_map_area); - if (off + class->size <= PAGE_SIZE) - kunmap_local(area->vm_addr); - else { - struct zpdesc *zpdescs[2]; - - zpdescs[0] = zpdesc; - zpdescs[1] = get_next_zpdesc(zpdesc); - BUG_ON(!zpdescs[1]); - - __zs_unmap_object(area, zpdescs, off, class->size); - } - local_unlock(&zs_map_area.lock); - - zspage_read_unlock(zspage); -} -EXPORT_SYMBOL_GPL(zs_unmap_object); - void *zs_obj_read_begin(struct zs_pool *pool, unsigned long handle, void *local_copy) { @@ -1975,7 +1765,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, * the class lock protects zpage alloc/free in the zspage. 
*/ spin_lock(&class->lock); - /* the zspage write_lock protects zpage access via zs_map_object */ + /* the zspage write_lock protects zpage access via zs_obj_read/write() */ if (!zspage_write_trylock(zspage)) { spin_unlock(&class->lock); write_unlock(&pool->lock); @@ -2459,23 +2249,11 @@ EXPORT_SYMBOL_GPL(zs_destroy_pool); static int __init zs_init(void) { - int ret; - - ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare", - zs_cpu_prepare, zs_cpu_dead); - if (ret) - goto out; - #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); #endif - zs_stat_init(); - return 0; - -out: - return ret; } static void __exit zs_exit(void) @@ -2483,8 +2261,6 @@ static void __exit zs_exit(void) #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); #endif - cpuhp_remove_state(CPUHP_MM_ZS_PREPARE); - zs_stat_exit(); } -- 2.51.0 From 7b60041156079f256a7df3f7de469de7db618580 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 5 Mar 2025 06:11:33 +0000 Subject: [PATCH 06/16] mm: zpool: remove zpool_malloc_support_movable() zpool_malloc_support_movable() always returns true for zsmalloc, the only remaining zpool driver. Remove it and set the gfp flags in zswap_compress() accordingly. Opportunistically use GFP_NOWAIT instead of __GFP_NOWARN | __GFP_KSWAPD_RECLAIM for conciseness as they are equivalent. Link: https://lkml.kernel.org/r/20250305061134.4105762-6-yosry.ahmed@linux.dev Signed-off-by: Yosry Ahmed Reviewed-by: Sergey Senozhatsky Acked-by: Johannes Weiner Acked-by: Nhat Pham Cc: Thomas Gleixner Cc: Chengming Zhou Cc: Herbert Xu Cc: Minchan Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/zpool.h | 3 --- mm/zpool.c | 16 ---------------- mm/zsmalloc.c | 1 - mm/zswap.c | 4 +--- 4 files changed, 1 insertion(+), 23 deletions(-) diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 2c8a9d2654f6..52f30e526607 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -21,8 +21,6 @@ const char *zpool_get_type(struct zpool *pool); void zpool_destroy_pool(struct zpool *pool); -bool zpool_malloc_support_movable(struct zpool *pool); - int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, unsigned long *handle); @@ -65,7 +63,6 @@ struct zpool_driver { void *(*create)(const char *name, gfp_t gfp); void (*destroy)(void *pool); - bool malloc_support_movable; int (*malloc)(void *pool, size_t size, gfp_t gfp, unsigned long *handle); void (*free)(void *pool, unsigned long handle); diff --git a/mm/zpool.c b/mm/zpool.c index 4fc665b42f5e..6d6d88930932 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -220,22 +220,6 @@ const char *zpool_get_type(struct zpool *zpool) return zpool->driver->type; } -/** - * zpool_malloc_support_movable() - Check if the zpool supports - * allocating movable memory - * @zpool: The zpool to check - * - * This returns if the zpool supports allocating movable memory. - * - * Implementations must guarantee this to be thread-safe. - * - * Returns: true if the zpool supports allocating movable memory, false if not - */ -bool zpool_malloc_support_movable(struct zpool *zpool) -{ - return zpool->driver->malloc_support_movable; -} - /** * zpool_malloc() - Allocate memory * @zpool: The zpool to allocate from. 
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index cd1c2a8ffef0..961b270f023c 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -503,7 +503,6 @@ static struct zpool_driver zs_zpool_driver = {
 	.owner =		THIS_MODULE,
 	.create =		zs_zpool_create,
 	.destroy =		zs_zpool_destroy,
-	.malloc_support_movable = true,
 	.malloc =		zs_zpool_malloc,
 	.free =			zs_zpool_free,
 	.obj_read_begin =	zs_zpool_obj_read_begin,
diff --git a/mm/zswap.c b/mm/zswap.c
index 7de54f105d04..5f0e62289444 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -964,9 +964,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
 		goto unlock;
 
 	zpool = pool->zpool;
-	gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
-	if (zpool_malloc_support_movable(zpool))
-		gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+	gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
 	alloc_ret = zpool_malloc(zpool, dlen, gfp, &handle);
 	if (alloc_ret)
 		goto unlock;
-- 
2.51.0

From c0d017896b72d7dd251dabf64765196ff9a46a0f Mon Sep 17 00:00:00 2001
From: Baolin Wang
Date: Fri, 7 Feb 2025 17:44:17 +0800
Subject: [PATCH 07/16] mm: shmem: drop the unused macro

Patch series "Some trivial cleanups for shmem".

Patches 1 - 5 do some trivial cleanups and refactoring for shmem. Patch 6
adds myself as shmem reviewer.

This patch (of 6):

Drop the unused 'BLOCKS_PER_PAGE' macro.

Link: https://lkml.kernel.org/r/cover.1738918357.git.baolin.wang@linux.alibaba.com
Link: https://lkml.kernel.org/r/69264cee1d938442477e657004e4924f8a5c4dd4.1738918357.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang
Cc: Baolin Wang
Cc: David Hildenbrand
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
---
 mm/shmem.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index b276ae233dfa..6d6d5fce2120 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -86,7 +86,6 @@ static struct vfsmount *shm_mnt __ro_after_init;
 
 #include "internal.h"
 
-#define BLOCKS_PER_PAGE (PAGE_SIZE/512)
 #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT)
 
 /* Pretend that each entry is of this size in directory's i_size */
-- 
2.51.0

From 6d26a149f5483acdc8a9a7a8fcf5e737a324a2b6 Mon Sep 17 00:00:00 2001
From: Baolin Wang
Date: Fri, 7 Feb 2025 17:44:18 +0800
Subject: [PATCH 08/16] mm: shmem: remove 'fadvise()' comments

Similar to commit 255ff62d1586 ("docs: tmpfs: drop 'fadvise()' from the
documentation"), fadvise() has no HUGEPAGE advice currently. Remove the
confusing fadvise() comments.
Link: https://lkml.kernel.org/r/fae702b9775f58b55b45be5eaad22d8586d0290a.1738918357.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang
Cc: David Hildenbrand
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
---
 mm/shmem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index 6d6d5fce2120..c63fd18cea50 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -525,9 +525,9 @@ static bool shmem_confirm_swap(struct address_space *mapping,
  *	enables huge pages for the mount;
  * SHMEM_HUGE_WITHIN_SIZE:
  *	only allocate huge pages if the page will be fully within i_size,
- *	also respect fadvise()/madvise() hints;
+ *	also respect madvise() hints;
  * SHMEM_HUGE_ADVISE:
- *	only allocate huge pages if requested with fadvise()/madvise();
+ *	only allocate huge pages if requested with madvise();
  */
 
 #define SHMEM_HUGE_NEVER	0
-- 
2.51.0

From d5e4e147c0f59259cd527d58295ba1bfefbad481 Mon Sep 17 00:00:00 2001
From: Baolin Wang
Date: Fri, 7 Feb 2025 17:44:19 +0800
Subject: [PATCH 09/16] mm: shmem: remove duplicate error validation

Remove duplicate error code checks for 'start' and 'end', as
get_order_from_str() will only return -EINVAL if the cmdline string is
configured incorrectly.

Link: https://lkml.kernel.org/r/dfadaba4c8b24c5ae1467fe8b6744b654c65ec91.1738918357.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang
Cc: David Hildenbrand
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
---
 mm/shmem.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index c63fd18cea50..51bdeea828a0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -5664,19 +5664,19 @@ static int __init setup_thp_shmem(char *str)
 					  THP_ORDERS_ALL_FILE_DEFAULT);
 		}
 
-		if (start == -EINVAL) {
+		if (start < 0) {
 			pr_err("invalid size %s in thp_shmem boot parameter\n",
 			       start_size);
 			goto err;
 		}
 
-		if (end == -EINVAL) {
+		if (end < 0) {
 			pr_err("invalid size %s in thp_shmem boot parameter\n",
 			       end_size);
 			goto err;
 		}
 
-		if (start < 0 || end < 0 || start > end)
+		if (start > end)
 			goto err;
 
 		nr = end - start + 1;
-- 
2.51.0

From cd81c424b53fa174df1e18b021806bc978c8fb7f Mon Sep 17 00:00:00 2001
From: Baolin Wang
Date: Fri, 7 Feb 2025 17:44:20 +0800
Subject: [PATCH 10/16] mm: shmem: change the return value of
 shmem_find_swap_entries()

shmem_find_swap_entries() originally returned the index corresponding to
the swap entry, but no callers used this return value. Make it return
the number of entries that were found, like similar functions do, so
that callers can use the count directly.

No functional changes.
Link: https://lkml.kernel.org/r/070489b5946b8379b2a2d25f78115cef167cd145.1738918357.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: David Hildenbrand Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 51bdeea828a0..f5a081563022 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1379,9 +1379,9 @@ static void shmem_evict_inode(struct inode *inode) #endif } -static int shmem_find_swap_entries(struct address_space *mapping, - pgoff_t start, struct folio_batch *fbatch, - pgoff_t *indices, unsigned int type) +static unsigned int shmem_find_swap_entries(struct address_space *mapping, + pgoff_t start, struct folio_batch *fbatch, + pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); struct folio *folio; @@ -1414,7 +1414,7 @@ static int shmem_find_swap_entries(struct address_space *mapping, } rcu_read_unlock(); - return xas.xa_index; + return folio_batch_count(fbatch); } /* @@ -1461,8 +1461,8 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type) do { folio_batch_init(&fbatch); - shmem_find_swap_entries(mapping, start, &fbatch, indices, type); - if (folio_batch_count(&fbatch) == 0) { + if (!shmem_find_swap_entries(mapping, start, &fbatch, + indices, type)) { ret = 0; break; } -- 2.51.0 From 086e66b690ae41c1c8c0ef0366cadeb089dff990 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 7 Feb 2025 17:44:21 +0800 Subject: [PATCH 11/16] mm: shmem: factor out the within_size logic into a new helper Factor out the within_size logic into a new helper to remove duplicate code. Link: https://lkml.kernel.org/r/527dea9d7e32fe6b94c7fe00df2c126203017911.1738918357.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Suggested-by: David Hildenbrand Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/shmem.c | 53 +++++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index f5a081563022..8de9b4a07a8a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -590,6 +590,28 @@ shmem_mapping_size_orders(struct address_space *mapping, pgoff_t index, loff_t w return order > 0 ? BIT(order + 1) - 1 : 0; } +static unsigned int shmem_get_orders_within_size(struct inode *inode, + unsigned long within_size_orders, pgoff_t index, + loff_t write_end) +{ + pgoff_t aligned_index; + unsigned long order; + loff_t i_size; + + order = highest_order(within_size_orders); + while (within_size_orders) { + aligned_index = round_up(index + 1, 1 << order); + i_size = max(write_end, i_size_read(inode)); + i_size = round_up(i_size, PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= aligned_index) + return within_size_orders; + + order = next_order(&within_size_orders, order); + } + + return 0; +} + static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index, loff_t write_end, bool shmem_huge_force, struct vm_area_struct *vma, @@ -598,9 +620,6 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index unsigned int maybe_pmd_order = HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER ? 
0 : BIT(HPAGE_PMD_ORDER); unsigned long within_size_orders; - unsigned int order; - pgoff_t aligned_index; - loff_t i_size; if (!S_ISREG(inode->i_mode)) return 0; @@ -634,16 +653,11 @@ static unsigned int shmem_huge_global_enabled(struct inode *inode, pgoff_t index within_size_orders = shmem_mapping_size_orders(inode->i_mapping, index, write_end); - order = highest_order(within_size_orders); - while (within_size_orders) { - aligned_index = round_up(index + 1, 1 << order); - i_size = max(write_end, i_size_read(inode)); - i_size = round_up(i_size, PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= aligned_index) - return within_size_orders; + within_size_orders = shmem_get_orders_within_size(inode, within_size_orders, + index, write_end); + if (within_size_orders > 0) + return within_size_orders; - order = next_order(&within_size_orders, order); - } fallthrough; case SHMEM_HUGE_ADVISE: if (vm_flags & VM_HUGEPAGE) @@ -1747,10 +1761,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, unsigned long mask = READ_ONCE(huge_shmem_orders_always); unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size); unsigned long vm_flags = vma ? vma->vm_flags : 0; - pgoff_t aligned_index; unsigned int global_orders; - loff_t i_size; - int order; if (thp_disabled_by_hw() || (vma && vma_thp_disabled(vma, vm_flags))) return 0; @@ -1776,17 +1787,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, return READ_ONCE(huge_shmem_orders_inherit); /* Allow mTHP that will be fully within i_size. */ - order = highest_order(within_size_orders); - while (within_size_orders) { - aligned_index = round_up(index + 1, 1 << order); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >> PAGE_SHIFT >= aligned_index) { - mask |= within_size_orders; - break; - } - - order = next_order(&within_size_orders, order); - } + mask |= shmem_get_orders_within_size(inode, within_size_orders, index, 0); if (vm_flags & VM_HUGEPAGE) mask |= READ_ONCE(huge_shmem_orders_madvise); -- 2.51.0 From a91aaf8dd549dcee9caab227ecaa6cbc243bbc5a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 7 Feb 2025 17:44:22 +0800 Subject: [PATCH 12/16] MAINTAINERS: add Baolin as shmem reviewer In the past year, I've primarily focused on shmem and added several features to it, such as support for mTHP, large folio swap-out and swap-in support, mTHP collapse support, skipping swapcache, and tmpfs support for large folios, and so on. Meanwhile I've also been helping with testing and reviewing shmem related patches. So I am willing to continue assisting with testing and reviewing shmem related patches. Let me be Cc'd on patches related to shmem. 
Link: https://lkml.kernel.org/r/bcefbba9b2b44d4e661e6cc2c4187292a5beb467.1738918357.git.baolin.wang@linux.alibaba.com
Signed-off-by: Baolin Wang
Cc: David Hildenbrand
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index c13201979633..fb408698086e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -23952,6 +23952,7 @@ F:	drivers/hwmon/tmp513.c
 
 TMPFS (SHMEM FILESYSTEM)
 M:	Hugh Dickins
+R:	Baolin Wang
 L:	linux-mm@kvack.org
 S:	Maintained
 F:	include/linux/shmem_fs.h
-- 
2.51.0

From 995abaaadd30e2f9e49127694d41403839d3e1bd Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Mon, 16 Dec 2024 15:53:55 +0000
Subject: [PATCH 13/16] dax: remove access to page->index

This looks like a complete mess (why are we setting page->index at page
fault time?), but I no longer care about DAX, and there's no reason to
let DAX hold us back from removing page->index.

Link: https://lkml.kernel.org/r/20241216155408.8102-1-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle)
Reviewed-by: Jane Chu
Reviewed-by: Dan Williams
Cc: Alistair Popple
Cc: Dave Jiang
Cc: Vishal Verma
Signed-off-by: Andrew Morton
---
 drivers/dax/device.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 6d74e62bbee0..bc871a34b9cd 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -89,14 +89,13 @@ static void dax_set_mapping(struct vm_fault *vmf, pfn_t pfn,
 			ALIGN_DOWN(vmf->address, fault_size));
 
 	for (i = 0; i < nr_pages; i++) {
-		struct page *page = pfn_to_page(pfn_t_to_pfn(pfn) + i);
+		struct folio *folio = pfn_folio(pfn_t_to_pfn(pfn) + i);
 
-		page = compound_head(page);
-		if (page->mapping)
+		if (folio->mapping)
 			continue;
 
-		page->mapping = filp->f_mapping;
-		page->index = pgoff + i;
+		folio->mapping = filp->f_mapping;
+		folio->index = pgoff + i;
 	}
 }
 
-- 
2.51.0

From 37cd93fc61037889461d6437a1e1a5caa730ef38 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Mon, 16 Dec 2024 15:53:56 +0000
Subject: [PATCH 14/16] dax: use folios more widely within DAX

Convert from pfn to folio instead of page and use those folios throughout
to avoid accesses to page->index and page->mapping.

Link: https://lkml.kernel.org/r/20241216155408.8102-2-willy@infradead.org
Signed-off-by: Matthew Wilcox (Oracle)
Reviewed-by: Jane Chu
Cc: Dan Williams
Cc: Dave Jiang
Cc: Vishal Verma
Cc: Alistair Popple
Signed-off-by: Andrew Morton
---
 fs/dax.c | 53 +++++++++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 26 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 21b47402b3dc..972febc6fb9d 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -320,38 +320,39 @@ static unsigned long dax_end_pfn(void *entry)
 	for (pfn = dax_to_pfn(entry); \
 			pfn < dax_end_pfn(entry); pfn++)
 
-static inline bool dax_page_is_shared(struct page *page)
+static inline bool dax_folio_is_shared(struct folio *folio)
 {
-	return page->mapping == PAGE_MAPPING_DAX_SHARED;
+	return folio->mapping == PAGE_MAPPING_DAX_SHARED;
 }
 
 /*
- * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
+ * Set the folio->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
  * refcount.
  */
-static inline void dax_page_share_get(struct page *page)
+static inline void dax_folio_share_get(struct folio *folio)
 {
-	if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
+	if (folio->mapping != PAGE_MAPPING_DAX_SHARED) {
 		/*
 		 * Reset the index if the page was already mapped
 		 * regularly before.
 		 */
-		if (page->mapping)
-			page->share = 1;
-		page->mapping = PAGE_MAPPING_DAX_SHARED;
+		if (folio->mapping)
+			folio->page.share = 1;
+		folio->mapping = PAGE_MAPPING_DAX_SHARED;
 	}
-	page->share++;
+	folio->page.share++;
 }
 
-static inline unsigned long dax_page_share_put(struct page *page)
+static inline unsigned long dax_folio_share_put(struct folio *folio)
 {
-	return --page->share;
+	return --folio->page.share;
 }
 
 /*
- * When it is called in dax_insert_entry(), the shared flag will indicate that
- * whether this entry is shared by multiple files. If so, set the page->mapping
- * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
+ * When it is called in dax_insert_entry(), the shared flag will indicate
+ * whether this entry is shared by multiple files. If so, set
+ * the folio->mapping PAGE_MAPPING_DAX_SHARED, and use page->share
+ * as refcount.
  */
 static void dax_associate_entry(void *entry, struct address_space *mapping,
 		struct vm_area_struct *vma, unsigned long address, bool shared)
@@ -364,14 +365,14 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
 	index = linear_page_index(vma, address & ~(size - 1));
 
 	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
+		struct folio *folio = pfn_folio(pfn);
 
 		if (shared) {
-			dax_page_share_get(page);
+			dax_folio_share_get(folio);
 		} else {
-			WARN_ON_ONCE(page->mapping);
-			page->mapping = mapping;
-			page->index = index + i++;
+			WARN_ON_ONCE(folio->mapping);
+			folio->mapping = mapping;
+			folio->index = index + i++;
 		}
 	}
 }
@@ -385,17 +386,17 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 		return;
 
 	for_each_mapped_pfn(entry, pfn) {
-		struct page *page = pfn_to_page(pfn);
+		struct folio *folio = pfn_folio(pfn);
 
-		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
-		if (dax_page_is_shared(page)) {
+		WARN_ON_ONCE(trunc && folio_ref_count(folio) > 1);
+		if (dax_folio_is_shared(folio)) {
 			/* keep the shared flag if this page is still shared */
-			if (dax_page_share_put(page) > 0)
+			if (dax_folio_share_put(folio) > 0)
 				continue;
 		} else
-			WARN_ON_ONCE(page->mapping && page->mapping != mapping);
-		page->mapping = NULL;
-		page->index = 0;
+			WARN_ON_ONCE(folio->mapping && folio->mapping != mapping);
+		folio->mapping = NULL;
+		folio->index = 0;
 	}
 }
 
-- 
2.51.0

From 7851bf649d423edd7286b292739f2eefded3d35c Mon Sep 17 00:00:00 2001
From: Alistair Popple
Date: Fri, 28 Feb 2025 14:30:56 +1100
Subject: [PATCH 15/16] fuse: fix dax truncate/punch_hole fault path

Patch series "fs/dax: Fix ZONE_DEVICE page reference counts", v9.

Device and FS DAX pages have always maintained their own page reference
counts without following the normal rules for page reference counting.
In particular pages are considered free when the refcount hits one rather
than zero, and refcounts are not added when mapping the page. Tracking
this requires special PTE bits (PTE_DEVMAP) and a secondary mechanism for
allowing GUP to hold references on the page (see get_dev_pagemap).

However there doesn't seem to be any reason why FS DAX pages need their
own reference counting scheme. By treating the refcounts on these pages
the same way as normal pages we can remove a lot of special checks. In
particular pXd_trans_huge() becomes the same as pXd_leaf(), although I
haven't made that change here. It also frees up a valuable SW-defined
PTE bit on architectures that have devmap PTE bits defined.

It also almost certainly allows further clean-up of the devmap managed
functions, but I have left that as a future improvement.
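To make the odd convention concrete, a sketch of the sort of helper the
old scheme relied on (the example_* name is hypothetical, modelled on the
pre-series dax idle-page checks):

	/* Pre-series convention: a DAX page with refcount 1 is "free". */
	static inline bool example_dax_page_idle(struct page *page)
	{
		return page_ref_count(page) == 1; /* 1, not 0 as for normal pages */
	}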
It also enables support for compound ZONE_DEVICE pages, which is one of
my primary motivators for doing this work.

This patch (of 20):

FS DAX requires file systems to call into the DAX layout prior to
unlinking inodes to ensure there is no ongoing DMA or other remote access
to the direct mapped page. The fuse file system implements
fuse_dax_break_layouts() to do this, which includes a comment indicating
that passing dmap_end == 0 leads to unmapping of the whole file. However
this is not true - passing dmap_end == 0 will not unmap anything before
dmap_start, and furthermore dax_layout_busy_page_range() will not scan
any of the range to see if there may be ongoing DMA access to the range.
Fix this by passing -1 for dmap_end to fuse_dax_break_layouts() which
will invalidate the entire file range to dax_layout_busy_page_range().

Link: https://lkml.kernel.org/r/cover.8068ad144a7eea4a813670301f4d2a86a8e68ec4.1740713401.git-series.apopple@nvidia.com
Link: https://lkml.kernel.org/r/f09a34b6c40032022e4ddee6fadb7cc676f08867.1740713401.git-series.apopple@nvidia.com
Fixes: 6ae330cad6ef ("virtiofs: serialize truncate/punch_hole and dax fault path")
Signed-off-by: Alistair Popple
Co-developed-by: Dan Williams
Signed-off-by: Dan Williams
Reviewed-by: Balbir Singh
Tested-by: Alison Schofield
Cc: Vivek Goyal
Cc: Alexander Gordeev
Cc: Asahi Lina
Cc: Bjorn Helgaas
Cc: Catalin Marinas
Cc: Christian Borntraeger
Cc: Christoph Hellwig
Cc: Chunyan Zhang
Cc: "Darrick J. Wong"
Cc: Dave Chinner
Cc: Dave Hansen
Cc: Dave Jiang
Cc: David Hildenbrand
Cc: Gerald Schaefer
Cc: Heiko Carstens
Cc: Huacai Chen
Cc: Ira Weiny
Cc: Jan Kara
Cc: Jason Gunthorpe
Cc: Jason Gunthorpe
Cc: John Hubbard
Cc: linmiaohe
Cc: Logan Gunthorpe
Cc: Matthew Wilcox (Oracle)
Cc: Michael "Camp Drill Sergeant" Ellerman
Cc: Nicholas Piggin
Cc: Peter Xu
Cc: Sven Schnelle
Cc: Ted Ts'o
Cc: Vasily Gorbik
Cc: Vishal Verma
Cc: WANG Xuerui
Cc: Will Deacon
Signed-off-by: Andrew Morton
---
 fs/fuse/dax.c  | 1 -
 fs/fuse/dir.c  | 2 +-
 fs/fuse/file.c | 4 ++--
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c
index 0b6ee6dd1fd6..b7f805d2a14f 100644
--- a/fs/fuse/dax.c
+++ b/fs/fuse/dax.c
@@ -682,7 +682,6 @@ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry,
 			0, 0, fuse_wait_dax_page(inode));
 }
 
-/* dmap_end == 0 leads to unmapping of whole file */
 int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start,
 				  u64 dmap_end)
 {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 3805f9b06c9d..3b031d24d369 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1940,7 +1940,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 	if (FUSE_IS_DAX(inode) && is_truncate) {
 		filemap_invalidate_lock(mapping);
 		fault_blocked = true;
-		err = fuse_dax_break_layouts(inode, 0, 0);
+		err = fuse_dax_break_layouts(inode, 0, -1);
 		if (err) {
 			filemap_invalidate_unlock(mapping);
 			return err;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d63e56fd3dd2..754378dd9f71 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -253,7 +253,7 @@ static int fuse_open(struct inode *inode, struct file *file)
 
 	if (dax_truncate) {
 		filemap_invalidate_lock(inode->i_mapping);
-		err = fuse_dax_break_layouts(inode, 0, 0);
+		err = fuse_dax_break_layouts(inode, 0, -1);
 		if (err)
 			goto out_inode_unlock;
 	}
@@ -3205,7 +3205,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	inode_lock(inode);
 	if (block_faults) {
 		filemap_invalidate_lock(inode->i_mapping);
-		err = fuse_dax_break_layouts(inode, 0, 0);
+		err = fuse_dax_break_layouts(inode, 0, -1);
 		if (err)
 			goto out;
 	}
-- 
2.51.0

From cee91fa13a8e95f49d0ee6be020e68a71a209117 Mon Sep 17 00:00:00 2001
From: Alistair Popple
Date: Fri, 28 Feb 2025 14:30:57 +1100
Subject: [PATCH 16/16] fs/dax: return unmapped busy pages from
 dax_layout_busy_page_range()

dax_layout_busy_page_range() is used by file systems to scan the DAX
page-cache to unmap mapped pages from user-space and to determine if any
pages in the given range are busy, either due to ongoing DMA or other
get_user_pages() usage.

Currently it checks to see if the file mapping is mapped into user-space
with mapping_mapped() and returns early if not, skipping the check for
DMA busy pages. This is wrong as pages may still be undergoing DMA
access even if they have subsequently been unmapped from user-space. Fix
this by dropping the check for mapping_mapped().

Link: https://lkml.kernel.org/r/d85ce6c2d1400ff111ed7302d9eef223d0243c57.1740713401.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple
Suggested-by: Dan Williams
Reviewed-by: Dan Williams
Reviewed-by: Balbir Singh
Tested-by: Alison Schofield
Cc: Alexander Gordeev
Cc: Asahi Lina
Cc: Bjorn Helgaas
Cc: Catalin Marinas
Cc: Christian Borntraeger
Cc: Christoph Hellwig
Cc: Chunyan Zhang
Cc: "Darrick J. Wong"
Cc: Dave Chinner
Cc: Dave Hansen
Cc: Dave Jiang
Cc: David Hildenbrand
Cc: Gerald Schaefer
Cc: Heiko Carstens
Cc: Huacai Chen
Cc: Ira Weiny
Cc: Jan Kara
Cc: Jason Gunthorpe
Cc: Jason Gunthorpe
Cc: John Hubbard
Cc: linmiaohe
Cc: Logan Gunthorpe
Cc: Matthew Wilcox (Oracle)
Cc: Michael "Camp Drill Sergeant" Ellerman
Cc: Nicholas Piggin
Cc: Peter Xu
Cc: Sven Schnelle
Cc: Ted Ts'o
Cc: Vasily Gorbik
Cc: Vishal Verma
Cc: Vivek Goyal
Cc: WANG Xuerui
Cc: Will Deacon
Signed-off-by: Andrew Morton
---
 fs/dax.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/dax.c b/fs/dax.c
index 972febc6fb9d..b35f538c4330 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -691,7 +691,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping,
 	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
 		return NULL;
 
-	if (!dax_mapping(mapping) || !mapping_mapped(mapping))
+	if (!dax_mapping(mapping))
 		return NULL;
 
 	/* If end == LLONG_MAX, all pages from start to till end of file */
-- 
2.51.0
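For reference, a hedged sketch of the loop file systems typically build
around dax_layout_busy_page_range(), patterned on the xfs/fuse
break-layouts code. example_break_layouts() is hypothetical, and the
bare schedule() stands in for the filesystem-specific unlock/wait/relock
callback:

	static int example_break_layouts(struct inode *inode, loff_t start,
					 loff_t end)
	{
		struct page *page;
		int error = 0;

		do {
			page = dax_layout_busy_page_range(inode->i_mapping,
							  start, end);
			if (!page)
				break;	/* no busy pages left in the range */

			/* Sleep until the last DMA/GUP reference is dropped. */
			error = ___wait_var_event(&page->_refcount,
					atomic_read(&page->_refcount) == 1,
					TASK_INTERRUPTIBLE, 0, 0, schedule());
		} while (error == 0);

		return error;
	}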