From 4e04d10c66a9c5c1d6df143401fd0452b3a93e22 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Tue, 17 Dec 2024 00:04:40 +0900 Subject: [PATCH 01/16] mm/zsmalloc: convert obj_to_page() and zs_free() to use zpdesc Rename obj_to_page() to obj_to_zpdesc() and also convert it and its user zs_free() to use zpdesc. Link: https://lkml.kernel.org/r/20241216150450.1228021-10-42.hyeyoo@gmail.com Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Alex Shi Acked-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 83d48cffe96f..112603f9449f 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -798,9 +798,9 @@ static void obj_to_location(unsigned long obj, struct zpdesc **zpdesc, *obj_idx = (obj & OBJ_INDEX_MASK); } -static void obj_to_page(unsigned long obj, struct page **page) +static void obj_to_zpdesc(unsigned long obj, struct zpdesc **zpdesc) { - *page = pfn_to_page(obj >> OBJ_INDEX_BITS); + *zpdesc = pfn_zpdesc(obj >> OBJ_INDEX_BITS); } /** @@ -1462,7 +1462,7 @@ static void obj_free(int class_size, unsigned long obj) void zs_free(struct zs_pool *pool, unsigned long handle) { struct zspage *zspage; - struct page *f_page; + struct zpdesc *f_zpdesc; unsigned long obj; struct size_class *class; int fullness; @@ -1476,8 +1476,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle) */ read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); - obj_to_page(obj, &f_page); - zspage = get_zspage(f_page); + obj_to_zpdesc(obj, &f_zpdesc); + zspage = get_zspage(zpdesc_page(f_zpdesc)); class = zspage_class(pool, zspage); spin_lock(&class->lock); read_unlock(&pool->migrate_lock); -- 2.50.1 From 68721300856cafde0188f71df59e1ca7fab1df22 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Tue, 17 Dec 2024 00:04:41 +0900 Subject: [PATCH 02/16] mm/zsmalloc: add two helpers for zs_page_migrate() and make it use zpdesc To convert page to zpdesc in zs_page_migrate(), we added zpdesc_is_isolated()/zpdesc_zone() helpers. No functional change. 
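For readers skimming the diff, the two helpers are thin wrappers over the existing struct page APIs, and the zone accounting fixup in zs_page_migrate() ends up using them directly. A minimal sketch of the post-patch shape (mirroring the hunks below):

  static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc)
  {
          return PageIsolated(zpdesc_page(zpdesc));
  }

  static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc)
  {
          return page_zone(zpdesc_page(zpdesc));
  }

  /* caller side: fix up NR_ZSPAGES when the new page sits in another zone */
  if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
          zpdesc_dec_zone_page_state(zpdesc);
          zpdesc_inc_zone_page_state(newzpdesc);
  }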
Link: https://lkml.kernel.org/r/20241216150450.1228021-11-42.hyeyoo@gmail.com Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Alex Shi Acked-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/zpdesc.h | 11 +++++++++++ mm/zsmalloc.c | 30 ++++++++++++++++-------------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/mm/zpdesc.h b/mm/zpdesc.h index cbd37d9725c7..193d40226188 100644 --- a/mm/zpdesc.h +++ b/mm/zpdesc.h @@ -154,4 +154,15 @@ static inline void __zpdesc_set_movable(struct zpdesc *zpdesc, { __SetPageMovable(zpdesc_page(zpdesc), mops); } + +static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc) +{ + return PageIsolated(zpdesc_page(zpdesc)); +} + +static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc) +{ + return page_zone(zpdesc_page(zpdesc)); +} + #endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 112603f9449f..432e78e61d2e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1796,19 +1796,21 @@ static int zs_page_migrate(struct page *newpage, struct page *page, struct size_class *class; struct zspage *zspage; struct zpdesc *dummy; + struct zpdesc *newzpdesc = page_zpdesc(newpage); + struct zpdesc *zpdesc = page_zpdesc(page); void *s_addr, *d_addr, *addr; unsigned int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; - VM_BUG_ON_PAGE(!PageIsolated(page), page); + VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc)); /* We're committed, tell the world that this is a Zsmalloc page. */ - __SetPageZsmalloc(newpage); + __SetPageZsmalloc(zpdesc_page(newzpdesc)); /* The page is locked, so this pointer must remain valid */ - zspage = get_zspage(page); + zspage = get_zspage(zpdesc_page(zpdesc)); pool = zspage->pool; /* @@ -1825,30 +1827,30 @@ static int zs_page_migrate(struct page *newpage, struct page *page, /* the migrate_write_lock protects zpage access via zs_map_object */ migrate_write_lock(zspage); - offset = get_first_obj_offset(page); - s_addr = kmap_local_page(page); + offset = get_first_obj_offset(zpdesc_page(zpdesc)); + s_addr = kmap_local_zpdesc(zpdesc); /* * Here, any user cannot access all objects in the zspage so let's move. */ - d_addr = kmap_local_page(newpage); + d_addr = kmap_local_zpdesc(newzpdesc); copy_page(d_addr, s_addr); kunmap_local(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { - if (obj_allocated(page_zpdesc(page), addr, &handle)) { + if (obj_allocated(zpdesc, addr, &handle)) { old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); - new_obj = (unsigned long)location_to_obj(newpage, + new_obj = (unsigned long)location_to_obj(zpdesc_page(newzpdesc), obj_idx); record_obj(handle, new_obj); } } kunmap_local(s_addr); - replace_sub_page(class, zspage, page_zpdesc(newpage), page_zpdesc(page)); + replace_sub_page(class, zspage, newzpdesc, zpdesc); /* * Since we complete the data copy and set up new zspage structure, * it's okay to release migration_lock. 
@@ -1857,14 +1859,14 @@ static int zs_page_migrate(struct page *newpage, struct page *page, spin_unlock(&class->lock); migrate_write_unlock(zspage); - get_page(newpage); - if (page_zone(newpage) != page_zone(page)) { - dec_zone_page_state(page, NR_ZSPAGES); - inc_zone_page_state(newpage, NR_ZSPAGES); + zpdesc_get(newzpdesc); + if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) { + zpdesc_dec_zone_page_state(zpdesc); + zpdesc_inc_zone_page_state(newzpdesc); } reset_page(page); - put_page(page); + zpdesc_put(zpdesc); return MIGRATEPAGE_SUCCESS; } -- 2.50.1 From 73349afa00419585c11ee232377879b1f2912881 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 17 Dec 2024 00:04:42 +0900 Subject: [PATCH 03/16] mm/zsmalloc: convert reset_page to reset_zpdesc zpdesc.zspage matches with page.private, zpdesc.next matches with page.index. They will be reset in reset_page() which is called prior to free base pages of a zspage. Since the fields that need to be initialized are independent of the order in struct zpdesc, Keep it to use struct page to ensure robustness against potential rearrangements of struct zpdesc fields in the future. [42.hyeyoo@gmail.com: reset zpdesc fields in reset_zpdesc()] Link: https://lkml.kernel.org/r/Z4Uw136VdG7vlKCL@localhost.localdomain [42.hyeyoo@gmail.com: keep reset_zpdesc() to use struct page fields] Link: https://lkml.kernel.org/r/20241216150450.1228021-12-42.hyeyoo@gmail.com Signed-off-by: Alex Shi Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 432e78e61d2e..98b152a0b264 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -843,12 +843,14 @@ static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj, return true; } -static void reset_page(struct page *page) +static void reset_zpdesc(struct zpdesc *zpdesc) { + struct page *page = zpdesc_page(zpdesc); + __ClearPageMovable(page); ClearPagePrivate(page); - set_page_private(page, 0); - page->index = 0; + zpdesc->zspage = NULL; + zpdesc->next = NULL; __ClearPageZsmalloc(page); } @@ -887,7 +889,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, do { VM_BUG_ON_PAGE(!PageLocked(page), page); next = get_next_page(page); - reset_page(page); + reset_zpdesc(page_zpdesc(page)); unlock_page(page); dec_zone_page_state(page, NR_ZSPAGES); put_page(page); @@ -1865,7 +1867,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, zpdesc_inc_zone_page_state(newzpdesc); } - reset_page(page); + reset_zpdesc(zpdesc); zpdesc_put(zpdesc); return MIGRATEPAGE_SUCCESS; -- 2.50.1 From 7f0b0c6642077296edec442c6f54a4d7c608392c Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Tue, 17 Dec 2024 00:04:43 +0900 Subject: [PATCH 04/16] mm/zsmalloc: convert __free_zspage() to use zpdesc Introduce zpdesc_is_locked() and convert __free_zspage() to use zpdesc. 
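For reference, zpdesc_is_locked() is a thin wrapper over the folio lock test, and the freeing loop walks the zspage by zpdesc instead of by page. A sketch of the post-patch code (mirroring the hunks below):

  static inline bool zpdesc_is_locked(struct zpdesc *zpdesc)
  {
          return folio_test_locked(zpdesc_folio(zpdesc));
  }

  /* __free_zspage() teardown loop after the conversion */
  next = zpdesc = get_first_zpdesc(zspage);
  do {
          VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc));
          next = get_next_zpdesc(zpdesc);
          reset_zpdesc(zpdesc);
          zpdesc_unlock(zpdesc);
          zpdesc_dec_zone_page_state(zpdesc);
          zpdesc_put(zpdesc);
          zpdesc = next;
  } while (zpdesc != NULL);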
Link: https://lkml.kernel.org/r/20241216150450.1228021-13-42.hyeyoo@gmail.com Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Alex Shi Acked-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/zpdesc.h | 4 ++++ mm/zsmalloc.c | 20 ++++++++++---------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/mm/zpdesc.h b/mm/zpdesc.h index 193d40226188..4a4e68446f5a 100644 --- a/mm/zpdesc.h +++ b/mm/zpdesc.h @@ -165,4 +165,8 @@ static inline struct zone *zpdesc_zone(struct zpdesc *zpdesc) return page_zone(zpdesc_page(zpdesc)); } +static inline bool zpdesc_is_locked(struct zpdesc *zpdesc) +{ + return folio_test_locked(zpdesc_folio(zpdesc)); +} #endif diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 98b152a0b264..ad54fd8e934f 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -878,23 +878,23 @@ unlock: static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { - struct page *page, *next; + struct zpdesc *zpdesc, *next; assert_spin_locked(&class->lock); VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0); - next = page = get_first_page(zspage); + next = zpdesc = get_first_zpdesc(zspage); do { - VM_BUG_ON_PAGE(!PageLocked(page), page); - next = get_next_page(page); - reset_zpdesc(page_zpdesc(page)); - unlock_page(page); - dec_zone_page_state(page, NR_ZSPAGES); - put_page(page); - page = next; - } while (page != NULL); + VM_BUG_ON_PAGE(!zpdesc_is_locked(zpdesc), zpdesc_page(zpdesc)); + next = get_next_zpdesc(zpdesc); + reset_zpdesc(zpdesc); + zpdesc_unlock(zpdesc); + zpdesc_dec_zone_page_state(zpdesc); + zpdesc_put(zpdesc); + zpdesc = next; + } while (zpdesc != NULL); cache_free_zspage(pool, zspage); -- 2.50.1 From 2d57eb9ea996749365a4b2cdf2f2c688363b3b93 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Tue, 17 Dec 2024 00:04:44 +0900 Subject: [PATCH 05/16] mm/zsmalloc: convert location_to_obj() to take zpdesc As all users of location_to_obj() now use zpdesc, convert location_to_obj() to take zpdesc. 
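The encoding itself is unchanged (PFN in the high bits, object index in the low bits); only the descriptor type passed in changes. Post-patch shape, for reference (mirroring the hunk below):

  static unsigned long location_to_obj(struct zpdesc *zpdesc, unsigned int obj_idx)
  {
          unsigned long obj;

          obj = zpdesc_pfn(zpdesc) << OBJ_INDEX_BITS;
          obj |= obj_idx & OBJ_INDEX_MASK;

          return obj;
  }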
Link: https://lkml.kernel.org/r/20241216150450.1228021-14-42.hyeyoo@gmail.com Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Alex Shi Acked-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index ad54fd8e934f..c09a78f9d453 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -804,15 +804,15 @@ static void obj_to_zpdesc(unsigned long obj, struct zpdesc **zpdesc) } /** - * location_to_obj - get obj value encoded from (, ) - * @page: page object resides in zspage + * location_to_obj - get obj value encoded from (, ) + * @zpdesc: zpdesc object resides in zspage * @obj_idx: object index */ -static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) +static unsigned long location_to_obj(struct zpdesc *zpdesc, unsigned int obj_idx) { unsigned long obj; - obj = page_to_pfn(page) << OBJ_INDEX_BITS; + obj = zpdesc_pfn(zpdesc) << OBJ_INDEX_BITS; obj |= obj_idx & OBJ_INDEX_MASK; return obj; @@ -1358,7 +1358,7 @@ static unsigned long obj_malloc(struct zs_pool *pool, kunmap_local(vaddr); mod_zspage_inuse(zspage, 1); - obj = location_to_obj(zpdesc_page(m_zpdesc), obj); + obj = location_to_obj(m_zpdesc, obj); record_obj(handle, obj); return obj; @@ -1845,8 +1845,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); - new_obj = (unsigned long)location_to_obj(zpdesc_page(newzpdesc), - obj_idx); + new_obj = (unsigned long)location_to_obj(newzpdesc, obj_idx); record_obj(handle, new_obj); } } -- 2.50.1 From 65a1cf15802c7a571299df507b1b72ba89ef1da8 Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Tue, 17 Dec 2024 00:04:45 +0900 Subject: [PATCH 06/16] mm/zsmalloc: convert migrate_zspage() to use zpdesc Use get_first_zpdesc/get_next_zpdesc to replace get_first/next_page. No functional change. 
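The object walk in migrate_zspage() keeps the same structure; only the iteration helpers change. A sketch of the converted loop head (mirroring the hunk below):

  struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage);

  while (1) {
          handle = find_alloced_obj(class, s_zpdesc, &obj_idx);
          if (!handle) {
                  s_zpdesc = get_next_zpdesc(s_zpdesc);
                  if (!s_zpdesc)
                          break;
                  obj_idx = 0;
                  continue;
          }
          /* ... migrate the object as before ... */
  }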
Link: https://lkml.kernel.org/r/20241216150450.1228021-15-42.hyeyoo@gmail.com Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Alex Shi Acked-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c09a78f9d453..fd07670b0996 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1602,14 +1602,14 @@ static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, unsigned long used_obj, free_obj; unsigned long handle; int obj_idx = 0; - struct page *s_page = get_first_page(src_zspage); + struct zpdesc *s_zpdesc = get_first_zpdesc(src_zspage); struct size_class *class = pool->size_class[src_zspage->class]; while (1) { - handle = find_alloced_obj(class, page_zpdesc(s_page), &obj_idx); + handle = find_alloced_obj(class, s_zpdesc, &obj_idx); if (!handle) { - s_page = get_next_page(s_page); - if (!s_page) + s_zpdesc = get_next_zpdesc(s_zpdesc); + if (!s_zpdesc) break; obj_idx = 0; continue; -- 2.50.1 From 6d0adf4b6262157ed212b7869af8f2f5d72ea38a Mon Sep 17 00:00:00 2001 From: Hyeonggon Yoo <42.hyeyoo@gmail.com> Date: Tue, 17 Dec 2024 00:04:46 +0900 Subject: [PATCH 07/16] mm/zsmalloc: convert get_zspage() to take zpdesc Now that all users except get_next_page() (which will be removed in later patch) use zpdesc, convert get_zspage() to take zpdesc instead of page. Link: https://lkml.kernel.org/r/20241216150450.1228021-16-42.hyeyoo@gmail.com Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Alex Shi Acked-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index fd07670b0996..77fd4e0b5bb0 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -757,9 +757,9 @@ out: return newfg; } -static struct zspage *get_zspage(struct page *page) +static struct zspage *get_zspage(struct zpdesc *zpdesc) { - struct zspage *zspage = (struct zspage *)page_private(page); + struct zspage *zspage = zpdesc->zspage; BUG_ON(zspage->magic != ZSPAGE_MAGIC); return zspage; @@ -767,7 +767,7 @@ static struct zspage *get_zspage(struct page *page) static struct page *get_next_page(struct page *page) { - struct zspage *zspage = get_zspage(page); + struct zspage *zspage = get_zspage(page_zpdesc(page)); if (unlikely(ZsHugePage(zspage))) return NULL; @@ -777,7 +777,7 @@ static struct page *get_next_page(struct page *page) static struct zpdesc *get_next_zpdesc(struct zpdesc *zpdesc) { - struct zspage *zspage = get_zspage(zpdesc_page(zpdesc)); + struct zspage *zspage = get_zspage(zpdesc); if (unlikely(ZsHugePage(zspage))) return NULL; @@ -827,7 +827,7 @@ static inline bool obj_allocated(struct zpdesc *zpdesc, void *obj, unsigned long *phandle) { unsigned long handle; - struct zspage *zspage = get_zspage(zpdesc_page(zpdesc)); + struct zspage *zspage = get_zspage(zpdesc); if (unlikely(ZsHugePage(zspage))) { VM_BUG_ON_PAGE(!is_first_zpdesc(zpdesc), zpdesc_page(zpdesc)); @@ -1232,7 +1232,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); obj_to_location(obj, &zpdesc, &obj_idx); - zspage = get_zspage(zpdesc_page(zpdesc)); + zspage = get_zspage(zpdesc); /* * migration cannot 
move any zpages in this zspage. Here, class->lock @@ -1282,7 +1282,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) obj = handle_to_obj(handle); obj_to_location(obj, &zpdesc, &obj_idx); - zspage = get_zspage(zpdesc_page(zpdesc)); + zspage = get_zspage(zpdesc); class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); @@ -1445,7 +1445,7 @@ static void obj_free(int class_size, unsigned long obj) obj_to_location(obj, &f_zpdesc, &f_objidx); f_offset = offset_in_page(class_size * f_objidx); - zspage = get_zspage(zpdesc_page(f_zpdesc)); + zspage = get_zspage(f_zpdesc); vaddr = kmap_local_zpdesc(f_zpdesc); link = (struct link_free *)(vaddr + f_offset); @@ -1479,7 +1479,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle) read_lock(&pool->migrate_lock); obj = handle_to_obj(handle); obj_to_zpdesc(obj, &f_zpdesc); - zspage = get_zspage(zpdesc_page(f_zpdesc)); + zspage = get_zspage(f_zpdesc); class = zspage_class(pool, zspage); spin_lock(&class->lock); read_unlock(&pool->migrate_lock); @@ -1812,7 +1812,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, __SetPageZsmalloc(zpdesc_page(newzpdesc)); /* The page is locked, so this pointer must remain valid */ - zspage = get_zspage(zpdesc_page(zpdesc)); + zspage = get_zspage(zpdesc); pool = zspage->pool; /* -- 2.50.1 From 74999813c03323a9133fa378a1ce313c10cef24c Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 17 Dec 2024 00:04:47 +0900 Subject: [PATCH 08/16] mm/zsmalloc: convert SetZsPageMovable and remove unused funcs Convert SetZsPageMovable() to use zpdesc, and then remove unused funcs: get_next_page()/get_first_page()/is_first_page(). Link: https://lkml.kernel.org/r/20241216150450.1228021-17-42.hyeyoo@gmail.com Originally-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Signed-off-by: Alex Shi Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 77fd4e0b5bb0..43b4f5772b1e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -452,11 +452,6 @@ static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { .lock = INIT_LOCAL_LOCK(lock), }; -static __maybe_unused int is_first_page(struct page *page) -{ - return PagePrivate(page); -} - static inline bool is_first_zpdesc(struct zpdesc *zpdesc) { return PagePrivate(zpdesc_page(zpdesc)); @@ -473,14 +468,6 @@ static inline void mod_zspage_inuse(struct zspage *zspage, int val) zspage->inuse += val; } -static inline struct page *get_first_page(struct zspage *zspage) -{ - struct page *first_page = zpdesc_page(zspage->first_zpdesc); - - VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); - return first_page; -} - static struct zpdesc *get_first_zpdesc(struct zspage *zspage) { struct zpdesc *first_zpdesc = zspage->first_zpdesc; @@ -765,16 +752,6 @@ static struct zspage *get_zspage(struct zpdesc *zpdesc) return zspage; } -static struct page *get_next_page(struct page *page) -{ - struct zspage *zspage = get_zspage(page_zpdesc(page)); - - if (unlikely(ZsHugePage(zspage))) - return NULL; - - return (struct page *)page->index; -} - static struct zpdesc *get_next_zpdesc(struct zpdesc *zpdesc) { struct zspage *zspage = get_zspage(zpdesc); @@ -1936,13 +1913,13 @@ static void init_deferred_free(struct zs_pool *pool) static void SetZsPageMovable(struct 
zs_pool *pool, struct zspage *zspage) { - struct page *page = get_first_page(zspage); + struct zpdesc *zpdesc = get_first_zpdesc(zspage); do { - WARN_ON(!trylock_page(page)); - __SetPageMovable(page, &zsmalloc_mops); - unlock_page(page); - } while ((page = get_next_page(page)) != NULL); + WARN_ON(!zpdesc_trylock(zpdesc)); + __zpdesc_set_movable(zpdesc, &zsmalloc_mops); + zpdesc_unlock(zpdesc); + } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); } #else static inline void zs_flush_migration(struct zs_pool *pool) { } -- 2.50.1 From fc5eec0d8c898f27057d797351ae5055c1daa585 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 17 Dec 2024 00:04:48 +0900 Subject: [PATCH 09/16] mm/zsmalloc: convert get/set_first_obj_offset() to take zpdesc Now that all users of get/set_first_obj_offset() are converted to use zpdesc, convert them to take zpdesc. Link: https://lkml.kernel.org/r/20241216150450.1228021-18-42.hyeyoo@gmail.com Signed-off-by: Alex Shi Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 43b4f5772b1e..69d1a7ea1a0d 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -478,20 +478,20 @@ static struct zpdesc *get_first_zpdesc(struct zspage *zspage) #define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff -static inline unsigned int get_first_obj_offset(struct page *page) +static inline unsigned int get_first_obj_offset(struct zpdesc *zpdesc) { - VM_WARN_ON_ONCE(!PageZsmalloc(page)); - return page->page_type & FIRST_OBJ_PAGE_TYPE_MASK; + VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc))); + return zpdesc->first_obj_offset & FIRST_OBJ_PAGE_TYPE_MASK; } -static inline void set_first_obj_offset(struct page *page, unsigned int offset) +static inline void set_first_obj_offset(struct zpdesc *zpdesc, unsigned int offset) { /* With 24 bits available, we can support offsets into 16 MiB pages. 
*/ BUILD_BUG_ON(PAGE_SIZE > SZ_16M); - VM_WARN_ON_ONCE(!PageZsmalloc(page)); + VM_WARN_ON_ONCE(!PageZsmalloc(zpdesc_page(zpdesc))); VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK); - page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK; - page->page_type |= offset & FIRST_OBJ_PAGE_TYPE_MASK; + zpdesc->first_obj_offset &= ~FIRST_OBJ_PAGE_TYPE_MASK; + zpdesc->first_obj_offset |= offset & FIRST_OBJ_PAGE_TYPE_MASK; } static inline unsigned int get_freeobj(struct zspage *zspage) @@ -911,7 +911,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) struct link_free *link; void *vaddr; - set_first_obj_offset(zpdesc_page(zpdesc), off); + set_first_obj_offset(zpdesc, off); vaddr = kmap_local_zpdesc(zpdesc); link = (struct link_free *)vaddr + off / sizeof(*link); @@ -1555,7 +1555,7 @@ static unsigned long find_alloced_obj(struct size_class *class, unsigned long handle = 0; void *addr = kmap_local_zpdesc(zpdesc); - offset = get_first_obj_offset(zpdesc_page(zpdesc)); + offset = get_first_obj_offset(zpdesc); offset += class->size * index; while (offset < PAGE_SIZE) { @@ -1750,8 +1750,8 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage, } while ((zpdesc = get_next_zpdesc(zpdesc)) != NULL); create_page_chain(class, zspage, zpdescs); - first_obj_offset = get_first_obj_offset(zpdesc_page(oldzpdesc)); - set_first_obj_offset(zpdesc_page(newzpdesc), first_obj_offset); + first_obj_offset = get_first_obj_offset(oldzpdesc); + set_first_obj_offset(newzpdesc, first_obj_offset); if (unlikely(ZsHugePage(zspage))) newzpdesc->handle = oldzpdesc->handle; __zpdesc_set_movable(newzpdesc, &zsmalloc_mops); @@ -1806,7 +1806,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, /* the migrate_write_lock protects zpage access via zs_map_object */ migrate_write_lock(zspage); - offset = get_first_obj_offset(zpdesc_page(zpdesc)); + offset = get_first_obj_offset(zpdesc); s_addr = kmap_local_zpdesc(zpdesc); /* -- 2.50.1 From b5f469a140b9f535fe2d3770209d5d751822898a Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 17 Dec 2024 00:04:49 +0900 Subject: [PATCH 10/16] mm/zsmalloc: introduce __zpdesc_clear/set_zsmalloc() Add helper __zpdesc_clear_zsmalloc() for __ClearPageZsmalloc(), __zpdesc_set_zsmalloc() for __SetPageZsmalloc(), and use them in callers. 
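Both helpers are one-line wrappers; a sketch of their definitions and of the alloc_zspage() unwind path that uses the clear variant (mirroring the hunks below):

  static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc)
  {
          __SetPageZsmalloc(zpdesc_page(zpdesc));
  }

  static inline void __zpdesc_clear_zsmalloc(struct zpdesc *zpdesc)
  {
          __ClearPageZsmalloc(zpdesc_page(zpdesc));
  }

  /* alloc_zspage() unwind path after a failed allocation */
  while (--i >= 0) {
          zpdesc_dec_zone_page_state(zpdescs[i]);
          __zpdesc_clear_zsmalloc(zpdescs[i]);
          free_zpdesc(zpdescs[i]);
  }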
[42.hyeyoo@gmail.com: keep reset_zpdesc() to use struct page] Link: https://lkml.kernel.org/r/20241216150450.1228021-19-42.hyeyoo@gmail.com Signed-off-by: Alex Shi Signed-off-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Acked-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/zpdesc.h | 10 ++++++++++ mm/zsmalloc.c | 6 +++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/mm/zpdesc.h b/mm/zpdesc.h index 4a4e68446f5a..fa47fece2237 100644 --- a/mm/zpdesc.h +++ b/mm/zpdesc.h @@ -155,6 +155,16 @@ static inline void __zpdesc_set_movable(struct zpdesc *zpdesc, __SetPageMovable(zpdesc_page(zpdesc), mops); } +static inline void __zpdesc_set_zsmalloc(struct zpdesc *zpdesc) +{ + __SetPageZsmalloc(zpdesc_page(zpdesc)); +} + +static inline void __zpdesc_clear_zsmalloc(struct zpdesc *zpdesc) +{ + __ClearPageZsmalloc(zpdesc_page(zpdesc)); +} + static inline bool zpdesc_is_isolated(struct zpdesc *zpdesc) { return PageIsolated(zpdesc_page(zpdesc)); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 69d1a7ea1a0d..817626a351f8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1001,13 +1001,13 @@ static struct zspage *alloc_zspage(struct zs_pool *pool, if (!zpdesc) { while (--i >= 0) { zpdesc_dec_zone_page_state(zpdescs[i]); - __ClearPageZsmalloc(zpdesc_page(zpdescs[i])); + __zpdesc_clear_zsmalloc(zpdescs[i]); free_zpdesc(zpdescs[i]); } cache_free_zspage(pool, zspage); return NULL; } - __SetPageZsmalloc(zpdesc_page(zpdesc)); + __zpdesc_set_zsmalloc(zpdesc); zpdesc_inc_zone_page_state(zpdesc); zpdescs[i] = zpdesc; @@ -1786,7 +1786,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc)); /* We're committed, tell the world that this is a Zsmalloc page. */ - __SetPageZsmalloc(zpdesc_page(newzpdesc)); + __zpdesc_set_zsmalloc(newzpdesc); /* The page is locked, so this pointer must remain valid */ zspage = get_zspage(zpdesc); -- 2.50.1 From ade81479c7dda1ce3eedb215c78bc615bbd04f06 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 24 Dec 2024 02:52:38 +0000 Subject: [PATCH 11/16] memcg: fix soft lockup in the OOM process MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit A soft lockup issue was found in the product with about 56,000 tasks were in the OOM cgroup, it was traversing them when the soft lockup was triggered. watchdog: BUG: soft lockup - CPU#2 stuck for 23s! 
[VM Thread:1503066] CPU: 2 PID: 1503066 Comm: VM Thread Kdump: loaded Tainted: G Hardware name: Huawei Cloud OpenStack Nova, BIOS RIP: 0010:console_unlock+0x343/0x540 RSP: 0000:ffffb751447db9a0 EFLAGS: 00000247 ORIG_RAX: ffffffffffffff13 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 00000000ffffffff RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000247 RBP: ffffffffafc71f90 R08: 0000000000000000 R09: 0000000000000040 R10: 0000000000000080 R11: 0000000000000000 R12: ffffffffafc74bd0 R13: ffffffffaf60a220 R14: 0000000000000247 R15: 0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f2fe6ad91f0 CR3: 00000004b2076003 CR4: 0000000000360ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: vprintk_emit+0x193/0x280 printk+0x52/0x6e dump_task+0x114/0x130 mem_cgroup_scan_tasks+0x76/0x100 dump_header+0x1fe/0x210 oom_kill_process+0xd1/0x100 out_of_memory+0x125/0x570 mem_cgroup_out_of_memory+0xb5/0xd0 try_charge+0x720/0x770 mem_cgroup_try_charge+0x86/0x180 mem_cgroup_try_charge_delay+0x1c/0x40 do_anonymous_page+0xb5/0x390 handle_mm_fault+0xc4/0x1f0 This is because thousands of processes are in the OOM cgroup, it takes a long time to traverse all of them. As a result, this lead to soft lockup in the OOM process. To fix this issue, call 'cond_resched' in the 'mem_cgroup_scan_tasks' function per 1000 iterations. For global OOM, call 'touch_softlockup_watchdog' per 1000 iterations to avoid this issue. Link: https://lkml.kernel.org/r/20241224025238.3768787-1-chenridong@huaweicloud.com Fixes: 9cbb78bb3143 ("mm, memcg: introduce own oom handler to iterate only over its own threads") Signed-off-by: Chen Ridong Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Cc: Michal Koutný Cc: Signed-off-by: Andrew Morton --- mm/memcontrol.c | 7 ++++++- mm/oom_kill.c | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 65fb5eee1466..46f8b372d212 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1161,6 +1161,7 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, { struct mem_cgroup *iter; int ret = 0; + int i = 0; BUG_ON(mem_cgroup_is_root(memcg)); @@ -1169,8 +1170,12 @@ void mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct task_struct *task; css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); - while (!ret && (task = css_task_iter_next(&it))) + while (!ret && (task = css_task_iter_next(&it))) { + /* Avoid potential softlockup warning */ + if ((++i & 1023) == 0) + cond_resched(); ret = fn(task, arg); + } css_task_iter_end(&it); if (ret) { mem_cgroup_iter_break(memcg, iter); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1c485beb0b93..044ebab2c941 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "internal.h" @@ -430,10 +431,15 @@ static void dump_tasks(struct oom_control *oc) mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); else { struct task_struct *p; + int i = 0; rcu_read_lock(); - for_each_process(p) + for_each_process(p) { + /* Avoid potential softlockup warning */ + if ((++i & 1023) == 0) + touch_softlockup_watchdog(); dump_task(p, oc); + } rcu_read_unlock(); } } -- 2.50.1 From 07438779313caafe52ac1a1a6958d735a5938988 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 26 Dec 2024 13:16:38 -0800 Subject: [PATCH 12/16] alloc_tag: avoid current->alloc_tag manipulations when 
profiling is disabled When memory allocation profiling is disabled there is no need to update current->alloc_tag and these manipulations add unnecessary overhead. Fix the overhead by skipping these extra updates. I ran comprehensive testing on Pixel 6 on Big, Medium and Little cores: Overhead before fixes Overhead after fixes slab alloc page alloc slab alloc page alloc Big 6.21% 5.32% 3.31% 4.93% Medium 4.51% 5.05% 3.79% 4.39% Little 7.62% 1.82% 6.68% 1.02% This is an allocation microbenchmark doing allocations in a tight loop. Not a really realistic scenario and useful only to make performance comparisons. Link: https://lkml.kernel.org/r/20241226211639.1357704-1-surenb@google.com Fixes: b951aaff5035 ("mm: enable page allocation tagging") Signed-off-by: Suren Baghdasaryan Cc: David Wang <00107082@163.com> Cc: Kent Overstreet Cc: Yu Zhao Cc: Zhenhua Huang Signed-off-by: Andrew Morton --- include/linux/alloc_tag.h | 11 ++++++++--- lib/alloc_tag.c | 2 ++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 0bbbe537c5f9..a946e0203e6d 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -224,9 +224,14 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} #define alloc_hooks_tag(_tag, _do_alloc) \ ({ \ - struct alloc_tag * __maybe_unused _old = alloc_tag_save(_tag); \ - typeof(_do_alloc) _res = _do_alloc; \ - alloc_tag_restore(_tag, _old); \ + typeof(_do_alloc) _res; \ + if (mem_alloc_profiling_enabled()) { \ + struct alloc_tag * __maybe_unused _old; \ + _old = alloc_tag_save(_tag); \ + _res = _do_alloc; \ + alloc_tag_restore(_tag, _old); \ + } else \ + _res = _do_alloc; \ _res; \ }) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 4bb778be4476..894f19694010 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -29,6 +29,8 @@ EXPORT_SYMBOL(_shared_alloc_tag); DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, mem_alloc_profiling_key); +EXPORT_SYMBOL(mem_alloc_profiling_key); + DEFINE_STATIC_KEY_FALSE(mem_profiling_compressed); struct alloc_tag_kernel_section kernel_tags = { NULL, 0 }; -- 2.50.1 From d563ced682505c4e0a713b0de73abf58550ff97e Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:20 +0800 Subject: [PATCH 13/16] mm, swap: minor clean up for swap entry allocation Patch series "mm, swap: rework of swap allocator locks", v4. This series greatly improved swap performance by reworking the locking design and simplify a lot of code path. Test showed a up to 400% vm-scalability improvement with pmem as SWAP, and up to 37% reduce of kernel compile real time with ZRAM as SWAP (up to 60% improvement in system time). This is part of the new swap allocator discussed during the "Swap Abstraction" discussion at LSF/MM 2024, and "mTHP and swap allocator" discussion at LPC 2024. This is a follow up of previous swap cluster allocator series: https://lore.kernel.org/linux-mm/20240730-swap-allocator-v5-0-cb9c148b9297@kernel.org/ Also enables further optimizations which will come later. Previous series introduced a fully cluster based allocator, this series completely get rid of the old allocator and makes the new allocator avoid touching the si->lock unless needed. This bring huge performance gain and get rid of slot cache for freeing path. Currently, swap locking is mainly composed of two locks, cluster lock (ci->lock) and device lock (si->lock). The device lock is widely used to protect many things, causing it to be the main bottleneck for SWAP. 
Cluster lock is much more fine-grained, so it is best to use ci->lock instead of si->lock as much as possible. `perf lock' indicates this issue clearly. Doing a Linux kernel build using tmpfs and ZRAM with limited memory (make -j64 with a 1G memcg and 4k pages), "perf lock contention -ab sleep 3" shows:

  contended  total wait  max wait   avg wait  type      caller
      34948     53.63 s    7.11 ms   1.53 ms  spinlock  free_swap_and_cache_nr+0x350
      16569     40.05 s    6.45 ms   2.42 ms  spinlock  get_swap_pages+0x231
      11191     28.41 s    7.03 ms   2.54 ms  spinlock  swapcache_free_entries+0x59
       4147     22.78 s  122.66 ms   5.49 ms  spinlock  page_vma_mapped_walk+0x6f3
       4595      7.17 s    6.79 ms   1.56 ms  spinlock  swapcache_free_entries+0x59
     406027      2.74 s    2.59 ms   6.74 us  spinlock  list_lru_add+0x39
  ...snip...

The top 5 callers are all users of si->lock; their total wait time sums to several minutes in the 3 second window. Following the new allocator design, many operations don't need to touch si->lock at all. We only need to take si->lock when doing operations across multiple clusters (changing the cluster list). So ideally the allocator should always take ci->lock first, and take si->lock only if needed. But due to historical reasons, ci->lock is used inside the si->lock critical section, causing lock inversion if we simply try to acquire si->lock after acquiring ci->lock.

This series audits all si->lock usage, cleans up legacy code, and eliminates usage of si->lock as much as possible by introducing new designs based on the new cluster allocator. The old HDD allocation code is removed, the cluster allocator is adapted with small changes for HDD usage, and testing looks OK. This also removes the slot cache for the freeing path. Performance is even better without it now, and this enables other cleanups and optimizations as discussed before:
https://lore.kernel.org/all/CAMgjq7ACohT_uerSz8E_994ZZCv709Zor+43hdmesW_59W1BWw@mail.gmail.com/

After this series, lock contention on si->lock is nearly unobservable with `perf lock' in the same test above:

  contended  total wait  max wait  avg wait  type      caller
  ... snip ...
         52   127.12 us   3.82 us   2.44 us  spinlock  move_cluster+0x2c
         56   120.77 us  12.41 us   2.16 us  spinlock  move_cluster+0x2c
  ... snip ...
         10    21.96 us   2.78 us   2.20 us  spinlock  isolate_lock_cluster+0x20
  ... snip ...
          9    19.27 us   2.70 us   2.14 us  spinlock  move_cluster+0x2c
  ... snip ...
          5    11.07 us   2.70 us   2.21 us  spinlock  isolate_lock_cluster+0x20

`move_cluster' and `isolate_lock_cluster' (two newly introduced helpers) are basically the only users of si->lock now; the performance gain is huge, and LOC is reduced.

Test Results:

vm-scalability
==============
Running `usemem --init-time -O -y -x -R -31 1G` from vm-scalability in a 12G memory cgroup using simulated pmem as the SWAP backend (32G pmem, 32 CPUs). 4K folios are used by default; 64k mTHP and sequential access (!-R) results are also provided. 6 test runs for each case, total throughput:

  Test              Before (KB/s)  (stdev)      After (KB/s)  (stdev)      Delta
  ------------------------------------------------------------------------------
  Random (4K):          69937.11   (16449.77)     369816.17   (24476.68)   +428.78%
  Random (64k):        123442.83   (13207.51)     216379.00   (25024.83)   +75.28%
  Sequential (4K):    6313909.83   (148856.12)   6419860.66   (183563.38)  +1.7%

Sequential access causes lower stress for the allocator so the gain is limited, but with random access (which is much closer to real workloads) the performance gain is huge.
Build kernel with defconfig on tmpfs with ZRAM
==============================================
The results below show a test matrix using different memory cgroup limits and job numbers, scaled up progressively for an intuitive comparison. Done on a 48c96t system, 6 test runs for each case. It can be seen clearly that as the concurrent job number goes higher, the performance gain is higher, but even -j6 shows a slight improvement.

  make -j           | System Time (seconds)     | Total Time (seconds)
  (NR / Mem / ZRAM) | (Before / After / Delta)  | (Before / After / Delta)
  With 4k pages only:
  6  / 192M / 3G    |  1533 /  1522 / -0.7%     |  1420 / 1414 / -0.3%
  12 / 256M / 4G    |  2275 /  2226 / -2.2%     |   758 /  742 / -2.1%
  24 / 384M / 5G    |  3596 /  3154 / -12.3%    |   476 /  422 / -11.3%
  48 / 768M / 7G    |  8159 /  3605 / -55.8%    |   330 /  221 / -33.0%
  96 / 1.5G / 10G   | 18541 /  6462 / -65.1%    |   283 /  180 / -36.4%
  With 64k mTHP:
  24 / 512M / 5G    |  3585 /  3469 / -3.2%     |   293 /  290 / -0.1%
  48 / 1G   / 7G    |  8173 /  3607 / -55.9%    |   251 /  158 / -37.0%
  96 / 2G   / 10G   | 16305 /  7791 / -52.2%    |   226 /  144 / -36.3%

Fragmentation is reduced too:

With: make -j96 / 1152M memcg, 64K mTHP: (avg of 4 test runs)
Before:
  hugepages-64kB/stats/swpout: 1696184
  hugepages-64kB/stats/swpout_fallback: 414318
After: (-63.2% mTHP swapout failure)
  hugepages-64kB/stats/swpout: 1866267
  hugepages-64kB/stats/swpout_fallback: 158330

There is an up to 65.1% improvement in sys time for the kernel build test, and a lower fragmentation rate.

Build kernel with tinyconfig on tmpfs with HDD as swap:
=======================================================
This test is similar to the above, but the HDD test is very noisy and slow and the deviation is huge, so tinyconfig is used instead and the median result of 3 test runs is taken, which looks OK:

Before this series:
  114.44user 29.11system 39:42.90elapsed 6%CPU 2901232inputs+0outputs (238877major+4227640minor)pagefaults

After this commit:
  113.90user 23.81system 38:11.77elapsed 6%CPU 2548728inputs+0outputs (235471major+4238110minor)pagefaults

Single thread SWAP:
===================
Sequential SWAP should also be slightly faster as we removed a lot of unnecessary parts. Test using a micro benchmark swapping out/in 4G of zero memory using ZRAM, 10 test runs:

Swapout Before (avg. 3359304):
  3353796 3358551 3371305 3356043 3367524 3355303 3355924 3354513 3360776

Swapin Before (avg. 1928698):
  1920283 1927183 1934105 1921373 1926562 1938261 1927726 1928636 1934155

Swapout After (avg. 3347511, -0.4%):
  3337863 3347948 3355235 3339081 3333134 3353006 3354917 3346055 3360359

Swapin After (avg. 1922290, -0.3%):
  1919101 1925743 1916810 1917007 1923930 1935152 1917403 1923549 1921913

The gain is limited and at noise level, but seems slightly better.

This patch (of 13):

Direct reclaim can skip the whole folio after reclaiming a set of folio-based slots. Also simplify the allocation code and reduce indentation.
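Concretely, cluster_reclaim_range() now advances the scan cursor by however many slots __try_to_reclaim_swap() reclaimed, instead of rechecking every slot of a reclaimed folio. A sketch of the reworked loop (mirroring the hunk below):

  unsigned long offset = start;
  int nr_reclaim;

  do {
          switch (READ_ONCE(map[offset])) {
          case 0:
                  offset++;
                  break;
          case SWAP_HAS_CACHE:
                  nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT);
                  if (nr_reclaim > 0)
                          offset += nr_reclaim;   /* skip the whole reclaimed folio */
                  else
                          goto out;
                  break;
          default:
                  goto out;
          }
  } while (offset < end);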
Link: https://lkml.kernel.org/r/20250113175732.48099-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20250113175732.48099-2-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baoquan He Cc: Barry Song Cc: Chis Li (Google) Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/swapfile.c | 59 +++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 30 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index b0a9071cfe1d..f8002f110104 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -604,23 +604,28 @@ static bool cluster_reclaim_range(struct swap_info_struct *si, unsigned long start, unsigned long end) { unsigned char *map = si->swap_map; - unsigned long offset; + unsigned long offset = start; + int nr_reclaim; spin_unlock(&ci->lock); spin_unlock(&si->lock); - for (offset = start; offset < end; offset++) { + do { switch (READ_ONCE(map[offset])) { case 0: - continue; + offset++; + break; case SWAP_HAS_CACHE: - if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0) - continue; - goto out; + nr_reclaim = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); + if (nr_reclaim > 0) + offset += nr_reclaim; + else + goto out; + break; default: goto out; } - } + } while (offset < end); out: spin_lock(&si->lock); spin_lock(&ci->lock); @@ -838,35 +843,30 @@ new_cluster: &found, order, usage); frags++; if (found) - break; + goto done; } - if (!found) { + /* + * Nonfull clusters are moved to frag tail if we reached + * here, count them too, don't over scan the frag list. + */ + while (frags < si->frag_cluster_nr[order]) { + ci = list_first_entry(&si->frag_clusters[order], + struct swap_cluster_info, list); /* - * Nonfull clusters are moved to frag tail if we reached - * here, count them too, don't over scan the frag list. + * Rotate the frag list to iterate, they were all failing + * high order allocation or moved here due to per-CPU usage, + * this help keeping usable cluster ahead. */ - while (frags < si->frag_cluster_nr[order]) { - ci = list_first_entry(&si->frag_clusters[order], - struct swap_cluster_info, list); - /* - * Rotate the frag list to iterate, they were all failing - * high order allocation or moved here due to per-CPU usage, - * this help keeping usable cluster ahead. - */ - list_move_tail(&ci->list, &si->frag_clusters[order]); - offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), - &found, order, usage); - frags++; - if (found) - break; - } + list_move_tail(&ci->list, &si->frag_clusters[order]); + offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), + &found, order, usage); + frags++; + if (found) + goto done; } } - if (found) - goto done; - if (!list_empty(&si->discard_clusters)) { /* * we don't have free cluster but have some clusters in @@ -904,7 +904,6 @@ new_cluster: goto done; } } - done: cluster->next[order] = offset; return found; -- 2.50.1 From e027ec414fe8f540c6098ba1214e63e43b4eba1b Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:21 +0800 Subject: [PATCH 14/16] mm, swap: fold swap_info_get_cont in the only caller The name of the function is confusing, and the code is much easier to follow after folding, also rename the confusing naming "p" to more meaningful "si". 
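After the fold, the per-device lock handover lives directly in swapcache_free_entries(); entries are sorted by device first, so each si->lock is still taken only once per device. A sketch of the folded loop (mirroring the hunk below):

  for (i = 0; i < n; ++i) {
          si = _swap_info_get(entries[i]);

          if (si != prev) {
                  if (prev != NULL)
                          spin_unlock(&prev->lock);
                  if (si != NULL)
                          spin_lock(&si->lock);
          }
          if (si)
                  swap_entry_range_free(si, entries[i], 1);
          prev = si;
  }
  if (si)
          spin_unlock(&si->lock);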
Link: https://lkml.kernel.org/r/20250113175732.48099-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Baoquan He Cc: Barry Song Cc: Chis Li Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/swapfile.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index f8002f110104..574059158627 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1375,22 +1375,6 @@ out: return NULL; } -static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, - struct swap_info_struct *q) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - - if (p != q) { - if (q != NULL) - spin_unlock(&q->lock); - if (p != NULL) - spin_lock(&p->lock); - } - return p; -} - static unsigned char __swap_entry_free_locked(struct swap_info_struct *si, unsigned long offset, unsigned char usage) @@ -1687,14 +1671,14 @@ static int swp_entry_cmp(const void *ent1, const void *ent2) void swapcache_free_entries(swp_entry_t *entries, int n) { - struct swap_info_struct *p, *prev; + struct swap_info_struct *si, *prev; int i; if (n <= 0) return; prev = NULL; - p = NULL; + si = NULL; /* * Sort swap entries by swap device, so each lock is only taken once. @@ -1704,13 +1688,20 @@ void swapcache_free_entries(swp_entry_t *entries, int n) if (nr_swapfiles > 1) sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL); for (i = 0; i < n; ++i) { - p = swap_info_get_cont(entries[i], prev); - if (p) - swap_entry_range_free(p, entries[i], 1); - prev = p; + si = _swap_info_get(entries[i]); + + if (si != prev) { + if (prev != NULL) + spin_unlock(&prev->lock); + if (si != NULL) + spin_lock(&si->lock); + } + if (si) + swap_entry_range_free(si, entries[i], 1); + prev = si; } - if (p) - spin_unlock(&p->lock); + if (si) + spin_unlock(&si->lock); } int __swap_count(swp_entry_t entry) -- 2.50.1 From 7277433096f6ce4a84a1620529ac4ba3e1041ee1 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:22 +0800 Subject: [PATCH 15/16] mm, swap: remove old allocation path for HDD We are currently using different swap allocation algorithm for HDD and non-HDD. This leads to the existence of a different set of locks, and the code path is heavily bloated, causing difficulties for further optimization and maintenance. This commit removes all HDD swap allocation and related dead code, and uses the cluster allocation algorithm instead. The performance may drop temporarily, but this should be negligible: The main advantage of the legacy HDD allocation algorithm is that it tends to use continuous slots, but swap device gets fragmented quickly anyway, and the attempt to use continuous slots will fail easily. This commit also enables mTHP swap on HDD, which is expected to be beneficial, and following commits will adapt and optimize the cluster allocator for HDD. 
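With the legacy scanning path gone, scan_swap_map_slots() keeps only the pre-existing large-order sanity checks and then hands the work to the cluster allocator; the whole HDD state machine (cluster_next, cluster_nr, latency_ration, the scan/checks labels) disappears. A condensed sketch of the resulting control flow, with the unchanged order checks elided behind a comment (mirroring the hunks below):

  static int scan_swap_map_slots(struct swap_info_struct *si,
                                 unsigned char usage, int nr,
                                 swp_entry_t slots[], int order)
  {
          if (order > 0) {
                  /* pre-existing sanity checks for large allocations stay here */
                  if (!(si->flags & SWP_BLKDEV) || !si->cluster_info)
                          return 0;
          }

          return cluster_alloc_swap(si, usage, nr, slots, order);
  }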
Link: https://lkml.kernel.org/r/20250113175732.48099-4-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Suggested-by: "Huang, Ying" Reviewed-by: Baoquan He Cc: Barry Song Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/swap.h | 3 - mm/swapfile.c | 235 ++----------------------------------------- 2 files changed, 9 insertions(+), 229 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index f3e0ac20c2e8..3a71198a6957 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -309,9 +309,6 @@ struct swap_info_struct { unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ unsigned int inuse_pages; /* number of those currently in use */ - unsigned int cluster_next; /* likely index for next allocation */ - unsigned int cluster_nr; /* countdown to next cluster search */ - unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */ struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct rb_root swap_extent_root;/* root of the swap extent rbtree */ struct block_device *bdev; /* swap device or bdev of swap file */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 574059158627..fca58d43b836 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1001,49 +1001,6 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries); } -static void set_cluster_next(struct swap_info_struct *si, unsigned long next) -{ - unsigned long prev; - - if (!(si->flags & SWP_SOLIDSTATE)) { - si->cluster_next = next; - return; - } - - prev = this_cpu_read(*si->cluster_next_cpu); - /* - * Cross the swap address space size aligned trunk, choose - * another trunk randomly to avoid lock contention on swap - * address space if possible. - */ - if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) != - (next >> SWAP_ADDRESS_SPACE_SHIFT)) { - /* No free swap slots available */ - if (si->highest_bit <= si->lowest_bit) - return; - next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); - next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); - next = max_t(unsigned int, next, si->lowest_bit); - } - this_cpu_write(*si->cluster_next_cpu, next); -} - -static bool swap_offset_available_and_locked(struct swap_info_struct *si, - unsigned long offset) -{ - if (data_race(!si->swap_map[offset])) { - spin_lock(&si->lock); - return true; - } - - if (vm_swap_full() && READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) { - spin_lock(&si->lock); - return true; - } - - return false; -} - static int cluster_alloc_swap(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) @@ -1071,13 +1028,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { - unsigned long offset; - unsigned long scan_base; - unsigned long last_in_cluster = 0; - int latency_ration = LATENCY_LIMIT; unsigned int nr_pages = 1 << order; - int n_ret = 0; - bool scanned_many = false; /* * We try to cluster swap pages by allocating them sequentially @@ -1089,7 +1040,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, * But we do now try to find an empty cluster. -Andrea * And we let swap pages go all over an SSD partition. 
Hugh */ - if (order > 0) { /* * Should not even be attempting large allocations when huge @@ -1109,158 +1059,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si, return 0; } - if (si->cluster_info) - return cluster_alloc_swap(si, usage, nr, slots, order); - - si->flags += SWP_SCANNING; - - /* For HDD, sequential access is more important. */ - scan_base = si->cluster_next; - offset = scan_base; - - if (unlikely(!si->cluster_nr--)) { - if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - - spin_unlock(&si->lock); - - /* - * If seek is expensive, start searching for new cluster from - * start of partition, to minimize the span of allocated swap. - */ - scan_base = offset = si->lowest_bit; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - - /* Locate the first empty (unaligned) cluster */ - for (; last_in_cluster <= READ_ONCE(si->highest_bit); offset++) { - if (si->swap_map[offset]) - last_in_cluster = offset + SWAPFILE_CLUSTER; - else if (offset == last_in_cluster) { - spin_lock(&si->lock); - offset -= SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - } - } - - offset = scan_base; - spin_lock(&si->lock); - si->cluster_nr = SWAPFILE_CLUSTER - 1; - } - -checks: - if (!(si->flags & SWP_WRITEOK)) - goto no_page; - if (!si->highest_bit) - goto no_page; - if (offset > si->highest_bit) - scan_base = offset = si->lowest_bit; - - /* reuse swap entry of cache-only swap if not busy. */ - if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { - int swap_was_freed; - spin_unlock(&si->lock); - swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT); - spin_lock(&si->lock); - /* entry was freed successfully, try to use this again */ - if (swap_was_freed > 0) - goto checks; - goto scan; /* check next one */ - } - - if (si->swap_map[offset]) { - if (!n_ret) - goto scan; - else - goto done; - } - memset(si->swap_map + offset, usage, nr_pages); - - swap_range_alloc(si, offset, nr_pages); - slots[n_ret++] = swp_entry(si->type, offset); - - /* got enough slots or reach max slots? */ - if ((n_ret == nr) || (offset >= si->highest_bit)) - goto done; - - /* search for next available slot */ - - /* time to take a break? */ - if (unlikely(--latency_ration < 0)) { - if (n_ret) - goto done; - spin_unlock(&si->lock); - cond_resched(); - spin_lock(&si->lock); - latency_ration = LATENCY_LIMIT; - } - - if (si->cluster_nr && !si->swap_map[++offset]) { - /* non-ssd case, still more slots in cluster? */ - --si->cluster_nr; - goto checks; - } - - /* - * Even if there's no free clusters available (fragmented), - * try to scan a little more quickly with lock held unless we - * have scanned too many slots already. 
- */ - if (!scanned_many) { - unsigned long scan_limit; - - if (offset < scan_base) - scan_limit = scan_base; - else - scan_limit = si->highest_bit; - for (; offset <= scan_limit && --latency_ration > 0; - offset++) { - if (!si->swap_map[offset]) - goto checks; - } - } - -done: - if (order == 0) - set_cluster_next(si, offset + 1); - si->flags -= SWP_SCANNING; - return n_ret; - -scan: - VM_WARN_ON(order > 0); - spin_unlock(&si->lock); - while (++offset <= READ_ONCE(si->highest_bit)) { - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - scanned_many = true; - } - if (swap_offset_available_and_locked(si, offset)) - goto checks; - } - offset = si->lowest_bit; - while (offset < scan_base) { - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - scanned_many = true; - } - if (swap_offset_available_and_locked(si, offset)) - goto checks; - offset++; - } - spin_lock(&si->lock); - -no_page: - si->flags -= SWP_SCANNING; - return n_ret; + return cluster_alloc_swap(si, usage, nr, slots, order); } int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order) @@ -2871,8 +2670,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; - free_percpu(p->cluster_next_cpu); - p->cluster_next_cpu = NULL; vfree(swap_map); kvfree(zeromap); kvfree(cluster_info); @@ -3184,8 +2981,6 @@ static unsigned long read_swap_header(struct swap_info_struct *si, } si->lowest_bit = 1; - si->cluster_next = 1; - si->cluster_nr = 0; maxpages = swapfile_maximum_size; last_page = swap_header->info.last_page; @@ -3271,7 +3066,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, unsigned long maxpages) { unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS; struct swap_cluster_info *cluster_info; unsigned long i, j, k, idx; int cpu, err = -ENOMEM; @@ -3283,15 +3077,6 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, for (i = 0; i < nr_clusters; i++) spin_lock_init(&cluster_info[i].lock); - si->cluster_next_cpu = alloc_percpu(unsigned int); - if (!si->cluster_next_cpu) - goto err_free; - - /* Random start position to help with wear leveling */ - for_each_possible_cpu(cpu) - per_cpu(*si->cluster_next_cpu, cpu) = - get_random_u32_inclusive(1, si->highest_bit); - si->percpu_cluster = alloc_percpu(struct percpu_cluster); if (!si->percpu_cluster) goto err_free; @@ -3333,7 +3118,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si, * sharing same address space. 
*/ for (k = 0; k < SWAP_CLUSTER_COLS; k++) { - j = (k + col) % SWAP_CLUSTER_COLS; + j = k % SWAP_CLUSTER_COLS; for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) { struct swap_cluster_info *ci; idx = i * SWAP_CLUSTER_COLS + j; @@ -3483,18 +3268,18 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (si->bdev && bdev_nonrot(si->bdev)) { si->flags |= SWP_SOLIDSTATE; - - cluster_info = setup_clusters(si, swap_header, maxpages); - if (IS_ERR(cluster_info)) { - error = PTR_ERR(cluster_info); - cluster_info = NULL; - goto bad_swap_unlock_inode; - } } else { atomic_inc(&nr_rotate_swap); inced_nr_rotate_swap = true; } + cluster_info = setup_clusters(si, swap_header, maxpages); + if (IS_ERR(cluster_info)) { + error = PTR_ERR(cluster_info); + cluster_info = NULL; + goto bad_swap_unlock_inode; + } + if ((swap_flags & SWAP_FLAG_DISCARD) && si->bdev && bdev_max_discard_sectors(si->bdev)) { /* @@ -3575,8 +3360,6 @@ bad_swap_unlock_inode: bad_swap: free_percpu(si->percpu_cluster); si->percpu_cluster = NULL; - free_percpu(si->cluster_next_cpu); - si->cluster_next_cpu = NULL; inode = NULL; destroy_swap_extents(si); swap_cgroup_swapoff(si->type); -- 2.50.1 From 0b310d9cfd94e1725ccf09487b6dfc471da958cc Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 14 Jan 2025 01:57:23 +0800 Subject: [PATCH 16/16] mm, swap: use cluster lock for HDD Cluster lock (ci->lock) was introduced to reduce contention for certain operations. Using cluster lock for HDD is not helpful as HDD have a poor performance, so locking isn't the bottleneck. But having different set of locks for HDD / non-HDD prevents further rework of device lock (si->lock). This commit just changed all lock_cluster_or_swap_info to lock_cluster, which is a safe and straight conversion since cluster info is always allocated now, also removed all cluster_info related checks. Link: https://lkml.kernel.org/r/20250113175732.48099-5-ryncsn@gmail.com Signed-off-by: Kairui Song Suggested-by: Chris Li Reviewed-by: Baoquan He Cc: Barry Song Cc: "Huang, Ying" Cc: Hugh Dickens Cc: Johannes Weiner Cc: Kalesh Singh Cc: Nhat Pham Cc: Ryan Roberts Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/swapfile.c | 109 ++++++++++++++++---------------------------------- 1 file changed, 35 insertions(+), 74 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index fca58d43b836..83ebc24cc94b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -58,10 +58,9 @@ static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries); static bool folio_swapcache_freeable(struct folio *folio); -static struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, unsigned long offset); -static void unlock_cluster_or_swap_info(struct swap_info_struct *si, - struct swap_cluster_info *ci); +static struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, + unsigned long offset); +static void unlock_cluster(struct swap_cluster_info *ci); static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; @@ -222,9 +221,9 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, * swap_map is HAS_CACHE only, which means the slots have no page table * reference or pending writeback, and can't be allocated to others. 
*/ - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); need_reclaim = swap_is_has_cache(si, offset, nr_pages); - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); if (!need_reclaim) goto out_unlock; @@ -404,45 +403,15 @@ static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si { struct swap_cluster_info *ci; - ci = si->cluster_info; - if (ci) { - ci += offset / SWAPFILE_CLUSTER; - spin_lock(&ci->lock); - } - return ci; -} - -static inline void unlock_cluster(struct swap_cluster_info *ci) -{ - if (ci) - spin_unlock(&ci->lock); -} - -/* - * Determine the locking method in use for this device. Return - * swap_cluster_info if SSD-style cluster-based locking is in place. - */ -static inline struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, unsigned long offset) -{ - struct swap_cluster_info *ci; - - /* Try to use fine-grained SSD-style locking if available: */ - ci = lock_cluster(si, offset); - /* Otherwise, fall back to traditional, coarse locking: */ - if (!ci) - spin_lock(&si->lock); + ci = &si->cluster_info[offset / SWAPFILE_CLUSTER]; + spin_lock(&ci->lock); return ci; } -static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si, - struct swap_cluster_info *ci) +static inline void unlock_cluster(struct swap_cluster_info *ci) { - if (ci) - unlock_cluster(ci); - else - spin_unlock(&si->lock); + spin_unlock(&ci->lock); } /* Add a cluster to discard list and schedule it to do discard */ @@ -558,9 +527,6 @@ static void inc_cluster_info_page(struct swap_info_struct *si, unsigned long idx = page_nr / SWAPFILE_CLUSTER; struct swap_cluster_info *ci; - if (!cluster_info) - return; - ci = cluster_info + idx; ci->count++; @@ -576,9 +542,6 @@ static void inc_cluster_info_page(struct swap_info_struct *si, static void dec_cluster_info_page(struct swap_info_struct *si, struct swap_cluster_info *ci, int nr_pages) { - if (!si->cluster_info) - return; - VM_BUG_ON(ci->count < nr_pages); VM_BUG_ON(cluster_is_free(ci)); lockdep_assert_held(&si->lock); @@ -940,7 +903,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, si->highest_bit = 0; del_from_avail_list(si); - if (si->cluster_info && vm_swap_full()) + if (vm_swap_full()) schedule_work(&si->reclaim_work); } } @@ -1007,8 +970,6 @@ static int cluster_alloc_swap(struct swap_info_struct *si, { int n_ret = 0; - VM_BUG_ON(!si->cluster_info); - si->flags += SWP_SCANNING; while (n_ret < nr) { @@ -1052,10 +1013,10 @@ static int scan_swap_map_slots(struct swap_info_struct *si, } /* - * Swapfile is not block device or not using clusters so unable + * Swapfile is not block device so unable * to allocate large entries. 
*/ - if (!(si->flags & SWP_BLKDEV) || !si->cluster_info) + if (!(si->flags & SWP_BLKDEV)) return 0; } @@ -1295,9 +1256,9 @@ static unsigned char __swap_entry_free(struct swap_info_struct *si, unsigned long offset = swp_offset(entry); unsigned char usage; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); usage = __swap_entry_free_locked(si, offset, 1); - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); if (!usage) free_swap_slot(entry); @@ -1320,14 +1281,14 @@ static bool __swap_entries_free(struct swap_info_struct *si, if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER) goto fallback; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); if (!swap_is_last_map(si, offset, nr, &has_cache)) { - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); goto fallback; } for (i = 0; i < nr; i++) WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE); - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); if (!has_cache) { for (i = 0; i < nr; i++) @@ -1383,7 +1344,7 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 }; int i, nr; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); while (nr_pages) { nr = min(BITS_PER_LONG, nr_pages); for (i = 0; i < nr; i++) { @@ -1391,18 +1352,18 @@ static void cluster_swap_free_nr(struct swap_info_struct *si, bitmap_set(to_free, i, 1); } if (!bitmap_empty(to_free, BITS_PER_LONG)) { - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); for_each_set_bit(i, to_free, BITS_PER_LONG) free_swap_slot(swp_entry(si->type, offset + i)); if (nr == nr_pages) return; bitmap_clear(to_free, 0, BITS_PER_LONG); - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); } offset += nr; nr_pages -= nr; } - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); } /* @@ -1441,9 +1402,9 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) if (!si) return; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); if (size > 1 && swap_is_has_cache(si, offset, size)) { - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); spin_lock(&si->lock); swap_entry_range_free(si, entry, size); spin_unlock(&si->lock); @@ -1451,14 +1412,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry) } for (int i = 0; i < size; i++, entry.val++) { if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); free_swap_slot(entry); if (i == size - 1) return; - lock_cluster_or_swap_info(si, offset); + lock_cluster(si, offset); } } - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); } static int swp_entry_cmp(const void *ent1, const void *ent2) @@ -1522,9 +1483,9 @@ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) struct swap_cluster_info *ci; int count; - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return count; } @@ -1547,7 +1508,7 @@ int swp_swapcount(swp_entry_t entry) offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); count = swap_count(si->swap_map[offset]); if (!(count & COUNT_CONTINUED)) @@ -1570,7 +1531,7 @@ int swp_swapcount(swp_entry_t entry) n *= (SWAP_CONT_MAX + 1); } while (tmp_count & COUNT_CONTINUED); out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return count; } @@ 
-1585,8 +1546,8 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; - ci = lock_cluster_or_swap_info(si, offset); - if (!ci || nr_pages == 1) { + ci = lock_cluster(si, offset); + if (nr_pages == 1) { if (swap_count(map[roffset])) ret = true; goto unlock_out; @@ -1598,7 +1559,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, } } unlock_out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return ret; } @@ -3428,7 +3389,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) offset = swp_offset(entry); VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER); VM_WARN_ON(usage == 1 && nr > 1); - ci = lock_cluster_or_swap_info(si, offset); + ci = lock_cluster(si, offset); err = 0; for (i = 0; i < nr; i++) { @@ -3483,7 +3444,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr) } unlock_out: - unlock_cluster_or_swap_info(si, ci); + unlock_cluster(ci); return err; } -- 2.50.1
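
The short program below is a minimal userspace sketch, not kernel code and not part of the patch series; it only illustrates the locking pattern that the final patch converts callers to. A pthread spinlock stands in for the kernel's per-cluster spinlock, the names SWAPFILE_CLUSTER, cluster_info, lock_cluster() and unlock_cluster() loosely mirror the kernel ones, and the struct layout and main() harness are invented for illustration. Because cluster_info is assumed to always be allocated, lock_cluster() needs no NULL check and no fallback to a device-wide lock.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define SWAPFILE_CLUSTER 512	/* entries per cluster, as in the kernel */

struct swap_cluster_info {
	pthread_spinlock_t lock;	/* stands in for the kernel ci->lock */
	unsigned int count;
};

struct swap_info {
	struct swap_cluster_info *cluster_info;	/* always allocated in this sketch */
	unsigned long nr_clusters;
};

/*
 * With cluster_info always present there is no NULL check and no fallback
 * to a coarse device-wide lock: every offset maps to exactly one cluster lock.
 */
static struct swap_cluster_info *lock_cluster(struct swap_info *si,
					      unsigned long offset)
{
	struct swap_cluster_info *ci = &si->cluster_info[offset / SWAPFILE_CLUSTER];

	pthread_spin_lock(&ci->lock);
	return ci;
}

static void unlock_cluster(struct swap_cluster_info *ci)
{
	pthread_spin_unlock(&ci->lock);
}

int main(void)
{
	struct swap_info si = { .nr_clusters = 4 };
	struct swap_cluster_info *ci;
	unsigned long offset = 1234;	/* falls into cluster 1234 / 512 = 2 */

	si.cluster_info = calloc(si.nr_clusters, sizeof(*si.cluster_info));
	if (!si.cluster_info)
		return 1;
	for (unsigned long i = 0; i < si.nr_clusters; i++)
		pthread_spin_init(&si.cluster_info[i].lock, PTHREAD_PROCESS_PRIVATE);

	/* Caller pattern after the conversion: lock, update the cluster, unlock. */
	ci = lock_cluster(&si, offset);
	ci->count++;
	unlock_cluster(ci);

	printf("cluster %lu count = %u\n", offset / SWAPFILE_CLUSTER, ci->count);
	free(si.cluster_info);
	return 0;
}

A typical build would be something like: cc -O2 sketch.c -lpthread (only POSIX threads are used). The design point mirrored here is the one the changelog relies on: once every swap device has cluster_info, the conditional fallback to the coarse si->lock in lock_cluster_or_swap_info() serves no purpose, so the dual locking scheme can be dropped wholesale.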