From dfe61db4a132f229706dee64045c49144128be91 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 10 Jan 2025 18:50:28 -0500 Subject: [PATCH 01/16] selftests/mm: add tests for splitting pmd THPs to all lower orders Kernel already supports splitting a folio to any lower order. Test it. [ziy@nvidia.com: no need to test splitting to order-1] Link: https://lkml.kernel.org/r/DDA202EA-4664-4F50-A7FD-B00CBB7A624B@nvidia.com Link: https://lkml.kernel.org/r/20250110235028.96824-2-ziy@nvidia.com Signed-off-by: Zi Yan Cc: Alexander Zhu Cc: Rik van Riel Cc: Shuah Khan Cc: Usama Arif Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/split_huge_page_test.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/split_huge_page_test.c b/tools/testing/selftests/mm/split_huge_page_test.c index cd74ea9b1295..3f353f3d070f 100644 --- a/tools/testing/selftests/mm/split_huge_page_test.c +++ b/tools/testing/selftests/mm/split_huge_page_test.c @@ -144,7 +144,7 @@ void split_pmd_zero_pages(void) free(one_page); } -void split_pmd_thp(void) +void split_pmd_thp_to_order(int order) { char *one_page; size_t len = 4 * pmd_pagesize; @@ -164,7 +164,7 @@ void split_pmd_thp(void) /* split all THPs */ write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, - (uint64_t)one_page + len, 0); + (uint64_t)one_page + len, order); for (i = 0; i < len; i++) if (one_page[i] != (char)i) @@ -174,7 +174,7 @@ void split_pmd_thp(void) if (!check_huge_anon(one_page, 0, pmd_pagesize)) ksft_exit_fail_msg("Still AnonHugePages not split\n"); - ksft_test_result_pass("Split huge pages successful\n"); + ksft_test_result_pass("Split huge pages to order %d successful\n", order); free(one_page); } @@ -481,7 +481,7 @@ int main(int argc, char **argv) if (argc > 1) optional_xfs_path = argv[1]; - ksft_set_plan(4+9); + ksft_set_plan(1+8+2+9); pagesize = getpagesize(); pageshift = ffs(pagesize) - 1; @@ -492,7 +492,11 @@ int main(int argc, char **argv) fd_size = 2 * pmd_pagesize; split_pmd_zero_pages(); - split_pmd_thp(); + + for (i = 0; i < 9; i++) + if (i != 1) + split_pmd_thp_to_order(i); + split_pte_mapped_thp(); split_file_backed_thp(); -- 2.51.0 From d6550ee43f4b96d10792912c6c01ebeeec38af79 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 10 Jan 2025 22:32:49 -0800 Subject: [PATCH 02/16] kasan: use correct kernel-doc format Use the correct kernel-doc character following function parameters or struct members (':' instead of '-') to eliminate kernel-doc warnings. 
kasan.h:509: warning: Function parameter or struct member 'addr' not described in 'kasan_poison' kasan.h:509: warning: Function parameter or struct member 'size' not described in 'kasan_poison' kasan.h:509: warning: Function parameter or struct member 'value' not described in 'kasan_poison' kasan.h:509: warning: Function parameter or struct member 'init' not described in 'kasan_poison' kasan.h:522: warning: Function parameter or struct member 'addr' not described in 'kasan_unpoison' kasan.h:522: warning: Function parameter or struct member 'size' not described in 'kasan_unpoison' kasan.h:522: warning: Function parameter or struct member 'init' not described in 'kasan_unpoison' kasan.h:539: warning: Function parameter or struct member 'address' not described in 'kasan_poison_last_granule' kasan.h:539: warning: Function parameter or struct member 'size' not described in 'kasan_poison_last_granule' Link: https://lkml.kernel.org/r/20250111063249.910975-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b7e4b81421b3..129178be5e64 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -501,18 +501,18 @@ static inline bool kasan_byte_accessible(const void *addr) /** * kasan_poison - mark the memory range as inaccessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size, must be aligned to KASAN_GRANULE_SIZE - * @value - value that's written to metadata for the range - * @init - whether to initialize the memory range (only for hardware tag-based) + * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size, must be aligned to KASAN_GRANULE_SIZE + * @value: value that's written to metadata for the range + * @init: whether to initialize the memory range (only for hardware tag-based) */ void kasan_poison(const void *addr, size_t size, u8 value, bool init); /** * kasan_unpoison - mark the memory range as accessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size, can be unaligned - * @init - whether to initialize the memory range (only for hardware tag-based) + * @addr: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size, can be unaligned + * @init: whether to initialize the memory range (only for hardware tag-based) * * For the tag-based modes, the @size gets aligned to KASAN_GRANULE_SIZE before * marking the range. @@ -530,8 +530,8 @@ bool kasan_byte_accessible(const void *addr); /** * kasan_poison_last_granule - mark the last granule of the memory range as * inaccessible - * @addr - range start address, must be aligned to KASAN_GRANULE_SIZE - * @size - range size + * @address: range start address, must be aligned to KASAN_GRANULE_SIZE + * @size: range size * * This function is only available for the generic mode, as it's the only mode * that has partially poisoned memory granules. -- 2.51.0 From bf069012df19cf80b460a03c92bfe6320dc268b0 Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Mon, 13 Jan 2025 11:28:58 +0800 Subject: [PATCH 03/16] selftests/mm/cow: modify the incorrect checking parameters In run_with_memfd_hugetlb(), some error handling passes incorrect parameters. It should be "smem", but it was mistakenly written as "mem". Let's fix it. 
[gehao@kylinos.cn: fix other errant sites, per Anshuman] Link: https://lkml.kernel.org/r/20250113050908.93638-1-hao.ge@linux.dev Link: https://lkml.kernel.org/r/20250113032858.63670-1-hao.ge@linux.dev Fixes: f8664f3c4a08 ("selftests/vm: cow: basic COW tests for non-anonymous pages") Signed-off-by: Hao Ge Cc: SeongJae Park Cc: Shuah Khan (Samsung OSG) Cc: Anshuman Khandual Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/cow.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/cow.c b/tools/testing/selftests/mm/cow.c index 1238e1c5aae1..9446673645eb 100644 --- a/tools/testing/selftests/mm/cow.c +++ b/tools/testing/selftests/mm/cow.c @@ -1482,7 +1482,7 @@ static void run_with_zeropage(non_anon_test_fn fn, const char *desc) } smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); - if (mem == MAP_FAILED) { + if (smem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto munmap; } @@ -1583,7 +1583,7 @@ static void run_with_memfd(non_anon_test_fn fn, const char *desc) goto close; } smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); - if (mem == MAP_FAILED) { + if (smem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto munmap; } @@ -1634,7 +1634,7 @@ static void run_with_tmpfile(non_anon_test_fn fn, const char *desc) goto close; } smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0); - if (mem == MAP_FAILED) { + if (smem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto munmap; } @@ -1684,7 +1684,7 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, goto close; } smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0); - if (mem == MAP_FAILED) { + if (smem == MAP_FAILED) { ksft_test_result_fail("mmap() failed\n"); goto munmap; } @@ -1696,7 +1696,7 @@ static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc, fn(mem, smem, hugetlbsize); munmap: munmap(mem, hugetlbsize); - if (mem != MAP_FAILED) + if (smem != MAP_FAILED) munmap(smem, hugetlbsize); close: close(fd); -- 2.51.0 From bdbe1d7bc325dbe77335f39b265620f9accc0ae9 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Mon, 13 Jan 2025 19:07:38 +0000 Subject: [PATCH 04/16] mm/damon/paddr: increment pa_stat damon address range by folio size This is to avoid going through all the pages in a folio. For folio_size > PAGE_SIZE, damon_get_folio will return NULL for tail pages, so the for loop in those instances will be a nop. Have a more efficient loop by just incrementing the address by folio_size. 
Link: https://lkml.kernel.org/r/20250113190738.1156381-1-usamaarif642@gmail.com Signed-off-by: Usama Arif Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 6b4397de4199..bd8cfe10121b 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -504,17 +504,21 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, if (!damon_pa_scheme_has_filter(s)) return 0; - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + addr = r->ar.start; + while (addr < r->ar.end) { struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - if (!folio) + if (!folio) { + addr += PAGE_SIZE; continue; + } if (damos_pa_filter_out(s, folio)) goto put_folio; else *sz_filter_passed += folio_size(folio); put_folio: + addr += folio_size(folio); folio_put(folio); } return 0; -- 2.51.0 From f57f63b0f0fd7ed71987857d244f8490b2d6043c Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Mon, 13 Jan 2025 13:01:56 -0800 Subject: [PATCH 05/16] mm/damon/paddr: improve readability of damon_pa_stat damon_pa_stat contains an unnecessary goto statement, and the if/else can be re-written to be more readable. This patch is written on top of SJ's patch series [1], which in turn is written on top of another one of his series [2]. [1] https://lore.kernel.org/all/20241219040327.61902-1-sj@kernel.org/ [2] https://lore.kernel.org/all/20241213215306.54778-1-sj@kernel.org/ Link: https://lkml.kernel.org/r/20250113210201.446051-1-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index bd8cfe10121b..0f9ae14f884d 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -513,11 +513,8 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, continue; } - if (damos_pa_filter_out(s, folio)) - goto put_folio; - else + if (!damos_pa_filter_out(s, folio)) *sz_filter_passed += folio_size(folio); -put_folio: addr += folio_size(folio); folio_put(folio); } -- 2.51.0 From 8d91fed83cc12306cbb63efa6c473ffee117977a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:06 +0100 Subject: [PATCH 06/16] mm/huge_memory: convert has_hwpoisoned into a pure folio flag Patch series "mm: hugetlb+THP folio and migration cleanups", v2. Some cleanups around more folio conversion and migration handling that I collected working on random stuff. This patch (of 6): Let's stop setting it on pages, there is no need to anymore. Link: https://lkml.kernel.org/r/20250113131611.2554758-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Baolin Wang Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 ++---- mm/huge_memory.c | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 330929b6e062..616b57ddc3fe 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -906,11 +906,9 @@ TESTPAGEFLAG_FALSE(TransCompound, transcompound) * * This flag is set by hwpoison handler. Cleared by THP split or free page. 
*/ -PAGEFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) - TESTSCFLAG(HasHWPoisoned, has_hwpoisoned, PF_SECOND) +FOLIO_FLAG(has_hwpoisoned, FOLIO_SECOND_PAGE) #else -PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) - TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) +FOLIO_FLAG_FALSE(has_hwpoisoned) #endif /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2654a9548749..3d3ebdc002d5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3290,7 +3290,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* lock lru list/PageCompound, ref frozen by page_ref_freeze */ lruvec = folio_lruvec_lock(folio); - ClearPageHasHWPoisoned(head); + folio_clear_has_hwpoisoned(folio); for (i = nr - new_nr; i >= new_nr; i -= new_nr) { struct folio *tail; -- 2.51.0 From 4c640f128074e0d4459ecf072595a44df5c2ae18 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:07 +0100 Subject: [PATCH 07/16] mm/hugetlb: rename isolate_hugetlb() to folio_isolate_hugetlb() Let's make the function name match "folio_isolate_lru()", and add some kernel doc. Link: https://lkml.kernel.org/r/20250113131611.2554758-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Baolin Wang Cc: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/gup.c | 2 +- mm/hugetlb.c | 23 ++++++++++++++++++++--- mm/mempolicy.c | 2 +- mm/migrate.c | 6 +++--- 5 files changed, 27 insertions(+), 10 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 49ec2362ce92..c95ad5cd7894 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -153,7 +153,7 @@ bool hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); -bool isolate_hugetlb(struct folio *folio, struct list_head *list); +bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); @@ -414,7 +414,7 @@ static inline pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, return NULL; } -static inline bool isolate_hugetlb(struct folio *folio, struct list_head *list) +static inline bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { return false; } diff --git a/mm/gup.c b/mm/gup.c index 00a1269cbee0..2cc3a9d28e70 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2344,7 +2344,7 @@ static unsigned long collect_longterm_unpinnable_folios( continue; if (folio_test_hugetlb(folio)) { - isolate_hugetlb(folio, movable_folio_list); + folio_isolate_hugetlb(folio, movable_folio_list); continue; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 58c2c5498207..15a689964265 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2812,7 +2812,7 @@ retry: * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - isolated = isolate_hugetlb(old_folio, list); + isolated = folio_isolate_hugetlb(old_folio, list); ret = isolated ? 
0 : -EBUSY; spin_lock_irq(&hugetlb_lock); goto free_new; @@ -2897,7 +2897,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (folio_ref_count(folio) && isolate_hugetlb(folio, list)) + if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list)) ret = 0; else if (!folio_ref_count(folio)) ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); @@ -7421,7 +7421,24 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) #endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */ -bool isolate_hugetlb(struct folio *folio, struct list_head *list) +/** + * folio_isolate_hugetlb - try to isolate an allocated hugetlb folio + * @folio: the folio to isolate + * @list: the list to add the folio to on success + * + * Isolate an allocated (refcount > 0) hugetlb folio, marking it as + * isolated/non-migratable, and moving it from the active list to the + * given list. + * + * Isolation will fail if @folio is not an allocated hugetlb folio, or if + * it is already isolated/non-migratable. + * + * On success, an additional folio reference is taken that must be dropped + * using folio_putback_active_hugetlb() to undo the isolation. + * + * Return: True if isolation worked, otherwise False. + */ +bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list) { bool ret = true; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f83b73236ffe..bbaadbeeb291 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -647,7 +647,7 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask, */ if ((flags & MPOL_MF_MOVE_ALL) || (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte))) - if (!isolate_hugetlb(folio, qp->pagelist)) + if (!folio_isolate_hugetlb(folio, qp->pagelist)) qp->nr_failed++; unlock: spin_unlock(ptl); diff --git a/mm/migrate.c b/mm/migrate.c index 32cc8e0b1cce..c3052877e844 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -128,7 +128,7 @@ static void putback_movable_folio(struct folio *folio) * * This function shall be used whenever the isolated pageset has been * built from lru, balloon, hugetlbfs page. See isolate_migratepages_range() - * and isolate_hugetlb(). + * and folio_isolate_hugetlb(). */ void putback_movable_pages(struct list_head *l) { @@ -169,7 +169,7 @@ bool isolate_folio_to_list(struct folio *folio, struct list_head *list) bool isolated, lru; if (folio_test_hugetlb(folio)) - return isolate_hugetlb(folio, list); + return folio_isolate_hugetlb(folio, list); lru = !__folio_test_movable(folio); if (lru) @@ -2200,7 +2200,7 @@ static int __add_folio_for_migration(struct folio *folio, int node, return -EACCES; if (folio_test_hugetlb(folio)) { - if (isolate_hugetlb(folio, pagelist)) + if (folio_isolate_hugetlb(folio, pagelist)) return 1; } else if (folio_isolate_lru(folio)) { list_add_tail(&folio->lru, pagelist); -- 2.51.0 From ba23f58de896842028b4b33b95530f08288396fe Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:08 +0100 Subject: [PATCH 08/16] mm/migrate: don't call folio_putback_active_hugetlb() on dst hugetlb folio We replaced a simple put_page() by a putback_active_hugepage() call in commit 3aaa76e125c1 ("mm: migrate: hugetlb: putback destination hugepage to active list"), to set the "active" flag on the dst hugetlb folio. Nowadays, we decoupled the "active" list from the flag, by calling the flag "migratable". 
Calling "putback" on something that wasn't allocated is weird and not future proof, especially if we might reach that path when migration failed and we just want to free the freshly allocated hugetlb folio. Let's simply handle the migratable flag and the active list flag in move_hugetlb_state(), where we know that allocation succeeded and already handle the temporary flag; use a simple folio_put() to return our reference. Link: https://lkml.kernel.org/r/20250113131611.2554758-4-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/hugetlb.c | 10 ++++++++++ mm/migrate.c | 8 ++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 15a689964265..4f95031eeeea 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7533,6 +7533,16 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re } spin_unlock_irq(&hugetlb_lock); } + + /* + * Our old folio is isolated and has "migratable" cleared until it + * is putback. As migration succeeded, set the new folio "migratable" + * and add it to the active list. + */ + spin_lock_irq(&hugetlb_lock); + folio_set_hugetlb_migratable(new_folio); + list_move_tail(&new_folio->lru, &(folio_hstate(new_folio))->hugepage_activelist); + spin_unlock_irq(&hugetlb_lock); } static void hugetlb_unshare_pmds(struct vm_area_struct *vma, diff --git a/mm/migrate.c b/mm/migrate.c index c3052877e844..2a6da67a0eda 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1539,14 +1539,14 @@ out: list_move_tail(&src->lru, ret); /* - * If migration was not successful and there's a freeing callback, use - * it. Otherwise, put_page() will drop the reference grabbed during - * isolation. + * If migration was not successful and there's a freeing callback, + * return the folio to that special allocator. Otherwise, simply drop + * our additional reference. */ if (put_new_folio) put_new_folio(dst, private); else - folio_putback_active_hugetlb(dst); + folio_put(dst); return rc; } -- 2.51.0 From b235448e8cab7eea17d164efc7bf55505985ba65 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:09 +0100 Subject: [PATCH 09/16] mm/hugetlb: rename folio_putback_active_hugetlb() to folio_putback_hugetlb() Now that folio_putback_hugetlb() is only called on folios that were previously isolated through folio_isolate_hugetlb(), let's rename it to match folio_putback_lru(). Add some kernel doc to clarify how this function is supposed to be used. 
Link: https://lkml.kernel.org/r/20250113131611.2554758-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 4 ++-- mm/hugetlb.c | 15 +++++++++++++-- mm/migrate.c | 6 +++--- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c95ad5cd7894..ec8c0ccc8f95 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -157,7 +157,7 @@ bool folio_isolate_hugetlb(struct folio *folio, struct list_head *list); int get_hwpoison_hugetlb_folio(struct folio *folio, bool *hugetlb, bool unpoison); int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); -void folio_putback_active_hugetlb(struct folio *folio); +void folio_putback_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -430,7 +430,7 @@ static inline int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return 0; } -static inline void folio_putback_active_hugetlb(struct folio *folio) +static inline void folio_putback_hugetlb(struct folio *folio) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4f95031eeeea..308b0e3876b5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7434,7 +7434,7 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * it is already isolated/non-migratable. * * On success, an additional folio reference is taken that must be dropped - * using folio_putback_active_hugetlb() to undo the isolation. + * using folio_putback_hugetlb() to undo the isolation. * * Return: True if isolation worked, otherwise False. */ @@ -7486,7 +7486,18 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, return ret; } -void folio_putback_active_hugetlb(struct folio *folio) +/** + * folio_putback_hugetlb - unisolate a hugetlb folio + * @folio: the isolated hugetlb folio + * + * Putback/un-isolate the hugetlb folio that was previous isolated using + * folio_isolate_hugetlb(): marking it non-isolated/migratable and putting it + * back onto the active list. + * + * Will drop the additional folio reference obtained through + * folio_isolate_hugetlb(). + */ +void folio_putback_hugetlb(struct folio *folio) { spin_lock_irq(&hugetlb_lock); folio_set_hugetlb_migratable(folio); diff --git a/mm/migrate.c b/mm/migrate.c index 2a6da67a0eda..61587121ccb2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -137,7 +137,7 @@ void putback_movable_pages(struct list_head *l) list_for_each_entry_safe(folio, folio2, l, lru) { if (unlikely(folio_test_hugetlb(folio))) { - folio_putback_active_hugetlb(folio); + folio_putback_hugetlb(folio); continue; } list_del(&folio->lru); @@ -1451,7 +1451,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. 
*/ - folio_putback_active_hugetlb(src); + folio_putback_hugetlb(src); return MIGRATEPAGE_SUCCESS; } @@ -1534,7 +1534,7 @@ out_unlock: folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) - folio_putback_active_hugetlb(src); + folio_putback_hugetlb(src); else if (rc != -EAGAIN) list_move_tail(&src->lru, ret); -- 2.51.0 From 3f982b9b18c23900da10956b91036a3f6fe3f5bc Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:10 +0100 Subject: [PATCH 10/16] mm/hugetlb-cgroup: convert hugetlb_cgroup_css_offline() to work on folios Let's convert hugetlb_cgroup_css_offline() and hugetlb_cgroup_move_parent() to work on folios. hugepage_activelist contains folios, not pages. While at it, rename page_hcg simply to hcg, removing most of the "page" terminology. This removes an unnecessary call to compound_head(). Link: https://lkml.kernel.org/r/20250113131611.2554758-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Cc: Baolin Wang Cc: Muchun Song Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/hugetlb_cgroup.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 89a8ad45a533..bb9578bd99f9 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -195,24 +195,23 @@ static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) * cannot fail. */ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, - struct page *page) + struct folio *folio) { unsigned int nr_pages; struct page_counter *counter; - struct hugetlb_cgroup *page_hcg; + struct hugetlb_cgroup *hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); - struct folio *folio = page_folio(page); - page_hcg = hugetlb_cgroup_from_folio(folio); + hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely * ignore those pages. */ - if (!page_hcg || page_hcg != h_cg) + if (!hcg || hcg != h_cg) goto out; - nr_pages = compound_nr(page); + nr_pages = folio_nr_pages(folio); if (!parent) { parent = root_h_cgroup; /* root has no limit */ @@ -235,13 +234,13 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; - struct page *page; + struct folio *folio; do { for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); - list_for_each_entry(page, &h->hugepage_activelist, lru) - hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page); + list_for_each_entry(folio, &h->hugepage_activelist, lru) + hugetlb_cgroup_move_parent(hstate_index(h), h_cg, folio); spin_unlock_irq(&hugetlb_lock); } -- 2.51.0 From 89a41a0263293856678189981e5407375261c4ff Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 13 Jan 2025 14:16:11 +0100 Subject: [PATCH 11/16] mm/hugetlb: use folio->lru in demote_free_hugetlb_folios() We are demoting hugetlb folios to smaller hugetlb folios; let's avoid messing with pages where avoidable and handle it more similarly to __split_huge_page_tail(). 
Link: https://lkml.kernel.org/r/20250113131611.2554758-7-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Sidhartha Kumar Cc: Baolin Wang Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 308b0e3876b5..87761b042ed0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3826,13 +3826,15 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst, for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) { struct page *page = folio_page(folio, i); + /* Careful: see __split_huge_page_tail() */ + struct folio *new_folio = (struct folio *)page; - page->mapping = NULL; clear_compound_head(page); prep_compound_page(page, dst->order); - init_new_hugetlb_folio(dst, page_folio(page)); - list_add(&page->lru, &dst_list); + new_folio->mapping = NULL; + init_new_hugetlb_folio(dst, new_folio); + list_add(&new_folio->lru, &dst_list); } } -- 2.51.0 From 9ad6344568cc31ede9741795b3e3c41c21e3156f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:39 -0700 Subject: [PATCH 12/16] mm/filemap: change filemap_create_folio() to take a struct kiocb Patch series "Uncached buffered IO", v8. 5 years ago I posted patches adding support for RWF_UNCACHED, as a way to do buffered IO that isn't page cache persistent. The approach back then was to have private pages for IO, and then get rid of them once IO was done. But that then runs into all the issues that O_DIRECT has, in terms of synchronizing with the page cache. So here's a new approach to the same concept, but using the page cache as synchronization. Due to excessive bike shedding on the naming, this is now named RWF_DONTCACHE, and is less special in that it's just page cache IO, except it prunes the ranges once IO is completed. Why do this, you may ask? The tldr is that device speeds are only getting faster, while reclaim is not. Doing normal buffered IO can be very unpredictable, and suck up a lot of resources on the reclaim side. This leads people to use O_DIRECT as a work-around, which has its own set of restrictions in terms of size, offset, and length of IO. It's also inherently synchronous, and now you need async IO as well. While the latter isn't necessarily a big problem as we have good options available there, it also should not be a requirement when all you want to do is read or write some data without caching. Even on desktop type systems, a normal NVMe device can fill the entire page cache in seconds. On the big system I used for testing, there's a lot more RAM, but also a lot more devices. As can be seen in some of the results in the following patches, you can still fill RAM in seconds even when there's 1TB of it. Hence this problem isn't solely a "big hyperscaler system" issue, it's common across the board. Common for both reads and writes with RWF_DONTCACHE is that they use the page cache for IO. Reads work just like a normal buffered read would, with the only exception being that the touched ranges will get pruned after data has been copied. For writes, the ranges will get writeback kicked off before the syscall returns, and then writeback completion will prune the range. Hence writes aren't synchronous, and it's easy to pipeline writes using RWF_DONTCACHE. Folios that aren't instantiated by RWF_DONTCACHE IO are left untouched. 
This means that uncached IO will take advantage of the page cache for uptodate data, but not leave anything it instantiated/created in cache. File systems need to support this. This patchset adds support for the generic read path, which covers file systems like ext4. Patches exist to add support for iomap/XFS and btrfs as well, which sit on top of this series. If RWF_DONTCACHE IO is attempted on a file system that doesn't support it, -EOPNOTSUPP is returned. Hence the user can rely on it either working as designed, or flagging an error if that's not the case. The intent here is to give the application a sensible fallback path - eg, it may fall back to O_DIRECT if appropriate, or just live with the fact that uncached IO isn't available and do normal buffered IO. Adding "support" to other file systems should be trivial, most of the time just a one-liner adding FOP_DONTCACHE to the fop_flags in the file_operations struct, if the file system is using either iomap or the generic filemap helpers for reading and writing. Performance results are in patch 8 for reads, and you can find the write side results in the XFS patch adding support for DONTCACHE writes for XFS: https://git.kernel.dk/cgit/linux/commit/?h=buffered-uncached-fs.10&id=257e92de795fdff7d7e256501e024fac6da6a7f4 with the tldr being that I see about a 65% improvement in performance for both, with fully predictable IO times. CPU reduction is substantial as well, with no kswapd activity at all for reclaim when using uncached IO. Using it from applications is trivial - just set RWF_DONTCACHE for the read or write, using pwritev2(2) or preadv2(2). For io_uring, same thing, just set RWF_DONTCACHE in sqe->rw_flags for a buffered read/write operation. And that's it. Patches 1..7 are just prep patches, and should have no functional changes at all. Patch 8 adds support for the filemap path for RWF_DONTCACHE reads, and patches 9..12 are just prep patches for supporting the write side of uncached writes. In the below mentioned branch, there are then patches to adopt uncached reads and writes for xfs, btrfs, and ext4. The latter currently relies on a bit of a hack for passing whether this is an uncached write or not through ->write_begin(), which can hopefully go away once ext4 adopts iomap for buffered writes. I say this is a hack as it's not the prettiest way to do it, however it is fully solid and will work just fine. Passes full xfstests and fsx overnight runs, no issues observed. That includes the vm running the testing also using RWF_DONTCACHE on the host. I'll post fsstress and fsx patches for RWF_DONTCACHE separately. As far as I'm concerned, no further work needs doing here. And git tree for the patches is here: https://git.kernel.dk/cgit/linux/log/?h=buffered-uncached.10 with the file system patches on top adding support for xfs/btrfs/ext4 here: https://git.kernel.dk/cgit/linux/log/?h=buffered-uncached-fs.10 This patch (of 12): Rather than pass in both the file and position directly from the kiocb, just take a struct kiocb instead. With the kiocb being passed in, skip passing in the address_space separately as well. While doing so, move the ki_flags checking into filemap_create_folio() as well. In preparation for actually needing the kiocb in the function. No functional changes in this patch. Link: https://lkml.kernel.org/r/20241220154831.1086649-1-axboe@kernel.dk Link: https://lkml.kernel.org/r/20241220154831.1086649-2-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. 
Shutemov Reviewed-by: Matthew Wilcox (Oracle) Cc: Brian Foster Cc: Chris Mason Cc: Johannes Weiner Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/filemap.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index b6494d2d3bc2..904d8fa2bfc0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2431,15 +2431,17 @@ unlock_mapping: return error; } -static int filemap_create_folio(struct file *file, - struct address_space *mapping, loff_t pos, - struct folio_batch *fbatch) +static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch) { + struct address_space *mapping = iocb->ki_filp->f_mapping; struct folio *folio; int error; unsigned int min_order = mapping_min_folio_order(mapping); pgoff_t index; + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) + return -EAGAIN; + folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order); if (!folio) return -ENOMEM; @@ -2458,7 +2460,7 @@ static int filemap_create_folio(struct file *file, * well to keep locking rules simple. */ filemap_invalidate_lock_shared(mapping); - index = (pos >> (PAGE_SHIFT + min_order)) << min_order; + index = (iocb->ki_pos >> (PAGE_SHIFT + min_order)) << min_order; error = filemap_add_folio(mapping, folio, index, mapping_gfp_constraint(mapping, GFP_KERNEL)); if (error == -EEXIST) @@ -2466,7 +2468,8 @@ static int filemap_create_folio(struct file *file, if (error) goto error; - error = filemap_read_folio(file, mapping->a_ops->read_folio, folio); + error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio, + folio); if (error) goto error; @@ -2522,9 +2525,7 @@ retry: filemap_get_read_batch(mapping, index, last_index - 1, fbatch); } if (!folio_batch_count(fbatch)) { - if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ)) - return -EAGAIN; - err = filemap_create_folio(filp, mapping, iocb->ki_pos, fbatch); + err = filemap_create_folio(iocb, fbatch); if (err == AOP_TRUNCATED_PAGE) goto retry; return err; -- 2.51.0 From f598cdaafc370a797ae883d370a7c18c1ffc43ef Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:40 -0700 Subject: [PATCH 13/16] mm/filemap: use page_cache_sync_ra() to kick off read-ahead Rather than use the page_cache_sync_readahead() helper, define our own ractl and use page_cache_sync_ra() directly. In preparation for needing to modify ractl inside filemap_get_pages(). No functional changes in this patch. Link: https://lkml.kernel.org/r/20241220154831.1086649-3-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. 
Shutemov Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Cc: Brian Foster Cc: Chris Mason Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/filemap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 904d8fa2bfc0..a1fda00aa6bc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2499,7 +2499,6 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count, { struct file *filp = iocb->ki_filp; struct address_space *mapping = filp->f_mapping; - struct file_ra_state *ra = &filp->f_ra; pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; pgoff_t last_index; struct folio *folio; @@ -2514,12 +2513,13 @@ retry: filemap_get_read_batch(mapping, index, last_index - 1, fbatch); if (!folio_batch_count(fbatch)) { + DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index); + if (iocb->ki_flags & IOCB_NOIO) return -EAGAIN; if (iocb->ki_flags & IOCB_NOWAIT) flags = memalloc_noio_save(); - page_cache_sync_readahead(mapping, ra, filp, index, - last_index - index); + page_cache_sync_ra(&ractl, last_index - index); if (iocb->ki_flags & IOCB_NOWAIT) memalloc_noio_restore(flags); filemap_get_read_batch(mapping, index, last_index - 1, fbatch); -- 2.51.0 From 1963de79d3a3bc12b7a17a922d508b733ca8fa9e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:41 -0700 Subject: [PATCH 14/16] mm/readahead: add folio allocation helper Just a wrapper around filemap_alloc_folio() for now, but add it in preparation for modifying the folio based on the 'ractl' being passed in. No functional changes in this patch. Link: https://lkml.kernel.org/r/20241220154831.1086649-4-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Reviewed-by: Matthew Wilcox (Oracle) Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/readahead.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 2bc3abf07828..722b541c7137 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -178,6 +178,12 @@ static void read_pages(struct readahead_control *rac) BUG_ON(readahead_count(rac)); } +static struct folio *ractl_alloc_folio(struct readahead_control *ractl, + gfp_t gfp_mask, unsigned int order) +{ + return filemap_alloc_folio(gfp_mask, order); +} + /** * page_cache_ra_unbounded - Start unchecked readahead. * @ractl: Readahead control. 
@@ -255,8 +261,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, continue; } - folio = filemap_alloc_folio(gfp_mask, - mapping_min_folio_order(mapping)); + folio = ractl_alloc_folio(ractl, gfp_mask, + mapping_min_folio_order(mapping)); if (!folio) break; @@ -426,7 +432,7 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, pgoff_t mark, unsigned int order, gfp_t gfp) { int err; - struct folio *folio = filemap_alloc_folio(gfp, order); + struct folio *folio = ractl_alloc_folio(ractl, gfp, order); if (!folio) return -ENOMEM; @@ -751,7 +757,7 @@ void readahead_expand(struct readahead_control *ractl, if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ - folio = filemap_alloc_folio(gfp_mask, min_order); + folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; @@ -780,7 +786,7 @@ void readahead_expand(struct readahead_control *ractl, if (folio && !xa_is_value(folio)) return; /* Folio apparently present */ - folio = filemap_alloc_folio(gfp_mask, min_order); + folio = ractl_alloc_folio(ractl, gfp_mask, min_order); if (!folio) return; -- 2.51.0 From cceba6f7e46c48deca433030d80fc34599fb9fd8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:42 -0700 Subject: [PATCH 15/16] mm: add PG_dropbehind folio flag Add a folio flag that file IO can use to indicate that the cached IO being done should be dropped from the page cache upon completion. Link: https://lkml.kernel.org/r/20241220154831.1086649-5-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 +++++ include/trace/events/mmflags.h | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 616b57ddc3fe..36d283552f80 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -110,6 +110,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ + PG_dropbehind, /* drop pages on IO completion */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif @@ -562,6 +563,10 @@ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) FOLIO_FLAG(readahead, FOLIO_HEAD_PAGE) FOLIO_TEST_CLEAR_FLAG(readahead, FOLIO_HEAD_PAGE) +FOLIO_FLAG(dropbehind, FOLIO_HEAD_PAGE) + FOLIO_TEST_CLEAR_FLAG(dropbehind, FOLIO_HEAD_PAGE) + __FOLIO_SET_FLAG(dropbehind, FOLIO_HEAD_PAGE) + #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index bb8a59c6caa2..3bc8656c8359 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -116,7 +116,8 @@ DEF_PAGEFLAG_NAME(head), \ DEF_PAGEFLAG_NAME(reclaim), \ DEF_PAGEFLAG_NAME(swapbacked), \ - DEF_PAGEFLAG_NAME(unevictable) \ + DEF_PAGEFLAG_NAME(unevictable), \ + DEF_PAGEFLAG_NAME(dropbehind) \ IF_HAVE_PG_MLOCK(mlocked) \ IF_HAVE_PG_HWPOISON(hwpoison) \ IF_HAVE_PG_IDLE(idle) \ -- 2.51.0 From 77d075221ae777296e2b18a0a4f5fea6f75daf2c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 20 Dec 2024 08:47:43 -0700 Subject: [PATCH 16/16] mm/readahead: add readahead_control->dropbehind member If ractl->dropbehind is set to true, then folios created are marked as dropbehind as well. 
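 A caller that wants the folios it instantiates to be dropped after IO would then do something like the sketch below (based on the readahead setup already used in filemap_get_pages() earlier in this series; filp, mapping, index and last_index are taken from that code, and the actual uncached-read wiring comes in later patches):

	DEFINE_READAHEAD(ractl, filp, &filp->f_ra, mapping, index);

	/*
	 * Folios created by this readahead get PG_dropbehind, so they are
	 * dropped from the page cache once the IO completes.
	 */
	ractl.dropbehind = true;
	page_cache_sync_ra(&ractl, last_index - index);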
Link: https://lkml.kernel.org/r/20241220154831.1086649-6-axboe@kernel.dk Signed-off-by: Jens Axboe Reviewed-by: Kirill A. Shutemov Cc: Brian Foster Cc: Chris Mason Cc: Christoph Hellwig Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 + mm/readahead.c | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index fc2e1319c7bb..d53c49abead6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1358,6 +1358,7 @@ struct readahead_control { pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; + bool dropbehind; bool _workingset; unsigned long _pflags; }; diff --git a/mm/readahead.c b/mm/readahead.c index 722b541c7137..6a4e96b69702 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -181,7 +181,13 @@ static void read_pages(struct readahead_control *rac) static struct folio *ractl_alloc_folio(struct readahead_control *ractl, gfp_t gfp_mask, unsigned int order) { - return filemap_alloc_folio(gfp_mask, order); + struct folio *folio; + + folio = filemap_alloc_folio(gfp_mask, order); + if (folio && ractl->dropbehind) + __folio_set_dropbehind(folio); + + return folio; } /** -- 2.51.0
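
For reference, the application-side usage described in the cover letter (setting RWF_DONTCACHE per IO via preadv2(2)/pwritev2(2)) looks roughly like the sketch below. It assumes uapi/libc headers new enough to provide RWF_DONTCACHE (the fallback value is an assumption), an arbitrary test file path, and a file system with FOP_DONTCACHE support; otherwise the call fails with EOPNOTSUPP:

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/uio.h>

	#ifndef RWF_DONTCACHE
	#define RWF_DONTCACHE	0x00000080	/* assumed value if headers lack it */
	#endif

	int main(int argc, char **argv)
	{
		char buf[4096];
		struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
		int fd = open(argc > 1 ? argv[1] : "testfile", O_RDONLY);
		ssize_t ret;

		if (fd < 0) {
			perror("open");
			return 1;
		}

		/*
		 * Buffered read through the page cache; the touched range is
		 * dropped from the cache once the data has been copied.
		 */
		ret = preadv2(fd, &iov, 1, 0, RWF_DONTCACHE);
		if (ret < 0)
			perror("preadv2");	/* e.g. EOPNOTSUPP without fs support */
		else
			printf("read %zd bytes uncached\n", ret);

		close(fd);
		return 0;
	}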