From 6c2625e9c2efef5272e1addf6007a1fcab1f059b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 16 Oct 2024 21:23:51 +0300 Subject: [PATCH 01/16] x86/percpu: fix clang warning when dealing with unsigned types Patch series "percpu: Add a test case and fix for clang", v2. Add a test case to percpu to check a corner case with the specific 64-bit unsigned value. This test case shows why the first patch is done in the way it's done. The before and after have been tested with binary comparison of the percpu_test module and by running it on a real Intel system. This patch (of 2): When percpu_add_op() is used with an unsigned argument, it prevents kernel builds with clang, `make W=1` and CONFIG_WERROR=y: net/ipv4/tcp_output.c:187:3: error: result of comparison of constant -1 with expression of type 'u8' (aka 'unsigned char') is always false [-Werror,-Wtautological-constant-out-of-range-compare] 187 | NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 188 | tp->compressed_ack); | ~~~~~~~~~~~~~~~~~~~ ... arch/x86/include/asm/percpu.h:238:31: note: expanded from macro 'percpu_add_op' 238 | ((val) == 1 || (val) == -1)) ? \ | ~~~~~ ^ ~~ Fix this by casting -1 to the type of the parameter and then comparing. Link: https://lkml.kernel.org/r/20241016182635.1156168-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20241016182635.1156168-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Borislav Petkov (AMD) Cc: Christoph Lameter Cc: Dave Hansen Cc: Dennis Zhou Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ingo Molnar Cc: Tejun Heo Cc: Thomas Gleixner Cc: Uros Bizjak Signed-off-by: Andrew Morton --- arch/x86/include/asm/percpu.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index c55a79d5feae..e525cd85f999 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -234,9 +234,10 @@ do { \ */ #define percpu_add_op(size, qual, var, val) \ do { \ - const int pao_ID__ = (__builtin_constant_p(val) && \ - ((val) == 1 || (val) == -1)) ? \ - (int)(val) : 0; \ + const int pao_ID__ = \ + (__builtin_constant_p(val) && \ + ((val) == 1 || \ + (val) == (typeof(val))-1)) ? (int)(val) : 0; \ \ if (0) { \ typeof(var) pao_tmp__; \ -- 2.51.0 From 4a7bba1df00163ecbdf4994bc42b879ade4aeed2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 16 Oct 2024 21:23:52 +0300 Subject: [PATCH 02/16] percpu: add a test case for the specific 64-bit value addition It might be a corner case when we add UINT_MAX as a 64-bit unsigned value to the percpu variable, as it's not the same as -1 (ULLONG_MAX). Add a test case for that. Link: https://lkml.kernel.org/r/20241016182635.1156168-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Borislav Petkov (AMD) Cc: Christoph Lameter Cc: Dave Hansen Cc: Dennis Zhou Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Ingo Molnar Cc: Tejun Heo Cc: Thomas Gleixner Cc: Uros Bizjak Signed-off-by: Andrew Morton --- lib/percpu_test.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/percpu_test.c b/lib/percpu_test.c index 4a3d70bbc1a0..ce7124b16dab 100644 --- a/lib/percpu_test.c +++ b/lib/percpu_test.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include /* validate @native and @pcp counter values match @expected */ @@ -24,8 +25,9 @@ static int __init percpu_test_init(void) * +ul_one/-ul_one below would replace with inc/dec instructions. */ volatile unsigned int ui_one = 1; - long l = 0; + unsigned long long ull = 0; unsigned long ul = 0; + long l = 0; pr_info("percpu test start\n"); @@ -112,6 +114,13 @@ static int __init percpu_test_init(void) CHECK(ul, ulong_counter, -1); CHECK(ul, ulong_counter, ULONG_MAX); + ul = ull = 0; + __this_cpu_write(ulong_counter, 0); + + ul = ull += UINT_MAX; + __this_cpu_add(ulong_counter, ull); + CHECK(ul, ulong_counter, UINT_MAX); + ul = 3; __this_cpu_write(ulong_counter, 3); -- 2.51.0 From d3ea85c6c5f70acff970f3339afb2da8f9a805a6 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 16 Oct 2024 16:10:41 +0200 Subject: [PATCH 03/16] mm: swap: use str_true_false() helper function Remove hard-coded strings by using the helper function str_true_false(). Link: https://lkml.kernel.org/r/20241016141040.79168-2-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Signed-off-by: Andrew Morton --- mm/swap_state.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 4669f29cf555..e0c0321b8ff7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -889,8 +889,7 @@ struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, static ssize_t vma_ra_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sysfs_emit(buf, "%s\n", - enable_vma_readahead ? "true" : "false"); + return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead)); } static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, -- 2.51.0 From f1001f3d3b6868998cab73d10fda1a5c99ddf963 Mon Sep 17 00:00:00 2001 From: Wei Xu Date: Thu, 17 Oct 2024 18:15:28 +0000 Subject: [PATCH 04/16] mm/mglru: reset page lru tier bits when activating When a folio is activated, lru_gen_add_folio() moves the folio to the youngest generation. But unlike folio_update_gen()/folio_inc_gen(), lru_gen_add_folio() doesn't reset the folio lru tier bits (LRU_REFS_MASK | LRU_REFS_FLAGS). This inconsistency can affect how pages are aged via folio_mark_accessed() (e.g. fd accesses), though no user visible impact related to this has been detected yet. Note that lru_gen_add_folio() cannot clear PG_workingset if the activation is due to workingset refault, otherwise PSI accounting will be skipped. So fix lru_gen_add_folio() to clear the lru tier bits other than PG_workingset when activating a folio, and also clear all the lru tier bits when a folio is activated via folio_activate() in lru_gen_look_around(). 
Link: https://lkml.kernel.org/r/20241017181528.3358821-1-weixugc@google.com Fixes: 018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap") Signed-off-by: Wei Xu Cc: Axel Rasmussen Cc: Brian Geffon Cc: Jan Alexander Steffens Cc: Suleiman Souhlal Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 15 ++++++++++++++- include/linux/mmzone.h | 2 ++ mm/vmscan.c | 8 ++++---- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 6f801c7b36e2..355cf46a01a6 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -155,6 +155,11 @@ static inline int folio_lru_refs(struct folio *folio) return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; } +static inline void folio_clear_lru_refs(struct folio *folio) +{ + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); +} + static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); @@ -222,6 +227,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, { unsigned long seq; unsigned long flags; + unsigned long mask; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -257,7 +263,14 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); + mask = LRU_GEN_MASK; + /* + * Don't clear PG_workingset here because it can affect PSI accounting + * if the activation is due to workingset refault. + */ + if (folio_test_active(folio)) + mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active); + set_mask_bits(&folio->flags, mask, flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5b1c984daf45..2e8c4307c728 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -403,6 +403,8 @@ enum { NR_LRU_GEN_CAPS }; +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) + #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5bec29914f12..8d1301c0f22a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2603,8 +2603,6 @@ static bool should_clear_pmd_young(void) * shorthand helpers ******************************************************************************/ -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) - #define DEFINE_MAX_SEQ(lruvec) \ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) @@ -4142,8 +4140,10 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) old_gen = folio_lru_gen(folio); if (old_gen < 0) folio_set_referenced(folio); - else if (old_gen != new_gen) + else if (old_gen != new_gen) { + folio_clear_lru_refs(folio); folio_activate(folio); + } } arch_leave_lazy_mmu_mode(); @@ -4376,7 +4376,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca /* see the comment on MAX_NR_TIERS */ if (!folio_test_referenced(folio)) - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + folio_clear_lru_refs(folio); /* for shrink_folio_list() */ folio_clear_reclaim(folio); -- 2.51.0 From 7146de5ff504003ed6f61c39c379b5777e7bed29 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 17 Oct 2024 17:56:38 +0100 Subject: [PATCH 05/16] tools: testing: 
fix phys_addr_t size on 64-bit systems The phys_addr_t size is predicated on whether CONFIG_PHYS_ADDR_T_64BIT is set or not. In the VMA tests, virt_to_phys() from tools/include/linux casts a volatile void * pointer to phys_addr_t; if CONFIG_PHYS_ADDR_T_64BIT is not set, this will be 32-bit and trigger a warning. Obviously this might also lead to truncation, which we would rather avoid. Fix this by adjusting the generation of generated/bit-length.h to generate a CONFIG_PHYS_ADDR_T{bits}BIT define. This does result in the generation of the useless CONFIG_PHYS_ADDR_T_32BIT define for 32-bit systems, but this should have no effect and makes the implementation easier. This resolves the issue and the warning. [lorenzo.stoakes@oracle.com: VMA tests not properly importing bit-length.h] Link: https://lkml.kernel.org/r/a6183df9-3108-4d59-8128-4fc6c14e22a5@lucifer.local Link: https://lkml.kernel.org/r/20241017165638.95602-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: Liam R. Howlett Reviewed-by: Liam R. Howlett Cc: Jann Horn Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/shared/shared.mk | 1 + tools/testing/vma/vma.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tools/testing/shared/shared.mk b/tools/testing/shared/shared.mk index a6bc51d0b0bf..923ee2492256 100644 --- a/tools/testing/shared/shared.mk +++ b/tools/testing/shared/shared.mk @@ -69,6 +69,7 @@ generated/bit-length.h: FORCE @if ! grep -qws CONFIG_$(LONG_BIT)BIT generated/bit-length.h; then \ echo "Generating $@"; \ echo "#define CONFIG_$(LONG_BIT)BIT 1" > $@; \ + echo "#define CONFIG_PHYS_ADDR_T_$(LONG_BIT)BIT 1" >> $@; \ fi FORCE: ; diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index b33b47342d41..8fab5e13c7c3 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -4,6 +4,8 @@ #include #include +#include "generated/bit-length.h" + #include "maple-shared.h" #include "vma_internal.h" -- 2.51.0 From 5a90c155defa684f3a21f68c3f8e40c056e6114c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 17 Oct 2024 22:17:42 +0800 Subject: [PATCH 06/16] tmpfs: don't enable large folios if not supported tmpfs can support large folios, but there are some configurable options (mount options and runtime deny/force) to enable/disable large folio allocation, so there is a performance issue when performing writes without large folios. The issue is similar to commit 4e527d5841e2 ("iomap: fault in smaller chunks for non-large folio mappings"). Since 'deny' is for emergencies and 'force' is for testing, performance issues should not be a problem in real production environments, so don't call mapping_set_large_folios() in __shmem_get_inode() when large folios are disabled with the mount huge=never option (the default policy). 
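The resulting gate is small; a self-contained C sketch of the idea, using hypothetical stand-in types for shmem_sb_info and address_space (the assumption here is simply that only the huge != SHMEM_HUGE_NEVER policies opt the mapping into large folios):

    #include <stdbool.h>
    #include <stdio.h>

    #define SHMEM_HUGE_NEVER 0

    struct sb_info { int huge; };          /* stand-in for shmem_sb_info */
    struct mapping { bool large_folios; }; /* stand-in for address_space */

    static void set_large_folios(struct mapping *m) { m->large_folios = true; }

    /* only opt the mapping into large folios when a huge= policy enables them */
    static void shmem_init_mapping(struct mapping *m, const struct sb_info *sb)
    {
            if (sb->huge != SHMEM_HUGE_NEVER)
                    set_large_folios(m);
    }

    int main(void)
    {
            struct mapping m = { false };
            struct sb_info never = { SHMEM_HUGE_NEVER };

            shmem_init_mapping(&m, &never);
            printf("large folios enabled: %s\n", m.large_folios ? "yes" : "no");
            return 0;
    }
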
Link: https://lkml.kernel.org/r/20241017141742.1169404-1-wangkefeng.wang@huawei.com Fixes: 9aac777aaf94 ("filemap: Convert generic_perform_write() to support large folios") Signed-off-by: Kefeng Wang Cc: Alexander Viro Cc: Baolin Wang Cc: Christian Brauner Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/shmem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 6ad50ba60d8e..98fb539434f4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2842,7 +2842,10 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); - mapping_set_large_folios(inode->i_mapping); + + /* Don't consider 'deny' for emergencies and 'force' for testing */ + if (sbinfo->huge) + mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: -- 2.51.0 From 9884efd795cc2f71ef3b7f42df32420b0b7ce34f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 17 Oct 2024 22:14:56 +0800 Subject: [PATCH 07/16] mm: huge_memory: move file_thp_enabled() into huge_memory.c file_thp_enabled() is only used in __thp_vma_allowable_orders(), so move it into huge_memory.c; also check READ_ONLY_THP_FOR_FS first to avoid unnecessary code if the config is disabled. Link: https://lkml.kernel.org/r/20241017141457.1169092-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Barry Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 13 ------------- mm/huge_memory.c | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8afe09a2cf03..006f730545c2 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -252,19 +252,6 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, return orders; } -static inline bool file_thp_enabled(struct vm_area_struct *vma) -{ - struct inode *inode; - - if (!vma->vm_file) - return false; - - inode = vma->vm_file->f_inode; - - return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) && - !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); -} - unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long tva_flags, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 73194aa0544c..492c16eaf147 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -83,6 +83,21 @@ unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; static bool anon_orders_configured __initdata; +static inline bool file_thp_enabled(struct vm_area_struct *vma) +{ + struct inode *inode; + + if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) + return false; + + if (!vma->vm_file) + return false; + + inode = file_inode(vma->vm_file); + + return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); +} + unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long tva_flags, -- 2.51.0 From 4a9a27fdf7bfd29013491aea45e3512988cc5876 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 17 Oct 2024 22:14:57 +0800 Subject: [PATCH 08/16] mm: shmem: remove __shmem_huge_global_enabled() Remove __shmem_huge_global_enabled() since it has only one caller, and remove the repeated check of VM_NOHUGEPAGE/MMF_DISABLE_THP as they are checked in shmem_allowable_huge_orders(); also remove the 
unnecessary vma parameter. Link: https://lkml.kernel.org/r/20241017141457.1169092-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Barry Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/shmem.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 98fb539434f4..ebf39aa0b9ab 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -548,17 +548,15 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - struct vm_area_struct *vma, - unsigned long vm_flags) +static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + unsigned long vm_flags) { - struct mm_struct *mm = vma ? vma->vm_mm : NULL; loff_t i_size; - if (!S_ISREG(inode->i_mode)) + if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) return false; - if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags))) + if (!S_ISREG(inode->i_mode)) return false; if (shmem_huge == SHMEM_HUGE_DENY) return false; @@ -576,7 +574,7 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index, return true; fallthrough; case SHMEM_HUGE_ADVISE: - if (mm && (vm_flags & VM_HUGEPAGE)) + if (vm_flags & VM_HUGEPAGE) return true; fallthrough; default: @@ -584,17 +582,6 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index, } } -static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - struct vm_area_struct *vma, unsigned long vm_flags) -{ - if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) - return false; - - return __shmem_huge_global_enabled(inode, index, write_end, - shmem_huge_force, vma, vm_flags); -} - #if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) { @@ -772,8 +759,8 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - struct vm_area_struct *vma, unsigned long vm_flags) + loff_t write_end, bool shmem_huge_force, + unsigned long vm_flags) { return false; } @@ -1170,7 +1157,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, generic_fillattr(idmap, request_mask, inode, stat); inode_unlock_shared(inode); - if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) + if (shmem_huge_global_enabled(inode, 0, 0, false, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1687,7 +1674,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, return 0; global_huge = shmem_huge_global_enabled(inode, index, write_end, - shmem_huge_force, vma, vm_flags); + shmem_huge_force, vm_flags); if (!vma || !vma_is_anon_shmem(vma)) { /* * For tmpfs, we now only support PMD sized THP if huge page -- 2.51.0 From 0938b1614648d5fbd832449a5a8a1b51d985323d Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 17 Oct 2024 08:23:42 +0200 Subject: [PATCH 09/16] mm: don't set readahead flag on a folio when lookahead_size > nr_to_read The readahead flag is set on a folio based on the lookahead_size and nr_to_read. For example, when the readahead happens from index to index + nr_to_read, then the readahead `mark` offset from index is set at nr_to_read - lookahead_size. 
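Before looking at the failure mode described next, here is a standalone check of that offset in the well-behaved case, where lookahead_size <= nr_to_read (plain C; round_up() below mirrors the kernel's power-of-two rounding macro, and the numbers are illustrative):

    #include <stdio.h>

    #define round_up(x, y) ((((x) - 1) | ((y) - 1)) + 1)

    int main(void)
    {
            unsigned long index = 128, nr_to_read = 16, lookahead_size = 4;
            unsigned long min_nrpages = 16;   /* mapping_min_order = 4 */

            /* mark offset, rounded up to the min folio boundary */
            unsigned long ra_folio_index =
                    round_up(index + nr_to_read - lookahead_size, min_nrpages);

            /* 128 + 16 - 4 = 140, rounded up to 144; mark = 16 */
            printf("mark = %lu\n", ra_folio_index - index);
            return 0;
    }
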
There are some scenarios where the lookahead_size > nr_to_read. For example, a readahead window was created, but the file was truncated before the readahead starts. do_page_cache_ra() will clamp the nr_to_read if the readahead window extends beyond EOF after truncation. If this happens, the readahead flag should not be set on any folio in the current readahead window. The current calculation for `mark` with mapping_min_order > 0 gives incorrect results when lookahead_size > nr_to_read due to the rounding-up operation: index = 128 nr_to_read = 16 lookahead_size = 28 mapping_min_order = 4 (16 pages) ra_folio_index = round_up(128 + 16 - 28, 16) = 128; mark = 128 - 128 = 0; # offset from index to set RA flag In the above example, the lookahead_size is actually lying outside the current readahead window. Without this patch, the RA flag will be set incorrectly on the folio at index 128. This can lead to marking the readahead flag on the wrong folio, therefore triggering a readahead when it is not necessary. Explicitly initialize `mark` to be ULONG_MAX and only calculate it when lookahead_size is within the readahead window. Link: https://lkml.kernel.org/r/20241017062342.478973-1-kernel@pankajraghav.com Fixes: 26cfdb395eef ("readahead: allocate folios with mapping_min_order in readahead") Signed-off-by: Pankaj Raghav Cc: Luis Chamberlain Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/readahead.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 3dc6c7a128dd..475d2940a1ed 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -206,9 +206,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct address_space *mapping = ractl->mapping; - unsigned long ra_folio_index, index = readahead_index(ractl); + unsigned long index = readahead_index(ractl); gfp_t gfp_mask = readahead_gfp_mask(mapping); - unsigned long mark, i = 0; + unsigned long mark = ULONG_MAX, i = 0; unsigned int min_nrpages = mapping_min_folio_nrpages(mapping); /* @@ -232,9 +232,14 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * index that only has lookahead or "async_region" to set the * readahead flag. */ - ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size, - min_nrpages); - mark = ra_folio_index - index; + if (lookahead_size <= nr_to_read) { + unsigned long ra_folio_index; + + ra_folio_index = round_up(readahead_index(ractl) + + nr_to_read - lookahead_size, + min_nrpages); + mark = ra_folio_index - index; + } nr_to_read += readahead_index(ractl) - index; ractl->_index = index; -- 2.51.0 From 61e9df7085cca6b62e9d230ed807eb524126a105 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 17 Oct 2024 01:58:08 +0000 Subject: [PATCH 10/16] maple_tree: calculate new_end when needed Patch series "Following cleanup after introducing mas_wr_store_type()", v2. Patch 1 postpones the new_end calculation until needed. Patch 2 removes an unnecessary sanity check in mas_wr_slot_store(). This patch (of 2): For wr_exact_fit/wr_new_root, we don't need to calculate new_end. Let's postpone it until necessary. Link: https://lkml.kernel.org/r/20241017015809.23392-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20241017015809.23392-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Sidhartha Kumar Reviewed-by: Liam R. 
Howlett Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 667120a44570..bc30e99d6cf0 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4211,13 +4211,13 @@ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) if (!wr_mas->entry) mas_wr_extend_null(wr_mas); - new_end = mas_wr_new_end(wr_mas); if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) return wr_exact_fit; if (unlikely(!mas->index && mas->last == ULONG_MAX)) return wr_new_root; + new_end = mas_wr_new_end(wr_mas); /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) -- 2.51.0 From 38dc8f495246667b543de4cc646fce2925e4cf3b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 17 Oct 2024 01:58:09 +0000 Subject: [PATCH 11/16] maple_tree: remove sanity check from mas_wr_slot_store() After commit 5d659bbb52a2 ("maple_tree: introduce mas_wr_store_type()"), the check here is redundant. Let's remove it. Link: https://lkml.kernel.org/r/20241017015809.23392-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index bc30e99d6cf0..38aa8abf8eb8 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3897,7 +3897,8 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) wr_mas->pivots[offset] = mas->index - 1; mas->offset++; /* Keep mas accurate. */ } - } else if (!mt_in_rcu(mas->tree)) { + } else { + WARN_ON_ONCE(mt_in_rcu(mas->tree)); /* * Expand the range, only partially overwriting the previous and * next ranges @@ -3907,8 +3908,6 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) wr_mas->pivots[offset] = mas->index - 1; wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ - } else { - return; } trace_ma_write(__func__, mas, 0, wr_mas->entry); -- 2.51.0 From 58f1069311db63ed9f330fdba4418a13ab49d843 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 18 Oct 2024 13:41:13 -0400 Subject: [PATCH 12/16] mm/mremap: cleanup vma_to_resize() Patch series "mm/mremap: Remove extra vma tree walk", v2. An extra vma tree walk was discovered in some mremap call paths during the discussion on mseal() changes. This patch set removes the extra vma tree walk and further cleans up mremap_to(). This patch (of 2): vma_to_resize() is used in two locations to find and validate the vma for the mremap location. One of the two locations already has the vma, which is then re-found to validate the same vma. This code can be simplified by moving the vma_lookup() from vma_to_resize() to mremap_to() and changing the return type to an int error. Since the function now just validates the vma, the function is renamed to resize_is_valid() to better reflect what it is doing. This commit also adds documentation about the function. Link: https://lkml.kernel.org/r/20241018174114.2871880-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20241018174114.2871880-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Pedro Falcato Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Jann Horn Cc: Jeff Xu Cc: Kefeng Wang Cc: Qi Zheng Signed-off-by: Andrew Morton --- mm/mremap.c | 53 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 5917feafe8cc..e781ec4573ca 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -826,17 +826,24 @@ static unsigned long move_vma(struct vm_area_struct *vma, return new_addr; } -static struct vm_area_struct *vma_to_resize(unsigned long addr, +/* + * resize_is_valid() - Ensure the vma can be resized to the new length at the given + * address. + * + * @vma: The vma to resize + * @addr: The old address + * @old_len: The current size + * @new_len: The desired size + * @flags: The vma flags + * + * Return 0 on success, error otherwise. + */ +static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; unsigned long pgoff; - vma = vma_lookup(mm, addr); - if (!vma) - return ERR_PTR(-EFAULT); - /* * !old_len is a special case where an attempt is made to 'duplicate' * a mapping. This makes no sense for private mappings as it will @@ -847,37 +854,37 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, */ if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid); - return ERR_PTR(-EINVAL); + return -EINVAL; } if ((flags & MREMAP_DONTUNMAP) && (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) - return ERR_PTR(-EINVAL); + return -EINVAL; /* We can't remap across vm area boundaries */ if (old_len > vma->vm_end - addr) - return ERR_PTR(-EFAULT); + return -EFAULT; if (new_len == old_len) - return vma; + return 0; /* Need to be careful about a growing mapping */ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) - return ERR_PTR(-EINVAL); + return -EINVAL; if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) - return ERR_PTR(-EFAULT); + return -EFAULT; if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len)) - return ERR_PTR(-EAGAIN); + return -EAGAIN; if (!may_expand_vm(mm, vma->vm_flags, (new_len - old_len) >> PAGE_SHIFT)) - return ERR_PTR(-ENOMEM); + return -ENOMEM; - return vma; + return 0; } static unsigned long mremap_to(unsigned long addr, unsigned long old_len, @@ -936,12 +943,16 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, old_len = new_len; } - vma = vma_to_resize(addr, old_len, new_len, flags); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); + vma = vma_lookup(mm, addr); + if (!vma) { + ret = -EFAULT; goto out; } + ret = resize_is_valid(vma, addr, old_len, new_len, flags); + if (ret) + goto out; + /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ if (flags & MREMAP_DONTUNMAP && !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) { @@ -1114,11 +1125,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Ok, we need to grow.. */ - vma = vma_to_resize(addr, old_len, new_len, flags); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); + ret = resize_is_valid(vma, addr, old_len, new_len, flags); + if (ret) goto out; - } /* old_len exactly to the end of the area.. 
*/ -- 2.51.0 From 4b6b0a5188c219cf40d6e863e55e2a5ca39e51cd Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 18 Oct 2024 13:41:14 -0400 Subject: [PATCH 13/16] mm/mremap: remove goto from mremap_to() mremap_to() has a goto label at the end that doesn't unwind anything. Removing the label makes the code cleaner. This commit also adds documentation to the function. Link: https://lkml.kernel.org/r/20241018174114.2871880-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Pedro Falcato Cc: David Hildenbrand Cc: Jann Horn Cc: Jeff Xu Cc: Kefeng Wang Cc: Lorenzo Stoakes Cc: Qi Zheng Signed-off-by: Andrew Morton --- mm/mremap.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index e781ec4573ca..4c79ab92eb8f 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -887,6 +887,20 @@ static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr, return 0; } +/* + * mremap_to() - remap a vma to a new location + * @addr: The old address + * @old_len: The old size + * @new_addr: The target address + * @new_len: The new size + * @locked: If the returned vma is locked (VM_LOCKED) + * @flags: the mremap flags + * @uf: The mremap userfaultfd context + * @uf_unmap_early: The userfaultfd unmap early context + * @uf_unmap: The userfaultfd unmap context + * + * Returns: The new address of the vma or an error. + */ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len, bool *locked, unsigned long flags, struct vm_userfaultfd_ctx *uf, @@ -895,18 +909,18 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long ret = -EINVAL; + unsigned long ret; unsigned long map_flags = 0; if (offset_in_page(new_addr)) - goto out; + return -EINVAL; if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) - goto out; + return -EINVAL; /* Ensure the old/new locations do not overlap */ if (addr + old_len > new_addr && new_addr + new_len > addr) - goto out; + return -EINVAL; /* * move_vma() need us to stay 4 maps below the threshold, otherwise @@ -933,31 +947,28 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, */ ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); if (ret) - goto out; + return ret; } if (old_len > new_len) { ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap); if (ret) - goto out; + return ret; old_len = new_len; } vma = vma_lookup(mm, addr); - if (!vma) { - ret = -EFAULT; - goto out; - } + if (!vma) + return -EFAULT; ret = resize_is_valid(vma, addr, old_len, new_len, flags); if (ret) - goto out; + return ret; /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ if (flags & MREMAP_DONTUNMAP && !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) { - ret = -ENOMEM; - goto out; + return -ENOMEM; } if (flags & MREMAP_FIXED) @@ -970,17 +981,14 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); if (IS_ERR_VALUE(ret)) - goto out; + return ret; /* We got a new mapping */ if (!(flags & MREMAP_FIXED)) new_addr = ret; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf, - uf_unmap); - -out: - return ret; + return move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, + uf, uf_unmap); } static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) -- 2.51.0 From 
5bb6345cd2edfceef1749950ce786f205e56a90b Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 18 Oct 2024 15:11:51 +0530 Subject: [PATCH 14/16] mm: remove redundant condition for THP folio folio_test_pmd_mappable() implies folio_test_large(), therefore, simplify the expression for is_thp. Link: https://lkml.kernel.org/r/20241018094151.3458-1-dev.jain@arm.com Signed-off-by: Dev Jain Reviewed-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Anshuman Khandual Cc: "Huang, Ying" Signed-off-by: Andrew Morton --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 72c6657f4f72..dfb5eba3c522 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1733,7 +1733,7 @@ static int migrate_pages_batch(struct list_head *from, list_for_each_entry_safe(folio, folio2, from, lru) { is_large = folio_test_large(folio); - is_thp = is_large && folio_test_pmd_mappable(folio); + is_thp = folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); cond_resched(); -- 2.51.0 From b7f058f827392022d8c689329f88c7b324d71dad Mon Sep 17 00:00:00 2001 From: Luoxi Li Date: Fri, 18 Oct 2024 17:22:35 +0800 Subject: [PATCH 15/16] mm: remove unused has_isolate_pageblock has_isolate_pageblock() has been unused since commit 55612e80e722 ("mm: page_alloc: close migratetype race between freeing and stealing") Remove it. Link: https://lkml.kernel.org/r/20241018092235.2764859-1-kaixa@kiloview.com Signed-off-by: Luoxi Li Acked-by: David Hildenbrand Reviewed-by: Muhammad Usama Anjum Acked-by: Johannes Weiner Reviewed-by: Anshuman Khandual Cc: Baolin Wang Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index c16db0067090..73dc2c1841ec 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -3,10 +3,6 @@ #define __LINUX_PAGEISOLATION_H #ifdef CONFIG_MEMORY_ISOLATION -static inline bool has_isolate_pageblock(struct zone *zone) -{ - return zone->nr_isolate_pageblock; -} static inline bool is_migrate_isolate_page(struct page *page) { return get_pageblock_migratetype(page) == MIGRATE_ISOLATE; @@ -16,10 +12,6 @@ static inline bool is_migrate_isolate(int migratetype) return migratetype == MIGRATE_ISOLATE; } #else -static inline bool has_isolate_pageblock(struct zone *zone) -{ - return false; -} static inline bool is_migrate_isolate_page(struct page *page) { return false; -- 2.51.0 From f3650ef89b879d63c63f04e98481f7ed4df1119a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 18 Oct 2024 11:00:27 +0800 Subject: [PATCH 16/16] mm: shmem: update iocb->ki_pos directly to simplify tmpfs read logic Patch series "Improve the tmpfs large folio read performance", v2. tmpfs already supports PMD-sized large folios, but the tmpfs read operation still performs copying at PAGE_SIZE granularity, which is not perfect. This patchset changes tmpfs to copy data at the folio granularity, which can improve the read performance. Use 'fio bs=64k' to read a 1G tmpfs file populated with 2M THPs, and I can see about 20% performance improvement, and no regression with bs=4k. 
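A rough intuition for where the win comes from, before the functional-testing notes below: with a 1G file, copying per 2M folio instead of per 4K page cuts the number of copy-loop iterations by 512x. A trivial C check of that arithmetic (illustrative iteration counting only, not a benchmark; the measured gain is of course much smaller than the iteration ratio):

    #include <stdio.h>

    #define PAGE_SIZE  4096UL
    #define PMD_SIZE   (2048UL * 1024)   /* 2M THP */

    int main(void)
    {
            unsigned long file_size = 1024UL * 1024 * 1024;  /* 1G */

            /* one copy call per page vs one per 2M folio */
            printf("page-granularity copies:  %lu\n", file_size / PAGE_SIZE);
            printf("folio-granularity copies: %lu\n", file_size / PMD_SIZE);
            return 0;
    }
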
I also did some functional testing with the xfstests suite, and I did not find any regressions with the following xfstests config: FSTYP=tmpfs export TEST_DIR=/mnt/tempfs_mnt export TEST_DEV=/mnt/tempfs_mnt export SCRATCH_MNT=/mnt/scratchdir export SCRATCH_DEV=/mnt/scratchdir This patch (of 2): Using iocb->ki_pos to check if the read bytes exceeds the file size and to calculate the bytes to be read can help simplify the code logic. Meanwhile, this is also a preparation for improving tmpfs large folios read performance in the following patch. Link: https://lkml.kernel.org/r/cover.1729218573.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/e8863e289577e0dc1e365b5419bf2d1c9a24ae3d.1729218573.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/shmem.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ebf39aa0b9ab..26aaddc52fd1 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3093,27 +3093,19 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) unsigned long offset; int error = 0; ssize_t retval = 0; - loff_t *ppos = &iocb->ki_pos; - index = *ppos >> PAGE_SHIFT; - offset = *ppos & ~PAGE_MASK; + offset = iocb->ki_pos & ~PAGE_MASK; for (;;) { struct folio *folio = NULL; struct page *page = NULL; - pgoff_t end_index; unsigned long nr, ret; - loff_t i_size = i_size_read(inode); + loff_t end_offset, i_size = i_size_read(inode); - end_index = i_size >> PAGE_SHIFT; - if (index > end_index) + if (unlikely(iocb->ki_pos >= i_size)) break; - if (index == end_index) { - nr = i_size & ~PAGE_MASK; - if (nr <= offset) - break; - } + index = iocb->ki_pos >> PAGE_SHIFT; error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) @@ -3135,18 +3127,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * We must evaluate after, since reads (unlike writes) * are called without i_rwsem protection against truncate */ - nr = PAGE_SIZE; i_size = i_size_read(inode); - end_index = i_size >> PAGE_SHIFT; - if (index == end_index) { - nr = i_size & ~PAGE_MASK; - if (nr <= offset) { - if (folio) - folio_put(folio); - break; - } + if (unlikely(iocb->ki_pos >= i_size)) { + if (folio) + folio_put(folio); + break; } - nr -= offset; + end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); + nr = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset); if (folio) { /* @@ -3186,8 +3174,8 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) retval += ret; offset += ret; - index += offset >> PAGE_SHIFT; offset &= ~PAGE_MASK; + iocb->ki_pos += ret; if (!iov_iter_count(to)) break; @@ -3198,7 +3186,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) cond_resched(); } - *ppos = ((loff_t) index << PAGE_SHIFT) + offset; file_accessed(file); return retval ? retval : error; } -- 2.51.0
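For reference, the per-iteration bounds arithmetic introduced by this last patch can be exercised standalone. The sketch below uses scalar stand-ins for the iocb/inode state and assumes 4K pages; it is a minimal model of the end_offset/nr computation, not the shmem code itself:

    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
            return a < b ? a : b;
    }

    int main(void)
    {
            unsigned long ki_pos = 5000, i_size = 6000, want = 64 * 1024;
            unsigned long offset = ki_pos & ~PAGE_MASK;  /* offset within page */

            /* bytes to copy this iteration: capped by EOF and by the page end */
            unsigned long end_offset = min_ul(i_size, ki_pos + want);
            unsigned long nr = min_ul(end_offset - ki_pos, PAGE_SIZE - offset);

            /* here: 1000 bytes, stopping exactly at EOF */
            printf("copy %lu bytes at pos %lu (offset in page %lu)\n",
                   nr, ki_pos, offset);
            return 0;
    }
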