From 1d212293ffd145eebd795d5969a81a3b59f71bcb Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 5 Feb 2025 17:27:21 +0800 Subject: [PATCH 01/16] mm/swapfile.c: open code cluster_alloc_swap() It's only called in scan_swap_map_slots(). And also remove the stale code comment in scan_swap_map_slots() because it's not fit for the current cluster allocation mechanism. Link: https://lkml.kernel.org/r/20250205092721.9395-13-bhe@redhat.com Signed-off-by: Baoquan He Cc: Chris Li Cc: Kairui Song Signed-off-by: Andrew Morton --- mm/swapfile.c | 38 ++++++++++---------------------------- 1 file changed, 10 insertions(+), 28 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index c73e258bb588..cab68e57f4cc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1163,39 +1163,13 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_usage_sub(si, nr_entries); } -static int cluster_alloc_swap(struct swap_info_struct *si, - unsigned char usage, int nr, - swp_entry_t slots[], int order) -{ - int n_ret = 0; - - while (n_ret < nr) { - unsigned long offset = cluster_alloc_swap_entry(si, order, usage); - - if (!offset) - break; - slots[n_ret++] = swp_entry(si->type, offset); - } - - return n_ret; -} - static int scan_swap_map_slots(struct swap_info_struct *si, unsigned char usage, int nr, swp_entry_t slots[], int order) { unsigned int nr_pages = 1 << order; + int n_ret = 0; - /* - * We try to cluster swap pages by allocating them sequentially - * in swap. Once we've allocated SWAPFILE_CLUSTER pages this - * way, however, we resort to first-free allocation, starting - * a new cluster. This prevents us from scattering swap pages - * all over the entire swap partition, so that we reduce - * overall disk seek times between swap pages. -- sct - * But we do now try to find an empty cluster. -Andrea - * And we let swap pages go all over an SSD partition. Hugh - */ if (order > 0) { /* * Should not even be attempting large allocations when huge @@ -1215,7 +1189,15 @@ static int scan_swap_map_slots(struct swap_info_struct *si, return 0; } - return cluster_alloc_swap(si, usage, nr, slots, order); + while (n_ret < nr) { + unsigned long offset = cluster_alloc_swap_entry(si, order, usage); + + if (!offset) + break; + slots[n_ret++] = swp_entry(si->type, offset); + } + + return n_ret; } static bool get_swap_device_info(struct swap_info_struct *si) -- 2.51.0 From 9a5b183941b52f84c0f9e5f27ce44e99318c9e0f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 6 Feb 2025 13:26:33 +0100 Subject: [PATCH 02/16] mm, percpu: do not consider sleepable allocations atomic 28307d938fb2 ("percpu: make pcpu_alloc() aware of current gfp context") has fixed a reclaim recursion for scoped GFP_NOFS context. It has done that by avoiding taking pcpu_alloc_mutex. This is a correct solution as the worker context with full GFP_KERNEL allocation/reclaim power and which is using the same lock cannot block the NOFS pcpu_alloc caller. On the other hand this is a very conservative approach that could lead to failures because pcpu_alloc lockless implementation is quite limited. We have a bug report about premature failures when scsi array of 193 devices is scanned. Sometimes (not consistently) the scanning aborts because the iscsid daemon fails to create the queue for a random scsi device during the scan. iscsid itslef is running with PR_SET_IO_FLUSHER set so all allocations from this process context are GFP_NOIO. This in turn makes any pcpu_alloc lockless (without pcpu_alloc_mutex) which leads to pre-mature failures. It has turned out that iscsid has worked around this by dropping PR_SET_IO_FLUSHER (https://github.com/open-iscsi/open-iscsi/pull/382) when scanning host. But we can do better in this case on the kernel side and use pcpu_alloc_mutex for NOIO resp. NOFS constrained allocation scopes too. We just need the WQ worker to never trigger IO/FS reclaim. Achieve that by enforcing scoped GFP_NOIO for the whole execution of pcpu_balance_workfn (this will imply NOFS constrain as well). This will remove the dependency chain and preserve the full allocation power of the pcpu_alloc call. While at it make is_atomic really test for blockable allocations. Link: https://lkml.kernel.org/r/20250206122633.167896-1-mhocko@kernel.org Fixes: 28307d938fb2 ("percpu: make pcpu_alloc() aware of current gfp context") Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Dennis Zhou Cc: Filipe David Manana Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/percpu.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/percpu.c b/mm/percpu.c index ac61e3fc5f15..027fb6497495 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1745,7 +1745,7 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, gfp = current_gfp_context(gfp); /* whitelisted flags that can be passed to the backing allocators */ pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); - is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; + is_atomic = !gfpflags_allow_blocking(gfp); do_warn = !(gfp & __GFP_NOWARN); /* @@ -2191,7 +2191,12 @@ static void pcpu_balance_workfn(struct work_struct *work) * to grow other chunks. This then gives pcpu_reclaim_populated() time * to move fully free chunks to the active list to be freed if * appropriate. + * + * Enforce GFP_NOIO allocations because we have pcpu_alloc users + * constrained to GFP_NOIO/NOFS contexts and they could form lock + * dependency through pcpu_alloc_mutex */ + unsigned int flags = memalloc_noio_save(); mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); @@ -2202,6 +2207,7 @@ static void pcpu_balance_workfn(struct work_struct *work) spin_unlock_irq(&pcpu_lock); mutex_unlock(&pcpu_alloc_mutex); + memalloc_noio_restore(flags); } /** -- 2.51.0 From 7ddeb91f5b03f25995861e9b2e1eb766aa528892 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 6 Feb 2025 11:45:36 +0000 Subject: [PATCH 03/16] mm: kmemleak: add support for dumping physical and __percpu object info Patch series "mm: kmemleak: Usability improvements". Following a recent false positive tracking that led to commit 488b5b9eca68 ("mm: kmemleak: fix upper boundary check for physical address objects"), I needed kmemleak to give me more debug information about the objects it is tracking. This lead to the first patch of this series. The second patch changes the kmemleak-test module to show the raw pointers for debugging purposes. This patch (of 2): Currently, echo dump=... > /sys/kernel/debug/kmemleak only looks up the main virtual address object tree. However, for debugging, it's useful to dump information about physical address and __percpu objects. Search all three object trees for the dump= command and also print the type of the object if not virtual: "(phys)" or "(percpu)". In addition, allow search by alias (pointer within the object). Link: https://lkml.kernel.org/r/20250206114537.2597764-1-catalin.marinas@arm.com Link: https://lkml.kernel.org/r/20250206114537.2597764-2-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton --- mm/kmemleak.c | 52 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c6ed68604136..c12cef3eeb32 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -352,6 +352,15 @@ static bool unreferenced_object(struct kmemleak_object *object) jiffies_last_scan); } +static const char *__object_type_str(struct kmemleak_object *object) +{ + if (object->flags & OBJECT_PHYS) + return " (phys)"; + if (object->flags & OBJECT_PERCPU) + return " (percpu)"; + return ""; +} + /* * Printing of the unreferenced objects information to the seq file. The * print_unreferenced function must be called with the object->lock held. @@ -364,8 +373,9 @@ static void print_unreferenced(struct seq_file *seq, unsigned int nr_entries; nr_entries = stack_depot_fetch(object->trace_handle, &entries); - warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", - object->pointer, object->size); + warn_or_seq_printf(seq, "unreferenced object%s 0x%08lx (size %zu):\n", + __object_type_str(object), + object->pointer, object->size); warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); hex_dump_object(seq, object); @@ -384,10 +394,10 @@ static void print_unreferenced(struct seq_file *seq, */ static void dump_object_info(struct kmemleak_object *object) { - pr_notice("Object 0x%08lx (size %zu):\n", - object->pointer, object->size); + pr_notice("Object%s 0x%08lx (size %zu):\n", + __object_type_str(object), object->pointer, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", - object->comm, object->pid, object->jiffies); + object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); pr_notice(" flags = 0x%x\n", object->flags); @@ -1998,25 +2008,41 @@ static int kmemleak_open(struct inode *inode, struct file *file) return seq_open(file, &kmemleak_seq_ops); } -static int dump_str_object_info(const char *str) +static bool __dump_str_object_info(unsigned long addr, unsigned int objflags) { unsigned long flags; struct kmemleak_object *object; + + object = __find_and_get_object(addr, 1, objflags); + if (!object) + return false; + + raw_spin_lock_irqsave(&object->lock, flags); + dump_object_info(object); + raw_spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); + + return true; +} + +static int dump_str_object_info(const char *str) +{ unsigned long addr; + bool found = false; if (kstrtoul(str, 0, &addr)) return -EINVAL; - object = find_and_get_object(addr, 0); - if (!object) { + + found |= __dump_str_object_info(addr, 0); + found |= __dump_str_object_info(addr, OBJECT_PHYS); + found |= __dump_str_object_info(addr, OBJECT_PERCPU); + + if (!found) { pr_info("Unknown object at 0x%08lx\n", addr); return -EINVAL; } - raw_spin_lock_irqsave(&object->lock, flags); - dump_object_info(object); - raw_spin_unlock_irqrestore(&object->lock, flags); - - put_object(object); return 0; } -- 2.51.0 From fe1136b4ccbfac9b8e72d4551d1ce788a67d59cb Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 6 Feb 2025 11:45:37 +0000 Subject: [PATCH 04/16] samples: kmemleak: print the raw pointers for debugging purposes The kmemleak-test.c module is meant to leak some pointers for debugging the kmemleak detection, pointer information dumping. It's no use if it prints the hashed values of such pointers. Change the printk() format from %p to %px. While at it, also display the raw __percpu pointer rather than this_cpu_ptr() since kmemleak now tracks such pointers independently of the standard allocations. Link: https://lkml.kernel.org/r/20250206114537.2597764-3-catalin.marinas@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton --- samples/kmemleak/kmemleak-test.c | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/samples/kmemleak/kmemleak-test.c b/samples/kmemleak/kmemleak-test.c index 544c36d51d56..8609812a37eb 100644 --- a/samples/kmemleak/kmemleak-test.c +++ b/samples/kmemleak/kmemleak-test.c @@ -40,25 +40,25 @@ static int kmemleak_test_init(void) pr_info("Kmemleak testing\n"); /* make some orphan objects */ - pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); - pr_info("kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL)); - pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); - pr_info("kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL)); - pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); - pr_info("kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL)); - pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); - pr_info("kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(32) = 0x%px\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(32) = 0x%px\n", kmalloc(32, GFP_KERNEL)); + pr_info("kmalloc(1024) = 0x%px\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(1024) = 0x%px\n", kmalloc(1024, GFP_KERNEL)); + pr_info("kmalloc(2048) = 0x%px\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(2048) = 0x%px\n", kmalloc(2048, GFP_KERNEL)); + pr_info("kmalloc(4096) = 0x%px\n", kmalloc(4096, GFP_KERNEL)); + pr_info("kmalloc(4096) = 0x%px\n", kmalloc(4096, GFP_KERNEL)); #ifndef CONFIG_MODULES - pr_info("kmem_cache_alloc(files_cachep) = %p\n", + pr_info("kmem_cache_alloc(files_cachep) = 0x%px\n", kmem_cache_alloc(files_cachep, GFP_KERNEL)); - pr_info("kmem_cache_alloc(files_cachep) = %p\n", + pr_info("kmem_cache_alloc(files_cachep) = 0x%px\n", kmem_cache_alloc(files_cachep, GFP_KERNEL)); #endif - pr_info("vmalloc(64) = %p\n", vmalloc(64)); - pr_info("vmalloc(64) = %p\n", vmalloc(64)); - pr_info("vmalloc(64) = %p\n", vmalloc(64)); - pr_info("vmalloc(64) = %p\n", vmalloc(64)); - pr_info("vmalloc(64) = %p\n", vmalloc(64)); + pr_info("vmalloc(64) = 0x%px\n", vmalloc(64)); + pr_info("vmalloc(64) = 0x%px\n", vmalloc(64)); + pr_info("vmalloc(64) = 0x%px\n", vmalloc(64)); + pr_info("vmalloc(64) = 0x%px\n", vmalloc(64)); + pr_info("vmalloc(64) = 0x%px\n", vmalloc(64)); /* * Add elements to a list. They should only appear as orphan @@ -66,7 +66,7 @@ static int kmemleak_test_init(void) */ for (i = 0; i < 10; i++) { elem = kzalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kzalloc(sizeof(*elem)) = %p\n", elem); + pr_info("kzalloc(sizeof(*elem)) = 0x%px\n", elem); if (!elem) return -ENOMEM; INIT_LIST_HEAD(&elem->list); @@ -75,11 +75,11 @@ static int kmemleak_test_init(void) for_each_possible_cpu(i) { per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); - pr_info("kmalloc(129) = %p\n", + pr_info("kmalloc(129) = 0x%px\n", per_cpu(kmemleak_test_pointer, i)); } - pr_info("__alloc_percpu(64, 4) = %p\n", __alloc_percpu(64, 4)); + pr_info("__alloc_percpu(64, 4) = 0x%px\n", __alloc_percpu(64, 4)); return 0; } -- 2.51.0 From 3a06696305e757f652dd0dcf4dfa2272eda39434 Mon Sep 17 00:00:00 2001 From: Usama Arif Date: Fri, 7 Feb 2025 13:20:32 -0800 Subject: [PATCH 05/16] mm/damon/ops: have damon_get_folio return folio even for tail pages Patch series "mm/damon/paddr: fix large folios access and schemes handling". DAMON operations set for physical address space, namely 'paddr', treats tail pages as unaccessed always. It can also apply DAMOS action to a large folio multiple times within single DAMOS' regions walking. As a result, the monitoring output has poor quality and DAMOS works in unexpected ways when large folios are being used. Fix those. The patches were parts of Usama's hugepage_size DAMOS filter patch series[1]. The first fix has collected from there with a slight commit message change for the subject prefix. The second fix is re-written by SJ and posted as an RFC before this series. The second one also got a slight commit message change for the subject prefix. [1] https://lore.kernel.org/20250203225604.44742-1-usamaarif642@gmail.com [2] https://lore.kernel.org/20250206231103.38298-1-sj@kernel.org This patch (of 2): This effectively adds support for large folios in damon for paddr, as damon_pa_mkold/young won't get a null folio from this function and won't ignore it, hence access will be checked and reported. This also means that larger folios will be considered for different DAMOS actions like pageout, prioritization and migration. As these DAMOS actions will consider larger folios, iterate through the region at folio_size and not PAGE_SIZE intervals. This should not have an affect on vaddr, as damon_young_pmd_entry considers pmd entries. Link: https://lkml.kernel.org/r/20250207212033.45269-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250207212033.45269-2-sj@kernel.org Fixes: a28397beb55b ("mm/damon: implement primitives for physical address space monitoring") Signed-off-by: Usama Arif Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Cc: Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 2 +- mm/damon/paddr.c | 24 ++++++++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 86a50e8fbc80..0db1fc70c84d 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -26,7 +26,7 @@ struct folio *damon_get_folio(unsigned long pfn) struct page *page = pfn_to_online_page(pfn); struct folio *folio; - if (!page || PageTail(page)) + if (!page) return NULL; folio = page_folio(page); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 1a89920efce9..2ac19ebc7076 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -277,11 +277,14 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, damos_add_filter(s, filter); } - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + addr = r->ar.start; + while (addr < r->ar.end) { struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - if (!folio) + if (!folio) { + addr += PAGE_SIZE; continue; + } if (damos_pa_filter_out(s, folio)) goto put_folio; @@ -297,6 +300,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, else list_add(&folio->lru, &folio_list); put_folio: + addr += folio_size(folio); folio_put(folio); } if (install_young_filter) @@ -312,11 +316,14 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( { unsigned long addr, applied = 0; - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + addr = r->ar.start; + while (addr < r->ar.end) { struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - if (!folio) + if (!folio) { + addr += PAGE_SIZE; continue; + } if (damos_pa_filter_out(s, folio)) goto put_folio; @@ -329,6 +336,7 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( folio_deactivate(folio); applied += folio_nr_pages(folio); put_folio: + addr += folio_size(folio); folio_put(folio); } return applied * PAGE_SIZE; @@ -475,11 +483,14 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, unsigned long addr, applied; LIST_HEAD(folio_list); - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { + addr = r->ar.start; + while (addr < r->ar.end) { struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - if (!folio) + if (!folio) { + addr += PAGE_SIZE; continue; + } if (damos_pa_filter_out(s, folio)) goto put_folio; @@ -490,6 +501,7 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, goto put_folio; list_add(&folio->lru, &folio_list); put_folio: + addr += folio_size(folio); folio_put(folio); } applied = damon_pa_migrate_pages(&folio_list, s->target_nid); -- 2.51.0 From 94ba17adaba0f651fdcf745c8891a88e2e028cfa Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 7 Feb 2025 13:20:33 -0800 Subject: [PATCH 06/16] mm/damon: avoid applying DAMOS action to same entity multiple times 'paddr' DAMON operations set can apply a DAMOS scheme's action to a large folio multiple times in single DAMOS-regions-walk if the folio is laid on multiple DAMON regions. Add a field for DAMOS scheme object that can be used by the underlying ops to know what was the last entity that the scheme's action has applied. The core layer unsets the field when each DAMOS-regions-walk is done for the given scheme. And update 'paddr' ops to use the infrastructure to avoid the problem. Link: https://lkml.kernel.org/r/20250207212033.45269-3-sj@kernel.org Fixes: 57223ac29584 ("mm/damon/paddr: support the pageout scheme") Signed-off-by: SeongJae Park Reported-by: Usama Arif Closes: https://lore.kernel.org/20250203225604.44742-3-usamaarif642@gmail.com Cc: Signed-off-by: Andrew Morton --- include/linux/damon.h | 11 +++++++++++ mm/damon/core.c | 1 + mm/damon/paddr.c | 39 +++++++++++++++++++++++++++------------ 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index c9074d569596..b4d37d9b9221 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -432,6 +432,7 @@ struct damos_access_pattern { * @wmarks: Watermarks for automated (in)activation of this scheme. * @target_nid: Destination node if @action is "migrate_{hot,cold}". * @filters: Additional set of &struct damos_filter for &action. + * @last_applied: Last @action applied ops-managing entity. * @stat: Statistics of this scheme. * @list: List head for siblings. * @@ -454,6 +455,15 @@ struct damos_access_pattern { * implementation could check pages of the region and skip &action to respect * &filters * + * The minimum entity that @action can be applied depends on the underlying + * &struct damon_operations. Since it may not be aligned with the core layer + * abstract, namely &struct damon_region, &struct damon_operations could apply + * @action to same entity multiple times. Large folios that underlying on + * multiple &struct damon region objects could be such examples. The &struct + * damon_operations can use @last_applied to avoid that. DAMOS core logic + * unsets @last_applied when each regions walking for applying the scheme is + * finished. + * * After applying the &action to each region, &stat_count and &stat_sz is * updated to reflect the number of regions and total size of regions that the * &action is applied. @@ -482,6 +492,7 @@ struct damos { int target_nid; }; struct list_head filters; + void *last_applied; struct damos_stat stat; struct list_head list; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 384935ef4e65..dc8f94fe7c3b 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1856,6 +1856,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) s->next_apply_sis = c->passed_sample_intervals + (s->apply_interval_us ? s->apply_interval_us : c->attrs.aggr_interval) / sample_interval; + s->last_applied = NULL; } } diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 2ac19ebc7076..eb10a723b0a7 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -254,6 +254,17 @@ static bool damos_pa_filter_out(struct damos *scheme, struct folio *folio) return false; } +static bool damon_pa_invalid_damos_folio(struct folio *folio, struct damos *s) +{ + if (!folio) + return true; + if (folio == s->last_applied) { + folio_put(folio); + return true; + } + return false; +} + static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, unsigned long *sz_filter_passed) { @@ -261,6 +272,7 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, LIST_HEAD(folio_list); bool install_young_filter = true; struct damos_filter *filter; + struct folio *folio; /* check access in page level again by default */ damos_for_each_filter(filter, s) { @@ -279,9 +291,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s, addr = r->ar.start; while (addr < r->ar.end) { - struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - - if (!folio) { + folio = damon_get_folio(PHYS_PFN(addr)); + if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; continue; } @@ -307,6 +318,7 @@ put_folio: damos_destroy_filter(filter); applied = reclaim_pages(&folio_list); cond_resched(); + s->last_applied = folio; return applied * PAGE_SIZE; } @@ -315,12 +327,12 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate( unsigned long *sz_filter_passed) { unsigned long addr, applied = 0; + struct folio *folio; addr = r->ar.start; while (addr < r->ar.end) { - struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - - if (!folio) { + folio = damon_get_folio(PHYS_PFN(addr)); + if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; continue; } @@ -339,6 +351,7 @@ put_folio: addr += folio_size(folio); folio_put(folio); } + s->last_applied = folio; return applied * PAGE_SIZE; } @@ -482,12 +495,12 @@ static unsigned long damon_pa_migrate(struct damon_region *r, struct damos *s, { unsigned long addr, applied; LIST_HEAD(folio_list); + struct folio *folio; addr = r->ar.start; while (addr < r->ar.end) { - struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - - if (!folio) { + folio = damon_get_folio(PHYS_PFN(addr)); + if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; continue; } @@ -506,6 +519,7 @@ put_folio: } applied = damon_pa_migrate_pages(&folio_list, s->target_nid); cond_resched(); + s->last_applied = folio; return applied * PAGE_SIZE; } @@ -523,15 +537,15 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, { unsigned long addr; LIST_HEAD(folio_list); + struct folio *folio; if (!damon_pa_scheme_has_filter(s)) return 0; addr = r->ar.start; while (addr < r->ar.end) { - struct folio *folio = damon_get_folio(PHYS_PFN(addr)); - - if (!folio) { + folio = damon_get_folio(PHYS_PFN(addr)); + if (damon_pa_invalid_damos_folio(folio, s)) { addr += PAGE_SIZE; continue; } @@ -541,6 +555,7 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, addr += folio_size(folio); folio_put(folio); } + s->last_applied = folio; return 0; } -- 2.51.0 From e92b6e7bb6189d688ab8f9b27e3992cd8568ee4b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 7 Feb 2025 17:24:42 +0000 Subject: [PATCH 07/16] mm: use READ/WRITE_ONCE() for vma->vm_flags on migrate, mprotect According to the syzbot report referenced here, it is possible to encounter a race between mprotect() writing to the vma->vm_flags field and migration checking whether the VMA is locked. There is no real problem with timing here per se, only that torn reads/writes may occur. Therefore, as a proximate fix, ensure both operations READ_ONCE() and WRITE_ONCE() to avoid this. This race is possible due to the ability to look up VMAs via the rmap, which migration does in this case, which takes no mmap or VMA lock and therefore does not preclude an operation to modify a VMA. When the final update of VMA flags is performed by mprotect, this will cause the rmap lock to be taken while the VMA is inserted on split/merge. However the means by which we perform splits/merges in the kernel is that we perform the split/merge operation on the VMA, acquiring/releasing locks as needed, and only then, after having done so, modifying fields. We should carefully examine and determine whether we can combine the two operations so as to avoid such races, and whether it might be possible to otherwise annotate these rmap field accesses. Link: https://lkml.kernel.org/r/20250207172442.78836-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reported-by: syzbot+c2e5712cbb14c95d4847@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67a34e60.050a0220.50516.0040.GAE@google.com/ Cc: Jann Horn Cc: Liam Howlett Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/migrate.c | 2 +- mm/mprotect.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 97f0edf0c032..a991d3691bda 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -328,7 +328,7 @@ static bool remove_migration_pte(struct folio *folio, folio_add_file_rmap_pte(folio, new, vma); set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } - if (vma->vm_flags & VM_LOCKED) + if (READ_ONCE(vma->vm_flags) & VM_LOCKED) mlock_drain_local(); trace_remove_migration_pte(pvmw.address, pte_val(pte), diff --git a/mm/mprotect.c b/mm/mprotect.c index 9cb6ab7c4048..1444878f7aeb 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -599,7 +599,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, unsigned long start, unsigned long end, unsigned long newflags) { struct mm_struct *mm = vma->vm_mm; - unsigned long oldflags = vma->vm_flags; + unsigned long oldflags = READ_ONCE(vma->vm_flags); long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; @@ -619,7 +619,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * uncommon case, so doesn't need to be very optimized. */ if (arch_has_pfn_modify_check() && - (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && + (oldflags & (VM_PFNMAP|VM_MIXEDMAP)) && (newflags & VM_ACCESS_FLAGS) == 0) { pgprot_t new_pgprot = vm_get_page_prot(newflags); @@ -668,7 +668,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * held in write mode. */ vma_start_write(vma); - vm_flags_reset(vma, newflags); + vm_flags_reset_once(vma, newflags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); -- 2.51.0 From 8d9a2f5d8abd3ba4355f25e60827c1ee082cd215 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 7 Feb 2025 10:04:53 +0000 Subject: [PATCH 08/16] mm/mm_init.c: use round_up() to align movable range Since MAX_ORDER_NR_PAGES is power of 2, let's use a faster version. Link: https://lkml.kernel.org/r/20250207100453.9989-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Anshuman Khandual Reviewed-by: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- mm/mm_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 2630cc30147e..de18d3ad12e1 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -438,7 +438,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) * was requested by the user */ required_movablecore = - roundup(required_movablecore, MAX_ORDER_NR_PAGES); + round_up(required_movablecore, MAX_ORDER_NR_PAGES); required_movablecore = min(totalpages, required_movablecore); corepages = totalpages - required_movablecore; @@ -549,7 +549,7 @@ out2: unsigned long start_pfn, end_pfn; zone_movable_pfn[nid] = - roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + round_up(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); if (zone_movable_pfn[nid] >= end_pfn) -- 2.51.0 From c32696ca5e8e9fff83c951f3aa45cac2e97b0667 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 10 Feb 2025 10:27:34 -0800 Subject: [PATCH 09/16] mm/damon/core: unset damos->walk_completed after confimed set Patch series "mm/damon/core: fix wrong and/or useless damos_walk() behaviors". damos_walk() can finish working earlier or later than expected, and start earlier than practical. First two behaviors are clearly wrong behavior (doesn't follow the documentation) and all three behaviors are only making the feature useless. Fix those. This patch (of 3): damos->walk_completed is only set, not unset. This can cause next damos_walk() finish earlier than expected. Unset it after all walk_completed is confirmed. Link: https://lkml.kernel.org/r/20250210182737.134994-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250210182737.134994-2-sj@kernel.org Fixes: bf0eaba0ff9c ("mm/damon/core: implement damos_walk()") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index dc8f94fe7c3b..8e4ae9901b19 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1494,6 +1494,9 @@ static void damos_walk_complete(struct damon_ctx *ctx, struct damos *s) if (!siter->walk_completed) return; } + damon_for_each_scheme(siter, ctx) + siter->walk_completed = false; + complete(&control->completion); mutex_lock(&ctx->walk_control_lock); ctx->walk_control = NULL; -- 2.51.0 From 40eb655b410d5c842313e556f743888033687865 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 10 Feb 2025 10:27:35 -0800 Subject: [PATCH 10/16] mm/damon/core: do not call damos_walk_control->walk() if walk is completed damos_walk() invokes callback functions of schemes until all schemes finishes at least one round of walks. If there are multiple DAMOS schemes having different apply_interval, the callback functions for longer apply interval scheme will be called for more than a round of the walk. The behavior is different from the document (see damos_walk() kernel-doc comment), and not useful. Make the behavior be same to the documented one, by stopping invoking the callback if the walk for the given scheme is completed. Link: https://lkml.kernel.org/r/20250210182737.134994-3-sj@kernel.org Fixes: bf0eaba0ff9c ("mm/damon/core: implement damos_walk()") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 8e4ae9901b19..e129fb785970 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1458,6 +1458,9 @@ static void damos_walk_call_walk(struct damon_ctx *ctx, struct damon_target *t, { struct damos_walk_control *control; + if (s->walk_completed) + return; + mutex_lock(&ctx->walk_control_lock); control = ctx->walk_control; mutex_unlock(&ctx->walk_control_lock); -- 2.51.0 From 6fa70372c86162608c522eeaa58201d6c11ab773 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 10 Feb 2025 10:27:36 -0800 Subject: [PATCH 11/16] mm/damon/core: do damos walking in entire regions granularity damos_walk_control can be installed while DAMOS is walking the regions. This means the walk callback function invocations can be started from a region at the middle of the regions list. This makes it hard to be used reliably. Particularly, DAMOS tried regions update for collecting monitoring results gets problematic results. Increase the walk_control_lock critical section to do walking in entire regions granularity. Link: https://lkml.kernel.org/r/20250210182737.134994-4-sj@kernel.org Fixes: bf0eaba0ff9c ("mm/damon/core: implement damos_walk()") Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index e129fb785970..f663c8e99dfa 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1461,11 +1461,10 @@ static void damos_walk_call_walk(struct damon_ctx *ctx, struct damon_target *t, if (s->walk_completed) return; - mutex_lock(&ctx->walk_control_lock); control = ctx->walk_control; - mutex_unlock(&ctx->walk_control_lock); if (!control) return; + control->walk_fn(control->data, ctx, t, r, s, sz_filter_passed); } @@ -1485,9 +1484,7 @@ static void damos_walk_complete(struct damon_ctx *ctx, struct damos *s) struct damos *siter; struct damos_walk_control *control; - mutex_lock(&ctx->walk_control_lock); control = ctx->walk_control; - mutex_unlock(&ctx->walk_control_lock); if (!control) return; @@ -1501,9 +1498,7 @@ static void damos_walk_complete(struct damon_ctx *ctx, struct damos *s) siter->walk_completed = false; complete(&control->completion); - mutex_lock(&ctx->walk_control_lock); ctx->walk_control = NULL; - mutex_unlock(&ctx->walk_control_lock); } /* @@ -1850,6 +1845,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (!has_schemes_to_apply) return; + mutex_lock(&c->walk_control_lock); damon_for_each_target(t, c) { damon_for_each_region_safe(r, next_r, t) damon_do_apply_schemes(c, t, r); @@ -1864,6 +1860,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) c->attrs.aggr_interval) / sample_interval; s->last_applied = NULL; } + mutex_unlock(&c->walk_control_lock); } /* -- 2.51.0 From 6e80c0aaad469e0a923ea0d7018fb1464e992018 Mon Sep 17 00:00:00 2001 From: Bertrand Wlodarczyk Date: Mon, 10 Feb 2025 17:07:49 +0100 Subject: [PATCH 12/16] vmscan, cleanup: add for_each_managed_zone_pgdat macro The macro is introduced to eliminate redundancy in the repeated iteration over managed zones in pgdat data structure, reducing the potential for errors. This change doesn't introduce any functional modifications. Due to concentration of the pattern in vmscan.c the macro is placed locally in that file. Link: https://lkml.kernel.org/r/20250210160818.686-1-bertrand.wlodarczyk@intel.com Signed-off-by: Bertrand Wlodarczyk Reviewed-by: Tim Chen Cc: Andy Whitcroft Cc: Dave Hansen Cc: Dwaipayan Ray Cc: Joe Perches Cc: Lukas Bulwahn Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/vmscan.c | 83 +++++++++++++++++++++-------------------------------- 1 file changed, 32 insertions(+), 51 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index bc1826020159..fcca38bc640f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -271,6 +271,25 @@ static int sc_swappiness(struct scan_control *sc, struct mem_cgroup *memcg) } #endif +/* for_each_managed_zone_pgdat - helper macro to iterate over all managed zones in a pgdat up to + * and including the specified highidx + * @zone: The current zone in the iterator + * @pgdat: The pgdat which node_zones are being iterated + * @idx: The index variable + * @highidx: The index of the highest zone to return + * + * This macro iterates through all managed zones up to and including the specified highidx. + * The zone iterator enters an invalid state after macro call and must be reinitialized + * before it can be used again. + */ +#define for_each_managed_zone_pgdat(zone, pgdat, idx, highidx) \ + for ((idx) = 0, (zone) = (pgdat)->node_zones; \ + (idx) <= (highidx); \ + (idx)++, (zone)++) \ + if (!managed_zone(zone)) \ + continue; \ + else + static void set_task_reclaim_state(struct task_struct *task, struct reclaim_state *rs) { @@ -396,13 +415,9 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, { unsigned long size = 0; int zid; + struct zone *zone; - for (zid = 0; zid <= zone_idx; zid++) { - struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, lruvec_pgdat(lruvec), zid, zone_idx) { if (!mem_cgroup_disabled()) size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid); else @@ -495,7 +510,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) { int reclaimable = 0, write_pending = 0; int i; - + struct zone *zone; /* * If kswapd is disabled, reschedule if necessary but do not * throttle as the system is likely near OOM. @@ -508,12 +523,7 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) * throttle as throttling will occur when the folios cycle * towards the end of the LRU if still under writeback. */ - for (i = 0; i < MAX_NR_ZONES; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, i, MAX_NR_ZONES - 1) { reclaimable += zone_reclaimable_pages(zone); write_pending += zone_page_state_snapshot(zone, NR_ZONE_WRITE_PENDING); @@ -2372,17 +2382,13 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc) unsigned long total_high_wmark = 0; unsigned long free, anon; int z; + struct zone *zone; free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); file = node_page_state(pgdat, NR_ACTIVE_FILE) + node_page_state(pgdat, NR_INACTIVE_FILE); - for (z = 0; z < MAX_NR_ZONES; z++) { - struct zone *zone = &pgdat->node_zones[z]; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, z, MAX_NR_ZONES - 1) { total_high_wmark += high_wmark_pages(zone); } @@ -5851,6 +5857,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, unsigned long pages_for_compaction; unsigned long inactive_lru_pages; int z; + struct zone *zone; /* If not in reclaim/compaction mode, stop */ if (!in_reclaim_compaction(sc)) @@ -5870,11 +5877,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return false; /* If compaction would go ahead or the allocation would succeed, stop */ - for (z = 0; z <= sc->reclaim_idx; z++) { - struct zone *zone = &pgdat->node_zones[z]; - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { /* Allocation can already succeed, nothing to do */ if (zone_watermark_ok(zone, sc->order, min_wmark_pages(zone), sc->reclaim_idx, 0)) @@ -6401,11 +6404,7 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) return true; - for (i = 0; i <= ZONE_NORMAL; i++) { - zone = &pgdat->node_zones[i]; - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, i, ZONE_NORMAL) { if (!zone_reclaimable_pages(zone)) continue; @@ -6710,12 +6709,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) * Check watermarks bottom-up as lower zones are more likely to * meet watermarks. */ - for (i = 0; i <= highest_zoneidx; i++) { - zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) mark = promo_wmark_pages(zone); else @@ -6800,11 +6794,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, /* Reclaim a number of pages proportional to the number of zones */ sc->nr_to_reclaim = 0; - for (z = 0; z <= sc->reclaim_idx; z++) { - zone = pgdat->node_zones + z; - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, z, sc->reclaim_idx) { sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX); } @@ -6835,12 +6825,7 @@ update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active) int i; struct zone *zone; - for (i = 0; i <= highest_zoneidx; i++) { - zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { if (active) set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags); else @@ -6901,11 +6886,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) * stall or direct reclaim until kswapd is finished. */ nr_boost_reclaim = 0; - for (i = 0; i <= highest_zoneidx; i++) { - zone = pgdat->node_zones + i; - if (!managed_zone(zone)) - continue; - + for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { nr_boost_reclaim += zone->watermark_boost; zone_boosts[i] = zone->watermark_boost; } -- 2.51.0 From 67254c7d70b63797a54173fbe05cd6552aca11d4 Mon Sep 17 00:00:00 2001 From: I Hsin Cheng Date: Mon, 10 Feb 2025 02:10:23 +0800 Subject: [PATCH 13/16] maple_tree: correct comment for mas_start() There's no mas->status of "mas_start", what the function is checking is whether mas->status equals to "ma_start". Correct the comment for the function. Link: https://lkml.kernel.org/r/20250209181023.228856-1-richard120310@gmail.com Signed-off-by: I Hsin Cheng Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index f7153ade1be5..42c65974a56c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1353,7 +1353,7 @@ static void mas_node_count(struct ma_state *mas, int count) * mas_start() - Sets up maple state for operations. * @mas: The maple state. * - * If mas->status == mas_start, then set the min, max and depth to + * If mas->status == ma_start, then set the min, max and depth to * defaults. * * Return: -- 2.51.0 From c2661f5fe888c14eb1f09ed23e74213366fa70f3 Mon Sep 17 00:00:00 2001 From: David Laight Date: Sun, 9 Feb 2025 17:47:11 +0000 Subject: [PATCH 14/16] mm: remove the access_ok() call from gup_fast_fallback() Historiaclly the code relied on access_ok() to validate the address range. Commit 26f4c328079d7 added an explicit wrap check before access_ok(). Commit c28b1fc70390d then changed the wrap test to use check_add_overflow(). Commit 6014bc27561f2 relaxed the checks in x86-64's access_ok() and added an explicit check for TASK_SIZE here to make up for it. That left a pointless access_ok() call with its associated 'lfence' that can never actually fail. So just delete the test. Link: https://lkml.kernel.org/r/20250209174711.60889-1-david.laight.linux@gmail.com Signed-off-by: David Laight Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Cc: Thomas Gleixner Cc: Andy Lutomirks^H^Hski Cc: Borislav Betkov Cc: Dave Hansen Cc: Ingo Molnar Cc: Jan Kara Cc: John Hubbard Cc: Linus Torvalds Cc: Peter Xu Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton --- mm/gup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 61e751baf862..e42e4fdaf765 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2760,7 +2760,7 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * * *) ptes can be read atomically by the architecture. * - * *) access_ok is sufficient to validate userspace address ranges. + * *) valid user addesses are below TASK_MAX_SIZE * * The last two assumptions can be relaxed by the addition of helper functions. * @@ -3414,8 +3414,6 @@ static int gup_fast_fallback(unsigned long start, unsigned long nr_pages, return -EOVERFLOW; if (end > TASK_SIZE_MAX) return -EFAULT; - if (unlikely(!access_ok((void __user *)start, len))) - return -EFAULT; nr_pinned = gup_fast(start, end, gup_flags, pages); if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY) -- 2.51.0 From 3fae696393c7f07aac5db249fc2c537e8744c831 Mon Sep 17 00:00:00 2001 From: Eric Salem Date: Sat, 8 Feb 2025 20:36:36 -0600 Subject: [PATCH 15/16] selftests: mm: fix typo Fix misspelling. Link: https://lkml.kernel.org/r/77e0e915-36c3-4c95-84b8-0b73aaa17951@gmail.com Signed-off-by: Eric Salem Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 7ad6ba660c7d..31e0c8a3110d 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -323,7 +323,7 @@ int uffd_test_ctx_init(uint64_t features, const char **errmsg) ret = userfaultfd_open(&features); if (ret) { if (errmsg) - *errmsg = "possible lack of priviledge"; + *errmsg = "possible lack of privilege"; return ret; } -- 2.51.0 From cedae19487a368d899301c16ab576b0aedb9396d Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Sat, 8 Feb 2025 15:52:54 +0000 Subject: [PATCH 16/16] mm: refactor rmap_walk_file() to separate out traversal logic MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Patch series "expose mapping wrprotect, fix fb_defio use", v3. Right now the only means by which we can write-protect a range using the reverse mapping is via folio_mkclean(). However this is not always the appropriate means of doing so, specifically in the case of the framebuffer deferred I/O logic (fb_defio enabled by CONFIG_FB_DEFERRED_IO). There, kernel pages are mapped read-only and write-protect faults used to batch up I/O operations. Each time the deferred work is done, folio_mkclean() is used to mark the framebuffer page as having had I/O performed on it. However doing so requires the kernel page (perhaps allocated via vmalloc()) to have its page->mapping, index fields set so the rmap can find everything that maps it in order to write-protect. This is problematic as firstly, these fields should not be set for kernel-allocated memory, and secondly these are not folios (it's not user memory) and page->index, mapping fields are now deprecated and soon to be removed. The removal of these fields is imminent, rendering this series more urgent than it might first appear. The implementers cannot be blamed for having used this however, as there is simply no other way of performing this operation correctly. This series fixes this - we provide the mapping_wrprotect_range() function to allow the reverse mapping to be used to look up mappings from the page cache object (i.e. its address_space pointer) at a specific offset. The fb_defio logic already stores this offset, and can simply be expanded to keep track of the page cache object, so the change then becomes straight-forward. This series should have no functional change. This patch (of 3): In order to permit the traversal of the reverse mapping at a specified mapping and offset rather than those specified by an input folio, we need to separate out the portion of the rmap file logic which deals with this traversal from those parts of the logic which interact with the folio. This patch achieves this by adding a new static __rmap_walk_file() function which rmap_walk_file() invokes. This function permits the ability to pass NULL folio, on the assumption that the caller has provided for this correctly in the callbacks specified in the rmap_walk_control object. Though it provides for this, and adds debug asserts to ensure that, should a folio be specified, these are equal to the mapping and offset specified in the folio, there should be no functional change as a result of this patch. The reason for adding this is to enable for future changes to permit users to be able to traverse mappings of userland-mapped kernel memory, write-protecting those mappings to enable page_mkwrite() or pfn_mkwrite() fault handlers to be retriggered on subsequent dirty. Link: https://lkml.kernel.org/r/cover.1739029358.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/0d1acec0cba1e5a12f9b53efcabc397541c90517.1739029358.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Helge Deller Cc: Jaya Kumar Cc: Kajtar Zsolt Cc: Maíra Canal Cc: Matthew Wilcox [English fixes] Cc: Simona Vetter Cc: Thomas Zimemrmann Signed-off-by: Andrew Morton --- mm/rmap.c | 79 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 26 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 0f760b93fc0a..3da1ca49881c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2614,35 +2614,37 @@ static void rmap_walk_anon(struct folio *folio, anon_vma_unlock_read(anon_vma); } -/* - * rmap_walk_file - do something to file page using the object-based rmap method - * @folio: the folio to be handled - * @rwc: control variable according to each walk type - * @locked: caller holds relevant rmap lock +/** + * __rmap_walk_file() - Traverse the reverse mapping for a file-backed mapping + * of a page mapped within a specified page cache object at a specified offset. * - * Find all the mappings of a folio using the mapping pointer and the vma chains - * contained in the address_space struct it points to. + * @folio: Either the folio whose mappings to traverse, or if NULL, + * the callbacks specified in @rwc will be configured such + * as to be able to look up mappings correctly. + * @mapping: The page cache object whose mapping VMAs we intend to + * traverse. If @folio is non-NULL, this should be equal to + * folio_mapping(folio). + * @pgoff_start: The offset within @mapping of the page which we are + * looking up. If @folio is non-NULL, this should be equal + * to folio_pgoff(folio). + * @nr_pages: The number of pages mapped by the mapping. If @folio is + * non-NULL, this should be equal to folio_nr_pages(folio). + * @rwc: The reverse mapping walk control object describing how + * the traversal should proceed. + * @locked: Is the @mapping already locked? If not, we acquire the + * lock. */ -static void rmap_walk_file(struct folio *folio, - struct rmap_walk_control *rwc, bool locked) +static void __rmap_walk_file(struct folio *folio, struct address_space *mapping, + pgoff_t pgoff_start, unsigned long nr_pages, + struct rmap_walk_control *rwc, bool locked) { - struct address_space *mapping = folio_mapping(folio); - pgoff_t pgoff_start, pgoff_end; + pgoff_t pgoff_end = pgoff_start + nr_pages - 1; struct vm_area_struct *vma; - /* - * The page lock not only makes sure that page->mapping cannot - * suddenly be NULLified by truncation, it makes sure that the - * structure at mapping cannot be freed and reused yet, - * so we can safely take mapping->i_mmap_rwsem. - */ - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - - if (!mapping) - return; + VM_WARN_ON_FOLIO(folio && mapping != folio_mapping(folio), folio); + VM_WARN_ON_FOLIO(folio && pgoff_start != folio_pgoff(folio), folio); + VM_WARN_ON_FOLIO(folio && nr_pages != folio_nr_pages(folio), folio); - pgoff_start = folio_pgoff(folio); - pgoff_end = pgoff_start + folio_nr_pages(folio) - 1; if (!locked) { if (i_mmap_trylock_read(mapping)) goto lookup; @@ -2657,8 +2659,7 @@ static void rmap_walk_file(struct folio *folio, lookup: vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff_start, pgoff_end) { - unsigned long address = vma_address(vma, pgoff_start, - folio_nr_pages(folio)); + unsigned long address = vma_address(vma, pgoff_start, nr_pages); VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); @@ -2671,12 +2672,38 @@ lookup: if (rwc->done && rwc->done(folio)) goto done; } - done: if (!locked) i_mmap_unlock_read(mapping); } +/* + * rmap_walk_file - do something to file page using the object-based rmap method + * @folio: the folio to be handled + * @rwc: control variable according to each walk type + * @locked: caller holds relevant rmap lock + * + * Find all the mappings of a folio using the mapping pointer and the vma chains + * contained in the address_space struct it points to. + */ +static void rmap_walk_file(struct folio *folio, + struct rmap_walk_control *rwc, bool locked) +{ + /* + * The folio lock not only makes sure that folio->mapping cannot + * suddenly be NULLified by truncation, it makes sure that the structure + * at mapping cannot be freed and reused yet, so we can safely take + * mapping->i_mmap_rwsem. + */ + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + + if (!folio->mapping) + return; + + __rmap_walk_file(folio, folio->mapping, folio->index, + folio_nr_pages(folio), rwc, locked); +} + void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc) { if (unlikely(folio_test_ksm(folio))) -- 2.51.0