From a5c96dfd47d88658ac9cdece96e98c2ef17ab465 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 26 Feb 2025 17:54:02 +0530 Subject: [PATCH 01/16] docs: arm64: drop PTDUMP config options from ptdump.rst Both GENERIC_PTDUMP and PTDUMP_CORE are not user-selectable config options. Just drop these from the documentation. Link: https://lkml.kernel.org/r/20250226122404.1927473-4-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Suggested-by: Steven Price Acked-by: Catalin Marinas Cc: Will Deacon Cc: Jonathan Corbet Cc: Christophe Leroy Cc: Heiko Carstens Cc: Ingo Molnar Cc: Madhavan Srinivasan Cc: Marc Zyngier Cc: Mark Rutland Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Thomas Gleixner Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- Documentation/arch/arm64/ptdump.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/arch/arm64/ptdump.rst b/Documentation/arch/arm64/ptdump.rst index 5dcfc5d7cddf..51eb902ba41a 100644 --- a/Documentation/arch/arm64/ptdump.rst +++ b/Documentation/arch/arm64/ptdump.rst @@ -22,8 +22,6 @@ offlining of memory being accessed by the ptdump code. In order to dump the kernel page tables, enable the following configurations and mount debugfs:: - CONFIG_GENERIC_PTDUMP=y - CONFIG_PTDUMP_CORE=y CONFIG_PTDUMP_DEBUGFS=y mount -t debugfs nodev /sys/kernel/debug -- 2.51.0 From 3f54872454a927a2b5f9fb3e2d3cdbd51b3666b7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 26 Feb 2025 17:54:03 +0530 Subject: [PATCH 02/16] mm: make DEBUG_WX dependent on GENERIC_PTDUMP DEBUG_WX selects PTDUMP_CORE without even ensuring that the given platform implements GENERIC_PTDUMP. This problem has been latent until now, as all the platforms subscribing to ARCH_HAS_DEBUG_WX also subscribe to GENERIC_PTDUMP. Link: https://lkml.kernel.org/r/20250226122404.1927473-5-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Steven Price Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Heiko Carstens Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Madhavan Srinivasan Cc: Marc Zyngier Cc: Mark Rutland Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 41a58536531d..a51a1149909a 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -186,6 +186,7 @@ config ARCH_HAS_DEBUG_WX config DEBUG_WX bool "Warn on W+X mappings at boot" depends on ARCH_HAS_DEBUG_WX + depends on GENERIC_PTDUMP depends on MMU select PTDUMP_CORE help -- 2.51.0 From f9aad622006bd64c28fdf73c03a1c5139fcbf049 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 26 Feb 2025 17:54:04 +0530 Subject: [PATCH 03/16] mm: rename GENERIC_PTDUMP and PTDUMP_CORE Platforms subscribe to the generic ptdump implementation via GENERIC_PTDUMP, but generic ptdump gets enabled via PTDUMP_CORE. This combination of configs is confusing, as the names sound very similar and do not differentiate between a platform's feature subscription and feature enablement for ptdump. Rename the configs to ARCH_HAS_PTDUMP and PTDUMP, making the distinction clearer and improving readability.
Link: https://lkml.kernel.org/r/20250226122404.1927473-6-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy (powerpc) Acked-by: Catalin Marinas [arm64] Cc: Will Deacon Cc: Jonathan Corbet Cc: Marc Zyngier Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Christophe Leroy Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Steven Price Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 2 +- arch/arm64/include/asm/ptdump.h | 4 ++-- arch/arm64/kvm/Kconfig | 4 ++-- arch/arm64/mm/Makefile | 2 +- arch/powerpc/Kconfig | 2 +- arch/powerpc/mm/Makefile | 2 +- arch/riscv/Kconfig | 2 +- arch/riscv/mm/Makefile | 2 +- arch/s390/Kconfig | 2 +- arch/s390/mm/Makefile | 2 +- arch/x86/Kconfig | 2 +- arch/x86/Kconfig.debug | 2 +- arch/x86/mm/Makefile | 2 +- mm/Kconfig.debug | 12 ++++++------ mm/Makefile | 2 +- 15 files changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 940343beb3d4..5cf688ee01b7 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -41,6 +41,7 @@ config ARM64 select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_NONLEAF_PMD_YOUNG if ARM64_HAFT + select ARCH_HAS_PTDUMP select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_HW_PTE_YOUNG @@ -157,7 +158,6 @@ config ARM64 select GENERIC_IRQ_SHOW_LEVEL select GENERIC_LIB_DEVMEM_IS_ALLOWED select GENERIC_PCI_IOMAP - select GENERIC_PTDUMP select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index 6cf4aae05219..b2931d1ae0fb 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -7,7 +7,7 @@ #include -#ifdef CONFIG_PTDUMP_CORE +#ifdef CONFIG_PTDUMP #include #include @@ -70,6 +70,6 @@ static inline void ptdump_debugfs_register(struct ptdump_info *info, #else static inline void note_page(struct ptdump_state *pt_st, unsigned long addr, int level, u64 val) { } -#endif /* CONFIG_PTDUMP_CORE */ +#endif /* CONFIG_PTDUMP */ #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index ead632ad01b4..096e45acadb2 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -71,8 +71,8 @@ config PTDUMP_STAGE2_DEBUGFS depends on KVM depends on DEBUG_KERNEL depends on DEBUG_FS - depends on GENERIC_PTDUMP - select PTDUMP_CORE + depends on ARCH_HAS_PTDUMP + select PTDUMP default n help Say Y here if you want to show the stage-2 kernel pagetables diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index fc92170a8f37..c26489cf96cd 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -5,7 +5,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ context.o proc.o pageattr.o fixmap.o obj-$(CONFIG_ARM64_CONTPTE) += contpte.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_PTDUMP_CORE) += ptdump.o +obj-$(CONFIG_PTDUMP) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o obj-$(CONFIG_TRANS_TABLE) += trans_pgd.o obj-$(CONFIG_TRANS_TABLE) += trans_pgd-asm.o diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 424f188e62d9..6f1ae41dcf85 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -148,6 +148,7 @@ config PPC select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API select ARCH_HAS_PREEMPT_LAZY + select ARCH_HAS_PTDUMP select ARCH_HAS_PTE_DEVMAP if PPC_BOOK3S_64 select 
ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE && PPC_BOOK3S_64 @@ -206,7 +207,6 @@ config PPC select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_PCI_IOMAP if PCI - select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_VDSO_TIME_NS diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 0fe2f085c05a..8c1582b2987d 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -15,5 +15,5 @@ obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o -obj-$(CONFIG_PTDUMP_CORE) += ptdump/ +obj-$(CONFIG_PTDUMP) += ptdump/ obj-$(CONFIG_KASAN) += kasan/ diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 7612c52e9b1e..353cf41d01f4 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -43,6 +43,7 @@ config RISCV select ARCH_HAS_PMEM_API select ARCH_HAS_PREEMPT_LAZY select ARCH_HAS_PREPARE_SYNC_CORE_CMD + select ARCH_HAS_PTDUMP if MMU select ARCH_HAS_PTE_DEVMAP if 64BIT && MMU select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_DIRECT_MAP if MMU @@ -112,7 +113,6 @@ config RISCV select GENERIC_IRQ_SHOW_LEVEL select GENERIC_LIB_DEVMEM_IS_ALLOWED select GENERIC_PCI_IOMAP - select GENERIC_PTDUMP if MMU select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL if MMU && 64BIT diff --git a/arch/riscv/mm/Makefile b/arch/riscv/mm/Makefile index cbe4d775ef56..b916a68d324a 100644 --- a/arch/riscv/mm/Makefile +++ b/arch/riscv/mm/Makefile @@ -19,7 +19,7 @@ obj-y += context.o obj-y += pmem.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_PTDUMP_CORE) += ptdump.o +obj-$(CONFIG_PTDUMP) += ptdump.o obj-$(CONFIG_KASAN) += kasan_init.o ifdef CONFIG_KASAN diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 9c9ec08d78c7..dd9dd2f8e673 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -96,6 +96,7 @@ config S390 select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS select ARCH_HAS_PREEMPT_LAZY + select ARCH_HAS_PTDUMP select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SCALED_CPUTIME select ARCH_HAS_SET_DIRECT_MAP @@ -163,7 +164,6 @@ config S390 select GENERIC_CPU_VULNERABILITIES select GENERIC_ENTRY select GENERIC_GETTIMEOFDAY - select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_VDSO_TIME_NS diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index f6c2db7a8669..9726b91fe7e4 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -9,6 +9,6 @@ obj-y += page-states.o pageattr.o pgtable.o pgalloc.o extable.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_PTDUMP) += dump_pagetables.o obj-$(CONFIG_PGSTE) += gmap.o obj-$(CONFIG_PFAULT) += pfault.o diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cf49c130d1d0..bfd23a09b911 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -26,6 +26,7 @@ config X86_64 depends on 64BIT # Options that are inherently 64-bit kernel only: select ARCH_HAS_GIGANTIC_PAGE + select ARCH_HAS_PTDUMP select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 select ARCH_SUPPORTS_PER_VMA_LOCK select ARCH_SUPPORTS_HUGE_PFNMAP if TRANSPARENT_HUGEPAGE @@ -175,7 +176,6 @@ config X86 select GENERIC_IRQ_RESERVATION_MODE select GENERIC_IRQ_SHOW select GENERIC_PENDING_IRQ if SMP - select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD 
select GENERIC_TIME_VSYSCALL select GENERIC_GETTIMEOFDAY diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 1eb4d23cdaae..c95c3aaadf97 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -59,7 +59,7 @@ config EARLY_PRINTK_USB_XDBC config EFI_PGT_DUMP bool "Dump the EFI pagetable" depends on EFI - select PTDUMP_CORE + select PTDUMP help Enable this if you want to dump the EFI page table before enabling virtual mode. This can be used to debug miscellaneous diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 690fbf48e853..e0c99a8760ca 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -39,7 +39,7 @@ CFLAGS_fault.o := -I $(src)/../include/asm/trace obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_PTDUMP) += dump_pagetables.o obj-$(CONFIG_PTDUMP_DEBUGFS) += debug_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index a51a1149909a..32b65073d0cc 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -186,9 +186,9 @@ config ARCH_HAS_DEBUG_WX config DEBUG_WX bool "Warn on W+X mappings at boot" depends on ARCH_HAS_DEBUG_WX - depends on GENERIC_PTDUMP + depends on ARCH_HAS_PTDUMP depends on MMU - select PTDUMP_CORE + select PTDUMP help Generate a warning if any W+X mappings are found at boot. @@ -213,18 +213,18 @@ config DEBUG_WX If in doubt, say "Y". -config GENERIC_PTDUMP +config ARCH_HAS_PTDUMP bool -config PTDUMP_CORE +config PTDUMP bool config PTDUMP_DEBUGFS bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL depends on DEBUG_FS - depends on GENERIC_PTDUMP - select PTDUMP_CORE + depends on ARCH_HAS_PTDUMP + select PTDUMP help Say Y here if you want to show the kernel pagetable layout in a debugfs file. This information is only useful for kernel developers diff --git a/mm/Makefile b/mm/Makefile index 84b1127e43a5..e7f6bbf8ae5f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -139,7 +139,7 @@ obj-$(CONFIG_ZONE_DEVICE) += memremap.o obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o -obj-$(CONFIG_PTDUMP_CORE) += ptdump.o +obj-$(CONFIG_PTDUMP) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o -- 2.51.0 From b9585a3f3e0b30b3b60c85dc39f27ed3b06fb623 Mon Sep 17 00:00:00 2001 From: Zeng Jingxiang Date: Thu, 27 Feb 2025 16:22:23 +0800 Subject: [PATCH 04/16] mm/list_lru: mark the case where mlru is NULL as unlikely In the following memcg_list_lru_alloc() function, mlru is almost always NULL, so marking it as unlikely should save a function call in most cases; also reuse the mlru for the next attempt when the tree insertion fails.
do { xas_lock_irqsave(&xas, flags); if (!xas_load(&xas) && !css_is_dying(&pos->css)) { xas_store(&xas, mlru); if (!xas_error(&xas)) mlru = NULL; } xas_unlock_irqrestore(&xas, flags); } while (xas_nomem(&xas, GFP_KERNEL)); if (mlru) kfree(mlru); Link: https://lkml.kernel.org/r/20250227082223.1173847-1-jingxiangzeng.cas@gmail.com Signed-off-by: Zeng Jingxiang Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202412290924.UTP7GH2Z-lkp@intel.com/ Suggested-by: Johannes Weiner Reviewed-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Chengming Zhou Cc: Jingxiang Zeng Cc: Kairui Song Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/list_lru.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 7d69434c70e0..490473af3122 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -510,7 +510,7 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { unsigned long flags; - struct list_lru_memcg *mlru; + struct list_lru_memcg *mlru = NULL; struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); @@ -535,9 +535,11 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, parent = parent_mem_cgroup(pos); } - mlru = memcg_init_list_lru_one(lru, gfp); - if (!mlru) - return -ENOMEM; + if (!mlru) { + mlru = memcg_init_list_lru_one(lru, gfp); + if (!mlru) + return -ENOMEM; + } xas_set(&xas, pos->kmemcg_id); do { xas_lock_irqsave(&xas, flags); @@ -548,10 +550,11 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, } xas_unlock_irqrestore(&xas, flags); } while (xas_nomem(&xas, gfp)); - if (mlru) - kfree(mlru); } while (pos != memcg && !css_is_dying(&pos->css)); + if (unlikely(mlru)) + kfree(mlru); + return xas_error(&xas); } #else -- 2.51.0 From 1eb3471bf5749ff3769ec52723bd9b8d773c7a62 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 3 Mar 2025 14:17:19 -0800 Subject: [PATCH 05/16] mm/damon: add data structure for monitoring intervals auto-tuning Patch series "mm/damon: auto-tune aggregation interval". DAMON requires time-consuming and repetitive aggregation interval tuning. Introduce a feature for automating it using a feedback loop that aims at a target amount of observed access events, like auto-exposing cameras. Background: Access Frequency Monitoring and Aggregation Interval ================================================================ DAMON checks if each memory element (damon_region) is accessed or not for every user-specified time interval called the 'sampling interval'. It aggregates the check results into a per-element counter called 'nr_accesses'. DAMON users can read the counters to get the access temperature of a given element. The counters are reset for every 'aggregation interval', another user-specified time interval. This can be illustrated as DAMON continuously capturing a snapshot of the access events that happened and were captured within the last aggregation interval. This implies the aggregation interval plays a key role in the quality of the snapshots, like the camera exposure time. If it is too short, the amount of access events that happened and were captured for each snapshot is small, so each snapshot will show not many interesting things, but just a cold and dark world with hopefully one pale blue dot or two. If it is too long, too many events are aggregated in a single shot, so each snapshot will look like a world of flames, or Muspellheim.
It will be difficult to find practical insights in both cases. Problem: Time Consuming and Repetitive Tuning ============================================= The appropriate length of the aggregation interval depends on how frequently the system and workloads are making access events that DAMON can observe. Hence, users have to tune the interval with an excessive amount of tests on the target system and workloads. If the system and workloads are changed, the tuning should be done again. If the characteristic of the workloads is dynamic, it becomes more challenging. It is therefore time-consuming and repetitive. The tuning challenge mainly stems from the wrong question. It is not asking users what quality of monitoring results they want, but how DAMON should operate for their hidden goal. To give the right answer, users need to fully understand DAMON's mechanisms and the characteristics of their workloads. Users shouldn't be asked to understand the underlying mechanism. Understanding the characteristics of the workloads shouldn't be the role of users but of DAMON. Aim-oriented Feedback-driven Auto-Tuning ========================================= Fortunately, the appropriate length of the aggregation interval can be inferred using a feedback loop. If the current snapshots are showing not much interesting information, in other words, if they show only rare access events, increasing the aggregation interval helps, and vice versa. We tested this theory on a few real-world workloads, and documented one of the experiences in an official DAMON monitoring intervals tuning guideline. Since it is a simple theory that requires repeatable tries, it can be a good job for machines. Based on the guideline's theory, we design an automation of aggregation interval tuning, in a way similar to that of the camera auto-exposure feature. It defines the amount of interesting information as the ratio of the access events that DAMON actually observed to the theoretical maximum amount of such events within each snapshot. Events are accounted at byte and sampling-attempt granularity. For example, let's say there is a region of 'X' bytes in size. DAMON tried access check sampling for the region 'Y' times in total for a given aggregation. Among the 'Y' attempts, 'Z' times it showed positive results. Then, the theoretical maximum number of access events for the region is 'X * Y'. And the number of access events that DAMON has observed for the region is 'X * Z'. The amount of the interesting information is '(X * Z) / (X * Y)'. Note that each snapshot would have multiple regions. Users can set an arbitrary value of the ratio as their target. Once the target is set, the automation periodically measures the current value of the ratio and increases or decreases the aggregation interval if the ratio value is lower or higher than the target. The amount of the change is proportional to the distance between the current and the target values. To keep the auto-tuning from going too far, users can set the minimum and the maximum aggregation interval times. Changing only the aggregation interval while the sampling interval is kept makes the maximum level of access frequency in each snapshot, or the discernment of regions, inconsistent. Also, an unnecessarily short sampling interval causes meaningless monitoring overhead. The automation therefore adjusts the sampling interval together with the aggregation interval, while keeping the ratio between the two intervals. Users can set the ratio, or the discernment.
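For illustration, below is a minimal user-space C sketch of the score computation and proportional adjustment described above. It is not DAMON code: the names, the per-region numbers, the 4% target and the interval bounds are all hypothetical example values, and the in-kernel logic additionally dampens rapid reductions through its feedback function.

  #include <stdio.h>

  /* hypothetical snapshot data: per-region size and positive sample count */
  struct region { unsigned long long sz_bytes, nr_accesses; };

  int main(void)
  {
  	struct region regions[] = { { 4096, 10 }, { 1 << 20, 0 }, { 1 << 16, 3 } };
  	unsigned long long aggr_samples = 20;	/* aggregation / sampling ratio */
  	unsigned long long target_bp = 400;	/* aimed events ratio: 4% in bp */
  	unsigned long long sample_us = 5000;	/* current sampling interval */
  	unsigned long long min_us = 1000, max_us = 100000;	/* tuning bounds */
  	unsigned long long max_events = 0, events = 0;

  	for (int i = 0; i < 3; i++) {
  		/* 'X * Y': theoretical maximum access events of the region */
  		max_events += regions[i].sz_bytes * aggr_samples;
  		/* 'X * Z': access events actually observed for the region */
  		events += regions[i].sz_bytes * regions[i].nr_accesses;
  	}
  	/* score: observed events as bp (1/10,000) of the aimed amount */
  	unsigned long long target = max_events * target_bp / 10000;
  	unsigned long long score_bp = events * 10000 / target;
  	if (score_bp == 0)
  		score_bp = 1;
  	/* fewer events than aimed (score below 10,000 bp) grows the
  	 * intervals in proportion to the distance; more events shrinks
  	 * them; the result is clamped to the user-set bounds */
  	sample_us = sample_us * 10000 / score_bp;
  	if (sample_us < min_us)
  		sample_us = min_us;
  	if (sample_us > max_us)
  		sample_us = max_us;
  	printf("score %llu bp -> next sampling interval %llu us\n",
  	       score_bp, sample_us);
  	return 0;
  }

Keeping the aggregation interval at a fixed multiple of the tuned sampling interval (aggr_samples here) preserves the discernment of regions described above.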
Discussion ========== The modified question (the aimed amount of access events, or light, in each snapshot) is easy to answer for both the users and the kernel. If users are interested in finding more cold regions, the value should be lower, and vice versa. If users have no idea, the kernel can suggest a fair default value based on some theories and experiments. For example, based on the Pareto principle (80/20 rule), we could expect a 20% target ratio to capture 80% of real access events. Since 80% might be too high, applying the rule once again, 4% (20% * 20%) may capture about 64% (80% * 80%) of real access events. The sampling to aggregation intervals ratio and the min/max aggregation intervals are also arguably easy to answer. What users want is discernment of regions for efficient system operation, for example, X amount of colder regions or Y amount of warmer regions, not exactly how many times each cache line is accessed at nanosecond granularity. The appropriate min/max aggregation interval can be set relatively naively, and may better be set based on the aimed monitoring overhead. Since the sampling interval directly decides the overhead, setting the bounds based on the sampling interval can be easy. In my experience, I'd argue an intervals ratio of 0.05, and a 5 milliseconds to 20 seconds sampling interval range (100 milliseconds to 400 seconds aggregation interval) can be a good default suggestion. Evaluation ========== On a machine running a real world server workload, I ran DAMON to monitor its physical address space for about 23 hours, with this feature turned on. We set it to tune the sampling interval in a range from 5 milliseconds to 10 seconds, aiming at a 4% DAMON-observed access ratio per three aggregation intervals. The exact command I used is as below. damo start --monitoring_intervals_goal 4% 3 5ms 10s --damos_action stat During the test run, DAMON continuously updated the sampling and aggregation intervals as designed, within the given range. For all the time, DAMON was able to find intervals that meet the target access events ratio within the given intervals range (sampling interval between 5 milliseconds and 10 seconds). For most of the time, the tuned sampling interval converged to 300-400 milliseconds. It made only a small amount of changes within the range. The average of the tuned sampling interval during the test was about 380 milliseconds. The workload periodically gets less load and decreases its CPU usage. Presumably this also caused it to make fewer memory access events. Reacting to such events, DAMON also increased the intervals as expected. It was still able to find the optimum interval that satisfies the target access ratio within the given intervals range. Usually it converged to about 5 seconds. Once the workload got a normal amount of load again, DAMON reactively reduced the intervals to the normal range. I collected and visualized DAMON's monitoring results on the server a few times. Every time, the visualized access pattern looked diverse and balanced, not biased to only cold or hot pages. Let me show some of the snapshots that I collected near the end of the test (after about 23 hours had passed since starting DAMON on the server). The recency histogram looks as below. Please note that this visualization shows only very coarse-grained information. For more details about the visualization format, please refer to the DAMON user-space tool documentation[1].
# ./damo report access --style recency-sz-hist --tried_regions_of 0 0 0 --access_rate 0 0 [-19 h 7 m 45.514 s, -17 h 12 m 58.963 s) 6.198 GiB |**** | [-17 h 12 m 58.963 s, -15 h 18 m 12.412 s) 0 B | | [-15 h 18 m 12.412 s, -13 h 23 m 25.860 s) 0 B | | [-13 h 23 m 25.860 s, -11 h 28 m 39.309 s) 0 B | | [-11 h 28 m 39.309 s, -9 h 33 m 52.757 s) 0 B | | [-9 h 33 m 52.757 s, -7 h 39 m 6.206 s) 0 B | | [-7 h 39 m 6.206 s, -5 h 44 m 19.654 s) 0 B | | [-5 h 44 m 19.654 s, -3 h 49 m 33.103 s) 0 B | | [-3 h 49 m 33.103 s, -1 h 54 m 46.551 s) 0 B | | [-1 h 54 m 46.551 s, -0 ns) 16.967 GiB |********* | [-0 ns, --6886551440000 ns) 38.835 GiB |********************| memory bw estimate: 9.425 GiB per second total size: 62.000 GiB It shows about 38 GiB of memory was accessed at least once within the last aggregation interval (given the ~300 milliseconds tuned sampling interval, this is about six seconds). This is about 61% of the total memory. In other words, DAMON found the warmest 61% memory of the system. The number is particularly interesting given our Pareto principle based theory for the tuning goal value. We set it as 20% of 20% (4%), thinking it would capture 80% of 80% (64%) of real access events. And it found 61% hot memory, or working set. Nevertheless, to make the theory clearer, much more discussion and tests would be needed. At the moment, nonetheless, we can say making the target value higher helps with finding more hot memory regions. The histogram also shows an amount of cold memory. About 17 GiB of the system's memory has not been accessed for at least the last aggregation interval (about six seconds), and at most for about the last two hours. The real longest unaccessed time of the 17 GiB memory was about 19 minutes, though. This is a limitation of this visualization format. It further found a very cold 6 GiB of memory, which has not been accessed for at least the last 17 hours and at most 19 hours. What about the hot memory distribution? To see this, I captured and visualized the snapshot as an access temperature histogram. Again, please refer to the DAMON user-space tool documentation[1] for the format and what access temperature means. Both the visualization and the metric show only very coarse-grained and limited information. The resulting histogram looks like below. # ./damo report access --style temperature-sz-hist --tried_regions_of 0 0 0 [-6,840,763,776,000, -5,501,580,939,800) 6.198 GiB |*** | [-5,501,580,939,800, -4,162,398,103,600) 0 B | | [-4,162,398,103,600, -2,823,215,267,400) 0 B | | [-2,823,215,267,400, -1,484,032,431,200) 0 B | | [-1,484,032,431,200, -144,849,595,000) 0 B | | [-144,849,595,000, 1,194,333,241,200) 55.802 GiB |********************| [1,194,333,241,200, 2,533,516,077,400) 4.000 KiB |* | [2,533,516,077,400, 3,872,698,913,600) 4.000 KiB |* | [3,872,698,913,600, 5,211,881,749,800) 8.000 KiB |* | [5,211,881,749,800, 6,551,064,586,000) 12.000 KiB |* | [6,551,064,586,000, 7,890,247,422,200) 4.000 KiB |* | memory bw estimate: 5.178 GiB per second total size: 62.000 GiB We can see most of the memory is in a similar access temperature range, and definitely some pages are extremely hot. To see the picture in more detail, let's capture and visualize the snapshot per DAMON-region, sorted by their access temperature. The total number of the regions was about 300. Due to the limited space, I'm showing only a few parts of the output here.
# ./damo report access --style hot --tried_regions_of 0 0 0 heatmap: 00000000888888889999999888888888888888888888888888888888888888888888888888888888 # min/max temperatures: -6,827,258,184,000, 17,589,052,500, column size: 793.600 MiB |999999999999999999999999999999999999999| 4.000 KiB access 100 % 18 h 9 m 43.918 s |999999999999999999999999999999999999999| 8.000 KiB access 100 % 17 h 56 m 5.351 s |999999999999999999999999999999999999999| 4.000 KiB access 100 % 15 h 24 m 19.634 s |999999999999999999999999999999999999999| 4.000 KiB access 100 % 14 h 10 m 55.606 s |999999999999999999999999999999999999999| 4.000 KiB access 100 % 11 h 34 m 18.993 s [...] |99999999999999999999999999999| 8.000 KiB access 100 % 1 m 27.945 s |11111111111111111111111111111| 80.000 KiB access 15 % 1 m 21.180 s |00000000000000000000000000000| 24.000 KiB access 5 % 1 m 21.180 s |00000000000000000000000000000| 5.919 GiB access 10 % 1 m 14.415 s |99999999999999999999999999999| 12.000 KiB access 100 % 1 m 7.650 s [...] |0| 4.000 KiB access 5 % 0 ns |0| 12.000 KiB access 5 % 0 ns |0| 188.000 KiB access 0 % 0 ns |0| 24.000 KiB access 0 % 0 ns |0| 48.000 KiB access 0 % 0 ns [...] |0000000000000000000000000000000| 8.000 KiB access 0 % 6 m 45.901 s |00000000000000000000000000000000| 36.000 KiB access 0 % 7 m 26.491 s |00000000000000000000000000000000| 4.000 KiB access 0 % 12 m 37.682 s |000000000000000000000000000000000| 8.000 KiB access 0 % 18 m 9.168 s |000000000000000000000000000000000| 16.000 KiB access 0 % 19 m 3.288 s |0000000000000000000000000000000000000000| 6.198 GiB access 0 % 18 h 57 m 52.582 s memory bw estimate: 8.798 GiB per second total size: 62.000 GiB We can see DAMON found small and extremely hot regions that were accessed at every access check sampling (once per about 300 milliseconds) for more than 10 hours. The access temperature rapidly decreases. DAMON was also able to find small and big regions that were not accessed for up to about 19 minutes. It even found an outlier cold region of 6 GiB that was not accessed for about 19 hours. It is unclear what the outlier region is, as of this writing. For the testing, DAMON was consuming about 0.1% of a single CPU's time. This is again an expected result, since DAMON was using about a 370 milliseconds sampling interval in most cases. # ps -p $kdamond_pid -o %cpu %CPU 0.1 I also ran similar tests against a kernel build workload and an in-memory cache workload benchmark[2]. Detailed results including the tuned intervals and captured access patterns were of course different, since those depend on the workloads. But the auto-tuning feature always worked as expected, like the above results for the real world workload. To wrap up, with the intervals auto-tuning feature, DAMON was able to capture access pattern snapshots of good quality on a real world server workload. The auto-tuning feature was able to adaptively react to the dynamic access patterns of the workload and reliably provide consistent monitoring results without manual human intervention. Also, the auto-tuning made DAMON consume only the necessary amount of resources for the required quality. References ========== [1] https://github.com/damonitor/damo/blob/next/USAGE.md#access-report-styles [2] https://github.com/facebookresearch/DCPerf/blob/main/packages/tao_bench/README.md This patch (of 8): Add data structures for DAMON sampling and aggregation intervals automatic tuning that aims at a specific amount of DAMON-observed access events per snapshot.
In more detail, define the data structure for the tuning goal, link it to the monitoring attributes data structure so that DAMON kernel API callers can make the request, and update the DAMON parameters setup function to respect the new parameter. Link: https://lkml.kernel.org/r/20250303221726.484227-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250303221726.484227-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 27 +++++++++++++++++++++++++++ mm/damon/core.c | 22 ++++++++++++++++++++++ 2 files changed, 49 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 242910b190c9..5f2609f24761 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -659,12 +659,38 @@ struct damon_call_control { bool canceled; }; +/** + * struct damon_intervals_goal - Monitoring intervals auto-tuning goal. + * + * @access_bp: Access events observation ratio to achieve in bp. + * @aggrs: Number of aggregations to achieve @access_bp within. + * @min_sample_us: Minimum resulting sampling interval in microseconds. + * @max_sample_us: Maximum resulting sampling interval in microseconds. + * + * DAMON automatically tunes &damon_attrs->sample_interval and + * &damon_attrs->aggr_interval, aiming for the ratio in bp (1/10,000) of + * DAMON-observed access events to the theoretical maximum amount within @aggrs + * aggregations to be the same as @access_bp. The logic increases + * &damon_attrs->aggr_interval and &damon_attrs->sample_interval in the same + * ratio if the current access events observation ratio is lower than the + * target for each @aggrs aggregations, and vice versa. + * + * If @aggrs is zero, the tuning is disabled and hence this struct is ignored. + */ +struct damon_intervals_goal { + unsigned long access_bp; + unsigned long aggrs; + unsigned long min_sample_us; + unsigned long max_sample_us; +}; + /** * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. * @ops_update_interval: The time between monitoring operations updates. + * @intervals_goal: Intervals auto-tuning goal. * @min_nr_regions: The minimum number of adaptive monitoring * regions. * @max_nr_regions: The maximum number of adaptive monitoring @@ -684,6 +710,7 @@ struct damon_attrs { unsigned long sample_interval; unsigned long aggr_interval; unsigned long ops_update_interval; + struct damon_intervals_goal intervals_goal; unsigned long min_nr_regions; unsigned long max_nr_regions; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index b1ce072b56f2..ad3b5c065cb8 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -615,6 +615,25 @@ static void damon_update_monitoring_results(struct damon_ctx *ctx, r, old_attrs, new_attrs); } +/* + * damon_valid_intervals_goal() - return if the intervals goal of @attrs is + * valid. + */ +static bool damon_valid_intervals_goal(struct damon_attrs *attrs) +{ + struct damon_intervals_goal *goal = &attrs->intervals_goal; + + /* tuning is disabled */ + if (!goal->aggrs) + return true; + if (goal->min_sample_us > goal->max_sample_us) + return false; + if (attrs->sample_interval < goal->min_sample_us || + goal->max_sample_us < attrs->sample_interval) + return false; + return true; +} + /** * damon_set_attrs() - Set attributes for the monitoring.
* @ctx: monitoring context @@ -635,6 +654,9 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) attrs->sample_interval : 1; struct damos *s; + if (!damon_valid_intervals_goal(attrs)) + return -EINVAL; + if (attrs->min_nr_regions < 3) return -EINVAL; if (attrs->min_nr_regions > attrs->max_nr_regions) -- 2.51.0 From f04b0fedbe714f822bd066b319a60faa39a985a1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 3 Mar 2025 14:17:20 -0800 Subject: [PATCH 06/16] mm/damon/core: implement intervals auto-tuning Implement the DAMON sampling and aggregation intervals auto-tuning mechanism as briefly described in 'struct damon_intervals_goal'. The core part for deciding the direction and amount of the changes is implemented reusing the feedback loop function which is being used for DAMOS quotas auto-tuning. Unlike the DAMOS quotas auto-tuning use case, limit the maximum decreasing amount after the adjustment to 50% of the current value, though. This is because rapid reductions of the intervals have no real merit and could unnecessarily increase the monitoring overhead. Link: https://lkml.kernel.org/r/20250303221726.484227-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- include/linux/damon.h | 16 +++++++++ mm/damon/core.c | 76 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/include/linux/damon.h b/include/linux/damon.h index 5f2609f24761..b3e2c793c1f4 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -713,6 +713,17 @@ struct damon_attrs { struct damon_intervals_goal intervals_goal; unsigned long min_nr_regions; unsigned long max_nr_regions; +/* private: internal use only */ + /* + * @aggr_interval to @sample_interval ratio. + * Core-external components call damon_set_attrs() with &damon_attrs + * where this field is unset. In that case, damon_set_attrs() sets this + * field of the resulting &damon_attrs. Core-internal components such + * as kdamond_tune_intervals() call damon_set_attrs() with &damon_attrs + * where this field is set. In that case, damon_set_attrs() just keeps + * it. + */ + unsigned long aggr_samples; }; /** @@ -761,6 +772,11 @@ struct damon_ctx { * update */ unsigned long next_ops_update_sis; + /* + * number of sample intervals that should be passed before next + * intervals tuning + */ + unsigned long next_intervals_tune_sis; /* for waiting until the execution of the kdamond_fn is started */ struct completion kdamond_started; /* for scheme quotas prioritization */ diff --git a/mm/damon/core.c b/mm/damon/core.c index ad3b5c065cb8..9d37d3664030 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -664,6 +664,10 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) if (attrs->sample_interval > attrs->aggr_interval) return -EINVAL; + /* calls from core-external don't set this.
*/ + if (!attrs->aggr_samples) + attrs->aggr_samples = attrs->aggr_interval / sample_interval; + ctx->next_aggregation_sis = ctx->passed_sample_intervals + attrs->aggr_interval / sample_interval; ctx->next_ops_update_sis = ctx->passed_sample_intervals + @@ -1301,6 +1305,65 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) } } +static unsigned long damon_get_intervals_score(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r; + unsigned long sz_region, max_access_events = 0, access_events = 0; + unsigned long target_access_events; + unsigned long goal_bp = c->attrs.intervals_goal.access_bp; + + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + sz_region = damon_sz_region(r); + max_access_events += sz_region * c->attrs.aggr_samples; + access_events += sz_region * r->nr_accesses; + } + } + target_access_events = max_access_events * goal_bp / 10000; + return access_events * 10000 / target_access_events; +} + +static unsigned long damon_feed_loop_next_input(unsigned long last_input, + unsigned long score); + +static unsigned long damon_get_intervals_adaptation_bp(struct damon_ctx *c) +{ + unsigned long score_bp, adaptation_bp; + + score_bp = damon_get_intervals_score(c); + adaptation_bp = damon_feed_loop_next_input(100000000, score_bp) / + 10000; + /* + * adaptation_bp ranges from 1 to 20,000. Avoid too rapid reduction of + * the intervals by rescaling [1,10,000] to [5000, 10,000]. + */ + if (adaptation_bp <= 10000) + adaptation_bp = 5000 + adaptation_bp / 2; + return adaptation_bp; +} + +static void kdamond_tune_intervals(struct damon_ctx *c) +{ + unsigned long adaptation_bp; + struct damon_attrs new_attrs; + struct damon_intervals_goal *goal; + + adaptation_bp = damon_get_intervals_adaptation_bp(c); + if (adaptation_bp == 10000) + return; + + new_attrs = c->attrs; + goal = &c->attrs.intervals_goal; + new_attrs.sample_interval = min(goal->max_sample_us, + c->attrs.sample_interval * adaptation_bp / 10000); + new_attrs.sample_interval = max(goal->min_sample_us, + new_attrs.sample_interval); + new_attrs.aggr_interval = new_attrs.sample_interval * + c->attrs.aggr_samples; + damon_set_attrs(c, &new_attrs); +} + static void damon_split_region_at(struct damon_target *t, struct damon_region *r, unsigned long sz_r); @@ -2209,6 +2272,8 @@ static void kdamond_init_intervals_sis(struct damon_ctx *ctx) ctx->next_aggregation_sis = ctx->attrs.aggr_interval / sample_interval; ctx->next_ops_update_sis = ctx->attrs.ops_update_interval / sample_interval; + ctx->next_intervals_tune_sis = ctx->next_aggregation_sis * + ctx->attrs.intervals_goal.aggrs; damon_for_each_scheme(scheme, ctx) { apply_interval = scheme->apply_interval_us ? @@ -2293,6 +2358,17 @@ static int kdamond_fn(void *data) sample_interval = ctx->attrs.sample_interval ? ctx->attrs.sample_interval : 1; if (ctx->passed_sample_intervals >= next_aggregation_sis) { + if (ctx->attrs.intervals_goal.aggrs && + ctx->passed_sample_intervals >= + ctx->next_intervals_tune_sis) { + ctx->next_intervals_tune_sis += + ctx->attrs.aggr_samples * + ctx->attrs.intervals_goal.aggrs; + kdamond_tune_intervals(ctx); + sample_interval = ctx->attrs.sample_interval ?
+ ctx->attrs.sample_interval : 1; + + } ctx->next_aggregation_sis = next_aggregation_sis + ctx->attrs.aggr_interval / sample_interval; -- 2.51.0 From 8fbbcbeaafeb82498fd83f58c1e5ad1aff135212 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 3 Mar 2025 14:17:21 -0800 Subject: [PATCH 07/16] mm/damon/sysfs: implement intervals tuning goal directory Implement DAMON sysfs interface directory and its files for setting DAMON sampling and aggregation intervals auto-tuning goal. Link: https://lkml.kernel.org/r/20250303221726.484227-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 189 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 189 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index deeab04d3b46..a772060300b4 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -408,6 +408,164 @@ static const struct kobj_type damon_sysfs_targets_ktype = { .default_groups = damon_sysfs_targets_groups, }; +/* + * intervals goal directory + */ + +struct damon_sysfs_intervals_goal { + struct kobject kobj; + unsigned long access_bp; + unsigned long aggrs; + unsigned long min_sample_us; + unsigned long max_sample_us; +}; + +static struct damon_sysfs_intervals_goal *damon_sysfs_intervals_goal_alloc( + unsigned long access_bp, unsigned long aggrs, + unsigned long min_sample_us, unsigned long max_sample_us) +{ + struct damon_sysfs_intervals_goal *goal = kmalloc(sizeof(*goal), + GFP_KERNEL); + + if (!goal) + return NULL; + + goal->kobj = (struct kobject){}; + goal->access_bp = access_bp; + goal->aggrs = aggrs; + goal->min_sample_us = min_sample_us; + goal->max_sample_us = max_sample_us; + return goal; +} + +static ssize_t access_bp_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + + return sysfs_emit(buf, "%lu\n", goal->access_bp); +} + +static ssize_t access_bp_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + unsigned long nr; + int err = kstrtoul(buf, 0, &nr); + + if (err) + return err; + + goal->access_bp = nr; + return count; +} + +static ssize_t aggrs_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + + return sysfs_emit(buf, "%lu\n", goal->aggrs); +} + +static ssize_t aggrs_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + unsigned long nr; + int err = kstrtoul(buf, 0, &nr); + + if (err) + return err; + + goal->aggrs = nr; + return count; +} + +static ssize_t min_sample_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + + return sysfs_emit(buf, "%lu\n", goal->min_sample_us); +} + +static ssize_t min_sample_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + unsigned long nr; + int err = kstrtoul(buf, 0, &nr); + + if (err) + return err; + + goal->min_sample_us = nr; + 
return count; +} + +static ssize_t max_sample_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + + return sysfs_emit(buf, "%lu\n", goal->max_sample_us); +} + +static ssize_t max_sample_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_intervals_goal *goal = container_of(kobj, + struct damon_sysfs_intervals_goal, kobj); + unsigned long nr; + int err = kstrtoul(buf, 0, &nr); + + if (err) + return err; + + goal->max_sample_us = nr; + return count; +} + +static void damon_sysfs_intervals_goal_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_intervals_goal, kobj)); +} + +static struct kobj_attribute damon_sysfs_intervals_goal_access_bp_attr = + __ATTR_RW_MODE(access_bp, 0600); + +static struct kobj_attribute damon_sysfs_intervals_goal_aggrs_attr = + __ATTR_RW_MODE(aggrs, 0600); + +static struct kobj_attribute damon_sysfs_intervals_goal_min_sample_us_attr = + __ATTR_RW_MODE(min_sample_us, 0600); + +static struct kobj_attribute damon_sysfs_intervals_goal_max_sample_us_attr = + __ATTR_RW_MODE(max_sample_us, 0600); + +static struct attribute *damon_sysfs_intervals_goal_attrs[] = { + &damon_sysfs_intervals_goal_access_bp_attr.attr, + &damon_sysfs_intervals_goal_aggrs_attr.attr, + &damon_sysfs_intervals_goal_min_sample_us_attr.attr, + &damon_sysfs_intervals_goal_max_sample_us_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_intervals_goal); + +static const struct kobj_type damon_sysfs_intervals_goal_ktype = { + .release = damon_sysfs_intervals_goal_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_intervals_goal_groups, +}; + /* * intervals directory */ @@ -417,6 +575,7 @@ struct damon_sysfs_intervals { unsigned long sample_us; unsigned long aggr_us; unsigned long update_us; + struct damon_sysfs_intervals_goal *intervals_goal; }; static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc( @@ -436,6 +595,32 @@ static struct damon_sysfs_intervals *damon_sysfs_intervals_alloc( return intervals; } +static int damon_sysfs_intervals_add_dirs(struct damon_sysfs_intervals *intervals) +{ + struct damon_sysfs_intervals_goal *goal; + int err; + + goal = damon_sysfs_intervals_goal_alloc(0, 0, 0, 0); + if (!goal) + return -ENOMEM; + + err = kobject_init_and_add(&goal->kobj, + &damon_sysfs_intervals_goal_ktype, &intervals->kobj, + "intervals_goal"); + if (err) { + kobject_put(&goal->kobj); + intervals->intervals_goal = NULL; + return err; + } + intervals->intervals_goal = goal; + return 0; +} + +static void damon_sysfs_intervals_rm_dirs(struct damon_sysfs_intervals *intervals) +{ + kobject_put(&intervals->intervals_goal->kobj); +} + static ssize_t sample_us_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -569,6 +754,9 @@ static int damon_sysfs_attrs_add_dirs(struct damon_sysfs_attrs *attrs) err = kobject_init_and_add(&intervals->kobj, &damon_sysfs_intervals_ktype, &attrs->kobj, "intervals"); + if (err) + goto put_intervals_out; + err = damon_sysfs_intervals_add_dirs(intervals); if (err) goto put_intervals_out; attrs->intervals = intervals; @@ -599,6 +787,7 @@ put_intervals_out: static void damon_sysfs_attrs_rm_dirs(struct damon_sysfs_attrs *attrs) { kobject_put(&attrs->nr_regions_range->kobj); + damon_sysfs_intervals_rm_dirs(attrs->intervals); kobject_put(&attrs->intervals->kobj); } -- 2.51.0 From 
0622c68d0a51f1268f3f9a171f4969c1bfc07c05 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 3 Mar 2025 14:17:22 -0800 Subject: [PATCH 08/16] mm/damon/sysfs: commit intervals tuning goal Connect the DAMON sysfs interface for sampling and aggregation intervals auto-tuning with the DAMON core API, so that users can actually use the feature via the sysfs files. Link: https://lkml.kernel.org/r/20250303221726.484227-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index a772060300b4..fa5f004f0670 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1273,11 +1273,18 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, struct damon_sysfs_attrs *sys_attrs) { struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals; + struct damon_sysfs_intervals_goal *sys_goal = + sys_intervals->intervals_goal; struct damon_sysfs_ul_range *sys_nr_regions = sys_attrs->nr_regions_range; struct damon_attrs attrs = { .sample_interval = sys_intervals->sample_us, .aggr_interval = sys_intervals->aggr_us, + .intervals_goal = { + .access_bp = sys_goal->access_bp, + .aggrs = sys_goal->aggrs, + .min_sample_us = sys_goal->min_sample_us, + .max_sample_us = sys_goal->max_sample_us}, .ops_update_interval = sys_intervals->update_us, .min_nr_regions = sys_nr_regions->min, .max_nr_regions = sys_nr_regions->max, -- 2.51.0 From 1077605396b4da993327ebe40eabc28478e2be94 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 3 Mar 2025 14:17:23 -0800 Subject: [PATCH 09/16] mm/damon/sysfs: implement a command to update auto-tuned monitoring intervals DAMON kernel API callers can read the auto-tuned sampling and aggregation intervals from the monitoring attributes data structure. That can be useful for debugging or tuning of the feature. DAMON user-space ABI users have no way to see that, though. Implement a new DAMON sysfs interface command, namely 'update_tuned_intervals', for the purpose. If the command is written to the kdamond state file, the tuned sampling and aggregation intervals will be updated to the corresponding sysfs interface files. Link: https://lkml.kernel.org/r/20250303221726.484227-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index fa5f004f0670..ccd435d234b9 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1213,6 +1213,11 @@ enum damon_sysfs_cmd { * effective size quota of the scheme in bytes. */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_EFFECTIVE_QUOTAS, + /* + * @DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS: Update the tuned monitoring + * intervals. + */ + DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS, /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands.
*/ @@ -1230,6 +1235,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "update_schemes_tried_regions", "clear_schemes_tried_regions", "update_schemes_effective_quotas", + "update_tuned_intervals", }; /* @@ -1502,6 +1508,17 @@ static int damon_sysfs_upd_schemes_effective_quotas(void *data) return 0; } +static int damon_sysfs_upd_tuned_intervals(void *data) +{ + struct damon_sysfs_kdamond *kdamond = data; + struct damon_ctx *ctx = kdamond->damon_ctx; + + kdamond->contexts->contexts_arr[0]->attrs->intervals->sample_us = + ctx->attrs.sample_interval; + kdamond->contexts->contexts_arr[0]->attrs->intervals->aggr_us = + ctx->attrs.aggr_interval; + return 0; +} /* * damon_sysfs_cmd_request_callback() - DAMON callback for handling requests. @@ -1723,6 +1740,9 @@ static int damon_sysfs_handle_cmd(enum damon_sysfs_cmd cmd, return damon_sysfs_damon_call( damon_sysfs_upd_schemes_effective_quotas, kdamond); + case DAMON_SYSFS_CMD_UPDATE_TUNED_INTERVALS: + return damon_sysfs_damon_call( + damon_sysfs_upd_tuned_intervals, kdamond); default: break; } -- 2.51.0 From af03edb521f1ea5f66a2fa7cd3e4af7d9a1984e2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 3 Mar 2025 14:17:24 -0800 Subject: [PATCH 10/16] Docs/mm/damon/design: document for intervals auto-tuning Document the design of DAMON sampling and aggregation intervals auto-tuning. [sj@kernel.org: fix a typo on 'intervals auto-tuning' section] Link: https://lkml.kernel.org/r/20250305182744.56125-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250303221726.484227-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 46 +++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 5af991551a86..5a8c1752dc8a 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -313,6 +313,10 @@ sufficient for the given purpose, it shouldn't be unnecessarily further lowered. It is recommended to be set proportional to ``aggregation interval``. By default, the ratio is set as ``1/20``, and it is still recommended. +Based on the manual tuning guide, DAMON provides a more intuitive, knob-based +intervals auto-tuning mechanism. Please refer to :ref:`the design document of +the feature <damon_design_monitoring_intervals_autotuning>` for details. + Refer to below documents for an example tuning based on the above guide. .. toctree:: monitoring_intervals_tuning_example +.. _damon_design_monitoring_intervals_autotuning: + +Monitoring Intervals Auto-tuning +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +DAMON provides automatic tuning of the ``sampling interval`` and ``aggregation +interval`` based on :ref:`the tuning guide idea +`. The tuning mechanism allows +users to set the aimed amount of access events to observe via DAMON within a +given time interval. The target can be specified by the user as a ratio of +DAMON-observed access events to the theoretical maximum amount of the events +(``access_bp``), measured within a given number of aggregations +(``aggrs``). + +The DAMON-observed access events are calculated in byte granularity based on +the DAMON :ref:`region assumption `. For +example, if a region of ``X`` bytes in size with ``Y`` ``nr_accesses`` is +found, it means ``X * Y`` access events are observed by DAMON.
The theoretical maximum +access events for the region are calculated in the same way, but replacing ``Y`` +with the theoretical maximum ``nr_accesses``, which can be calculated as +``aggregation interval / sampling interval``. + +The mechanism calculates the ratio of access events for ``aggrs`` aggregations, +and increases or decreases the ``sampling interval`` and ``aggregation +interval`` in the same ratio, if the observed access ratio is lower or higher +than the target, respectively. The ratio of the intervals change is decided in +proportion to the distance between the current and the target ratios. + +The user can further set the minimum and maximum ``sampling interval`` that can +be set by the tuning mechanism using two parameters (``min_sample_us`` and +``max_sample_us``). Because the tuning mechanism always changes ``sampling +interval`` and ``aggregation interval`` in the same ratio, the minimum and +maximum ``aggregation interval`` after each of the tuning changes are +automatically set together. + +The tuning is turned off by default, and needs to be enabled explicitly by the +user. As a rule of thumb based on the Pareto principle, a 4% access samples +ratio target is recommended. Note that the Pareto principle (80/20 rule) is +applied twice. That is, it assumes a 4% (20% of 20%) DAMON-observed access +events ratio (source) will capture 64% (80% multiplied by 80%) of the real +access events (outcomes). + + .. _damon_design_damos: Operation Schemes -- 2.51.0 From e2b23dc62369b76b68d8354f12baeaff14b6e24f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 3 Mar 2025 14:17:25 -0800 Subject: [PATCH 11/16] Docs/ABI/damon: document intervals auto-tuning ABI Document the DAMON user-space ABI for DAMON sampling and aggregation intervals auto-tuning. Link: https://lkml.kernel.org/r/20250303221726.484227-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-damon | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index ccd13ca668c8..76da77d7f7b6 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -91,6 +91,36 @@ Description: Writing a value to this file sets the update interval of the DAMON context in microseconds as the value. Reading this file returns the value. +What: /sys/kernel/mm/damon/admin/kdamonds/<N>/contexts/<N>/monitoring_attrs/intervals/intervals_goal/access_bp +Date: Feb 2025 +Contact: SeongJae Park +Description: Writing a value to this file sets the monitoring intervals + auto-tuning target DAMON-observed access events ratio within + the given time interval (aggrs in same directory), in bp + (1/10,000). Reading this file returns the value. + +What: /sys/kernel/mm/damon/admin/kdamonds/<N>/contexts/<N>/monitoring_attrs/intervals/intervals_goal/aggrs +Date: Feb 2025 +Contact: SeongJae Park +Description: Writing a value to this file sets the time interval to achieve + the monitoring intervals auto-tuning target DAMON-observed + access events ratio (access_bp in same directory) within. + Reading this file returns the value. + +What: /sys/kernel/mm/damon/admin/kdamonds/<N>/contexts/<N>/monitoring_attrs/intervals/intervals_goal/min_sample_us +Date: Feb 2025 +Contact: SeongJae Park +Description: Writing a value to this file sets the minimum value of + auto-tuned sampling interval in microseconds. Reading this + file returns the value.
+
+What:		/sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/intervals/intervals_goal/max_sample_us
+Date:		Feb 2025
+Contact:	SeongJae Park
+Description:	Writing a value to this file sets the maximum value of the
+		auto-tuned sampling interval, in microseconds. Reading this
+		file returns the value.
+
 What:		/sys/kernel/mm/damon/admin/kdamonds//contexts//monitoring_attrs/nr_regions/min
 Date:		Mar 2022
-- 2.51.0

From b243d666d1079587daa3f41fffdabbabad8dd075 Mon Sep 17 00:00:00 2001
From: SeongJae Park
Date: Mon, 3 Mar 2025 14:17:26 -0800
Subject: [PATCH 12/16] Docs/admin-guide/mm/damon/usage: add intervals_goal
 directory on the hierarchy

Document DAMON sysfs interface usage for DAMON sampling and aggregation
intervals auto-tuning.

Link: https://lkml.kernel.org/r/20250303221726.484227-9-sj@kernel.org
Signed-off-by: SeongJae Park
Cc: Jonathan Corbet
Signed-off-by: Andrew Morton
---
 Documentation/admin-guide/mm/damon/usage.rst | 25 ++++++++++++++++++++
 Documentation/mm/damon/design.rst            |  4 ++++
 2 files changed, 29 insertions(+)

diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst
index dc37bba96273..de549dd18107 100644
--- a/Documentation/admin-guide/mm/damon/usage.rst
+++ b/Documentation/admin-guide/mm/damon/usage.rst
@@ -64,6 +64,7 @@ comma (",").
 │ │ │ │ :ref:`0 `/avail_operations,operations
 │ │ │ │ │ :ref:`monitoring_attrs `/
 │ │ │ │ │ │ intervals/sample_us,aggr_us,update_us
+│ │ │ │ │ │ │ intervals_goal/access_bp,aggrs,min_sample_us,max_sample_us
 │ │ │ │ │ │ nr_regions/min,max
 │ │ │ │ │ :ref:`targets `/nr_targets
 │ │ │ │ │ │ :ref:`0 `/pid_target
@@ -132,6 +133,11 @@ Users can write below commands for the kdamond to the ``state`` file.
 - ``off``: Stop running.
 - ``commit``: Read the user inputs in the sysfs files except ``state`` file
   again.
+- ``update_tuned_intervals``: Update the contents of the ``sample_us`` and
+  ``aggr_us`` files of the kdamond with the auto-tuning applied ``sampling
+  interval`` and ``aggregation interval``. Please refer to the
+  :ref:`intervals_goal section <damon_usage_sysfs_monitoring_intervals_goal>`
+  for more details.
 - ``commit_schemes_quota_goals``: Read the DAMON-based operation schemes'
   :ref:`quota goals `.
 - ``update_schemes_stats``: Update the contents of stats files for each
@@ -213,6 +219,25 @@ writing to and reading from the files.
 For more details about the intervals and monitoring regions range, please
 refer to the Design document (:doc:`/mm/damon/design`).

+.. _damon_usage_sysfs_monitoring_intervals_goal:
+
+contexts//monitoring_attrs/intervals/intervals_goal/
+-------------------------------------------------------
+
+Under the ``intervals`` directory, one more directory for the automated tuning
+of ``sample_us`` and ``aggr_us``, namely ``intervals_goal``, also exists.
+Under that directory, four files for controlling the auto-tuning, namely
+``access_bp``, ``aggrs``, ``min_sample_us`` and ``max_sample_us``, exist.
+Please refer to the :ref:`design document of the feature
+<damon_design_monitoring_intervals_autotuning>` for the internals of the
+tuning mechanism. Reading and writing the four files under the
+``intervals_goal`` directory shows and updates the tuning parameters described
+in the :ref:`design doc <damon_design_monitoring_intervals_autotuning>` with
+the same names. The tuning starts with the user-set ``sample_us`` and
+``aggr_us``. The tuning-applied current values of the two intervals can be
+read from the ``sample_us`` and ``aggr_us`` files after writing
+``update_tuned_intervals`` to the ``state`` file.
+
 .. _sysfs_targets:

 contexts//targets/
diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst
index 5a8c1752dc8a..e6fd3b604e70 100644
--- a/Documentation/mm/damon/design.rst
+++ b/Documentation/mm/damon/design.rst
@@ -366,6 +366,10 @@ is recommended. Note that the Pareto principle (80/20 rule) is applied twice.
 That is, it assumes a 4% (20% of 20%) DAMON-observed access events ratio
 (source) captures 64% (80% multiplied by 80%) of the real access events
 (outcomes).

+To know how user-space can use this feature via the :ref:`DAMON sysfs
+interface `, refer to the :ref:`intervals_goal
+<damon_usage_sysfs_monitoring_intervals_goal>` part of the documentation.
+
 .. _damon_design_damos:
-- 2.51.0
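The workflow the new usage section describes can also be driven
programmatically. Below is a hedged user-space sketch, not part of this
series, which assumes kdamond 0 and context 0 exist, sysfs is mounted, and
the caller is root; error handling is minimal for brevity. It asks the
kdamond to publish its auto-tuned intervals, then reads one of them back.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define KD "/sys/kernel/mm/damon/admin/kdamonds/0"

int main(void)
{
	char buf[64];
	ssize_t len;
	int fd;

	/* Ask the running kdamond to export the tuned intervals */
	fd = open(KD "/state", O_WRONLY);
	if (fd < 0)
		return 1;
	if (write(fd, "update_tuned_intervals", 22) != 22) {
		close(fd);
		return 1;
	}
	close(fd);

	/* Read back the auto-tuned sampling interval */
	fd = open(KD "/contexts/0/monitoring_attrs/intervals/sample_us",
		  O_RDONLY);
	if (fd < 0)
		return 1;
	len = read(fd, buf, sizeof(buf) - 1);
	close(fd);
	if (len <= 0)
		return 1;
	buf[len] = '\0';
	printf("tuned sample_us: %s", buf);
	return 0;
}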
From 691ee97e1a9de0cdb3efb893c1f180e3f4a35e32 Mon Sep 17 00:00:00 2001
From: Ryan Roberts
Date: Mon, 3 Mar 2025 14:15:35 +0000
Subject: [PATCH 13/16] mm: fix lazy mmu docs and usage

Patch series "Fix lazy mmu mode", v2.

I'm planning to implement lazy mmu mode for arm64 to optimize vmalloc. As
part of that, I will extend lazy mmu mode to cover kernel mappings in
vmalloc table walkers. While lazy mmu mode is already used for kernel
mappings in a few places, this will extend its use significantly.

Having reviewed the existing lazy mmu implementations in powerpc, sparc
and x86, it looks like there are a bunch of bugs, some of which may be
more likely to trigger once I extend the use of lazy mmu. So this series
attempts to clarify the requirements and fix all the bugs in advance of
that series. See patch #1 commit log for all the details.

This patch (of 5):

The docs, implementations and use of arch_[enter|leave]_lazy_mmu_mode()
are a bit of a mess (to put it politely). There are a number of issues
related to nesting of lazy mmu regions and confusion over whether the
task, when in a lazy mmu region, is preemptible or not. Fix all the
issues relating to the core-mm. Follow up commits will fix the
arch-specific implementations. 3 arches implement lazy mmu; powerpc,
sparc and x86.

When arch_[enter|leave]_lazy_mmu_mode() was first introduced by commit
6606c3e0da53 ("[PATCH] paravirt: lazy mmu mode hooks.patch"), it was
expected that lazy mmu regions would never nest and that the appropriate
page table lock(s) would be held while in the region, thus ensuring the
region is non-preemptible. Additionally lazy mmu regions were only used
during manipulation of user mappings.

Commit 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy
updates") started invoking the lazy mmu mode in apply_to_pte_range(),
which is used for both user and kernel mappings. For kernel mappings the
region is no longer protected by any lock so there is no longer any
guarantee about non-preemptibility. Additionally, for RT configs, holding
the PTL only implies no CPU migration; it doesn't prevent preemption.

Commit bcc6cc832573 ("mm: add default definition of set_ptes()") added
arch_[enter|leave]_lazy_mmu_mode() to the default implementation of
set_ptes(), used by x86. So after this commit, lazy mmu regions can be
nested. Additionally commit 1a10a44dfc1d ("sparc64: implement the new
page table range API") and commit 9fee28baa601 ("powerpc: implement the
new page table range API") did the same for the sparc and powerpc
set_ptes() overrides.

powerpc couldn't deal with preemption so avoids it in commit b9ef323ea168
("powerpc/64s: Disable preemption in hash lazy mmu mode"), which
explicitly disables preemption for the whole region in its
implementation. x86 can support preemption (or at least it could until it
tried to add support for nesting; more on this below).
Sparc looks to be totally broken in the face of preemption, as far as I
can tell.

powerpc can't deal with nesting, so avoids it in commit 47b8def9358c
("powerpc/mm: Avoid calling arch_enter/leave_lazy_mmu() in set_ptes"),
which removes the lazy mmu calls from its implementation of set_ptes().

x86 attempted to support nesting in commit 49147beb0ccb ("x86/xen: allow
nesting of same lazy mode") but as far as I can tell, this breaks its
support for preemption.

In short, it's all a mess; the semantics for
arch_[enter|leave]_lazy_mmu_mode() are not clearly defined and as a
result the implementations all have different expectations, sticking
plasters and bugs.

arm64 is aiming to start using these hooks, so let's clean everything up
before adding an arm64 implementation. Update the documentation to state
that lazy mmu regions can never be nested, must not be called in
interrupt context and preemption may or may not be enabled for the
duration of the region. And fix the generic implementation of set_ptes()
to avoid nesting.

arch-specific fixes to conform to the new spec will follow this one.

These issues were spotted by code review and I have no evidence of issues
being reported in the wild.

Link: https://lkml.kernel.org/r/20250303141542.3371656-1-ryan.roberts@arm.com
Link: https://lkml.kernel.org/r/20250303141542.3371656-2-ryan.roberts@arm.com
Fixes: bcc6cc832573 ("mm: add default definition of set_ptes()")
Signed-off-by: Ryan Roberts
Acked-by: David Hildenbrand
Acked-by: Juergen Gross
Cc: Andreas Larsson
Cc: Borislav Petkov
Cc: Boris Ostrovsky
Cc: Catalin Marinas
Cc: Dave Hansen
Cc: David S. Miller
Cc: "H. Peter Anvin"
Cc: Ingo Molnar
Cc: Juergen Gross
Cc: Matthew Wilcox (Oracle)
Cc: Thomas Gleixner
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/pgtable.h | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 94d267d02372..787c632ee2c9 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -222,10 +222,14 @@ static inline int pmd_dirty(pmd_t pmd)
  * hazard could result in the direct mode hypervisor case, since the actual
  * write to the page tables may not yet have taken place, so reads through
  * a raw PTE pointer after it has been modified are not guaranteed to be
- * up to date. This mode can only be entered and left under the protection of
- * the page table locks for all page tables which may be modified. In the UP
- * case, this is required so that preemption is disabled, and in the SMP case,
- * it must synchronize the delayed page table writes properly on other CPUs.
+ * up to date.
+ *
+ * In the general case, no lock is guaranteed to be held between entry and exit
+ * of the lazy mode. So the implementation must assume preemption may be enabled
+ * and cpu migration is possible; it must take steps to be robust against this.
+ * (In practice, for user PTE updates, the appropriate page table lock(s) are
+ * held, but for kernel PTE updates, no lock is held). Nesting is not permitted
+ * and the mode cannot be used in interrupt context.
 */
 #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE
 #define arch_enter_lazy_mmu_mode()	do {} while (0)
@@ -287,7 +291,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
 {
	page_table_check_ptes_set(mm, ptep, pte, nr);

-	arch_enter_lazy_mmu_mode();
	for (;;) {
		set_pte(ptep, pte);
		if (--nr == 0)
@@ -295,7 +298,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
			break;
		ptep++;
		pte = pte_next_pfn(pte);
	}
-	arch_leave_lazy_mmu_mode();
 }
 #endif
 #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1)
-- 2.51.0
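A caller-side sketch of the contract the patch documents may help; the
surrounding context here is assumed, not code from this series. The caller
owns a single, flat lazy mmu region, and since set_ptes() no longer enters
the mode itself, no nesting can occur.

/*
 * Sketch of the documented contract (assumed context): one flat region,
 * never entered from interrupt context, no nested enter calls, and the
 * arch implementation must tolerate preemption within the region.
 */
static void remap_batch(struct mm_struct *mm, unsigned long addr,
			pte_t *ptep, pte_t pte, unsigned int nr)
{
	arch_enter_lazy_mmu_mode();		/* one flat region, no nesting */
	set_ptes(mm, addr, ptep, pte, nr);	/* writes may be deferred */
	arch_leave_lazy_mmu_mode();		/* deferred updates now visible */
}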
From ad449d856bd7e7461ac740abb9b5d10a824e0166 Mon Sep 17 00:00:00 2001
From: Ryan Roberts
Date: Mon, 3 Mar 2025 14:15:36 +0000
Subject: [PATCH 14/16] fs/proc/task_mmu: reduce scope of lazy mmu region

Update the way arch_[enter|leave]_lazy_mmu_mode() is called in
pagemap_scan_pmd_entry() to follow the normal pattern of holding the ptl
for user space mappings. As a result the scope is reduced to only the pte
table, but that's where most of the performance win is.

While I believe there wasn't technically a bug here, the original scope
made it easier to accidentally nest or, worse, accidentally call
something like kmap() which would expect an immediate mode pte
modification, but the modification would end up deferred.

Link: https://lkml.kernel.org/r/20250303141542.3371656-3-ryan.roberts@arm.com
Signed-off-by: Ryan Roberts
Acked-by: David Hildenbrand
Acked-by: Juergen Gross
Cc: Andreas Larsson
Cc: Borislav Petkov
Cc: Boris Ostrovsky
Cc: Catalin Marinas
Cc: Dave Hansen
Cc: David S. Miller
Cc: "H. Peter Anvin"
Cc: Ingo Molnar
Cc: Juergen Gross
Cc: Matthew Wilcox (Oracle)
Cc: Thomas Gleixner
Signed-off-by: Andrew Morton
---
 fs/proc/task_mmu.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index c17615e21a5d..b0f189815512 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2459,22 +2459,19 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
	spinlock_t *ptl;
	int ret;

-	arch_enter_lazy_mmu_mode();
-
	ret = pagemap_scan_thp_entry(pmd, start, end, walk);
-	if (ret != -ENOENT) {
-		arch_leave_lazy_mmu_mode();
+	if (ret != -ENOENT)
		return ret;
-	}

	ret = 0;
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
	if (!pte) {
-		arch_leave_lazy_mmu_mode();
		walk->action = ACTION_AGAIN;
		return 0;
	}

+	arch_enter_lazy_mmu_mode();
+
	if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) {
		/* Fast path for performing exclusive WP */
		for (addr = start; addr != end; pte++, addr += PAGE_SIZE) {
@@ -2543,8 +2540,8 @@ flush_and_return:
	if (flush_end)
		flush_tlb_range(vma, start, addr);

-	pte_unmap_unlock(start_pte, ptl);
	arch_leave_lazy_mmu_mode();
+	pte_unmap_unlock(start_pte, ptl);

	cond_resched();
	return ret;
-- 2.51.0
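The kind of accident the narrower scope prevents can be sketched as follows.
This is hypothetical code, not from the series, and assumes a highmem
configuration where kmap_local_page() has to install a kernel PTE for the
mapping.

/*
 * Hypothetical hazard the narrower scope avoids. On a highmem config,
 * kmap_local_page() installs a kernel PTE and expects it to take effect
 * immediately; inside a lazy mmu region that PTE write may still be
 * sitting in the arch's deferred batch.
 */
static void broken_copy(struct page *page, const void *src, size_t len)
{
	void *va;

	arch_enter_lazy_mmu_mode();
	va = kmap_local_page(page);	/* PTE write may be deferred... */
	memcpy(va, src, len);		/* ...so this access can fault */
	kunmap_local(va);
	arch_leave_lazy_mmu_mode();
}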
From a1d416bf9faf4f4871cb5a943614a07f80a7d70f Mon Sep 17 00:00:00 2001
From: Ryan Roberts
Date: Mon, 3 Mar 2025 14:15:37 +0000
Subject: [PATCH 15/16] sparc/mm: disable preemption in lazy mmu mode

Since commit 38e0edb15bd0 ("mm/apply_to_range: call pte function with
lazy updates") it's been possible for arch_[enter|leave]_lazy_mmu_mode()
to be called without holding a page table lock (for the kernel mappings
case), and therefore it is possible that preemption may occur while in
the lazy mmu mode. The Sparc lazy mmu implementation is not robust to
preemption since it stores the lazy mode state in a per-cpu structure and
does not attempt to manage that state on task switch.

Powerpc had the same issue and fixed it by explicitly disabling
preemption in arch_enter_lazy_mmu_mode() and re-enabling it in
arch_leave_lazy_mmu_mode(). See commit b9ef323ea168 ("powerpc/64s:
Disable preemption in hash lazy mmu mode").

Given Sparc's lazy mmu mode is based on powerpc's, let's fix it in the
same way here.

Link: https://lkml.kernel.org/r/20250303141542.3371656-4-ryan.roberts@arm.com
Fixes: 38e0edb15bd0 ("mm/apply_to_range: call pte function with lazy updates")
Signed-off-by: Ryan Roberts
Acked-by: David Hildenbrand
Acked-by: Andreas Larsson
Acked-by: Juergen Gross
Cc: Borislav Petkov
Cc: Boris Ostrovsky
Cc: Catalin Marinas
Cc: Dave Hansen
Cc: David S. Miller
Cc: "H. Peter Anvin"
Cc: Ingo Molnar
Cc: Juergen Gross
Cc: Matthew Wilcox (Oracle)
Cc: Thomas Gleixner
Cc:
Signed-off-by: Andrew Morton
---
 arch/sparc/mm/tlb.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c
index 8648a50afe88..a35ddcca5e76 100644
--- a/arch/sparc/mm/tlb.c
+++ b/arch/sparc/mm/tlb.c
@@ -52,8 +52,10 @@ out:

 void arch_enter_lazy_mmu_mode(void)
 {
-	struct tlb_batch *tb = this_cpu_ptr(&tlb_batch);
+	struct tlb_batch *tb;

+	preempt_disable();
+	tb = this_cpu_ptr(&tlb_batch);
	tb->active = 1;
 }

@@ -64,6 +66,7 @@ void arch_leave_lazy_mmu_mode(void)
	if (tb->tlb_nr)
		flush_tlb_pending();
	tb->active = 0;
+	preempt_enable();
 }

 static void tlb_batch_add_one(struct mm_struct *mm, unsigned long vaddr,
-- 2.51.0
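To see why the per-cpu state needs preemption disabled for the whole region,
consider the interleaving below. It is an illustrative scenario written as a
comment, not code from the series.

/*
 * Without the preempt_disable()/preempt_enable() pair:
 *
 *   task on CPU0                        same task, after migration to CPU1
 *   arch_enter_lazy_mmu_mode()
 *     this_cpu_ptr(&tlb_batch)          (CPU0's batch marked active)
 *   <preempted and migrated>
 *                                       tlb_batch_add_one() etc. now queue
 *                                       entries into CPU1's batch
 *                                       arch_leave_lazy_mmu_mode()
 *                                         flushes and deactivates CPU1's
 *                                         batch only
 *
 * CPU0's batch is left active, holding entries that are never flushed.
 * Pinning the task with preempt_disable() keeps every reference to
 * &tlb_batch on one CPU for the lifetime of the region.
 */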
From eb61ad14c459b54f71f76331ca35d12fa3eb8f98 Mon Sep 17 00:00:00 2001
From: Ryan Roberts
Date: Mon, 3 Mar 2025 14:15:38 +0000
Subject: [PATCH 16/16] sparc/mm: avoid calling arch_enter/leave_lazy_mmu() in
 set_ptes

With commit 1a10a44dfc1d ("sparc64: implement the new page table range
API") set_ptes was added to the sparc architecture. The implementation
included calls to arch_enter/leave_lazy_mmu(). This patch removes those
calls, since they imply nesting of lazy mmu regions, which is not
supported. Without this fix, lazy mmu mode is effectively disabled
because we exit the mode after the first set_ptes:

  remap_pte_range()
    -> arch_enter_lazy_mmu()
    -> set_ptes()
        -> arch_enter_lazy_mmu()
        -> arch_leave_lazy_mmu()
    -> arch_leave_lazy_mmu()

Powerpc suffered the same problem and fixed it in a corresponding way
with commit 47b8def9358c ("powerpc/mm: Avoid calling
arch_enter/leave_lazy_mmu() in set_ptes").

Link: https://lkml.kernel.org/r/20250303141542.3371656-5-ryan.roberts@arm.com
Fixes: 1a10a44dfc1d ("sparc64: implement the new page table range API")
Signed-off-by: Ryan Roberts
Acked-by: David Hildenbrand
Acked-by: Andreas Larsson
Acked-by: Juergen Gross
Cc: Borislav Petkov
Cc: Boris Ostrovsky
Cc: Catalin Marinas
Cc: Dave Hansen
Cc: David S. Miller
Cc: "H. Peter Anvin"
Cc: Ingo Molnar
Cc: Juergen Gross
Cc: Matthew Wilcox (Oracle)
Cc: Thomas Gleixner
Cc:
Signed-off-by: Andrew Morton
---
 arch/sparc/include/asm/pgtable_64.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 2b7f358762c1..dc28f2c4eee3 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -936,7 +936,6 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
				pte_t *ptep, pte_t pte, unsigned int nr)
 {
-	arch_enter_lazy_mmu_mode();
	for (;;) {
		__set_pte_at(mm, addr, ptep, pte, 0);
		if (--nr == 0)
@@ -945,7 +944,6 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr,
			break;
		ptep++;
		pte_val(pte) += PAGE_SIZE;
		addr += PAGE_SIZE;
	}
-	arch_leave_lazy_mmu_mode();
 }
 #define set_ptes set_ptes
-- 2.51.0
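A debugging aid shaped like the sketch below would have flagged both this
sparc case and the earlier powerpc one. It is an invented illustration, not
part of the series; the wrapper names are made up, and it assumes the region
stays on one CPU (or that rare false positives under migration are
acceptable).

/*
 * Invented sketch: wrappers that warn on the nesting these patches
 * remove, using a per-CPU depth counter.
 */
static DEFINE_PER_CPU(int, lazy_mmu_depth);

static inline void lazy_mmu_enter_checked(void)
{
	WARN_ON_ONCE(this_cpu_inc_return(lazy_mmu_depth) != 1);
	arch_enter_lazy_mmu_mode();
}

static inline void lazy_mmu_leave_checked(void)
{
	arch_leave_lazy_mmu_mode();
	WARN_ON_ONCE(this_cpu_dec_return(lazy_mmu_depth) != 0);
}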