From 591c4c78be06360fe31b31401564bdb2d2b79995 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 12 May 2025 17:27:10 -0700 Subject: [PATCH 01/16] mm/damon/core: warn and fix nr_accesses[_bp] corruption Patch series "mm/damon: minor fixups and improvements for code, tests, and documents". Yet another batch of miscellaneous DAMON changes. Fix and improve minor problems in code, tests and documents. This patch (of 6): For a bug such as double aggregation reset[1], ->nr_accesses and/or ->nr_accesses_bp of damon_region could be corrupted. Such corruption can make monitoring results pretty inaccurate, so the root causing bug should be investigated. Meanwhile, the corruption itself can easily be fixed but silently fixing it will hide the bug. Fix the corruption as soon as found, but WARN_ONCE() so that we can be aware of the existence of the bug while keeping the system running in a more sane way. Link: https://lkml.kernel.org/r/20250513002715.40126-1-sj@kernel.org Link: https://lkml.kernel.org/r/20250513002715.40126-2-sj@kernel.org Link: https://lore.kernel.org/20250302214145.356806-1-sj@kernel.org [1] Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index 587fb9a4fef8..0bb71e2ab713 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1391,6 +1391,19 @@ int damos_walk(struct damon_ctx *ctx, struct damos_walk_control *control) return 0; } +/* + * Warn and fix corrupted ->nr_accesses[_bp] for investigations and preventing + * the problem being propagated. + */ +static void damon_warn_fix_nr_accesses_corruption(struct damon_region *r) +{ + if (r->nr_accesses_bp == r->nr_accesses * 10000) + return; + WARN_ONCE(true, "invalid nr_accesses_bp at reset: %u %u\n", + r->nr_accesses_bp, r->nr_accesses); + r->nr_accesses_bp = r->nr_accesses * 10000; +} + /* * Reset the aggregated monitoring results ('nr_accesses' of each region). */ @@ -1404,6 +1417,7 @@ static void kdamond_reset_aggregated(struct damon_ctx *c) damon_for_each_region(r, t) { trace_damon_aggregated(ti, r, damon_nr_regions(t)); + damon_warn_fix_nr_accesses_corruption(r); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } -- 2.51.0 From 0bac6b1a1111977c769e1933e0b0fc24e3647d97 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 12 May 2025 17:27:11 -0700 Subject: [PATCH 02/16] mm/damon/sysfs-schemes: fix wrong comment on damons_sysfs_quota_goal_metric_strs A comment on damos_sysfs_quota_goal_metric_strs is simply wrong, due to a copy-and-paste error. Fix it. 
Link: https://lkml.kernel.org/r/20250513002715.40126-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index c2b8a9cb44ec..0f6c9e1fec0b 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -940,7 +940,7 @@ struct damos_sysfs_quota_goal { int nid; }; -/* This should match with enum damos_action */ +/* This should match with enum damos_quota_goal_metric */ static const char * const damos_sysfs_quota_goal_metric_strs[] = { "user_input", "some_mem_psi_us", -- 2.51.0 From a82cf30010665a3602cc6051d760e19e9174ae3d Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 12 May 2025 17:27:12 -0700 Subject: [PATCH 03/16] mm/damon/paddr: remove unused variable, folio_list, in damon_pa_stat() Commit c0cb9d91bf297 ("mm/damon/paddr: report filter-passed bytes back for DAMOS_STAT action") added unused variable in damon_pa_stat(), due to a copy-and-paste error. Remove it. Link: https://lkml.kernel.org/r/20250513002715.40126-4-sj@kernel.org Fixes: c0cb9d91bf29 ("mm/damon/paddr: report filter-passed bytes back for DAMOS_STAT action") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 1b70d3f36046..e8464f7e0014 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -548,7 +548,6 @@ static unsigned long damon_pa_stat(struct damon_region *r, struct damos *s, unsigned long *sz_filter_passed) { unsigned long addr; - LIST_HEAD(folio_list); struct folio *folio; if (!damon_pa_scheme_has_filter(s)) -- 2.51.0 From 094fb14913c7c92b2fa8b398fe6a4b8f143aec25 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 12 May 2025 17:27:13 -0700 Subject: [PATCH 04/16] mm/damon/tests/core-kunit: add a test for damos_set_filters_default_reject() DAMOS filters' default reject behavior is not very simple. Actually there was a mistake[1] during the development. Add a kunit test for validating the behavior. Link: https://lkml.kernel.org/r/20250513002715.40126-5-sj@kernel.org Link: https://lore.kernel.org/20250227002913.19359-1-sj@kernel.org [1] Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/tests/core-kunit.h | 70 +++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/mm/damon/tests/core-kunit.h b/mm/damon/tests/core-kunit.h index be0fea9ee5fc..298c67557fae 100644 --- a/mm/damon/tests/core-kunit.h +++ b/mm/damon/tests/core-kunit.h @@ -510,6 +510,75 @@ static void damon_test_feed_loop_next_input(struct kunit *test) damon_feed_loop_next_input(last_input, 2000)); } +static void damon_test_set_filters_default_reject(struct kunit *test) +{ + struct damos scheme; + struct damos_filter *target_filter, *anon_filter; + + INIT_LIST_HEAD(&scheme.filters); + INIT_LIST_HEAD(&scheme.ops_filters); + + damos_set_filters_default_reject(&scheme); + /* + * No filter is installed. Allow by default on both core and ops layer + * filtering stages, since there are no filters at all. 
+ */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); + + target_filter = damos_new_filter(DAMOS_FILTER_TYPE_TARGET, true, true); + damos_add_filter(&scheme, target_filter); + damos_set_filters_default_reject(&scheme); + /* + * A core-handled allow-filter is installed. + * Rejct by default on core layer filtering stage due to the last + * core-layer-filter's behavior. + * Allow by default on ops layer filtering stage due to the absence of + * ops layer filters. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, true); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); + + target_filter->allow = false; + damos_set_filters_default_reject(&scheme); + /* + * A core-handled reject-filter is installed. + * Allow by default on core layer filtering stage due to the last + * core-layer-filter's behavior. + * Allow by default on ops layer filtering stage due to the absence of + * ops layer filters. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, false); + + anon_filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true, true); + damos_add_filter(&scheme, anon_filter); + + damos_set_filters_default_reject(&scheme); + /* + * A core-handled reject-filter and ops-handled allow-filter are installed. + * Allow by default on core layer filtering stage due to the existence + * of the ops-handled filter. + * Reject by default on ops layer filtering stage due to the last + * ops-layer-filter's behavior. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true); + + target_filter->allow = true; + damos_set_filters_default_reject(&scheme); + /* + * A core-handled allow-filter and ops-handled allow-filter are + * installed. + * Allow by default on core layer filtering stage due to the existence + * of the ops-handled filter. + * Reject by default on ops layer filtering stage due to the last + * ops-layer-filter's behavior. + */ + KUNIT_EXPECT_EQ(test, scheme.core_filters_default_reject, false); + KUNIT_EXPECT_EQ(test, scheme.ops_filters_default_reject, true); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -527,6 +596,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damos_test_new_filter), KUNIT_CASE(damos_test_filter_out), KUNIT_CASE(damon_test_feed_loop_next_input), + KUNIT_CASE(damon_test_set_filters_default_reject), {}, }; -- 2.51.0 From 03f83209e8e72df7eb6ed0afc60adca492844082 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 12 May 2025 17:27:14 -0700 Subject: [PATCH 05/16] selftests/damon/_damon_sysfs: read tried regions directories in order Kdamond.update_schemes_tried_regions() reads and stores tried regions information out of address order. It makes debugging a test failure difficult. Change the behavior to do the reading and writing in the address order. 
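For reference, the intended read order can be sketched stand-alone as below. This is only an illustration of the sorted-index walk the selftest switches to; the helper name and the example path are made up for the illustration, and error handling is omitted.

    import os

    def numbered_subdirs_in_order(parent):
        # Collect the numeric child directories first, then walk them in
        # index order so region 0 is always read before region 1, and so on,
        # regardless of the order os.listdir() happens to return names in.
        indices = []
        for name in os.listdir(parent):
            if os.path.isdir(os.path.join(parent, name)) and name.isdigit():
                indices.append(int(name))
        for idx in sorted(indices):
            yield os.path.join(parent, '%d' % idx)

    # usage sketch:
    # for region_dir in numbered_subdirs_in_order(tried_regions_dir):
    #     read 'start', 'end', 'nr_accesses', 'age' from region_dir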
Link: https://lkml.kernel.org/r/20250513002715.40126-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_damon_sysfs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index 6e136dc3df19..1e587e0b1a39 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -420,11 +420,16 @@ class Kdamond: tried_regions = [] tried_regions_dir = os.path.join( scheme.sysfs_dir(), 'tried_regions') + region_indices = [] for filename in os.listdir( os.path.join(scheme.sysfs_dir(), 'tried_regions')): tried_region_dir = os.path.join(tried_regions_dir, filename) if not os.path.isdir(tried_region_dir): continue + region_indices.append(int(filename)) + for region_idx in sorted(region_indices): + tried_region_dir = os.path.join(tried_regions_dir, + '%d' % region_idx) region_values = [] for f in ['start', 'end', 'nr_accesses', 'age']: content, err = read_file( -- 2.51.0 From 6a4b3551ba1079e8831fb2821765a49b7fd33dbd Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 12 May 2025 17:27:15 -0700 Subject: [PATCH 06/16] Docs/damon: update titles and brief introductions to explain DAMOS DAMON was initially developed only for data access monitoring, and then extended for not only access monitoring but also access-aware system operations (DAMOS). But the documents have old titles and brief introductions for only the monitoring part. Update the titles and the brief introductions to explain DAMOS part together. Link: https://lkml.kernel.org/r/20250513002715.40126-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: David Gow Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/index.rst | 11 +++++------ Documentation/mm/damon/index.rst | 6 +++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 33d37bb2fb4e..bc7e976120e0 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -1,12 +1,11 @@ .. SPDX-License-Identifier: GPL-2.0 -========================== -DAMON: Data Access MONitor -========================== +================================================================ +DAMON: Data Access MONitoring and Access-aware System Operations +================================================================ -:doc:`DAMON ` allows light-weight data access monitoring. -Using DAMON, users can analyze the memory access patterns of their systems and -optimize those. +:doc:`DAMON ` is a Linux kernel subsystem for efficient data +access monitoring and access-aware system operations. .. toctree:: :maxdepth: 2 diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst index 5a3359704cce..31c1fa955b3d 100644 --- a/Documentation/mm/damon/index.rst +++ b/Documentation/mm/damon/index.rst @@ -1,8 +1,8 @@ .. 
SPDX-License-Identifier: GPL-2.0 -========================== -DAMON: Data Access MONitor -========================== +================================================================ +DAMON: Data Access MONitoring and Access-aware System Operations +================================================================ DAMON is a Linux kernel subsystem that provides a framework for data access monitoring and the monitoring results based system operations. The core -- 2.51.0 From 780138b123816d717dbc0771d4c87e9a8a01963d Mon Sep 17 00:00:00 2001 From: Casey Chen Date: Tue, 13 May 2025 12:26:02 -0600 Subject: [PATCH 07/16] alloc_tag: check mem_profiling_support in alloc_tag_init If mem_profiling_support is false, for example by sysctl.vm.mem_profiling=never, alloc_tag_init should skip module tags allocation, codetag type registration and procfs init. Link: https://lkml.kernel.org/r/20250513182602.121843-1-cachen@purestorage.com Signed-off-by: Casey Chen Reviewed-by: Yuanyuan Zhong Acked-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 25ecc1334b67..3baf19879c8e 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -244,17 +244,6 @@ static void shutdown_mem_profiling(bool remove_file) mem_profiling_support = false; } -static void __init procfs_init(void) -{ - if (!mem_profiling_support) - return; - - if (!proc_create_seq(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op)) { - pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME); - shutdown_mem_profiling(false); - } -} - void __init alloc_tag_sec_init(void) { struct alloc_tag *last_codetag; @@ -762,19 +751,34 @@ static int __init alloc_tag_init(void) }; int res; + sysctl_init(); + + if (!mem_profiling_support) { + pr_info("Memory allocation profiling is not supported!\n"); + return 0; + } + + if (!proc_create_seq(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op)) { + pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME); + shutdown_mem_profiling(false); + return -ENOMEM; + } + res = alloc_mod_tags_mem(); - if (res) + if (res) { + pr_err("Failed to reserve address space for module tags, errno = %d\n", res); + shutdown_mem_profiling(true); return res; + } alloc_tag_cttype = codetag_register_type(&desc); if (IS_ERR(alloc_tag_cttype)) { + pr_err("Allocation tags registration failed, errno = %ld\n", PTR_ERR(alloc_tag_cttype)); free_mod_tags_mem(); + shutdown_mem_profiling(true); return PTR_ERR(alloc_tag_cttype); } - sysctl_init(); - procfs_init(); - return 0; } module_init(alloc_tag_init); -- 2.51.0 From 19e0713bbe4a682b651dfd3773d2a06a9d61c47b Mon Sep 17 00:00:00 2001 From: Ryan Chung Date: Tue, 13 May 2025 16:44:11 +0900 Subject: [PATCH 08/16] selftests/eventfd: correct test name and improve messages MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit - Rename test from eventfd_chek_flag_cloexec_and_nonblock to eventfd_check_flag_cloexec_and_nonblock. 
- Make the RDWR‐flag comment declarative: “The kernel automatically adds the O_RDWR flag.” - Update semaphore‐flag failure message to: “eventfd semaphore flag check failed: …” Link: https://lkml.kernel.org/r/20250513074411.6965-1-seokwoo.chung130@gmail.com Signed-off-by: Ryan Chung Reviewed-by: Wen Yang Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/filesystems/eventfd/eventfd_test.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c index 85acb4e3ef00..72d51ad0ee0e 100644 --- a/tools/testing/selftests/filesystems/eventfd/eventfd_test.c +++ b/tools/testing/selftests/filesystems/eventfd/eventfd_test.c @@ -50,7 +50,7 @@ TEST(eventfd_check_flag_rdwr) ASSERT_GE(fd, 0); flags = fcntl(fd, F_GETFL); - // since the kernel automatically added O_RDWR. + // The kernel automatically adds the O_RDWR flag. EXPECT_EQ(flags, O_RDWR); close(fd); @@ -85,7 +85,7 @@ TEST(eventfd_check_flag_nonblock) close(fd); } -TEST(eventfd_chek_flag_cloexec_and_nonblock) +TEST(eventfd_check_flag_cloexec_and_nonblock) { int fd, flags; @@ -178,8 +178,7 @@ TEST(eventfd_check_flag_semaphore) // The semaphore could only be obtained from fdinfo. ret = verify_fdinfo(fd, &err, "eventfd-semaphore: ", 19, "1\n"); if (ret != 0) - ksft_print_msg("eventfd-semaphore check failed, msg: %s\n", - err.msg); + ksft_print_msg("eventfd semaphore flag check failed: %s\n", err.msg); EXPECT_EQ(ret, 0); close(fd); -- 2.51.0 From cc79061b8fc119807111b615aa791562374b15b2 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 13 May 2025 14:56:35 +0800 Subject: [PATCH 09/16] mm: khugepaged: decouple SHMEM and file folios' collapse Originally, the file pages collapse was intended for tmpfs/shmem to merge into THP in the background. However, now not only tmpfs/shmem can support large folios, but some other file systems (such as XFS, erofs ...) also support large folios. Therefore, it is time to decouple the support of file folios collapse from SHMEM. 
Link: https://lkml.kernel.org/r/ce5c2314e0368cf34bda26f9bacf01c982d4da17.1747119309.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Acked-by: Zi Yan Cc: Dev Jain Cc: Liam Howlett Cc: Lorenzo Stoakes Cc: Mariano Pache Cc: Michal Hocko Cc: Mike Rapoport Cc: Ryan Roberts Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 8 -------- mm/Kconfig | 2 +- mm/khugepaged.c | 13 ++----------- 3 files changed, 3 insertions(+), 20 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 1f46046080f5..b8d69cfbb58b 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,16 +15,8 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); extern bool current_is_khugepaged(void); -#ifdef CONFIG_SHMEM extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd); -#else -static inline int collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr, bool install_pmd) -{ - return 0; -} -#endif static inline void khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) { diff --git a/mm/Kconfig b/mm/Kconfig index 60ea9eba4814..bd08e151fa1b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -886,7 +886,7 @@ config THP_SWAP config READ_ONLY_THP_FOR_FS bool "Read-only THP for filesystems (EXPERIMENTAL)" - depends on TRANSPARENT_HUGEPAGE && SHMEM + depends on TRANSPARENT_HUGEPAGE help Allow khugepaged to put read-only file-backed pages in THP. diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ebcd7c8a4b44..cdf5a581368b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1464,7 +1464,6 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) } } -#ifdef CONFIG_SHMEM /* folio must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct folio *folio, struct page *page) @@ -2353,14 +2352,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, trace_mm_khugepaged_scan_file(mm, folio, file, present, swap, result); return result; } -#else -static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, - struct file *file, pgoff_t start, - struct collapse_control *cc) -{ - BUILD_BUG(); -} -#endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, struct collapse_control *cc) @@ -2436,7 +2427,7 @@ skip: VM_BUG_ON(khugepaged_scan.address < hstart || khugepaged_scan.address + HPAGE_PMD_SIZE > hend); - if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) { + if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); @@ -2782,7 +2773,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); nodes_clear(cc->alloc_nmask); - if (IS_ENABLED(CONFIG_SHMEM) && !vma_is_anonymous(vma)) { + if (!vma_is_anonymous(vma)) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); -- 2.51.0 From 8a4b42b955286c122c4402b458acbc1d92953fbd Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 14 May 2025 11:41:52 -0700 Subject: [PATCH 10/16] memcg: memcg_rstat_updated re-entrant safe against irqs Patch series "memcg: make memcg stats irq safe", v2. This series converts memcg stats to be irq safe i.e. 
memcg stats can be updated in any context (task, softirq or hardirq) without disabling the irqs. This is still not nmi-safe on all architectures but after this series converting memcg charging and stats nmi-safe will be easier. This patch (of 7): memcg_rstat_updated() is used to track the memcg stats updates for optimizing the flushes. At the moment, it is not re-entrant safe and the callers disabled irqs before calling. However to achieve the goal of updating memcg stats without irqs, memcg_rstat_updated() needs to be re-entrant safe against irqs. This patch makes memcg_rstat_updated() re-entrant safe using this_cpu_* ops. On archs with CONFIG_ARCH_HAS_NMI_SAFE_THIS_CPU_OPS, this patch is also making memcg_rstat_updated() nmi safe. [lorenzo.stoakes@oracle.com: fix build] Link: https://lkml.kernel.org/r/22f69e6e-7908-4e92-96ca-5c70d535c439@lucifer.local Link: https://lkml.kernel.org/r/20250514184158.3471331-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20250514184158.3471331-2-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Tested-by: Alexei Starovoitov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2b6768f0e916..01560e22dd06 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -503,8 +503,8 @@ struct memcg_vmstats_percpu { unsigned int stats_updates; /* Cached pointers for fast iteration in memcg_rstat_updated() */ - struct memcg_vmstats_percpu *parent; - struct memcg_vmstats *vmstats; + struct memcg_vmstats_percpu __percpu *parent_pcpu; + struct memcg_vmstats *vmstats; /* The above should fit a single cacheline for memcg_rstat_updated() */ @@ -586,16 +586,21 @@ static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) { + struct memcg_vmstats_percpu __percpu *statc_pcpu; struct memcg_vmstats_percpu *statc; - int cpu = smp_processor_id(); + int cpu; unsigned int stats_updates; if (!val) return; + /* Don't assume callers have preemption disabled. 
*/ + cpu = get_cpu(); + cgroup_rstat_updated(memcg->css.cgroup, cpu); - statc = this_cpu_ptr(memcg->vmstats_percpu); - for (; statc; statc = statc->parent) { + statc_pcpu = memcg->vmstats_percpu; + for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) { + statc = this_cpu_ptr(statc_pcpu); /* * If @memcg is already flushable then all its ancestors are * flushable as well and also there is no need to increase @@ -604,14 +609,15 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) if (memcg_vmstats_needs_flush(statc->vmstats)) break; - stats_updates = READ_ONCE(statc->stats_updates) + abs(val); - WRITE_ONCE(statc->stats_updates, stats_updates); + stats_updates = this_cpu_add_return(statc_pcpu->stats_updates, + abs(val)); if (stats_updates < MEMCG_CHARGE_BATCH) continue; + stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0); atomic64_add(stats_updates, &statc->vmstats->stats_updates); - WRITE_ONCE(statc->stats_updates, 0); } + put_cpu(); } static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) @@ -3689,7 +3695,8 @@ static void mem_cgroup_free(struct mem_cgroup *memcg) static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) { - struct memcg_vmstats_percpu *statc, *pstatc; + struct memcg_vmstats_percpu *statc; + struct memcg_vmstats_percpu __percpu *pstatc_pcpu; struct mem_cgroup *memcg; int node, cpu; int __maybe_unused i; @@ -3720,9 +3727,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent) for_each_possible_cpu(cpu) { if (parent) - pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu); + pstatc_pcpu = parent->vmstats_percpu; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); - statc->parent = parent ? pstatc : NULL; + statc->parent_pcpu = parent ? pstatc_pcpu : NULL; statc->vmstats = memcg->vmstats; } -- 2.51.0 From c7163535cdaf7305aac14745483abb54684a43d5 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 14 May 2025 11:41:53 -0700 Subject: [PATCH 11/16] memcg: move preempt disable to callers of memcg_rstat_updated Let's move the explicit preempt disable code to the callers of memcg_rstat_updated and also remove the memcg_stats_lock and related functions which ensures the callers of stats update functions have disabled preemption because now the stats update functions are explicitly disabling preemption. Link: https://lkml.kernel.org/r/20250514184158.3471331-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 74 +++++++++++++------------------------------------ 1 file changed, 19 insertions(+), 55 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 01560e22dd06..82b51e1b36f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -555,48 +555,22 @@ static u64 flush_last_time; #define FLUSH_TIME (2UL*HZ) -/* - * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can - * not rely on this as part of an acquired spinlock_t lock. These functions are - * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion - * is sufficient. 
- */ -static void memcg_stats_lock(void) -{ - preempt_disable_nested(); - VM_WARN_ON_IRQS_ENABLED(); -} - -static void __memcg_stats_lock(void) -{ - preempt_disable_nested(); -} - -static void memcg_stats_unlock(void) -{ - preempt_enable_nested(); -} - - static bool memcg_vmstats_needs_flush(struct memcg_vmstats *vmstats) { return atomic64_read(&vmstats->stats_updates) > MEMCG_CHARGE_BATCH * num_online_cpus(); } -static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) +static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val, + int cpu) { struct memcg_vmstats_percpu __percpu *statc_pcpu; struct memcg_vmstats_percpu *statc; - int cpu; unsigned int stats_updates; if (!val) return; - /* Don't assume callers have preemption disabled. */ - cpu = get_cpu(); - cgroup_rstat_updated(memcg->css.cgroup, cpu); statc_pcpu = memcg->vmstats_percpu; for (; statc_pcpu; statc_pcpu = statc->parent_pcpu) { @@ -617,7 +591,6 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) stats_updates = this_cpu_xchg(statc_pcpu->stats_updates, 0); atomic64_add(stats_updates, &statc->vmstats->stats_updates); } - put_cpu(); } static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) @@ -715,6 +688,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) { int i = memcg_stats_index(idx); + int cpu; if (mem_cgroup_disabled()) return; @@ -722,12 +696,14 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; - memcg_stats_lock(); + cpu = get_cpu(); + __this_cpu_add(memcg->vmstats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); - memcg_rstat_updated(memcg, val); + memcg_rstat_updated(memcg, val, cpu); trace_mod_memcg_state(memcg, idx, val); - memcg_stats_unlock(); + + put_cpu(); } #ifdef CONFIG_MEMCG_V1 @@ -756,6 +732,7 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec, struct mem_cgroup_per_node *pn; struct mem_cgroup *memcg; int i = memcg_stats_index(idx); + int cpu; if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; @@ -763,24 +740,7 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; - /* - * The caller from rmap relies on disabled preemption because they never - * update their counter from in-interrupt context. For these two - * counters we check that the update is never performed from an - * interrupt context while other caller need to have disabled interrupt. 
- */ - __memcg_stats_lock(); - if (IS_ENABLED(CONFIG_DEBUG_VM)) { - switch (idx) { - case NR_ANON_MAPPED: - case NR_FILE_MAPPED: - case NR_ANON_THPS: - WARN_ON_ONCE(!in_task()); - break; - default: - VM_WARN_ON_IRQS_ENABLED(); - } - } + cpu = get_cpu(); /* Update memcg */ __this_cpu_add(memcg->vmstats_percpu->state[i], val); @@ -789,9 +749,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec, __this_cpu_add(pn->lruvec_stats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); - memcg_rstat_updated(memcg, val); + memcg_rstat_updated(memcg, val, cpu); trace_mod_memcg_lruvec_state(memcg, idx, val); - memcg_stats_unlock(); + + put_cpu(); } /** @@ -871,6 +832,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { int i = memcg_events_index(idx); + int cpu; if (mem_cgroup_disabled()) return; @@ -878,11 +840,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx)) return; - memcg_stats_lock(); + cpu = get_cpu(); + __this_cpu_add(memcg->vmstats_percpu->events[i], count); - memcg_rstat_updated(memcg, count); + memcg_rstat_updated(memcg, count, cpu); trace_count_memcg_events(memcg, idx, count); - memcg_stats_unlock(); + + put_cpu(); } unsigned long memcg_events(struct mem_cgroup *memcg, int event) -- 2.51.0 From 8814e3b8692b31e0150a894dd70c14ca0b7b746a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 14 May 2025 11:41:54 -0700 Subject: [PATCH 12/16] memcg: make mod_memcg_state re-entrant safe against irqs Let's make mod_memcg_state re-entrant safe against irqs. The only thing needed is to convert the usage of __this_cpu_add() to this_cpu_add(). In addition, with re-entrant safety, there is no need to disable irqs. mod_memcg_state() is not safe against nmi, so let's add warning if someone tries to call it in nmi context. 
Link: https://lkml.kernel.org/r/20250514184158.3471331-4-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 20 ++------------------ mm/memcontrol.c | 8 ++++---- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9ed75f82b858..92861ff3c43f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -903,19 +903,9 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); -void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, - int val); - /* idx can be of type enum memcg_stat_item or node_stat_item */ -static inline void mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, int val) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_memcg_state(memcg, idx, val); - local_irq_restore(flags); -} +void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, int val); static inline void mod_memcg_page_state(struct page *page, enum memcg_stat_item idx, int val) @@ -1375,12 +1365,6 @@ static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } -static inline void __mod_memcg_state(struct mem_cgroup *memcg, - enum memcg_stat_item idx, - int nr) -{ -} - static inline void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int nr) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 82b51e1b36f0..25d5b198c8af 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -679,12 +679,12 @@ static int memcg_state_val_in_pages(int idx, int val) } /** - * __mod_memcg_state - update cgroup memory statistics + * mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item * @val: delta to add to the counter, can be negative */ -void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, +void mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, int val) { int i = memcg_stats_index(idx); @@ -698,7 +698,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, cpu = get_cpu(); - __this_cpu_add(memcg->vmstats_percpu->state[i], val); + this_cpu_add(memcg->vmstats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); memcg_rstat_updated(memcg, val, cpu); trace_mod_memcg_state(memcg, idx, val); @@ -2918,7 +2918,7 @@ static void drain_obj_stock(struct obj_stock_pcp *stock) memcg = get_mem_cgroup_from_objcg(old); - __mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); + mod_memcg_state(memcg, MEMCG_KMEM, -nr_pages); memcg1_account_kmem(memcg, -nr_pages); if (!mem_cgroup_is_root(memcg)) memcg_uncharge(memcg, nr_pages); -- 2.51.0 From e52401e7247bb36cabba389ae32fb75a12a6e94b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 14 May 2025 11:41:55 -0700 Subject: [PATCH 13/16] memcg: make count_memcg_events re-entrant safe against irqs Let's make count_memcg_events re-entrant safe against irqs. The only thing needed is to convert the usage of __this_cpu_add() to this_cpu_add(). In addition, with re-entrant safety, there is no need to disable irqs. Also add warnings for in_nmi() as it is not safe against nmi context. 
Link: https://lkml.kernel.org/r/20250514184158.3471331-5-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 21 ++------------------- mm/memcontrol-v1.c | 6 +++--- mm/memcontrol.c | 6 +++--- mm/swap.c | 8 ++++---- mm/vmscan.c | 14 +++++++------- 5 files changed, 19 insertions(+), 36 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 92861ff3c43f..f7848f73f41c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -942,19 +942,8 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, local_irq_restore(flags); } -void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, - unsigned long count); - -static inline void count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) -{ - unsigned long flags; - - local_irq_save(flags); - __count_memcg_events(memcg, idx, count); - local_irq_restore(flags); -} +void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + unsigned long count); static inline void count_memcg_folio_events(struct folio *folio, enum vm_event_item idx, unsigned long nr) @@ -1418,12 +1407,6 @@ static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, } static inline void count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) -{ -} - -static inline void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 54c49cbfc968..4b94731305b9 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -512,9 +512,9 @@ static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages) { /* pagein of a big page is an event. 
So, ignore page size */ if (nr_pages > 0) - __count_memcg_events(memcg, PGPGIN, 1); + count_memcg_events(memcg, PGPGIN, 1); else { - __count_memcg_events(memcg, PGPGOUT, 1); + count_memcg_events(memcg, PGPGOUT, 1); nr_pages = -nr_pages; /* for event */ } @@ -689,7 +689,7 @@ void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long flags; local_irq_save(flags); - __count_memcg_events(memcg, PGPGOUT, pgpgout); + count_memcg_events(memcg, PGPGOUT, pgpgout); __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory); memcg1_check_events(memcg, nid); local_irq_restore(flags); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 25d5b198c8af..0ef7db12605b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -823,12 +823,12 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) } /** - * __count_memcg_events - account VM events in a cgroup + * count_memcg_events - account VM events in a cgroup * @memcg: the memory cgroup * @idx: the event item * @count: the number of events that occurred */ -void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, +void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { int i = memcg_events_index(idx); @@ -842,7 +842,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, cpu = get_cpu(); - __this_cpu_add(memcg->vmstats_percpu->events[i], count); + this_cpu_add(memcg->vmstats_percpu->events[i], count); memcg_rstat_updated(memcg, count, cpu); trace_count_memcg_events(memcg, idx, count); diff --git a/mm/swap.c b/mm/swap.c index 77b2d5997873..4fc322f7111a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -309,7 +309,7 @@ static void lru_activate(struct lruvec *lruvec, struct folio *folio) trace_mm_lru_activate(folio); __count_vm_events(PGACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); + count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages); } #ifdef CONFIG_SMP @@ -581,7 +581,7 @@ static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio) if (active) { __count_vm_events(PGDEACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } } @@ -599,7 +599,7 @@ static void lru_deactivate(struct lruvec *lruvec, struct folio *folio) lruvec_add_folio(lruvec, folio); __count_vm_events(PGDEACTIVATE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages); } static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) @@ -625,7 +625,7 @@ static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio) lruvec_add_folio(lruvec, folio); __count_vm_events(PGLAZYFREE, nr_pages); - __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); + count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages); } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 7d6d1ce3921e..07c51fa03434 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2028,7 +2028,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, item = PGSCAN_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); - __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); __count_vm_events(PGSCAN_ANON + file, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -2048,7 +2048,7 @@ static unsigned long 
shrink_inactive_list(unsigned long nr_to_scan, item = PGSTEAL_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); - __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); + count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&lruvec->lru_lock); @@ -2138,7 +2138,7 @@ static void shrink_active_list(unsigned long nr_to_scan, if (!cgroup_reclaim(sc)) __count_vm_events(PGREFILL, nr_scanned); - __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); spin_unlock_irq(&lruvec->lru_lock); @@ -2195,7 +2195,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); __count_vm_events(PGDEACTIVATE, nr_deactivate); - __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); + count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&lruvec->lru_lock); @@ -4612,8 +4612,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, __count_vm_events(item, isolated); __count_vm_events(PGREFILL, sorted); } - __count_memcg_events(memcg, item, isolated); - __count_memcg_events(memcg, PGREFILL, sorted); + count_memcg_events(memcg, item, isolated); + count_memcg_events(memcg, PGREFILL, sorted); __count_vm_events(PGSCAN_ANON + type, isolated); trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH, scanned, skipped, isolated, @@ -4763,7 +4763,7 @@ retry: item = PGSTEAL_KSWAPD + reclaimer_offset(sc); if (!cgroup_reclaim(sc)) __count_vm_events(item, reclaimed); - __count_memcg_events(memcg, item, reclaimed); + count_memcg_events(memcg, item, reclaimed); __count_vm_events(PGSTEAL_ANON + type, reclaimed); spin_unlock_irq(&lruvec->lru_lock); -- 2.51.0 From eee8a1778cab9efd5ba0eee1c081b4a4b396375b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 14 May 2025 11:41:56 -0700 Subject: [PATCH 14/16] memcg: make __mod_memcg_lruvec_state re-entrant safe against irqs Let's make __mod_memcg_lruvec_state re-entrant safe and name it mod_memcg_lruvec_state(). The only thing needed is to convert the usage of __this_cpu_add() to this_cpu_add(). There are two callers of mod_memcg_lruvec_state() and one of them i.e. __mod_objcg_mlstate() will be re-entrant safe as well, so, rename it mod_objcg_mlstate(). The last caller __mod_lruvec_state() still calls __mod_node_page_state() which is not re-entrant safe yet, so keep it as is. 
Link: https://lkml.kernel.org/r/20250514184158.3471331-6-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0ef7db12605b..f66cacb6f2a0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -725,7 +725,7 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) } #endif -static void __mod_memcg_lruvec_state(struct lruvec *lruvec, +static void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { @@ -743,10 +743,10 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec, cpu = get_cpu(); /* Update memcg */ - __this_cpu_add(memcg->vmstats_percpu->state[i], val); + this_cpu_add(memcg->vmstats_percpu->state[i], val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stats_percpu->state[i], val); + this_cpu_add(pn->lruvec_stats_percpu->state[i], val); val = memcg_state_val_in_pages(idx, val); memcg_rstat_updated(memcg, val, cpu); @@ -773,7 +773,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update memcg and lruvec */ if (!mem_cgroup_disabled()) - __mod_memcg_lruvec_state(lruvec, idx, val); + mod_memcg_lruvec_state(lruvec, idx, val); } void __lruvec_stat_mod_folio(struct folio *folio, enum node_stat_item idx, @@ -2525,7 +2525,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) folio->memcg_data = (unsigned long)memcg; } -static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, +static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, struct pglist_data *pgdat, enum node_stat_item idx, int nr) { @@ -2535,7 +2535,7 @@ static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg, rcu_read_lock(); memcg = obj_cgroup_memcg(objcg); lruvec = mem_cgroup_lruvec(memcg, pgdat); - __mod_memcg_lruvec_state(lruvec, idx, nr); + mod_memcg_lruvec_state(lruvec, idx, nr); rcu_read_unlock(); } @@ -2845,12 +2845,12 @@ static void __account_obj_stock(struct obj_cgroup *objcg, struct pglist_data *oldpg = stock->cached_pgdat; if (stock->nr_slab_reclaimable_b) { - __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - __mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, + mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B, stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; } @@ -2876,7 +2876,7 @@ static void __account_obj_stock(struct obj_cgroup *objcg, } } if (nr) - __mod_objcg_mlstate(objcg, pgdat, idx, nr); + mod_objcg_mlstate(objcg, pgdat, idx, nr); } static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, @@ -2945,13 +2945,13 @@ static void drain_obj_stock(struct obj_stock_pcp *stock) */ if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) { if (stock->nr_slab_reclaimable_b) { - __mod_objcg_mlstate(old, stock->cached_pgdat, + mod_objcg_mlstate(old, stock->cached_pgdat, NR_SLAB_RECLAIMABLE_B, stock->nr_slab_reclaimable_b); stock->nr_slab_reclaimable_b = 0; } if (stock->nr_slab_unreclaimable_b) { - __mod_objcg_mlstate(old, stock->cached_pgdat, + mod_objcg_mlstate(old, stock->cached_pgdat, NR_SLAB_UNRECLAIMABLE_B, 
stock->nr_slab_unreclaimable_b); stock->nr_slab_unreclaimable_b = 0; -- 2.51.0 From 0ccf1806d44f7c567c73d6ef076a8f6c8c16c74b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 14 May 2025 11:41:57 -0700 Subject: [PATCH 15/16] memcg: no stock lock for cpu hot-unplug Previously on the cpu hot-unplug, the kernel would call drain_obj_stock() with objcg local lock. However local lock was not needed as the stock which was accessed belongs to a dead cpu but we kept it there to disable irqs as drain_obj_stock() may call mod_objcg_mlstate() which required irqs disabled. However there is no need to disable irqs now for mod_objcg_mlstate(), so we can remove the local lock altogether from cpu hot-unplug path. Link: https://lkml.kernel.org/r/20250514184158.3471331-7-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f66cacb6f2a0..d8508b57d0fa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2023,17 +2023,8 @@ void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { - struct obj_stock_pcp *obj_st; - unsigned long flags; - - obj_st = &per_cpu(obj_stock, cpu); - - /* drain_obj_stock requires objstock.lock */ - local_lock_irqsave(&obj_stock.lock, flags); - drain_obj_stock(obj_st); - local_unlock_irqrestore(&obj_stock.lock, flags); - /* no need for the local lock */ + drain_obj_stock(&per_cpu(obj_stock, cpu)); drain_stock_fully(&per_cpu(memcg_stock, cpu)); return 0; -- 2.51.0 From 200577f69f29a58c90c67c83a0df6d12850e1d09 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 14 May 2025 11:41:58 -0700 Subject: [PATCH 16/16] memcg: objcg stock trylock without irq disabling There is no need to disable irqs to use objcg per-cpu stock, so let's just not do that but consume_obj_stock() and refill_obj_stock() will need to use trylock instead to avoid deadlock against irq. One consequence of this change is that the charge request from irq context may take slowpath more often but it should be rare. 
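The trylock-and-fallback shape this relies on can be pictured with a user-space analogy. The sketch below is not the memcg code; it is a toy model (the ObjStock class and the slowpath callback are invented for illustration) of "if the stock lock cannot be taken without blocking, bypass the per-CPU cache and charge via the slow path".

    import threading

    class ObjStock:
        def __init__(self):
            self.lock = threading.Lock()
            self.cached_bytes = 0

        def consume(self, nr_bytes, slowpath):
            # Never block on the stock lock: in the kernel the failing case
            # is an irq that interrupted the lock holder on the same CPU,
            # where blocking would deadlock.  Fall back to the slow path.
            if not self.lock.acquire(blocking=False):
                return slowpath(nr_bytes)
            try:
                if self.cached_bytes >= nr_bytes:
                    self.cached_bytes -= nr_bytes
                    return True
                return slowpath(nr_bytes)
            finally:
                self.lock.release()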
Link: https://lkml.kernel.org/r/20250514184158.3471331-8-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Alexei Starovoitov Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Sebastian Andrzej Siewior Signed-off-by: Andrew Morton --- mm/memcontrol.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d8508b57d0fa..35db91fddd1f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1880,18 +1880,17 @@ static void drain_local_memcg_stock(struct work_struct *dummy) static void drain_local_obj_stock(struct work_struct *dummy) { struct obj_stock_pcp *stock; - unsigned long flags; if (WARN_ONCE(!in_task(), "drain in non-task context")) return; - local_lock_irqsave(&obj_stock.lock, flags); + local_lock(&obj_stock.lock); stock = this_cpu_ptr(&obj_stock); drain_obj_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - local_unlock_irqrestore(&obj_stock.lock, flags); + local_unlock(&obj_stock.lock); } static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) @@ -2874,10 +2873,10 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, struct pglist_data *pgdat, enum node_stat_item idx) { struct obj_stock_pcp *stock; - unsigned long flags; bool ret = false; - local_lock_irqsave(&obj_stock.lock, flags); + if (!local_trylock(&obj_stock.lock)) + return ret; stock = this_cpu_ptr(&obj_stock); if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) { @@ -2888,7 +2887,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, __account_obj_stock(objcg, stock, nr_bytes, pgdat, idx); } - local_unlock_irqrestore(&obj_stock.lock, flags); + local_unlock(&obj_stock.lock); return ret; } @@ -2977,10 +2976,16 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, enum node_stat_item idx) { struct obj_stock_pcp *stock; - unsigned long flags; unsigned int nr_pages = 0; - local_lock_irqsave(&obj_stock.lock, flags); + if (!local_trylock(&obj_stock.lock)) { + if (pgdat) + mod_objcg_mlstate(objcg, pgdat, idx, nr_bytes); + nr_pages = nr_bytes >> PAGE_SHIFT; + nr_bytes = nr_bytes & (PAGE_SIZE - 1); + atomic_add(nr_bytes, &objcg->nr_charged_bytes); + goto out; + } stock = this_cpu_ptr(&obj_stock); if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */ @@ -3002,8 +3007,8 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, stock->nr_bytes &= (PAGE_SIZE - 1); } - local_unlock_irqrestore(&obj_stock.lock, flags); - + local_unlock(&obj_stock.lock); +out: if (nr_pages) obj_cgroup_uncharge_pages(objcg, nr_pages); } -- 2.51.0