From c4af66a95aa3bc1d4f607ebd4eea524fb58946e3 Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Sun, 9 Feb 2025 14:13:11 +0800 Subject: [PATCH 01/16] cgroup/rstat: Fix forceidle time in cpu.stat The commit b824766504e4 ("cgroup/rstat: add force idle show helper") retrieves forceidle_time outside cgroup_rstat_lock for non-root cgroups which can be potentially inconsistent with other stats. Rather than reverting that commit, fix it in a way that retains the effort of cleaning up the ifdef-messes. Fixes: b824766504e4 ("cgroup/rstat: add force idle show helper") Signed-off-by: Abel Wu Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 5877974ece92..c2784c317cdd 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -613,36 +613,33 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; - u64 usage, utime, stime, ntime; + struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { cgroup_rstat_flush_hold(cgrp); - usage = cgrp->bstat.cputime.sum_exec_runtime; + bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, - &utime, &stime); - ntime = cgrp->bstat.ntime; + &bstat.cputime.utime, &bstat.cputime.stime); cgroup_rstat_flush_release(cgrp); } else { - /* cgrp->bstat of root is not actually used, reuse it */ - root_cgroup_cputime(&cgrp->bstat); - usage = cgrp->bstat.cputime.sum_exec_runtime; - utime = cgrp->bstat.cputime.utime; - stime = cgrp->bstat.cputime.stime; - ntime = cgrp->bstat.ntime; + root_cgroup_cputime(&bstat); } - do_div(usage, NSEC_PER_USEC); - do_div(utime, NSEC_PER_USEC); - do_div(stime, NSEC_PER_USEC); - do_div(ntime, NSEC_PER_USEC); + do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC); + do_div(bstat.cputime.utime, NSEC_PER_USEC); + do_div(bstat.cputime.stime, NSEC_PER_USEC); + do_div(bstat.ntime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n" "nice_usec %llu\n", - usage, utime, stime, ntime); + bstat.cputime.sum_exec_runtime, + bstat.cputime.utime, + bstat.cputime.stime, + bstat.ntime); - cgroup_force_idle_show(seq, &cgrp->bstat); + cgroup_force_idle_show(seq, &bstat); } /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ -- 2.51.0 From c7461cca916756a017f584126b8be73e58d55e53 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Wed, 5 Mar 2025 13:12:43 +0900 Subject: [PATCH 02/16] cgroup, docs: Be explicit about independence of RT_GROUP_SCHED and non-cpu controllers MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The cgroup v2 cpu controller has a limitation that if CONFIG_RT_GROUP_SCHED is enabled, the cpu controller can be enabled only if all the realtime processes are in the root cgroup. The other controllers have no such restriction. They can be used for the resource control of realtime processes irrespective of whether CONFIG_RT_GROUP_SCHED is enabled or not. Signed-off-by: Shashank Balaji Acked-by: Waiman Long Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 315ede811c9d..7e61288430b7 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1075,15 +1075,20 @@ cpufreq governor about the minimum desired frequency which should always be provided by a CPU, as well as the maximum desired frequency, which should not be exceeded by a CPU. -WARNING: cgroup2 doesn't yet support control of realtime processes. For -a kernel built with the CONFIG_RT_GROUP_SCHED option enabled for group -scheduling of realtime processes, the cpu controller can only be enabled -when all RT processes are in the root cgroup. This limitation does -not apply if CONFIG_RT_GROUP_SCHED is disabled. Be aware that system -management software may already have placed RT processes into nonroot -cgroups during the system boot process, and these processes may need -to be moved to the root cgroup before the cpu controller can be enabled -with a CONFIG_RT_GROUP_SCHED enabled kernel. +WARNING: cgroup2 cpu controller doesn't yet fully support the control of +realtime processes. For a kernel built with the CONFIG_RT_GROUP_SCHED option +enabled for group scheduling of realtime processes, the cpu controller can only +be enabled when all RT processes are in the root cgroup. Be aware that system +management software may already have placed RT processes into non-root cgroups +during the system boot process, and these processes may need to be moved to the +root cgroup before the cpu controller can be enabled with a +CONFIG_RT_GROUP_SCHED enabled kernel. + +With CONFIG_RT_GROUP_SCHED disabled, this limitation does not apply and some of +the interface files either affect realtime processes or account for them. See +the following section for details. Only the cpu controller is affected by +CONFIG_RT_GROUP_SCHED. Other controllers can be used for the resource control of +realtime processes irrespective of CONFIG_RT_GROUP_SCHED. CPU Interface Files -- 2.51.0 From a0dd846257af0b272488032df215939f48cefc75 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:18 +0100 Subject: [PATCH 03/16] cgroup/cpuset-v1: Add deprecation messages to sched_load_balance and memory_pressure_enabled MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit These two v1 feature have analogues in cgroup v2. Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-v1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 81b5e2a50d58..7d310d8bb1ad 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -471,12 +471,14 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); break; case FILE_SCHED_LOAD_BALANCE: + pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name); retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: + pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name); cpuset_memory_pressure_enabled = !!val; break; case FILE_SPREAD_PAGE: -- 2.51.0 From 012c419f8d248fc92915a6d0998802ccd15cded6 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:19 +0100 Subject: [PATCH 04/16] cgroup/cpuset-v1: Add deprecation messages to memory_spread_page and memory_spread_slab MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit There is MPOL_INTERLEAVE for user explicit allocations. Deprecate spreading of allocations that users carry out unwittingly. Use straight warning level for slab spreading since such a knob is unnecessarily intertwined with slab allocator. Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-v1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 7d310d8bb1ad..fcf3fdac21a2 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -482,9 +482,11 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_memory_pressure_enabled = !!val; break; case FILE_SPREAD_PAGE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); break; case FILE_SPREAD_SLAB: + pr_warn_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); break; default: -- 2.51.0 From 77bbb259db53ab5f0c971cf0180c28c897a06b9f Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:20 +0100 Subject: [PATCH 05/16] cgroup/blkio: Add deprecation messages to reset_stats MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit It is difficult to sync with stat updaters, stats are (should be) monotonic so users can calculate differences from a reference. Signed-off-by: Michal Koutný Acked-by: Jens Axboe Signed-off-by: Tejun Heo --- block/blk-cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 45a395862fbc..b3e5184b10d5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -659,6 +659,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct blkcg_gq *blkg; int i; + pr_info_once("blkio.%s is deprecated\n", cftype->name); mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); -- 2.51.0 From a0ab1453226d862cf30fdccc5a8e753f79c5bc99 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:21 +0100 Subject: [PATCH 06/16] cgroup: Print message when /proc/cgroups is read on v2-only system MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit As a followup to commits 6c2920926b10e ("cgroup: replace unified-hierarchy.txt with a proper cgroup v2 documentation") and ab03125268679 ("cgroup: Show # of subsystem CSSes in cgroup.stat"), add a runtime message to users who read status of controllers in /proc/cgroups on v2-only system. The detection is based on a) no controllers are attached to v1, b) default hierarchy is mounted (the latter is for setups that never mount v2 but read /proc/cgroups upon boot when controllers default to v2, so that this code may be backported to older kernels). Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 7 +++++++ kernel/cgroup/cgroup.c | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c964dd7ff967..95ab39e1ec8f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -168,6 +168,7 @@ struct cgroup_mgctx { extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; +extern bool cgrp_dfl_visible; /* iterate across the hierarchies */ #define for_each_root(root) \ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index e28d5f0d20ed..11ea8d24ac72 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -673,6 +673,7 @@ struct cftype cgroup1_base_files[] = { int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; + bool cgrp_v1_visible = false; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); @@ -684,12 +685,18 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) { if (cgroup1_subsys_absent(ss)) continue; + cgrp_v1_visible |= ss->root != &cgrp_dfl_root; + seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); } + if (cgrp_dfl_visible && !cgrp_v1_visible) + pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n"); + + return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 805764cf14e2..a810952d75c8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ -static bool cgrp_dfl_visible; +bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; -- 2.51.0 From 313819279289672ddf765331aca143d945ac19d5 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:22 +0100 Subject: [PATCH 07/16] cgroup/cpuset-v1: Add deprecation messages to mem_exclusive and mem_hardwall MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The concept of exclusive memory affinity may require complex approaches like with cpuset v2 cpu partitions. There is so far no implementation in cpuset v2. Specific kernel memory affinity may cause unintended (global) bottlenecks like kmem limits. Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-v1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index fcf3fdac21a2..620e07ffd61f 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -465,9 +465,11 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); break; case FILE_MEM_HARDWALL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); break; case FILE_SCHED_LOAD_BALANCE: -- 2.51.0 From db4dc20c1140ab56dc31a73c4b48a50fad7a5634 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:23 +0100 Subject: [PATCH 08/16] cgroup/cpuset-v1: Add deprecation messages to memory_migrate MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Memory migration (between cgroups) was given up in v2 due to performance reasons of its implementation. Migration between NUMA nodes within one memcg may still make sense to modify affinity at runtime though. Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-v1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 620e07ffd61f..a1bbbd345041 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -477,6 +477,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: -- 2.51.0 From 103149a0632eca460242ef01050f08a5050a32ce Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:24 +0100 Subject: [PATCH 09/16] RFC cgroup/cpuset-v1: Add deprecation messages to sched_relax_domain_level MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This is not a properly hierarchical resource, it might be better implemented based on a sched_attr. Cc: Hidetoshi Seto Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-v1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index a1bbbd345041..b69a7db67090 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -176,6 +176,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = update_relax_domain_level(cs, val); break; default: -- 2.51.0 From fd4fd0a869e969a97753986b107729d4bb56525b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:25 +0100 Subject: [PATCH 10/16] mm: Add transformation message for per-memcg swappiness MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The concept of per-memcg swappiness has never landed well in memcg for cgroup v2. Add a message to users who use it on v1 hierarchy. Decreased swappiness transforms to memory.swap.max=0 whereas increased swappiness transforms into active memory.reclaim operation. Link: https://lore.kernel.org/r/1577252208-32419-1-git-send-email-teawater@gmail.com/ Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v1/memory.rst | 1 + mm/memcontrol-v1.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 286d16fc22eb..02b8206a3594 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -90,6 +90,7 @@ Brief summary of control files. used. memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) + Per memcg knob does not exist in cgroup v2. memory.move_charge_at_immigrate This knob is deprecated. memory.oom_control set/show oom controls. This knob is deprecated and shouldn't be diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index a071fa43d479..93291c0e6eac 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -1855,9 +1855,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, if (val > MAX_SWAPPINESS) return -EINVAL; - if (!mem_cgroup_is_root(memcg)) + if (!mem_cgroup_is_root(memcg)) { + pr_info_once("Per memcg swappiness does not exist in cgroup v2. " + "See memory.reclaim or memory.swap.max there\n ") WRITE_ONCE(memcg->swappiness, val); - else + } else WRITE_ONCE(vm_swappiness, val); return 0; -- 2.51.0 From 78f6519ed0766e36f08954810ce05f19212242a6 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:26 +0100 Subject: [PATCH 11/16] cgroup: Add deprecation message to legacy freezer controller MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit As explained in the commit 76f969e8948d8 ("cgroup: cgroup v2 freezer"), the original freezer is imperfect, some users may unwittingly rely on it when there exists the alternative of v2. Print a message when it happens and explain that in the docs. Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst | 4 ++++ kernel/cgroup/legacy_freezer.c | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst index 582d3427de3f..a964aff373b1 100644 --- a/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst +++ b/Documentation/admin-guide/cgroup-v1/freezer-subsystem.rst @@ -125,3 +125,7 @@ to unfreeze all tasks in the container:: This is the basic mechanism which should do the right thing for user space task in a simple scenario. + +This freezer implementation is affected by shortcomings (see commit +76f969e8948d8 ("cgroup: cgroup v2 freezer")) and cgroup v2 freezer is +recommended. diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 074653f964c1..039d1eb2f215 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -430,9 +430,11 @@ static ssize_t freezer_write(struct kernfs_open_file *of, if (strcmp(buf, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) { + pr_info_once("Freezing with imperfect legacy cgroup freezer. " + "See cgroup.freeze of cgroup v2\n"); freeze = true; - else + } else return -EINVAL; freezer_change_state(css_freezer(of_css(of)), freeze); -- 2.51.0 From 76f9409f813c9dc24a22fa7a587a31983706c4d5 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:27 +0100 Subject: [PATCH 12/16] cgroup: Update file naming comment MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This changed long time ago in commit 8d7e6fb0a1db9 ("cgroup: update cgroup name handling"). Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1b20d2d8ef7c..d8d6fe24f375 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -619,9 +619,8 @@ struct cgroup_root { */ struct cftype { /* - * By convention, the name should begin with the name of the - * subsystem, followed by a period. Zero length string indicates - * end of cftype array. + * Name of the subsystem is prepended in cgroup_file_name(). + * Zero length string indicates end of cftype array. */ char name[MAX_CFTYPE_NAME]; unsigned long private; -- 2.51.0 From 4a893bdc18df087ab7dde28a4e8066ae39faa700 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:28 +0100 Subject: [PATCH 13/16] blk-cgroup: Simplify policy files registration MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Use one set of files when there is no difference between default and legacy files, similar to regular subsys files registration. No functional change. Signed-off-by: Michal Koutný Acked-by: Jens Axboe Signed-off-by: Tejun Heo --- block/blk-cgroup.c | 7 +++++-- block/blk-ioprio.c | 23 +++++++---------------- include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 2 +- 4 files changed, 14 insertions(+), 19 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index b3e5184b10d5..fa26b33b540e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1768,12 +1768,15 @@ int blkcg_policy_register(struct blkcg_policy *pol) mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->dfl_cftypes) + if (pol->dfl_cftypes == pol->legacy_cftypes) { + WARN_ON(cgroup_add_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); + } else { WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); - if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); + } mutex_unlock(&blkcg_pol_register_mutex); return 0; diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 8fff7ccc0ac7..13659dc15c3f 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -113,27 +113,18 @@ static void ioprio_free_cpd(struct blkcg_policy_data *cpd) kfree(blkcg); } -#define IOPRIO_ATTRS \ - { \ - .name = "prio.class", \ - .seq_show = ioprio_show_prio_policy, \ - .write = ioprio_set_prio_policy, \ - }, \ - { } /* sentinel */ - -/* cgroup v2 attributes */ static struct cftype ioprio_files[] = { - IOPRIO_ATTRS -}; - -/* cgroup v1 attributes */ -static struct cftype ioprio_legacy_files[] = { - IOPRIO_ATTRS + { + .name = "prio.class", + .seq_show = ioprio_show_prio_policy, + .write = ioprio_set_prio_policy, + }, + { } /* sentinel */ }; static struct blkcg_policy ioprio_policy = { .dfl_cftypes = ioprio_files, - .legacy_cftypes = ioprio_legacy_files, + .legacy_cftypes = ioprio_files, .cpd_alloc_fn = ioprio_alloc_cpd, .cpd_free_fn = ioprio_free_cpd, diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f8ef47f8a634..8e7415c64ed1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -113,6 +113,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a810952d75c8..66f79f798c25 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4451,7 +4451,7 @@ int cgroup_rm_cftypes(struct cftype *cfts) * function currently returns 0 as long as @cfts registration is successful * even if some file creation attempts on existing cgroups fail. */ -static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { int ret; -- 2.51.0 From b0543d50c4cbc34ca612ab17c3cc9f2d174be8e0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 11 Mar 2025 13:56:11 -1000 Subject: [PATCH 14/16] mm: Fix a build breakage in memcontrol-v1.c MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit While adding a deprecation message, fd4fd0a869e9 ("mm: Add transformation message for per-memcg swappiness") missed the semicolon after the new pr_info_once() statement causing build breakage when CONFIG_MEMCG_V1 is enabled. Fix it. Signed-off-by: Tejun Heo Cc: Michal Koutný Fixes: fd4fd0a869e9 ("mm: Add transformation message for per-memcg swappiness") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202503120710.guZkJx0h-lkp@intel.com/ --- mm/memcontrol-v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 93291c0e6eac..67e786d3b17c 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -1857,7 +1857,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, if (!mem_cgroup_is_root(memcg)) { pr_info_once("Per memcg swappiness does not exist in cgroup v2. " - "See memory.reclaim or memory.swap.max there\n ") + "See memory.reclaim or memory.swap.max there\n "); WRITE_ONCE(memcg->swappiness, val); } else WRITE_ONCE(vm_swappiness, val); -- 2.51.0 From 0efc297a3c4974dbd609ee36fc6345720b6ca735 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Mar 2025 00:13:30 -0700 Subject: [PATCH 15/16] cgroup/rstat: avoid disabling irqs for O(num_cpu) MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit cgroup_rstat_flush_locked() grabs the irq safe cgroup_rstat_lock while iterating all possible cpus. It only drops the lock if there is scheduler or spin lock contention. If neither, then interrupts can be disabled for a long time. On large machines this can disable interrupts for a long enough time to drop network packets. On 400+ CPU machines I've seen interrupt disabled for over 40 msec. Prevent rstat from disabling interrupts while processing all possible cpus. Instead drop and reacquire cgroup_rstat_lock for each cpu. This approach was previously discussed in https://lore.kernel.org/lkml/ZBz%2FV5a7%2F6PZeM7S@slm.duckdns.org/, though this was in the context of an non-irq rstat spin lock. Benchmark this change with: 1) a single stat_reader process with 400 threads, each reading a test memcg's memory.stat repeatedly for 10 seconds. 2) 400 memory hog processes running in the test memcg and repeatedly charging memory until oom killed. Then they repeat charging and oom killing. v6.14-rc6 with CONFIG_IRQSOFF_TRACER with stat_reader and hogs, finds interrupts are disabled by rstat for 45341 usec: # => started at: _raw_spin_lock_irq # => ended at: cgroup_rstat_flush # # # _------=> CPU# # / _-----=> irqs-off/BH-disabled # | / _----=> need-resched # || / _---=> hardirq/softirq # ||| / _--=> preempt-depth # |||| / _-=> migrate-disable # ||||| / delay # cmd pid |||||| time | caller # \ / |||||| \ | / stat_rea-96532 52d.... 0us*: _raw_spin_lock_irq stat_rea-96532 52d.... 45342us : cgroup_rstat_flush stat_rea-96532 52d.... 45342us : tracer_hardirqs_on <-cgroup_rstat_flush stat_rea-96532 52d.... 45343us : => memcg1_stat_format => memory_stat_format => memory_stat_show => seq_read_iter => vfs_read => ksys_read => do_syscall_64 => entry_SYSCALL_64_after_hwframe With this patch the CONFIG_IRQSOFF_TRACER doesn't find rstat to be the longest holder. The longest irqs-off holder has irqs disabled for 4142 usec, a huge reduction from previous 45341 usec rstat finding. Running stat_reader memory.stat reader for 10 seconds: - without memory hogs: 9.84M accesses => 12.7M accesses - with memory hogs: 9.46M accesses => 11.1M accesses The throughput of memory.stat access improves. The mode of memory.stat access latency after grouping by of 2 buckets: - without memory hogs: 64 usec => 16 usec - with memory hogs: 64 usec => 8 usec The memory.stat latency improves. Signed-off-by: Eric Dumazet Signed-off-by: Greg Thelen Tested-by: Greg Thelen Acked-by: Michal Koutný Reviewed-by: Yosry Ahmed Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index c2784c317cdd..2c1053e83945 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -323,13 +323,11 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) rcu_read_unlock(); } - /* play nice and yield if necessary */ - if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { - __cgroup_rstat_unlock(cgrp, cpu); - if (!cond_resched()) - cpu_relax(); - __cgroup_rstat_lock(cgrp, cpu); - } + /* play nice and avoid disabling interrupts for a long time */ + __cgroup_rstat_unlock(cgrp, cpu); + if (!cond_resched()) + cpu_relax(); + __cgroup_rstat_lock(cgrp, cpu); } } -- 2.51.0 From 093c8812de2d3436744fd10edab9bf816b94f833 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 19 Mar 2025 21:00:13 +0000 Subject: [PATCH 16/16] cgroup: rstat: Cleanup flushing functions and locking Now that the rstat lock is being re-acquired on every CPU iteration in cgroup_rstat_flush_locked(), having the initially acquire the lock is unnecessary and unclear. Inline cgroup_rstat_flush_locked() into cgroup_rstat_flush() and move the lock/unlock calls to the beginning and ending of the loop body to make the critical section obvious. cgroup_rstat_flush_hold/release() do not make much sense with the lock being dropped and reacquired internally. Since it has no external callers, remove it and explicitly acquire the lock in cgroup_base_stat_cputime_show() instead. This leaves the code with a single flushing function, cgroup_rstat_flush(). Signed-off-by: Yosry Ahmed Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 2 -- kernel/cgroup/rstat.c | 79 +++++++++++------------------------------- 2 files changed, 20 insertions(+), 61 deletions(-) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8e7415c64ed1..28e999f2c642 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -690,8 +690,6 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); -void cgroup_rstat_flush_hold(struct cgroup *cgrp); -void cgroup_rstat_flush_release(struct cgroup *cgrp); /* * Basic resource stats. diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 2c1053e83945..7831978a26bb 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -299,17 +299,29 @@ static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) spin_unlock_irq(&cgroup_rstat_lock); } -/* see cgroup_rstat_flush() */ -static void cgroup_rstat_flush_locked(struct cgroup *cgrp) - __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) +/** + * cgroup_rstat_flush - flush stats in @cgrp's subtree + * @cgrp: target cgroup + * + * Collect all per-cpu stats in @cgrp's subtree into the global counters + * and propagate them upwards. After this function returns, all cgroups in + * the subtree have up-to-date ->stat. + * + * This also gets all cgroups in the subtree including @cgrp off the + * ->updated_children lists. + * + * This function may block. + */ +__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) { int cpu; - lockdep_assert_held(&cgroup_rstat_lock); - + might_sleep(); for_each_possible_cpu(cpu) { struct cgroup *pos = cgroup_rstat_updated_list(cgrp, cpu); + /* Reacquire for each CPU to avoid disabling IRQs too long */ + __cgroup_rstat_lock(cgrp, cpu); for (; pos; pos = pos->rstat_flush_next) { struct cgroup_subsys_state *css; @@ -322,64 +334,12 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - - /* play nice and avoid disabling interrupts for a long time */ __cgroup_rstat_unlock(cgrp, cpu); if (!cond_resched()) cpu_relax(); - __cgroup_rstat_lock(cgrp, cpu); } } -/** - * cgroup_rstat_flush - flush stats in @cgrp's subtree - * @cgrp: target cgroup - * - * Collect all per-cpu stats in @cgrp's subtree into the global counters - * and propagate them upwards. After this function returns, all cgroups in - * the subtree have up-to-date ->stat. - * - * This also gets all cgroups in the subtree including @cgrp off the - * ->updated_children lists. - * - * This function may block. - */ -__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) -{ - might_sleep(); - - __cgroup_rstat_lock(cgrp, -1); - cgroup_rstat_flush_locked(cgrp); - __cgroup_rstat_unlock(cgrp, -1); -} - -/** - * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold - * @cgrp: target cgroup - * - * Flush stats in @cgrp's subtree and prevent further flushes. Must be - * paired with cgroup_rstat_flush_release(). - * - * This function may block. - */ -void cgroup_rstat_flush_hold(struct cgroup *cgrp) - __acquires(&cgroup_rstat_lock) -{ - might_sleep(); - __cgroup_rstat_lock(cgrp, -1); - cgroup_rstat_flush_locked(cgrp); -} - -/** - * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() - * @cgrp: cgroup used by tracepoint - */ -void cgroup_rstat_flush_release(struct cgroup *cgrp) - __releases(&cgroup_rstat_lock) -{ - __cgroup_rstat_unlock(cgrp, -1); -} - int cgroup_rstat_init(struct cgroup *cgrp) { int cpu; @@ -614,11 +574,12 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { - cgroup_rstat_flush_hold(cgrp); + cgroup_rstat_flush(cgrp); + __cgroup_rstat_lock(cgrp, -1); bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &bstat.cputime.utime, &bstat.cputime.stime); - cgroup_rstat_flush_release(cgrp); + __cgroup_rstat_unlock(cgrp, -1); } else { root_cgroup_cputime(&bstat); } -- 2.51.0