From 68e6b7d98bc64bbf1a54d963ca85111432f3a0b4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 20 Jan 2025 12:56:10 +0100 Subject: [PATCH 01/16] samples/vfs: fix build warnings Fix build warnings reported from linux-next. Reported-by: Stephen Rothwell Link: https://lore.kernel.org/r/20250120192504.4a1965a0@canb.auug.org.au Signed-off-by: Christian Brauner --- samples/vfs/test-list-all-mounts.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/samples/vfs/test-list-all-mounts.c b/samples/vfs/test-list-all-mounts.c index 23d028881263..1a02ea4593e3 100644 --- a/samples/vfs/test-list-all-mounts.c +++ b/samples/vfs/test-list-all-mounts.c @@ -5,6 +5,7 @@ #include #include #include +#include <inttypes.h> #include #include "../../tools/testing/selftests/pidfd/pidfd.h" @@ -86,8 +87,8 @@ int main(int argc, char *argv[]) if (ret < 0) die_errno("ioctl(NS_GET_MNTNS_ID) failed"); - printf("Listing %u mounts for mount namespace %llu\n", - info.nr_mounts, info.mnt_ns_id); + printf("Listing %u mounts for mount namespace %" PRIu64 "\n", + info.nr_mounts, (uint64_t)info.mnt_ns_id); for (;;) { ssize_t nr_mounts; next: @@ -97,8 +98,8 @@ next: if (nr_mounts <= 0) { int fd_mntns_next; - printf("Finished listing %u mounts for mount namespace %llu\n\n", - info.nr_mounts, info.mnt_ns_id); + printf("Finished listing %u mounts for mount namespace %" PRIu64 "\n\n", + info.nr_mounts, (uint64_t)info.mnt_ns_id); fd_mntns_next = ioctl(fd_mntns, NS_MNT_GET_NEXT, &info); if (fd_mntns_next < 0) { if (errno == ENOENT) { @@ -110,8 +111,8 @@ next: close(fd_mntns); fd_mntns = fd_mntns_next; last_mnt_id = 0; - printf("Listing %u mounts for mount namespace %llu\n", - info.nr_mounts, info.mnt_ns_id); + printf("Listing %u mounts for mount namespace %" PRIu64 "\n", + info.nr_mounts, (uint64_t)info.mnt_ns_id); goto next; } @@ -129,14 +130,14 @@ next: STATMOUNT_MNT_OPTS | STATMOUNT_FS_TYPE, 0); if (!stmnt) { - printf("Failed to statmount(%llu) in mount namespace(%llu)\n",
- last_mnt_id, info.mnt_ns_id); + printf("Failed to statmount(%" PRIu64 ") in mount namespace(%" PRIu64 ")\n", + (uint64_t)last_mnt_id, (uint64_t)info.mnt_ns_id); continue; } - printf("mnt_id:\t\t%llu\nmnt_parent_id:\t%llu\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n\n", - stmnt->mnt_id, - stmnt->mnt_parent_id, + printf("mnt_id:\t\t%" PRIu64 "\nmnt_parent_id:\t%" PRIu64 "\nfs_type:\t%s\nmnt_root:\t%s\nmnt_point:\t%s\nmnt_opts:\t%s\n\n", + (uint64_t)stmnt->mnt_id, + (uint64_t)stmnt->mnt_parent_id, stmnt->str + stmnt->fs_type, stmnt->str + stmnt->mnt_root, stmnt->str + stmnt->mnt_point, -- 2.50.1 From 027ea4f5f2c814b703adabdd42b779cd98e24411 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Dec 2024 13:07:51 -0800 Subject: [PATCH 02/16] x86: use proper 'clac' and 'stac' opcode names Back when we added SMAP support, all versions of binutils didn't necessarily understand the 'clac' and 'stac' instructions. So we implemented those instructions manually as ".byte" sequences. But we've since upgraded the minimum version of binutils to version 2.25, and that included proper support for the SMAP instructions, and there's no reason for us to use some line noise to express them any more. 
Signed-off-by: Linus Torvalds --- arch/x86/include/asm/smap.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index bab490379c65..2de1e5a75c57 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -13,30 +13,26 @@ #include #include -/* "Raw" instruction opcodes */ -#define __ASM_CLAC ".byte 0x0f,0x01,0xca" -#define __ASM_STAC ".byte 0x0f,0x01,0xcb" - #ifdef __ASSEMBLY__ #define ASM_CLAC \ - ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP + ALTERNATIVE "", "clac", X86_FEATURE_SMAP #define ASM_STAC \ - ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP + ALTERNATIVE "", "stac", X86_FEATURE_SMAP #else /* __ASSEMBLY__ */ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative("", __ASM_CLAC, X86_FEATURE_SMAP); + alternative("", "clac", X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative("", __ASM_STAC, X86_FEATURE_SMAP); + alternative("", "stac", X86_FEATURE_SMAP); } static __always_inline unsigned long smap_save(void) @@ -44,7 +40,7 @@ static __always_inline unsigned long smap_save(void) unsigned long flags; asm volatile ("# smap_save\n\t" - ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC "\n\t", + ALTERNATIVE("", "pushf; pop %0; " "clac" "\n\t", X86_FEATURE_SMAP) : "=rm" (flags) : : "memory", "cc"); @@ -61,9 +57,9 @@ static __always_inline void smap_restore(unsigned long flags) /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE("", __ASM_CLAC, X86_FEATURE_SMAP) + ALTERNATIVE("", "clac", X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP) + ALTERNATIVE("", "stac", X86_FEATURE_SMAP) #endif /* __ASSEMBLY__ */ -- 2.50.1 From 91309a70829d94c735c8bb1cc383e78c96127a16 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 10 Dec 2024 10:25:04 -0800 Subject: [PATCH 03/16] x86: 
use cmov for user address masking MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This was a suggestion by David Laight, and while I was slightly worried that some micro-architecture would predict cmov like a conditional branch, there is little reason to actually believe any core would be that broken. Intel documents that their existing cores treat CMOVcc as a data dependency that will constrain speculation in their "Speculative Execution Side Channel Mitigations" whitepaper: "Other instructions such as CMOVcc, AND, ADC, SBB and SETcc can also be used to prevent bounds check bypass by constraining speculative execution on current family 6 processors (Intel® Core™, Intel® Atom™, Intel® Xeon® and Intel® Xeon Phi™ processors)" and while that leaves the future uarch issues open, that's certainly true of our traditional SBB usage too. Any core that predicts CMOV will be unusable for various crypto algorithms that need data-independent timing stability, so let's just treat CMOV as the safe choice that simplifies the address masking by avoiding an extra instruction and doesn't need a temporary register. 
Suggested-by: David Laight Link: https://www.intel.com/content/dam/develop/external/us/en/documents/336996-speculative-execution-side-channel-mitigations.pdf Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_64.h | 12 ++++++------ arch/x86/lib/getuser.S | 5 ++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index b0a887209400..c52f0133425b 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -63,13 +63,13 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, */ static inline void __user *mask_user_address(const void __user *ptr) { - unsigned long mask; + void __user *ret; asm("cmp %1,%0\n\t" - "sbb %0,%0" - :"=r" (mask) - :"r" (ptr), - "0" (runtime_const_ptr(USER_PTR_MAX))); - return (__force void __user *)(mask | (__force unsigned long)ptr); + "cmova %1,%0" + :"=r" (ret) + :"r" (runtime_const_ptr(USER_PTR_MAX)), + "0" (ptr)); + return ret; } #define masked_user_access_begin(x) ({ \ __auto_type __masked_ptr = (x); \ diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 4357ec2a0bfc..89ecd57c9d42 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -44,9 +44,8 @@ .pushsection runtime_ptr_USER_PTR_MAX,"a" .long 1b - 8 - . 
.popsection - cmp %rax, %rdx - sbb %rdx, %rdx - or %rdx, %rax + cmp %rdx, %rax + cmova %rdx, %rax .else cmp $TASK_SIZE_MAX-\size+1, %eax jae .Lbad_get_user -- 2.50.1 From 4a6780a30e86cde7756954981db9e6aec285793d Mon Sep 17 00:00:00 2001 From: Haorui He Date: Sun, 12 Jan 2025 22:49:20 +0800 Subject: [PATCH 04/16] cgroup: update comment about dropping cgroup kn refs the cgroup is actually freed in css_free_rwork_fn() now the ref count of the cgroup's kernfs_node is also dropped there so we need to update the corresponding comment in cgroup_mkdir() Signed-off-by: Haorui He Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d9061bd55436..805764cf14e2 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5835,7 +5835,7 @@ int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) } /* - * This extra ref will be put in cgroup_free_fn() and guarantees + * This extra ref will be put in css_free_rwork_fn() and guarantees * that @cgrp->kn is always accessible. */ kernfs_get(cgrp->kn); -- 2.50.1 From dae68fba8e115fd84d820354f79da1481135acbd Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Mon, 20 Jan 2025 15:57:49 +0100 Subject: [PATCH 05/16] cgroup/cpuset: Move procfs cpuset attribute under cgroup-v1.c MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The cpuset file is a legacy attribute that is bound primarily to cpuset v1 hierarchy (equivalent information is available in /proc/$pid/cgroup path on the unified hierarchy in conjunction with respective cgroup.controllers showing where cpuset controller is enabled). Followup to commit b0ced9d378d49 ("cgroup/cpuset: move v1 interfaces to cpuset-v1.c") and hide CONFIG_PROC_PID_CPUSET under CONFIG_CPUSETS_V1. Drop an obsolete comment too. 
Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- init/Kconfig | 5 +++-- kernel/cgroup/cpuset-v1.c | 41 +++++++++++++++++++++++++++++++++++ kernel/cgroup/cpuset.c | 45 --------------------------------------- 3 files changed, 44 insertions(+), 47 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index a20e6efd3f0f..2f3121c49ed2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1182,7 +1182,8 @@ config CPUSETS_V1 help Legacy cgroup v1 cpusets controller which has been deprecated by cgroup v2 implementation. The v1 is there for legacy applications - which haven't migrated to the new cgroup v2 interface yet. If you + which haven't migrated to the new cgroup v2 interface yet. Legacy + interface includes cpuset filesystem and /proc/<pid>/cpuset. If you do not have any such application then you are completely fine leaving this option disabled. @@ -1190,7 +1191,7 @@ config CPUSETS_V1 config PROC_PID_CPUSET bool "Include legacy /proc/<pid>/cpuset file" - depends on CPUSETS + depends on CPUSETS_V1 default y config CGROUP_DEVICE diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 25c1d7b77e2f..81b5e2a50d58 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include "cgroup-internal.h" #include "cpuset-internal.h" /* @@ -373,6 +374,46 @@ out: return ret; } +#ifdef CONFIG_PROC_PID_CPUSET +/* + * proc_cpuset_show() + * - Print tasks cpuset path into seq_file. + * - Used for /proc/<pid>/cpuset.
+ */ +int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + char *buf; + struct cgroup_subsys_state *css; + int retval; + + retval = -ENOMEM; + buf = kmalloc(PATH_MAX, GFP_KERNEL); + if (!buf) + goto out; + + rcu_read_lock(); + spin_lock_irq(&css_set_lock); + css = task_css(tsk, cpuset_cgrp_id); + retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + spin_unlock_irq(&css_set_lock); + rcu_read_unlock(); + + if (retval == -E2BIG) + retval = -ENAMETOOLONG; + if (retval < 0) + goto out_free; + seq_puts(m, buf); + seq_putc(m, '\n'); + retval = 0; +out_free: + kfree(buf); +out: + return retval; +} +#endif /* CONFIG_PROC_PID_CPUSET */ + static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct cpuset *cs = css_cs(css); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0f910c828973..5a637292faa2 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -21,7 +21,6 @@ * License. See the file COPYING in the main directory of the Linux * distribution for more details. */ -#include "cgroup-internal.h" #include "cpuset-internal.h" #include @@ -4244,50 +4243,6 @@ void cpuset_print_current_mems_allowed(void) rcu_read_unlock(); } -#ifdef CONFIG_PROC_PID_CPUSET -/* - * proc_cpuset_show() - * - Print tasks cpuset path into seq_file. - * - Used for /proc/<pid>/cpuset. - * - No need to task_lock(tsk) on this tsk->cpuset reference, as it - * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it - * anyway.
- */ -int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - char *buf; - struct cgroup_subsys_state *css; - int retval; - - retval = -ENOMEM; - buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!buf) - goto out; - - rcu_read_lock(); - spin_lock_irq(&css_set_lock); - css = task_css(tsk, cpuset_cgrp_id); - retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX, - current->nsproxy->cgroup_ns); - spin_unlock_irq(&css_set_lock); - rcu_read_unlock(); - - if (retval == -E2BIG) - retval = -ENAMETOOLONG; - if (retval < 0) - goto out_free; - seq_puts(m, buf); - seq_putc(m, '\n'); - retval = 0; -out_free: - kfree(buf); -out: - return retval; -} -#endif /* CONFIG_PROC_PID_CPUSET */ - /* Display task mems_allowed in /proc/<pid>/status file. */ void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { -- 2.50.1 From ad6c08d8c1045843ec564a73981ece6ec31d11a0 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 27 Jan 2025 23:52:14 +0000 Subject: [PATCH 06/16] cgroup/misc: Remove unused misc_cg_res_total_usage MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit misc_cg_res_total_usage() was added in 2021 by commit a72232eabdfc ("cgroup: Add misc cgroup controller") but has remained unused. Remove it. Signed-off-by: Dr.
David Alan Gilbert Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- include/linux/misc_cgroup.h | 6 ------ kernel/cgroup/misc.c | 16 ---------------- 2 files changed, 22 deletions(-) diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 49eef10c8e59..4bf261d41a6d 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -60,7 +60,6 @@ struct misc_cg { struct misc_res res[MISC_CG_RES_TYPES]; }; -u64 misc_cg_res_total_usage(enum misc_res_type type); int misc_cg_set_capacity(enum misc_res_type type, u64 capacity); int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount); void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount); @@ -104,11 +103,6 @@ static inline void put_misc_cg(struct misc_cg *cg) #else /* !CONFIG_CGROUP_MISC */ -static inline u64 misc_cg_res_total_usage(enum misc_res_type type) -{ - return 0; -} - static inline int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) { return 0; diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 0e26068995a6..2fa3a4fb2aaf 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -67,22 +67,6 @@ static inline bool valid_type(enum misc_res_type type) return type >= 0 && type < MISC_CG_RES_TYPES; } -/** - * misc_cg_res_total_usage() - Get the current total usage of the resource. - * @type: misc res type. - * - * Context: Any context. - * Return: Current total usage of the resource. - */ -u64 misc_cg_res_total_usage(enum misc_res_type type) -{ - if (valid_type(type)) - return atomic64_read(&root_cg.res[type].usage); - - return 0; -} -EXPORT_SYMBOL_GPL(misc_cg_res_total_usage); - /** * misc_cg_set_capacity() - Set the capacity of the misc cgroup res. * @type: Type of the misc res. 
-- 2.50.1 From c4af66a95aa3bc1d4f607ebd4eea524fb58946e3 Mon Sep 17 00:00:00 2001 From: Abel Wu Date: Sun, 9 Feb 2025 14:13:11 +0800 Subject: [PATCH 07/16] cgroup/rstat: Fix forceidle time in cpu.stat The commit b824766504e4 ("cgroup/rstat: add force idle show helper") retrieves forceidle_time outside cgroup_rstat_lock for non-root cgroups which can be potentially inconsistent with other stats. Rather than reverting that commit, fix it in a way that retains the effort of cleaning up the ifdef-messes. Fixes: b824766504e4 ("cgroup/rstat: add force idle show helper") Signed-off-by: Abel Wu Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 5877974ece92..c2784c317cdd 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -613,36 +613,33 @@ static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; - u64 usage, utime, stime, ntime; + struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { cgroup_rstat_flush_hold(cgrp); - usage = cgrp->bstat.cputime.sum_exec_runtime; + bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, - &utime, &stime); - ntime = cgrp->bstat.ntime; + &bstat.cputime.utime, &bstat.cputime.stime); cgroup_rstat_flush_release(cgrp); } else { - /* cgrp->bstat of root is not actually used, reuse it */ - root_cgroup_cputime(&cgrp->bstat); - usage = cgrp->bstat.cputime.sum_exec_runtime; - utime = cgrp->bstat.cputime.utime; - stime = cgrp->bstat.cputime.stime; - ntime = cgrp->bstat.ntime; + root_cgroup_cputime(&bstat); } - do_div(usage, NSEC_PER_USEC); - do_div(utime, NSEC_PER_USEC); - do_div(stime, NSEC_PER_USEC); - do_div(ntime, NSEC_PER_USEC); + do_div(bstat.cputime.sum_exec_runtime, NSEC_PER_USEC); + do_div(bstat.cputime.utime, NSEC_PER_USEC); + 
do_div(bstat.cputime.stime, NSEC_PER_USEC); + do_div(bstat.ntime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n" "nice_usec %llu\n", - usage, utime, stime, ntime); + bstat.cputime.sum_exec_runtime, + bstat.cputime.utime, + bstat.cputime.stime, + bstat.ntime); - cgroup_force_idle_show(seq, &cgrp->bstat); + cgroup_force_idle_show(seq, &bstat); } /* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ -- 2.50.1 From c7461cca916756a017f584126b8be73e58d55e53 Mon Sep 17 00:00:00 2001 From: Shashank Balaji Date: Wed, 5 Mar 2025 13:12:43 +0900 Subject: [PATCH 08/16] cgroup, docs: Be explicit about independence of RT_GROUP_SCHED and non-cpu controllers MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The cgroup v2 cpu controller has a limitation that if CONFIG_RT_GROUP_SCHED is enabled, the cpu controller can be enabled only if all the realtime processes are in the root cgroup. The other controllers have no such restriction. They can be used for the resource control of realtime processes irrespective of whether CONFIG_RT_GROUP_SCHED is enabled or not. Signed-off-by: Shashank Balaji Acked-by: Waiman Long Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v2.rst | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 315ede811c9d..7e61288430b7 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1075,15 +1075,20 @@ cpufreq governor about the minimum desired frequency which should always be provided by a CPU, as well as the maximum desired frequency, which should not be exceeded by a CPU. -WARNING: cgroup2 doesn't yet support control of realtime processes. 
For -a kernel built with the CONFIG_RT_GROUP_SCHED option enabled for group -scheduling of realtime processes, the cpu controller can only be enabled -when all RT processes are in the root cgroup. This limitation does -not apply if CONFIG_RT_GROUP_SCHED is disabled. Be aware that system -management software may already have placed RT processes into nonroot -cgroups during the system boot process, and these processes may need -to be moved to the root cgroup before the cpu controller can be enabled -with a CONFIG_RT_GROUP_SCHED enabled kernel. +WARNING: cgroup2 cpu controller doesn't yet fully support the control of +realtime processes. For a kernel built with the CONFIG_RT_GROUP_SCHED option +enabled for group scheduling of realtime processes, the cpu controller can only +be enabled when all RT processes are in the root cgroup. Be aware that system +management software may already have placed RT processes into non-root cgroups +during the system boot process, and these processes may need to be moved to the +root cgroup before the cpu controller can be enabled with a +CONFIG_RT_GROUP_SCHED enabled kernel. + +With CONFIG_RT_GROUP_SCHED disabled, this limitation does not apply and some of +the interface files either affect realtime processes or account for them. See +the following section for details. Only the cpu controller is affected by +CONFIG_RT_GROUP_SCHED. Other controllers can be used for the resource control of +realtime processes irrespective of CONFIG_RT_GROUP_SCHED. CPU Interface Files -- 2.50.1 From a0dd846257af0b272488032df215939f48cefc75 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:18 +0100 Subject: [PATCH 09/16] cgroup/cpuset-v1: Add deprecation messages to sched_load_balance and memory_pressure_enabled MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit These two v1 features have analogues in cgroup v2.
Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-v1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 81b5e2a50d58..7d310d8bb1ad 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -471,12 +471,14 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); break; case FILE_SCHED_LOAD_BALANCE: + pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name); retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: + pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name); cpuset_memory_pressure_enabled = !!val; break; case FILE_SPREAD_PAGE: -- 2.50.1 From 012c419f8d248fc92915a6d0998802ccd15cded6 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:19 +0100 Subject: [PATCH 10/16] cgroup/cpuset-v1: Add deprecation messages to memory_spread_page and memory_spread_slab MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit There is MPOL_INTERLEAVE for user explicit allocations. Deprecate spreading of allocations that users carry out unwittingly. Use straight warning level for slab spreading since such a knob is unnecessarily intertwined with slab allocator. 
Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-v1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 7d310d8bb1ad..fcf3fdac21a2 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -482,9 +482,11 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, cpuset_memory_pressure_enabled = !!val; break; case FILE_SPREAD_PAGE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val); break; case FILE_SPREAD_SLAB: + pr_warn_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val); break; default: -- 2.50.1 From 77bbb259db53ab5f0c971cf0180c28c897a06b9f Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:20 +0100 Subject: [PATCH 11/16] cgroup/blkio: Add deprecation messages to reset_stats MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit It is difficult to sync with stat updaters, stats are (should be) monotonic so users can calculate differences from a reference. 
Signed-off-by: Michal Koutný Acked-by: Jens Axboe Signed-off-by: Tejun Heo --- block/blk-cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 45a395862fbc..b3e5184b10d5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -659,6 +659,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct blkcg_gq *blkg; int i; + pr_info_once("blkio.%s is deprecated\n", cftype->name); mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); -- 2.50.1 From a0ab1453226d862cf30fdccc5a8e753f79c5bc99 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:21 +0100 Subject: [PATCH 12/16] cgroup: Print message when /proc/cgroups is read on v2-only system MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit As a followup to commits 6c2920926b10e ("cgroup: replace unified-hierarchy.txt with a proper cgroup v2 documentation") and ab03125268679 ("cgroup: Show # of subsystem CSSes in cgroup.stat"), add a runtime message to users who read status of controllers in /proc/cgroups on v2-only system. The detection is based on a) no controllers are attached to v1, b) default hierarchy is mounted (the latter is for setups that never mount v2 but read /proc/cgroups upon boot when controllers default to v2, so that this code may be backported to older kernels). 
Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 1 + kernel/cgroup/cgroup-v1.c | 7 +++++++ kernel/cgroup/cgroup.c | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index c964dd7ff967..95ab39e1ec8f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -168,6 +168,7 @@ struct cgroup_mgctx { extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; +extern bool cgrp_dfl_visible; /* iterate across the hierarchies */ #define for_each_root(root) \ diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index e28d5f0d20ed..11ea8d24ac72 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -673,6 +673,7 @@ struct cftype cgroup1_base_files[] = { int proc_cgroupstats_show(struct seq_file *m, void *v) { struct cgroup_subsys *ss; + bool cgrp_v1_visible = false; int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); @@ -684,12 +685,18 @@ int proc_cgroupstats_show(struct seq_file *m, void *v) for_each_subsys(ss, i) { if (cgroup1_subsys_absent(ss)) continue; + cgrp_v1_visible |= ss->root != &cgrp_dfl_root; + seq_printf(m, "%s\t%d\t%d\t%d\n", ss->legacy_name, ss->root->hierarchy_id, atomic_read(&ss->root->nr_cgrps), cgroup_ssid_enabled(i)); } + if (cgrp_dfl_visible && !cgrp_v1_visible) + pr_info_once("/proc/cgroups lists only v1 controllers, use cgroup.controllers of root cgroup for v2 info\n"); + + return 0; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 805764cf14e2..a810952d75c8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -171,7 +171,7 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. 
*/ -static bool cgrp_dfl_visible; +bool cgrp_dfl_visible; /* some controllers are not supported in the default hierarchy */ static u16 cgrp_dfl_inhibit_ss_mask; -- 2.50.1 From 313819279289672ddf765331aca143d945ac19d5 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:22 +0100 Subject: [PATCH 13/16] cgroup/cpuset-v1: Add deprecation messages to mem_exclusive and mem_hardwall MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The concept of exclusive memory affinity may require complex approaches like with cpuset v2 cpu partitions. There is so far no implementation in cpuset v2. Specific kernel memory affinity may cause unintended (global) bottlenecks like kmem limits. Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-v1.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index fcf3fdac21a2..620e07ffd61f 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -465,9 +465,11 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val); break; case FILE_MEM_HARDWALL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val); break; case FILE_SCHED_LOAD_BALANCE: -- 2.50.1 From db4dc20c1140ab56dc31a73c4b48a50fad7a5634 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:23 +0100 Subject: [PATCH 14/16] cgroup/cpuset-v1: Add deprecation messages to memory_migrate MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Memory migration (between cgroups) was given up in v2 due to performance reasons of its implementation. 
Migration between NUMA nodes within one memcg may still make sense to modify affinity at runtime though. Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-v1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index 620e07ffd61f..a1bbbd345041 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -477,6 +477,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: -- 2.50.1 From 103149a0632eca460242ef01050f08a5050a32ce Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:24 +0100 Subject: [PATCH 15/16] RFC cgroup/cpuset-v1: Add deprecation messages to sched_relax_domain_level MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit This is not a properly hierarchical resource, it might be better implemented based on a sched_attr. 
Cc: Hidetoshi Seto Signed-off-by: Michal Koutný Acked-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset-v1.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cgroup/cpuset-v1.c b/kernel/cgroup/cpuset-v1.c index a1bbbd345041..b69a7db67090 100644 --- a/kernel/cgroup/cpuset-v1.c +++ b/kernel/cgroup/cpuset-v1.c @@ -176,6 +176,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: + pr_info_once("cpuset.%s is deprecated\n", cft->name); retval = update_relax_domain_level(cs, val); break; default: -- 2.50.1 From fd4fd0a869e969a97753986b107729d4bb56525b Mon Sep 17 00:00:00 2001 From: =?utf8?q?Michal=20Koutn=C3=BD?= Date: Tue, 11 Mar 2025 13:36:25 +0100 Subject: [PATCH 16/16] mm: Add transformation message for per-memcg swappiness MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit The concept of per-memcg swappiness has never landed well in memcg for cgroup v2. Add a message to users who use it on v1 hierarchy. Decreased swappiness transforms to memory.swap.max=0 whereas increased swappiness transforms into active memory.reclaim operation. Link: https://lore.kernel.org/r/1577252208-32419-1-git-send-email-teawater@gmail.com/ Signed-off-by: Michal Koutný Signed-off-by: Tejun Heo --- Documentation/admin-guide/cgroup-v1/memory.rst | 1 + mm/memcontrol-v1.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 286d16fc22eb..02b8206a3594 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -90,6 +90,7 @@ Brief summary of control files. used. memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) + Per memcg knob does not exist in cgroup v2. memory.move_charge_at_immigrate This knob is deprecated. 
memory.oom_control set/show oom controls. This knob is deprecated and shouldn't be diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index a071fa43d479..93291c0e6eac 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -1855,9 +1855,11 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, if (val > MAX_SWAPPINESS) return -EINVAL; - if (!mem_cgroup_is_root(memcg)) + if (!mem_cgroup_is_root(memcg)) { + pr_info_once("Per memcg swappiness does not exist in cgroup v2. " + "See memory.reclaim or memory.swap.max there\n "); WRITE_ONCE(memcg->swappiness, val); - else + } else WRITE_ONCE(vm_swappiness, val); return 0; -- 2.50.1