From a39326767c55c00c7c313333404cbcb502cce8fe Mon Sep 17 00:00:00 2001
From: =?utf8?q?Motiejus=20Jak=C5=A1tys?=
Date: Tue, 12 Nov 2024 19:16:55 +0200
Subject: [PATCH 01/16] tools/mm: fix compile error
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Add a missing semicolon.

Link: https://lkml.kernel.org/r/20241112171655.1662670-1-motiejus@jakstys.lt
Fixes: ece5897e5a10 ("tools/mm: -Werror fixes in page-types/slabinfo")
Signed-off-by: Motiejus Jakštys
Closes: https://github.com/NixOS/nixpkgs/issues/355369
Reviewed-by: SeongJae Park
Reviewed-by: Vishal Moola (Oracle)
Acked-by: Oleksandr Natalenko
Cc: Wladislav Wiebe
Signed-off-by: Andrew Morton
---
 tools/mm/page-types.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/mm/page-types.c b/tools/mm/page-types.c
index 6eb17cc1a06c..bcac7ebfb51f 100644
--- a/tools/mm/page-types.c
+++ b/tools/mm/page-types.c
@@ -420,7 +420,7 @@ static void show_page(unsigned long voffset, unsigned long offset,
 	if (opt_file)
 		printf("%lx\t", voffset);
 	if (opt_list_cgroup)
-		printf("@%" PRIu64 "\t", cgroup)
+		printf("@%" PRIu64 "\t", cgroup);
 	if (opt_list_mapcnt)
 		printf("%" PRIu64 "\t", mapcnt);
-- 
2.50.1

From a4a282daf1a190f03790bf163458ea3c8d28d217 Mon Sep 17 00:00:00 2001
From: Jann Horn
Date: Mon, 11 Nov 2024 20:34:30 +0100
Subject: [PATCH 02/16] mm/mremap: fix address wraparound in move_page_tables()

On 32-bit platforms, it is possible for the expression `len + old_addr <
old_end` to be false-positive if `len + old_addr` wraps around.
`old_addr` is the cursor in the old range up to which page table entries
have been moved; so if the operation succeeded, `old_addr` is the *end*
of the old region, and adding `len` to it can wrap.

The overflow causes mremap() to mistakenly believe that PTEs have been
copied; the consequence is that mremap() bails out, but doesn't move the
PTEs back before the new VMA is unmapped, causing anonymous pages in the
region to be lost.  So basically if userspace tries to mremap() a
private-anon region and hits this bug, mremap() will return an error and
the private-anon region's contents appear to have been zeroed.

The idea of this check is that `old_end - len` is the original start
address, and writing the check that way also makes it easier to read; so
fix the check by rearranging the comparison accordingly.

(An alternate fix would be to refactor this function by introducing an
"orig_old_start" variable or such.)
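To make the wraparound concrete, here is a minimal userspace sketch of the
two checks (an illustration only, not part of the patch; it borrows the
sizes from the reproducer below and assumes a fully successful copy, i.e.
`old_addr == old_end`):

```
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Sizes from the reproducer below, as 32-bit quantities. */
	uint32_t len      = 0x50000000u;
	uint32_t old_end  = 0x60000000u + len;	/* 0xb0000000 */
	uint32_t old_addr = old_end;		/* cursor after a full copy */

	/* Old check: len + old_addr wraps to 0, and 0 < 0xb0000000 is true,
	 * so the function would wrongly report that nothing was moved. */
	printf("old check: %d\n", (uint32_t)(len + old_addr) < old_end);	/* prints 1 */

	/* New check: old_end - len is the original start; nothing wraps. */
	printf("new check: %d\n", old_addr < old_end - len);			/* prints 0 */
	return 0;
}
```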
Tested in a VM with a 32-bit X86 kernel; without the patch:

```
user@horn:~/big_mremap$ cat test.c
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <sys/mman.h>

#define ADDR1 ((void*)0x60000000)
#define ADDR2 ((void*)0x10000000)
#define SIZE 0x50000000uL

int main(void)
{
	unsigned char *p1 = mmap(ADDR1, SIZE, PROT_READ|PROT_WRITE,
			MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED_NOREPLACE, -1, 0);
	if (p1 == MAP_FAILED)
		err(1, "mmap 1");
	unsigned char *p2 = mmap(ADDR2, SIZE, PROT_NONE,
			MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED_NOREPLACE, -1, 0);
	if (p2 == MAP_FAILED)
		err(1, "mmap 2");
	*p1 = 0x41;
	printf("first char is 0x%02hhx\n", *p1);
	unsigned char *p3 = mremap(p1, SIZE, SIZE,
			MREMAP_MAYMOVE|MREMAP_FIXED, p2);
	if (p3 == MAP_FAILED) {
		printf("mremap() failed; first char is 0x%02hhx\n", *p1);
	} else {
		printf("mremap() succeeded; first char is 0x%02hhx\n", *p3);
	}
}
user@horn:~/big_mremap$ gcc -static -o test test.c
user@horn:~/big_mremap$ setarch -R ./test
first char is 0x41
mremap() failed; first char is 0x00
```

With the patch:

```
user@horn:~/big_mremap$ setarch -R ./test
first char is 0x41
mremap() succeeded; first char is 0x41
```

Link: https://lkml.kernel.org/r/20241111-fix-mremap-32bit-wrap-v1-1-61d6be73b722@google.com
Fixes: af8ca1c14906 ("mm/mremap: optimize the start addresses in move_page_tables()")
Signed-off-by: Jann Horn
Acked-by: Vlastimil Babka
Reviewed-by: Lorenzo Stoakes
Acked-by: Qi Zheng
Reviewed-by: Liam R. Howlett
Cc: Joel Fernandes (Google)
Cc:
Signed-off-by: Andrew Morton
---
 mm/mremap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mremap.c b/mm/mremap.c
index dda09e957a5d..dee98ff2bbd6 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -648,7 +648,7 @@ again:
 	 * Prevent negative return values when {old,new}_addr was realigned
 	 * but we broke out of the above loop for the first PMD itself.
 	 */
-	if (len + old_addr < old_end)
+	if (old_addr < old_end - len)
 		return 0;
 
 	return len + old_addr - old_end; /* how much done */
-- 
2.50.1

From 31daa34315d45d3fe77f2158d889d523d78852ea Mon Sep 17 00:00:00 2001
From: Dave Vasilevsky
Date: Tue, 17 Sep 2024 12:37:20 -0400
Subject: [PATCH 03/16] crash, powerpc: default to CRASH_DUMP=n on
 PPC_BOOK3S_32
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Fixes boot failures on 6.9 on PPC_BOOK3S_32 machines using Open Firmware.
On these machines, the kernel refuses to boot from non-zero
PHYSICAL_START, which occurs when CRASH_DUMP is on.

Since most PPC_BOOK3S_32 machines boot via Open Firmware, it should
default to off for them.  Users booting via some other mechanism can
still turn it on explicitly.

Does not change the default on any other architectures for the time
being.

Link: https://lkml.kernel.org/r/20240917163720.1644584-1-dave@vasilevsky.ca
Fixes: 75bc255a7444 ("crash: clean up kdump related config items")
Signed-off-by: Dave Vasilevsky
Reported-by: Reimar Döffinger
Closes: https://lists.debian.org/debian-powerpc/2024/07/msg00001.html
Acked-by: Michael Ellerman [powerpc]
Acked-by: Baoquan He
Cc: "Eric W.
Biederman" Cc: John Paul Adrian Glaubitz Cc: Reimar Döffinger Cc: Signed-off-by: Andrew Morton --- arch/arm/Kconfig | 3 +++ arch/arm64/Kconfig | 3 +++ arch/loongarch/Kconfig | 3 +++ arch/mips/Kconfig | 3 +++ arch/powerpc/Kconfig | 4 ++++ arch/riscv/Kconfig | 3 +++ arch/s390/Kconfig | 3 +++ arch/sh/Kconfig | 3 +++ arch/x86/Kconfig | 3 +++ kernel/Kconfig.kexec | 2 +- 10 files changed, 29 insertions(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 749179a1d162..202397be76d8 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1598,6 +1598,9 @@ config ATAGS_PROC config ARCH_SUPPORTS_CRASH_DUMP def_bool y +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config AUTO_ZRELADDR bool "Auto calculation of the decompressed kernel image address" if !ARCH_MULTIPLATFORM default !(ARCH_FOOTBRIDGE || ARCH_RPC || ARCH_SA1100) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index fd9df6dcc593..22ea2705becc 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1576,6 +1576,9 @@ config ARCH_DEFAULT_KEXEC_IMAGE_VERIFY_SIG config ARCH_SUPPORTS_CRASH_DUMP def_bool y +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION def_bool CRASH_RESERVE diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index bb35c34f86d2..d9fce0fd475a 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -604,6 +604,9 @@ config ARCH_SUPPORTS_KEXEC config ARCH_SUPPORTS_CRASH_DUMP def_bool y +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config ARCH_SELECTS_CRASH_DUMP def_bool y depends on CRASH_DUMP diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 397edf05dd72..467b10f4361a 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2876,6 +2876,9 @@ config ARCH_SUPPORTS_KEXEC config ARCH_SUPPORTS_CRASH_DUMP def_bool y +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config PHYSICAL_START hex "Physical address where the kernel is loaded" default "0xffffffff84000000" diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8094a01974cc..1a2ff0276365 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -684,6 +684,10 @@ config RELOCATABLE_TEST config ARCH_SUPPORTS_CRASH_DUMP def_bool PPC64 || PPC_BOOK3S_32 || PPC_85xx || (44x && !SMP) +config ARCH_DEFAULT_CRASH_DUMP + bool + default y if !PPC_BOOK3S_32 + config ARCH_SELECTS_CRASH_DUMP def_bool y depends on CRASH_DUMP diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index f4c570538d55..fa8f2da87a0a 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -898,6 +898,9 @@ config ARCH_SUPPORTS_KEXEC_PURGATORY config ARCH_SUPPORTS_CRASH_DUMP def_bool y +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION def_bool CRASH_RESERVE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index d339fe4fdedf..cc1f9cffe2a5 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -276,6 +276,9 @@ config ARCH_SUPPORTS_CRASH_DUMP This option also enables s390 zfcpdump. 
See also +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + menu "Processor type and features" config HAVE_MARCH_Z10_FEATURES diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index e9103998cca9..04ff5fb9242e 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -550,6 +550,9 @@ config ARCH_SUPPORTS_KEXEC config ARCH_SUPPORTS_CRASH_DUMP def_bool BROKEN_ON_SMP +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config ARCH_SUPPORTS_KEXEC_JUMP def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 16354dfa6d96..7b9a7e8f39ac 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2084,6 +2084,9 @@ config ARCH_SUPPORTS_KEXEC_JUMP config ARCH_SUPPORTS_CRASH_DUMP def_bool X86_64 || (X86_32 && HIGHMEM) +config ARCH_DEFAULT_CRASH_DUMP + def_bool y + config ARCH_SUPPORTS_CRASH_HOTPLUG def_bool y diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 6c34e63c88ff..4d111f871951 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -97,7 +97,7 @@ config KEXEC_JUMP config CRASH_DUMP bool "kernel crash dumps" - default y + default ARCH_DEFAULT_CRASH_DUMP depends on ARCH_SUPPORTS_CRASH_DUMP depends on KEXEC_CORE select VMCORE_INFO -- 2.50.1 From fd7b4f9f46d46acbc7af3a439bb0d869efdc5c58 Mon Sep 17 00:00:00 2001 From: Qun-Wei Lin Date: Wed, 13 Nov 2024 12:25:43 +0800 Subject: [PATCH 04/16] sched/task_stack: fix object_is_on_stack() for KASAN tagged pointers When CONFIG_KASAN_SW_TAGS and CONFIG_KASAN_STACK are enabled, the object_is_on_stack() function may produce incorrect results due to the presence of tags in the obj pointer, while the stack pointer does not have tags. This discrepancy can lead to incorrect stack object detection and subsequently trigger warnings if CONFIG_DEBUG_OBJECTS is also enabled. Example of the warning: ODEBUG: object 3eff800082ea7bb0 is NOT on stack ffff800082ea0000, but annotated. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1 at lib/debugobjects.c:557 __debug_object_init+0x330/0x364 Modules linked in: CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0-rc5 #4 Hardware name: linux,dummy-virt (DT) pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __debug_object_init+0x330/0x364 lr : __debug_object_init+0x330/0x364 sp : ffff800082ea7b40 x29: ffff800082ea7b40 x28: 98ff0000c0164518 x27: 98ff0000c0164534 x26: ffff800082d93ec8 x25: 0000000000000001 x24: 1cff0000c00172a0 x23: 0000000000000000 x22: ffff800082d93ed0 x21: ffff800081a24418 x20: 3eff800082ea7bb0 x19: efff800000000000 x18: 0000000000000000 x17: 00000000000000ff x16: 0000000000000047 x15: 206b63617473206e x14: 0000000000000018 x13: ffff800082ea7780 x12: 0ffff800082ea78e x11: 0ffff800082ea790 x10: 0ffff800082ea79d x9 : 34d77febe173e800 x8 : 34d77febe173e800 x7 : 0000000000000001 x6 : 0000000000000001 x5 : feff800082ea74b8 x4 : ffff800082870a90 x3 : ffff80008018d3c4 x2 : 0000000000000001 x1 : ffff800082858810 x0 : 0000000000000050 Call trace: __debug_object_init+0x330/0x364 debug_object_init_on_stack+0x30/0x3c schedule_hrtimeout_range_clock+0xac/0x26c schedule_hrtimeout+0x1c/0x30 wait_task_inactive+0x1d4/0x25c kthread_bind_mask+0x28/0x98 init_rescuer+0x1e8/0x280 workqueue_init+0x1a0/0x3cc kernel_init_freeable+0x118/0x200 kernel_init+0x28/0x1f0 ret_from_fork+0x10/0x20 ---[ end trace 0000000000000000 ]--- ODEBUG: object 3eff800082ea7bb0 is NOT on stack ffff800082ea0000, but annotated. 
------------[ cut here ]------------

Link: https://lkml.kernel.org/r/20241113042544.19095-1-qun-wei.lin@mediatek.com
Signed-off-by: Qun-Wei Lin
Cc: Andrew Yang
Cc: AngeloGioacchino Del Regno
Cc: Casper Li
Cc: Catalin Marinas
Cc: Chinwen Chang
Cc: Kent Overstreet
Cc: Matthias Brugger
Cc: Pasha Tatashin
Cc: Shakeel Butt
Cc:
Signed-off-by: Andrew Morton
---
 include/linux/sched/task_stack.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index bf10bdb487dd..6c2fef89a4fd 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -9,6 +9,7 @@
 #include <linux/sched.h>
 #include <linux/magic.h>
 #include <linux/refcount.h>
+#include <linux/kasan.h>
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
@@ -89,6 +90,7 @@ static inline int object_is_on_stack(const void *obj)
 {
 	void *stack = task_stack_page(current);
 
+	obj = kasan_reset_tag(obj);
 	return (obj >= stack) && (obj < (stack + THREAD_SIZE));
 }
-- 
2.50.1

From 669b0cb81e4e4e78cff77a5b367c7f70c0c6c05e Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Thu, 14 Nov 2024 11:59:32 +0300
Subject: [PATCH 05/16] fs/proc/task_mmu: prevent integer overflow in
 pagemap_scan_get_args()
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

The "arg->vec_len" variable is a u64 that comes from the user at the
start of the function.  The "arg->vec_len * sizeof(struct page_region)"
multiplication can lead to integer wrapping.  Use size_mul() to avoid
that.

Also, the size_add/mul() functions work on unsigned long, so for 32-bit
systems we need to ensure that "arg->vec_len" fits in an unsigned long.

Link: https://lkml.kernel.org/r/39d41335-dd4d-48ed-8a7f-402c57d8ea84@stanley.mountain
Fixes: 52526ca7fdb9 ("fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs")
Signed-off-by: Dan Carpenter
Cc: Andrei Vagin
Cc: Andrii Nakryiko
Cc: Arnd Bergmann
Cc: David Hildenbrand
Cc: Matthew Wilcox
Cc: Michał Mirosław
Cc: Muhammad Usama Anjum
Cc: Oscar Salvador
Cc: Peter Xu
Cc: Ryan Roberts
Cc:
Signed-off-by: Andrew Morton
---
 fs/proc/task_mmu.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e52bd96137a6..7eb010de39fe 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2665,8 +2665,10 @@ static int pagemap_scan_get_args(struct pm_scan_arg *arg,
 		return -EFAULT;
 	if (!arg->vec && arg->vec_len)
 		return -EINVAL;
+	if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX)
+		return -EINVAL;
 	if (arg->vec && !access_ok((void __user *)(long)arg->vec,
-			      arg->vec_len * sizeof(struct page_region)))
+			      size_mul(arg->vec_len, sizeof(struct page_region))))
 		return -EFAULT;
 
 	/* Fixup default values */
-- 
2.50.1

From 0740e54304dcd11cf2a8edb6764423eb2fed1c61 Mon Sep 17 00:00:00 2001
From: Yafang Shao
Date: Wed, 13 Nov 2024 23:07:11 +0800
Subject: [PATCH 06/16] mm, doc: update read_ahead_kb for MADV_HUGEPAGE

MADV_HUGEPAGE is a new addition to readahead with behavior distinct from
normal pages.  To prevent confusion, we should update the documentation
accordingly.
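As a rough illustration of the behavior being documented (a sketch, not
part of the patch; the file name is a placeholder): a file mapping advised
with MADV_HUGEPAGE may see readahead performed at hugepage granularity
(e.g. 2MB on x86-64), even when the device's read_ahead_kb is smaller.

```
#define _GNU_SOURCE
#include <err.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* "data.bin" stands in for any large file on the block device. */
	int fd = open("data.bin", O_RDONLY);
	if (fd < 0)
		err(1, "open");

	size_t len = 16ul << 20;	/* 16 MiB */
	char *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	/* Request THP-backed page cache for this range; subsequent faults
	 * may read ahead in hugepage-sized units, which can exceed the
	 * device's read_ahead_kb setting. */
	if (madvise(p, len, MADV_HUGEPAGE))
		err(1, "madvise");

	volatile char c = p[0];	/* fault in, triggering readahead */
	(void)c;
	return 0;
}
```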
Link: https://lkml.kernel.org/r/20241113150711.1685-1-laoar.shao@gmail.com
Signed-off-by: Yafang Shao
Cc: David Hildenbrand
Cc: Matthew Wilcox
Signed-off-by: Andrew Morton
---
 Documentation/ABI/stable/sysfs-block | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block
index cea8856f798d..7a820a7d53aa 100644
--- a/Documentation/ABI/stable/sysfs-block
+++ b/Documentation/ABI/stable/sysfs-block
@@ -594,6 +594,9 @@ Description:
 		[RW] Maximum number of kilobytes to read-ahead for filesystems
 		on this block device.
 
+		For MADV_HUGEPAGE, the readahead size may exceed this setting
+		since its granularity is based on the hugepage size.
+
 What:		/sys/block/<disk>/queue/rotational
 Date:		January 2009
-- 
2.50.1

From 8ce41b0f9d77cca074df25afd39b86e2ee3aa68e Mon Sep 17 00:00:00 2001
From: Jinjiang Tu
Date: Wed, 13 Nov 2024 16:32:35 +0800
Subject: [PATCH 07/16] mm: fix NULL pointer dereference in
 alloc_pages_bulk_noprof

We triggered a NULL pointer dereference for ac.preferred_zoneref->zone in
alloc_pages_bulk_noprof() when the task is migrated between cpusets.

When cpuset is enabled, in prepare_alloc_pages(), ac->nodemask may be
&current->mems_allowed.  When first_zones_zonelist() is called to find
preferred_zoneref, the ac->nodemask may be modified concurrently if the
task is migrated between different cpusets.  Assuming we have 2 NUMA
nodes, when traversing Node1 in ac->zonelist, the nodemask is 2, and when
traversing Node2 in ac->zonelist, the nodemask is 1.  As a result,
ac->preferred_zoneref points to a NULL zone.  In
alloc_pages_bulk_noprof(), for_each_zone_zonelist_nodemask() finds an
allowable zone and calls zonelist_node_idx(ac.preferred_zoneref), leading
to a NULL pointer dereference.

__alloc_pages_noprof() fixes this issue by checking for a NULL pointer in
commit ea57485af8f4 ("mm, page_alloc: fix check for NULL preferred_zone")
and commit df76cee6bbeb ("mm, page_alloc: remove redundant checks from
alloc fastpath").

To fix it, check the NULL pointer for preferred_zoneref->zone.

Link: https://lkml.kernel.org/r/20241113083235.166798-1-tujinjiang@huawei.com
Fixes: 387ba26fb1cb ("mm/page_alloc: add a bulk page allocator")
Signed-off-by: Jinjiang Tu
Reviewed-by: Vlastimil Babka
Cc: Alexander Lobakin
Cc: David Hildenbrand
Cc: Kefeng Wang
Cc: Mel Gorman
Cc: Nanyong Sun
Cc:
Signed-off-by: Andrew Morton
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 216fbbfbedcf..b6958333054d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4607,7 +4607,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
 	gfp = alloc_gfp;
 
 	/* Find an allowed local zone that meets the low watermark. */
-	for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) {
+	z = ac.preferred_zoneref;
+	for_next_zone_zonelist_nodemask(zone, z, ac.highest_zoneidx, ac.nodemask) {
 		unsigned long mark;
 
 		if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) &&
-- 
2.50.1

From 737f34137844d6572ab7d473c998c7f977ff30eb Mon Sep 17 00:00:00 2001
From: Dmitry Antipov
Date: Thu, 14 Nov 2024 07:38:44 +0300
Subject: [PATCH 08/16] ocfs2: uncache inode which has failed entering the
 group

Syzbot has reported the following BUG:

kernel BUG at fs/ocfs2/uptodate.c:509!
...
Call Trace:
 ? __die_body+0x5f/0xb0
 ? die+0x9e/0xc0
 ? do_trap+0x15a/0x3a0
 ? ocfs2_set_new_buffer_uptodate+0x145/0x160
 ? do_error_trap+0x1dc/0x2c0
 ? ocfs2_set_new_buffer_uptodate+0x145/0x160
 ? __pfx_do_error_trap+0x10/0x10
 ?
handle_invalid_op+0x34/0x40 ? ocfs2_set_new_buffer_uptodate+0x145/0x160 ? exc_invalid_op+0x38/0x50 ? asm_exc_invalid_op+0x1a/0x20 ? ocfs2_set_new_buffer_uptodate+0x2e/0x160 ? ocfs2_set_new_buffer_uptodate+0x144/0x160 ? ocfs2_set_new_buffer_uptodate+0x145/0x160 ocfs2_group_add+0x39f/0x15a0 ? __pfx_ocfs2_group_add+0x10/0x10 ? __pfx_lock_acquire+0x10/0x10 ? mnt_get_write_access+0x68/0x2b0 ? __pfx_lock_release+0x10/0x10 ? rcu_read_lock_any_held+0xb7/0x160 ? __pfx_rcu_read_lock_any_held+0x10/0x10 ? smack_log+0x123/0x540 ? mnt_get_write_access+0x68/0x2b0 ? mnt_get_write_access+0x68/0x2b0 ? mnt_get_write_access+0x226/0x2b0 ocfs2_ioctl+0x65e/0x7d0 ? __pfx_ocfs2_ioctl+0x10/0x10 ? smack_file_ioctl+0x29e/0x3a0 ? __pfx_smack_file_ioctl+0x10/0x10 ? lockdep_hardirqs_on_prepare+0x43d/0x780 ? __pfx_lockdep_hardirqs_on_prepare+0x10/0x10 ? __pfx_ocfs2_ioctl+0x10/0x10 __se_sys_ioctl+0xfb/0x170 do_syscall_64+0xf3/0x230 entry_SYSCALL_64_after_hwframe+0x77/0x7f ... When 'ioctl(OCFS2_IOC_GROUP_ADD, ...)' has failed for the particular inode in 'ocfs2_verify_group_and_input()', corresponding buffer head remains cached and subsequent call to the same 'ioctl()' for the same inode issues the BUG() in 'ocfs2_set_new_buffer_uptodate()' (trying to cache the same buffer head of that inode). Fix this by uncaching the buffer head with 'ocfs2_remove_from_cache()' on error path in 'ocfs2_group_add()'. Link: https://lkml.kernel.org/r/20241114043844.111847-1-dmantipov@yandex.ru Fixes: 7909f2bf8353 ("[PATCH 2/2] ocfs2: Implement group add for online resize") Signed-off-by: Dmitry Antipov Reported-by: syzbot+453873f1588c2d75b447@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=453873f1588c2d75b447 Reviewed-by: Joseph Qi Cc: Dmitry Antipov Cc: Joel Becker Cc: Mark Fasheh Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/resize.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index c4a4016d3866..b0733c08ed13 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -574,6 +574,8 @@ out_commit: ocfs2_commit_trans(osb, handle); out_free_group_bh: + if (ret < 0) + ocfs2_remove_from_cache(INODE_CACHE(inode), group_bh); brelse(group_bh); out_unlock: -- 2.50.1 From 44f392fbf628a7ff2d8bb8e83ca1851261f81a6f Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Sat, 16 Nov 2024 09:22:14 -0500 Subject: [PATCH 09/16] Revert "drm/amd/pm: correct the workload setting" This reverts commit 74e1006430a5377228e49310f6d915628609929e. This causes a regression in the workload selection. A more extensive fix is being worked on. For now, revert. 
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3618 Fixes: 74e1006430a5 ("drm/amd/pm: correct the workload setting") Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 49 ++++++------------- drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 4 +- .../gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 5 +- .../gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c | 5 +- .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 5 +- .../gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c | 4 +- .../gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c | 4 +- .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 20 ++------ .../drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 5 +- .../drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c | 9 ++-- drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 8 --- drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h | 2 - 12 files changed, 36 insertions(+), 84 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index ee1bcfaae3e3..80e60ea2d11e 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -1259,33 +1259,26 @@ static int smu_sw_init(void *handle) smu->watermarks_bitmap = 0; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; - smu->user_dpm_profile.user_workload_mask = 0; atomic_set(&smu->smu_power.power_gate.vcn_gated, 1); atomic_set(&smu->smu_power.power_gate.jpeg_gated, 1); atomic_set(&smu->smu_power.power_gate.vpe_gated, 1); atomic_set(&smu->smu_power.power_gate.umsch_mm_gated, 1); - smu->workload_priority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT] = 0; - smu->workload_priority[PP_SMC_POWER_PROFILE_FULLSCREEN3D] = 1; - smu->workload_priority[PP_SMC_POWER_PROFILE_POWERSAVING] = 2; - smu->workload_priority[PP_SMC_POWER_PROFILE_VIDEO] = 3; - smu->workload_priority[PP_SMC_POWER_PROFILE_VR] = 4; - smu->workload_priority[PP_SMC_POWER_PROFILE_COMPUTE] = 5; - smu->workload_priority[PP_SMC_POWER_PROFILE_CUSTOM] = 6; + smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT] = 0; + smu->workload_prority[PP_SMC_POWER_PROFILE_FULLSCREEN3D] = 1; + smu->workload_prority[PP_SMC_POWER_PROFILE_POWERSAVING] = 2; + smu->workload_prority[PP_SMC_POWER_PROFILE_VIDEO] = 3; + smu->workload_prority[PP_SMC_POWER_PROFILE_VR] = 4; + smu->workload_prority[PP_SMC_POWER_PROFILE_COMPUTE] = 5; + smu->workload_prority[PP_SMC_POWER_PROFILE_CUSTOM] = 6; if (smu->is_apu || - !smu_is_workload_profile_available(smu, PP_SMC_POWER_PROFILE_FULLSCREEN3D)) { - smu->driver_workload_mask = - 1 << smu->workload_priority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT]; - } else { - smu->driver_workload_mask = - 1 << smu->workload_priority[PP_SMC_POWER_PROFILE_FULLSCREEN3D]; - smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_FULLSCREEN3D; - } + !smu_is_workload_profile_available(smu, PP_SMC_POWER_PROFILE_FULLSCREEN3D)) + smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT]; + else + smu->workload_mask = 1 << smu->workload_prority[PP_SMC_POWER_PROFILE_FULLSCREEN3D]; - smu->workload_mask = smu->driver_workload_mask | - smu->user_dpm_profile.user_workload_mask; smu->workload_setting[0] = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; smu->workload_setting[1] = PP_SMC_POWER_PROFILE_FULLSCREEN3D; smu->workload_setting[2] = PP_SMC_POWER_PROFILE_POWERSAVING; @@ -2355,20 +2348,17 @@ static int smu_switch_power_profile(void *handle, return -EINVAL; if (!en) { - smu->driver_workload_mask &= ~(1 << smu->workload_priority[type]); + smu->workload_mask &= ~(1 << 
smu->workload_prority[type]); index = fls(smu->workload_mask); index = index > 0 && index <= WORKLOAD_POLICY_MAX ? index - 1 : 0; workload[0] = smu->workload_setting[index]; } else { - smu->driver_workload_mask |= (1 << smu->workload_priority[type]); + smu->workload_mask |= (1 << smu->workload_prority[type]); index = fls(smu->workload_mask); index = index <= WORKLOAD_POLICY_MAX ? index - 1 : 0; workload[0] = smu->workload_setting[index]; } - smu->workload_mask = smu->driver_workload_mask | - smu->user_dpm_profile.user_workload_mask; - if (smu_dpm_ctx->dpm_level != AMD_DPM_FORCED_LEVEL_MANUAL && smu_dpm_ctx->dpm_level != AMD_DPM_FORCED_LEVEL_PERF_DETERMINISM) smu_bump_power_profile_mode(smu, workload, 0); @@ -3059,23 +3049,12 @@ static int smu_set_power_profile_mode(void *handle, uint32_t param_size) { struct smu_context *smu = handle; - int ret; if (!smu->pm_enabled || !smu->adev->pm.dpm_enabled || !smu->ppt_funcs->set_power_profile_mode) return -EOPNOTSUPP; - if (smu->user_dpm_profile.user_workload_mask & - (1 << smu->workload_priority[param[param_size]])) - return 0; - - smu->user_dpm_profile.user_workload_mask = - (1 << smu->workload_priority[param[param_size]]); - smu->workload_mask = smu->user_dpm_profile.user_workload_mask | - smu->driver_workload_mask; - ret = smu_bump_power_profile_mode(smu, param, param_size); - - return ret; + return smu_bump_power_profile_mode(smu, param, param_size); } static int smu_get_fan_control_mode(void *handle, u32 *fan_mode) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h index d60d9a12a47e..b44a185d07e8 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h @@ -240,7 +240,6 @@ struct smu_user_dpm_profile { /* user clock state information */ uint32_t clk_mask[SMU_CLK_COUNT]; uint32_t clk_dependency; - uint32_t user_workload_mask; }; #define SMU_TABLE_INIT(tables, table_id, s, a, d) \ @@ -558,8 +557,7 @@ struct smu_context { bool disable_uclk_switch; uint32_t workload_mask; - uint32_t driver_workload_mask; - uint32_t workload_priority[WORKLOAD_POLICY_MAX]; + uint32_t workload_prority[WORKLOAD_POLICY_MAX]; uint32_t workload_setting[WORKLOAD_POLICY_MAX]; uint32_t power_profile_mode; uint32_t default_power_profile_mode; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c index 31fe512028f4..c0f6b59369b7 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c @@ -1455,6 +1455,7 @@ static int arcturus_set_power_profile_mode(struct smu_context *smu, return -EINVAL; } + if ((profile_mode == PP_SMC_POWER_PROFILE_CUSTOM) && (smu->smc_fw_version >= 0x360d00)) { if (size != 10) @@ -1522,14 +1523,14 @@ static int arcturus_set_power_profile_mode(struct smu_context *smu, ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask, - smu->workload_mask, + 1 << workload_type, NULL); if (ret) { dev_err(smu->adev->dev, "Fail to set workload type %d\n", workload_type); return ret; } - smu_cmn_assign_power_profile(smu); + smu->power_profile_mode = profile_mode; return 0; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c index 12223f507977..16af1a329621 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c @@ -2081,13 +2081,10 @@ static int navi10_set_power_profile_mode(struct smu_context *smu, long *input, u 
smu->power_profile_mode); if (workload_type < 0) return -EINVAL; - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask, - smu->workload_mask, NULL); + 1 << workload_type, NULL); if (ret) dev_err(smu->adev->dev, "[%s] Failed to set work load mask!", __func__); - else - smu_cmn_assign_power_profile(smu); return ret; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index 3b7b2ec8319a..9c3c48297cba 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -1786,13 +1786,10 @@ static int sienna_cichlid_set_power_profile_mode(struct smu_context *smu, long * smu->power_profile_mode); if (workload_type < 0) return -EINVAL; - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask, - smu->workload_mask, NULL); + 1 << workload_type, NULL); if (ret) dev_err(smu->adev->dev, "[%s] Failed to set work load mask!", __func__); - else - smu_cmn_assign_power_profile(smu); return ret; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c index 952ee22cbc90..1fe020f1f4db 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/vangogh_ppt.c @@ -1079,7 +1079,7 @@ static int vangogh_set_power_profile_mode(struct smu_context *smu, long *input, } ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ActiveProcessNotify, - smu->workload_mask, + 1 << workload_type, NULL); if (ret) { dev_err_once(smu->adev->dev, "Fail to set workload type %d\n", @@ -1087,7 +1087,7 @@ static int vangogh_set_power_profile_mode(struct smu_context *smu, long *input, return ret; } - smu_cmn_assign_power_profile(smu); + smu->power_profile_mode = profile_mode; return 0; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c index 62316a6707ef..cc0504b063fa 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/renoir_ppt.c @@ -890,14 +890,14 @@ static int renoir_set_power_profile_mode(struct smu_context *smu, long *input, u } ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_ActiveProcessNotify, - smu->workload_mask, + 1 << workload_type, NULL); if (ret) { dev_err_once(smu->adev->dev, "Fail to set workload type %d\n", workload_type); return ret; } - smu_cmn_assign_power_profile(smu); + smu->power_profile_mode = profile_mode; return 0; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 5dd7ceca64fe..d53e162dcd8d 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2485,7 +2485,7 @@ static int smu_v13_0_0_set_power_profile_mode(struct smu_context *smu, DpmActivityMonitorCoeffInt_t *activity_monitor = &(activity_monitor_external.DpmActivityMonitorCoeffInt); int workload_type, ret = 0; - u32 workload_mask; + u32 workload_mask, selected_workload_mask; smu->power_profile_mode = input[size]; @@ -2552,7 +2552,7 @@ static int smu_v13_0_0_set_power_profile_mode(struct smu_context *smu, if (workload_type < 0) return -EINVAL; - workload_mask = 1 << workload_type; + selected_workload_mask = workload_mask = 1 << workload_type; /* Add optimizations for SMU13.0.0/10. 
Reuse the power saving profile */ if ((amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 0) && @@ -2567,22 +2567,12 @@ static int smu_v13_0_0_set_power_profile_mode(struct smu_context *smu, workload_mask |= 1 << workload_type; } - smu->workload_mask |= workload_mask; ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask, - smu->workload_mask, + workload_mask, NULL); - if (!ret) { - smu_cmn_assign_power_profile(smu); - if (smu->power_profile_mode == PP_SMC_POWER_PROFILE_POWERSAVING) { - workload_type = smu_cmn_to_asic_specific_index(smu, - CMN2ASIC_MAPPING_WORKLOAD, - PP_SMC_POWER_PROFILE_FULLSCREEN3D); - smu->power_profile_mode = smu->workload_mask & (1 << workload_type) - ? PP_SMC_POWER_PROFILE_FULLSCREEN3D - : PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; - } - } + if (!ret) + smu->workload_mask = selected_workload_mask; return ret; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index 9d0b19419de0..b891a5e0a396 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -2499,14 +2499,13 @@ static int smu_v13_0_7_set_power_profile_mode(struct smu_context *smu, long *inp smu->power_profile_mode); if (workload_type < 0) return -EINVAL; - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask, - smu->workload_mask, NULL); + 1 << workload_type, NULL); if (ret) dev_err(smu->adev->dev, "[%s] Failed to set work load mask!", __func__); else - smu_cmn_assign_power_profile(smu); + smu->workload_mask = (1 << workload_type); return ret; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c index 1aa13d32ceb2..1e16a281f2dc 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c @@ -1807,11 +1807,12 @@ static int smu_v14_0_2_set_power_profile_mode(struct smu_context *smu, if (workload_type < 0) return -EINVAL; - ret = smu_cmn_send_smc_msg_with_param(smu, SMU_MSG_SetWorkloadMask, - smu->workload_mask, NULL); - + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_SetWorkloadMask, + 1 << workload_type, + NULL); if (!ret) - smu_cmn_assign_power_profile(smu); + smu->workload_mask = 1 << workload_type; return ret; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c index bdfc5e617333..91ad434bcdae 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c @@ -1138,14 +1138,6 @@ int smu_cmn_set_mp1_state(struct smu_context *smu, return ret; } -void smu_cmn_assign_power_profile(struct smu_context *smu) -{ - uint32_t index; - index = fls(smu->workload_mask); - index = index > 0 && index <= WORKLOAD_POLICY_MAX ? index - 1 : 0; - smu->power_profile_mode = smu->workload_setting[index]; -} - bool smu_cmn_is_audio_func_enabled(struct amdgpu_device *adev) { struct pci_dev *p = NULL; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h index 8a801e389659..1de685defe85 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h @@ -130,8 +130,6 @@ void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, uint8_t crev); int smu_cmn_set_mp1_state(struct smu_context *smu, enum pp_mp1_state mp1_state); -void smu_cmn_assign_power_profile(struct smu_context *smu); - /* * Helper function to make sysfs_emit_at() happy. 
Align buf to * the current page boundary and record the offset. -- 2.50.1 From d1aa0c04294e29883d65eac6c2f72fe95cc7c049 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 15 Nov 2024 16:57:24 -0800 Subject: [PATCH 10/16] mm: revert "mm: shmem: fix data-race in shmem_getattr()" Revert d949d1d14fa2 ("mm: shmem: fix data-race in shmem_getattr()") as suggested by Chuck [1]. It is causing deadlocks when accessing tmpfs over NFS. As Hugh commented, "added just to silence a syzbot sanitizer splat: added where there has never been any practical problem". Link: https://lkml.kernel.org/r/ZzdxKF39VEmXSSyN@tissot.1015granger.net [1] Fixes: d949d1d14fa2 ("mm: shmem: fix data-race in shmem_getattr()") Acked-by: Hugh Dickins Cc: Chuck Lever Cc: Jeongjun Park Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton --- mm/shmem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index e87f5d6799a7..568bb290bdce 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1166,9 +1166,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - inode_lock_shared(inode); generic_fillattr(idmap, request_mask, inode, stat); - inode_unlock_shared(inode); if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) stat->blksize = HPAGE_PMD_SIZE; -- 2.50.1 From adc218676eef25575469234709c2d87185ca223a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 17 Nov 2024 14:15:08 -0800 Subject: [PATCH 11/16] Linux 6.12 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 79192a3024bf..68a8faff2543 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 12 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.50.1 From d4a058762f3d931aa1159b64ba94a09a04024f8c Mon Sep 17 00:00:00 2001 From: Todd Brandt Date: Wed, 31 Jul 2024 12:24:09 -0400 Subject: [PATCH 12/16] tools/power turbostat: fix GCC9 build regression Fix build regression seen when using old gcc-9 compiler. Signed-off-by: Todd Brandt Reviewed-by: Chen Yu Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 089220aaa5c9..00674f7abdf5 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -2788,6 +2788,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data } for (i = 0, ppmt = sys.pmt_tp; ppmt; i++, ppmt = ppmt->next) { + const unsigned long value_raw = t->pmt_counter[i]; + const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; switch (ppmt->type) { case PMT_TYPE_RAW: if (pmt_counter_get_width(ppmt) <= 32) @@ -2799,9 +2801,6 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data break; case PMT_TYPE_XTAL_TIME: - const unsigned long value_raw = t->pmt_counter[i]; - const double value_converted = 100.0 * value_raw / crystal_hz / interval_float; - outp += sprintf(outp, "%s%.2f", (printed++ ? 
delim : ""), value_converted);
 			break;
 		}
 
@@ -2869,6 +2868,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	}
 
 	for (i = 0, ppmt = sys.pmt_cp; ppmt; i++, ppmt = ppmt->next) {
+		const unsigned long value_raw = c->pmt_counter[i];
+		const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
 		switch (ppmt->type) {
 		case PMT_TYPE_RAW:
 			if (pmt_counter_get_width(ppmt) <= 32)
@@ -2880,9 +2881,6 @@
 			break;
 
 		case PMT_TYPE_XTAL_TIME:
-			const unsigned long value_raw = c->pmt_counter[i];
-			const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
-
 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
 			break;
 		}
@@ -3068,6 +3066,8 @@ int format_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	}
 
 	for (i = 0, ppmt = sys.pmt_pp; ppmt; i++, ppmt = ppmt->next) {
+		const unsigned long value_raw = p->pmt_counter[i];
+		const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
 		switch (ppmt->type) {
 		case PMT_TYPE_RAW:
 			if (pmt_counter_get_width(ppmt) <= 32)
@@ -3079,9 +3079,6 @@
 			break;
 
 		case PMT_TYPE_XTAL_TIME:
-			const unsigned long value_raw = p->pmt_counter[i];
-			const double value_converted = 100.0 * value_raw / crystal_hz / interval_float;
-
 			outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), value_converted);
 			break;
 		}
-- 
2.50.1

From ea8614c08d7f725729910f464c01577d036f38f5 Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn
Date: Wed, 7 Aug 2024 13:43:39 +0200
Subject: [PATCH 13/16] tools/power turbostat: Fix column printing for PMT
 xtal_time counters

If the very first printed column was for a PMT counter of type xtal_time,
we would misalign the column header, because we were always printing the
delimiter.

Signed-off-by: Patryk Wlazlyn
Signed-off-by: Len Brown
---
 tools/power/x86/turbostat/turbostat.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 00674f7abdf5..80ac40638307 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -2291,7 +2291,7 @@ void print_header(char *delim)
 			break;
 
 		case PMT_TYPE_XTAL_TIME:
-			outp += sprintf(outp, "%s%s", delim, ppmt->name);
+			outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), ppmt->name);
 			break;
 		}
 
@@ -2365,7 +2365,7 @@ void print_header(char *delim)
 			break;
 
 		case PMT_TYPE_XTAL_TIME:
-			outp += sprintf(outp, "%s%s", delim, ppmt->name);
+			outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), ppmt->name);
 			break;
 		}
 
@@ -2496,7 +2496,7 @@ void print_header(char *delim)
 			break;
 
 		case PMT_TYPE_XTAL_TIME:
-			outp += sprintf(outp, "%s%s", delim, ppmt->name);
+			outp += sprintf(outp, "%s%s", (printed++ ? delim : ""), ppmt->name);
 			break;
 		}
 
-- 
2.50.1

From ae2cdf8d92ffc326104524a1f9da4cf75b6ea996 Mon Sep 17 00:00:00 2001
From: Patryk Wlazlyn
Date: Tue, 20 Aug 2024 18:47:59 +0200
Subject: [PATCH 14/16] tools/power turbostat: Allow using cpu device in perf
 counters on hybrid platforms

Intel hybrid platforms expose different perf devices for P and E cores.
Instead of one "/sys/bus/event_source/devices/cpu" device, there are
"/sys/bus/event_source/devices/{cpu_core,cpu_atom}".

This, however, makes it more complicated for the user, because most of
the counters are available on both and have to be handled manually.

This patch allows users to use a "virtual" cpu device that is
transparently translated to the cpu_core and cpu_atom perf devices,
depending on the type of CPU we are opening the counter for.

Signed-off-by: Patryk Wlazlyn
Signed-off-by: Len Brown
---
 tools/power/x86/turbostat/turbostat.8 |  25 ++++++
 tools/power/x86/turbostat/turbostat.c | 105 ++++++++++++++++++++++++--
 2 files changed, 123 insertions(+), 7 deletions(-)

diff --git a/tools/power/x86/turbostat/turbostat.8 b/tools/power/x86/turbostat/turbostat.8
index 067717bce1d4..56c7ff6efcda 100644
--- a/tools/power/x86/turbostat/turbostat.8
+++ b/tools/power/x86/turbostat/turbostat.8
@@ -33,6 +33,9 @@ name as necessary to disambiguate it from others is necessary.
 	Note that option msr0xXXX is a hex offset, eg. msr0x10
 	/sys/path... is an absolute path to a sysfs attribute
 	<device> is a perf device from /sys/bus/event_source/devices/<device> eg. cstate_core
+		On Intel hybrid platforms, instead of one "cpu" perf device there are two, "cpu_core" and "cpu_atom" devices for P and E cores respectively.
+		Turbostat, in this case, allows the user to use the "cpu" device and will automatically detect the type of a CPU and translate it to "cpu_core" and "cpu_atom" accordingly.
+		For a complete example see "ADD PERF COUNTER EXAMPLE #2 (using virtual "cpu" device)".
 	<event> is a perf event for given device from /sys/bus/event_source/devices/<device>/events/<event> eg. c1-residency
 		perf/cstate_core/c1-residency would then use /sys/bus/event_source/devices/cstate_core/events/c1-residency
@@ -387,6 +390,28 @@ CPU	pCPU%c1	CPU%c1
 
 .fi
 
+.SH ADD PERF COUNTER EXAMPLE #2 (using virtual cpu device)
+Here we run on a hybrid Raptor Lake platform.
+We limit turbostat to show output for just cpu0 (pcore) and cpu12 (ecore).
+We add a counter showing the number of L3 cache misses, using the virtual "cpu" device,
+labeling it with the column header, "VCMISS".
+We add a counter showing the number of L3 cache misses, using the "cpu_core" device,
+labeling it with the column header, "PCMISS". This will fail on ecore cpu12.
+We add a counter showing the number of L3 cache misses, using the "cpu_atom" device,
+labeling it with the column header, "ECMISS". This will fail on pcore cpu0.
+We display it only once, after the conclusion of a 0.1 second sleep.
+.nf
+sudo ./turbostat --quiet --cpu 0,12 --show CPU --add perf/cpu/cache-misses,cpu,delta,raw,VCMISS --add perf/cpu_core/cache-misses,cpu,delta,raw,PCMISS --add perf/cpu_atom/cache-misses,cpu,delta,raw,ECMISS sleep .1
+turbostat: added_perf_counters_init_: perf/cpu_atom/cache-misses: failed to open counter on cpu0
+turbostat: added_perf_counters_init_: perf/cpu_core/cache-misses: failed to open counter on cpu12
+0.104630 sec
+CPU	ECMISS	PCMISS	VCMISS
+-	0x0000000000000000	0x0000000000000000	0x0000000000000000
+0	0x0000000000000000	0x0000000000007951	0x0000000000007796
+12	0x000000000001137a	0x0000000000000000	0x0000000000011392
+
+.fi
+
 .SH ADD PMT COUNTER EXAMPLE
 Here we limit turbostat to showing just the CPU number 0.
 We add two counters, showing crystal clock count and the DC6 residency.
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 80ac40638307..462d821eaf41 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -31,6 +31,9 @@
 )
 // end copied section
 
+#define CPUID_LEAF_MODEL_ID 0x1A
+#define CPUID_LEAF_MODEL_ID_CORE_TYPE_SHIFT 24
+
 #define X86_VENDOR_INTEL 0
 
 #include INTEL_FAMILY_HEADER
@@ -89,6 +92,9 @@
 #define PERF_DEV_NAME_BYTES 32
 #define PERF_EVT_NAME_BYTES 32
 
+#define INTEL_ECORE_TYPE 0x20
+#define INTEL_PCORE_TYPE 0x40
+
 enum counter_scope { SCOPE_CPU, SCOPE_CORE, SCOPE_PACKAGE };
 enum counter_type { COUNTER_ITEMS, COUNTER_CYCLES, COUNTER_SECONDS, COUNTER_USEC, COUNTER_K2M };
 enum counter_format { FORMAT_RAW, FORMAT_DELTA, FORMAT_PERCENT, FORMAT_AVERAGE };
@@ -1848,6 +1854,7 @@ struct cpu_topology {
 	int logical_node_id;	/* 0-based count within the package */
 	int physical_core_id;
 	int thread_id;
+	int type;
 	cpu_set_t *put_ids;	/* Processing Unit/Thread IDs */
 } *cpus;
 
@@ -5653,6 +5660,32 @@ int init_thread_id(int cpu)
 	return 0;
 }
 
+int set_my_cpu_type(void)
+{
+	unsigned int eax, ebx, ecx, edx;
+	unsigned int max_level;
+
+	__cpuid(0, max_level, ebx, ecx, edx);
+
+	if (max_level < CPUID_LEAF_MODEL_ID)
+		return 0;
+
+	__cpuid(CPUID_LEAF_MODEL_ID, eax, ebx, ecx, edx);
+
+	return (eax >> CPUID_LEAF_MODEL_ID_CORE_TYPE_SHIFT);
+}
+
+int set_cpu_hybrid_type(int cpu)
+{
+	if (cpu_migrate(cpu))
+		return -1;
+
+	int type = set_my_cpu_type();
+
+	cpus[cpu].type = type;
+	return 0;
+}
+
 /*
  * snapshot_proc_interrupts()
  *
@@ -8281,6 +8314,8 @@ void topology_probe(bool startup)
 
 	for_all_proc_cpus(init_thread_id);
 
+	for_all_proc_cpus(set_cpu_hybrid_type);
+
 	/*
 	 * For online cpus
 	 * find max_core_id, max_package_id
@@ -8545,6 +8580,35 @@ void check_perf_access(void)
 		bic_enabled &= ~BIC_IPC;
 }
 
+bool perf_has_hybrid_devices(void)
+{
+	/*
+	 *  0: unknown
+	 *  1: has separate perf device for p and e core
+	 * -1: doesn't have separate perf device for p and e core
+	 */
+	static int cached;
+
+	if (cached > 0)
+		return true;
+
+	if (cached < 0)
+		return false;
+
+	if (access("/sys/bus/event_source/devices/cpu_core", F_OK)) {
+		cached = -1;
+		return false;
+	}
+
+	if (access("/sys/bus/event_source/devices/cpu_atom", F_OK)) {
+		cached = -1;
+		return false;
+	}
+
+	cached = 1;
+	return true;
+}
+
 int added_perf_counters_init_(struct perf_counter_info *pinfo)
 {
 	size_t num_domains = 0;
@@ -8601,29 +8665,56 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo)
 			if (domain_visited[next_domain])
 				continue;
 
-			perf_type = read_perf_type(pinfo->device);
+			/*
+			 * Intel hybrid platforms expose different perf devices for P and E cores.
+			 * Instead of one "/sys/bus/event_source/devices/cpu" device, there are
+			 * "/sys/bus/event_source/devices/{cpu_core,cpu_atom}".
+			 *
+			 * This makes it more complicated for the user, because most of the counters
+			 * are available on both and would otherwise have to be handled manually.
+			 *
+			 * The code below allows the user to use the old "cpu" name, which is translated accordingly.
+			 */
+			const char *perf_device = pinfo->device;
+
+			if (strcmp(perf_device, "cpu") == 0 && perf_has_hybrid_devices()) {
+				switch (cpus[cpu].type) {
+				case INTEL_PCORE_TYPE:
+					perf_device = "cpu_core";
+					break;
+
+				case INTEL_ECORE_TYPE:
+					perf_device = "cpu_atom";
+					break;
+
+				default: /* Don't change, we will probably fail and report a problem soon. */
*/ + break; + } + } + + perf_type = read_perf_type(perf_device); if (perf_type == (unsigned int)-1) { warnx("%s: perf/%s/%s: failed to read %s", - __func__, pinfo->device, pinfo->event, "type"); + __func__, perf_device, pinfo->event, "type"); continue; } - perf_config = read_perf_config(pinfo->device, pinfo->event); + perf_config = read_perf_config(perf_device, pinfo->event); if (perf_config == (unsigned int)-1) { warnx("%s: perf/%s/%s: failed to read %s", - __func__, pinfo->device, pinfo->event, "config"); + __func__, perf_device, pinfo->event, "config"); continue; } /* Scale is not required, some counters just don't have it. */ - perf_scale = read_perf_scale(pinfo->device, pinfo->event); + perf_scale = read_perf_scale(perf_device, pinfo->event); if (perf_scale == 0.0) perf_scale = 1.0; fd_perf = open_perf_counter(cpu, perf_type, perf_config, -1, 0); if (fd_perf == -1) { warnx("%s: perf/%s/%s: failed to open counter on cpu%d", - __func__, pinfo->device, pinfo->event, cpu); + __func__, perf_device, pinfo->event, cpu); continue; } @@ -8633,7 +8724,7 @@ int added_perf_counters_init_(struct perf_counter_info *pinfo) if (debug) fprintf(stderr, "Add perf/%s/%s cpu%d: %d\n", - pinfo->device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); + perf_device, pinfo->event, cpu, pinfo->fd_perf_per_domain[next_domain]); } pinfo = pinfo->next; -- 2.50.1 From fed8511cc8996989178823052dc0200643e1389a Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Tue, 27 Aug 2024 13:07:51 +0800 Subject: [PATCH 15/16] tools/power turbostat: Fix trailing '\n' parsing parse_cpu_string() parses the string input either from command line or from /sys/fs/cgroup/cpuset.cpus.effective to get a list of CPUs that turbostat can run with. The cpu string returned by /sys/fs/cgroup/cpuset.cpus.effective contains a trailing '\n', but strtoul() fails to treat this as an error. That says, for the code below val = ("\n", NULL, 10); val returns 0, and errno is also not set. As a result, CPU0 is erroneously considered as allowed CPU and this causes failures when turbostat tries to run on CPU0. get_counters: Could not migrate to CPU 0 ... turbostat: re-initialized with num_cpus 8, allowed_cpus 5 get_counters: Could not migrate to CPU 0 Add a check to return immediately if '\n' or '\0' is detected. Fixes: 8c3dd2c9e542 ("tools/power/turbostat: Abstrct function for parsing cpu string") Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 462d821eaf41..b4386d54e65d 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -5389,6 +5389,9 @@ static int parse_cpu_str(char *cpu_str, cpu_set_t *cpu_set, int cpu_set_size) if (*next == '-') /* no negative cpu numbers */ return 1; + if (*next == '\0' || *next == '\n') + break; + start = strtoul(next, &next, 10); if (start >= CPU_SUBSET_MAXCPUS) -- 2.50.1 From c808624e2db2234991d948aa9bb9cd05bc3851a9 Mon Sep 17 00:00:00 2001 From: Patryk Wlazlyn Date: Tue, 17 Sep 2024 22:33:26 +0200 Subject: [PATCH 16/16] tools/power turbostat: Honor --show CPU, even when even when num_cpus=1 Honor --show CPU and --show Core when "topo.num_cpus == 1". Previously turbostat assumed that on a 1-CPU system, these columns should never appear. Honoring these flags makes it easier for several programs that parse turbostat output. 
Signed-off-by: Patryk Wlazlyn Signed-off-by: Len Brown --- tools/power/x86/turbostat/turbostat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index b4386d54e65d..924f14e1ec35 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -8229,7 +8229,7 @@ void topology_probe(bool startup) set_max_cpu_num(); topo.num_cpus = 0; for_all_proc_cpus(count_cpus); - if (!summary_only && topo.num_cpus > 1) + if (!summary_only) BIC_PRESENT(BIC_CPU); if (debug > 1) @@ -8367,7 +8367,7 @@ void topology_probe(bool startup) topo.cores_per_node = max_core_id + 1; if (debug > 1) fprintf(outf, "max_core_id %d, sizing for %d cores per package\n", max_core_id, topo.cores_per_node); - if (!summary_only && topo.cores_per_node > 1) + if (!summary_only) BIC_PRESENT(BIC_Core); topo.num_die = topo.max_die_id + 1; -- 2.50.1