From faa242b1d2a97143150bdc50d5b61fd70fcd17cd Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 27 Oct 2024 12:33:21 +0000 Subject: [PATCH 01/16] mm/mlock: set the correct prev on failure After commit 94d7d9233951 ("mm: abstract the vma_merge()/split_vma() pattern for mprotect() et al."), if vma_modify_flags() return error, the vma is set to an error code. This will lead to an invalid prev be returned. Generally this shouldn't matter as the caller should treat an error as indicating state is now invalidated, however unfortunately apply_mlockall_flags() does not check for errors and assumes that mlock_fixup() correctly maintains prev even if an error were to occur. This patch fixes that assumption. [lorenzo.stoakes@oracle.com: provide a better fix and rephrase the log] Link: https://lkml.kernel.org/r/20241027123321.19511-1-richard.weiyang@gmail.com Fixes: 94d7d9233951 ("mm: abstract the vma_merge()/split_vma() pattern for mprotect() et al.") Signed-off-by: Wei Yang Reviewed-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Vlastimil Babka Cc: Jann Horn Cc: Signed-off-by: Andrew Morton --- mm/mlock.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index e3e3dc2b2956..cde076fa7d5e 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -725,14 +725,17 @@ static int apply_mlockall_flags(int flags) } for_each_vma(vmi, vma) { + int error; vm_flags_t newflags; newflags = vma->vm_flags & ~VM_LOCKED_MASK; newflags |= to_add; - /* Ignore errors */ - mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, - newflags); + error = mlock_fixup(&vmi, vma, &prev, vma->vm_start, vma->vm_end, + newflags); + /* Ignore errors, but prev needs fixing up. */ + if (error) + prev = vma; cond_resched(); } out: -- 2.51.0 From 3488af0970445ff5532c7e8dc5e6456b877aee5e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 31 Oct 2024 11:37:56 -0700 Subject: [PATCH 02/16] mm/damon/core: handle zero {aggregation,ops_update} intervals Patch series "mm/damon/core: fix handling of zero non-sampling intervals". DAMON's internal intervals accounting logic is not correctly handling non-sampling intervals of zero values for a wrong assumption. This could cause unexpected monitoring behavior, and even result in infinite hang of DAMON sysfs interface user threads in case of zero aggregation interval. Fix those by updating the intervals accounting logic. For details of the root case and solutions, please refer to commit messages of fixes. This patch (of 2): DAMON's logics to determine if this is the time to do aggregation and ops update assumes next_{aggregation,ops_update}_sis are always set larger than current passed_sample_intervals. And therefore it further assumes continuously incrementing passed_sample_intervals every sampling interval will make it reaches to the next_{aggregation,ops_update}_sis in future. The logic therefore make the action and update next_{aggregation,ops_updaste}_sis only if passed_sample_intervals is same to the counts, respectively. If Aggregation interval or Ops update interval are zero, however, next_aggregation_sis or next_ops_update_sis are set same to current passed_sample_intervals, respectively. And passed_sample_intervals is incremented before doing the next_{aggregation,ops_update}_sis check. Hence, passed_sample_intervals becomes larger than next_{aggregation,ops_update}_sis, and the logic says it is not the time to do the action and update next_{aggregation,ops_update}_sis forever, until an overflow happens. In other words, DAMON stops doing aggregations or ops updates effectively forever, and users cannot get monitoring results. Based on the documents and the common sense, a reasonable behavior for such inputs is doing an aggregation and an ops update for every sampling interval. Handle the case by removing the assumption. Note that this could incur particular real issue for DAMON sysfs interface users, in case of zero Aggregation interval. When user starts DAMON with zero Aggregation interval and asks online DAMON parameter tuning via DAMON sysfs interface, the request is handled by the aggregation callback. Until the callback finishes the work, the user who requested the online tuning just waits. Hence, the user will be stuck until the passed_sample_intervals overflows. Link: https://lkml.kernel.org/r/20241031183757.49610-1-sj@kernel.org Link: https://lkml.kernel.org/r/20241031183757.49610-2-sj@kernel.org Fixes: 4472edf63d66 ("mm/damon/core: use number of passed access sampling as a timer") Signed-off-by: SeongJae Park Cc: [6.7.x] Signed-off-by: Andrew Morton --- mm/damon/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index a83f3b736d51..3131a07569e4 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -2000,7 +2000,7 @@ static int kdamond_fn(void *data) if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); - if (ctx->passed_sample_intervals == next_aggregation_sis) { + if (ctx->passed_sample_intervals >= next_aggregation_sis) { kdamond_merge_regions(ctx, max_nr_accesses / 10, sz_limit); @@ -2018,7 +2018,7 @@ static int kdamond_fn(void *data) sample_interval = ctx->attrs.sample_interval ? ctx->attrs.sample_interval : 1; - if (ctx->passed_sample_intervals == next_aggregation_sis) { + if (ctx->passed_sample_intervals >= next_aggregation_sis) { ctx->next_aggregation_sis = next_aggregation_sis + ctx->attrs.aggr_interval / sample_interval; @@ -2028,7 +2028,7 @@ static int kdamond_fn(void *data) ctx->ops.reset_aggregated(ctx); } - if (ctx->passed_sample_intervals == next_ops_update_sis) { + if (ctx->passed_sample_intervals >= next_ops_update_sis) { ctx->next_ops_update_sis = next_ops_update_sis + ctx->attrs.ops_update_interval / sample_interval; -- 2.51.0 From 8e7bde615f634a82a44b1f3d293c049fd3ef9ca9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 31 Oct 2024 11:37:57 -0700 Subject: [PATCH 03/16] mm/damon/core: handle zero schemes apply interval DAMON's logics to determine if this is the time to apply damos schemes assumes next_apply_sis is always set larger than current passed_sample_intervals. And therefore assume continuously incrementing passed_sample_intervals will make it reaches to the next_apply_sis in future. The logic hence does apply the scheme and update next_apply_sis only if passed_sample_intervals is same to next_apply_sis. If Schemes apply interval is set as zero, however, next_apply_sis is set same to current passed_sample_intervals, respectively. And passed_sample_intervals is incremented before doing the next_apply_sis check. Hence, next_apply_sis becomes larger than next_apply_sis, and the logic says it is not the time to apply schemes and update next_apply_sis. In other words, DAMON stops applying schemes until passed_sample_intervals overflows. Based on the documents and the common sense, a reasonable behavior for such inputs would be applying the schemes for every sampling interval. Handle the case by removing the assumption. Link: https://lkml.kernel.org/r/20241031183757.49610-3-sj@kernel.org Fixes: 42f994b71404 ("mm/damon/core: implement scheme-specific apply interval") Signed-off-by: SeongJae Park Cc: [6.7.x] Signed-off-by: Andrew Morton --- mm/damon/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 3131a07569e4..ce700e694b63 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1412,7 +1412,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - if (c->passed_sample_intervals != s->next_apply_sis) + if (c->passed_sample_intervals < s->next_apply_sis) continue; if (!s->wmarks.activated) @@ -1622,7 +1622,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) bool has_schemes_to_apply = false; damon_for_each_scheme(s, c) { - if (c->passed_sample_intervals != s->next_apply_sis) + if (c->passed_sample_intervals < s->next_apply_sis) continue; if (!s->wmarks.activated) @@ -1642,9 +1642,9 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } damon_for_each_scheme(s, c) { - if (c->passed_sample_intervals != s->next_apply_sis) + if (c->passed_sample_intervals < s->next_apply_sis) continue; - s->next_apply_sis += + s->next_apply_sis = c->passed_sample_intervals + (s->apply_interval_us ? s->apply_interval_us : c->attrs.aggr_interval) / sample_interval; } -- 2.51.0 From 4401e9d10ab0281a520b9f8c220f30f60b5c248f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Thu, 31 Oct 2024 09:12:03 -0700 Subject: [PATCH 04/16] mm/damon/core: avoid overflow in damon_feed_loop_next_input() damon_feed_loop_next_input() is inefficient and fragile to overflows. Specifically, 'score_goal_diff_bp' calculation can overflow when 'score' is high. The calculation is actually unnecessary at all because 'goal' is a constant of value 10,000. Calculation of 'compensation' is again fragile to overflow. Final calculation of return value for under-achiving case is again fragile to overflow when the current score is under-achieving the target. Add two corner cases handling at the beginning of the function to make the body easier to read, and rewrite the body of the function to avoid overflows and the unnecessary bp value calcuation. Link: https://lkml.kernel.org/r/20241031161203.47751-1-sj@kernel.org Fixes: 9294a037c015 ("mm/damon/core: implement goal-oriented feedback-driven quota auto-tuning") Signed-off-by: SeongJae Park Reported-by: Guenter Roeck Closes: https://lore.kernel.org/944f3d5b-9177-48e7-8ec9-7f1331a3fea3@roeck-us.net Tested-by: Guenter Roeck Cc: [6.8.x] Signed-off-by: Andrew Morton --- mm/damon/core.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index ce700e694b63..511c3f61ab44 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1456,17 +1456,31 @@ static unsigned long damon_feed_loop_next_input(unsigned long last_input, unsigned long score) { const unsigned long goal = 10000; - unsigned long score_goal_diff = max(goal, score) - min(goal, score); - unsigned long score_goal_diff_bp = score_goal_diff * 10000 / goal; - unsigned long compensation = last_input * score_goal_diff_bp / 10000; /* Set minimum input as 10000 to avoid compensation be zero */ const unsigned long min_input = 10000; + unsigned long score_goal_diff, compensation; + bool over_achieving = score > goal; - if (goal > score) + if (score == goal) + return last_input; + if (score >= goal * 2) + return min_input; + + if (over_achieving) + score_goal_diff = score - goal; + else + score_goal_diff = goal - score; + + if (last_input < ULONG_MAX / score_goal_diff) + compensation = last_input * score_goal_diff / goal; + else + compensation = last_input / goal * score_goal_diff; + + if (over_achieving) + return max(last_input - compensation, min_input); + if (last_input < ULONG_MAX - compensation) return last_input + compensation; - if (last_input > compensation + min_input) - return last_input - compensation; - return min_input; + return ULONG_MAX; } #ifdef CONFIG_PSI -- 2.51.0 From 652e1a51465f2e8e75590bc3dd1e3a3b61020568 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Ma=C3=ADra=20Canal?= Date: Fri, 1 Nov 2024 13:54:05 -0300 Subject: [PATCH 05/16] mm: fix docs for the kernel parameter ``thp_anon=`` MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit If we add ``thp_anon=32,64K:always`` to the kernel command line, we will see the following error: [ 0.000000] huge_memory: thp_anon=32,64K:always: error parsing string, ignoring setting This happens because the correct format isn't ``thp_anon=,[KMG]:```, as [KMG] must follow each number to especify its unit. So, the correct format is ``thp_anon=[KMG],[KMG]:```. Therefore, adjust the documentation to reflect the correct format of the parameter ``thp_anon=``. Link: https://lkml.kernel.org/r/20241101165719.1074234-3-mcanal@igalia.com Fixes: dd4d30d1cdbe ("mm: override mTHP "enabled" defaults at kernel cmdline") Signed-off-by: Maíra Canal Acked-by: Barry Song Acked-by: David Hildenbrand Cc: Baolin Wang Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Lance Yang Cc: Ryan Roberts Signed-off-by: Andrew Morton --- Documentation/admin-guide/kernel-parameters.txt | 2 +- Documentation/admin-guide/mm/transhuge.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1518343bbe22..1666576acc0e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6688,7 +6688,7 @@ 0: no polling (default) thp_anon= [KNL] - Format: ,[KMG]:;-[KMG]: + Format: [KMG],[KMG]:;[KMG]-[KMG]: state is one of "always", "madvise", "never" or "inherit". Control the default behavior of the system with respect to anonymous transparent hugepages. diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index cfdd16a52e39..a1bb495eab59 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -303,7 +303,7 @@ control by passing the parameter ``transparent_hugepage=always`` or kernel command line. Alternatively, each supported anonymous THP size can be controlled by -passing ``thp_anon=,[KMG]:;-[KMG]:``, +passing ``thp_anon=[KMG],[KMG]:;[KMG]-[KMG]:``, where ```` is the THP size (must be a power of 2 of PAGE_SIZE and supported anonymous THP) and ```` is one of ``always``, ``madvise``, ``never`` or ``inherit``. -- 2.51.0 From 0268d4579901821ff17259213c2d8c9679995d48 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Fri, 1 Nov 2024 19:15:57 +0500 Subject: [PATCH 06/16] selftests: hugetlb_dio: check for initial conditions to skip in the start The test should be skipped if initial conditions aren't fulfilled in the start instead of failing and outputting non-compliant TAP logs. This kind of failure pollutes the results. The initial conditions are: - The test should only execute if /tmp file can be allocated. - The test should only execute if huge pages are free. Before: TAP version 13 1..4 Bail out! Error opening file : Read-only file system (30) # Planned tests != run tests (4 != 0) # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:0 error:0 After: TAP version 13 1..0 # SKIP Unable to allocate file: Read-only file system Link: https://lkml.kernel.org/r/20241101141557.3159432-1-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Fixes: 3a103b5315b7 ("selftest: mm: Test if hugepage does not get leaked during __bio_release_pages()") Cc: Muhammad Usama Anjum Cc: Shuah Khan Cc: Donet Tom Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/hugetlb_dio.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/mm/hugetlb_dio.c b/tools/testing/selftests/mm/hugetlb_dio.c index f9ac20c657ec..60001c142ce9 100644 --- a/tools/testing/selftests/mm/hugetlb_dio.c +++ b/tools/testing/selftests/mm/hugetlb_dio.c @@ -44,13 +44,6 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off) if (fd < 0) ksft_exit_fail_perror("Error opening file\n"); - /* Get the free huge pages before allocation */ - free_hpage_b = get_free_hugepages(); - if (free_hpage_b == 0) { - close(fd); - ksft_exit_skip("No free hugepage, exiting!\n"); - } - /* Allocate a hugetlb page */ orig_buffer = mmap(NULL, h_pagesize, mmap_prot, mmap_flags, -1, 0); if (orig_buffer == MAP_FAILED) { @@ -94,8 +87,20 @@ void run_dio_using_hugetlb(unsigned int start_off, unsigned int end_off) int main(void) { size_t pagesize = 0; + int fd; ksft_print_header(); + + /* Open the file to DIO */ + fd = open("/tmp", O_TMPFILE | O_RDWR | O_DIRECT, 0664); + if (fd < 0) + ksft_exit_skip("Unable to allocate file: %s\n", strerror(errno)); + close(fd); + + /* Check if huge pages are free */ + if (!get_free_hugepages()) + ksft_exit_skip("No free hugepage, exiting\n"); + ksft_set_plan(4); /* Get base page size */ -- 2.51.0 From 432dc0654c612457285a5dcf9bb13968ac6f0804 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Fri, 1 Nov 2024 19:19:40 +0000 Subject: [PATCH 07/16] ucounts: fix counter leak in inc_rlimit_get_ucounts() The inc_rlimit_get_ucounts() increments the specified rlimit counter and then checks its limit. If the value exceeds the limit, the function returns an error without decrementing the counter. Link: https://lkml.kernel.org/r/20241101191940.3211128-1-roman.gushchin@linux.dev Fixes: 15bc01effefe ("ucounts: Fix signal ucount refcounting") Signed-off-by: Andrei Vagin Co-developed-by: Roman Gushchin Signed-off-by: Roman Gushchin Tested-by: Roman Gushchin Acked-by: Alexey Gladkov Cc: Kees Cook Cc: Andrei Vagin Cc: "Eric W. Biederman" Cc: Alexey Gladkov Cc: Oleg Nesterov Cc: Signed-off-by: Andrew Morton --- kernel/ucount.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/ucount.c b/kernel/ucount.c index 8c07714ff27d..9469102c5ac0 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -317,7 +317,7 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) for (iter = ucounts; iter; iter = iter->ns->ucounts) { long new = atomic_long_add_return(1, &iter->rlimit[type]); if (new < 0 || new > max) - goto unwind; + goto dec_unwind; if (iter == ucounts) ret = new; max = get_userns_rlimit_max(iter->ns, type); @@ -334,7 +334,6 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) dec_unwind: dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); -unwind: do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } -- 2.51.0 From b8ee299855f08539e04d6c1a6acb3dc9e5423c00 Mon Sep 17 00:00:00 2001 From: Qi Xi Date: Fri, 1 Nov 2024 11:48:03 +0800 Subject: [PATCH 08/16] fs/proc: fix compile warning about variable 'vmcore_mmap_ops' When build with !CONFIG_MMU, the variable 'vmcore_mmap_ops' is defined but not used: >> fs/proc/vmcore.c:458:42: warning: unused variable 'vmcore_mmap_ops' 458 | static const struct vm_operations_struct vmcore_mmap_ops = { Fix this by only defining it when CONFIG_MMU is enabled. Link: https://lkml.kernel.org/r/20241101034803.9298-1-xiqi2@huawei.com Fixes: 9cb218131de1 ("vmcore: introduce remap_oldmem_pfn_range()") Signed-off-by: Qi Xi Reported-by: kernel test robot Closes: https://lore.kernel.org/lkml/202410301936.GcE8yUos-lkp@intel.com/ Cc: Baoquan He Cc: Dave Young Cc: Michael Holzheu Cc: Vivek Goyal Cc: Wang ShaoBo Signed-off-by: Andrew Morton --- fs/proc/vmcore.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index b52d85f8ad59..b4521b096058 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -457,10 +457,6 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf) #endif } -static const struct vm_operations_struct vmcore_mmap_ops = { - .fault = mmap_vmcore_fault, -}; - /** * vmcore_alloc_buf - allocate buffer in vmalloc memory * @size: size of buffer @@ -488,6 +484,11 @@ static inline char *vmcore_alloc_buf(size_t size) * virtually contiguous user-space in ELF layout. */ #ifdef CONFIG_MMU + +static const struct vm_operations_struct vmcore_mmap_ops = { + .fault = mmap_vmcore_fault, +}; + /* * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages * reported as not being ram with the zero page. -- 2.51.0 From 9e05e5c7ee8758141d2db7e8fea2cab34500c6ed Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 4 Nov 2024 19:54:19 +0000 Subject: [PATCH 09/16] signal: restore the override_rlimit logic Prior to commit d64696905554 ("Reimplement RLIMIT_SIGPENDING on top of ucounts") UCOUNT_RLIMIT_SIGPENDING rlimit was not enforced for a class of signals. However now it's enforced unconditionally, even if override_rlimit is set. This behavior change caused production issues. For example, if the limit is reached and a process receives a SIGSEGV signal, sigqueue_alloc fails to allocate the necessary resources for the signal delivery, preventing the signal from being delivered with siginfo. This prevents the process from correctly identifying the fault address and handling the error. From the user-space perspective, applications are unaware that the limit has been reached and that the siginfo is effectively 'corrupted'. This can lead to unpredictable behavior and crashes, as we observed with java applications. Fix this by passing override_rlimit into inc_rlimit_get_ucounts() and skip the comparison to max there if override_rlimit is set. This effectively restores the old behavior. Link: https://lkml.kernel.org/r/20241104195419.3962584-1-roman.gushchin@linux.dev Fixes: d64696905554 ("Reimplement RLIMIT_SIGPENDING on top of ucounts") Signed-off-by: Roman Gushchin Co-developed-by: Andrei Vagin Signed-off-by: Andrei Vagin Acked-by: Oleg Nesterov Acked-by: Alexey Gladkov Cc: Kees Cook Cc: "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton --- include/linux/user_namespace.h | 3 ++- kernel/signal.c | 3 ++- kernel/ucount.c | 6 ++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 3625096d5f85..7183e5aca282 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -141,7 +141,8 @@ static inline long get_rlimit_value(struct ucounts *ucounts, enum rlimit_type ty long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v); -long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type); +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, + bool override_rlimit); void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type); bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long max); diff --git a/kernel/signal.c b/kernel/signal.c index 4344860ffcac..cbabb2d05e0a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -419,7 +419,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, */ rcu_read_lock(); ucounts = task_ucounts(t); - sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); + sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, + override_rlimit); rcu_read_unlock(); if (!sigpending) return NULL; diff --git a/kernel/ucount.c b/kernel/ucount.c index 9469102c5ac0..696406939be5 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -307,7 +307,8 @@ void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) do_dec_rlimit_put_ucounts(ucounts, NULL, type); } -long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type, + bool override_rlimit) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; @@ -320,7 +321,8 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) goto dec_unwind; if (iter == ucounts) ret = new; - max = get_userns_rlimit_max(iter->ns, type); + if (!override_rlimit) + max = get_userns_rlimit_max(iter->ns, type); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. -- 2.51.0 From 0b63c0e01fba40e3992bc627272ec7b618ccaef7 Mon Sep 17 00:00:00 2001 From: Andrew Kanner Date: Sun, 3 Nov 2024 20:38:45 +0100 Subject: [PATCH 10/16] ocfs2: remove entry once instead of null-ptr-dereference in ocfs2_xa_remove() Syzkaller is able to provoke null-ptr-dereference in ocfs2_xa_remove(): [ 57.319872] (a.out,1161,7):ocfs2_xa_remove:2028 ERROR: status = -12 [ 57.320420] (a.out,1161,7):ocfs2_xa_cleanup_value_truncate:1999 ERROR: Partial truncate while removing xattr overlay.upper. Leaking 1 clusters and removing the entry [ 57.321727] BUG: kernel NULL pointer dereference, address: 0000000000000004 [...] [ 57.325727] RIP: 0010:ocfs2_xa_block_wipe_namevalue+0x2a/0xc0 [...] [ 57.331328] Call Trace: [ 57.331477] [...] [ 57.333511] ? do_user_addr_fault+0x3e5/0x740 [ 57.333778] ? exc_page_fault+0x70/0x170 [ 57.334016] ? asm_exc_page_fault+0x2b/0x30 [ 57.334263] ? __pfx_ocfs2_xa_block_wipe_namevalue+0x10/0x10 [ 57.334596] ? ocfs2_xa_block_wipe_namevalue+0x2a/0xc0 [ 57.334913] ocfs2_xa_remove_entry+0x23/0xc0 [ 57.335164] ocfs2_xa_set+0x704/0xcf0 [ 57.335381] ? _raw_spin_unlock+0x1a/0x40 [ 57.335620] ? ocfs2_inode_cache_unlock+0x16/0x20 [ 57.335915] ? trace_preempt_on+0x1e/0x70 [ 57.336153] ? start_this_handle+0x16c/0x500 [ 57.336410] ? preempt_count_sub+0x50/0x80 [ 57.336656] ? _raw_read_unlock+0x20/0x40 [ 57.336906] ? start_this_handle+0x16c/0x500 [ 57.337162] ocfs2_xattr_block_set+0xa6/0x1e0 [ 57.337424] __ocfs2_xattr_set_handle+0x1fd/0x5d0 [ 57.337706] ? ocfs2_start_trans+0x13d/0x290 [ 57.337971] ocfs2_xattr_set+0xb13/0xfb0 [ 57.338207] ? dput+0x46/0x1c0 [ 57.338393] ocfs2_xattr_trusted_set+0x28/0x30 [ 57.338665] ? ocfs2_xattr_trusted_set+0x28/0x30 [ 57.338948] __vfs_removexattr+0x92/0xc0 [ 57.339182] __vfs_removexattr_locked+0xd5/0x190 [ 57.339456] ? preempt_count_sub+0x50/0x80 [ 57.339705] vfs_removexattr+0x5f/0x100 [...] Reproducer uses faultinject facility to fail ocfs2_xa_remove() -> ocfs2_xa_value_truncate() with -ENOMEM. In this case the comment mentions that we can return 0 if ocfs2_xa_cleanup_value_truncate() is going to wipe the entry anyway. But the following 'rc' check is wrong and execution flow do 'ocfs2_xa_remove_entry(loc);' twice: * 1st: in ocfs2_xa_cleanup_value_truncate(); * 2nd: returning back to ocfs2_xa_remove() instead of going to 'out'. Fix this by skipping the 2nd removal of the same entry and making syzkaller repro happy. Link: https://lkml.kernel.org/r/20241103193845.2940988-1-andrew.kanner@gmail.com Fixes: 399ff3a748cf ("ocfs2: Handle errors while setting external xattr values.") Signed-off-by: Andrew Kanner Reported-by: syzbot+386ce9e60fa1b18aac5b@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/671e13ab.050a0220.2b8c0f.01d0.GAE@google.com/T/ Tested-by: syzbot+386ce9e60fa1b18aac5b@syzkaller.appspotmail.com Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton --- fs/ocfs2/xattr.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index dd0a05365e79..73a6f6fd8a8e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2036,8 +2036,7 @@ static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc, rc = 0; ocfs2_xa_cleanup_value_truncate(loc, "removing", orig_clusters); - if (rc) - goto out; + goto out; } } -- 2.51.0 From c289f4de8e479251b64988839fd0e87f246e03a2 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 4 Nov 2024 00:44:09 +0100 Subject: [PATCH 11/16] mailmap: add entry for Thorsten Blum Map my previously used email address to my @linux.dev address. Link: https://lkml.kernel.org/r/20241103234411.2522-2-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Cc: Alex Elder Cc: David S. Miller Cc: Geliang Tang Cc: Kees Cook Cc: Mathieu Othacehe Cc: Matthieu Baerts (NGI0) Cc: Matt Ranostay Cc: Naoya Horiguchi Cc: Neeraj Upadhyay Cc: Quentin Monnet Signed-off-by: Andrew Morton --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 5378f04b2566..5e829da09e7f 100644 --- a/.mailmap +++ b/.mailmap @@ -665,6 +665,7 @@ Tomeu Vizoso Thomas Graf Thomas Körper Thomas Pedersen +Thorsten Blum Tiezhu Yang Tingwei Zhang Tirupathi Reddy -- 2.51.0 From 8de3e97f3d3d62cd9f3067f073e8ac93261597db Mon Sep 17 00:00:00 2001 From: Liu Peibao Date: Fri, 1 Nov 2024 16:12:43 +0800 Subject: [PATCH 12/16] i2c: designware: do not hold SCL low when I2C_DYNAMIC_TAR_UPDATE is not set When the Tx FIFO is empty and the last command has no STOP bit set, the master holds SCL low. If I2C_DYNAMIC_TAR_UPDATE is not set, BIT(13) MST_ON_HOLD of IC_RAW_INTR_STAT is not enabled, causing the __i2c_dw_disable() timeout. This is quite similar to commit 2409205acd3c ("i2c: designware: fix __i2c_dw_disable() in case master is holding SCL low"). Also check BIT(7) MST_HOLD_TX_FIFO_EMPTY in IC_STATUS, which is available when IC_STAT_FOR_CLK_STRETCH is set. Fixes: 2409205acd3c ("i2c: designware: fix __i2c_dw_disable() in case master is holding SCL low") Co-developed-by: Xiaowu Ding Signed-off-by: Xiaowu Ding Co-developed-by: Angus Chen Signed-off-by: Angus Chen Signed-off-by: Liu Peibao Acked-by: Jarkko Nikula Signed-off-by: Andi Shyti --- drivers/i2c/busses/i2c-designware-common.c | 6 ++++-- drivers/i2c/busses/i2c-designware-core.h | 1 + 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-common.c b/drivers/i2c/busses/i2c-designware-common.c index f31d352d98b5..9d88b4fa03e4 100644 --- a/drivers/i2c/busses/i2c-designware-common.c +++ b/drivers/i2c/busses/i2c-designware-common.c @@ -524,7 +524,7 @@ err_release_lock: void __i2c_dw_disable(struct dw_i2c_dev *dev) { struct i2c_timings *t = &dev->timings; - unsigned int raw_intr_stats; + unsigned int raw_intr_stats, ic_stats; unsigned int enable; int timeout = 100; bool abort_needed; @@ -532,9 +532,11 @@ void __i2c_dw_disable(struct dw_i2c_dev *dev) int ret; regmap_read(dev->map, DW_IC_RAW_INTR_STAT, &raw_intr_stats); + regmap_read(dev->map, DW_IC_STATUS, &ic_stats); regmap_read(dev->map, DW_IC_ENABLE, &enable); - abort_needed = raw_intr_stats & DW_IC_INTR_MST_ON_HOLD; + abort_needed = (raw_intr_stats & DW_IC_INTR_MST_ON_HOLD) || + (ic_stats & DW_IC_STATUS_MASTER_HOLD_TX_FIFO_EMPTY); if (abort_needed) { if (!(enable & DW_IC_ENABLE_ENABLE)) { regmap_write(dev->map, DW_IC_ENABLE, DW_IC_ENABLE_ENABLE); diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index 8e8854ec9882..2d32896d0673 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -116,6 +116,7 @@ #define DW_IC_STATUS_RFNE BIT(3) #define DW_IC_STATUS_MASTER_ACTIVITY BIT(5) #define DW_IC_STATUS_SLAVE_ACTIVITY BIT(6) +#define DW_IC_STATUS_MASTER_HOLD_TX_FIFO_EMPTY BIT(7) #define DW_IC_SDA_HOLD_RX_SHIFT 16 #define DW_IC_SDA_HOLD_RX_MASK GENMASK(23, 16) -- 2.51.0 From ace149e0830c380ddfce7e466fe860ca502fe4ee Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 13 Sep 2024 13:57:04 -0400 Subject: [PATCH 13/16] filemap: Fix bounds checking in filemap_read() If the caller supplies an iocb->ki_pos value that is close to the filesystem upper limit, and an iterator with a count that causes us to overflow that limit, then filemap_read() enters an infinite loop. This behaviour was discovered when testing xfstests generic/525 with the "localio" optimisation for loopback NFS mounts. Reported-by: Mike Snitzer Fixes: c2a9737f45e2 ("vfs,mm: fix a dead loop in truncate_inode_pages_range()") Tested-by: Mike Snitzer Signed-off-by: Trond Myklebust Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 36d22968be9a..56fa431c52af 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2625,7 +2625,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(!iov_iter_count(iter))) return 0; - iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); folio_batch_init(&fbatch); do { -- 2.51.0 From 2d5404caa8c7bb5c4e0435f94b28834ae5456623 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 10 Nov 2024 14:19:35 -0800 Subject: [PATCH 14/16] Linux 6.12-rc7 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index b8efbfe9da94..79192a3024bf 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 12 SUBLEVEL = 0 -EXTRAVERSION = -rc6 +EXTRAVERSION = -rc7 NAME = Baby Opossum Posse # *DOCUMENTATION* -- 2.51.0 From a6654a40a852a4ca18aacced4cf5ca87997818d7 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 12 Nov 2024 16:35:36 +0800 Subject: [PATCH 15/16] LoongArch: For all possible CPUs setup logical-physical CPU mapping In order to support ACPI-based physical CPU hotplug, we suppose for all "possible" CPUs cpu_logical_map() can work. Because some drivers want to use cpu_logical_map() for all "possible" CPUs, while currently we only setup logical-physical mapping for "present" CPUs. This lack of mapping also causes cpu_to_node() cannot work for hot-added CPUs. All "possible" CPUs are listed in MADT, and the "present" subset is marked as ACPI_MADT_ENABLED. To setup logical-physical CPU mapping for all possible CPUs and keep present CPUs continuous in cpu_present_mask, we parse MADT twice. The first pass handles CPUs with ACPI_MADT_ENABLED and the second pass handles CPUs without ACPI_MADT_ENABLED. The global flag (cpu_enumerated) is removed because acpi_map_cpu() calls cpu_number_map() rather than set_processor_mask() now. Reported-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kernel/acpi.c | 81 +++++++++++++++++++++++------------- arch/loongarch/kernel/smp.c | 3 +- 2 files changed, 55 insertions(+), 29 deletions(-) diff --git a/arch/loongarch/kernel/acpi.c b/arch/loongarch/kernel/acpi.c index f1a74b80f22c..382a09a7152c 100644 --- a/arch/loongarch/kernel/acpi.c +++ b/arch/loongarch/kernel/acpi.c @@ -58,48 +58,48 @@ void __iomem *acpi_os_ioremap(acpi_physical_address phys, acpi_size size) return ioremap_cache(phys, size); } -static int cpu_enumerated = 0; - #ifdef CONFIG_SMP -static int set_processor_mask(u32 id, u32 flags) +static int set_processor_mask(u32 id, u32 pass) { - int nr_cpus; - int cpu, cpuid = id; - - if (!cpu_enumerated) - nr_cpus = NR_CPUS; - else - nr_cpus = nr_cpu_ids; + int cpu = -1, cpuid = id; - if (num_processors >= nr_cpus) { + if (num_processors >= NR_CPUS) { pr_warn(PREFIX "nr_cpus limit of %i reached." - " processor 0x%x ignored.\n", nr_cpus, cpuid); + " processor 0x%x ignored.\n", NR_CPUS, cpuid); return -ENODEV; } + if (cpuid == loongson_sysconf.boot_cpu_id) cpu = 0; - else - cpu = find_first_zero_bit(cpumask_bits(cpu_present_mask), NR_CPUS); - - if (!cpu_enumerated) - set_cpu_possible(cpu, true); - if (flags & ACPI_MADT_ENABLED) { + switch (pass) { + case 1: /* Pass 1 handle enabled processors */ + if (cpu < 0) + cpu = find_first_zero_bit(cpumask_bits(cpu_present_mask), NR_CPUS); num_processors++; set_cpu_present(cpu, true); - __cpu_number_map[cpuid] = cpu; - __cpu_logical_map[cpu] = cpuid; - } else + break; + case 2: /* Pass 2 handle disabled processors */ + if (cpu < 0) + cpu = find_first_zero_bit(cpumask_bits(cpu_possible_mask), NR_CPUS); disabled_cpus++; + break; + default: + return cpu; + } + + set_cpu_possible(cpu, true); + __cpu_number_map[cpuid] = cpu; + __cpu_logical_map[cpu] = cpuid; return cpu; } #endif static int __init -acpi_parse_processor(union acpi_subtable_headers *header, const unsigned long end) +acpi_parse_p1_processor(union acpi_subtable_headers *header, const unsigned long end) { struct acpi_madt_core_pic *processor = NULL; @@ -110,12 +110,29 @@ acpi_parse_processor(union acpi_subtable_headers *header, const unsigned long en acpi_table_print_madt_entry(&header->common); #ifdef CONFIG_SMP acpi_core_pic[processor->core_id] = *processor; - set_processor_mask(processor->core_id, processor->flags); + if (processor->flags & ACPI_MADT_ENABLED) + set_processor_mask(processor->core_id, 1); #endif return 0; } +static int __init +acpi_parse_p2_processor(union acpi_subtable_headers *header, const unsigned long end) +{ + struct acpi_madt_core_pic *processor = NULL; + + processor = (struct acpi_madt_core_pic *)header; + if (BAD_MADT_ENTRY(processor, end)) + return -EINVAL; + +#ifdef CONFIG_SMP + if (!(processor->flags & ACPI_MADT_ENABLED)) + set_processor_mask(processor->core_id, 2); +#endif + + return 0; +} static int __init acpi_parse_eio_master(union acpi_subtable_headers *header, const unsigned long end) { @@ -143,12 +160,14 @@ static void __init acpi_process_madt(void) } #endif acpi_table_parse_madt(ACPI_MADT_TYPE_CORE_PIC, - acpi_parse_processor, MAX_CORE_PIC); + acpi_parse_p1_processor, MAX_CORE_PIC); + + acpi_table_parse_madt(ACPI_MADT_TYPE_CORE_PIC, + acpi_parse_p2_processor, MAX_CORE_PIC); acpi_table_parse_madt(ACPI_MADT_TYPE_EIO_PIC, acpi_parse_eio_master, MAX_IO_PICS); - cpu_enumerated = 1; loongson_sysconf.nr_cpus = num_processors; } @@ -310,6 +329,10 @@ static int __ref acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) int nid; nid = acpi_get_node(handle); + + if (nid != NUMA_NO_NODE) + nid = early_cpu_to_node(cpu); + if (nid != NUMA_NO_NODE) { set_cpuid_to_node(physid, nid); node_set(nid, numa_nodes_parsed); @@ -324,12 +347,14 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, u32 acpi_id, int *pcpu { int cpu; - cpu = set_processor_mask(physid, ACPI_MADT_ENABLED); - if (cpu < 0) { + cpu = cpu_number_map(physid); + if (cpu < 0 || cpu >= nr_cpu_ids) { pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); - return cpu; + return -ERANGE; } + num_processors++; + set_cpu_present(cpu, true); acpi_map_cpu2node(handle, cpu, physid); *pcpu = cpu; diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index 9afc2d8b3414..c0b498467ffa 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -331,11 +331,11 @@ void __init loongson_prepare_cpus(unsigned int max_cpus) int i = 0; parse_acpi_topology(); + cpu_data[0].global_id = cpu_logical_map(0); for (i = 0; i < loongson_sysconf.nr_cpus; i++) { set_cpu_present(i, true); csr_mail_send(0, __cpu_logical_map[i], 0); - cpu_data[i].global_id = __cpu_logical_map[i]; } per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; @@ -380,6 +380,7 @@ void loongson_init_secondary(void) cpu_logical_map(cpu) / loongson_sysconf.cores_per_package; cpu_data[cpu].core = pptt_enabled ? cpu_data[cpu].core : cpu_logical_map(cpu) % loongson_sysconf.cores_per_package; + cpu_data[cpu].global_id = cpu_logical_map(cpu); } void loongson_smp_finish(void) -- 2.51.0 From 30cec747d6bf2c3e915c075d76d9712e54cde0a6 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Tue, 12 Nov 2024 16:35:36 +0800 Subject: [PATCH 16/16] LoongArch: Fix early_numa_add_cpu() usage for FDT systems early_numa_add_cpu() applies on physical CPU id rather than logical CPU id, so use cpuid instead of cpu. Cc: stable@vger.kernel.org Fixes: 3de9c42d02a79a5 ("LoongArch: Add all CPUs enabled by fdt to NUMA node 0") Reported-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/kernel/smp.c b/arch/loongarch/kernel/smp.c index c0b498467ffa..5d59e9ce2772 100644 --- a/arch/loongarch/kernel/smp.c +++ b/arch/loongarch/kernel/smp.c @@ -302,7 +302,7 @@ static void __init fdt_smp_setup(void) __cpu_number_map[cpuid] = cpu; __cpu_logical_map[cpu] = cpuid; - early_numa_add_cpu(cpu, 0); + early_numa_add_cpu(cpuid, 0); set_cpuid_to_node(cpuid, 0); } -- 2.51.0