]> www.infradead.org Git - users/willy/xarray.git/commitdiff
Add a new optional ",cma" suffix to the crashkernel= command line option
author Jiri Bohac <jbohac@suse.cz>
Thu, 12 Jun 2025 10:13:21 +0000 (12:13 +0200)
committer Andrew Morton <akpm@linux-foundation.org>
Sun, 20 Jul 2025 02:08:22 +0000 (19:08 -0700)
Patch series "kdump: crashkernel reservation from CMA", v5.

This series implements a way to reserve additional crash kernel memory
using CMA.

Currently, none of the memory reserved for the crash kernel is usable by the 1st
(production) kernel.  It is also unmapped so that it can't be corrupted by
the fault that will eventually trigger the crash.  This makes sense for
the memory actually used by the kexec-loaded crash kernel image and initrd
and the data prepared during the load (vmcoreinfo, ...).  However, the
reserved space needs to be much larger than that to provide enough
run-time memory for the crash kernel and the kdump userspace.  Estimating
the amount of memory to reserve is difficult.  Being too careful makes
kdump likely to end in OOM, being too generous takes even more memory from
the production system.  Also, the reservation only allows reserving a
single contiguous block (or two with the "low" suffix).  I've seen systems
where this fails because the physical memory is fragmented.

By reserving additional crashkernel memory from CMA, the main crashkernel
reservation can be just large enough to fit the kernel and initrd image,
minimizing the memory taken away from the production system.  Most of the
run-time memory for the crash kernel will be memory previously available
to userspace in the production system.  As this memory is no longer
wasted, the reservation can be done with a generous margin, making kdump
more reliable.  Kernel memory that we need to preserve for dumping is
normally not allocated from CMA, unless it is explicitly allocated as
movable.  Currently this is only the case for memory ballooning and zswap.
Such movable memory will be missing from the vmcore.  User data is
typically not dumped by makedumpfile.  When dumping of user data is
intended, this new CMA reservation cannot be used.

There are five patches in this series:

The first adds a new ",cma" suffix to the recently introduced generic
crashkernel parsing code.  parse_crashkernel() takes one more argument to
store the cma reservation size.

The second patch implements reserve_crashkernel_cma() which performs the
reservation.  If the requested size is not available in a single range,
multiple smaller ranges will be reserved.

The third patch updates Documentation/, explicitly mentioning the
potential DMA corruption of the CMA-reserved memory.

The fourth patch adds a short delay before booting the kdump kernel,
allowing pending DMA transfers to finish.

The fifth patch enables the functionality for x86 as a proof of
concept. There are just three things every arch needs to do:
- call reserve_crashkernel_cma()
- include the CMA-reserved ranges in the physical memory map
- exclude the CMA-reserved ranges from the memory available
  through /proc/vmcore by excluding them from the vmcoreinfo
  PT_LOAD ranges.

Adding other architectures is easy and I can do that as soon as this
series is merged.

With this series applied, specifying
crashkernel=100M crashkernel=1G,cma
on the command line will make a standard crashkernel reservation
of 100M, where kexec will load the kernel and initrd.

An additional 1G will be reserved from CMA, still usable by the production
system.  The crash kernel will have 1.1G memory available.  The 100M can
be reliably predicted based on the size of the kernel and initrd.

The new cma suffix is completely optional. When no
crashkernel=size,cma is specified, everything works as before.

This patch (of 5):

Add a new cma_size parameter to parse_crashkernel().  When not NULL, call
__parse_crashkernel to parse the CMA reservation size from
"crashkernel=size,cma" and store it in cma_size.

Set cma_size to NULL in all calls to parse_crashkernel().

Link: https://lkml.kernel.org/r/aEqnxxfLZMllMC8I@dwarf.suse.cz
Link: https://lkml.kernel.org/r/aEqoQckgoTQNULnh@dwarf.suse.cz
Signed-off-by: Jiri Bohac <jbohac@suse.cz>
Cc: Baoquan He <bhe@redhat.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Donald Dutile <ddutile@redhat.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Philipp Rudo <prudo@redhat.com>
Cc: Pingfan Liu <piliu@redhat.com>
Cc: Tao Liu <ltao@redhat.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
13 files changed:
arch/arm/kernel/setup.c
arch/arm64/mm/init.c
arch/loongarch/kernel/setup.c
arch/mips/kernel/setup.c
arch/powerpc/kernel/fadump.c
arch/powerpc/kexec/core.c
arch/powerpc/mm/nohash/kaslr_booke.c
arch/riscv/mm/init.c
arch/s390/kernel/setup.c
arch/sh/kernel/machine_kexec.c
arch/x86/kernel/setup.c
include/linux/crash_reserve.h
kernel/crash_reserve.c

index a41c93988d2c6b146b2118798c8dec7f87a3e4a4..0bfd66c7ada052f682d12ed11c9f1a10520ee722 100644 (file)
@@ -1004,7 +1004,7 @@ static void __init reserve_crashkernel(void)
        total_mem = get_total_mem();
        ret = parse_crashkernel(boot_command_line, total_mem,
                                &crash_size, &crash_base,
-                               NULL, NULL);
+                               NULL, NULL, NULL);
        /* invalid value specified or crashkernel=0 */
        if (ret || !crash_size)
                return;
index 0c8c35dd645e4a84035fb2ec14a8415c30af3bb5..ea84a61ed50848c6dd0119851eb5edd49b8feea0 100644 (file)
@@ -106,7 +106,7 @@ static void __init arch_reserve_crashkernel(void)
 
        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
                                &crash_size, &crash_base,
-                               &low_size, &high);
+                               &low_size, NULL, &high);
        if (ret)
                return;
 
index b99fbb388fe03ca09fb81b2f7ec97476eaa5d7d9..22b27cd447a1b2d51180c48a63047a7f49ccb93d 100644 (file)
@@ -265,7 +265,7 @@ static void __init arch_reserve_crashkernel(void)
                return;
 
        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-                               &crash_size, &crash_base, &low_size, &high);
+                               &crash_size, &crash_base, &low_size, NULL, &high);
        if (ret)
                return;
 
index fbfe0771317ea4726b6d74f0a3da113b4458d9c7..11b9b6b63e19f3bd314795d9013eac2a1744dfe5 100644 (file)
@@ -458,7 +458,7 @@ static void __init mips_parse_crashkernel(void)
        total_mem = memblock_phys_mem_size();
        ret = parse_crashkernel(boot_command_line, total_mem,
                                &crash_size, &crash_base,
-                               NULL, NULL);
+                               NULL, NULL, NULL);
        if (ret != 0 || crash_size <= 0)
                return;
 
index 8ca49e40c473ede2cce82bba2acbd3b29886c408..28cab25d5b3369eb10c0bef0aa6c994b84f0eed3 100644 (file)
@@ -333,7 +333,7 @@ static __init u64 fadump_calculate_reserve_size(void)
         * memory at a predefined offset.
         */
        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-                               &size, &base, NULL, NULL);
+                               &size, &base, NULL, NULL, NULL);
        if (ret == 0 && size > 0) {
                unsigned long max_size;
 
index 00e9c267b912f032cd99ac98cef1363dc989e484..d1a2d755381ca52f9086f603c464c8efb65295ad 100644 (file)
@@ -110,7 +110,7 @@ void __init arch_reserve_crashkernel(void)
 
        /* use common parsing */
        ret = parse_crashkernel(boot_command_line, total_mem_sz, &crash_size,
-                               &crash_base, NULL, NULL);
+                               &crash_base, NULL, NULL, NULL);
 
        if (ret)
                return;
index 5c8d1bb98b3e86df303913d14db91f4f3312bee7..5e4897daaaeaecd28d0258c08063eeaf7e9a25a0 100644 (file)
@@ -178,7 +178,7 @@ static void __init get_crash_kernel(void *fdt, unsigned long size)
        int ret;
 
        ret = parse_crashkernel(boot_command_line, size, &crash_size,
-                               &crash_base, NULL, NULL);
+                               &crash_base, NULL, NULL, NULL);
        if (ret != 0 || crash_size == 0)
                return;
        if (crash_base == 0)
index 8d0374d7ce8ed72320f58e4cea212d0e2bce8fd4..15683ae13fa5d121c850c20037984f237602a4d1 100644 (file)
@@ -1408,7 +1408,7 @@ static void __init arch_reserve_crashkernel(void)
 
        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
                                &crash_size, &crash_base,
-                               &low_size, &high);
+                               &low_size, NULL, &high);
        if (ret)
                return;
 
index f244c5560e7f62d02a36b5b46382e6005c464424..b99aeb0db2ee304d0ff46ea32a0216b92641964e 100644 (file)
@@ -605,7 +605,7 @@ static void __init reserve_crashkernel(void)
        int rc;
 
        rc = parse_crashkernel(boot_command_line, ident_map_size,
-                              &crash_size, &crash_base, NULL, NULL);
+                              &crash_size, &crash_base, NULL, NULL, NULL);
 
        crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN);
        crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN);
index 8321b31d2e19dc0c5af57020d4225398bd9790fc..37073ca1e0ad3aa595c53e0d64240da7085d33b6 100644 (file)
@@ -146,7 +146,7 @@ void __init reserve_crashkernel(void)
                return;
 
        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
-                       &crash_size, &crash_base, NULL, NULL);
+                       &crash_size, &crash_base, NULL, NULL, NULL);
        if (ret == 0 && crash_size > 0) {
                crashk_res.start = crash_base;
                crashk_res.end = crash_base + crash_size - 1;
index fb27be6971286d942320d3acd414a6245a28356b..c22dc630c2974ca75e123f34b5fa768f173cd682 100644 (file)
@@ -608,7 +608,7 @@ static void __init arch_reserve_crashkernel(void)
 
        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
                                &crash_size, &crash_base,
-                               &low_size, &high);
+                               &low_size, NULL, &high);
        if (ret)
                return;
 
index 1fe7e7d1b214b808034f85c9792bbd94077f77f7..e784aaff2f5a163579ef872a14183b1993b590b0 100644 (file)
@@ -16,7 +16,8 @@ extern struct resource crashk_low_res;
 
 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram,
                unsigned long long *crash_size, unsigned long long *crash_base,
-               unsigned long long *low_size, bool *high);
+               unsigned long long *low_size, unsigned long long *cma_size,
+               bool *high);
 
 #ifdef CONFIG_ARCH_HAS_GENERIC_CRASHKERNEL_RESERVATION
 #ifndef DEFAULT_CRASH_KERNEL_LOW_SIZE
index acb6bf42e30d3dad4bab8a7e2c69977696683db8..86ae1365d04e45cefba6416a0ebb1b00a2a3ca55 100644 (file)
@@ -172,17 +172,19 @@ static int __init parse_crashkernel_simple(char *cmdline,
 
 #define SUFFIX_HIGH 0
 #define SUFFIX_LOW  1
-#define SUFFIX_NULL 2
+#define SUFFIX_CMA  2
+#define SUFFIX_NULL 3
 static __initdata char *suffix_tbl[] = {
        [SUFFIX_HIGH] = ",high",
        [SUFFIX_LOW]  = ",low",
+       [SUFFIX_CMA]  = ",cma",
        [SUFFIX_NULL] = NULL,
 };
 
 /*
  * That function parses "suffix"  crashkernel command lines like
  *
- *     crashkernel=size,[high|low]
+ *     crashkernel=size,[high|low|cma]
  *
  * It returns 0 on success and -EINVAL on failure.
  */
@@ -298,9 +300,11 @@ int __init parse_crashkernel(char *cmdline,
                             unsigned long long *crash_size,
                             unsigned long long *crash_base,
                             unsigned long long *low_size,
+                            unsigned long long *cma_size,
                             bool *high)
 {
        int ret;
+       unsigned long long __always_unused cma_base;
 
        /* crashkernel=X[@offset] */
        ret = __parse_crashkernel(cmdline, system_ram, crash_size,
@@ -331,6 +335,14 @@ int __init parse_crashkernel(char *cmdline,
 
                *high = true;
        }
+
+       /*
+        * optional CMA reservation
+        * cma_base is ignored
+        */
+       if (cma_size)
+               __parse_crashkernel(cmdline, 0, cma_size,
+                       &cma_base, suffix_tbl[SUFFIX_CMA]);
 #endif
        if (!*crash_size)
                ret = -EINVAL;