]> www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
mm, slub: fix kmem_cache_cpu fields alignment for double cmpxchg
authorVlastimil Babka <vbabka@suse.cz>
Mon, 23 Aug 2021 23:59:03 +0000 (09:59 +1000)
committerStephen Rothwell <sfr@canb.auug.org.au>
Wed, 25 Aug 2021 23:33:34 +0000 (09:33 +1000)
Sven Eckelmann reports [1] that the addition of local_lock to kmem_cache_cpu
breaks a config with 64BIT+LOCK_STAT:

    general protection fault, maybe for address 0xffff888007fcf1c8: 0000 [#1] NOPTI
    CPU: 0 PID: 0 Comm: swapper Not tainted 5.14.0-rc5+ #7
    Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
    RIP: 0010:kmem_cache_alloc+0x81/0x180
    Code: 79 48 00 4c 8b 41 38 0f 84 89 00 00 00 4d 85 c0 0f 84 80 00 00 00 41 8b 44 24 28 49 8b 3c 24 48 8d 4a 01 49 8b 1c 00 4c 89 c0 <48> 0f c7 4f 38 0f 943
    RSP: 0000:ffffffff81803c10 EFLAGS: 00000286
    RAX: ffff88800244e7c0 RBX: ffff88800244e800 RCX: 0000000000000024
    RDX: 0000000000000023 RSI: 0000000000000100 RDI: ffff888007fcf190
    RBP: ffffffff81803c38 R08: ffff88800244e7c0 R09: 0000000000000dc0
    R10: 0000000000004000 R11: 0000000000000000 R12: ffff8880024413c0
    R13: ffffffff810d18f4 R14: 0000000000000dc0 R15: 0000000000000100
    FS:  0000000000000000(0000) GS:ffffffff81840000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: ffff888002001000 CR3: 0000000001824000 CR4: 00000000000006b0
    Call Trace:
     __get_vm_area_node.constprop.0.isra.0+0x74/0x150
     __vmalloc_node_range+0x5a/0x2b0
     ? kernel_clone+0x88/0x390
     ? copy_process+0x1ac/0x17e0
     copy_process+0x768/0x17e0
     ? kernel_clone+0x88/0x390
     kernel_clone+0x88/0x390
     ? _vm_unmap_aliases.part.0+0xe9/0x110
     ? change_page_attr_set_clr+0x10d/0x180
     kernel_thread+0x43/0x50
     ? rest_init+0x100/0x100
     rest_init+0x1e/0x100
     arch_call_rest_init+0x9/0xc
     start_kernel+0x481/0x493
     x86_64_start_reservations+0x24/0x26
     x86_64_start_kernel+0x80/0x84
     secondary_startup_64_no_verify+0xc2/0xcb
    random: get_random_bytes called from oops_exit+0x34/0x60 with crng_init=0
    ---[ end trace 2cac18ac38f640c1 ]---
    RIP: 0010:kmem_cache_alloc+0x81/0x180
    Code: 79 48 00 4c 8b 41 38 0f 84 89 00 00 00 4d 85 c0 0f 84 80 00 00 00 41 8b 44 24 28 49 8b 3c 24 48 8d 4a 01 49 8b 1c 00 4c 89 c0 <48> 0f c7 4f 38 0f 943
    RSP: 0000:ffffffff81803c10 EFLAGS: 00000286
    RAX: ffff88800244e7c0 RBX: ffff88800244e800 RCX: 0000000000000024
    RDX: 0000000000000023 RSI: 0000000000000100 RDI: ffff888007fcf190
    RBP: ffffffff81803c38 R08: ffff88800244e7c0 R09: 0000000000000dc0
    R10: 0000000000004000 R11: 0000000000000000 R12: ffff8880024413c0
    R13: ffffffff810d18f4 R14: 0000000000000dc0 R15: 0000000000000100
    FS:  0000000000000000(0000) GS:ffffffff81840000(0000) knlGS:0000000000000000
    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
    CR2: ffff888002001000 CR3: 0000000001824000 CR4: 00000000000006b0
    Kernel panic - not syncing: Attempted to kill the idle task!
    ---[ end Kernel panic - not syncing: Attempted to kill the idle task! ]---

Decoding the RIP points to this_cpu_cmpxchg_double() call in slab_alloc_node().

The problem is the particular size of local_lock_t with LOCK_STAT resulting
in the following layout:

struct kmem_cache_cpu {
        local_lock_t               lock;                 /*     0    56 */
        void * *                   freelist;             /*    56     8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        long unsigned int          tid;                  /*    64     8 */
        struct page *              page;                 /*    72     8 */
        struct page *              partial;              /*    80     8 */

        /* size: 88, cachelines: 2, members: 5 */
        /* last cacheline: 24 bytes */
};

As pointed out by Sebastian Andrzej Siewior, this_cpu_cmpxchg_double()
needs the freelist and tid fields to be aligned to sum of their sizes
(16 bytes) but they are not in this configuration. This didn't happen
with non-debug RT and !RT configs as well as lockdep.

To fix this, move the lock field below partial field, so that it doesn't
affect the layout.

[1] https://lore.kernel.org/linux-mm/2666777.vCjUEy5FO1@sven-desktop/

This is a fixup for mmotm patch
mm-slub-convert-kmem_cpu_slab-protection-to-local_lock.patch

Link: https://lkml.kernel.org/r/e907c2b6-6df1-8038-8c6c-aa9c1fd11259@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Sven Eckelmann <sven@narfation.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
include/linux/slub_def.h

index b5bcac29b979c351aeb01a50c6e14c0a41b47130..85499f0586b06c55d2a494e5888bda1e2d651b9d 100644 (file)
@@ -41,14 +41,18 @@ enum stat_item {
        CPU_PARTIAL_DRAIN,      /* Drain cpu partial to node partial */
        NR_SLUB_STAT_ITEMS };
 
+/*
+ * When changing the layout, make sure freelist and tid are still compatible
+ * with this_cpu_cmpxchg_double() alignment requirements.
+ */
 struct kmem_cache_cpu {
-       local_lock_t lock;      /* Protects the fields below except stat */
        void **freelist;        /* Pointer to next available object */
        unsigned long tid;      /* Globally unique transaction id */
        struct page *page;      /* The slab from which we are allocating */
 #ifdef CONFIG_SLUB_CPU_PARTIAL
        struct page *partial;   /* Partially allocated frozen slabs */
 #endif
+       local_lock_t lock;      /* Protects the fields above */
 #ifdef CONFIG_SLUB_STATS
        unsigned stat[NR_SLUB_STAT_ITEMS];
 #endif