]> www.infradead.org Git - users/dwmw2/linux.git/commitdiff
x86/smpboot: Support parallel startup of secondary CPUs
authorDavid Woodhouse <dwmw@amazon.co.uk>
Thu, 23 Feb 2023 19:11:34 +0000 (19:11 +0000)
committerDavid Woodhouse <dwmw@amazon.co.uk>
Sat, 25 Feb 2023 07:58:48 +0000 (07:58 +0000)
Rework the real-mode startup code to allow for APs to be brought up in
parallel. This is in two parts:

1. Introduce a bit-spinlock to prevent them from all using the real
   mode stack at the same time.

2. Avoid the use of global variables for passing per-CPU information to
   the APs.

To achieve the latter, export the cpuid_to_apicid[] array so that each
AP can find its own per_cpu data (and thus initial_gs, initial_stack and
early_gdt_descr) by searching therein based on its APIC ID.

Introduce a global variable 'smpboot_control' indicating to the AP how
it should find its APIC ID. For a serialized bringup, the APIC ID is
explicitly passed in the low bits of smpboot_control, while for parallel
mode there are flags directing the AP to find its APIC ID in CPUID leaf
0x0b (for X2APIC mode) or CPUID leaf 0x01 where 8 bits are sufficient.

Parallel startup may be disabled by a command line option, and also if:
 • AMD SEV-ES is in use, since the AP may not use CPUID that early.
 • X2APIC is enabled, but CPUID leaf 0xb is not present and correect.
 • X2APIC is not enabled but not even CPUID leaf 0x01 exists.

Aside from the fact that APs will now look up their per-cpu data via the
newly-exported cpuid_to_apicid[] table, there is no behavioural change
intended yet, since new parallel CPUHP states have not — yet — been
added.

[ tglx: Initial proof of concept patch with bitlock and APIC ID lookup ]
[ dwmw2: Rework and testing, commit message, CPUID 0x1 and CPU0 support ]
[ seanc: Fix stray override of initial_gs in common_cpu_up() ]
[ Oleksandr Natalenko: reported suspend/resume issue fixed in
  x86_acpi_suspend_lowlevel ]
Co-developed-by: Thomas Gleixner <tglx@linutronix.de>
Co-developed-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: David Woodhouse <dwmw@amazon.co.uk>
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
Tested-by: Paul E. McKenney <paulmck@kernel.org>
Tested-by: Kim Phillips <kim.phillips@amd.com>
Tested-by: Oleksandr Natalenko <oleksandr@natalenko.name>
Documentation/admin-guide/kernel-parameters.txt
arch/x86/include/asm/realmode.h
arch/x86/include/asm/smp.h
arch/x86/kernel/acpi/sleep.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/asm-offsets.c
arch/x86/kernel/head_64.S
arch/x86/kernel/smpboot.c
arch/x86/realmode/init.c
arch/x86/realmode/rm/trampoline_64.S

index 6cfa6e3996cf75ee6bb1e8b399daa4d5701ab92b..ee099b8aac6d419f341a011de3359b3441d43dcf 100644 (file)
 
        nomodule        Disable module load
 
+       no_parallel_bringup
+                       [X86,SMP] Disable parallel brinugp of secondary cores.
+
        nopat           [X86] Disable PAT (page attribute table extension of
                        pagetables) support.
 
index a336feef0af14318bf3776e417c4fbcd8277b976..f0357cfe2fb04d694a1d7052f94e6afc6842e2ee 100644 (file)
@@ -52,6 +52,7 @@ struct trampoline_header {
        u64 efer;
        u32 cr4;
        u32 flags;
+       u32 lock;
 #endif
 };
 
@@ -65,6 +66,8 @@ extern unsigned long initial_stack;
 extern unsigned long initial_vc_handler;
 #endif
 
+extern u32 *trampoline_lock;
+
 extern unsigned char real_mode_blob[];
 extern unsigned char real_mode_relocs[];
 
index b4dbb20dab1a11d509b4f702f4bde6a6269b4d18..33c0d5fd8af6f5344736e77754697cc3edec2517 100644 (file)
@@ -199,5 +199,13 @@ extern void nmi_selftest(void);
 #define nmi_selftest() do { } while (0)
 #endif
 
-#endif /* __ASSEMBLY__ */
+extern unsigned int smpboot_control;
+
+#endif /* !__ASSEMBLY__ */
+
+/* Control bits for startup_64 */
+#define STARTUP_SECONDARY      0x80000000
+#define STARTUP_APICID_CPUID_0B        0x40000000
+#define STARTUP_APICID_CPUID_01        0x20000000
+
 #endif /* _ASM_X86_SMP_H */
index 3b7f4cdbf2e0a12b9bfcb2e485327d01f20cbf87..47e75c056cb5340935f5c499df631775ad35c288 100644 (file)
@@ -16,6 +16,7 @@
 #include <asm/cacheflush.h>
 #include <asm/realmode.h>
 #include <asm/hypervisor.h>
+#include <asm/smp.h>
 
 #include <linux/ftrace.h>
 #include "../../realmode/rm/wakeup.h"
@@ -57,6 +58,7 @@ asmlinkage acpi_status __visible x86_acpi_enter_sleep_state(u8 state)
  */
 int x86_acpi_suspend_lowlevel(void)
 {
+       unsigned int __maybe_unused saved_smpboot_ctrl;
        struct wakeup_header *header =
                (struct wakeup_header *) __va(real_mode_header->wakeup_header);
 
@@ -115,6 +117,8 @@ int x86_acpi_suspend_lowlevel(void)
        early_gdt_descr.address =
                        (unsigned long)get_cpu_gdt_rw(smp_processor_id());
        initial_gs = per_cpu_offset(smp_processor_id());
+       /* Force the startup into boot mode */
+       saved_smpboot_ctrl = xchg(&smpboot_control, 0);
 #endif
        initial_code = (unsigned long)wakeup_long64;
        saved_magic = 0x123456789abcdef0L;
@@ -127,6 +131,9 @@ int x86_acpi_suspend_lowlevel(void)
        pause_graph_tracing();
        do_suspend_lowlevel();
        unpause_graph_tracing();
+
+       if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_SMP))
+               smpboot_control = saved_smpboot_ctrl;
        return 0;
 }
 
index 20d9a604da7c4b624f701182f0ead13244d1323f..ac1d7e5da1f233e1927f7a08d9b3db1b432cc4e2 100644 (file)
@@ -2377,7 +2377,7 @@ static int nr_logical_cpuids = 1;
 /*
  * Used to store mapping between logical CPU IDs and APIC IDs.
  */
-static int cpuid_to_apicid[] = {
+int cpuid_to_apicid[] = {
        [0 ... NR_CPUS - 1] = -1,
 };
 
index 82c783da16a863d9b0318174ef96b076866700c1..797ae1a15c91e3ae7e30080e0b4f292bf1cef8e3 100644 (file)
@@ -108,6 +108,7 @@ static void __used common(void)
        OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
        OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
        OFFSET(X86_top_of_stack, pcpu_hot, top_of_stack);
+       OFFSET(X86_current_task, pcpu_hot, current_task);
 #ifdef CONFIG_CALL_DEPTH_TRACKING
        OFFSET(X86_call_depth, pcpu_hot, call_depth);
 #endif
index 222efd4a09bc8861b3871f4a175013c0730f8759..c32e5b06a9ce267fbb4af6c81488d612b4b8f923 100644 (file)
@@ -25,6 +25,7 @@
 #include <asm/export.h>
 #include <asm/nospec-branch.h>
 #include <asm/fixmap.h>
+#include <asm/smp.h>
 
 /*
  * We are not able to switch in one step to the final KERNEL ADDRESS SPACE
@@ -241,6 +242,85 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
        UNWIND_HINT_EMPTY
        ANNOTATE_NOENDBR // above
 
+#ifdef CONFIG_SMP
+       /*
+        * Is this the boot CPU coming up? If so everything is available
+        * in initial_gs, initial_stack and early_gdt_descr.
+        */
+       movl    smpboot_control(%rip), %edx
+       testl   $STARTUP_SECONDARY, %edx
+       jz      .Lsetup_cpu
+
+       /*
+        * For parallel boot, the APIC ID is retrieved from CPUID, and then
+        * used to look up the CPU number.  For booting a single CPU, the
+        * CPU number is encoded in smpboot_control.
+        *
+        * Bit 31       STARTUP_SECONDARY flag (checked above)
+        * Bit 30       STARTUP_APICID_CPUID_0B flag (use CPUID 0x0b)
+        * Bit 29       STARTUP_APICID_CPUID_01 flag (use CPUID 0x01)
+        * Bit 0-24     CPU# if STARTUP_APICID_CPUID_xx flags are not set
+        */
+       testl   $STARTUP_APICID_CPUID_0B, %edx
+       jnz     .Luse_cpuid_0b
+       testl   $STARTUP_APICID_CPUID_01, %edx
+       jnz     .Luse_cpuid_01
+       andl    $0x0FFFFFFF, %edx
+       movl    %edx, %ecx
+       jmp     .Linit_cpu_data
+
+.Luse_cpuid_01:
+       mov     $0x01, %eax
+       cpuid
+       mov     %ebx, %edx
+       shr     $24, %edx
+       jmp     .Lsetup_AP
+
+.Luse_cpuid_0b:
+       mov     $0x0B, %eax
+       xorl    %ecx, %ecx
+       cpuid
+
+.Lsetup_AP:
+       /* EDX contains the APIC ID of the current CPU */
+       xorq    %rcx, %rcx
+       leaq    cpuid_to_apicid(%rip), %rbx
+
+.Lfind_cpunr:
+       cmpl    (%rbx,%rcx,4), %edx
+       jz      .Linit_cpu_data
+       inc     %ecx
+       cmpl    nr_cpu_ids(%rip), %ecx
+       jb      .Lfind_cpunr
+
+       /*  APIC ID not found in the table. Drop the trampoline lock and bail. */
+       movq    trampoline_lock(%rip), %rax
+       lock
+       btrl    $0, (%rax)
+
+1:     cli
+       hlt
+       jmp     1b
+
+.Linit_cpu_data:
+       /* Get the per cpu offset for the given CPU# which is in ECX */
+       leaq    __per_cpu_offset(%rip), %rbx
+       movq    (%rbx,%rcx,8), %rbx
+       /* Save it for GS BASE setup */
+       movq    %rbx, initial_gs(%rip)
+
+       /* Calculate the GDT address */
+       movq    $gdt_page, %rcx
+       addq    %rbx, %rcx
+       movq    %rcx, early_gdt_descr_base(%rip)
+
+       /* Find the idle task stack */
+       movq    pcpu_hot + X86_current_task(%rbx), %rcx
+       movq    TASK_threadsp(%rcx), %rcx
+       movq    %rcx, initial_stack(%rip)
+#endif /* CONFIG_SMP */
+
+.Lsetup_cpu:
        /*
         * We must switch to a new descriptor in kernel space for the GDT
         * because soon the kernel won't have access anymore to the userspace
@@ -281,6 +361,14 @@ SYM_INNER_LABEL(secondary_startup_64_no_verify, SYM_L_GLOBAL)
         */
        movq initial_stack(%rip), %rsp
 
+       /* Drop the realmode protection. For the boot CPU the pointer is NULL! */
+       movq    trampoline_lock(%rip), %rax
+       testq   %rax, %rax
+       jz      .Lsetup_idt
+       lock
+       btrl    $0, (%rax)
+
+.Lsetup_idt:
        /* Setup and Load IDT */
        pushq   %rsi
        call    early_setup_idt
@@ -372,7 +460,11 @@ SYM_CODE_END(secondary_startup_64)
 SYM_CODE_START(start_cpu0)
        ANNOTATE_NOENDBR
        UNWIND_HINT_EMPTY
-       movq    initial_stack(%rip), %rsp
+
+       /* Find the idle task stack */
+       movq    PER_CPU_VAR(pcpu_hot) + X86_current_task, %rcx
+       movq    TASK_threadsp(%rcx), %rsp
+
        jmp     .Ljump_to_C_code
 SYM_CODE_END(start_cpu0)
 #endif
@@ -426,6 +518,7 @@ SYM_DATA(initial_vc_handler,        .quad handle_vc_boot_ghcb)
  * reliably detect the end of the stack.
  */
 SYM_DATA(initial_stack, .quad init_thread_union + THREAD_SIZE - FRAME_SIZE)
+SYM_DATA(trampoline_lock, .quad 0);
        __FINITDATA
 
        __INIT
@@ -660,6 +753,9 @@ SYM_DATA_END(level1_fixmap_pgt)
 SYM_DATA(early_gdt_descr,              .word GDT_ENTRIES*8-1)
 SYM_DATA_LOCAL(early_gdt_descr_base,   .quad INIT_PER_CPU_VAR(gdt_page))
 
+       .align 16
+SYM_DATA(smpboot_control,              .long 0)
+
        .align 16
 /* This must match the first entry in level2_kernel_pgt */
 SYM_DATA(phys_base, .quad 0x0)
index b18c1385e181b85e302569523bda20a6962aa9d5..74c76c78f7d2c75b839cca1636646b70a81343bb 100644 (file)
@@ -797,6 +797,16 @@ static int __init cpu_init_udelay(char *str)
 }
 early_param("cpu_init_udelay", cpu_init_udelay);
 
+static bool do_parallel_bringup __ro_after_init = true;
+
+static int __init no_parallel_bringup(char *str)
+{
+       do_parallel_bringup = false;
+
+       return 0;
+}
+early_param("no_parallel_bringup", no_parallel_bringup);
+
 static void __init smp_quirk_init_udelay(void)
 {
        /* if cmdline changed it from default, leave it alone */
@@ -1084,8 +1094,6 @@ int common_cpu_up(unsigned int cpu, struct task_struct *idle)
 #ifdef CONFIG_X86_32
        /* Stack for startup_32 can be just as for start_secondary onwards */
        per_cpu(pcpu_hot.top_of_stack, cpu) = task_top_of_stack(idle);
-#else
-       initial_gs = per_cpu_offset(cpu);
 #endif
        return 0;
 }
@@ -1110,9 +1118,14 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,
                start_ip = real_mode_header->trampoline_start64;
 #endif
        idle->thread.sp = (unsigned long)task_pt_regs(idle);
-       early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
        initial_code = (unsigned long)start_secondary;
-       initial_stack  = idle->thread.sp;
+
+       if (IS_ENABLED(CONFIG_X86_32)) {
+               early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu);
+               initial_stack  = idle->thread.sp;
+       } else if (!do_parallel_bringup) {
+               smpboot_control = STARTUP_SECONDARY | cpu;
+       }
 
        /* Enable the espfix hack for this CPU */
        init_espfix_ap(cpu);
@@ -1512,6 +1525,47 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 
        speculative_store_bypass_ht_init();
 
+       /*
+        * We can do 64-bit AP bringup in parallel if the CPU reports
+        * its APIC ID in CPUID (either leaf 0x0B if we need the full
+        * APIC ID in X2APIC mode, or leaf 0x01 if 8 bits are
+        * sufficient). Otherwise it's too hard. And not for SEV-ES
+        * guests because they can't use CPUID that early.
+        */
+       if (IS_ENABLED(CONFIG_X86_32) || boot_cpu_data.cpuid_level < 1 ||
+           (x2apic_mode && boot_cpu_data.cpuid_level < 0xb) ||
+           cc_platform_has(CC_ATTR_GUEST_STATE_ENCRYPT))
+               do_parallel_bringup = false;
+
+       if (do_parallel_bringup && x2apic_mode) {
+               unsigned int eax, ebx, ecx, edx;
+
+               /*
+                * To support parallel bringup in x2apic mode, the AP will need
+                * to obtain its APIC ID from CPUID 0x0B, since CPUID 0x01 has
+                * only 8 bits. Check that it is present and seems correct.
+                */
+               cpuid_count(0xb, 0, &eax, &ebx, &ecx, &edx);
+
+               /*
+                * AMD says that if executed with an umimplemented level in
+                * ECX, then it will return all zeroes in EAX. Intel says it
+                * will return zeroes in both EAX and EBX. Checking only EAX
+                * should be sufficient.
+                */
+               if (eax) {
+                       pr_debug("Using CPUID 0xb for parallel CPU startup\n");
+                       smpboot_control = STARTUP_SECONDARY | STARTUP_APICID_CPUID_0B;
+               } else {
+                       pr_info("Disabling parallel bringup because CPUID 0xb looks untrustworthy\n");
+                       do_parallel_bringup = false;
+               }
+       } else if (do_parallel_bringup) {
+               /* Without X2APIC, what's in CPUID 0x01 should suffice. */
+               pr_debug("Using CPUID 0x1 for parallel CPU startup\n");
+               smpboot_control = STARTUP_SECONDARY | STARTUP_APICID_CPUID_01;
+       }
+
        snp_set_wakeup_secondary_cpu();
 }
 
index af565816d2ba6aed5df706910d9e43e773b53fcb..788e5559549f39ab72573fb582d5964ce3490ed0 100644 (file)
@@ -154,6 +154,9 @@ static void __init setup_real_mode(void)
 
        trampoline_header->flags = 0;
 
+       trampoline_lock = &trampoline_header->lock;
+       *trampoline_lock = 0;
+
        trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
 
        /* Map the real mode stub as virtual == physical */
index e38d61d6562e4611c9150b935b018bee46717fe5..49ebc1636ffde1ae88d3709113292e9ef4778b10 100644 (file)
@@ -49,6 +49,19 @@ SYM_CODE_START(trampoline_start)
        mov     %ax, %es
        mov     %ax, %ss
 
+       /*
+        * Make sure only one CPU fiddles with the realmode stack
+        */
+.Llock_rm:
+       btl     $0, tr_lock
+       jnc     2f
+       pause
+       jmp     .Llock_rm
+2:
+       lock
+       btsl    $0, tr_lock
+       jc      .Llock_rm
+
        # Setup stack
        movl    $rm_stack_end, %esp
 
@@ -241,6 +254,7 @@ SYM_DATA_START(trampoline_header)
        SYM_DATA(tr_efer,               .space 8)
        SYM_DATA(tr_cr4,                .space 4)
        SYM_DATA(tr_flags,              .space 4)
+       SYM_DATA(tr_lock,               .space 4)
 SYM_DATA_END(trampoline_header)
 
 #include "trampoline_common.S"