void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
                      struct tss_struct *tss);
 
+/* This runs on the previous thread's stack. */
+static inline void prepare_switch_to(struct task_struct *prev,
+                                    struct task_struct *next)
+{
+#ifdef CONFIG_VMAP_STACK
+       /*
+        * If we switch to a stack that has a top-level paging entry
+        * that is not present in the current mm, the resulting #PF
+        * will be promoted to a double-fault and we'll panic.  Probe
+        * the new stack now so that vmalloc_fault can fix up the page
+        * tables if needed.  This can only happen if we use a stack
+        * in vmap space.
+        *
+        * We assume that the stack is aligned so that it never spans
+        * more than one top-level paging entry.
+        *
+        * To minimize cache pollution, just follow the stack pointer.
+        */
+       READ_ONCE(*(unsigned char *)next->thread.sp);
+#endif
+}
+
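As an aside (not part of the patch), the alignment assumption in the comment can be restated as a minimal sketch; the helper name below is made up, while pgd_index(), THREAD_SIZE and PGDIR_SIZE are the usual kernel definitions:

/*
 * Sketch only: with a THREAD_SIZE-aligned stack and THREAD_SIZE no
 * larger than PGDIR_SIZE, the first and last bytes of the stack fall
 * under the same top-level paging entry, so touching any single byte
 * of it (as the READ_ONCE() above does) is enough to populate that
 * entry for the whole stack.
 */
static inline bool stack_fits_one_pgd_entry(unsigned long stack_base)
{
	return pgd_index(stack_base) ==
	       pgd_index(stack_base + THREAD_SIZE - 1);
}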
 #ifdef CONFIG_X86_32
 
 #ifdef CONFIG_CC_STACKPROTECTOR
         */                                                             \
        unsigned long ebx, ecx, edx, esi, edi;                          \
                                                                        \
+       prepare_switch_to(prev, next);                                  \
+                                                                       \
        asm volatile("pushl %%ebp\n\t"          /* save    EBP   */     \
                     "movl %%esp,%[prev_sp]\n\t"        /* save    ESP   */ \
                     "movl %[next_sp],%%esp\n\t"        /* restore ESP   */ \
  * clean in kernel mode, with the possible exception of IOPL.  Kernel IOPL
  * has no effect.
  */
-#define switch_to(prev, next, last) \
+#define switch_to(prev, next, last)                                      \
+       prepare_switch_to(prev, next);                                    \
+                                                                         \
        asm volatile(SAVE_CONTEXT                                         \
             "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */       \
             "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */    \
 
 DO_ERROR(X86_TRAP_SS,     SIGBUS,  "stack segment",            stack_segment)
 DO_ERROR(X86_TRAP_AC,     SIGBUS,  "alignment check",          alignment_check)
 
+#ifdef CONFIG_VMAP_STACK
+static void __noreturn handle_stack_overflow(const char *message,
+                                            struct pt_regs *regs,
+                                            unsigned long fault_address)
+{
+       printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n",
+                (void *)fault_address, current->stack,
+                (char *)current->stack + THREAD_SIZE - 1);
+       die(message, regs, 0);
+
+       /* Be absolutely certain we don't return. */
+       panic(message);
+}
+#endif
+
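For context, a hedged sketch of how a page-fault path could report the same condition when a kernel-mode fault address lands in the guard page just below the current stack; the function name is illustrative, and only handle_stack_overflow() comes from the hunk above:

/*
 * Illustrative only: diagnose a fault whose address falls in the page
 * immediately below the current task's stack.
 */
static void check_stack_guard_page(struct pt_regs *regs, unsigned long address)
{
	unsigned long stack = (unsigned long)task_stack_page(current);

	/* Unsigned wrap-around keeps this true only just below the stack. */
	if (stack - 1 - address < PAGE_SIZE)
		handle_stack_overflow("kernel stack overflow (page fault)",
				      regs, address);
}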
 #ifdef CONFIG_X86_64
 /* Runs on IST stack */
 dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 {
        static const char str[] = "double fault";
        struct task_struct *tsk = current;
+#ifdef CONFIG_VMAP_STACK
+       unsigned long cr2;
+#endif
 
 #ifdef CONFIG_X86_ESPFIX64
        extern unsigned char native_irq_return_iret[];
        tsk->thread.error_code = error_code;
        tsk->thread.trap_nr = X86_TRAP_DF;
 
+#ifdef CONFIG_VMAP_STACK
+       /*
+        * If we overflow the stack into a guard page, the CPU will fail
+        * to deliver #PF and will send #DF instead.  Similarly, if we
+        * take any non-IST exception while too close to the bottom of
+        * the stack, the processor will get a page fault while
+        * delivering the exception and will generate a double fault.
+        *
+        * According to the SDM (footnote in 6.15 under "Interrupt 14 -
+        * Page-Fault Exception (#PF)"):
+        *
+        *   Processors update CR2 whenever a page fault is detected. If a
+        *   second page fault occurs while an earlier page fault is being
+        *   delivered, the faulting linear address of the second fault will
+        *   overwrite the contents of CR2 (replacing the previous
+        *   address). These updates to CR2 occur even if the page fault
+        *   results in a double fault or occurs during the delivery of a
+        *   double fault.
+        *
+        * The logic below has a small possibility of incorrectly diagnosing
+        * some errors as stack overflows.  For example, if the IDT or GDT
+        * gets corrupted such that #GP delivery fails due to a bad descriptor
+        * causing #GP and we hit this condition while CR2 coincidentally
+        * points to the stack guard page, we'll think we overflowed the
+        * stack.  Given that we're going to panic one way or another
+        * if this happens, this isn't necessarily worth fixing.
+        *
+        * If necessary, we could improve the test by only diagnosing
+        * a stack overflow if the saved RSP points within 47 bytes of
+        * the bottom of the stack: if RSP == tsk_stack + 48 and we
+        * take an exception, the stack is already aligned and there
+        * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a
+        * possible error code, so a stack overflow would *not* double
+        * fault.  With any less space left, exception delivery could
+        * fail, and, as a practical matter, we've overflowed the
+        * stack even if the actual trigger for the double fault was
+        * something else.
+        */
+       cr2 = read_cr2();
+       if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
+               handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
+#endif
+
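The comparison above relies on unsigned wrap-around: the difference is small only when CR2 sits in the page immediately below the stack base. A worked example with illustrative addresses:

/*
 * With the stack base at 0xffffc90000004000 and CR2 == 0xffffc90000003ff8
 * (8 bytes below the stack):
 *   0xffffc90000004000 - 1 - 0xffffc90000003ff8 == 7 < PAGE_SIZE,
 * so the #DF is reported as a stack overflow.  For any CR2 at or above
 * the stack base the unsigned subtraction wraps to a huge value and the
 * test correctly fails.
 */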
 #ifdef CONFIG_DOUBLEFAULT
        df_debug(regs, error_code);
 #endif
 
        unsigned cpu = smp_processor_id();
 
        if (likely(prev != next)) {
+               if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+                       /*
+                        * If our current stack is in vmalloc space and isn't
+                        * mapped in the new pgd, we'll double-fault.  Forcibly
+                        * map it.
+                        */
+                       unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
+
+                       pgd_t *pgd = next->pgd + stack_pgd_index;
+
+                       if (unlikely(pgd_none(*pgd)))
+                               set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
+               }
+
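A note on why init_mm.pgd is the right source here (sketch-level reasoning, not from the hunk itself): top-level entries for the vmalloc range are kept authoritative in init_mm and are normally copied into other page tables lazily by vmalloc_fault() on first touch; since a fault on the stack itself would escalate to a double fault, the copy has to happen eagerly, before CR3 is switched later in this block.

/*
 * Illustration only: on 4-level paging, pgd_index() selects bits 47:39
 * of the address, so the single set_pgd() above maps the 512 GiB
 * top-level region containing the vmap'ed stack into the next mm.
 */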
 #ifdef CONFIG_SMP
                this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
                this_cpu_write(cpu_tlbstate.active_mm, next);
 #endif
+
                cpumask_set_cpu(cpu, mm_cpumask(next));
 
                /*