In order to combine the preemption and need_resched test we need to
fold the need_resched information into the preempt_count value.
Since the NEED_RESCHED flag is set across CPUs this needs to be an
atomic operation, however we very much want to avoid making
preempt_count atomic, therefore we keep the existing TIF_NEED_RESCHED
infrastructure in place but at 3 sites test it and fold its value into
preempt_count; namely:
 - resched_task() when setting TIF_NEED_RESCHED on the current task
 - scheduler_ipi() when resched_task() sets TIF_NEED_RESCHED on a
                   remote task it follows it up with a reschedule IPI
                   and we can modify the cpu local preempt_count from
                   there.
 - cpu_idle_loop() for when resched_task() found tsk_is_polling().
We use an inverted bitmask to indicate need_resched so that a 0 means
both need_resched and !atomic.
Also remove the barrier() in preempt_enable() between
preempt_enable_no_resched() and preempt_check_resched() to avoid
having to reload the preemption value and allow the compiler to use
the flags of the previous decrement. I couldn't come up with any sane
reason for this barrier() to be there as preempt_enable_no_resched()
already has a barrier() before doing the decrement.
Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-7a7m5qqbn5pmwnd4wko9u6da@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
 
 #include <linux/linkage.h>
 #include <linux/list.h>
 
+/*
+ * We use the MSB mostly because it's available; see <linux/preempt_mask.h> for
+ * the other bits -- can't include that header due to inclusion hell.
+ */
+#define PREEMPT_NEED_RESCHED   0x80000000
+
+/*
+ * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
+ * that think a non-zero value indicates we cannot preempt.
+ */
 static __always_inline int preempt_count(void)
 {
-       return current_thread_info()->preempt_count;
+       return current_thread_info()->preempt_count & ~PREEMPT_NEED_RESCHED;
 }
 
 static __always_inline int *preempt_count_ptr(void)
        return &current_thread_info()->preempt_count;
 }
 
+/*
+ * We now lose PREEMPT_NEED_RESCHED and cause an extra reschedule; however the
+ * alternative is losing a reschedule. Better schedule too often -- also this
+ * should be a very rare operation.
+ */
 static __always_inline void preempt_count_set(int pc)
 {
        *preempt_count_ptr() = pc;
 }
 
+/*
+ * We fold the NEED_RESCHED bit into the preempt count such that
+ * preempt_enable() can decrement and test for needing to reschedule with a
+ * single instruction.
+ *
+ * We invert the actual bit, so that when the decrement hits 0 we know we both
+ * need to resched (the bit is cleared) and can resched (no preempt count).
+ */
+
+static __always_inline void set_preempt_need_resched(void)
+{
+       *preempt_count_ptr() &= ~PREEMPT_NEED_RESCHED;
+}
+
+static __always_inline void clear_preempt_need_resched(void)
+{
+       *preempt_count_ptr() |= PREEMPT_NEED_RESCHED;
+}
+
+static __always_inline bool test_preempt_need_resched(void)
+{
+       return !(*preempt_count_ptr() & PREEMPT_NEED_RESCHED);
+}
+
 #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER)
   extern void add_preempt_count(int val);
   extern void sub_preempt_count(int val);
 
 #define preempt_check_resched() \
 do { \
-       if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+       if (unlikely(!*preempt_count_ptr())) \
                preempt_schedule(); \
 } while (0)
 
 
 #define preempt_check_resched_context() \
 do { \
-       if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
+       if (unlikely(!*preempt_count_ptr())) \
                preempt_schedule_context(); \
 } while (0)
 #else
 #define preempt_enable() \
 do { \
        preempt_enable_no_resched(); \
-       barrier(); \
        preempt_check_resched(); \
 } while (0)
 
 #define preempt_enable_notrace() \
 do { \
        preempt_enable_no_resched_notrace(); \
-       barrier(); \
        preempt_check_resched_context(); \
 } while (0)
 
 
 #include <linux/errno.h>
 #include <linux/nodemask.h>
 #include <linux/mm_types.h>
+#include <linux/preempt.h>
 
 #include <asm/page.h>
 #include <asm/ptrace.h>
  * We include PREEMPT_ACTIVE to avoid cond_resched() from working
  * before the scheduler is active -- see should_resched().
  */
-#define INIT_PREEMPT_COUNT     (1 + PREEMPT_ACTIVE)
+#define INIT_PREEMPT_COUNT     (1 + PREEMPT_ACTIVE + PREEMPT_NEED_RESCHED)
+#define PREEMPT_ENABLED                (PREEMPT_NEED_RESCHED)
+#define PREEMPT_DISABLED       (1 + PREEMPT_NEED_RESCHED)
 
 /**
  * struct thread_group_cputimer - thread group interval timer counts
 
 static inline int need_resched(void)
 {
-       return unlikely(test_thread_flag(TIF_NEED_RESCHED));
+       return unlikely(test_preempt_need_resched());
 }
 
 /*
 
                                __current_set_polling();
                        }
                        arch_cpu_idle_exit();
+                       /*
+                        * We need to test and propagate the TIF_NEED_RESCHED
+                        * bit here because we might not have sent the
+                        * reschedule IPI to idle tasks.
+                        */
+                       if (tif_need_resched())
+                               set_preempt_need_resched();
                }
                tick_nohz_idle_exit();
                schedule_preempt_disabled();
 
        set_tsk_need_resched(p);
 
        cpu = task_cpu(p);
-       if (cpu == smp_processor_id())
+       if (cpu == smp_processor_id()) {
+               set_preempt_need_resched();
                return;
+       }
 
        /* NEED_RESCHED must be visible before we test polling */
        smp_mb();
 
 void scheduler_ipi(void)
 {
+       /*
+        * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+        * TIF_NEED_RESCHED remotely (for the first time) will also send
+        * this IPI.
+        */
+       if (tif_need_resched())
+               set_preempt_need_resched();
+
        if (llist_empty(&this_rq()->wake_list)
                        && !tick_nohz_full_cpu(smp_processor_id())
                        && !got_nohz_idle_kick())
 #endif
 #ifdef CONFIG_PREEMPT_COUNT
        /* Want to start with kernel preemption disabled. */
-       task_thread_info(p)->preempt_count = 1;
+       task_thread_info(p)->preempt_count = PREEMPT_DISABLED;
 #endif
 #ifdef CONFIG_SMP
        plist_node_init(&p->pushable_tasks, MAX_PRIO);
        put_prev_task(rq, prev);
        next = pick_next_task(rq);
        clear_tsk_need_resched(prev);
+       clear_preempt_need_resched();
        rq->skip_clock_update = 0;
 
        if (likely(prev != next)) {
  */
 asmlinkage void __sched preempt_schedule_irq(void)
 {
-       struct thread_info *ti = current_thread_info();
        enum ctx_state prev_state;
 
        /* Catch callers which need to be fixed */
-       BUG_ON(ti->preempt_count || !irqs_disabled());
+       BUG_ON(preempt_count() || !irqs_disabled());
 
        prev_state = exception_enter();
 
        raw_spin_unlock_irqrestore(&rq->lock, flags);
 
        /* Set the preempt count _outside_ the spinlocks! */
-       task_thread_info(idle)->preempt_count = 0;
+       task_thread_info(idle)->preempt_count = PREEMPT_ENABLED;
 
        /*
         * The idle tasks have their own, simple scheduling class: