#include <linux/mutex.h>
 #include <linux/time.h>
 #include <linux/kernel_stat.h>
+#include <linux/wait.h>
+#include <linux/kthread.h>
 
 #include "rcutree.h"
 
 int rcu_scheduler_active __read_mostly;
 EXPORT_SYMBOL_GPL(rcu_scheduler_active);
 
+/*
+ * Control variables for per-CPU and per-rcu_node kthreads.  These
+ * handle all flavors of RCU.
+ */
+static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
+static DEFINE_PER_CPU(wait_queue_head_t, rcu_cpu_wq);
+static DEFINE_PER_CPU(char, rcu_cpu_has_work);
+static char rcu_kthreads_spawnable;
+
+static void rcu_node_kthread_setaffinity(struct rcu_node *rnp);
+static void invoke_rcu_kthread(void);
+
+#define RCU_KTHREAD_PRIO 1     /* RT priority for per-CPU kthreads. */
+
 /*
  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
 /*
  * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
  * and move all callbacks from the outgoing CPU to the current one.
+ * There can only be one CPU hotplug operation at a time, so no other
+ * CPU can be attempting to update rcu_cpu_kthread_task.
  */
 static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 {
        int need_report = 0;
        struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
        struct rcu_node *rnp;
+       struct task_struct *t;
+
+       /* Stop the CPU's kthread. */
+       t = per_cpu(rcu_cpu_kthread_task, cpu);
+       if (t != NULL) {
+               per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
+               kthread_stop(t);
+       }
 
        /* Exclude any attempts to start a new grace period. */
        raw_spin_lock_irqsave(&rsp->onofflock, flags);
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        if (need_report & RCU_OFL_TASKS_EXP_GP)
                rcu_report_exp_rnp(rsp, rnp);
+
+       /*
+        * If there are no more online CPUs for this rcu_node structure,
+        * kill the rcu_node structure's kthread.  Otherwise, adjust its
+        * affinity.
+        */
+       t = rnp->node_kthread_task;
+       if (t != NULL && rnp->qsmaskinit == 0) {
+               kthread_stop(t);
+               rnp->node_kthread_task = NULL;
+       } else
+               rcu_node_kthread_setaffinity(rnp);
 }
 
 /*
 
        /* Re-raise the RCU softirq if there are callbacks remaining. */
        if (cpu_has_callbacks_ready_to_invoke(rdp))
-               raise_softirq(RCU_SOFTIRQ);
+               invoke_rcu_kthread();
 }
 
 /*
        }
        rcu_preempt_check_callbacks(cpu);
        if (rcu_pending(cpu))
-               raise_softirq(RCU_SOFTIRQ);
+               invoke_rcu_kthread();
 }
 
 #ifdef CONFIG_SMP
 /*
  * Do softirq processing for the current CPU.
  */
-static void rcu_process_callbacks(struct softirq_action *unused)
+static void rcu_process_callbacks(void)
 {
        __rcu_process_callbacks(&rcu_sched_state,
                                &__get_cpu_var(rcu_sched_data));
        rcu_needs_cpu_flush();
 }
 
+/*
+ * Wake up the current CPU's kthread.  This replaces raise_softirq()
+ * in earlier versions of RCU.  Note that because we are running on
+ * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
+ * cannot disappear out from under us.
+ */
+static void invoke_rcu_kthread(void)
+{
+       unsigned long flags;
+       wait_queue_head_t *q;
+       int cpu;
+
+       local_irq_save(flags);
+       cpu = smp_processor_id();
+       per_cpu(rcu_cpu_has_work, cpu) = 1;
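+       /* Nothing to wake if the kthread has not yet been spawned. */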
+       if (per_cpu(rcu_cpu_kthread_task, cpu) == NULL) {
+               local_irq_restore(flags);
+               return;
+       }
+       q = &per_cpu(rcu_cpu_wq, cpu);
+       wake_up(q);
+       local_irq_restore(flags);
+}
+
+/*
+ * Timer handler used to initiate waking up the per-CPU kthreads that
+ * have yielded the CPU due to an excess number of RCU callbacks.
+ */
+static void rcu_cpu_kthread_timer(unsigned long arg)
+{
+       unsigned long flags;
+       struct rcu_data *rdp = (struct rcu_data *)arg;
+       struct rcu_node *rnp = rdp->mynode;
+       struct task_struct *t;
+
+       raw_spin_lock_irqsave(&rnp->lock, flags);
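+       /* Flag this CPU in the node's wakemask for rcu_node_kthread(). */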
+       rnp->wakemask |= rdp->grpmask;
+       t = rnp->node_kthread_task;
+       if (t == NULL) {
+               raw_spin_unlock_irqrestore(&rnp->lock, flags);
+               return;
+       }
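+       /* Hand off to the node kthread, which restores the RT priority. */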
+       wake_up_process(t);
+       raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Drop to non-real-time priority and yield, but only after posting a
+ * timer that will cause us to regain our real-time priority if we
+ * remain preempted.  Either way, we restore our real-time priority
+ * before returning.
+ */
+static void rcu_yield(int cpu)
+{
+       struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu);
+       struct sched_param sp;
+       struct timer_list yield_timer;
+
+       setup_timer_on_stack(&yield_timer, rcu_cpu_kthread_timer, (unsigned long)rdp);
+       mod_timer(&yield_timer, jiffies + 2);
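+       /* The timer gets our RT priority restored if we stay preempted. */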
+       sp.sched_priority = 0;
+       sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
+       schedule();
+       sp.sched_priority = RCU_KTHREAD_PRIO;
+       sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+       del_timer(&yield_timer);
+}
+
+/*
+ * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
+ * This can happen while the corresponding CPU is either coming online
+ * or going offline.  We cannot wait until the CPU is fully online
+ * before starting the kthread, because the various notifier functions
+ * can wait for RCU grace periods.  So we park rcu_cpu_kthread() until
+ * the corresponding CPU is online.
+ *
+ * Return 1 if the kthread needs to stop, 0 otherwise.
+ *
+ * Caller must disable bh.  This function can momentarily enable it.
+ */
+static int rcu_cpu_kthread_should_stop(int cpu)
+{
+       while (cpu_is_offline(cpu) ||
+              !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
+              smp_processor_id() != cpu) {
+               if (kthread_should_stop())
+                       return 1;
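+               /* Sleep briefly with softirqs enabled, then try again. */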
+               local_bh_enable();
+               schedule_timeout_uninterruptible(1);
+               if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
+                       set_cpus_allowed_ptr(current, cpumask_of(cpu));
+               local_bh_disable();
+       }
+       return 0;
+}
+
+/*
+ * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
+ * earlier RCU softirq.
+ */
+static int rcu_cpu_kthread(void *arg)
+{
+       int cpu = (int)(long)arg;
+       unsigned long flags;
+       int spincnt = 0;
+       wait_queue_head_t *wqp = &per_cpu(rcu_cpu_wq, cpu);
+       char work;
+       char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+
+       for (;;) {
+               wait_event_interruptible(*wqp,
+                                        *workp != 0 || kthread_should_stop());
+               local_bh_disable();
+               if (rcu_cpu_kthread_should_stop(cpu)) {
+                       local_bh_enable();
+                       break;
+               }
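+               /* Atomically snapshot and clear this CPU's work flag. */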
+               local_irq_save(flags);
+               work = *workp;
+               *workp = 0;
+               local_irq_restore(flags);
+               if (work)
+                       rcu_process_callbacks();
+               local_bh_enable();
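+               /* After ten busy passes in a row, yield the CPU for a bit. */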
+               if (*workp != 0)
+                       spincnt++;
+               else
+                       spincnt = 0;
+               if (spincnt > 10) {
+                       rcu_yield(cpu);
+                       spincnt = 0;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Spawn a per-CPU kthread, setting up affinity and priority.
+ * Because the CPU hotplug lock is held, no other CPU will be attempting
+ * to manipulate rcu_cpu_kthread_task.  There might be another CPU
+ * attempting to access it during boot, but the locking in kthread_bind()
+ * will enforce sufficient ordering.
+ */
+static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
+{
+       struct sched_param sp;
+       struct task_struct *t;
+
+       if (!rcu_kthreads_spawnable ||
+           per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
+               return 0;
+       t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
+       if (IS_ERR(t))
+               return PTR_ERR(t);
+       kthread_bind(t, cpu);
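+       /* Bind before publishing the task pointer and waking the kthread. */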
+       WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
+       per_cpu(rcu_cpu_kthread_task, cpu) = t;
+       wake_up_process(t);
+       sp.sched_priority = RCU_KTHREAD_PRIO;
+       sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+       return 0;
+}
+
+/*
+ * Per-rcu_node kthread, which is in charge of waking up the per-CPU
+ * kthreads when needed.  We ignore requests to wake up kthreads
+ * for offline CPUs, which is OK because force_quiescent_state()
+ * takes care of this case.
+ */
+static int rcu_node_kthread(void *arg)
+{
+       int cpu;
+       unsigned long flags;
+       unsigned long mask;
+       struct rcu_node *rnp = (struct rcu_node *)arg;
+       struct sched_param sp;
+       struct task_struct *t;
+
+       for (;;) {
+               wait_event_interruptible(rnp->node_wq, rnp->wakemask != 0 ||
+                                                      kthread_should_stop());
+               if (kthread_should_stop())
+                       break;
+               raw_spin_lock_irqsave(&rnp->lock, flags);
+               mask = rnp->wakemask;
+               rnp->wakemask = 0;
+               raw_spin_unlock_irqrestore(&rnp->lock, flags);
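+               /* Scan the wakemask snapshot, one bit per CPU in this node. */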
+               for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
+                       if ((mask & 0x1) == 0)
+                               continue;
+                       preempt_disable();
+                       t = per_cpu(rcu_cpu_kthread_task, cpu);
+                       if (!cpu_online(cpu) || t == NULL) {
+                               preempt_enable();
+                               continue;
+                       }
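+                       /* Mark work pending and restore RT priority. */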
+                       per_cpu(rcu_cpu_has_work, cpu) = 1;
+                       sp.sched_priority = RCU_KTHREAD_PRIO;
+                       sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+                       preempt_enable();
+               }
+       }
+       return 0;
+}
+
+/*
+ * Set the per-rcu_node kthread's affinity to cover all CPUs that are
+ * served by the rcu_node in question.
+ */
+static void rcu_node_kthread_setaffinity(struct rcu_node *rnp)
+{
+       cpumask_var_t cm;
+       int cpu;
+       unsigned long mask = rnp->qsmaskinit;
+
+       if (rnp->node_kthread_task == NULL || rnp->qsmaskinit == 0)
+               return;
+       if (!alloc_cpumask_var(&cm, GFP_KERNEL))
+               return;
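+       /* Build the mask of CPUs served by this rcu_node (from qsmaskinit). */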
+       cpumask_clear(cm);
+       for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
+               if (mask & 0x1)
+                       cpumask_set_cpu(cpu, cm);
+       set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
+       free_cpumask_var(cm);
+}
+
+/*
+ * Spawn a per-rcu_node kthread, setting priority and affinity.
+ */
+static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
+                                               struct rcu_node *rnp)
+{
+       int rnp_index = rnp - &rsp->node[0];
+       struct sched_param sp;
+       struct task_struct *t;
+
+       if (!rcu_kthreads_spawnable ||
+           rnp->qsmaskinit == 0 ||
+           rnp->node_kthread_task != NULL)
+               return 0;
+       t = kthread_create(rcu_node_kthread, (void *)rnp, "rcun%d", rnp_index);
+       if (IS_ERR(t))
+               return PTR_ERR(t);
+       rnp->node_kthread_task = t;
+       wake_up_process(t);
+       sp.sched_priority = 99;
+       sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+       return 0;
+}
+
+/*
+ * Spawn all kthreads -- called as soon as the scheduler is running.
+ */
+static int __init rcu_spawn_kthreads(void)
+{
+       int cpu;
+       struct rcu_node *rnp;
+
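+       /* Allow kthreads to be spawned, here and via CPU-hotplug notifiers. */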
+       rcu_kthreads_spawnable = 1;
+       for_each_possible_cpu(cpu) {
+               init_waitqueue_head(&per_cpu(rcu_cpu_wq, cpu));
+               per_cpu(rcu_cpu_has_work, cpu) = 0;
+               if (cpu_online(cpu))
+                       (void)rcu_spawn_one_cpu_kthread(cpu);
+       }
+       rcu_for_each_leaf_node(&rcu_sched_state, rnp) {
+               init_waitqueue_head(&rnp->node_wq);
+               (void)rcu_spawn_one_node_kthread(&rcu_sched_state, rnp);
+       }
+       return 0;
+}
+early_initcall(rcu_spawn_kthreads);
+
 static void
 __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
           struct rcu_state *rsp)
        rcu_preempt_init_percpu_data(cpu);
 }
 
+static void __cpuinit rcu_online_kthreads(int cpu)
+{
+       struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu);
+       struct rcu_node *rnp = rdp->mynode;
+
+       /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
+       if (rcu_kthreads_spawnable) {
+               (void)rcu_spawn_one_cpu_kthread(cpu);
+               if (rnp->node_kthread_task == NULL)
+                       (void)rcu_spawn_one_node_kthread(&rcu_sched_state, rnp);
+       }
+}
+
 /*
  * Handle CPU online/offline notification events.
  */
                                    unsigned long action, void *hcpu)
 {
        long cpu = (long)hcpu;
+       struct rcu_data *rdp = per_cpu_ptr(rcu_sched_state.rda, cpu);
+       struct rcu_node *rnp = rdp->mynode;
 
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_UP_PREPARE_FROZEN:
                rcu_online_cpu(cpu);
+               rcu_online_kthreads(cpu);
+               break;
+       case CPU_ONLINE:
+               rcu_node_kthread_setaffinity(rnp);
                break;
        case CPU_DYING:
        case CPU_DYING_FROZEN:
        rcu_init_one(&rcu_sched_state, &rcu_sched_data);
        rcu_init_one(&rcu_bh_state, &rcu_bh_data);
        __rcu_init_preempt();
-       open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 
        /*
         * We don't need protection against CPU-hotplug here because