rcu: Make rcu_barrier() no longer block CPU-hotplug operations
author     Paul E. McKenney <paulmck@kernel.org>
           Tue, 14 Dec 2021 21:35:17 +0000 (13:35 -0800)
committer  Paul E. McKenney <paulmck@kernel.org>
           Tue, 8 Feb 2022 18:12:28 +0000 (10:12 -0800)
This commit removes the cpus_read_lock() and cpus_read_unlock() calls
from rcu_barrier(), thus allowing CPUs to come and go during the course
of rcu_barrier() execution.  Posting of the ->barrier_head callbacks does
synchronize with portions of RCU's CPU-hotplug notifiers, but these locks
are held for short time periods on both sides.  Thus, full CPU-hotplug
operations could both start and finish during the execution of a given
rcu_barrier() invocation.

Additional synchronization is provided by a global ->barrier_lock.
Since the ->barrier_lock is only used during rcu_barrier() execution and
during onlining/offlining a CPU, the contention for this lock should
be low.  It might be tempting to make use of a per-CPU lock just on
general principles, but straightforward attempts to do this have the
problems shown below.

Initial state: 3 CPUs present, CPU0 and CPU1 do not have
any callbacks, and CPU2 has callbacks.

1. CPU0 calls rcu_barrier().

2. CPU1 starts offlining for CPU2. CPU1 calls
   rcutree_migrate_callbacks(). rcu_barrier_entrain() is called
   from rcutree_migrate_callbacks(), holding CPU2's rdp->barrier_lock.
   It does not entrain ->barrier_head for CPU2, as rcu_barrier()
   on CPU0 hasn't started the barrier sequence (by calling
   rcu_seq_start(&rcu_state.barrier_sequence)) yet.

3. CPU0 starts a new barrier sequence.  It iterates over
   CPU0 and CPU1, acquiring each CPU's per-CPU ->barrier_lock,
   and finds their segcblist lengths to be zero.  It updates
   ->barrier_seq_snap for CPU0 and CPU1 and continues iterating
   toward CPU2:

    for_each_possible_cpu(cpu) {
        raw_spin_lock_irqsave(&rdp->barrier_lock, flags);
        if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
            WRITE_ONCE(rdp->barrier_seq_snap, gseq);
            raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags);
            rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
            continue;
        }

4. rcutree_migrate_callbacks() completes execution on CPU1.
   Segcblist len for CPU2 becomes 0.

5. The loop iteration on CPU0 checks rcu_segcblist_n_cbs(&rdp->cblist)
   for CPU2, finds it zero, and completes the iteration after setting
   ->barrier_seq_snap.

6. As no ->barrier_head callback has been entrained at this
   point, rcu_barrier() on CPU0 returns.

7. The callbacks that migrated from CPU2 to CPU1 execute, even though
   rcu_barrier() has already returned, which is a bug (see the sketch
   below).
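
In rough code form, the interleaving above looks like this under the
hypothetical per-CPU ->barrier_lock scheme (a simplified sketch, not
actual source; cpu1_rdp and cpu2_rdp are stand-in names for the CPUs'
rcu_data structures):

        CPU1: raw_spin_lock(&cpu2_rdp->barrier_lock);
        CPU1: rcu_barrier_entrain(cpu2_rdp);  /* no-op: sequence not yet started */
        CPU0: rcu_seq_start(&rcu_state.barrier_sequence);
        CPU0: /* loop: CPU0 and CPU1 have empty cblists, both are skipped */
        CPU1: rcu_segcblist_merge(&cpu1_rdp->cblist, &cpu2_rdp->cblist);
        CPU1: raw_spin_unlock(&cpu2_rdp->barrier_lock);  /* CPU2's cblist now empty */
        CPU0: raw_spin_lock(&cpu2_rdp->barrier_lock);
        CPU0: /* sees an empty cblist, skips CPU2, no ->barrier_head entrained */
        CPU0: raw_spin_unlock(&cpu2_rdp->barrier_lock);
        CPU0: /* rcu_barrier() returns before CPU2's migrated callbacks execute */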

Straightforward per-CPU locking is also subject to the following race
condition noted by Boqun Feng:

1. CPU0 calls rcu_barrier(), starting a new barrier sequence by invoking
   rcu_seq_start() and init_completion(), but does not yet initialize
   rcu_state.barrier_cpu_count.

2. CPU1 starts offlining for CPU2, calling rcutree_migrate_callbacks(),
   which in turn calls rcu_barrier_entrain() holding CPU2's
   rdp->barrier_lock.  It then entrains ->barrier_head for CPU2
   and atomically increments rcu_state.barrier_cpu_count, which is
   unfortunately not yet initialized to the value 2.

3. The just-entrained RCU callback is invoked.  It atomically
   decrements rcu_state.barrier_cpu_count and sees that it is
   now zero.  This callback therefore invokes complete().

4. CPU0 continues executing rcu_barrier(), but is not blocked
   by its call to wait_for_completion().  This results in rcu_barrier()
   returning before all pre-existing callbacks have been invoked,
   which is a bug (see the sketch below).
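
Again in rough code form (a simplified sketch of the steps above;
cpu2_rdp is a stand-in name and the stale counter value of zero is
only an example):

        CPU0: rcu_seq_start(&rcu_state.barrier_sequence);
        CPU0: init_completion(&rcu_state.barrier_completion);
              /* rcu_state.barrier_cpu_count still holds its stale value, e.g. 0 */
        CPU1: raw_spin_lock(&cpu2_rdp->barrier_lock);
        CPU1: rcu_barrier_entrain(cpu2_rdp);  /* entrains ->barrier_head, count 0 -> 1 */
        CPU1: raw_spin_unlock(&cpu2_rdp->barrier_lock);
              /* the just-entrained ->barrier_head callback is invoked: */
              if (atomic_dec_and_test(&rcu_state.barrier_cpu_count))  /* 1 -> 0, true */
                      complete(&rcu_state.barrier_completion);
        CPU0: atomic_set(&rcu_state.barrier_cpu_count, 2);         /* too late */
        CPU0: wait_for_completion(&rcu_state.barrier_completion);  /* does not block */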

Therefore, synchronization is provided by rcu_state.barrier_lock,
which is also held across the initialization sequence, especially the
rcu_seq_start() and the atomic_set() that sets rcu_state.barrier_cpu_count
to the value 2.  In addition, this lock is held when entraining the
rcu_barrier() callback, when deciding whether or not a CPU has callbacks
that rcu_barrier() must wait on, when setting the ->qsmaskinitnext for
incoming CPUs, and when migrating callbacks from a CPU that is going
offline.
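
With the fix, the initialization sequence in rcu_barrier(), condensed
from the kernel/rcu/tree.c hunks below, runs entirely under this lock:

        raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
        rcu_seq_start(&rcu_state.barrier_sequence);
        gseq = rcu_state.barrier_sequence;
        ...
        init_completion(&rcu_state.barrier_completion);
        atomic_set(&rcu_state.barrier_cpu_count, 2);
        raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);

Because rcu_barrier_entrain() also requires rcu_state.barrier_lock to
be held, the entraining in step 2 of the scenario above can no longer
slip between rcu_seq_start() and the atomic_set().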

Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Co-developed-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Neeraj Upadhyay <quic_neeraju@quicinc.com>
Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
kernel/rcu/tree.c
kernel/rcu/tree.h

index 004ff1c0d192fd0413885de7e22223ff75b9e0d0..2d70b91e3fbcc600f6908dce3476e47fe19c3162 100644 (file)
@@ -87,6 +87,7 @@ static struct rcu_state rcu_state = {
        .gp_state = RCU_GP_IDLE,
        .gp_seq = (0UL - 300UL) << RCU_SEQ_CTR_SHIFT,
        .barrier_mutex = __MUTEX_INITIALIZER(rcu_state.barrier_mutex),
+       .barrier_lock = __RAW_SPIN_LOCK_UNLOCKED(rcu_state.barrier_lock),
        .name = RCU_NAME,
        .abbr = RCU_ABBR,
        .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
@@ -3994,7 +3995,7 @@ static void rcu_barrier_entrain(struct rcu_data *rdp)
        unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
        unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
 
-       lockdep_assert_held(&rdp->barrier_lock);
+       lockdep_assert_held(&rcu_state.barrier_lock);
        if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
                return;
        rcu_barrier_trace(TPS("IRQ"), -1, rcu_state.barrier_sequence);
@@ -4023,9 +4024,9 @@ static void rcu_barrier_handler(void *cpu_in)
        lockdep_assert_irqs_disabled();
        WARN_ON_ONCE(cpu != rdp->cpu);
        WARN_ON_ONCE(cpu != smp_processor_id());
-       raw_spin_lock(&rdp->barrier_lock);
+       raw_spin_lock(&rcu_state.barrier_lock);
        rcu_barrier_entrain(rdp);
-       raw_spin_unlock(&rdp->barrier_lock);
+       raw_spin_unlock(&rcu_state.barrier_lock);
 }
 
 /**
@@ -4058,6 +4059,7 @@ void rcu_barrier(void)
        }
 
        /* Mark the start of the barrier operation. */
+       raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
        rcu_seq_start(&rcu_state.barrier_sequence);
        gseq = rcu_state.barrier_sequence;
        rcu_barrier_trace(TPS("Inc1"), -1, rcu_state.barrier_sequence);
@@ -4071,7 +4073,7 @@ void rcu_barrier(void)
         */
        init_completion(&rcu_state.barrier_completion);
        atomic_set(&rcu_state.barrier_cpu_count, 2);
-       cpus_read_lock();
+       raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
 
        /*
         * Force each CPU with callbacks to register a new callback.
@@ -4083,21 +4085,21 @@ void rcu_barrier(void)
 retry:
                if (smp_load_acquire(&rdp->barrier_seq_snap) == gseq)
                        continue;
-               raw_spin_lock_irqsave(&rdp->barrier_lock, flags);
+               raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
                if (!rcu_segcblist_n_cbs(&rdp->cblist)) {
                        WRITE_ONCE(rdp->barrier_seq_snap, gseq);
-                       raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags);
+                       raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
                        rcu_barrier_trace(TPS("NQ"), cpu, rcu_state.barrier_sequence);
                        continue;
                }
                if (!rcu_rdp_cpu_online(rdp)) {
                        rcu_barrier_entrain(rdp);
                        WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
-                       raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags);
+                       raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
                        rcu_barrier_trace(TPS("OfflineNoCBQ"), cpu, rcu_state.barrier_sequence);
                        continue;
                }
-               raw_spin_unlock_irqrestore(&rdp->barrier_lock, flags);
+               raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
                if (smp_call_function_single(cpu, rcu_barrier_handler, (void *)cpu, 1)) {
                        schedule_timeout_uninterruptible(1);
                        goto retry;
@@ -4105,7 +4107,6 @@ retry:
                WARN_ON_ONCE(READ_ONCE(rdp->barrier_seq_snap) != gseq);
                rcu_barrier_trace(TPS("OnlineQ"), cpu, rcu_state.barrier_sequence);
        }
-       cpus_read_unlock();
 
        /*
         * Now that we have an rcu_barrier_callback() callback on each
@@ -4173,7 +4174,6 @@ rcu_boot_init_percpu_data(int cpu)
        INIT_WORK(&rdp->strict_work, strict_work_handler);
        WARN_ON_ONCE(rdp->dynticks_nesting != 1);
        WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));
-       raw_spin_lock_init(&rdp->barrier_lock);
        rdp->barrier_seq_snap = rcu_state.barrier_sequence;
        rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
        rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
@@ -4325,10 +4325,10 @@ void rcu_cpu_starting(unsigned int cpu)
        local_irq_save(flags);
        arch_spin_lock(&rcu_state.ofl_lock);
        rcu_dynticks_eqs_online();
-       raw_spin_lock(&rdp->barrier_lock);
+       raw_spin_lock(&rcu_state.barrier_lock);
        raw_spin_lock_rcu_node(rnp);
        WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask);
-       raw_spin_unlock(&rdp->barrier_lock);
+       raw_spin_unlock(&rcu_state.barrier_lock);
        newcpu = !(rnp->expmaskinitnext & mask);
        rnp->expmaskinitnext |= mask;
        /* Allow lockless access for expedited grace periods. */
@@ -4415,7 +4415,7 @@ void rcutree_migrate_callbacks(int cpu)
            rcu_segcblist_empty(&rdp->cblist))
                return;  /* No callbacks to migrate. */
 
-       raw_spin_lock_irqsave(&rdp->barrier_lock, flags);
+       raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
        WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
        rcu_barrier_entrain(rdp);
        my_rdp = this_cpu_ptr(&rcu_data);
@@ -4427,7 +4427,7 @@ void rcutree_migrate_callbacks(int cpu)
        needwake = rcu_advance_cbs(my_rnp, rdp) ||
                   rcu_advance_cbs(my_rnp, my_rdp);
        rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
-       raw_spin_unlock(&rdp->barrier_lock); /* irqs remain disabled. */
+       raw_spin_unlock(&rcu_state.barrier_lock); /* irqs remain disabled. */
        needwake = needwake || rcu_advance_cbs(my_rnp, my_rdp);
        rcu_segcblist_disable(&rdp->cblist);
        WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) != !rcu_segcblist_n_cbs(&my_rdp->cblist));
index 84362951ed9e16eeb178809e4e8f08e260cde8c5..a2d7ffd634cc1f33be93d9bc038a0d027b81967a 100644 (file)
@@ -188,7 +188,6 @@ struct rcu_data {
        bool rcu_forced_tick_exp;       /*   ... provide QS to expedited GP. */
 
        /* 4) rcu_barrier(), OOM callbacks, and expediting. */
-       raw_spinlock_t barrier_lock;    /* Protects ->barrier_seq_snap. */
        unsigned long barrier_seq_snap; /* Snap of rcu_state.barrier_sequence. */
        struct rcu_head barrier_head;
        int exp_dynticks_snap;          /* Double-check need for IPI. */
@@ -323,6 +322,8 @@ struct rcu_state {
                                                /*  rcu_barrier(). */
        /* End of fields guarded by barrier_mutex. */
 
+       raw_spinlock_t barrier_lock;            /* Protects ->barrier_seq_snap. */
+
        struct mutex exp_mutex;                 /* Serialize expedited GP. */
        struct mutex exp_wake_mutex;            /* Serialize wakeup. */
        unsigned long expedited_sequence;       /* Take a ticket. */