{
        unsigned long gseq = READ_ONCE(rcu_state.barrier_sequence);
        unsigned long lseq = READ_ONCE(rdp->barrier_seq_snap);
+       bool wake_nocb = false;
+       bool was_alldone = false;
 
        lockdep_assert_held(&rcu_state.barrier_lock);
        if (rcu_seq_state(lseq) || !rcu_seq_state(gseq) || rcu_seq_ctr(lseq) != rcu_seq_ctr(gseq))
        rdp->barrier_head.func = rcu_barrier_callback;
        debug_rcu_head_queue(&rdp->barrier_head);
        rcu_nocb_lock(rdp);
+       /*
+        * Flush bypass and wakeup rcuog if we add callbacks to an empty regular
+        * queue. This way we don't wait for bypass timer that can reach seconds
+        * if it's fully lazy.
+        */
+       was_alldone = rcu_rdp_is_offloaded(rdp) && !rcu_segcblist_pend_cbs(&rdp->cblist);
        WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+       wake_nocb = was_alldone && rcu_segcblist_pend_cbs(&rdp->cblist);
        if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head)) {
                atomic_inc(&rcu_state.barrier_cpu_count);
        } else {
                rcu_barrier_trace(TPS("IRQNQ"), -1, rcu_state.barrier_sequence);
        }
        rcu_nocb_unlock(rdp);
+       if (wake_nocb)
+               wake_nocb_gp(rdp, false);
        smp_store_release(&rdp->barrier_seq_snap, gseq);
 }
 
 
 static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
 static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
 static void rcu_init_one_nocb(struct rcu_node *rnp);
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force);
 static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
                                  unsigned long j);
 static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,