        POOL_BH                 = 1 << 0,       /* is a BH pool */
        POOL_MANAGER_ACTIVE     = 1 << 1,       /* being managed */
        POOL_DISASSOCIATED      = 1 << 2,       /* cpu can't serve workers */
+       POOL_BH_DRAINING        = 1 << 3,       /* draining after CPU offline */
 };
 
 enum worker_flags {
 static void kick_bh_pool(struct worker_pool *pool)
 {
 #ifdef CONFIG_SMP
-       if (unlikely(pool->cpu != smp_processor_id())) {
+       /* see drain_dead_softirq_workfn() for BH_DRAINING */
+       if (unlikely(pool->cpu != smp_processor_id() &&
+                    !(pool->flags & POOL_BH_DRAINING))) {
                irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu);
                return;
        }
        struct worker_pool *pool = worker->pool;
        unsigned long work_data;
        int lockdep_start_depth, rcu_start_depth;
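+       /* set while a dead CPU's BH pool is being drained from another CPU */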
+       bool bh_draining = pool->flags & POOL_BH_DRAINING;
 #ifdef CONFIG_LOCKDEP
        /*
         * It is permissible to free the struct work_struct from
 
        rcu_start_depth = rcu_preempt_depth();
        lockdep_start_depth = lockdep_depth(current);
-       lock_map_acquire(&pwq->wq->lockdep_map);
+       /* see drain_dead_softirq_workfn() */
+       if (!bh_draining)
+               lock_map_acquire(&pwq->wq->lockdep_map);
        lock_map_acquire(&lockdep_map);
        /*
         * Strictly speaking we should mark the invariant state without holding
        trace_workqueue_execute_end(work, worker->current_func);
        pwq->stats[PWQ_STAT_COMPLETED]++;
        lock_map_release(&lockdep_map);
-       lock_map_release(&pwq->wq->lockdep_map);
+       if (!bh_draining)
+               lock_map_release(&pwq->wq->lockdep_map);
 
        if (unlikely((worker->task && in_atomic()) ||
                     lockdep_depth(current) != lockdep_start_depth ||
                bh_worker(list_first_entry(&pool->workers, struct worker, node));
 }
 
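+/*
+ * On-stack request used by workqueue_softirq_dead() to drain a dead CPU's BH
+ * pool from an online CPU. @done is completed once the pool is empty.
+ */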
+struct wq_drain_dead_softirq_work {
+       struct work_struct      work;
+       struct worker_pool      *pool;
+       struct completion       done;
+};
+
+static void drain_dead_softirq_workfn(struct work_struct *work)
+{
+       struct wq_drain_dead_softirq_work *dead_work =
+               container_of(work, struct wq_drain_dead_softirq_work, work);
+       struct worker_pool *pool = dead_work->pool;
+       bool repeat;
+
+       /*
+        * @pool's CPU is dead and we want to execute its still pending work
+        * items from this BH work item which is running on a different CPU. As
+        * its CPU is dead, @pool can't be kicked and, as work execution path
+        * will be nested, a lockdep annotation needs to be suppressed. Mark
+        * @pool with %POOL_BH_DRAINING for these special treatments.
+        */
+       raw_spin_lock_irq(&pool->lock);
+       pool->flags |= POOL_BH_DRAINING;
+       raw_spin_unlock_irq(&pool->lock);
+
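+       /* run the dead pool's pending work items from this CPU */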
+       bh_worker(list_first_entry(&pool->workers, struct worker, node));
+
+       raw_spin_lock_irq(&pool->lock);
+       pool->flags &= ~POOL_BH_DRAINING;
+       repeat = need_more_worker(pool);
+       raw_spin_unlock_irq(&pool->lock);
+
+       /*
+        * bh_worker() might hit the consecutive execution limit and bail. If
+        * there are still pending work items, reschedule self and return so
+        * that we don't hog this CPU's BH.
+        */
+       if (repeat) {
+               if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
+                       queue_work(system_bh_highpri_wq, work);
+               else
+                       queue_work(system_bh_wq, work);
+       } else {
+               complete(&dead_work->done);
+       }
+}
+
+/*
+ * @cpu is dead. Drain the remaining BH work items on the current CPU. It's
+ * possible to allocate dead_work per CPU and avoid flushing. However, then we
+ * have to worry about draining overlapping with the CPU coming back online or
+ * nesting (one CPU's dead_work queued on another CPU which is also dead and so
+ * on). Let's keep it simple and drain them synchronously. These are BH work
+ * items which shouldn't be requeued on the same pool. Shouldn't take long.
+ */
+void workqueue_softirq_dead(unsigned int cpu)
+{
+       int i;
+
+       for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
+               struct worker_pool *pool = &per_cpu(bh_worker_pools, cpu)[i];
+               struct wq_drain_dead_softirq_work dead_work;
+
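+               /* skip pools that have nothing left to drain */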
+               if (!need_more_worker(pool))
+                       continue;
+
+               INIT_WORK(&dead_work.work, drain_dead_softirq_workfn);
+               dead_work.pool = pool;
+               init_completion(&dead_work.done);
+
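+               /* queue on the BH workqueue matching the pool's priority */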
+               if (pool->attrs->nice == HIGHPRI_NICE_LEVEL)
+                       queue_work(system_bh_highpri_wq, &dead_work.work);
+               else
+                       queue_work(system_bh_wq, &dead_work.work);
+
+               wait_for_completion(&dead_work.done);
+       }
+}
+
 /**
  * check_flush_dependency - check for flush dependency sanity
  * @target_wq: workqueue being flushed