local_bh_enable();
 }
 
+static bool flush_required(int cpu)
+{
+#if IS_ENABLED(CONFIG_RPS)
+       struct softnet_data *sd = &per_cpu(softnet_data, cpu);
+       bool do_flush;
+
+       local_irq_disable();
+       rps_lock(sd);
+
+       /* as insertion into process_queue happens with the rps lock held,
+        * process_queue access may race only with dequeue
+        */
+       do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
+                  !skb_queue_empty_lockless(&sd->process_queue);
+       rps_unlock(sd);
+       local_irq_enable();
+
+       return do_flush;
+#endif
+       /* without RPS we can't safely check input_pkt_queue: during a
+        * concurrent remote skb_queue_splice() we can detect as empty both
+        * input_pkt_queue and process_queue even if the latter could end-up
+        * containing a lot of packets.
+        */
+       return true;
+}
+
 static void flush_all_backlogs(void)
 {
+       static cpumask_t flush_cpus;
        unsigned int cpu;
 
+       /* since we are under rtnl lock protection we can use static data
+        * for the cpumask and avoid allocating on stack the possibly
+        * large mask
+        */
+       ASSERT_RTNL();
+
        get_online_cpus();
 
-       for_each_online_cpu(cpu)
-               queue_work_on(cpu, system_highpri_wq,
-                             per_cpu_ptr(&flush_works, cpu));
+       cpumask_clear(&flush_cpus);
+       for_each_online_cpu(cpu) {
+               if (flush_required(cpu)) {
+                       queue_work_on(cpu, system_highpri_wq,
+                                     per_cpu_ptr(&flush_works, cpu));
+                       cpumask_set_cpu(cpu, &flush_cpus);
+               }
+       }
 
-       for_each_online_cpu(cpu)
+       /* we can have in flight packet[s] on the cpus we are not flushing,
+        * synchronize_net() in rollback_registered_many() will take care of
+        * them
+        */
+       for_each_cpu(cpu, &flush_cpus)
                flush_work(per_cpu_ptr(&flush_works, cpu));
 
        put_online_cpus();