* @swork: work struct for serialization.
  * @pd: Backpointer to the internal control structure.
  * @work: work struct for parallelization.
+ * @reorder_work: work struct for reordering.
  * @num_obj: Number of objects that are processed by this cpu.
  * @cpu_index: Index of the cpu.
  */
        struct padata_list    reorder;
        struct parallel_data *pd;
        struct work_struct    work;
+       struct work_struct    reorder_work;
        atomic_t              num_obj;
        int                   cpu_index;
 };
 
        return;
 }
 
+/*
+ * Workqueue handler for a queue's reorder_work: recovers the owning
+ * padata_parallel_queue from the embedded work item and runs
+ * padata_reorder() on its parallel_data with bottom halves disabled.
+ */
+static void invoke_padata_reorder(struct work_struct *work)
+{
+       struct padata_parallel_queue *pqueue =
+               container_of(work, struct padata_parallel_queue, reorder_work);
+
+       local_bh_disable();
+       padata_reorder(pqueue->pd);
+       local_bh_enable();
+}
+
 static void padata_reorder_timer(unsigned long arg)
 {
        struct parallel_data *pd = (struct parallel_data *)arg;
+       unsigned int weight;
+       int target_cpu, cpu;
 
-       padata_reorder(pd);
+       cpu = get_cpu();
+
+       /* Timer callback; arg carries the parallel_data pointer. Picks a
+        * target CPU from pd->processed and gets padata_reorder() run for
+        * it: directly when this callback is already on that CPU (cpu, as
+        * returned by get_cpu() above), otherwise by queueing that CPU's
+        * reorder_work.
+        *
+        * We don't lock pd here, so as not to interfere with the
+        * padata_reorder() calls doing parallel processing on other CPUs.
+        * We just need any CPU out of the cpumask.pcpu set. It would be
+        * nice if it's the right one, but it doesn't matter if we're off
+        * to the next one because of an outdated pd->processed value.
+        */
+       weight = cpumask_weight(pd->cpumask.pcpu);
+       target_cpu = padata_index_to_cpu(pd, pd->processed % weight);
+
+       /* Make sure the reorder callback runs on the target CPU. */
+       if (cpu != target_cpu) {
+               struct padata_parallel_queue *pqueue;
+               struct padata_instance *pinst;
+
+               /* Hand the reorder off to target_cpu by queueing its
+                * per-CPU reorder_work on the instance's workqueue. The
+                * timer function is serialized with respect to itself, so
+                * no locking is needed here.
+                */
+               pinst = pd->pinst;
+               pqueue = per_cpu_ptr(pd->pqueue, target_cpu);
+               queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work);
+       } else {
+               padata_reorder(pd);
+       }
+
+       put_cpu();
 }
 
 static void padata_serial_worker(struct work_struct *serial_work)
                __padata_list_init(&pqueue->reorder);
                __padata_list_init(&pqueue->parallel);
                INIT_WORK(&pqueue->work, padata_parallel_worker);
+               INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder);
                atomic_set(&pqueue->num_obj, 0);
        }
 }