rq_affinity (RW)
 ----------------
-If this option is enabled, the block layer will migrate request completions
-to the CPU that originally submitted the request. For some workloads
-this provides a significant reduction in CPU cycles due to caching effects.
+If this option is '1', the block layer will migrate request completions to the
+CPU "group" that originally submitted the request. For some workloads this
+provides a significant reduction in CPU cycles due to caching effects.
+
+For storage configurations that need to maximize distribution of completion
+processing, setting this option to '2' forces the completion to run on the
+requesting CPU (bypassing the "group" aggregation logic).
 
 scheduler (RW)
 --------------
 
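Before moving on to the implementation, a quick illustration of the interface documented above (not part of the patch): a userspace sketch that requests strict completion affinity by writing '2' to a queue's rq_affinity attribute and reads the setting back. The device name sda is only a placeholder for whatever block device is being tuned.

/* Illustration only: drive the rq_affinity attribute from userspace. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/sda/queue/rq_affinity";	/* example device */
	char buf[8];
	FILE *f;

	f = fopen(path, "w");
	if (!f) {
		perror(path);
		return 1;
	}
	fputs("2\n", f);	/* 2 = force completion on the submitting CPU */
	fclose(f);

	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("rq_affinity = %s", buf);
	fclose(f);
	return 0;
}

Writing '1' restores the group-based behaviour, and '0' lets the completion run wherever the interrupt is handled.
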
        init_request_from_bio(req, bio);
 
        if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
-           bio_flagged(bio, BIO_CPU_AFFINE)) {
-               req->cpu = blk_cpu_to_group(get_cpu());
-               put_cpu();
-       }
+           bio_flagged(bio, BIO_CPU_AFFINE))
+               req->cpu = raw_smp_processor_id();
 
        plug = current->plug;
        if (plug) {
 
 
 void __blk_complete_request(struct request *req)
 {
+       int ccpu, cpu, group_cpu = NR_CPUS;
        struct request_queue *q = req->q;
        unsigned long flags;
-       int ccpu, cpu, group_cpu;
 
        BUG_ON(!q->softirq_done_fn);
 
        local_irq_save(flags);
        cpu = smp_processor_id();
-       group_cpu = blk_cpu_to_group(cpu);
 
        /*
         * Select completion CPU
         */
-       if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
+       if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) {
                ccpu = req->cpu;
-       else
+               if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) {
+                       ccpu = blk_cpu_to_group(ccpu);
+                       group_cpu = blk_cpu_to_group(cpu);
+               }
+       } else
                ccpu = cpu;
 
        if (ccpu == cpu || ccpu == group_cpu) {
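
To spell out the decision the hunk above makes, here is a standalone sketch (ordinary C, not kernel code) of the completion-CPU selection: with plain QUEUE_FLAG_SAME_COMP a match on the submitting CPU's group is enough to avoid an IPI, while QUEUE_FLAG_SAME_FORCE leaves the group comparison unset so only the exact submitting CPU completes locally. The struct queue_affinity, group_of() and completion_cpu() names are invented for the example; group_of() stands in for blk_cpu_to_group().

/* Sketch only: where does a completion run under rq_affinity 1 vs 2? */
#include <stdbool.h>
#include <stdio.h>

struct queue_affinity {
	bool same_comp;		/* QUEUE_FLAG_SAME_COMP: rq_affinity >= 1 */
	bool same_force;	/* QUEUE_FLAG_SAME_FORCE: rq_affinity == 2 */
};

/* Pretend CPUs come in sibling pairs that share a cache/group. */
static int group_of(int cpu)
{
	return cpu & ~1;
}

/*
 * Returns the CPU the completion is steered to; *local tells whether it can
 * be handled right here (no IPI), mirroring the ccpu/group_cpu test above.
 */
static int completion_cpu(const struct queue_affinity *qa,
			  int cur_cpu, int req_cpu, bool *local)
{
	int ccpu, group_cpu = -1;	/* -1 plays the role of NR_CPUS: never matches */

	if (qa->same_comp && req_cpu != -1) {
		ccpu = req_cpu;
		if (!qa->same_force) {
			ccpu = group_of(ccpu);
			group_cpu = group_of(cur_cpu);
		}
	} else {
		ccpu = cur_cpu;
	}

	*local = (ccpu == cur_cpu || ccpu == group_cpu);
	return ccpu;
}

int main(void)
{
	struct queue_affinity grouped = { .same_comp = true,  .same_force = false };
	struct queue_affinity strict  = { .same_comp = true,  .same_force = true  };
	bool local;

	/* Request submitted on CPU 2, interrupt handled on its sibling CPU 3. */
	completion_cpu(&grouped, 3, 2, &local);
	printf("rq_affinity=1: completes locally? %d\n", local);	/* 1: same group */

	completion_cpu(&strict, 3, 2, &local);
	printf("rq_affinity=2: completes locally? %d\n", local);	/* 0: IPI to CPU 2 */
	return 0;
}

Initialising group_cpu to a value that can never match mirrors the group_cpu = NR_CPUS default introduced above: when QUEUE_FLAG_SAME_FORCE is set, the group short-circuit is simply never taken.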
 
 static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
 {
        bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags);
+       bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags);
 
-       return queue_var_show(set, page);
+       return queue_var_show(set << force, page);
 }
 
 static ssize_t
 
        ret = queue_var_store(&val, page, count);
        spin_lock_irq(q->queue_lock);
-       if (val)
+       if (val) {
                queue_flag_set(QUEUE_FLAG_SAME_COMP, q);
-       else
-               queue_flag_clear(QUEUE_FLAG_SAME_COMP,  q);
+               if (val == 2)
+                       queue_flag_set(QUEUE_FLAG_SAME_FORCE, q);
+               else
+                       queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
+       } else {
+               queue_flag_clear(QUEUE_FLAG_SAME_COMP, q);
+               queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q);
+       }
        spin_unlock_irq(q->queue_lock);
 #endif
        return ret;
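
The show/store pair above folds the two flags back into one sysfs value: any non-zero write sets QUEUE_FLAG_SAME_COMP, a write of '2' also sets QUEUE_FLAG_SAME_FORCE (a write of '1' clears it again), and zero clears both, so the show path can reconstruct the value as set << force. A minimal standalone check of that round trip (illustration only, plain userspace C, not kernel code):

/* Illustration only: 0/1/2 <-> (SAME_COMP, SAME_FORCE) round trip. */
#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long val;

	for (val = 0; val <= 2; val++) {
		/* what the store path above would leave set */
		int set   = (val != 0);	/* QUEUE_FLAG_SAME_COMP  */
		int force = (val == 2);	/* QUEUE_FLAG_SAME_FORCE */

		/* what the show path above would report */
		unsigned long shown = (unsigned long)set << force;

		assert(shown == val);
		printf("stored %lu -> SAME_COMP=%d SAME_FORCE=%d -> shown %lu\n",
		       val, set, force, shown);
	}
	return 0;
}

The shift works because QUEUE_FLAG_SAME_FORCE is only ever set together with QUEUE_FLAG_SAME_COMP, so the three reachable states map onto 0, 1 and 2.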
 
 #define QUEUE_FLAG_ELVSWITCH   6       /* don't use elevator, just do FIFO */
 #define QUEUE_FLAG_BIDI                7       /* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES     8      /* disable merge attempts */
-#define QUEUE_FLAG_SAME_COMP   9       /* force complete on same CPU */
+#define QUEUE_FLAG_SAME_COMP   9       /* complete on same CPU-group */
 #define QUEUE_FLAG_FAIL_IO     10      /* fake timeout */
 #define QUEUE_FLAG_STACKABLE   11      /* supports request stacking */
 #define QUEUE_FLAG_NONROT      12      /* non-rotational device (SSD) */
 #define QUEUE_FLAG_NOXMERGES   15      /* No extended merges */
 #define QUEUE_FLAG_ADD_RANDOM  16      /* Contributes to random pool */
 #define QUEUE_FLAG_SECDISCARD  17      /* supports SECDISCARD */
+#define QUEUE_FLAG_SAME_FORCE  18      /* force complete on same CPU */
 
 #define QUEUE_FLAG_DEFAULT     ((1 << QUEUE_FLAG_IO_STAT) |            \
                                 (1 << QUEUE_FLAG_STACKABLE)    |       \