 module_param(storvsc_vcpus_per_sub_channel, int, S_IRUGO);
 MODULE_PARM_DESC(storvsc_vcpus_per_sub_channel, "Ratio of VCPUs to subchannels");
+
+static int ring_avail_percent_lowater = 10;
+module_param(ring_avail_percent_lowater, int, S_IRUGO);
+MODULE_PARM_DESC(ring_avail_percent_lowater,
+               "Select a channel if available ring size > this in percent");
+
 /*
  * Timeout in seconds for all devices managed by this driver.
  */
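The new ring_avail_percent_lowater parameter is a low-water mark, expressed as the percentage of outbound ring buffer space still free, below which a channel is treated as too busy to pick. The actual test later in the patch uses hv_get_avail_to_write_percent(); the following is only a minimal sketch of the idea, with an illustrative struct rather than the driver's real ring buffer layout:

#include <linux/types.h>

/* Illustration only: free-space percentage of a generic ring buffer. */
struct example_ring {
        u32 data_size;                  /* total bytes in the data area (assumed field) */
        u32 bytes_avail_to_write;       /* bytes currently free to write (assumed field) */
};

static inline u32 example_avail_to_write_percent(const struct example_ring *r)
{
        return r->data_size ? r->bytes_avail_to_write * 100 / r->data_size : 0;
}

/*
 * A channel qualifies only while more than ring_avail_percent_lowater
 * percent of its outbound ring is still writable:
 *
 *      if (example_avail_to_write_percent(ring) > ring_avail_percent_lowater)
 *              ... channel may be selected ...
 */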
         * Mask of CPUs bound to subchannels.
         */
        struct cpumask alloced_cpus;
+       /*
+        * Pre-allocated struct cpumask for each hardware queue.
+        * struct cpumask is used when selecting the outgoing channel. It is
+        * a big structure, up to 1 Kbyte when CONFIG_MAXSMP=y (NR_CPUS=8192).
+        * Pre-allocate it to avoid allocation on the kernel stack.
+        */
+       struct cpumask *cpumask_chns;
        /* Used for vsc/vsp channel reset process */
        struct storvsc_cmd_request init_request;
        struct storvsc_cmd_request reset_request;
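For scale, the size noted in the comment works out as follows: struct cpumask holds one bit per possible CPU, so with CONFIG_MAXSMP (NR_CPUS = 8192) each mask is 8192 / 8 = 1024 bytes, too large to keep on the I/O path's kernel stack. A small standalone check of that arithmetic, not part of the patch:

#include <linux/cpumask.h>
#include <linux/build_bug.h>

/*
 * struct cpumask is a bitmap of NR_CPUS bits rounded up to unsigned longs;
 * with NR_CPUS = 8192 that is 1024 bytes per mask, hence one pre-allocated
 * mask per hardware queue instead of stack variables.
 */
static inline void cpumask_footprint_check(void)
{
        BUILD_BUG_ON(sizeof(struct cpumask) !=
                     BITS_TO_LONGS(NR_CPUS) * sizeof(unsigned long));
}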
        if (stor_device->stor_chns == NULL)
                return -ENOMEM;
 
+       stor_device->cpumask_chns = kcalloc(num_possible_cpus(),
+                       sizeof(struct cpumask), GFP_KERNEL);
+       if (stor_device->cpumask_chns == NULL) {
+               kfree(stor_device->stor_chns);
+               return -ENOMEM;
+       }
+
        stor_device->stor_chns[device->channel->target_cpu] = device->channel;
        cpumask_set_cpu(device->channel->target_cpu,
                        &stor_device->alloced_cpus);
        vmbus_close(device->channel);
 
        kfree(stor_device->stor_chns);
+       kfree(stor_device->cpumask_chns);
        kfree(stor_device);
        return 0;
 }
 {
        u16 slot = 0;
        u16 hash_qnum;
-       struct cpumask alloced_mask;
+       struct cpumask *alloced_mask = &stor_device->cpumask_chns[q_num];
        int num_channels, tgt_cpu;
 
        if (stor_device->num_sc == 0)
         * III. Mapping is persistent.
         */
 
-       cpumask_and(&alloced_mask, &stor_device->alloced_cpus,
+       cpumask_and(alloced_mask, &stor_device->alloced_cpus,
                    cpumask_of_node(cpu_to_node(q_num)));
 
-       num_channels = cpumask_weight(&alloced_mask);
+       num_channels = cpumask_weight(alloced_mask);
        if (num_channels == 0)
                return stor_device->device->channel;
 
        while (hash_qnum >= num_channels)
                hash_qnum -= num_channels;
 
-       for_each_cpu(tgt_cpu, &alloced_mask) {
+       for_each_cpu(tgt_cpu, alloced_mask) {
                if (slot == hash_qnum)
                        break;
                slot++;
 {
        struct storvsc_device *stor_device;
        struct vstor_packet *vstor_packet;
-       struct vmbus_channel *outgoing_channel;
+       struct vmbus_channel *outgoing_channel, *channel;
        int ret = 0;
-       struct cpumask alloced_mask;
+       struct cpumask *alloced_mask;
        int tgt_cpu;
 
        vstor_packet = &request->vstor_packet;
        /*
          * Select an appropriate channel to send the request out.
         */
-
        if (stor_device->stor_chns[q_num] != NULL) {
                outgoing_channel = stor_device->stor_chns[q_num];
-               if (outgoing_channel->target_cpu == smp_processor_id()) {
+               if (outgoing_channel->target_cpu == q_num) {
                        /*
                         * Ideally, we want to pick a different channel if
                         * available on the same NUMA node.
                         */
-                       cpumask_and(&alloced_mask, &stor_device->alloced_cpus,
+                       alloced_mask = &stor_device->cpumask_chns[q_num];
+                       cpumask_and(alloced_mask, &stor_device->alloced_cpus,
                                    cpumask_of_node(cpu_to_node(q_num)));
-                       for_each_cpu_wrap(tgt_cpu, &alloced_mask,
-                                       outgoing_channel->target_cpu + 1) {
-                               if (tgt_cpu != outgoing_channel->target_cpu) {
-                                       outgoing_channel =
-                                       stor_device->stor_chns[tgt_cpu];
-                                       break;
+
+                       for_each_cpu_wrap(tgt_cpu, alloced_mask, q_num + 1) {
+                               if (tgt_cpu == q_num)
+                                       continue;
+                               channel = stor_device->stor_chns[tgt_cpu];
+                               if (hv_get_avail_to_write_percent(
+                                                       &channel->outbound)
+                                               > ring_avail_percent_lowater) {
+                                       outgoing_channel = channel;
+                                       goto found_channel;
+                               }
+                       }
+
+                       /*
+                        * All the other channels on the same NUMA node are
+                        * busy. Try to use the channel on the current CPU
+                        */
+                       if (hv_get_avail_to_write_percent(
+                                               &outgoing_channel->outbound)
+                                       > ring_avail_percent_lowater)
+                               goto found_channel;
+
+                       /*
+                        * If we reach here, all the channels on the current
+                        * NUMA node are busy. Try to find a channel in
+                        * other NUMA nodes
+                        */
+                       cpumask_andnot(alloced_mask, &stor_device->alloced_cpus,
+                                       cpumask_of_node(cpu_to_node(q_num)));
+
+                       for_each_cpu(tgt_cpu, alloced_mask) {
+                               channel = stor_device->stor_chns[tgt_cpu];
+                               if (hv_get_avail_to_write_percent(
+                                                       &channel->outbound)
+                                               > ring_avail_percent_lowater) {
+                                       outgoing_channel = channel;
+                                       goto found_channel;
                                }
                        }
                }
        } else {
                outgoing_channel = get_og_chn(stor_device, q_num);
        }
 
-
+found_channel:
        vstor_packet->flags |= REQUEST_COMPLETION_FLAG;
 
        vstor_packet->vm_srb.length = (sizeof(struct vmscsi_request) -
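The same-node scan above relies on for_each_cpu_wrap(), which walks the set bits of a mask starting just past a given CPU and wraps back to the lowest bit, so successive look-ups probe different channels instead of always the first one on the node. A minimal standalone sketch of that round-robin pattern; the is_usable() predicate is hypothetical and stands in for the ring-space test:

#include <linux/cpumask.h>

/*
 * Return the first CPU in @mask, scanning from @start + 1 with wrap-around,
 * that passes @is_usable(); return nr_cpu_ids if none does. @start itself
 * is skipped so that a different CPU is preferred, mirroring the logic in
 * storvsc_do_io() above.
 */
static int pick_cpu_round_robin(const struct cpumask *mask, int start,
                                bool (*is_usable)(int cpu))
{
        int cpu;

        for_each_cpu_wrap(cpu, mask, start + 1) {
                if (cpu == start)
                        continue;
                if (is_usable(cpu))
                        return cpu;
        }
        return nr_cpu_ids;
}

Starting at q_num + 1 rather than q_num means the queue's own channel is only reused once every sibling on the node has dropped below the low-water mark, which is exactly what the fallback blocks in the hunk above implement.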
                max_sub_channels = (num_cpus / storvsc_vcpus_per_sub_channel);
        }
 
-       scsi_driver.can_queue = (max_outstanding_req_per_channel *
-                                (max_sub_channels + 1));
+       scsi_driver.can_queue = max_outstanding_req_per_channel *
+                               (max_sub_channels + 1) *
+                               (100 - ring_avail_percent_lowater) / 100;
 
        host = scsi_host_alloc(&scsi_driver,
                               sizeof(struct hv_host_device));
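As a worked example of the new can_queue formula, with made-up but plausible figures (none of these constants come from the patch): 16 possible CPUs and storvsc_vcpus_per_sub_channel = 4 give max_sub_channels = 4, and with max_outstanding_req_per_channel = 768 and the default ring_avail_percent_lowater = 10:

        /* Illustrative numbers only, not values taken from the patch. */
        can_queue = 768 * (4 + 1) * (100 - 10) / 100;   /* = 3456, was 3840 */

so the advertised queue depth shrinks by the same 10% of ring space the selection logic now refuses to fill.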
 
 err_out1:
        kfree(stor_device->stor_chns);
+       kfree(stor_device->cpumask_chns);
        kfree(stor_device);
 
 err_out0: