                } else if (!cluster_list_empty(&si->discard_clusters)) {
                        /*
                         * we don't have free cluster but have some clusters in
-                        * discarding, do discard now and reclaim them
+                        * discarding, do discard now and reclaim them, then
+                        * reread cluster_next_cpu since we dropped si->lock
                         */
                        swap_do_scheduled_discard(si);
-                       *scan_base = *offset = si->cluster_next;
+                       *scan_base = this_cpu_read(*si->cluster_next_cpu);
+                       *offset = *scan_base;
                        goto new_cluster;
                } else
                        return false;
        }
 }
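
The re-read above is needed because swap_do_scheduled_discard() runs with si->lock dropped, so the per-CPU next position can be moved by other CPUs in the meantime. Below is a minimal userspace sketch of that drop-lock/re-read pattern; the toy_allocator, toy_slow_reclaim() and toy_alloc_slot() names are made up for illustration and are not part of the patch.

/* Illustrative sketch only -- not part of the patch. */
#include <pthread.h>

struct toy_allocator {
        pthread_mutex_t lock;
        unsigned long   next;   /* next slot to hand out, protected by lock */
};

/* Called with alloc->lock held; drops and re-takes it, like a discard. */
static void toy_slow_reclaim(struct toy_allocator *alloc)
{
        pthread_mutex_unlock(&alloc->lock);
        /* ...slow work; other threads may advance alloc->next here... */
        pthread_mutex_lock(&alloc->lock);
}

static unsigned long toy_alloc_slot(struct toy_allocator *alloc)
{
        unsigned long slot;

        pthread_mutex_lock(&alloc->lock);
        slot = alloc->next;             /* snapshot, valid only while locked */
        toy_slow_reclaim(alloc);        /* lock was dropped in between... */
        slot = alloc->next;             /* ...so the snapshot must be re-read */
        alloc->next = slot + 1;
        pthread_mutex_unlock(&alloc->lock);
        return slot;
}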
 
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+{
+       unsigned long prev;
+
+       if (!(si->flags & SWP_SOLIDSTATE)) {
+               si->cluster_next = next;
+               return;
+       }
+
+       prev = this_cpu_read(*si->cluster_next_cpu);
+       /*
+        * If the next offset falls in a different swap address space
+        * sized, aligned trunk than the previous one, pick another
+        * trunk at random to avoid lock contention on the swap
+        * address space if possible.
+        */
+       if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
+           (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
+               /* No free swap slots available */
+               if (si->highest_bit <= si->lowest_bit)
+                       return;
+               next = si->lowest_bit +
+                       prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+               next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
+               next = max_t(unsigned int, next, si->lowest_bit);
+       }
+       this_cpu_write(*si->cluster_next_cpu, next);
+}
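
As a sanity check of the arithmetic in set_cluster_next(), here is a standalone userspace sketch. The names and values are stand-ins: TRUNK_SHIFT plays the role of SWAP_ADDRESS_SPACE_SHIFT (14 in the kernel, i.e. 16384 slots per trunk), rand() replaces prandom_u32_max(), and the lowest/highest bounds passed in main() are invented. Not part of the patch.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>
#include <stdlib.h>

#define TRUNK_SHIFT     14                      /* stands in for SWAP_ADDRESS_SPACE_SHIFT */
#define TRUNK_PAGES     (1UL << TRUNK_SHIFT)    /* stands in for SWAP_ADDRESS_SPACE_PAGES */
#define ALIGN_DOWN(x, a)        ((x) & ~((a) - 1))

/* Mimics the trunk hop in set_cluster_next() with made-up bounds. */
static unsigned long pick_next(unsigned long prev, unsigned long next,
                               unsigned long lowest, unsigned long highest)
{
        if ((prev >> TRUNK_SHIFT) == (next >> TRUNK_SHIFT))
                return next;                    /* same trunk: keep advancing linearly */
        if (highest <= lowest)
                return prev;                    /* no free slots: leave the base alone */
        next = lowest + rand() % (highest - lowest + 1);
        next = ALIGN_DOWN(next, TRUNK_PAGES);   /* snap to the start of a trunk */
        if (next < lowest)                      /* the first trunk may start below lowest */
                next = lowest;
        return next;
}

int main(void)
{
        /* prev in trunk 0, next would step into trunk 1: a random hop is taken */
        printf("%lu\n", pick_next(16383, 16384, 1, 1UL << 20));
        /* prev and next in the same trunk: the linear advance is kept */
        printf("%lu\n", pick_next(100, 101, 1, 1UL << 20));
        return 0;
}

The two calls in main() show the intent: inside a trunk the per-CPU base advances linearly, and only a boundary crossing triggers a hop to a random, trunk-aligned start clamped to the usable range.
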
+
 static int scan_swap_map_slots(struct swap_info_struct *si,
                               unsigned char usage, int nr,
                               swp_entry_t slots[])
         */
 
        si->flags += SWP_SCANNING;
-       scan_base = offset = si->cluster_next;
+       /*
+        * Use percpu scan base for SSD to reduce lock contention on
+        * cluster and swap cache.  For HDD, sequential access is more
+        * important.
+        */
+       if (si->flags & SWP_SOLIDSTATE)
+               scan_base = this_cpu_read(*si->cluster_next_cpu);
+       else
+               scan_base = si->cluster_next;
+       offset = scan_base;
 
        /* SSD algorithm */
        if (si->cluster_info) {
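
The new comment above the scan base setup is the core of the change: each SWAP_ADDRESS_SPACE_PAGES sized chunk of a swap device is backed by its own swap cache address_space (and thus its own lock), so CPUs whose scan bases sit in different chunks rarely contend with each other. A throwaway sketch of that offset-to-chunk mapping follows; trunk_index() is a hypothetical helper and TRUNK_SHIFT again stands in for SWAP_ADDRESS_SPACE_SHIFT. Not part of the patch.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

#define TRUNK_SHIFT 14  /* stands in for SWAP_ADDRESS_SPACE_SHIFT */

/* Which per-trunk swap cache address space a swap offset falls into. */
static unsigned long trunk_index(unsigned long offset)
{
        return offset >> TRUNK_SHIFT;
}

int main(void)
{
        /* Scan bases in different trunks map to different address spaces,
         * so the two CPUs below do not touch the same swap cache lock. */
        printf("cpu0 base  1000 -> trunk %lu\n", trunk_index(1000));
        printf("cpu1 base 40000 -> trunk %lu\n", trunk_index(40000));
        return 0;
}
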
        unlock_cluster(ci);
 
        swap_range_alloc(si, offset, 1);
-       si->cluster_next = offset + 1;
        slots[n_ret++] = swp_entry(si->type, offset);
 
        /* got enough slots or reach max slots? */
        }
 
 done:
+       set_cluster_next(si, offset + 1);
        si->flags -= SWP_SCANNING;
        return n_ret;
 
        mutex_unlock(&swapon_mutex);
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
+       free_percpu(p->cluster_next_cpu);
+       p->cluster_next_cpu = NULL;
        vfree(swap_map);
        kvfree(cluster_info);
        kvfree(frontswap_map);
                unsigned long ci, nr_cluster;
 
                p->flags |= SWP_SOLIDSTATE;
+               p->cluster_next_cpu = alloc_percpu(unsigned int);
+               if (!p->cluster_next_cpu) {
+                       error = -ENOMEM;
+                       goto bad_swap_unlock_inode;
+               }
                /*
                 * select a random position to start with to help wear leveling
                 * SSD
                 */
-               p->cluster_next = 1 + prandom_u32_max(p->highest_bit);
+               for_each_possible_cpu(cpu) {
+                       per_cpu(*p->cluster_next_cpu, cpu) =
+                               1 + prandom_u32_max(p->highest_bit);
+               }
                nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 
                cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
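
On the per-CPU initialisation above: prandom_u32_max(n) returns a value in [0, n), so every CPU's starting slot lands in [1, p->highest_bit], skipping slot 0, which holds the swap header. A userspace stand-in follows, with rand() in place of prandom_u32_max() and a made-up four-CPU loop in place of for_each_possible_cpu(). Not part of the patch.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        unsigned int highest_bit = 1 << 20;     /* made-up last usable swap slot */
        unsigned int next[4];                   /* one scan base per made-up CPU */

        /* Like the for_each_possible_cpu() loop above: every CPU gets its
         * own random start in [1, highest_bit], spreading writes (and
         * wear) across the whole device. */
        for (int cpu = 0; cpu < 4; cpu++) {
                next[cpu] = 1 + rand() % highest_bit;
                printf("cpu%d starts scanning at slot %u\n", cpu, next[cpu]);
        }
        return 0;
}
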
 bad_swap:
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
+       free_percpu(p->cluster_next_cpu);
+       p->cluster_next_cpu = NULL;
        if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
                set_blocksize(p->bdev, p->old_block_size);
                blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);