static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
                                 unsigned char);
 static void free_swap_count_continuations(struct swap_info_struct *);
+static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
+                            unsigned int nr_entries);
 
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
        return ci - si->cluster_info;
 }
 
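+/* Return the swap device offset of the first slot in this cluster. */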
+static inline unsigned int cluster_offset(struct swap_info_struct *si,
+                                         struct swap_cluster_info *ci)
+{
+       return cluster_index(si, ci) * SWAPFILE_CLUSTER;
+}
+
 static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
                                                     unsigned long offset)
 {
 
 static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
 {
+       lockdep_assert_held(&si->lock);
+       lockdep_assert_held(&ci->lock);
+
        if (ci->flags & CLUSTER_FLAG_NONFULL)
                list_move_tail(&ci->list, &si->free_clusters);
        else
                list_add_tail(&ci->list, &si->free_clusters);
        ci->flags = CLUSTER_FLAG_FREE;
+       ci->order = 0;
 }
 
 /*
        struct swap_cluster_info *ci = list_first_entry(&si->free_clusters,
                                                        struct swap_cluster_info, list);
 
+       lockdep_assert_held(&si->lock);
+       lockdep_assert_held(&ci->lock);
        VM_BUG_ON(cluster_index(si, ci) != idx);
+       VM_BUG_ON(ci->count);
        list_del(&ci->list);
-       ci->count = 0;
        ci->flags = 0;
        return ci;
 }
 static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
 {
        VM_BUG_ON(ci->count != 0);
+       lockdep_assert_held(&si->lock);
+       lockdep_assert_held(&ci->lock);
        /*
         * If the swap is discardable, prepare to discard the cluster
         * instead of freeing it immediately. The cluster will be freed
                return;
 
        VM_BUG_ON(ci->count == 0);
+       VM_BUG_ON(cluster_is_free(ci));
+       lockdep_assert_held(&p->lock);
+       lockdep_assert_held(&ci->lock);
        ci->count--;
 
        if (!ci->count) {
        }
 
        if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
+               VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
                list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]);
-               ci->flags |= CLUSTER_FLAG_NONFULL;
+               ci->flags = CLUSTER_FLAG_NONFULL;
        }
 }
 
-/*
- * It's possible scan_swap_map_slots() uses a free cluster in the middle of free
- * cluster list. Avoiding such abuse to avoid list corruption.
- */
-static bool
-scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
-       unsigned long offset, int order)
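+/* Check that every swap_map slot in [start, start + nr_pages) is unused. */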
+static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start,
+                                     unsigned int nr_pages)
 {
-       struct percpu_cluster *percpu_cluster;
-       bool conflict;
-       struct swap_cluster_info *first = list_first_entry(&si->free_clusters,
-                                                          struct swap_cluster_info, list);
-
-       offset /= SWAPFILE_CLUSTER;
-       conflict = !list_empty(&si->free_clusters) &&
-               offset !=  cluster_index(si, first) &&
-               cluster_is_free(&si->cluster_info[offset]);
+       unsigned char *p = si->swap_map + start;
+       unsigned char *end = p + nr_pages;
 
-       if (!conflict)
-               return false;
+       while (p < end)
+               if (*p++)
+                       return false;
 
-       percpu_cluster = this_cpu_ptr(si->percpu_cluster);
-       percpu_cluster->next[order] = SWAP_NEXT_INVALID;
        return true;
 }
 
-static inline bool swap_range_empty(char *swap_map, unsigned int start,
-                                   unsigned int nr_pages)
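+/*
+ * Mark [start, start + (1 << order)) allocated with the given usage and
+ * update the cluster's count and free/nonfull/full list membership.
+ */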
+static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
+                                               unsigned int start, unsigned char usage,
+                                               unsigned int order)
 {
-       unsigned int i;
+       unsigned int nr_pages = 1 << order;
 
-       for (i = 0; i < nr_pages; i++) {
-               if (swap_map[start + i])
-                       return false;
+       if (cluster_is_free(ci)) {
+               if (nr_pages < SWAPFILE_CLUSTER) {
+                       list_move_tail(&ci->list, &si->nonfull_clusters[order]);
+                       ci->flags = CLUSTER_FLAG_NONFULL;
+               }
+               ci->order = order;
        }
 
-       return true;
+       memset(si->swap_map + start, usage, nr_pages);
+       swap_range_alloc(si, start, nr_pages);
+       ci->count += nr_pages;
+
+       if (ci->count == SWAPFILE_CLUSTER) {
+               VM_BUG_ON(!(ci->flags & (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL)));
+               list_del(&ci->list);
+               ci->flags = 0;
+       }
+}
+
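+/*
+ * Scan one cluster for a naturally aligned range of 1 << order free slots.
+ * On success, *foundp is set to the allocated offset.  The return value is
+ * the next offset worth scanning in this cluster, or SWAP_NEXT_INVALID if
+ * the cluster has no further room for this order.
+ */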
+static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset,
+                                           unsigned int *foundp, unsigned int order,
+                                           unsigned char usage)
+{
+       unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1);
+       unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
+       unsigned int nr_pages = 1 << order;
+       struct swap_cluster_info *ci;
+
+       if (end < nr_pages)
+               return SWAP_NEXT_INVALID;
+       end -= nr_pages;
+
+       ci = lock_cluster(si, offset);
+       if (ci->count + nr_pages > SWAPFILE_CLUSTER) {
+               offset = SWAP_NEXT_INVALID;
+               goto done;
+       }
+
+       while (offset <= end) {
+               if (cluster_scan_range(si, offset, nr_pages)) {
+                       cluster_alloc_range(si, ci, offset, usage, order);
+                       *foundp = offset;
+                       if (ci->count == SWAPFILE_CLUSTER) {
+                               offset = SWAP_NEXT_INVALID;
+                               goto done;
+                       }
+                       offset += nr_pages;
+                       break;
+               }
+               offset += nr_pages;
+       }
+       if (offset > end)
+               offset = SWAP_NEXT_INVALID;
+done:
+       unlock_cluster(ci);
+       return offset;
 }
 
 /*
  * pool (a cluster). This might involve allocating a new cluster for current CPU
  * too.
  */
-static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
-       unsigned long *offset, unsigned long *scan_base, int order)
+static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
+                                             unsigned char usage)
 {
-       unsigned int nr_pages = 1 << order;
        struct percpu_cluster *cluster;
-       struct swap_cluster_info *ci;
-       unsigned int tmp, max;
+       struct swap_cluster_info *ci, *n;
+       unsigned int offset, found = 0;
 
 new_cluster:
+       lockdep_assert_held(&si->lock);
        cluster = this_cpu_ptr(si->percpu_cluster);
-       tmp = cluster->next[order];
-       if (tmp == SWAP_NEXT_INVALID) {
-               if (!list_empty(&si->free_clusters)) {
-                       ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
-                       list_del(&ci->list);
-                       spin_lock(&ci->lock);
-                       ci->order = order;
-                       ci->flags = 0;
-                       spin_unlock(&ci->lock);
-                       tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER;
-               } else if (!list_empty(&si->nonfull_clusters[order])) {
-                       ci = list_first_entry(&si->nonfull_clusters[order],
-                                             struct swap_cluster_info, list);
-                       list_del(&ci->list);
-                       spin_lock(&ci->lock);
-                       ci->flags = 0;
-                       spin_unlock(&ci->lock);
-                       tmp = cluster_index(si, ci) * SWAPFILE_CLUSTER;
-               } else if (!list_empty(&si->discard_clusters)) {
-                       /*
-                        * we don't have free cluster but have some clusters in
-                        * discarding, do discard now and reclaim them, then
-                        * reread cluster_next_cpu since we dropped si->lock
-                        */
-                       swap_do_scheduled_discard(si);
-                       *scan_base = this_cpu_read(*si->cluster_next_cpu);
-                       *offset = *scan_base;
-                       goto new_cluster;
-               } else
-                       return false;
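+       /* First resume from the cluster this CPU last allocated from. */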
+       offset = cluster->next[order];
+       if (offset) {
+               offset = alloc_swap_scan_cluster(si, offset, &found, order, usage);
+               if (found)
+                       goto done;
        }
 
-       /*
-        * Other CPUs can use our cluster if they can't find a free cluster,
-        * check if there is still free entry in the cluster, maintaining
-        * natural alignment.
-        */
-       max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER));
-       if (tmp < max) {
-               ci = lock_cluster(si, tmp);
-               while (tmp < max) {
-                       if (swap_range_empty(si->swap_map, tmp, nr_pages))
-                               break;
-                       tmp += nr_pages;
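+       /* No usable per-CPU cluster: try a completely free cluster next. */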
+       if (!list_empty(&si->free_clusters)) {
+               ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
+               offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
+               VM_BUG_ON(!found);
+               goto done;
+       }
+
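+       /*
+        * A PMD-sized request needs a whole free cluster, so only smaller
+        * orders can be served from the nonfull lists.
+        */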
+       if (order < PMD_ORDER) {
+               list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) {
+                       offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
+                                                        &found, order, usage);
+                       if (found)
+                               goto done;
                }
-               unlock_cluster(ci);
        }
-       if (tmp >= max) {
-               cluster->next[order] = SWAP_NEXT_INVALID;
+
+       if (!list_empty(&si->discard_clusters)) {
+               /*
+                * No free cluster, but some clusters are pending discard.
+                * Run the discard now to reclaim them, then restart the
+                * allocation since si->lock was dropped meanwhile.
+                */
+               swap_do_scheduled_discard(si);
                goto new_cluster;
        }
-       *offset = tmp;
-       *scan_base = tmp;
-       tmp += nr_pages;
-       cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID;
-       return true;
+
+       if (order)
+               goto done;
+
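+       /* Order-0 requests may steal slots from higher-order nonfull clusters. */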
+       for (int o = 1; o < SWAP_NR_ORDERS; o++) {
+               if (!list_empty(&si->nonfull_clusters[o])) {
+                       ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info,
+                                             list);
+                       offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
+                                                        &found, 0, usage);
+                       VM_BUG_ON(!found);
+                       goto done;
+               }
+       }
+
+done:
+       cluster->next[order] = offset;
+       return found;
 }
 
 static void __del_from_avail_list(struct swap_info_struct *p)
        return false;
 }
 
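+/*
+ * Allocate up to nr swap entries of the requested order into slots[],
+ * returning how many were allocated.
+ */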
+static int cluster_alloc_swap(struct swap_info_struct *si,
+                            unsigned char usage, int nr,
+                            swp_entry_t slots[], int order)
+{
+       int n_ret = 0;
+
+       VM_BUG_ON(!si->cluster_info);
+
+       while (n_ret < nr) {
+               unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
+
+               if (!offset)
+                       break;
+               slots[n_ret++] = swp_entry(si->type, offset);
+       }
+
+       return n_ret;
+}
+
 static int scan_swap_map_slots(struct swap_info_struct *si,
                               unsigned char usage, int nr,
                               swp_entry_t slots[], int order)
 {
-       struct swap_cluster_info *ci;
        unsigned long offset;
        unsigned long scan_base;
        unsigned long last_in_cluster = 0;
                        return 0;
        }
 
+       if (si->cluster_info)
+               return cluster_alloc_swap(si, usage, nr, slots, order);
+
        si->flags += SWP_SCANNING;
-       /*
-        * Use percpu scan base for SSD to reduce lock contention on
-        * cluster and swap cache.  For HDD, sequential access is more
-        * important.
-        */
-       if (si->flags & SWP_SOLIDSTATE)
-               scan_base = this_cpu_read(*si->cluster_next_cpu);
-       else
-               scan_base = si->cluster_next;
+
+       /* For HDD, sequential access is more important. */
+       scan_base = si->cluster_next;
        offset = scan_base;
 
-       /* SSD algorithm */
-       if (si->cluster_info) {
-               if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) {
-                       if (order > 0)
-                               goto no_page;
-                       goto scan;
-               }
-       } else if (unlikely(!si->cluster_nr--)) {
+       if (unlikely(!si->cluster_nr--)) {
                if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
                        si->cluster_nr = SWAPFILE_CLUSTER - 1;
                        goto checks;
                /*
                 * If seek is expensive, start searching for new cluster from
                 * start of partition, to minimize the span of allocated swap.
-                * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
-                * case, just handled by scan_swap_map_try_ssd_cluster() above.
                 */
                scan_base = offset = si->lowest_bit;
                last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
        }
 
 checks:
-       if (si->cluster_info) {
-               while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) {
-               /* take a break if we already got some slots */
-                       if (n_ret)
-                               goto done;
-                       if (!scan_swap_map_try_ssd_cluster(si, &offset,
-                                                       &scan_base, order)) {
-                               if (order > 0)
-                                       goto no_page;
-                               goto scan;
-                       }
-               }
-       }
        if (!(si->flags & SWP_WRITEOK))
                goto no_page;
        if (!si->highest_bit)
        if (offset > si->highest_bit)
                scan_base = offset = si->lowest_bit;
 
-       ci = lock_cluster(si, offset);
        /* reuse swap entry of cache-only swap if not busy. */
        if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
                int swap_was_freed;
-               unlock_cluster(ci);
                spin_unlock(&si->lock);
                swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
                spin_lock(&si->lock);
        }
 
        if (si->swap_map[offset]) {
-               unlock_cluster(ci);
                if (!n_ret)
                        goto scan;
                else
                        goto done;
        }
        memset(si->swap_map + offset, usage, nr_pages);
-       add_cluster_info_page(si, si->cluster_info, offset, nr_pages);
-       unlock_cluster(ci);
 
        swap_range_alloc(si, offset, nr_pages);
        slots[n_ret++] = swp_entry(si->type, offset);
                latency_ration = LATENCY_LIMIT;
        }
 
-       /* try to get more slots in cluster */
-       if (si->cluster_info) {
-               if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order))
-                       goto checks;
-               if (order > 0)
-                       goto done;
-       } else if (si->cluster_nr && !si->swap_map[++offset]) {
+       if (si->cluster_nr && !si->swap_map[++offset]) {
                /* non-ssd case, still more slots in cluster? */
                --si->cluster_nr;
                goto checks;
        ci = lock_cluster(si, offset);
        memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
        ci->count = 0;
-       ci->order = 0;
-       ci->flags = 0;
        free_cluster(si, ci);
        unlock_cluster(ci);
        swap_range_free(si, offset, SWAPFILE_CLUSTER);
                        ci = cluster_info + idx;
                        if (idx >= nr_clusters)
                                continue;
-                       if (ci->count)
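+                       /* Cluster already has entries: track it as nonfull (order 0). */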
+                       if (ci->count) {
+                               ci->flags = CLUSTER_FLAG_NONFULL;
+                               list_add_tail(&ci->list, &p->nonfull_clusters[0]);
                                continue;
+                       }
                        ci->flags = CLUSTER_FLAG_FREE;
                        list_add_tail(&ci->list, &p->free_clusters);
                }