mm: swap: reclaim the cached parts that got scanned
author     Kairui Song <kasong@tencent.com>
           Wed, 31 Jul 2024 06:49:20 +0000 (23:49 -0700)
committer  Andrew Morton <akpm@linux-foundation.org>
           Wed, 4 Sep 2024 04:15:26 +0000 (21:15 -0700)
This commit implements reclaim during scan for the cluster allocator.

Cluster scanning was unable to reuse SWAP_HAS_CACHE slots, which could
result in a low allocation success rate or early OOM.

So, to ensure the maximum allocation success rate, integrate reclaim with
scanning: if a suitable range of swap slots is found but it is fragmented
due to HAS_CACHE, try to reclaim those slots.
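
As a rough illustration of the control flow, below is a minimal
user-space sketch of the two-pass scan-then-reclaim idea.  It is not the
kernel implementation: the slot states and the reclaim_slot()/scan_range()
helpers are hypothetical stand-ins for the swap_map entries,
__try_to_reclaim_swap(), and the new cluster_scan_range() /
cluster_reclaim_range().  The real code also only attempts reclaim when
vm_swap_full(), and has to drop si->lock/ci->lock around the reclaim,
which is why it rechecks the whole range afterwards.

#include <stdbool.h>
#include <stdio.h>

enum slot_state { FREE = 0, CACHE_ONLY, IN_USE };

/* Hypothetical stand-in for __try_to_reclaim_swap(): drop a cache-only slot. */
static bool reclaim_slot(enum slot_state *map, unsigned long off)
{
	if (map[off] != CACHE_ONLY)
		return false;
	map[off] = FREE;
	return true;
}

/*
 * Pass 1: free slots are fine, in-use slots fail immediately, cache-only
 * slots only mark the range as needing reclaim.  Pass 2: reclaim the
 * cached slots and recheck the whole range before declaring it usable.
 */
static bool scan_range(enum slot_state *map, unsigned long start,
		       unsigned long end)
{
	bool need_reclaim = false;
	unsigned long off;

	for (off = start; off < end; off++) {
		if (map[off] == IN_USE)
			return false;
		if (map[off] == CACHE_ONLY)
			need_reclaim = true;
	}

	if (!need_reclaim)
		return true;

	for (off = start; off < end; off++)
		if (map[off] == CACHE_ONLY && !reclaim_slot(map, off))
			return false;

	for (off = start; off < end; off++)
		if (map[off] != FREE)
			return false;

	return true;
}

int main(void)
{
	enum slot_state map[8] = { FREE, CACHE_ONLY, FREE, CACHE_ONLY,
				   FREE, FREE, FREE, FREE };

	/* The cache-only slots at offsets 1 and 3 get reclaimed, so this succeeds. */
	printf("range [0,4) usable: %d\n", scan_range(map, 0, 4));
	return 0;
}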

Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-8-cb9c148b9297@kernel.org
Signed-off-by: Kairui Song <kasong@tencent.com>
Reported-by: Barry Song <21cnbao@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/swap.h
mm/swapfile.c

index c526cd3e7b31869242f08e6cc2f07026834aacab..e372122e4f4599c53ec1c6a936e58f92a3a3692f 100644 (file)
@@ -301,6 +301,7 @@ struct swap_info_struct {
                                        /* list of clusters that contain at least one free slot */
        struct list_head frag_clusters[SWAP_NR_ORDERS];
                                        /* list of clusters that are fragmented or contended */
+       unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
        unsigned int lowest_bit;        /* index of first free in swap_map */
        unsigned int highest_bit;       /* index of last free in swap_map */
        unsigned int pages;             /* total of usable pages of swap */
index b0b726a9bf0a152ca26cf2d0ab1da56b9140a3ad..ed5f85504f37a22c583c3e0e475c87ef2ffb1d93 100644 (file)
@@ -513,6 +513,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
        VM_BUG_ON(ci->count != 0);
        lockdep_assert_held(&si->lock);
        lockdep_assert_held(&ci->lock);
+
+       if (ci->flags & CLUSTER_FLAG_FRAG)
+               si->frag_cluster_nr[ci->order]--;
+
        /*
         * If the swap is discardable, prepare discard the cluster
         * instead of free it immediately. The cluster will be freed
@@ -572,31 +576,84 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
 
        if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
                VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
-               if (ci->flags & CLUSTER_FLAG_FRAG)
+               if (ci->flags & CLUSTER_FLAG_FRAG) {
+                       p->frag_cluster_nr[ci->order]--;
                        list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]);
-               else
+               } else {
                        list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]);
+               }
                ci->flags = CLUSTER_FLAG_NONFULL;
        }
 }
 
-static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start,
-                                     unsigned int nr_pages)
+static bool cluster_reclaim_range(struct swap_info_struct *si,
+                                 struct swap_cluster_info *ci,
+                                 unsigned long start, unsigned long end)
 {
-       unsigned char *p = si->swap_map + start;
-       unsigned char *end = p + nr_pages;
+       unsigned char *map = si->swap_map;
+       unsigned long offset;
+
+       spin_unlock(&ci->lock);
+       spin_unlock(&si->lock);
+
+       for (offset = start; offset < end; offset++) {
+               switch (READ_ONCE(map[offset])) {
+               case 0:
+                       continue;
+               case SWAP_HAS_CACHE:
+                       if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0)
+                               continue;
+                       goto out;
+               default:
+                       goto out;
+               }
+       }
+out:
+       spin_lock(&si->lock);
+       spin_lock(&ci->lock);
 
-       while (p < end)
-               if (*p++)
+       /*
+        * Recheck the range whether or not reclaim succeeded; the slot
+        * could have been freed while we were not holding the lock.
+        */
+       for (offset = start; offset < end; offset++)
+               if (READ_ONCE(map[offset]))
                        return false;
 
        return true;
 }
 
+static bool cluster_scan_range(struct swap_info_struct *si,
+                              struct swap_cluster_info *ci,
+                              unsigned long start, unsigned int nr_pages)
+{
+       unsigned long offset, end = start + nr_pages;
+       unsigned char *map = si->swap_map;
+       bool need_reclaim = false;
 
-static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
-                                               unsigned int start, unsigned char usage,
-                                               unsigned int order)
+       for (offset = start; offset < end; offset++) {
+               switch (READ_ONCE(map[offset])) {
+               case 0:
+                       continue;
+               case SWAP_HAS_CACHE:
+                       if (!vm_swap_full())
+                               return false;
+                       need_reclaim = true;
+                       continue;
+               default:
+                       return false;
+               }
+       }
+
+       if (need_reclaim)
+               return cluster_reclaim_range(si, ci, start, end);
+
+       return true;
+}
+
+static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
+                               unsigned int start, unsigned char usage,
+                               unsigned int order)
 {
        unsigned int nr_pages = 1 << order;
 
@@ -615,6 +672,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_
        if (ci->count == SWAPFILE_CLUSTER) {
                VM_BUG_ON(!(ci->flags &
                          (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG)));
+               if (ci->flags & CLUSTER_FLAG_FRAG)
+                       si->frag_cluster_nr[ci->order]--;
                list_del(&ci->list);
                ci->flags = 0;
        }
@@ -640,7 +699,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne
        }
 
        while (offset <= end) {
-               if (cluster_scan_range(si, offset, nr_pages)) {
+               if (cluster_scan_range(si, ci, offset, nr_pages)) {
                        cluster_alloc_range(si, ci, offset, usage, order);
                        *foundp = offset;
                        if (ci->count == SWAPFILE_CLUSTER) {
@@ -668,9 +727,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
                                              unsigned char usage)
 {
        struct percpu_cluster *cluster;
-       struct swap_cluster_info *ci, *n;
+       struct swap_cluster_info *ci;
        unsigned int offset, found = 0;
-       LIST_HEAD(fraged);
 
 new_cluster:
        lockdep_assert_held(&si->lock);
@@ -690,25 +748,42 @@ new_cluster:
        }
 
        if (order < PMD_ORDER) {
-               list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) {
-                       list_move_tail(&ci->list, &fraged);
+               unsigned int frags = 0;
+
+               while (!list_empty(&si->nonfull_clusters[order])) {
+                       ci = list_first_entry(&si->nonfull_clusters[order],
+                                             struct swap_cluster_info, list);
+                       list_move_tail(&ci->list, &si->frag_clusters[order]);
                        ci->flags = CLUSTER_FLAG_FRAG;
+                       si->frag_cluster_nr[order]++;
                        offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
                                                         &found, order, usage);
+                       frags++;
                        if (found)
                                break;
                }
 
                if (!found) {
-                       list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) {
+                       /*
+                        * Nonfull clusters were moved to the frag tail if we reached
+                        * here; count them too so we don't over-scan the frag list.
+                        */
+                       while (frags < si->frag_cluster_nr[order]) {
+                               ci = list_first_entry(&si->frag_clusters[order],
+                                                     struct swap_cluster_info, list);
+                               /*
+                                * Rotate the frag list to iterate; these clusters all either
+                                * failed a high order allocation or were moved here due to
+                                * per-CPU usage. Rotating helps keep usable clusters ahead.
+                                */
+                               list_move_tail(&ci->list, &si->frag_clusters[order]);
                                offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
                                                                 &found, order, usage);
+                               frags++;
                                if (found)
                                        break;
                        }
                }
-
-               list_splice_tail(&fraged, &si->frag_clusters[order]);
        }
 
        if (found)
@@ -729,25 +804,28 @@ new_cluster:
 
        /* Order 0 stealing from higher order */
        for (int o = 1; o < SWAP_NR_ORDERS; o++) {
-               if (!list_empty(&si->frag_clusters[o])) {
+               /*
+                * Clusters here have at least one usable slot and can't fail order 0
+                * allocation, but reclaim may drop si->lock and race with another user.
+                */
+               while (!list_empty(&si->frag_clusters[o])) {
                        ci = list_first_entry(&si->frag_clusters[o],
                                              struct swap_cluster_info, list);
-                       offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found,
-                                                        0, usage);
-                       VM_BUG_ON(!found);
-                       goto done;
+                       offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
+                                                        &found, 0, usage);
+                       if (found)
+                               goto done;
                }
 
-               if (!list_empty(&si->nonfull_clusters[o])) {
-                       ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info,
-                                             list);
+               while (!list_empty(&si->nonfull_clusters[o])) {
+                       ci = list_first_entry(&si->nonfull_clusters[o],
+                                             struct swap_cluster_info, list);
                        offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
                                                         &found, 0, usage);
-                       VM_BUG_ON(!found);
-                       goto done;
+                       if (found)
+                               goto done;
                }
        }
-
 done:
        cluster->next[order] = offset;
        return found;
@@ -3042,6 +3120,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
        for (i = 0; i < SWAP_NR_ORDERS; i++) {
                INIT_LIST_HEAD(&p->nonfull_clusters[i]);
                INIT_LIST_HEAD(&p->frag_clusters[i]);
+               p->frag_cluster_nr[i] = 0;
        }
 
        for (i = 0; i < swap_header->info.nr_badpages; i++) {
@@ -3085,7 +3164,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
        if (!cluster_info)
                return nr_extents;
 
-
        /*
         * Reduce false cache line sharing between cluster_info and
         * sharing same address space.