                } else if (!cluster_list_empty(&si->discard_clusters)) {
                        /*
                         * we don't have free cluster but have some clusters in
-                        * discarding, do discard now and reclaim them
+                        * discarding, do discard now and reclaim them, then
+                        * reread cluster_next_cpu since we dropped si->lock
                         */
                        swap_do_scheduled_discard(si);
-                       *scan_base = *offset = si->cluster_next;
+                       *scan_base = this_cpu_read(*si->cluster_next_cpu);
+                       *offset = *scan_base;
                        goto new_cluster;
                } else
                        return false;
        }
 }
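
The re-read above is needed because swap_do_scheduled_discard() runs with si->lock dropped, so the per-CPU next position can be moved by other CPUs in the meantime. Below is a minimal userspace sketch of that drop-lock/re-read pattern; the toy_allocator, toy_slow_reclaim() and toy_alloc_slot() names are made up for illustration and are not part of the patch.

/* Illustrative sketch only -- not part of the patch. */
#include <pthread.h>

struct toy_allocator {
        pthread_mutex_t lock;
        unsigned long   next;   /* next slot to hand out, protected by lock */
};

/* Called with alloc->lock held; drops and re-takes it, like a discard. */
static void toy_slow_reclaim(struct toy_allocator *alloc)
{
        pthread_mutex_unlock(&alloc->lock);
        /* ...slow work; other threads may advance alloc->next here... */
        pthread_mutex_lock(&alloc->lock);
}

static unsigned long toy_alloc_slot(struct toy_allocator *alloc)
{
        unsigned long slot;

        pthread_mutex_lock(&alloc->lock);
        slot = alloc->next;             /* snapshot, valid only while locked */
        toy_slow_reclaim(alloc);        /* lock was dropped in between... */
        slot = alloc->next;             /* ...so the snapshot must be re-read */
        alloc->next = slot + 1;
        pthread_mutex_unlock(&alloc->lock);
        return slot;
}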
 
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+{
+       unsigned long prev;
+
+       if (!(si->flags & SWP_SOLIDSTATE)) {
+               si->cluster_next = next;
+               return;
+       }
+
+       prev = this_cpu_read(*si->cluster_next_cpu);
+       /*
+        * If the next offset falls in a different swap address space
+        * sized, aligned trunk than the previous one, pick another
+        * trunk at random to avoid lock contention on the swap
+        * address space if possible.
+        */
+       if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
+           (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
+               /* No free swap slots available */
+               if (si->highest_bit <= si->lowest_bit)
+                       return;
+               next = si->lowest_bit +
+                       prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+               next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
+               next = max_t(unsigned int, next, si->lowest_bit);
+       }
+       this_cpu_write(*si->cluster_next_cpu, next);
+}
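
As a sanity check of the arithmetic in set_cluster_next(), here is a standalone userspace sketch. The names and values are stand-ins: TRUNK_SHIFT plays the role of SWAP_ADDRESS_SPACE_SHIFT (14 in the kernel, i.e. 16384 slots per trunk), rand() replaces prandom_u32_max(), and the lowest/highest bounds passed in main() are invented. Not part of the patch.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>
#include <stdlib.h>

#define TRUNK_SHIFT     14                      /* stands in for SWAP_ADDRESS_SPACE_SHIFT */
#define TRUNK_PAGES     (1UL << TRUNK_SHIFT)    /* stands in for SWAP_ADDRESS_SPACE_PAGES */
#define ALIGN_DOWN(x, a)        ((x) & ~((a) - 1))

/* Mimics the trunk hop in set_cluster_next() with made-up bounds. */
static unsigned long pick_next(unsigned long prev, unsigned long next,
                               unsigned long lowest, unsigned long highest)
{
        if ((prev >> TRUNK_SHIFT) == (next >> TRUNK_SHIFT))
                return next;                    /* same trunk: keep advancing linearly */
        if (highest <= lowest)
                return prev;                    /* no free slots: leave the base alone */
        next = lowest + rand() % (highest - lowest + 1);
        next = ALIGN_DOWN(next, TRUNK_PAGES);   /* snap to the start of a trunk */
        if (next < lowest)                      /* the first trunk may start below lowest */
                next = lowest;
        return next;
}

int main(void)
{
        /* prev in trunk 0, next would step into trunk 1: a random hop is taken */
        printf("%lu\n", pick_next(16383, 16384, 1, 1UL << 20));
        /* prev and next in the same trunk: the linear advance is kept */
        printf("%lu\n", pick_next(100, 101, 1, 1UL << 20));
        return 0;
}

The two calls in main() show the intent: inside a trunk the per-CPU base advances linearly, and only a boundary crossing triggers a hop to a random, trunk-aligned start clamped to the usable range.
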
+
 static int scan_swap_map_slots(struct swap_info_struct *si,
                               unsigned char usage, int nr,
                               swp_entry_t slots[])
         */
 
        si->flags += SWP_SCANNING;
-       scan_base = offset = si->cluster_next;
+       /*
+        * Use percpu scan base for SSD to reduce lock contention on
+        * cluster and swap cache.  For HDD, sequential access is more
+        * important.
+        */
+       if (si->flags & SWP_SOLIDSTATE)
+               scan_base = this_cpu_read(*si->cluster_next_cpu);
+       else
+               scan_base = si->cluster_next;
+       offset = scan_base;
 
        /* SSD algorithm */
        if (si->cluster_info) {
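
The new comment above the scan base setup is the core of the change: each SWAP_ADDRESS_SPACE_PAGES sized chunk of a swap device is backed by its own swap cache address_space (and thus its own lock), so CPUs whose scan bases sit in different chunks rarely contend with each other. A throwaway sketch of that offset-to-chunk mapping follows; trunk_index() is a hypothetical helper and TRUNK_SHIFT again stands in for SWAP_ADDRESS_SPACE_SHIFT. Not part of the patch.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>

#define TRUNK_SHIFT 14  /* stands in for SWAP_ADDRESS_SPACE_SHIFT */

/* Which per-trunk swap cache address space a swap offset falls into. */
static unsigned long trunk_index(unsigned long offset)
{
        return offset >> TRUNK_SHIFT;
}

int main(void)
{
        /* Scan bases in different trunks map to different address spaces,
         * so the two CPUs below do not touch the same swap cache lock. */
        printf("cpu0 base  1000 -> trunk %lu\n", trunk_index(1000));
        printf("cpu1 base 40000 -> trunk %lu\n", trunk_index(40000));
        return 0;
}
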
        unlock_cluster(ci);
 
        swap_range_alloc(si, offset, 1);
-       si->cluster_next = offset + 1;
        slots[n_ret++] = swp_entry(si->type, offset);
 
        /* got enough slots or reach max slots? */
        }
 
 done:
+       set_cluster_next(si, offset + 1);
        si->flags -= SWP_SCANNING;
        return n_ret;
 
        mutex_unlock(&swapon_mutex);
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
+       free_percpu(p->cluster_next_cpu);
+       p->cluster_next_cpu = NULL;
        vfree(swap_map);
        kvfree(cluster_info);
        kvfree(frontswap_map);
                unsigned long ci, nr_cluster;
 
                p->flags |= SWP_SOLIDSTATE;
+               p->cluster_next_cpu = alloc_percpu(unsigned int);
+               if (!p->cluster_next_cpu) {
+                       error = -ENOMEM;
+                       goto bad_swap_unlock_inode;
+               }
                /*
                 * select a random position to start with to help wear leveling
                 * SSD
                 */
-               p->cluster_next = 1 + prandom_u32_max(p->highest_bit);
+               for_each_possible_cpu(cpu) {
+                       per_cpu(*p->cluster_next_cpu, cpu) =
+                               1 + prandom_u32_max(p->highest_bit);
+               }
                nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 
                cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
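
On the per-CPU initialisation above: prandom_u32_max(n) returns a value in [0, n), so every CPU's starting slot lands in [1, p->highest_bit], skipping slot 0, which holds the swap header. A userspace stand-in follows, with rand() in place of prandom_u32_max() and a made-up four-CPU loop in place of for_each_possible_cpu(). Not part of the patch.

/* Illustrative sketch only -- not part of the patch. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        unsigned int highest_bit = 1 << 20;     /* made-up last usable swap slot */
        unsigned int next[4];                   /* one scan base per made-up CPU */

        /* Like the for_each_possible_cpu() loop above: every CPU gets its
         * own random start in [1, highest_bit], spreading writes (and
         * wear) across the whole device. */
        for (int cpu = 0; cpu < 4; cpu++) {
                next[cpu] = 1 + rand() % highest_bit;
                printf("cpu%d starts scanning at slot %u\n", cpu, next[cpu]);
        }
        return 0;
}
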
 bad_swap:
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
+       free_percpu(p->cluster_next_cpu);
+       p->cluster_next_cpu = NULL;
        if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
                set_blocksize(p->bdev, p->old_block_size);
                blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);