From 09126a9c1be2bded95d9145ee8421e98ce5a626e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 5 Dec 2018 11:14:03 +1100 Subject: [PATCH] mm, swap: fix race between swapoff and some swap operations MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit When swapin is performed, after getting the swap entry information from the page table, system will swap in the swap entry, without any lock held to prevent the swap device from being swapoff. This may cause the race like below, CPU 1 CPU 2 ----- ----- do_swap_page swapin_readahead __read_swap_cache_async swapoff swapcache_prepare p->swap_map = NULL __swap_duplicate p->swap_map[?] /* !!! NULL pointer access */ Because swapoff is usually done when system shutdown only, the race may not hit many people in practice. But it is still a race need to be fixed. To fix the race, get_swap_device() is added to check whether the specified swap entry is valid in its swap device. If so, it will keep the swap entry valid via preventing the swap device from being swapoff, until put_swap_device() is called. Because swapoff() is very rare code path, to make the normal path runs as fast as possible, disabling preemption + stop_machine() instead of reference count is used to implement get/put_swap_device(). From get_swap_device() to put_swap_device(), the preemption is disabled, so stop_machine() in swapoff() will wait until put_swap_device() is called. In addition to swap_map, cluster_info, etc. data structure in the struct swap_info_struct, the swap cache radix tree will be freed after swapoff, so this patch fixes the race between swap cache looking up and swapoff too. Races between some other swap cache usages protected via disabling preemption and swapoff are fixed too via calling stop_machine() between clearing PageSwapCache() and freeing swap cache data structure. Alternative implementation could be replacing disable preemption with rcu_read_lock_sched and stop_machine() with synchronize_sched(). Link: http://lkml.kernel.org/r/20180213014220.2464-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Hugh Dickins Cc: Paul E. McKenney Cc: Minchan Kim Cc: Johannes Weiner Cc: Tim Chen Cc: Shaohua Li Cc: Mel Gorman Cc: Jérôme Glisse Cc: Michal Hocko Cc: Andrea Arcangeli Cc: David Rientjes Cc: Rik van Riel Cc: Jan Kara Cc: Dave Jiang Cc: Aaron Lu Signed-off-by: Andrew Morton Signed-off-by: Stephen Rothwell --- include/linux/swap.h | 13 ++++- mm/memory.c | 2 +- mm/swap_state.c | 16 +++++- mm/swapfile.c | 129 +++++++++++++++++++++++++++++++++---------- 4 files changed, 123 insertions(+), 37 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 3f750b709a4f..a78b416a606a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -173,8 +173,9 @@ enum { SWP_PAGE_DISCARD = (1 << 10), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 11), /* no overwrite PG_writeback pages */ SWP_SYNCHRONOUS_IO = (1 << 12), /* synchronous IO is efficient */ + SWP_VALID = (1 << 13), /* swap is valid to be operated on? */ /* add others here before... */ - SWP_SCANNING = (1 << 13), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 14), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL @@ -458,7 +459,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); -extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); +extern int __swap_count(swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); @@ -468,6 +469,12 @@ extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); +extern struct swap_info_struct *get_swap_device(swp_entry_t entry); + +static inline void put_swap_device(struct swap_info_struct *si) +{ + preempt_enable(); +} #else /* CONFIG_SWAP */ @@ -574,7 +581,7 @@ static inline int page_swapcount(struct page *page) return 0; } -static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +static inline int __swap_count(swp_entry_t entry) { return 0; } diff --git a/mm/memory.c b/mm/memory.c index 4a67eb8aef55..0a62bead768f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2716,7 +2716,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) struct swap_info_struct *si = swp_swap_info(entry); if (si->flags & SWP_SYNCHRONOUS_IO && - __swap_count(si, entry) == 1) { + __swap_count(entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); diff --git a/mm/swap_state.c b/mm/swap_state.c index fd2f21e1c60a..5a1cc9387151 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -310,8 +310,13 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct page *page; + struct swap_info_struct *si; + si = get_swap_device(entry); + if (!si) + return NULL; page = find_get_page(swap_address_space(entry), swp_offset(entry)); + put_swap_device(si); INC_CACHE_INFO(find_total); if (page) { @@ -354,8 +359,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { - struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = swap_address_space(entry); + struct page *found_page = NULL, *new_page = NULL; + struct swap_info_struct *si; int err; *new_page_allocated = false; @@ -365,7 +370,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ - found_page = find_get_page(swapper_space, swp_offset(entry)); + si = get_swap_device(entry); + if (!si) + break; + found_page = find_get_page(swap_address_space(entry), + swp_offset(entry)); + put_swap_device(si); if (found_page) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index 23a500d0b4d5..f0cbad9d6ccd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -1174,6 +1175,41 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, return usage; } +/* + * Check whether swap entry is valid in the swap device. If so, + * return pointer to swap_info_struct, and keep the swap entry valid + * via preventing the swap device from being swapoff, until + * put_swap_device() is called. Otherwise return NULL. + */ +struct swap_info_struct *get_swap_device(swp_entry_t entry) +{ + struct swap_info_struct *si; + unsigned long type, offset; + + if (!entry.val) + goto out; + type = swp_type(entry); + if (type >= nr_swapfiles) + goto bad_nofile; + si = swap_info[type]; + + preempt_disable(); + if (!(si->flags & SWP_VALID)) + goto unlock_out; + offset = swp_offset(entry); + if (offset >= si->max) + goto unlock_out; + + return si; +bad_nofile: + pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val); +out: + return NULL; +unlock_out: + preempt_enable(); + return NULL; +} + static unsigned char __swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -1345,11 +1381,18 @@ int page_swapcount(struct page *page) return count; } -int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +int __swap_count(swp_entry_t entry) { + struct swap_info_struct *si; pgoff_t offset = swp_offset(entry); + int count = 0; - return swap_count(si->swap_map[offset]); + si = get_swap_device(entry); + if (si) { + count = swap_count(si->swap_map[offset]); + put_swap_device(si); + } + return count; } static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) @@ -1374,9 +1417,11 @@ int __swp_swapcount(swp_entry_t entry) int count = 0; struct swap_info_struct *si; - si = __swap_info_get(entry); - if (si) + si = get_swap_device(entry); + if (si) { count = swap_swapcount(si, entry); + put_swap_device(si); + } return count; } @@ -2330,9 +2375,9 @@ static int swap_node(struct swap_info_struct *p) return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; } -static void _enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map, - struct swap_cluster_info *cluster_info) +static void setup_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) { int i; @@ -2357,7 +2402,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, } p->swap_map = swap_map; p->cluster_info = cluster_info; - p->flags |= SWP_WRITEOK; +} + +static void _enable_swap_info(struct swap_info_struct *p) +{ + p->flags |= SWP_WRITEOK | SWP_VALID; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; @@ -2376,6 +2425,11 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, add_to_avail_list(p); } +static int swap_onoff_stop(void *arg) +{ + return 0; +} + static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info, @@ -2384,7 +2438,17 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, prio, swap_map, cluster_info); + setup_swap_info(p, prio, swap_map, cluster_info); + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * Guarantee swap_map, cluster_info, etc. fields are used + * between get/put_swap_device() only if SWP_VALID bit is set + */ + stop_machine(swap_onoff_stop, NULL, cpu_online_mask); + spin_lock(&swap_lock); + spin_lock(&p->lock); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2393,7 +2457,8 @@ static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); + setup_swap_info(p, p->prio, p->swap_map, p->cluster_info); + _enable_swap_info(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -2496,6 +2561,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) reenable_swap_slots_cache_unlock(); + spin_lock(&swap_lock); + spin_lock(&p->lock); + p->flags &= ~SWP_VALID; /* mark swap device as invalid */ + spin_unlock(&p->lock); + spin_unlock(&swap_lock); + /* + * wait for swap operations protected by get/put_swap_device() + * to complete + */ + stop_machine(swap_onoff_stop, NULL, cpu_online_mask); + flush_work(&p->discard_work); destroy_swap_extents(p); @@ -3260,22 +3336,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { struct swap_info_struct *p; struct swap_cluster_info *ci; - unsigned long offset, type; + unsigned long offset; unsigned char count; unsigned char has_cache; int err = -EINVAL; - if (non_swap_entry(entry)) + p = get_swap_device(entry); + if (!p) goto out; - type = swp_type(entry); - if (type >= nr_swapfiles) - goto bad_file; - p = swap_info[type]; offset = swp_offset(entry); - if (unlikely(offset >= p->max)) - goto out; - ci = lock_cluster_or_swap_info(p, offset); count = p->swap_map[offset]; @@ -3321,11 +3391,9 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) unlock_out: unlock_cluster_or_swap_info(p, ci); out: + if (p) + put_swap_device(p); return err; - -bad_file: - pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); - goto out; } /* @@ -3417,6 +3485,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) struct page *list_page; pgoff_t offset; unsigned char count; + int ret = 0; /* * When debugging, it's easier to use __GFP_ZERO here; but it's better @@ -3424,15 +3493,15 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) */ page = alloc_page(gfp_mask | __GFP_HIGHMEM); - si = swap_info_get(entry); + si = get_swap_device(entry); if (!si) { /* * An acceptable race has occurred since the failing - * __swap_duplicate(): the swap entry has been freed, - * perhaps even the whole swap_map cleared for swapoff. + * __swap_duplicate(): the swap device may be swapoff */ goto outer; } + spin_lock(&si->lock); offset = swp_offset(entry); @@ -3450,9 +3519,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) } if (!page) { - unlock_cluster(ci); - spin_unlock(&si->lock); - return -ENOMEM; + ret = -ENOMEM; + goto out; } /* @@ -3504,10 +3572,11 @@ out_unlock_cont: out: unlock_cluster(ci); spin_unlock(&si->lock); + put_swap_device(si); outer: if (page) __free_page(page); - return 0; + return ret; } /* -- 2.50.1