From: Linus Torvalds
Date: Wed, 28 Jun 2023 17:28:11 +0000 (-0700)
Subject: Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel...
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=6e17c6de3ddf3073741d9c91a796ee696914d8a0;p=linux.git

Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Pull mm updates from Andrew Morton:

 - Yosry Ahmed brought back some cgroup v1 stats in OOM logs

 - Yosry has also eliminated cgroup's atomic rstat flushing

 - Nhat Pham adds the new cachestat() syscall. It provides userspace
   with the ability to query pagecache status - a similar concept to
   mincore() but more powerful and with improved usability (a usage
   sketch follows the commit list below)

 - Mel Gorman provides more optimizations for compaction, reducing the
   prevalence of page rescanning

 - Lorenzo Stoakes has done some maintenance work on the
   get_user_pages() interface

 - Liam Howlett continues with cleanups and maintenance work to the
   maple tree code. Peng Zhang also does some work on maple tree

 - Johannes Weiner has done some cleanup work on the compaction code

 - David Hildenbrand has contributed additional selftests for
   get_user_pages()

 - Thomas Gleixner has contributed some maintenance and optimization
   work for the vmalloc code

 - Baolin Wang has provided some compaction cleanups

 - SeongJae Park continues maintenance work on the DAMON code

 - Huang Ying has done some maintenance on the swap code's usage of
   device refcounting

 - Christoph Hellwig has some cleanups for the filemap/directio code

 - Ryan Roberts provides two patch series which yield some
   rationalization of the kernel's access to pte entries - use the
   provided APIs rather than open-coding accesses

 - Lorenzo Stoakes has some fixes to the interaction between pagecache
   and directio access to file mappings

 - John Hubbard has a series of fixes to the MM selftesting code

 - ZhangPeng continues the folio conversion campaign

 - Hugh Dickins has been working on the pagetable handling code, mainly
   with a view to reducing the load on the mmap_lock

 - Catalin Marinas has reduced the arm64 kmalloc() minimum alignment
   from 128 to 8

 - Domenico Cerasuolo has improved the zswap reclaim mechanism by
   reorganizing the LRU management

 - Matthew Wilcox provides some fixups to make gfs2 work better with
   the buffer_head code

 - Vishal Moola also has done some folio conversion work

 - Matthew Wilcox has removed the remnants of the pagevec code - their
   functionality is migrated over to struct folio_batch

* tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (380 commits)
  mm/hugetlb: remove hugetlb_set_page_subpool()
  mm: nommu: correct the range of mmap_sem_read_lock in task_mem()
  hugetlb: revert use of page_cache_next_miss()
  Revert "page cache: fix page_cache_next/prev_miss off by one"
  mm/vmscan: fix root proactive reclaim unthrottling unbalanced node
  mm: memcg: rename and document global_reclaim()
  mm: kill [add|del]_page_to_lru_list()
  mm: compaction: convert to use a folio in isolate_migratepages_block()
  mm: zswap: fix double invalidate with exclusive loads
  mm: remove unnecessary pagevec includes
  mm: remove references to pagevec
  mm: rename invalidate_mapping_pagevec to mapping_try_invalidate
  mm: remove struct pagevec
  net: convert sunrpc from pagevec to folio_batch
  i915: convert i915_gpu_error to use a folio_batch
  pagevec: rename fbatch_count()
  mm: remove check_move_unevictable_pages()
  drm: convert drm_gem_put_pages() to use a folio_batch
  i915: convert shmem_sg_free_table() to use a folio_batch
  scatterlist: add sg_set_folio()
  ...
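As a companion to the cachestat() item above, here is a minimal userspace sketch of the call. The struct layout and the x86-64 syscall number (451) below reflect this cycle's uapi headers as best I can tell, and glibc has no wrapper yet, hence the raw syscall(2); treat those details as assumptions rather than a stable ABI reference.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Local copies of the structs from this cycle's uapi <linux/mman.h>. */
struct cachestat_range {
        uint64_t off;
        uint64_t len;   /* len == 0 means "from off to end of file" */
};

struct cachestat {
        uint64_t nr_cache;              /* pages resident in the pagecache */
        uint64_t nr_dirty;              /* of those, dirty */
        uint64_t nr_writeback;          /* of those, under writeback */
        uint64_t nr_evicted;            /* pages evicted from the cache */
        uint64_t nr_recently_evicted;   /* evicted recently enough to still
                                           count against the workingset */
};

int main(int argc, char **argv)
{
        struct cachestat_range range = { 0, 0 };
        struct cachestat cs;
        int fd = open(argc > 1 ? argv[1] : "/etc/hostname", O_RDONLY);

        /* 451 == __NR_cachestat on x86-64 in this cycle; flags must be 0. */
        if (fd < 0 || syscall(451, fd, &range, &cs, 0) != 0) {
                perror("cachestat");
                return 1;
        }
        printf("cached=%llu dirty=%llu writeback=%llu\n",
               (unsigned long long)cs.nr_cache,
               (unsigned long long)cs.nr_dirty,
               (unsigned long long)cs.nr_writeback);
        close(fd);
        return 0;
}

Unlike mincore(), which needs the file mapped and reports only residency, this queries a file descriptor directly and returns dirty and writeback counts in the same call.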
---

6e17c6de3ddf3073741d9c91a796ee696914d8a0
diff --cc fs/btrfs/file.c
index ba5b0c9f2bbd,ecd43ab66fa6..fd03e689a6be
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@@ -1682,7 -1683,9 +1679,6 @@@ ssize_t btrfs_do_write_iter(struct kioc
  		num_written = num_sync;
  	}
  
- 	current->backing_dev_info = NULL;
- 	if (sync)
- 		atomic_dec(&inode->sync_writers);
- 
  	return num_written;
  }
diff --cc include/linux/mm.h
index fec149585985,9ecb8b9c07f6..ae866bc9bad6
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@@ -1910,40 -1901,18 +1901,40 @@@ static inline bool page_needs_cow_for_d
  	return page_maybe_dma_pinned(page);
  }
  
 +/**
 + * is_zero_page - Query if a page is a zero page
 + * @page: The page to query
 + *
 + * This returns true if @page is one of the permanent zero pages.
 + */
 +static inline bool is_zero_page(const struct page *page)
 +{
 +	return is_zero_pfn(page_to_pfn(page));
 +}
 +
 +/**
 + * is_zero_folio - Query if a folio is a zero page
 + * @folio: The folio to query
 + *
 + * This returns true if @folio is one of the permanent zero pages.
 + */
 +static inline bool is_zero_folio(const struct folio *folio)
 +{
 +	return is_zero_page(&folio->page);
 +}
 +
- /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */
+ /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */
  #ifdef CONFIG_MIGRATION
- static inline bool is_longterm_pinnable_page(struct page *page)
+ static inline bool folio_is_longterm_pinnable(struct folio *folio)
  {
  #ifdef CONFIG_CMA
- 	int mt = get_pageblock_migratetype(page);
+ 	int mt = folio_migratetype(folio);
  
  	if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
  		return false;
  #endif
- 	/* The zero page may always be pinned */
- 	if (is_zero_pfn(folio_pfn(folio)))
 +	/* The zero page can be "pinned" but gets special handling. */
- 	if (is_zero_page(page))
++	if (is_zero_folio(folio))
  		return true;
  
  	/* Coherent device memory must always allow eviction. */
diff --cc include/linux/suspend.h
index 4d0095e8989e,76923051c03d..ef503088942d
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@@ -512,7 -500,13 +509,12 @@@ extern void pm_report_max_hw_sleep(u64
  
  /* drivers/base/power/wakeup.c */
  extern bool events_check_enabled;
 -extern suspend_state_t pm_suspend_target_state;
  
+ static inline bool pm_suspended_storage(void)
+ {
+ 	return !gfp_has_io_fs(gfp_allowed_mask);
+ }
+ 
  extern bool pm_wakeup_pending(void);
  extern void pm_system_wakeup(void);
  extern void pm_system_cancel_wakeup(void);
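The pm_suspended_storage() replacement above keys entirely off gfp_allowed_mask. For reference, the helper it calls looks like this as far as I can tell from this cycle's gfp.h; treat the exact body as an assumption:

/* Assumed sketch of the helper in include/linux/gfp.h: true iff the
 * mask still permits both block I/O and filesystem callbacks. The PM
 * core clears __GFP_IO and __GFP_FS in gfp_allowed_mask while
 * suspending, so !gfp_has_io_fs(gfp_allowed_mask) reads as "storage
 * is suspended". */
static inline bool gfp_has_io_fs(gfp_t gfp)
{
	return (gfp & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS);
}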
diff --cc mm/gup.c
index 0814576b7366,a8336b39d6b5..48c1659314b0
--- a/mm/gup.c
+++ b/mm/gup.c
@@@ -127,62 -132,50 +133,57 @@@ struct folio *try_grab_folio(struct pag
  	if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
  		return NULL;
  
- 	folio = try_get_folio(page, refs);
- 	if (flags & FOLL_GET)
- 		return folio;
+ 	return try_get_folio(page, refs);
- 	else if (flags & FOLL_PIN) {
- 		struct folio *folio;
- 		/*
- 		 * Don't take a pin on the zero page - it's not going anywhere
- 		 * and it is used in a *lot* of places.
- 		 */
- 		if (is_zero_page(page))
- 			return page_folio(page);
+ 	/* FOLL_PIN is set */
+ 
- 		/*
- 		 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
- 		 * right zone, so fail and let the caller fall back to the slow
- 		 * path.
- 		 */
- 		if (unlikely((flags & FOLL_LONGTERM) &&
- 			     !is_longterm_pinnable_page(page)))
- 			return NULL;
++	/*
++	 * Don't take a pin on the zero page - it's not going anywhere
++	 * and it is used in a *lot* of places.
++	 */
++	if (is_zero_page(page))
++		return page_folio(page);
+ 
- 		/*
- 		 * CAUTION: Don't use compound_head() on the page before this
- 		 * point, the result won't be stable.
- 		 */
- 		folio = try_get_folio(page, refs);
- 		if (!folio)
- 			return NULL;
++	folio = try_get_folio(page, refs);
+ 	if (!folio)
+ 		return NULL;
  
- 		/*
- 		 * When pinning a large folio, use an exact count to track it.
- 		 *
- 		 * However, be sure to *also* increment the normal folio
- 		 * refcount field at least once, so that the folio really
- 		 * is pinned.  That's why the refcount from the earlier
- 		 * try_get_folio() is left intact.
- 		 */
- 		if (folio_test_large(folio))
- 			atomic_add(refs, &folio->_pincount);
- 		else
- 			folio_ref_add(folio,
- 					refs * (GUP_PIN_COUNTING_BIAS - 1));
- 		/*
- 		 * Adjust the pincount before re-checking the PTE for changes.
- 		 * This is essentially a smp_mb() and is paired with a memory
- 		 * barrier in page_try_share_anon_rmap().
- 		 */
- 		smp_mb__after_atomic();
+ 	/*
+ 	 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
+ 	 * right zone, so fail and let the caller fall back to the slow
+ 	 * path.
+ 	 */
+ 	if (unlikely((flags & FOLL_LONGTERM) &&
+ 		     !folio_is_longterm_pinnable(folio))) {
+ 		if (!put_devmap_managed_page_refs(&folio->page, refs))
+ 			folio_put_refs(folio, refs);
+ 		return NULL;
+ 	}
  
- 		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
+ 	/*
+ 	 * When pinning a large folio, use an exact count to track it.
+ 	 *
+ 	 * However, be sure to *also* increment the normal folio
+ 	 * refcount field at least once, so that the folio really
+ 	 * is pinned.  That's why the refcount from the earlier
+ 	 * try_get_folio() is left intact.
+ 	 */
+ 	if (folio_test_large(folio))
+ 		atomic_add(refs, &folio->_pincount);
+ 	else
+ 		folio_ref_add(folio,
+ 				refs * (GUP_PIN_COUNTING_BIAS - 1));
+ 	/*
+ 	 * Adjust the pincount before re-checking the PTE for changes.
+ 	 * This is essentially a smp_mb() and is paired with a memory
+ 	 * barrier in page_try_share_anon_rmap().
+ 	 */
+ 	smp_mb__after_atomic();
  
- 		return folio;
- 	}
+ 	node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);
  
- 	WARN_ON_ONCE(1);
- 	return NULL;
+ 	return folio;
  }
  
  static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
@@@ -3193,13 -3250,9 +3300,12 @@@ EXPORT_SYMBOL(pin_user_pages_remote)
   *
   * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
   * see Documentation/core-api/pin_user_pages.rst for details.
 + *
 + * Note that if a zero_page is amongst the returned pages, it will not have
 + * pins in it and unpin_user_page*() will not remove pins from it.
   */
  long pin_user_pages(unsigned long start, unsigned long nr_pages,
- 		    unsigned int gup_flags, struct page **pages,
- 		    struct vm_area_struct **vmas)
+ 		    unsigned int gup_flags, struct page **pages)
  {
  	int locked = 1;
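The refcount arithmetic in the try_grab_folio() hunk above is easy to misread, so here is a standalone userspace mock of just the accounting rule. GUP_PIN_COUNTING_BIAS and the ">= bias" heuristic match include/linux/mm.h as I understand it; struct mock_folio and both helpers are hypothetical illustration, not kernel code:

#include <stdbool.h>
#include <stdio.h>

#define GUP_PIN_COUNTING_BIAS 1024	/* 1U << 10 in include/linux/mm.h */

struct mock_folio {
	long refcount;
	long pincount;	/* exact pin count, used for large folios only */
	bool large;
};

/* Mirrors the FOLL_PIN path above: one plain reference from
 * try_get_folio(), then either an exact _pincount (large folio) or a
 * biased refcount (small folio). */
static void mock_pin(struct mock_folio *f, int refs)
{
	f->refcount += refs;		/* the try_get_folio() grab */
	if (f->large)
		f->pincount += refs;
	else
		f->refcount += refs * (GUP_PIN_COUNTING_BIAS - 1);
}

/* Mirrors folio_maybe_dma_pinned(): exact for large folios, a
 * heuristic for small ones (false positives past ~1024 plain refs). */
static bool mock_maybe_dma_pinned(const struct mock_folio *f)
{
	if (f->large)
		return f->pincount > 0;
	return f->refcount >= GUP_PIN_COUNTING_BIAS;
}

int main(void)
{
	struct mock_folio small = { .refcount = 1, .large = false };
	struct mock_folio large = { .refcount = 1, .large = true };

	mock_pin(&small, 2);	/* refcount: 1 + 2 * 1024 = 2049 */
	mock_pin(&large, 2);	/* refcount: 3, pincount: 2 */
	printf("small: ref=%ld pinned=%d\n",
	       small.refcount, mock_maybe_dma_pinned(&small));
	printf("large: ref=%ld pin=%ld pinned=%d\n",
	       large.refcount, large.pincount, mock_maybe_dma_pinned(&large));
	return 0;
}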
diff --cc mm/mmap.c
index d600404580b2,4fc496bc5b95..8f1000bc35df
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@@ -2385,15 -2412,26 +2398,30 @@@ do_vmi_align_munmap(struct vma_iterato
  			if (error)
  				goto end_split_failed;
  		}
- 		error = munmap_sidetree(next, &mas_detach);
- 		if (error)
- 			goto munmap_sidetree_failed;
+ 		vma_start_write(next);
+ 		mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1);
+ 		if (mas_store_gfp(&mas_detach, next, GFP_KERNEL))
+ 			goto munmap_gather_failed;
+ 		vma_mark_detached(next, true);
+ 		if (next->vm_flags & VM_LOCKED)
+ 			locked_vm += vma_pages(next);
  		count++;
+ 		if (unlikely(uf)) {
+ 			/*
+ 			 * If userfaultfd_unmap_prep returns an error the vmas
+ 			 * will remain split, but userland will get a
+ 			 * highly unexpected error anyway. This is no
+ 			 * different than the case where the first of the two
+ 			 * __split_vma fails, but we don't undo the first
+ 			 * split, despite we could. This is unlikely enough
+ 			 * failure that it's not worth optimizing it for.
+ 			 */
+ 			error = userfaultfd_unmap_prep(next, start, end, uf);
+ 
+ 			if (error)
+ 				goto userfaultfd_error;
+ 		}
  #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
  		BUG_ON(next->vm_start < start);
  		BUG_ON(next->vm_start > end);
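Since do_vmi_align_munmap() above funnels unmaps through userfaultfd_unmap_prep(), a userspace sketch of the feature being served may help. This is a hedged illustration: UFFD_FEATURE_EVENT_UNMAP and the message layout come from the established uapi, but error handling is minimal and unprivileged use may be blocked by the vm.unprivileged_userfaultfd sysctl. Build with -pthread; the monitor must be a separate thread because the kernel blocks the unmapping thread until the event is consumed.

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Monitor thread: block until one event arrives, then print it. */
static void *monitor(void *arg)
{
	int uffd = *(int *)arg;
	struct uffd_msg msg;

	if (read(uffd, &msg, sizeof(msg)) == sizeof(msg) &&
	    msg.event == UFFD_EVENT_UNMAP)
		printf("unmap event: [0x%llx, 0x%llx)\n",
		       (unsigned long long)msg.arg.remove.start,
		       (unsigned long long)msg.arg.remove.end);
	return NULL;
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_EVENT_UNMAP,	/* ask for unmap events */
	};
	void *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct uffdio_register reg = {
		.range = { .start = (unsigned long)p, .len = page },
		.mode = UFFDIO_REGISTER_MODE_MISSING,
	};
	pthread_t tid;

	if (uffd < 0 || p == MAP_FAILED || ioctl(uffd, UFFDIO_API, &api) ||
	    ioctl(uffd, UFFDIO_REGISTER, &reg)) {
		perror("userfaultfd setup");
		return 1;
	}

	/* munmap() of a registered range queues UFFD_EVENT_UNMAP (via
	 * userfaultfd_unmap_prep(), as in the hunk above) and waits for
	 * the monitor thread to read it before returning. */
	pthread_create(&tid, NULL, monitor, &uffd);
	munmap(p, page);
	pthread_join(tid, NULL);
	return 0;
}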