From: Linus Torvalds Date: Wed, 28 Jun 2023 17:28:11 +0000 (-0700) Subject: Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel... X-Git-Tag: dma-mapping-6.6-2023-08-29~309 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=6e17c6de3ddf3073741d9c91a796ee696914d8a0;p=users%2Fhch%2Fdma-mapping.git Merge tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Pull mm updates from Andrew Morton: - Yosry Ahmed brought back some cgroup v1 stats in OOM logs - Yosry has also eliminated cgroup's atomic rstat flushing - Nhat Pham adds the new cachestat() syscall. It provides userspace with the ability to query pagecache status - a similar concept to mincore() but more powerful and with improved usability - Mel Gorman provides more optimizations for compaction, reducing the prevalence of page rescanning - Lorenzo Stoakes has done some maintanance work on the get_user_pages() interface - Liam Howlett continues with cleanups and maintenance work to the maple tree code. Peng Zhang also does some work on maple tree - Johannes Weiner has done some cleanup work on the compaction code - David Hildenbrand has contributed additional selftests for get_user_pages() - Thomas Gleixner has contributed some maintenance and optimization work for the vmalloc code - Baolin Wang has provided some compaction cleanups, - SeongJae Park continues maintenance work on the DAMON code - Huang Ying has done some maintenance on the swap code's usage of device refcounting - Christoph Hellwig has some cleanups for the filemap/directio code - Ryan Roberts provides two patch series which yield some rationalization of the kernel's access to pte entries - use the provided APIs rather than open-coding accesses - Lorenzo Stoakes has some fixes to the interaction between pagecache and directio access to file mappings - John Hubbard has a series of fixes to the MM selftesting code - ZhangPeng continues the folio conversion campaign - Hugh Dickins has been working on the pagetable handling code, mainly with a view to reducing the load on the mmap_lock - Catalin Marinas has reduced the arm64 kmalloc() minimum alignment from 128 to 8 - Domenico Cerasuolo has improved the zswap reclaim mechanism by reorganizing the LRU management - Matthew Wilcox provides some fixups to make gfs2 work better with the buffer_head code - Vishal Moola also has done some folio conversion work - Matthew Wilcox has removed the remnants of the pagevec code - their functionality is migrated over to struct folio_batch * tag 'mm-stable-2023-06-24-19-15' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (380 commits) mm/hugetlb: remove hugetlb_set_page_subpool() mm: nommu: correct the range of mmap_sem_read_lock in task_mem() hugetlb: revert use of page_cache_next_miss() Revert "page cache: fix page_cache_next/prev_miss off by one" mm/vmscan: fix root proactive reclaim unthrottling unbalanced node mm: memcg: rename and document global_reclaim() mm: kill [add|del]_page_to_lru_list() mm: compaction: convert to use a folio in isolate_migratepages_block() mm: zswap: fix double invalidate with exclusive loads mm: remove unnecessary pagevec includes mm: remove references to pagevec mm: rename invalidate_mapping_pagevec to mapping_try_invalidate mm: remove struct pagevec net: convert sunrpc from pagevec to folio_batch i915: convert i915_gpu_error to use a folio_batch pagevec: rename fbatch_count() mm: remove check_move_unevictable_pages() drm: convert drm_gem_put_pages() to use a folio_batch i915: convert shmem_sg_free_table() to use a folio_batch scatterlist: add sg_set_folio() ... --- 6e17c6de3ddf3073741d9c91a796ee696914d8a0 diff --cc fs/btrfs/file.c index ba5b0c9f2bbd,ecd43ab66fa6..fd03e689a6be --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@@ -1682,7 -1683,9 +1679,6 @@@ ssize_t btrfs_do_write_iter(struct kioc num_written = num_sync; } - current->backing_dev_info = NULL; - if (sync) - atomic_dec(&inode->sync_writers); - return num_written; } diff --cc include/linux/mm.h index fec149585985,9ecb8b9c07f6..ae866bc9bad6 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@@ -1910,40 -1901,18 +1901,40 @@@ static inline bool page_needs_cow_for_d return page_maybe_dma_pinned(page); } +/** + * is_zero_page - Query if a page is a zero page + * @page: The page to query + * + * This returns true if @page is one of the permanent zero pages. + */ +static inline bool is_zero_page(const struct page *page) +{ + return is_zero_pfn(page_to_pfn(page)); +} + +/** + * is_zero_folio - Query if a folio is a zero page + * @folio: The folio to query + * + * This returns true if @folio is one of the permanent zero pages. + */ +static inline bool is_zero_folio(const struct folio *folio) +{ + return is_zero_page(&folio->page); +} + - /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin pages */ + /* MIGRATE_CMA and ZONE_MOVABLE do not allow pin folios */ #ifdef CONFIG_MIGRATION - static inline bool is_longterm_pinnable_page(struct page *page) + static inline bool folio_is_longterm_pinnable(struct folio *folio) { #ifdef CONFIG_CMA - int mt = get_pageblock_migratetype(page); + int mt = folio_migratetype(folio); if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE) return false; #endif - /* The zero page may always be pinned */ - if (is_zero_pfn(folio_pfn(folio))) + /* The zero page can be "pinned" but gets special handling. */ - if (is_zero_page(page)) ++ if (is_zero_folio(folio)) return true; /* Coherent device memory must always allow eviction. */ diff --cc include/linux/suspend.h index 4d0095e8989e,76923051c03d..ef503088942d --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@@ -512,7 -500,13 +509,12 @@@ extern void pm_report_max_hw_sleep(u64 /* drivers/base/power/wakeup.c */ extern bool events_check_enabled; -extern suspend_state_t pm_suspend_target_state; + static inline bool pm_suspended_storage(void) + { + return !gfp_has_io_fs(gfp_allowed_mask); + } + extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); extern void pm_system_cancel_wakeup(void); diff --cc mm/gup.c index 0814576b7366,a8336b39d6b5..48c1659314b0 --- a/mm/gup.c +++ b/mm/gup.c @@@ -127,62 -132,50 +133,57 @@@ struct folio *try_grab_folio(struct pag if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) return NULL; - folio = try_get_folio(page, refs); - if (flags & FOLL_GET) - return folio; + return try_get_folio(page, refs); - else if (flags & FOLL_PIN) { - struct folio *folio; - /* - * Don't take a pin on the zero page - it's not going anywhere - * and it is used in a *lot* of places. - */ - if (is_zero_page(page)) - return page_folio(page); + /* FOLL_PIN is set */ + - /* - * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a - * right zone, so fail and let the caller fall back to the slow - * path. - */ - if (unlikely((flags & FOLL_LONGTERM) && - !is_longterm_pinnable_page(page))) - return NULL; ++ /* ++ * Don't take a pin on the zero page - it's not going anywhere ++ * and it is used in a *lot* of places. ++ */ ++ if (is_zero_page(page)) ++ return page_folio(page); + - /* - * CAUTION: Don't use compound_head() on the page before this - * point, the result won't be stable. - */ - folio = try_get_folio(page, refs); - if (!folio) - return NULL; ++ folio = try_get_folio(page, refs); + if (!folio) + return NULL; - /* - * When pinning a large folio, use an exact count to track it. - * - * However, be sure to *also* increment the normal folio - * refcount field at least once, so that the folio really - * is pinned. That's why the refcount from the earlier - * try_get_folio() is left intact. - */ - if (folio_test_large(folio)) - atomic_add(refs, &folio->_pincount); - else - folio_ref_add(folio, - refs * (GUP_PIN_COUNTING_BIAS - 1)); - /* - * Adjust the pincount before re-checking the PTE for changes. - * This is essentially a smp_mb() and is paired with a memory - * barrier in page_try_share_anon_rmap(). - */ - smp_mb__after_atomic(); + /* + * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a + * right zone, so fail and let the caller fall back to the slow + * path. + */ + if (unlikely((flags & FOLL_LONGTERM) && + !folio_is_longterm_pinnable(folio))) { + if (!put_devmap_managed_page_refs(&folio->page, refs)) + folio_put_refs(folio, refs); + return NULL; + } - node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); + /* + * When pinning a large folio, use an exact count to track it. + * + * However, be sure to *also* increment the normal folio + * refcount field at least once, so that the folio really + * is pinned. That's why the refcount from the earlier + * try_get_folio() is left intact. + */ + if (folio_test_large(folio)) + atomic_add(refs, &folio->_pincount); + else + folio_ref_add(folio, + refs * (GUP_PIN_COUNTING_BIAS - 1)); + /* + * Adjust the pincount before re-checking the PTE for changes. + * This is essentially a smp_mb() and is paired with a memory + * barrier in page_try_share_anon_rmap(). + */ + smp_mb__after_atomic(); - return folio; - } + node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs); - WARN_ON_ONCE(1); - return NULL; + return folio; } static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) @@@ -3193,13 -3250,9 +3300,12 @@@ EXPORT_SYMBOL(pin_user_pages_remote) * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. + * + * Note that if a zero_page is amongst the returned pages, it will not have + * pins in it and unpin_user_page*() will not remove pins from it. */ long pin_user_pages(unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + unsigned int gup_flags, struct page **pages) { int locked = 1; diff --cc mm/mmap.c index d600404580b2,4fc496bc5b95..8f1000bc35df --- a/mm/mmap.c +++ b/mm/mmap.c @@@ -2385,15 -2412,26 +2398,30 @@@ do_vmi_align_munmap(struct vma_iterato if (error) goto end_split_failed; } - error = munmap_sidetree(next, &mas_detach); - if (error) - goto munmap_sidetree_failed; + vma_start_write(next); + mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1); + if (mas_store_gfp(&mas_detach, next, GFP_KERNEL)) + goto munmap_gather_failed; + vma_mark_detached(next, true); + if (next->vm_flags & VM_LOCKED) + locked_vm += vma_pages(next); count++; + if (unlikely(uf)) { + /* + * If userfaultfd_unmap_prep returns an error the vmas + * will remain split, but userland will get a + * highly unexpected error anyway. This is no + * different than the case where the first of the two + * __split_vma fails, but we don't undo the first + * split, despite we could. This is unlikely enough + * failure that it's not worth optimizing it for. + */ + error = userfaultfd_unmap_prep(next, start, end, uf); + + if (error) + goto userfaultfd_error; + } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE BUG_ON(next->vm_start < start); BUG_ON(next->vm_start > end);