#define NR_PAGES_IN_LARGE_FOLIO
#endif
+/*
+ * On 32bit, we can cut the required metadata in half, because:
+ * (a) PID_MAX_LIMIT implicitly limits the number of MMs we could ever have,
+ *     so we can limit MM IDs to 15 bits (32767).
+ * (b) We don't expect folios where even a single complete PTE mapping by
+ *     one MM would exceed 15 bits (i.e., folios larger than order-15).
+ */
+#ifdef CONFIG_64BIT
+typedef int mm_id_mapcount_t;
+#define MM_ID_MAPCOUNT_MAX INT_MAX
+typedef unsigned int mm_id_t;
+#else /* !CONFIG_64BIT */
+typedef short mm_id_mapcount_t;
+#define MM_ID_MAPCOUNT_MAX SHRT_MAX
+typedef unsigned short mm_id_t;
+#endif /* CONFIG_64BIT */
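+
+/*
+ * For example: with 4 KiB pages, an order-15 folio has 32768 pages; mapping
+ * all of them into one MM yields a stored per-MM mapcount of 32767 (mapcounts
+ * are stored off-by-one, starting at -1), which is exactly SHRT_MAX.
+ * Likewise, PID_MAX_LIMIT is usually PID_MAX_DEFAULT (0x8000) on 32bit, so
+ * valid MM IDs fit into 15 bits.
+ */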
+
+/* We implicitly use the dummy ID for init_mm etc., where we never rmap pages. */
+#define MM_ID_DUMMY 0
+#define MM_ID_MIN (MM_ID_DUMMY + 1)
+
+/*
+ * We leave the highest bit of each MM ID unused, so we can store a flag
+ * in the highest bit of each folio->_mm_id[].
+ */
+#define MM_ID_BITS ((sizeof(mm_id_t) * BITS_PER_BYTE) - 1)
+#define MM_ID_MASK ((1U << MM_ID_BITS) - 1)
+#define MM_ID_MAX MM_ID_MASK
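+
+/*
+ * With the types above: on 64bit, MM_ID_BITS == 31 and MM_ID_MAX ==
+ * 0x7fffffff; on 32bit, MM_ID_BITS == 15 and MM_ID_MAX == 32767.
+ */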
+
+/*
+ * In order to use bit_spin_lock(), which requires an unsigned long, we
+ * operate on folio->_mm_ids when working on flags.
+ */
+#define FOLIO_MM_IDS_LOCK_BITNUM MM_ID_BITS
+#define FOLIO_MM_IDS_LOCK_BIT BIT(FOLIO_MM_IDS_LOCK_BITNUM)
+#define FOLIO_MM_IDS_SHARED_BITNUM (2 * MM_ID_BITS + 1)
+#define FOLIO_MM_IDS_SHARED_BIT BIT(FOLIO_MM_IDS_SHARED_BITNUM)
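+
+/*
+ * Resulting layout of folio->_mm_ids on a little-endian 32bit kernel:
+ *   bits  0-14: _mm_id[0]
+ *   bit     15: FOLIO_MM_IDS_LOCK_BIT
+ *   bits 16-30: _mm_id[1]
+ *   bit     31: FOLIO_MM_IDS_SHARED_BIT
+ * On 64bit, the ID fields are 31 bits wide, with the lock bit at 31 and the
+ * shared bit at 63. Either way, each flag occupies the unused top bit of one
+ * mm_id_t half.
+ */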
+
/**
* struct folio - Represents a contiguous set of bytes.
* @flags: Identical to the page flags.
* @_nr_pages_mapped: Do not use outside of rmap and debug code.
* @_pincount: Do not use directly, call folio_maybe_dma_pinned().
* @_nr_pages: Do not use directly, call folio_nr_pages().
+ * @_mm_id: Do not use outside of rmap code.
+ * @_mm_ids: Do not use outside of rmap code.
+ * @_mm_id_mapcount: Do not use outside of rmap code.
* @_hugetlb_subpool: Do not use directly, use accessor in hugetlb.h.
* @_hugetlb_cgroup: Do not use directly, use accessor in hugetlb_cgroup.h.
* @_hugetlb_cgroup_rsvd: Do not use directly, use accessor in hugetlb_cgroup.h.
atomic_t _entire_mapcount;
atomic_t _pincount;
#endif /* CONFIG_64BIT */
+ mm_id_mapcount_t _mm_id_mapcount[2];
+ union {
+ mm_id_t _mm_id[2];
+ unsigned long _mm_ids;
+ };
};
unsigned long _usable_1[4];
};
#endif
} lru_gen;
#endif /* CONFIG_LRU_GEN_WALKS_MMU */
+#ifdef CONFIG_MM_ID
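+	/* Identifies this MM in folio->_mm_id[]; see MM_ID_MIN/MM_ID_MAX. */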
+ mm_id_t mm_id;
+#endif /* CONFIG_MM_ID */
} __randomize_layout;
/*
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>
+#include <linux/bit_spinlock.h>
/*
* The anon_vma heads a list of private "related" vmas, to scan if
struct anon_vma *folio_get_anon_vma(const struct folio *folio);
+#ifdef CONFIG_MM_ID
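+/*
+ * All updates to folio->_mm_ids and folio->_mm_id_mapcount[] happen with
+ * this lock held, with the exception of folio_set_large_mapcount(), which
+ * operates on folios that are not mapped yet.
+ */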
+static __always_inline void folio_lock_large_mapcount(struct folio *folio)
+{
+ bit_spin_lock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
+}
+
+static __always_inline void folio_unlock_large_mapcount(struct folio *folio)
+{
+ __bit_spin_unlock(FOLIO_MM_IDS_LOCK_BITNUM, &folio->_mm_ids);
+}
+
+static inline unsigned int folio_mm_id(const struct folio *folio, int idx)
+{
+ VM_WARN_ON_ONCE(idx != 0 && idx != 1);
+ return folio->_mm_id[idx] & MM_ID_MASK;
+}
+
+static inline void folio_set_mm_id(struct folio *folio, int idx, mm_id_t id)
+{
+ VM_WARN_ON_ONCE(idx != 0 && idx != 1);
+ folio->_mm_id[idx] &= ~MM_ID_MASK;
+ folio->_mm_id[idx] |= id;
+}
+
+static inline void __folio_large_mapcount_sanity_checks(const struct folio *folio,
+ int diff, mm_id_t mm_id)
+{
+ VM_WARN_ON_ONCE(!folio_test_large(folio) || folio_test_hugetlb(folio));
+ VM_WARN_ON_ONCE(diff <= 0);
+ VM_WARN_ON_ONCE(mm_id < MM_ID_MIN || mm_id > MM_ID_MAX);
+
+ /*
+ * Make sure we can detect at least one complete PTE mapping of the
+ * folio in a single MM as "exclusively mapped". This is primarily
+ * a check on 32bit, where we currently reduce the size of the per-MM
+ * mapcount to a short.
+ */
+ VM_WARN_ON_ONCE(diff > folio_large_nr_pages(folio));
+ VM_WARN_ON_ONCE(folio_large_nr_pages(folio) - 1 > MM_ID_MAPCOUNT_MAX);
+
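+	/* Free slots (MM_ID_DUMMY) must have mapcount -1; owned slots >= 0. */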
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) == MM_ID_DUMMY &&
+ folio->_mm_id_mapcount[0] != -1);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY &&
+ folio->_mm_id_mapcount[0] < 0);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 1) == MM_ID_DUMMY &&
+ folio->_mm_id_mapcount[1] != -1);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY &&
+ folio->_mm_id_mapcount[1] < 0);
+ VM_WARN_ON_ONCE(!folio_mapped(folio) &&
+ folio_test_large_maybe_mapped_shared(folio));
+}
+
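+/*
+ * Take ownership of MM slot 0 when a large folio gets mapped for the first
+ * time. No lock is taken: the folio is expected to be unmapped, so no
+ * concurrent rmap operations are possible.
+ */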
+static __always_inline void folio_set_large_mapcount(struct folio *folio,
+ int mapcount, struct vm_area_struct *vma)
+{
+ __folio_large_mapcount_sanity_checks(folio, mapcount, vma->vm_mm->mm_id);
+
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != MM_ID_DUMMY);
+ VM_WARN_ON_ONCE(folio_mm_id(folio, 1) != MM_ID_DUMMY);
+
+ /* Note: mapcounts start at -1. */
+ atomic_set(&folio->_large_mapcount, mapcount - 1);
+ folio->_mm_id_mapcount[0] = mapcount - 1;
+ folio_set_mm_id(folio, 0, vma->vm_mm->mm_id);
+}
+
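+/*
+ * Example: folio F is mapped only by MM A (slot 0 owned by A, shared bit
+ * clear). When fork() maps F into child MM B, slot 1 is still free, so B
+ * takes it and F is marked "maybe mapped shared". If both slots had already
+ * been owned by other MMs, only the total mapcount would change: in that
+ * case, the shared bit is necessarily set already.
+ */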
+static __always_inline void folio_add_large_mapcount(struct folio *folio,
+ int diff, struct vm_area_struct *vma)
+{
+ const mm_id_t mm_id = vma->vm_mm->mm_id;
+ int new_mapcount_val;
+
+ folio_lock_large_mapcount(folio);
+ __folio_large_mapcount_sanity_checks(folio, diff, mm_id);
+
+ new_mapcount_val = atomic_read(&folio->_large_mapcount) + diff;
+ atomic_set(&folio->_large_mapcount, new_mapcount_val);
+
+ /*
+ * If a folio is mapped more than once into an MM on 32bit, we
+ * can in theory overflow the per-MM mapcount (although only for
+ * fairly large folios), turning it negative. In that case, just
+ * free up the slot and mark the folio "mapped shared", otherwise
+ * we might be in trouble when unmapping pages later.
+ */
+ if (folio_mm_id(folio, 0) == mm_id) {
+ folio->_mm_id_mapcount[0] += diff;
+ if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[0] < 0)) {
+ folio->_mm_id_mapcount[0] = -1;
+ folio_set_mm_id(folio, 0, MM_ID_DUMMY);
+ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
+ }
+ } else if (folio_mm_id(folio, 1) == mm_id) {
+ folio->_mm_id_mapcount[1] += diff;
+ if (!IS_ENABLED(CONFIG_64BIT) && unlikely(folio->_mm_id_mapcount[1] < 0)) {
+ folio->_mm_id_mapcount[1] = -1;
+ folio_set_mm_id(folio, 1, MM_ID_DUMMY);
+ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
+ }
+ } else if (folio_mm_id(folio, 0) == MM_ID_DUMMY) {
+ folio_set_mm_id(folio, 0, mm_id);
+ folio->_mm_id_mapcount[0] = diff - 1;
+ /* We might have other mappings already. */
+ if (new_mapcount_val != diff - 1)
+ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
+ } else if (folio_mm_id(folio, 1) == MM_ID_DUMMY) {
+ folio_set_mm_id(folio, 1, mm_id);
+ folio->_mm_id_mapcount[1] = diff - 1;
+ /* Slot 0 certainly has mappings as well. */
+ folio->_mm_ids |= FOLIO_MM_IDS_SHARED_BIT;
+ }
+ folio_unlock_large_mapcount(folio);
+}
+
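+/*
+ * Continuing the example: when MM A later removes its last mapping of F,
+ * slot 0 goes negative and is released (reset to MM_ID_DUMMY). MM B's
+ * per-MM mapcount now matches the total mapcount, so F is marked
+ * exclusively mapped again.
+ */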
+static __always_inline void folio_sub_large_mapcount(struct folio *folio,
+ int diff, struct vm_area_struct *vma)
+{
+ const mm_id_t mm_id = vma->vm_mm->mm_id;
+ int new_mapcount_val;
+
+ folio_lock_large_mapcount(folio);
+ __folio_large_mapcount_sanity_checks(folio, diff, mm_id);
+
+ new_mapcount_val = atomic_read(&folio->_large_mapcount) - diff;
+ atomic_set(&folio->_large_mapcount, new_mapcount_val);
+
+ /*
+ * There are valid corner cases where we might underflow a per-MM
+ * mapcount (some mappings added when no slot was free, some mappings
+ * added once a slot was free), so we always set it to -1 once we go
+ * negative.
+ */
+ if (folio_mm_id(folio, 0) == mm_id) {
+ folio->_mm_id_mapcount[0] -= diff;
+ if (folio->_mm_id_mapcount[0] >= 0)
+ goto out;
+ folio->_mm_id_mapcount[0] = -1;
+ folio_set_mm_id(folio, 0, MM_ID_DUMMY);
+ } else if (folio_mm_id(folio, 1) == mm_id) {
+ folio->_mm_id_mapcount[1] -= diff;
+ if (folio->_mm_id_mapcount[1] >= 0)
+ goto out;
+ folio->_mm_id_mapcount[1] = -1;
+ folio_set_mm_id(folio, 1, MM_ID_DUMMY);
+ }
+
+ /*
+ * If one MM slot owns all mappings, the folio is mapped exclusively.
+ * Note that if the folio is now unmapped (new_mapcount_val == -1), both
+ * slots must be free (mapcount == -1), and we'll also mark it as
+ * exclusive.
+ */
+ if (folio->_mm_id_mapcount[0] == new_mapcount_val ||
+ folio->_mm_id_mapcount[1] == new_mapcount_val)
+ folio->_mm_ids &= ~FOLIO_MM_IDS_SHARED_BIT;
+out:
+ folio_unlock_large_mapcount(folio);
+}
+#else /* !CONFIG_MM_ID */
+/*
+ * See __folio_rmap_sanity_checks(): we might map large folios even without
+ * CONFIG_TRANSPARENT_HUGEPAGE. We'll keep that working for now.
+ */
static inline void folio_set_large_mapcount(struct folio *folio, int mapcount,
		struct vm_area_struct *vma)
{
	/* Note: mapcounts start at -1. */
	atomic_set(&folio->_large_mapcount, mapcount - 1);
}

static inline void folio_sub_large_mapcount(struct folio *folio,
		int diff, struct vm_area_struct *vma)
{
	atomic_sub(diff, &folio->_large_mapcount);
}
+#endif /* CONFIG_MM_ID */
#define folio_inc_large_mapcount(folio, vma) \
folio_add_large_mapcount(folio, 1, vma)