}
}
+/*
+ * Called with hugetlb fault mutex held.
+ * Returns true if the folio was actually removed, false otherwise.
+ */
+static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
+ struct address_space *mapping,
+ struct folio *folio, pgoff_t index,
+ bool truncate_op)
+{
+ bool ret = false;
+
+ /*
+ * If folio is mapped, it was faulted in after being
+ * unmapped in caller. Unmap (again) while holding
+ * the fault mutex. The mutex will prevent faults
+ * until we finish removing the folio.
+ */
+ if (unlikely(folio_mapped(folio))) {
+ i_mmap_lock_write(mapping);
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ index * pages_per_huge_page(h),
+ (index + 1) * pages_per_huge_page(h),
+ ZAP_FLAG_DROP_MARKER);
+ i_mmap_unlock_write(mapping);
+ }
+
+ folio_lock(folio);
+ /*
+ * After locking the folio, make sure the mapping is the same.
+ * We could have raced with page fault populate and
+ * backout code.
+ */
+ if (folio_mapping(folio) == mapping) {
+ /*
+ * We must remove the folio from the page cache before removing
+ * the region/reserve map (hugetlb_unreserve_pages). In rare
+ * out of memory conditions, removal of the region/reserve map
+ * could fail. Correspondingly, the subpool and global reserve
+ * usage count may need to be adjusted.
+ */
+ VM_BUG_ON(HPageRestoreReserve(&folio->page));
+ hugetlb_delete_from_page_cache(&folio->page);
+ ret = true;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(inode, index,
+ index + 1, 1)))
+ hugetlb_fix_reserve_counts(inode);
+ }
+ }
+
+ folio_unlock(folio);
+ return ret;
+}
+
+/*
+ * Take the hugetlb fault mutex for a set of inode indices.
+ * Check for and remove any folios found, returning the number
+ * of folios removed.
+ */
+static long fault_lock_inode_indices(struct hstate *h,
+ struct inode *inode,
+ struct address_space *mapping,
+ pgoff_t start, pgoff_t end,
+ bool truncate_op)
+{
+ struct folio *folio;
+ long freed = 0;
+ pgoff_t index;
+ u32 hash;
+
+ for (index = start; index < end; index++) {
+ hash = hugetlb_fault_mutex_hash(mapping, index);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
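+ /* A racing fault may have added a folio at this index; remove it if found. */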
+ folio = filemap_get_folio(mapping, index);
+ if (folio) {
+ if (remove_inode_single_folio(h, inode, mapping, folio,
+ index, truncate_op))
+ freed++;
+ folio_put(folio);
+ }
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ }
+
+ return freed;
+}
+
/*
* remove_inode_hugepages handles two distinct cases: truncation and hole
* punch. There are subtle differences in operation for each case.
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
- * maps and global counts. Page faults can not race with truncation
- * in this routine. hugetlb_no_page() prevents page faults in the
- * truncated range. It checks i_size before allocation, and again after
- * with the page table lock for the page held. The same lock must be
- * acquired to unmap a page.
+ * maps and global counts. Page faults can race with truncation.
+ * During faults, hugetlb_no_page() checks i_size before page allocation,
+ * and again after obtaining page table lock. It will 'back out'
+ * allocations in the truncated range.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
* Only when releasing a page is the associated region/reserve map
* deleted. The region/reserve map for ranges without associated
* pages is not modified. Page faults can race with hole punch.
* This is indicated if we find a mapped page.
* Note: If the passed end of range value is beyond the end of file, but
* not LLONG_MAX this routine still performs a hole punch operation.
+ *
+ * Since page faults can race with this routine, care must be taken as both
+ * modify huge page reservation data. To somewhat synchronize these operations
+ * the hugetlb fault mutex is taken for EVERY index in the range to be hole
+ * punched or truncated. In this way, we KNOW either:
+ * - fault code has added a page beyond i_size, and we will remove it here
+ * - fault code will see the updated i_size and not add a page beyond it
+ * The parameter 'lm_end' indicates the offset of the end of hole or file
+ * before truncation. For hole punch lm_end == lend.
*/
static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
- loff_t lend)
+ loff_t lend, loff_t lm_end)
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
const pgoff_t end = lend >> huge_page_shift(h);
+ pgoff_t m_end = lm_end >> huge_page_shift(h);
+ pgoff_t m_start;
struct folio_batch fbatch;
+ struct folio *folio;
pgoff_t next, index;
- int i, freed = 0;
+ unsigned int i;
+ long freed = 0;
+ u32 hash;
bool truncate_op = (lend == LLONG_MAX);
folio_batch_init(&fbatch);
- next = start;
+ next = m_start = start;
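+ /* m_start tracks the first index not yet covered by a fault mutex pass. */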
while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
for (i = 0; i < folio_batch_count(&fbatch); ++i) {
- struct folio *folio = fbatch.folios[i];
- u32 hash = 0;
+ folio = fbatch.folios[i];
index = folio->index;
- hash = hugetlb_fault_mutex_hash(mapping, index);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
/*
- * If folio is mapped, it was faulted in after being
- * unmapped in caller. Unmap (again) now after taking
- * the fault mutex. The mutex will prevent faults
- * until we finish removing the folio.
- *
- * This race can only happen in the hole punch case.
- * Getting here in a truncate operation is a bug.
+ * Take fault mutex for missing folios before index,
+ * while checking folios that might have been added
+ * due to a race with fault code.
*/
- if (unlikely(folio_mapped(folio))) {
- BUG_ON(truncate_op);
-
- i_mmap_lock_write(mapping);
- hugetlb_vmdelete_list(&mapping->i_mmap,
- index * pages_per_huge_page(h),
- (index + 1) * pages_per_huge_page(h),
- ZAP_FLAG_DROP_MARKER);
- i_mmap_unlock_write(mapping);
- }
+ freed += fault_lock_inode_indices(h, inode, mapping,
+ m_start, index, truncate_op);
+ m_start = index + 1;
- folio_lock(folio);
/*
- * We must free the huge page and remove from page
- * cache BEFORE removing the * region/reserve map
- * (hugetlb_unreserve_pages). In rare out of memory
- * conditions, removal of the region/reserve map could
- * fail. Correspondingly, the subpool and global
- * reserve usage count can need to be adjusted.
+ * Remove folio that was part of folio_batch.
*/
- VM_BUG_ON(HPageRestoreReserve(&folio->page));
- hugetlb_delete_from_page_cache(&folio->page);
- freed++;
- if (!truncate_op) {
- if (unlikely(hugetlb_unreserve_pages(inode,
- index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
- }
-
- folio_unlock(folio);
+ hash = hugetlb_fault_mutex_hash(mapping, index);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ if (remove_inode_single_folio(h, inode, mapping, folio,
+ index, truncate_op))
+ freed++;
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
folio_batch_release(&fbatch);
cond_resched();
}
+ /*
+ * Take fault mutex for missing folios at end of range while checking
+ * for folios that might have been added due to a race with fault code.
+ */
+ freed += fault_lock_inode_indices(h, inode, mapping, m_start, m_end,
+ truncate_op);
+
if (truncate_op)
(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
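+ /* i_size before removal; passed as lm_end below. */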
+ loff_t prev_size = i_size_read(inode);
- remove_inode_hugepages(inode, 0, LLONG_MAX);
+ remove_inode_hugepages(inode, 0, LLONG_MAX, prev_size);
/*
* Get the resv_map from the address space embedded in the inode.
pgoff_t pgoff;
struct address_space *mapping = inode->i_mapping;
struct hstate *h = hstate_inode(inode);
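+ /* File size prior to truncation; passed as lm_end below. */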
+ loff_t prev_size = i_size_read(inode);
BUG_ON(offset & ~huge_page_mask(h));
pgoff = offset >> PAGE_SHIFT;
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
ZAP_FLAG_DROP_MARKER);
i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, offset, LLONG_MAX);
+ remove_inode_hugepages(inode, offset, LLONG_MAX, prev_size);
}
static void hugetlbfs_zero_partial_page(struct hstate *h,
/* Remove full pages from the file. */
if (hole_end > hole_start)
- remove_inode_hugepages(inode, hole_start, hole_end);
+ remove_inode_hugepages(inode, hole_start, hole_end, hole_end);
inode_unlock(inode);
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
bool new_page, new_pagecache_page = false;
+ bool reserve_alloc = false;
/*
* Currently, we are forced to kill the process in the event the
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
new_page = true;
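+ /* Remember if the page consumed a reservation so the flag can be re-set on error. */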
+ if (HPageRestoreReserve(page))
+ reserve_alloc = true;
if (vma->vm_flags & VM_MAYSHARE) {
- int err = hugetlb_add_to_page_cache(page, mapping, idx);
+ int err;
+
+ err = hugetlb_add_to_page_cache(page, mapping, idx);
if (err) {
/*
* err can't be -EEXIST which implies someone
}
ptl = huge_pte_lock(h, mm, ptep);
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto backout;
-
ret = 0;
/* If pte changed from under us, retry */
if (!pte_same(huge_ptep_get(ptep), old_pte))
backout:
spin_unlock(ptl);
backout_unlocked:
- unlock_page(page);
- /* restore reserve for newly allocated pages not in page cache */
- if (new_page && !new_pagecache_page)
+ if (new_page && !new_pagecache_page) {
+ /*
+ * If reserve was consumed, make sure flag is set so that it
+ * will be restored in free_huge_page().
+ */
+ if (reserve_alloc)
+ SetHPageRestoreReserve(page);
+
restore_reserve_on_error(h, vma, haddr, page);
+ }
+
+ unlock_page(page);
put_page(page);
goto out;
}
ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
spin_lock(ptl);
- /*
- * Recheck the i_size after holding PT lock to make sure not
- * to leave any page mapped (as page_mapped()) beyond the end
- * of the i_size (remove_inode_hugepages() is strict about
- * enforcing that). If we bail out here, we'll also leave a
- * page in the radix tree in the vm_shared case beyond the end
- * of the i_size, but remove_inode_hugepages() will take care
- * of it as soon as we drop the hugetlb_fault_mutex_table.
- */
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- ret = -EFAULT;
- if (idx >= size)
- goto out_release_unlock;
-
- ret = -EEXIST;
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
* page backing it, then access the page.
*/
+ ret = -EEXIST;
if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;