}
}
+/*
+ * Called with hugetlb fault mutex held.
+ * Returns true if the folio was actually removed, false otherwise.
+ */
+static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
+ struct address_space *mapping,
+ struct folio *folio, pgoff_t index,
+ bool truncate_op)
+{
+ bool ret = false;
+
+ /*
+ * If folio is mapped, it was faulted in after being
+ * unmapped in caller. Unmap (again) while holding
+ * the fault mutex. The mutex will prevent faults
+ * until we finish removing the folio.
+ */
+ if (unlikely(folio_mapped(folio))) {
+ i_mmap_lock_write(mapping);
+ hugetlb_vmdelete_list(&mapping->i_mmap,
+ index * pages_per_huge_page(h),
+ (index + 1) * pages_per_huge_page(h),
+ ZAP_FLAG_DROP_MARKER);
+ i_mmap_unlock_write(mapping);
+ }
+
+ folio_lock(folio);
+ /*
+ * After locking the folio, make sure the mapping is the same.
+ * We could have raced with page fault populate and
+ * backout code.
+ */
+ if (folio_mapping(folio) == mapping) {
+ /*
+ * We must remove the folio from the page cache before removing
+ * the region/reserve map (hugetlb_unreserve_pages). In rare
+ * out of memory conditions, removal of the region/reserve map
+ * could fail. Correspondingly, the subpool and global reserve
+ * usage count may need to be adjusted.
+ */
+ VM_BUG_ON(HPageRestoreReserve(&folio->page));
+ hugetlb_delete_from_page_cache(&folio->page);
+ ret = true;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(inode, index,
+ index + 1, 1)))
+ hugetlb_fix_reserve_counts(inode);
+ }
+ }
+
+ folio_unlock(folio);
+ return ret;
+}
+
+/*
+ * Take the hugetlb fault mutex for a set of inode indices.
+ * Check for and remove any folios found, returning the number
+ * of folios removed.
+ */
+static long fault_lock_inode_indices(struct hstate *h,
+ struct inode *inode,
+ struct address_space *mapping,
+ pgoff_t start, pgoff_t end,
+ bool truncate_op)
+{
+ struct folio *folio;
+ long freed = 0;
+ pgoff_t index;
+ u32 hash;
+
+ for (index = start; index < end; index++) {
+ hash = hugetlb_fault_mutex_hash(mapping, index);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
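+ /* A racing fault may have added a folio at this index; remove it if found. */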
+ folio = filemap_get_folio(mapping, index);
+ if (folio) {
+ if (remove_inode_single_folio(h, inode, mapping, folio,
+ index, truncate_op))
+ freed++;
+ folio_put(folio);
+ }
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ }
+
+ return freed;
+}
+
/*
* remove_inode_hugepages handles two distinct cases: truncation and hole
* punch. There are subtle differences in operation for each case.
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
* After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
- * maps and global counts. Page faults can not race with truncation
- * in this routine. hugetlb_no_page() prevents page faults in the
- * truncated range. It checks i_size before allocation, and again after
- * with the page table lock for the page held. The same lock must be
- * acquired to unmap a page.
+ * maps and global counts. Page faults can race with truncation.
+ * During faults, hugetlb_no_page() checks i_size before page allocation,
+ * and again after obtaining page table lock. It will 'back out'
+ * allocations in the truncated range.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
* Only when releasing a page is the associated region/reserve map
* deleted. The region/reserve map for ranges without associated
* pages is not modified. Page faults can race with hole punch.
* This is indicated if we find a mapped page.
* Note: If the passed end of range value is beyond the end of file, but
* not LLONG_MAX this routine still performs a hole punch operation.
+ *
+ * Since page faults can race with this routine, care must be taken as both
+ * modify huge page reservation data. To somewhat synchronize these operations
+ * the hugetlb fault mutex is taken for EVERY index in the range to be hole
+ * punched or truncated. In this way, we KNOW either:
+ * - fault code has added a page beyond i_size, and we will remove it here
+ * - fault code will see the updated i_size and not add a page beyond it
+ * The parameter 'lm_end' indicates the offset of the end of hole or file
+ * before truncation. For hole punch lm_end == lend.
*/
static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
- loff_t lend)
+ loff_t lend, loff_t lm_end)
{
struct hstate *h = hstate_inode(inode);
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
const pgoff_t end = lend >> huge_page_shift(h);
+ pgoff_t m_end = lm_end >> huge_page_shift(h);
+ pgoff_t m_start;
struct folio_batch fbatch;
+ struct folio *folio;
pgoff_t next, index;
- int i, freed = 0;
+ unsigned int i;
+ long freed = 0;
+ u32 hash;
bool truncate_op = (lend == LLONG_MAX);
folio_batch_init(&fbatch);
- next = start;
+ next = m_start = start;
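+ /* m_start tracks the first index not yet covered by a fault mutex pass. */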
while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
for (i = 0; i < folio_batch_count(&fbatch); ++i) {
- struct folio *folio = fbatch.folios[i];
- u32 hash = 0;
+ folio = fbatch.folios[i];
index = folio->index;
- hash = hugetlb_fault_mutex_hash(mapping, index);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
/*
- * If folio is mapped, it was faulted in after being
- * unmapped in caller. Unmap (again) now after taking
- * the fault mutex. The mutex will prevent faults
- * until we finish removing the folio.
- *
- * This race can only happen in the hole punch case.
- * Getting here in a truncate operation is a bug.
+ * Take fault mutex for missing folios before index,
+ * while checking folios that might have been added
+ * due to a race with fault code.
*/
- if (unlikely(folio_mapped(folio))) {
- BUG_ON(truncate_op);
-
- i_mmap_lock_write(mapping);
- hugetlb_vmdelete_list(&mapping->i_mmap,
- index * pages_per_huge_page(h),
- (index + 1) * pages_per_huge_page(h),
- ZAP_FLAG_DROP_MARKER);
- i_mmap_unlock_write(mapping);
- }
+ freed += fault_lock_inode_indices(h, inode, mapping,
+ m_start, index, truncate_op);
+ m_start = index + 1;
- folio_lock(folio);
/*
- * We must free the huge page and remove from page
- * cache BEFORE removing the * region/reserve map
- * (hugetlb_unreserve_pages). In rare out of memory
- * conditions, removal of the region/reserve map could
- * fail. Correspondingly, the subpool and global
- * reserve usage count can need to be adjusted.
+ * Remove folio that was part of folio_batch.
*/
- VM_BUG_ON(HPageRestoreReserve(&folio->page));
- hugetlb_delete_from_page_cache(&folio->page);
- freed++;
- if (!truncate_op) {
- if (unlikely(hugetlb_unreserve_pages(inode,
- index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
- }
-
- folio_unlock(folio);
+ hash = hugetlb_fault_mutex_hash(mapping, index);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+ if (remove_inode_single_folio(h, inode, mapping, folio,
+ index, truncate_op))
+ freed++;
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
folio_batch_release(&fbatch);
cond_resched();
}
+ /*
+ * Take fault mutex for missing folios at end of range while checking
+ * for folios that might have been added due to a race with fault code.
+ */
+ freed += fault_lock_inode_indices(h, inode, mapping, m_start, m_end,
+ truncate_op);
+
if (truncate_op)
(void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed);
}
static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
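+ /* i_size before removal; passed as lm_end below. */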
+ loff_t prev_size = i_size_read(inode);
- remove_inode_hugepages(inode, 0, LLONG_MAX);
+ remove_inode_hugepages(inode, 0, LLONG_MAX, prev_size);
/*
* Get the resv_map from the address space embedded in the inode.
pgoff_t pgoff;
struct address_space *mapping = inode->i_mapping;
struct hstate *h = hstate_inode(inode);
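+ /* File size prior to truncation; passed as lm_end below. */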
+ loff_t prev_size = i_size_read(inode);
BUG_ON(offset & ~huge_page_mask(h));
pgoff = offset >> PAGE_SHIFT;
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
ZAP_FLAG_DROP_MARKER);
i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, offset, LLONG_MAX);
+ remove_inode_hugepages(inode, offset, LLONG_MAX, prev_size);
}
static void hugetlbfs_zero_partial_page(struct hstate *h,
/* Remove full pages from the file. */
if (hole_end > hole_start)
- remove_inode_hugepages(inode, hole_start, hole_end);
+ remove_inode_hugepages(inode, hole_start, hole_end, hole_end);
inode_unlock(inode);
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
bool new_page, new_pagecache_page = false;
+ bool reserve_alloc = false;
/*
* Currently, we are forced to kill the process in the event the
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
new_page = true;
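+ /* Remember if the page consumed a reservation so the flag can be re-set on error. */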
+ if (HPageRestoreReserve(page))
+ reserve_alloc = true;
if (vma->vm_flags & VM_MAYSHARE) {
- int err = hugetlb_add_to_page_cache(page, mapping, idx);
+ int err;
+
+ err = hugetlb_add_to_page_cache(page, mapping, idx);
if (err) {
/*
* err can't be -EEXIST which implies someone
}
ptl = huge_pte_lock(h, mm, ptep);
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto backout;
-
ret = 0;
/* If pte changed from under us, retry */
if (!pte_same(huge_ptep_get(ptep), old_pte))
backout:
spin_unlock(ptl);
backout_unlocked:
- unlock_page(page);
- /* restore reserve for newly allocated pages not in page cache */
- if (new_page && !new_pagecache_page)
+ if (new_page && !new_pagecache_page) {
+ /*
+ * If reserve was consumed, make sure flag is set so that it
+ * will be restored in free_huge_page().
+ */
+ if (reserve_alloc)
+ SetHPageRestoreReserve(page);
+
restore_reserve_on_error(h, vma, haddr, page);
+ }
+
+ unlock_page(page);
put_page(page);
goto out;
}
ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
spin_lock(ptl);
- /*
- * Recheck the i_size after holding PT lock to make sure not
- * to leave any page mapped (as page_mapped()) beyond the end
- * of the i_size (remove_inode_hugepages() is strict about
- * enforcing that). If we bail out here, we'll also leave a
- * page in the radix tree in the vm_shared case beyond the end
- * of the i_size, but remove_inode_hugepages() will take care
- * of it as soon as we drop the hugetlb_fault_mutex_table.
- */
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- ret = -EFAULT;
- if (idx >= size)
- goto out_release_unlock;
-
- ret = -EEXIST;
/*
* We allow to overwrite a pte marker: consider when both MISSING|WP
* registered, we firstly wr-protect a none pte which has no page cache
* page backing it, then access the page.
*/
+ ret = -EEXIST;
if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
goto out_release_unlock;