#define PGOFF_LOFFT_MAX \
(((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))
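The mask covers the top PAGE_SHIFT + 1 bits of a page offset: if any of them is set, pgoff shifted left by PAGE_SHIFT no longer fits a signed loff_t. A small userspace sketch of that arithmetic, not part of the patch, assuming a typical 64-bit build with 4 KiB pages:

/* Illustrative only: mirrors the PGOFF_LOFFT_MAX check in userspace. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12	/* assumed: 4 KiB base pages */
#define BITS_PER_LONG	64	/* assumed: 64-bit build */
#define PGOFF_LOFFT_MAX \
	(((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1)))

int main(void)
{
	/* Smallest pgoff the mask rejects: bit 51 set. */
	unsigned long pgoff = 1UL << (BITS_PER_LONG - PAGE_SHIFT - 1);
	int64_t byte_off = (int64_t)(pgoff << PAGE_SHIFT);

	printf("mask = %#lx, pgoff %#lx rejected: %d\n",
	       PGOFF_LOFFT_MAX, pgoff, !!(pgoff & PGOFF_LOFFT_MAX));
	/* The shifted offset lands in the sign bit, i.e. overflows loff_t. */
	printf("pgoff << PAGE_SHIFT = %lld\n", (long long)byte_off);
	return 0;
}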
-static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
+static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma)
{
+ /* Unfortunately we have to reassign vma->vm_private_data, hence the const cast. */
+ return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma);
+}
+
+static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc)
+{
+ struct file *file = desc->file;
struct inode *inode = file_inode(file);
loff_t len, vma_len;
int ret;
* way when do_mmap unwinds (may be important on powerpc
* and ia64).
*/
- vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
- vma->vm_ops = &hugetlb_vm_ops;
+ desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
+ desc->vm_ops = &hugetlb_vm_ops;
/*
* page based offset in vm_pgoff could be sufficiently large to
* sizeof(unsigned long). So, only check in those instances.
*/
if (sizeof(unsigned long) == sizeof(loff_t)) {
- if (vma->vm_pgoff & PGOFF_LOFFT_MAX)
+ if (desc->pgoff & PGOFF_LOFFT_MAX)
return -EINVAL;
}
/* must be huge page aligned */
- if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
+ if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT))
return -EINVAL;
- vma_len = (loff_t)(vma->vm_end - vma->vm_start);
- len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ vma_len = (loff_t)vma_desc_size(desc);
+ len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT);
/* check for overflow */
if (len < vma_len)
return -EINVAL;
ret = -ENOMEM;
- vm_flags = vma->vm_flags;
+ vm_flags = desc->vm_flags;
/*
* for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
* reserving here. Note: only for SHM hugetlbfs file, the inode
vm_flags |= VM_NORESERVE;
if (hugetlb_reserve_pages(inode,
- vma->vm_pgoff >> huge_page_order(h),
- len >> huge_page_shift(h), vma,
- vm_flags) < 0)
+ desc->pgoff >> huge_page_order(h),
+ len >> huge_page_shift(h), desc,
+ vm_flags) < 0)
goto out;
ret = 0;
- if (vma->vm_flags & VM_WRITE && inode->i_size < len)
+ if ((desc->vm_flags & VM_WRITE) && inode->i_size < len)
i_size_write(inode, len);
out:
inode_unlock(inode);
+ if (!ret) {
+ /* Allocate the VMA lock only once the VMA itself has been set up. */
+ desc->action.success_hook = hugetlb_file_mmap_prepare_success;
+ /*
+ * We cannot permit rmap to find this VMA in the window between
+ * the VMA being inserted into the VMA tree and the
+ * completion/success hook being invoked.
+ *
+ * This is because the hook establishes a per-VMA hugetlb lock
+ * which rmap could otherwise race against.
+ */
+ desc->action.hide_from_rmap_until_complete = true;
+ }
return ret;
}
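The hugetlb_reserve_pages() call above converts the base-page pgoff into a huge-page index and the byte length into a huge-page count. A worked userspace example of that conversion, not from the patch, assuming 4 KiB base pages and a 2 MiB huge page size:

#include <stdio.h>

#define PAGE_SHIFT		12				/* assumed: 4 KiB base pages */
#define HUGE_PAGE_SHIFT		21				/* assumed: 2 MiB huge pages */
#define HUGE_PAGE_ORDER		(HUGE_PAGE_SHIFT - PAGE_SHIFT)	/* 9 */

int main(void)
{
	unsigned long pgoff = 1024;			/* mapping starts 4 MiB into the file */
	unsigned long long vma_len = 4ULL << 20;	/* 4 MiB mapping */
	unsigned long long len = vma_len + ((unsigned long long)pgoff << PAGE_SHIFT);

	/* from/to as passed to hugetlb_reserve_pages(): huge pages [2, 4). */
	printf("from = %lu\n", pgoff >> HUGE_PAGE_ORDER);
	printf("to   = %llu\n", len >> HUGE_PAGE_SHIFT);
	return 0;
}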
static const struct file_operations hugetlbfs_file_operations = {
.read_iter = hugetlbfs_read_iter,
- .mmap = hugetlbfs_file_mmap,
+ .mmap_prepare = hugetlbfs_file_mmap_prepare,
.fsync = noop_fsync,
.get_unmapped_area = hugetlb_get_unmapped_area,
.llseek = default_llseek,
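The same prepare/success split generalises to other .mmap_prepare users. Below is a minimal, hedged sketch for a hypothetical driver, relying only on the vm_area_desc fields this patch itself uses; the foo_* names and the empty vm_operations_struct are placeholders, not existing kernel API, and the sketch assumes the desc/action interface exactly as shown above.

#include <linux/fs.h>
#include <linux/mm.h>

static const struct vm_operations_struct foo_vm_ops = { };

/* Invoked only after the VMA has been inserted into the VMA tree. */
static int foo_mmap_success(const struct vm_area_struct *vma)
{
	/* Safe point to allocate/publish per-VMA state, as hugetlbfs does. */
	return 0;
}

static int foo_mmap_prepare(struct vm_area_desc *desc)
{
	desc->vm_flags |= VM_DONTEXPAND;
	desc->vm_ops = &foo_vm_ops;
	/* Defer anything rmap must not observe half-initialised. */
	desc->action.success_hook = foo_mmap_success;
	desc->action.hide_from_rmap_until_complete = true;
	return 0;
}

static const struct file_operations foo_fops = {
	.mmap_prepare = foo_mmap_prepare,
};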
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start, unsigned long end, bool take_locks);
}
}
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+/*
+ * vma specific semaphore used for pmd sharing and fault/truncation
+ * synchronization
+ */
+int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
{
struct hugetlb_vma_lock *vma_lock;
/* Only establish in (flags) sharable vmas */
if (!vma || !(vma->vm_flags & VM_MAYSHARE))
- return;
+ return 0;
/* Should never get here with non-NULL vm_private_data */
if (vma->vm_private_data)
- return;
+ return -EINVAL;
vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
if (!vma_lock) {
* allocation failure.
*/
pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
- return;
+ return -ENOMEM;
}
kref_init(&vma_lock->refs);
init_rwsem(&vma_lock->rw_sema);
vma_lock->vma = vma;
vma->vm_private_data = vma_lock;
+
+ return 0;
}
/* Helper that removes a struct file_region from the resv_map cache and returns
}
}
-static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
{
- VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
- VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+ VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma);
+ VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma);
- set_vma_private_data(vma, (unsigned long)map);
+ set_vma_private_data(vma, get_vma_private_data(vma) | flags);
}
-static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map)
{
- VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
- VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+ VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+ VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
- set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+ desc->private_data = map;
+}
+
+static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags)
+{
+ VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+ VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
+
+ desc->private_data = (void *)((unsigned long)desc->private_data | flags);
}
static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
return (get_vma_private_data(vma) & flag) != 0;
}
+static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag)
+{
+ VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+
+ return ((unsigned long)desc->private_data) & flag;
+}
+
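The desc->private_data helpers above reuse the existing hugetlb trick of packing flag bits into the low bits of the (aligned) resv_map pointer. A standalone userspace sketch of that packing, not part of the patch; the HPAGE_RESV_* values are defined locally for the demo and simply mirror hugetlb's low-bit flags:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define HPAGE_RESV_OWNER	(1UL << 0)
#define HPAGE_RESV_UNMAPPED	(1UL << 1)
#define HPAGE_RESV_MASK		(HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)

struct resv_map { int dummy; };

int main(void)
{
	/* malloc() alignment guarantees the low bits of the pointer are zero. */
	struct resv_map *map = malloc(sizeof(*map));
	void *private_data;

	/* set_vma_desc_resv_map(): store the bare pointer. */
	private_data = map;
	/* set_vma_desc_resv_flags(): OR a flag into the low bits. */
	private_data = (void *)((unsigned long)private_data | HPAGE_RESV_OWNER);

	/* is_vma_desc_resv_set(): test a flag. */
	assert((unsigned long)private_data & HPAGE_RESV_OWNER);
	/* Mask the flag bits off to recover the resv_map pointer. */
	assert((struct resv_map *)((unsigned long)private_data & ~HPAGE_RESV_MASK) == map);

	puts("flags packed into the low pointer bits as expected");
	free(map);
	return 0;
}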
bool __vma_private_lock(struct vm_area_struct *vma)
{
return !(vma->vm_flags & VM_MAYSHARE) &&
*/
long hugetlb_reserve_pages(struct inode *inode,
- long from, long to,
- struct vm_area_struct *vma,
- vm_flags_t vm_flags)
+ long from, long to,
+ struct vm_area_desc *desc,
+ vm_flags_t vm_flags)
{
long chg = -1, add = -1, spool_resv, gbl_resv;
struct hstate *h = hstate_inode(inode);
return -EINVAL;
}
- /*
- * vma specific semaphore used for pmd sharing and fault/truncation
- * synchronization
- */
- hugetlb_vma_lock_alloc(vma);
-
/*
* Only apply hugepage reservation if asked. At fault time, an
* attempt will be made for VM_NORESERVE to allocate a page
* Shared mappings base their reservation on the number of pages that
* are already allocated on behalf of the file. Private mappings need
* to reserve the full area even if read-only as mprotect() may be
- * called to make the mapping read-write. Assume !vma is a shm mapping
+ * called to make the mapping read-write. Assume !desc is a shm mapping
*/
- if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ if (!desc || desc->vm_flags & VM_MAYSHARE) {
/*
* resv_map can not be NULL as hugetlb_reserve_pages is only
* called for inodes for which resv_maps were created (see
chg = to - from;
- set_vma_resv_map(vma, resv_map);
- set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+ set_vma_desc_resv_map(desc, resv_map);
+ set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER);
}
if (chg < 0)
chg * pages_per_huge_page(h), &h_cg) < 0)
goto out_err;
- if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
+ if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) {
/* For private mappings, the hugetlb_cgroup uncharge info hangs
* of the resv_map.
*/
* consumed reservations are stored in the map. Hence, nothing
* else has to be done for private mappings here
*/
- if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ if (!desc || desc->vm_flags & VM_MAYSHARE) {
add = region_add(resv_map, from, to, regions_needed, h, h_cg);
if (unlikely(add < 0)) {
hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
chg * pages_per_huge_page(h), h_cg);
out_err:
- hugetlb_vma_lock_free(vma);
- if (!vma || vma->vm_flags & VM_MAYSHARE)
+ if (!desc || desc->vm_flags & VM_MAYSHARE)
/* Only call region_abort if the region_chg succeeded but the
* region_add failed or didn't run.
*/
if (chg >= 0 && add < 0)
region_abort(resv_map, from, to, regions_needed);
- if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) {
kref_put(&resv_map->refs, resv_map_release);
- set_vma_resv_map(vma, NULL);
+ set_vma_desc_resv_map(desc, NULL);
}
return chg < 0 ? chg : add < 0 ? add : -EINVAL;
}