www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
ext2: Add locking for DAX faults
author: Ross Zwisler <ross.zwisler@linux.intel.com>
Tue, 13 Oct 2015 22:25:37 +0000 (16:25 -0600)
committer: Dan Duval <dan.duval@oracle.com>
Wed, 7 Dec 2016 17:19:59 +0000 (12:19 -0500)
Orabug: 22913653

Add locking to ensure that DAX faults are isolated from ext2 operations
that modify the data blocks allocation for an inode.  This is intended to
be analogous to the work being done in XFS by Dave Chinner:

http://www.spinics.net/lists/linux-fsdevel/msg90260.html

Compared with XFS, the ext2 case is greatly simplified by the fact that ext2
already allocates and zeroes new blocks before they are returned as part of
ext2_get_block(), so DAX doesn't need to worry about getting unmapped or
unwritten buffer heads.

This means that the only work we need to do in ext2 is to isolate the DAX
faults from inode block allocation changes.  I believe this just means that
we need to isolate the DAX faults from truncate operations.

The newly introduced dax_sem is intended to replicate the protection
offered by i_mmaplock in XFS.  In addition to truncate, i_mmaplock also
protects XFS operations such as hole punching, fallocate down, extent
manipulation ioctls like xfs_ioc_space(), and extent swapping.  Truncate is
the only one of these operations supported by ext2.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.com>
(cherry picked from commit 5726b27b09cc92452b543764899a07e7c8037edd)
Signed-off-by: Dan Duval <dan.duval@oracle.com>
fs/ext2/ext2.h
fs/ext2/file.c
fs/ext2/inode.c
fs/ext2/super.c

diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 8d15febd0aa38db2c87031f6473335e8bfea3aac..4c69c94cafd84d44a16c5128d750918ec6d9aff3 100644 (file)
@@ -684,6 +684,9 @@ struct ext2_inode_info {
        struct rw_semaphore xattr_sem;
 #endif
        rwlock_t i_meta_lock;
+#ifdef CONFIG_FS_DAX
+       struct rw_semaphore dax_sem;
+#endif
 
        /*
         * truncate_mutex is for serialising ext2_truncate() against
@@ -699,6 +702,14 @@ struct ext2_inode_info {
 #endif
 };
 
+#ifdef CONFIG_FS_DAX
+#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem)
+#define dax_sem_up_write(ext2_inode)   up_write(&(ext2_inode)->dax_sem)
+#else
+#define dax_sem_down_write(ext2_inode)
+#define dax_sem_up_write(ext2_inode)
+#endif
+
 /*
  * Inode dynamic state flags
  */
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 1982c3f11aec421f871c57a701f78fc3b6488d8d..11a42c5a09aee553bd85db0154be89dbd1b374cc 100644 (file)
 #include "acl.h"
 
 #ifdef CONFIG_FS_DAX
+/*
+ * The lock ordering for ext2 DAX fault paths is:
+ *
+ * mmap_sem (MM)
+ *   sb_start_pagefault (vfs, freeze)
+ *     ext2_inode_info->dax_sem
+ *       address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX)
+ *         ext2_inode_info->truncate_mutex
+ *
+ * The default page_lock and i_size verification done by non-DAX fault paths
+ * is sufficient because ext2 doesn't support hole punching.
+ */
 static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_fault(vma, vmf, ext2_get_block, NULL);
+       struct inode *inode = file_inode(vma->vm_file);
+       struct ext2_inode_info *ei = EXT2_I(inode);
+       int ret;
+
+       if (vmf->flags & FAULT_FLAG_WRITE) {
+               sb_start_pagefault(inode->i_sb);
+               file_update_time(vma->vm_file);
+       }
+       down_read(&ei->dax_sem);
+
+       ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+
+       up_read(&ei->dax_sem);
+       if (vmf->flags & FAULT_FLAG_WRITE)
+               sb_end_pagefault(inode->i_sb);
+       return ret;
 }
 
 static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
                                                pmd_t *pmd, unsigned int flags)
 {
-       return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+       struct inode *inode = file_inode(vma->vm_file);
+       struct ext2_inode_info *ei = EXT2_I(inode);
+       int ret;
+
+       if (flags & FAULT_FLAG_WRITE) {
+               sb_start_pagefault(inode->i_sb);
+               file_update_time(vma->vm_file);
+       }
+       down_read(&ei->dax_sem);
+
+       ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+
+       up_read(&ei->dax_sem);
+       if (flags & FAULT_FLAG_WRITE)
+               sb_end_pagefault(inode->i_sb);
+       return ret;
 }
 
 static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
+       struct inode *inode = file_inode(vma->vm_file);
+       struct ext2_inode_info *ei = EXT2_I(inode);
+       int ret;
+
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+       down_read(&ei->dax_sem);
+
+       ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL);
+
+       up_read(&ei->dax_sem);
+       sb_end_pagefault(inode->i_sb);
+       return ret;
+}
+
+static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
+               struct vm_fault *vmf)
+{
+       struct inode *inode = file_inode(vma->vm_file);
+       struct ext2_inode_info *ei = EXT2_I(inode);
+       int ret = VM_FAULT_NOPAGE;
+       loff_t size;
+
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+       down_read(&ei->dax_sem);
+
+       /* check that the faulting page hasn't raced with truncate */
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (vmf->pgoff >= size)
+               ret = VM_FAULT_SIGBUS;
+
+       up_read(&ei->dax_sem);
+       sb_end_pagefault(inode->i_sb);
+       return ret;
 }
 
 static const struct vm_operations_struct ext2_dax_vm_ops = {
        .fault          = ext2_dax_fault,
        .pmd_fault      = ext2_dax_pmd_fault,
        .page_mkwrite   = ext2_dax_mkwrite,
-       .pfn_mkwrite    = dax_pfn_mkwrite,
+       .pfn_mkwrite    = ext2_dax_pfn_mkwrite,
 };
 
 static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index fd13c959958c8d38f431ea66274e82a7c9f4bd86..e879b0bfd06da13b26b260ef6d4fa2f661c0a697 100644 (file)
@@ -1085,6 +1085,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
                ext2_free_data(inode, p, q);
 }
 
+/* dax_sem must be held when calling this function */
 static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
 {
        __le32 *i_data = EXT2_I(inode)->i_data;
@@ -1100,6 +1101,10 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
        blocksize = inode->i_sb->s_blocksize;
        iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
 
+#ifdef CONFIG_FS_DAX
+       WARN_ON(!rwsem_is_locked(&ei->dax_sem));
+#endif
+
        n = ext2_block_to_path(inode, iblock, offsets, NULL);
        if (n == 0)
                return;
@@ -1185,7 +1190,10 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
                return;
        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
                return;
+
+       dax_sem_down_write(EXT2_I(inode));
        __ext2_truncate_blocks(inode, offset);
+       dax_sem_up_write(EXT2_I(inode));
 }
 
 static int ext2_setsize(struct inode *inode, loff_t newsize)
@@ -1213,8 +1221,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
        if (error)
                return error;
 
+       dax_sem_down_write(EXT2_I(inode));
        truncate_setsize(inode, newsize);
        __ext2_truncate_blocks(inode, newsize);
+       dax_sem_up_write(EXT2_I(inode));
 
        inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
        if (inode_needs_sync(inode)) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index d0e746e965118f9dd6410f1b65396e5650cf483a..68e85b664d4a5883671a4b007a261e66d61cc9d4 100644 (file)
@@ -192,6 +192,9 @@ static void init_once(void *foo)
        init_rwsem(&ei->xattr_sem);
 #endif
        mutex_init(&ei->truncate_mutex);
+#ifdef CONFIG_FS_DAX
+       init_rwsem(&ei->dax_sem);
+#endif
        inode_init_once(&ei->vfs_inode);
 }