]> www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
ocfs2: serialize unaligned aio
authorMark Fasheh <mfasheh@suse.com>
Mon, 15 Aug 2011 17:08:44 +0000 (10:08 -0700)
committerMaxim Uvarov <maxim.uvarov@oracle.com>
Thu, 22 Mar 2012 00:17:23 +0000 (17:17 -0700)
[Pulled before push to mainline]

Fix a corruption that can happen when we have (two or more) outstanding
aio's to an overlapping unaligned region.  Ext4
(e9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d) and xfs recently had to fix
similar issues.

In our case what happens is that we can have an outstanding aio on a region
and if a write comes in with some bytes overlapping the original aio we may
decide to read that region into a page before continuing (typically because
of buffered-io fallback).  Since we have no ordering guarantees with the
aio, we can read stale or bad data into the page and then write it back out.

If the i/o is page and block aligned, then we avoid this issue as there
won't be any need to read data from disk.

I took the same approach as Eric in the ext4 patch and introduced some
serialization of unaligned async direct i/o.  I don't expect this to have an
effect on the most common cases of AIO.  Unaligned aio will be slower
though, but that's far more acceptable than data corruption.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
fs/ocfs2/aops.c
fs/ocfs2/aops.h
fs/ocfs2/file.c
fs/ocfs2/inode.h
fs/ocfs2/super.c

index cdd91f4e7339ef3dd07b617557b6e275c9b882ae..d4859fdbb9444bf88ba24c9bb227f684d37cabb1 100644 (file)
@@ -564,6 +564,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 {
        struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
        int level;
+       wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
 
        /* this io's submitter should not have unlocked this before we could */
        BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
@@ -573,6 +574,15 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
                ocfs2_iocb_clear_sem_locked(iocb);
        }
 
+       if (ocfs2_iocb_is_unaligned_aio(iocb)) {
+               ocfs2_iocb_clear_unaligned_aio(iocb);
+
+               if (atomic_dec_and_test(&OCFS2_I(inode)->ip_unaligned_aio) &&
+                   waitqueue_active(wq)) {
+                       wake_up_all(wq);
+               }
+       }
+
        ocfs2_iocb_clear_rw_locked(iocb);
 
        level = ocfs2_iocb_rw_locked_level(iocb);
index 75cf3ad987a66d911c15234a803243185ccc5a94..ffb2da370a99d05dd4b919fc64a5483dbc2df7a3 100644 (file)
@@ -78,6 +78,7 @@ enum ocfs2_iocb_lock_bits {
        OCFS2_IOCB_RW_LOCK = 0,
        OCFS2_IOCB_RW_LOCK_LEVEL,
        OCFS2_IOCB_SEM,
+       OCFS2_IOCB_UNALIGNED_IO,
        OCFS2_IOCB_NUM_LOCKS
 };
 
@@ -91,4 +92,17 @@ enum ocfs2_iocb_lock_bits {
        clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
 #define ocfs2_iocb_is_sem_locked(iocb) \
        test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private)
+
+#define ocfs2_iocb_set_unaligned_aio(iocb) \
+       set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_clear_unaligned_aio(iocb) \
+       clear_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+#define ocfs2_iocb_is_unaligned_aio(iocb) \
+       test_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private)
+
+#define OCFS2_IOEND_WQ_HASH_SZ 37
+#define ocfs2_ioend_wq(v)   (&ocfs2__ioend_wq[((unsigned long)(v)) %\
+                                           OCFS2_IOEND_WQ_HASH_SZ])
+extern wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
 #endif /* OCFS2_FILE_H */
index 224e38f33c914181bbff24e77dbf7e355e61333d..fa090b0a1e82ecbafd2c30cea8dbb54369e3e1b6 100644 (file)
@@ -2038,6 +2038,23 @@ out:
        return ret;
 }
 
+static void ocfs2_aiodio_wait(struct inode *inode)
+{
+       wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
+
+       wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+       int blockmask = inode->i_sb->s_blocksize - 1;
+       loff_t final_size = pos + count;
+
+       if ((pos & blockmask) || (final_size & blockmask))
+               return 1;
+       return 0;
+}
+
 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
                                            struct file *file,
                                            loff_t pos, size_t count,
@@ -2214,6 +2231,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        int full_coherency = !(osb->s_mount_opt &
                               OCFS2_MOUNT_COHERENCY_BUFFERED);
+       int unaligned_dio = 0;
 
        trace_ocfs2_file_write_iter(inode, file, file->f_path.dentry,
                (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2282,6 +2300,10 @@ relock:
                goto out;
        }
 
+       if (direct_io && !is_sync_kiocb(iocb))
+               unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+                                                     *ppos);
+
        /*
         * We can't complete the direct I/O as requested, fall back to
         * buffered I/O.
@@ -2297,6 +2319,18 @@ relock:
                goto relock;
        }
 
+       if (unaligned_dio) {
+               /*
+                * Wait on previous unaligned aio to complete before
+                * proceeding.
+                */
+               ocfs2_aiodio_wait(inode);
+
+               /* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
+               atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+               ocfs2_iocb_set_unaligned_aio(iocb);
+       }
+
        /*
         * To later detect whether a journal commit for sync writes is
         * necessary, we sample i_size, and cluster count here.
@@ -2365,8 +2399,12 @@ out_dio:
        if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
                rw_level = -1;
                have_alloc_sem = 0;
+               unaligned_dio = 0;
        }
 
+       if (unaligned_dio)
+               atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
+
 out:
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
index 1c508b149b3ac1bd4325fd33a9aae6bdb70e024a..88924a3133fae7c15ca3f5a5259b64eecd97022e 100644 (file)
@@ -43,6 +43,9 @@ struct ocfs2_inode_info
        /* protects extended attribute changes on this inode */
        struct rw_semaphore             ip_xattr_sem;
 
+       /* Number of outstanding AIO's which are not page aligned */
+       atomic_t                        ip_unaligned_aio;
+
        /* These fields are protected by ip_lock */
        spinlock_t                      ip_lock;
        u32                             ip_open_count;
index 51a831f9047119295b000fa7236418c83cafe867..d436c1f49684d55d68b56807152975357622ea33 100644 (file)
@@ -54,6 +54,7 @@
 #include "ocfs1_fs_compat.h"
 
 #include "alloc.h"
+#include "aops.h"
 #include "blockcheck.h"
 #include "dlmglue.h"
 #include "export.h"
@@ -1634,12 +1635,17 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
        return 0;
 }
 
+wait_queue_head_t ocfs2__ioend_wq[OCFS2_IOEND_WQ_HASH_SZ];
+
 static int __init ocfs2_init(void)
 {
-       int status;
+       int status, i;
 
        ocfs2_print_version();
 
+       for (i = 0; i < OCFS2_IOEND_WQ_HASH_SZ; i++)
+               init_waitqueue_head(&ocfs2__ioend_wq[i]);
+
        status = init_ocfs2_uptodate_cache();
        if (status < 0) {
                mlog_errno(status);
@@ -1781,7 +1787,7 @@ static void ocfs2_inode_init_once(void *data)
        ocfs2_extent_map_init(&oi->vfs_inode);
        INIT_LIST_HEAD(&oi->ip_io_markers);
        oi->ip_dir_start_lookup = 0;
-
+       atomic_set(&oi->ip_unaligned_aio, 0);
        init_rwsem(&oi->ip_alloc_sem);
        init_rwsem(&oi->ip_xattr_sem);
        mutex_init(&oi->ip_io_mutex);