Revert "ocfs2: code clean up for direct io"

author Wengang Wang <wen.gang.wang@oracle.com>

Tue, 23 Jan 2018 17:16:59 +0000 (09:16 -0800)

committer Jack Vogel <jack.vogel@oracle.com>

Wed, 24 Jan 2018 22:43:54 +0000 (14:43 -0800)
author Wengang Wang <wen.gang.wang@oracle.com>
Tue, 23 Jan 2018 17:16:59 +0000 (09:16 -0800)
committer Jack Vogel <jack.vogel@oracle.com>
Wed, 24 Jan 2018 22:43:54 +0000 (14:43 -0800)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c

index 79a9e2d4f8bf40dba1eaebbaa879fca31e7dfb8e..53e71d2e0958e7955eb4bce3db4dc71e06a7094d 100644 (file)
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1405,6 +1405,44 @@ out:
         return ret;
  }
  
+/*
+ * Look for holes and unwritten extents in the count bytes starting at
+ * pos. Returns 1 if any is found, 0 if none, or a negative error code.
+ */
+static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
+                                      size_t count)
+{
+       int ret = 0;
+       unsigned int extent_flags;
+       u32 cpos, clusters, extent_len, phys_cpos;
+       struct super_block *sb = inode->i_sb;
+
+       cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+       clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+       while (clusters) {
+               ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+                                        &extent_flags);
+               if (ret < 0) {
+                       mlog_errno(ret);
+                       goto out;
+               }
+
+               if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
+                       ret = 1; /* hole or unwritten extent found */
+                       break;
+               }
+
+               if (extent_len > clusters)
+                       extent_len = clusters; /* clamp to remaining range */
+
+               clusters -= extent_len;
+               cpos += extent_len;
+       }
+out:
+       return ret;
+}
+
  static int ocfs2_write_remove_suid(struct inode *inode)
  {
         int ret;
@@ -2136,12 +2174,18 @@ out:
  
  static int ocfs2_prepare_inode_for_write(struct file *file,
                                          loff_t pos,
-                                        size_t count)
+                                        size_t count,
+                                        int appending,
+                                        int *direct_io,
+                                        int *has_refcount)
  {
         int ret = 0, meta_level = 0;
         struct dentry *dentry = file->f_path.dentry;
         struct inode *inode = d_inode(dentry);
         loff_t end;
+       struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+       int full_coherency = !(osb->s_mount_opt &
+               OCFS2_MOUNT_COHERENCY_BUFFERED);
  
         /*
          * We start with a read level meta lock and only jump to an ex
@@ -2190,6 +2234,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
                                                                pos,
                                                                count,
                                                                &meta_level);
+                       if (has_refcount)
+                               *has_refcount = 1;
+                       if (direct_io)
+                               *direct_io = 0;
                 }
  
                 if (ret < 0) {
@@ -2197,12 +2245,67 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
                         goto out_unlock;
                 }
  
+               /*
+                * Skip the O_DIRECT checks if we don't need
+                * them.
+                */
+               if (!direct_io || !(*direct_io))
+                       break;
+
+               /*
+                * There's no sane way to do direct writes to an inode
+                * with inline data.
+                */
+               if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+                       *direct_io = 0;
+                       break;
+               }
+
+               /*
+                * Allowing concurrent direct writes means
+                * i_size changes wouldn't be synchronized, so
+                * one node could wind up truncating another
+                * node's writes.
+                */
+               if (end > i_size_read(inode) && !full_coherency) {
+                       *direct_io = 0;
+                       break;
+               }
+
+               /*
+                * Fallback to old way if the feature bit is not set.
+                */
+               if (end > i_size_read(inode) &&
+                               !ocfs2_supports_append_dio(osb)) {
+                       *direct_io = 0;
+                       break;
+               }
+
+               /*
+                * We don't fill holes during direct io, so
+                * check for them here. If any are found, the
+                * caller will have to retake some cluster
+                * locks and initiate the io as buffered.
+                */
+               ret = ocfs2_check_range_for_holes(inode, pos, count);
+               if (ret == 1) {
+                       /*
+                        * Fallback to old way if the feature bit is not set.
+                        * Otherwise try dio first and then complete the rest
+                        * of the request through buffered io.
+                        */
+                       if (!ocfs2_supports_append_dio(osb))
+                               *direct_io = 0;
+                       ret = 0;
+               } else if (ret < 0)
+                       mlog_errno(ret);
                 break;
         }
  
  out_unlock:
         trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
-                                           pos, count);
+                                           pos, appending, count,
+                                           direct_io, has_refcount);
  
         if (meta_level >= 0)
                 ocfs2_inode_unlock(inode, meta_level);
@@ -2214,10 +2317,11 @@ out:
  static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
                                     struct iov_iter *from)
  {
-       int direct_io, rw_level, have_alloc_sem  = 0;
+       int direct_io, appending, rw_level, have_alloc_sem  = 0;
+       int can_do_direct, has_refcount = 0;
         ssize_t written = 0;
         ssize_t ret;
-       size_t count = iov_iter_count(from);
+       size_t count = iov_iter_count(from), orig_count;
         loff_t old_size;
         u32 old_clusters;
         struct file *file = iocb->ki_filp;
@@ -2226,6 +2330,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
         int full_coherency = !(osb->s_mount_opt &
                                OCFS2_MOUNT_COHERENCY_BUFFERED);
         int unaligned_dio = 0;
+       int dropped_dio = 0;
  
         trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
                 (unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2236,12 +2341,14 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
         if (count == 0)
                 return 0;
  
+       appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
         direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
  
         mutex_lock(&inode->i_mutex);
  
         ocfs2_iocb_clear_sem_locked(iocb);
  
+relock:
         /* to match setattr's i_mutex -> rw_lock ordering */
         if (direct_io) {
                 have_alloc_sem = 1;
@@ -2280,6 +2387,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
                 ocfs2_inode_unlock(inode, 1);
         }
  
+       orig_count = iov_iter_count(from);
         ret = generic_write_checks(iocb, from);
         if (ret <= 0) {
                 if (ret)
@@ -2288,7 +2396,9 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
         }
         count = ret;
  
-       ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
+       can_do_direct = direct_io;
+       ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
+                                           &can_do_direct, &has_refcount);
         if (ret < 0) {
                 mlog_errno(ret);
                 goto out;
@@ -2297,6 +2407,23 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
         if (direct_io && !is_sync_kiocb(iocb))
                 unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos);
  
+       /*
+        * We can't complete the direct I/O as requested, fall back to
+        * buffered I/O.
+        */
+       if (direct_io && !can_do_direct) {
+               ocfs2_rw_unlock(inode, rw_level);
+
+               have_alloc_sem = 0;
+               rw_level = -1;
+
+               direct_io = 0;
+               iocb->ki_flags &= ~IOCB_DIRECT;
+               iov_iter_reexpand(from, orig_count);
+               dropped_dio = 1;
+               goto relock;
+       }
+
         if (unaligned_dio) {
                 static unsigned long unaligned_warn_time;
                 /*
@@ -2352,7 +2479,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
                 goto no_sync;
  
         if (((file->f_flags & O_DSYNC) && !direct_io) ||
-           IS_SYNC(inode)) {
+           IS_SYNC(inode) || dropped_dio) {
                 /*
                  * There is an performance issue when we are doing a flush with
                  * WB_SYNC_ALL flag. block_write_full_page() will transfer it
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h

index 09d0c89a9daf963df206e45fc48cf8339d85064b..6cb019b7c6a83c4ec449baff12652e84b5fca06a 100644 (file)
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1450,20 +1450,28 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
  
  TRACE_EVENT(ocfs2_prepare_inode_for_write,
         TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
-                unsigned long count),
-       TP_ARGS(ino, saved_pos, count),
+                int appending, unsigned long count,
+                int *direct_io, int *has_refcount),
+       TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
         TP_STRUCT__entry(
                 __field(unsigned long long, ino)
                 __field(unsigned long long, saved_pos)
+               __field(int, appending)
                 __field(unsigned long, count)
+               __field(int, direct_io)
+               __field(int, has_refcount)
         ),
         TP_fast_assign(
                 __entry->ino = ino;
                 __entry->saved_pos = saved_pos;
+               __entry->appending = appending;
                 __entry->count = count;
+               __entry->direct_io = direct_io ? *direct_io : -1;
+               __entry->has_refcount = has_refcount ? *has_refcount : -1;
         ),
-       TP_printk("%llu %llu %lu", __entry->ino,
-                 __entry->saved_pos, __entry->count)
+       TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
+                 __entry->saved_pos, __entry->appending, __entry->count,
+                 __entry->direct_io, __entry->has_refcount)
  );
  
  DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
author	Wengang Wang <wen.gang.wang@oracle.com>
	Tue, 23 Jan 2018 17:16:59 +0000 (09:16 -0800)
committer	Jack Vogel <jack.vogel@oracle.com>
	Wed, 24 Jan 2018 22:43:54 +0000 (14:43 -0800)
fs/ocfs2/file.c		patch \| blob \| history
fs/ocfs2/ocfs2_trace.h		patch \| blob \| history