From b7499ee100d21721f2ea5623813b3f4cfa2ef631 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Tue, 23 Jan 2018 09:16:59 -0800 Subject: [PATCH] Revert "ocfs2: code clean up for direct io" This reverts commit 11fc5176778a82fb5bb0100413496e125862e649. This patch is part of a patch set, but it was backported separately and caused a problem. orabug: 27431376 Signed-off-by: Wengang Wang Reviewed-by: Junxiao Bi Reviewed-by: Ashish Samant --- fs/ocfs2/file.c | 139 +++++++++++++++++++++++++++++++++++++++-- fs/ocfs2/ocfs2_trace.h | 16 +++-- 2 files changed, 145 insertions(+), 10 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 79a9e2d4f8bf..53e71d2e0958 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1405,6 +1405,44 @@ out: return ret; } +/* + * Will look for holes and unwritten extents in the range starting at + * pos for count bytes (inclusive). + */ +static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, + size_t count) +{ + int ret = 0; + unsigned int extent_flags; + u32 cpos, clusters, extent_len, phys_cpos; + struct super_block *sb = inode->i_sb; + + cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; + clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; + + while (clusters) { + ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, + &extent_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { + ret = 1; + break; + } + + if (extent_len > clusters) + extent_len = clusters; + + clusters -= extent_len; + cpos += extent_len; + } +out: + return ret; +} + static int ocfs2_write_remove_suid(struct inode *inode) { int ret; @@ -2136,12 +2174,18 @@ out: static int ocfs2_prepare_inode_for_write(struct file *file, loff_t pos, - size_t count) + size_t count, + int appending, + int *direct_io, + int *has_refcount) { int ret = 0, meta_level = 0; struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); loff_t end; + struct ocfs2_super *osb = 
OCFS2_SB(inode->i_sb); + int full_coherency = !(osb->s_mount_opt & + OCFS2_MOUNT_COHERENCY_BUFFERED); /* * We start with a read level meta lock and only jump to an ex @@ -2190,6 +2234,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file, pos, count, &meta_level); + if (has_refcount) + *has_refcount = 1; + if (direct_io) + *direct_io = 0; } if (ret < 0) { @@ -2197,12 +2245,67 @@ static int ocfs2_prepare_inode_for_write(struct file *file, goto out_unlock; } + /* + * Skip the O_DIRECT checks if we don't need + * them. + */ + if (!direct_io || !(*direct_io)) + break; + + /* + * There's no sane way to do direct writes to an inode + * with inline data. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + *direct_io = 0; + break; + } + + /* + * Allowing concurrent direct writes means + * i_size changes wouldn't be synchronized, so + * one node could wind up truncating another + * nodes writes. + */ + if (end > i_size_read(inode) && !full_coherency) { + *direct_io = 0; + break; + } + + /* + * Fallback to old way if the feature bit is not set. + */ + if (end > i_size_read(inode) && + !ocfs2_supports_append_dio(osb)) { + *direct_io = 0; + break; + } + + /* + * We don't fill holes during direct io, so + * check for them here. If any are found, the + * caller will have to retake some cluster + * locks and initiate the io as buffered. + */ + ret = ocfs2_check_range_for_holes(inode, pos, count); + if (ret == 1) { + /* + * Fallback to old way if the feature bit is not set. + * Otherwise try dio first and then complete the rest + * request through buffer io. 
+ */ + if (!ocfs2_supports_append_dio(osb)) + *direct_io = 0; + ret = 0; + } else if (ret < 0) + mlog_errno(ret); break; } out_unlock: trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, - pos, count); + pos, appending, count, + direct_io, has_refcount); if (meta_level >= 0) ocfs2_inode_unlock(inode, meta_level); @@ -2214,10 +2317,11 @@ out: static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - int direct_io, rw_level, have_alloc_sem = 0; + int direct_io, appending, rw_level, have_alloc_sem = 0; + int can_do_direct, has_refcount = 0; ssize_t written = 0; ssize_t ret; - size_t count = iov_iter_count(from); + size_t count = iov_iter_count(from), orig_count; loff_t old_size; u32 old_clusters; struct file *file = iocb->ki_filp; @@ -2226,6 +2330,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); int unaligned_dio = 0; + int dropped_dio = 0; trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2236,12 +2341,14 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, if (count == 0) return 0; + appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0; direct_io = iocb->ki_flags & IOCB_DIRECT ? 
1 : 0; mutex_lock(&inode->i_mutex); ocfs2_iocb_clear_sem_locked(iocb); +relock: /* to match setattr's i_mutex -> rw_lock ordering */ if (direct_io) { have_alloc_sem = 1; @@ -2280,6 +2387,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, ocfs2_inode_unlock(inode, 1); } + orig_count = iov_iter_count(from); ret = generic_write_checks(iocb, from); if (ret <= 0) { if (ret) @@ -2288,7 +2396,9 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, } count = ret; - ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count); + can_do_direct = direct_io; + ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending, + &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); goto out; @@ -2297,6 +2407,23 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, if (direct_io && !is_sync_kiocb(iocb)) unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); + /* + * We can't complete the direct I/O as requested, fall back to + * buffered I/O. + */ + if (direct_io && !can_do_direct) { + ocfs2_rw_unlock(inode, rw_level); + + have_alloc_sem = 0; + rw_level = -1; + + direct_io = 0; + iocb->ki_flags &= ~IOCB_DIRECT; + iov_iter_reexpand(from, orig_count); + dropped_dio = 1; + goto relock; + } + if (unaligned_dio) { static unsigned long unaligned_warn_time; /* @@ -2352,7 +2479,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, goto no_sync; if (((file->f_flags & O_DSYNC) && !direct_io) || - IS_SYNC(inode)) { + IS_SYNC(inode) || dropped_dio) { /* * There is an performance issue when we are doing a flush with * WB_SYNC_ALL flag. 
block_write_full_page() will transfer it diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 09d0c89a9daf..6cb019b7c6a8 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1450,20 +1450,28 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); TRACE_EVENT(ocfs2_prepare_inode_for_write, TP_PROTO(unsigned long long ino, unsigned long long saved_pos, - unsigned long count), - TP_ARGS(ino, saved_pos, count), + int appending, unsigned long count, + int *direct_io, int *has_refcount), + TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount), TP_STRUCT__entry( __field(unsigned long long, ino) __field(unsigned long long, saved_pos) + __field(int, appending) __field(unsigned long, count) + __field(int, direct_io) + __field(int, has_refcount) ), TP_fast_assign( __entry->ino = ino; __entry->saved_pos = saved_pos; + __entry->appending = appending; __entry->count = count; + __entry->direct_io = direct_io ? *direct_io : -1; + __entry->has_refcount = has_refcount ? *has_refcount : -1; ), - TP_printk("%llu %llu %lu", __entry->ino, - __entry->saved_pos, __entry->count) + TP_printk("%llu %llu %d %lu %d %d", __entry->ino, + __entry->saved_pos, __entry->appending, __entry->count, + __entry->direct_io, __entry->has_refcount) ); DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); -- 2.50.1