return ret;
}
+/*
+ * Will look for holes and unwritten extents in the range starting at
+ * pos for count bytes (inclusive).
+ */
+static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
+ size_t count)
+{
+ int ret = 0;
+ unsigned int extent_flags;
+ u32 cpos, clusters, extent_len, phys_cpos;
+ struct super_block *sb = inode->i_sb;
+
+ cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+ clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+ while (clusters) {
+ ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+ &extent_flags);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
+ ret = 1;
+ break;
+ }
+
+ if (extent_len > clusters)
+ extent_len = clusters;
+
+ clusters -= extent_len;
+ cpos += extent_len;
+ }
+out:
+ return ret;
+}
+
static int ocfs2_write_remove_suid(struct inode *inode)
{
int ret;
static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t pos,
- size_t count)
+ size_t count,
+ int appending,
+ int *direct_io,
+ int *has_refcount)
{
int ret = 0, meta_level = 0;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
loff_t end;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int full_coherency = !(osb->s_mount_opt &
+ OCFS2_MOUNT_COHERENCY_BUFFERED);
/*
* We start with a read level meta lock and only jump to an ex
pos,
count,
&meta_level);
+ if (has_refcount)
+ *has_refcount = 1;
+ if (direct_io)
+ *direct_io = 0;
}
if (ret < 0) {
goto out_unlock;
}
+ /*
+ * Skip the O_DIRECT checks if we don't need
+ * them.
+ */
+ if (!direct_io || !(*direct_io))
+ break;
+
+ /*
+ * There's no sane way to do direct writes to an inode
+ * with inline data.
+ */
+ if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ *direct_io = 0;
+ break;
+ }
+
+ /*
+ * Allowing concurrent direct writes means
+ * i_size changes wouldn't be synchronized, so
+ * one node could wind up truncating another
+ * nodes writes.
+ */
+ if (end > i_size_read(inode) && !full_coherency) {
+ *direct_io = 0;
+ break;
+ }
+
+ /*
+ * Fallback to old way if the feature bit is not set.
+ */
+ if (end > i_size_read(inode) &&
+ !ocfs2_supports_append_dio(osb)) {
+ *direct_io = 0;
+ break;
+ }
+
+ /*
+ * We don't fill holes during direct io, so
+ * check for them here. If any are found, the
+ * caller will have to retake some cluster
+ * locks and initiate the io as buffered.
+ */
+ ret = ocfs2_check_range_for_holes(inode, pos, count);
+ if (ret == 1) {
+ /*
+ * Fallback to old way if the feature bit is not set.
+ * Otherwise try dio first and then complete the rest
+ * request through buffer io.
+ */
+ if (!ocfs2_supports_append_dio(osb))
+ *direct_io = 0;
+ ret = 0;
+ } else if (ret < 0)
+ mlog_errno(ret);
break;
}
out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
- pos, count);
+ pos, appending, count,
+ direct_io, has_refcount);
if (meta_level >= 0)
ocfs2_inode_unlock(inode, meta_level);
static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
- int direct_io, rw_level, have_alloc_sem = 0;
+ int direct_io, appending, rw_level, have_alloc_sem = 0;
+ int can_do_direct, has_refcount = 0;
ssize_t written = 0;
ssize_t ret;
- size_t count = iov_iter_count(from);
+ size_t count = iov_iter_count(from), orig_count;
loff_t old_size;
u32 old_clusters;
struct file *file = iocb->ki_filp;
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
int unaligned_dio = 0;
+ int dropped_dio = 0;
trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
if (count == 0)
return 0;
+ appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
mutex_lock(&inode->i_mutex);
ocfs2_iocb_clear_sem_locked(iocb);
+relock:
/* to match setattr's i_mutex -> rw_lock ordering */
if (direct_io) {
have_alloc_sem = 1;
ocfs2_inode_unlock(inode, 1);
}
+ orig_count = iov_iter_count(from);
ret = generic_write_checks(iocb, from);
if (ret <= 0) {
if (ret)
}
count = ret;
- ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count);
+ can_do_direct = direct_io;
+ ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending,
+ &can_do_direct, &has_refcount);
if (ret < 0) {
mlog_errno(ret);
goto out;
if (direct_io && !is_sync_kiocb(iocb))
unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos);
+ /*
+ * We can't complete the direct I/O as requested, fall back to
+ * buffered I/O.
+ */
+ if (direct_io && !can_do_direct) {
+ ocfs2_rw_unlock(inode, rw_level);
+
+ have_alloc_sem = 0;
+ rw_level = -1;
+
+ direct_io = 0;
+ iocb->ki_flags &= ~IOCB_DIRECT;
+ iov_iter_reexpand(from, orig_count);
+ dropped_dio = 1;
+ goto relock;
+ }
+
if (unaligned_dio) {
static unsigned long unaligned_warn_time;
/*
goto no_sync;
if (((file->f_flags & O_DSYNC) && !direct_io) ||
- IS_SYNC(inode)) {
+ IS_SYNC(inode) || dropped_dio) {
/*
* There is an performance issue when we are doing a flush with
* WB_SYNC_ALL flag. block_write_full_page() will transfer it
TRACE_EVENT(ocfs2_prepare_inode_for_write,
TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
- unsigned long count),
- TP_ARGS(ino, saved_pos, count),
+ int appending, unsigned long count,
+ int *direct_io, int *has_refcount),
+ TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
TP_STRUCT__entry(
__field(unsigned long long, ino)
__field(unsigned long long, saved_pos)
+ __field(int, appending)
__field(unsigned long, count)
+ __field(int, direct_io)
+ __field(int, has_refcount)
),
TP_fast_assign(
__entry->ino = ino;
__entry->saved_pos = saved_pos;
+ __entry->appending = appending;
__entry->count = count;
+ __entry->direct_io = direct_io ? *direct_io : -1;
+ __entry->has_refcount = has_refcount ? *has_refcount : -1;
),
- TP_printk("%llu %llu %lu", __entry->ino,
- __entry->saved_pos, __entry->count)
+ TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
+ __entry->saved_pos, __entry->appending, __entry->count,
+ __entry->direct_io, __entry->has_refcount)
);
DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);