]> www.infradead.org Git - users/hch/xfs.git/commitdiff
xfs: introduce new file range commit ioctls
authorDarrick J. Wong <djwong@kernel.org>
Fri, 30 Aug 2024 22:36:47 +0000 (15:36 -0700)
committerDarrick J. Wong <djwong@kernel.org>
Sun, 1 Sep 2024 15:58:19 +0000 (08:58 -0700)
This patch introduces two more new ioctls to manage atomic updates to
file contents -- XFS_IOC_START_COMMIT and XFS_IOC_COMMIT_RANGE.  The
commit mechanism here is exactly the same as what XFS_IOC_EXCHANGE_RANGE
does, but with the additional requirement that file2 cannot have changed
since some sampling point.  The start-commit ioctl performs the sampling
of file attributes.

Note: This patch currently samples i_ctime during START_COMMIT and
checks that it hasn't changed during COMMIT_RANGE.  This isn't entirely
safe in kernels prior to 6.12 because ctime only had coarse grained
granularity and very fast updates could collide with a COMMIT_RANGE.
With the multi-granularity ctime introduced by Jeff Layton, it's now
possible to update ctime such that this does not happen.

It is critical, then, that this patch must not be backported to any
kernel that does not support fine-grained file change timestamps.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Acked-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
fs/xfs/libxfs/xfs_fs.h
fs/xfs/xfs_exchrange.c
fs/xfs/xfs_exchrange.h
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_trace.h

index 454b63ef720162f27c49b8fba15f5994064ca9ad..c85c8077fac391ba182fe89334c016d3c6934d38 100644 (file)
@@ -825,6 +825,30 @@ struct xfs_exchange_range {
        __u64           flags;          /* see XFS_EXCHANGE_RANGE_* below */
 };
 
+/*
+ * Using the same definition of file2 as struct xfs_exchange_range, commit the
+ * contents of file1 into file2 if file2 has the same inode number, mtime, and
+ * ctime as the arguments provided to the call.  The old contents of file2 will
+ * be moved to file1.
+ *
+ * Returns -EBUSY if there isn't an exact match for the file2 fields.
+ *
+ * Filesystems must be able to restart and complete the operation even after
+ * the system goes down.
+ */
+struct xfs_commit_range {
+       __s32           file1_fd;
+       __u32           pad;            /* must be zeroes */
+       __u64           file1_offset;   /* file1 offset, bytes */
+       __u64           file2_offset;   /* file2 offset, bytes */
+       __u64           length;         /* bytes to exchange */
+
+       __u64           flags;          /* see XFS_EXCHANGE_RANGE_* below */
+
+       /* opaque file2 metadata for freshness checks */
+       __u64           file2_freshness[6];
+};
+
 /*
  * Exchange file data all the way to the ends of both files, and then exchange
  * the file sizes.  This flag can be used to replace a file's contents with a
@@ -997,6 +1021,8 @@ struct xfs_getparents_by_handle {
 #define XFS_IOC_BULKSTAT            _IOR ('X', 127, struct xfs_bulkstat_req)
 #define XFS_IOC_INUMBERS            _IOR ('X', 128, struct xfs_inumbers_req)
 #define XFS_IOC_EXCHANGE_RANGE      _IOW ('X', 129, struct xfs_exchange_range)
+#define XFS_IOC_START_COMMIT        _IOR ('X', 130, struct xfs_commit_range)
+#define XFS_IOC_COMMIT_RANGE        _IOW ('X', 131, struct xfs_commit_range)
 /*     XFS_IOC_GETFSUUID ---------- deprecated 140      */
 
 
index c8a655c92c92f4e4f4c1680d8dada1d5023bdadb..d0889190ab7ff5b01497f19c2ecefa5956d1eba2 100644 (file)
@@ -72,6 +72,34 @@ xfs_exchrange_estimate(
        return error;
 }
 
+/*
+ * Check that file2's metadata agree with the snapshot that we took for the
+ * range commit request.
+ *
+ * This should be called after the filesystem has locked /all/ inode metadata
+ * against modification.
+ */
+STATIC int
+xfs_exchrange_check_freshness(
+       const struct xfs_exchrange      *fxr,
+       struct xfs_inode                *ip2)
+{
+       struct inode                    *inode2 = VFS_I(ip2);
+       struct timespec64               ctime = inode_get_ctime(inode2);
+       struct timespec64               mtime = inode_get_mtime(inode2);
+
+       trace_xfs_exchrange_freshness(fxr, ip2);
+
+       /* Check that file2 hasn't otherwise been modified. */
+       if (fxr->file2_ino != ip2->i_ino ||
+           fxr->file2_gen != inode2->i_generation ||
+           !timespec64_equal(&fxr->file2_ctime, &ctime) ||
+           !timespec64_equal(&fxr->file2_mtime, &mtime))
+               return -EBUSY;
+
+       return 0;
+}
+
 #define QRETRY_IP1     (0x1)
 #define QRETRY_IP2     (0x2)
 
@@ -607,6 +635,12 @@ xfs_exchrange_prep(
        if (error || fxr->length == 0)
                return error;
 
+       if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) {
+               error = xfs_exchrange_check_freshness(fxr, ip2);
+               if (error)
+                       return error;
+       }
+
        /* Attach dquots to both inodes before changing block maps. */
        error = xfs_qm_dqattach(ip2);
        if (error)
@@ -719,7 +753,8 @@ xfs_exchange_range(
        if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt)
                return -EXDEV;
 
-       if (fxr->flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+       if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS |
+                        __XFS_EXCHANGE_RANGE_CHECK_FRESH2))
                return -EINVAL;
 
        /* Userspace requests only honored for regular files. */
@@ -802,3 +837,109 @@ xfs_ioc_exchange_range(
        fdput(file1);
        return error;
 }
+
+/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
+struct xfs_commit_range_fresh {
+       xfs_fsid_t      fsid;           /* m_fixedfsid */
+       __u64           file2_ino;      /* inode number */
+       __s64           file2_mtime;    /* modification time */
+       __s64           file2_ctime;    /* change time */
+       __s32           file2_mtime_nsec; /* mod time, nsec */
+       __s32           file2_ctime_nsec; /* change time, nsec */
+       __u32           file2_gen;      /* inode generation */
+       __u32           magic;          /* zero */
+};
+#define XCR_FRESH_MAGIC        0x444F524B      /* DORK */
+
+/* Set up a commitrange operation by sampling file2's write-related attrs */
+long
+xfs_ioc_start_commit(
+       struct file                     *file,
+       struct xfs_commit_range __user  *argp)
+{
+       struct xfs_commit_range         args = { };
+       struct timespec64               ts;
+       struct xfs_commit_range_fresh   *kern_f;
+       struct xfs_commit_range_fresh   __user *user_f;
+       struct inode                    *inode2 = file_inode(file);
+       struct xfs_inode                *ip2 = XFS_I(inode2);
+       const unsigned int              lockflags = XFS_IOLOCK_SHARED |
+                                                   XFS_MMAPLOCK_SHARED |
+                                                   XFS_ILOCK_SHARED;
+
+       BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) !=
+                    sizeof(args.file2_freshness));
+
+       kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
+
+       memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
+
+       xfs_ilock(ip2, lockflags);
+       ts = inode_get_ctime(inode2);
+       kern_f->file2_ctime             = ts.tv_sec;
+       kern_f->file2_ctime_nsec        = ts.tv_nsec;
+       ts = inode_get_mtime(inode2);
+       kern_f->file2_mtime             = ts.tv_sec;
+       kern_f->file2_mtime_nsec        = ts.tv_nsec;
+       kern_f->file2_ino               = ip2->i_ino;
+       kern_f->file2_gen               = inode2->i_generation;
+       kern_f->magic                   = XCR_FRESH_MAGIC;
+       xfs_iunlock(ip2, lockflags);
+
+       user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness;
+       if (copy_to_user(user_f, kern_f, sizeof(*kern_f)))
+               return -EFAULT;
+
+       return 0;
+}
+
+/*
+ * Exchange file1 and file2 contents if file2 has not been written since the
+ * start commit operation.
+ */
+long
+xfs_ioc_commit_range(
+       struct file                     *file,
+       struct xfs_commit_range __user  *argp)
+{
+       struct xfs_exchrange            fxr = {
+               .file2                  = file,
+       };
+       struct xfs_commit_range         args;
+       struct xfs_commit_range_fresh   *kern_f;
+       struct xfs_inode                *ip2 = XFS_I(file_inode(file));
+       struct xfs_mount                *mp = ip2->i_mount;
+       struct fd                       file1;
+       int                             error;
+
+       kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
+
+       if (copy_from_user(&args, argp, sizeof(args)))
+               return -EFAULT;
+       if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS)
+               return -EINVAL;
+       if (kern_f->magic != XCR_FRESH_MAGIC)
+               return -EBUSY;
+       if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t)))
+               return -EBUSY;
+
+       fxr.file1_offset        = args.file1_offset;
+       fxr.file2_offset        = args.file2_offset;
+       fxr.length              = args.length;
+       fxr.flags               = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2;
+       fxr.file2_ino           = kern_f->file2_ino;
+       fxr.file2_gen           = kern_f->file2_gen;
+       fxr.file2_mtime.tv_sec  = kern_f->file2_mtime;
+       fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec;
+       fxr.file2_ctime.tv_sec  = kern_f->file2_ctime;
+       fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
+
+       file1 = fdget(args.file1_fd);
+       if (!file1.file)
+               return -EBADF;
+       fxr.file1 = file1.file;
+
+       error = xfs_exchange_range(&fxr);
+       fdput(file1);
+       return error;
+}
index 039abcca546e81e07e6eddd8c1e93fed4f8ac9c2..bc1298aba806baa2152f3890cd4df3b0fb2c7911 100644 (file)
 #define __XFS_EXCHANGE_RANGE_UPD_CMTIME1       (1ULL << 63)
 #define __XFS_EXCHANGE_RANGE_UPD_CMTIME2       (1ULL << 62)
 
+/* Freshness check required */
+#define __XFS_EXCHANGE_RANGE_CHECK_FRESH2      (1ULL << 61)
+
 #define XFS_EXCHANGE_RANGE_PRIV_FLAGS  (__XFS_EXCHANGE_RANGE_UPD_CMTIME1 | \
-                                        __XFS_EXCHANGE_RANGE_UPD_CMTIME2)
+                                        __XFS_EXCHANGE_RANGE_UPD_CMTIME2 | \
+                                        __XFS_EXCHANGE_RANGE_CHECK_FRESH2)
 
 struct xfs_exchrange {
        struct file             *file1;
@@ -22,10 +26,20 @@ struct xfs_exchrange {
        u64                     length;
 
        u64                     flags;  /* XFS_EXCHANGE_RANGE flags */
+
+       /* file2 metadata for freshness checks */
+       u64                     file2_ino;
+       struct timespec64       file2_mtime;
+       struct timespec64       file2_ctime;
+       u32                     file2_gen;
 };
 
 long xfs_ioc_exchange_range(struct file *file,
                struct xfs_exchange_range __user *argp);
+long xfs_ioc_start_commit(struct file *file,
+               struct xfs_commit_range __user *argp);
+long xfs_ioc_commit_range(struct file *file,
+               struct xfs_commit_range __user  *argp);
 
 struct xfs_exchmaps_req;
 
index 6b13666d4e96355aba7de80b5a1718df54dcd518..90b3ee21e7fe6a448792ccfef6a76ab4a265f571 100644 (file)
@@ -1518,6 +1518,10 @@ xfs_file_ioctl(
 
        case XFS_IOC_EXCHANGE_RANGE:
                return xfs_ioc_exchange_range(filp, arg);
+       case XFS_IOC_START_COMMIT:
+               return xfs_ioc_start_commit(filp, arg);
+       case XFS_IOC_COMMIT_RANGE:
+               return xfs_ioc_commit_range(filp, arg);
 
        default:
                return -ENOTTY;
index 180ce697305a92ea829f7e55ac5af55ce7ed8709..4cf0fa71ba9cee0acb51167c68361cbd69c54147 100644 (file)
@@ -4926,7 +4926,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_exchrange_error);
        { XFS_EXCHANGE_RANGE_DRY_RUN,           "DRY_RUN" }, \
        { XFS_EXCHANGE_RANGE_FILE1_WRITTEN,     "F1_WRITTEN" }, \
        { __XFS_EXCHANGE_RANGE_UPD_CMTIME1,     "CMTIME1" }, \
-       { __XFS_EXCHANGE_RANGE_UPD_CMTIME2,     "CMTIME2" }
+       { __XFS_EXCHANGE_RANGE_UPD_CMTIME2,     "CMTIME2" }, \
+       { __XFS_EXCHANGE_RANGE_CHECK_FRESH2,    "FRESH2" }
 
 /* file exchange-range tracepoint class */
 DECLARE_EVENT_CLASS(xfs_exchrange_class,
@@ -4986,6 +4987,60 @@ DEFINE_EXCHRANGE_EVENT(xfs_exchrange_prep);
 DEFINE_EXCHRANGE_EVENT(xfs_exchrange_flush);
 DEFINE_EXCHRANGE_EVENT(xfs_exchrange_mappings);
 
+TRACE_EVENT(xfs_exchrange_freshness,
+       TP_PROTO(const struct xfs_exchrange *fxr, struct xfs_inode *ip2),
+       TP_ARGS(fxr, ip2),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ip2_ino)
+               __field(long long, ip2_mtime)
+               __field(long long, ip2_ctime)
+               __field(int, ip2_mtime_nsec)
+               __field(int, ip2_ctime_nsec)
+
+               __field(xfs_ino_t, file2_ino)
+               __field(long long, file2_mtime)
+               __field(long long, file2_ctime)
+               __field(int, file2_mtime_nsec)
+               __field(int, file2_ctime_nsec)
+       ),
+       TP_fast_assign(
+               struct timespec64       ts64;
+               struct inode            *inode2 = VFS_I(ip2);
+
+               __entry->dev = inode2->i_sb->s_dev;
+               __entry->ip2_ino = ip2->i_ino;
+
+               ts64 = inode_get_ctime(inode2);
+               __entry->ip2_ctime = ts64.tv_sec;
+               __entry->ip2_ctime_nsec = ts64.tv_nsec;
+
+               ts64 = inode_get_mtime(inode2);
+               __entry->ip2_mtime = ts64.tv_sec;
+               __entry->ip2_mtime_nsec = ts64.tv_nsec;
+
+               __entry->file2_ino = fxr->file2_ino;
+               __entry->file2_mtime = fxr->file2_mtime.tv_sec;
+               __entry->file2_ctime = fxr->file2_ctime.tv_sec;
+               __entry->file2_mtime_nsec = fxr->file2_mtime.tv_nsec;
+               __entry->file2_ctime_nsec = fxr->file2_ctime.tv_nsec;
+       ),
+       TP_printk("dev %d:%d "
+                 "ino 0x%llx mtime %lld:%d ctime %lld:%d -> "
+                 "file 0x%llx mtime %lld:%d ctime %lld:%d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ip2_ino,
+                 __entry->ip2_mtime,
+                 __entry->ip2_mtime_nsec,
+                 __entry->ip2_ctime,
+                 __entry->ip2_ctime_nsec,
+                 __entry->file2_ino,
+                 __entry->file2_mtime,
+                 __entry->file2_mtime_nsec,
+                 __entry->file2_ctime,
+                 __entry->file2_ctime_nsec)
+);
+
 TRACE_EVENT(xfs_exchmaps_overhead,
        TP_PROTO(struct xfs_mount *mp, unsigned long long bmbt_blocks,
                 unsigned long long rmapbt_blocks),