From: Linus Torvalds Date: Fri, 2 Nov 2018 16:33:08 +0000 (-0700) Subject: Merge tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux X-Git-Tag: v4.20-rc1~23 X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=c2aa1a444cab2c673650ada80a7dffc4345ce2e6;p=users%2Fhch%2Fmisc.git Merge tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux Pull vfs dedup fixes from Dave Chinner: "This reworks the vfs data cloning infrastructure. We discovered many issues with these interfaces late in the 4.19 cycle - the worst of them (data corruption, setuid stripping) were fixed for XFS in 4.19-rc8, but a larger rework of the infrastructure fixing all the problems was needed. That rework is the contents of this pull request. Rework the vfs_clone_file_range and vfs_dedupe_file_range infrastructure to use a common .remap_file_range method and supply generic bounds and sanity checking functions that are shared with the data write path. The current VFS infrastructure has problems with rlimit, LFS file sizes, file time stamps, maximum filesystem file sizes, stripping setuid bits, etc and so they are addressed in these commits. We also introduce the ability for the ->remap_file_range methods to return short clones so that clones for vfs_copy_file_range() don't get rejected if the entire range can't be cloned. It also allows filesystems to sliently skip deduplication of partial EOF blocks if they are not capable of doing so without requiring errors to be thrown to userspace. Existing filesystems are converted to user the new remap_file_range method, and both XFS and ocfs2 are modified to make use of the new generic checking infrastructure" * tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (28 commits) xfs: remove [cm]time update from reflink calls xfs: remove xfs_reflink_remap_range xfs: remove redundant remap partial EOF block checks xfs: support returning partial reflink results xfs: clean up xfs_reflink_remap_blocks call site xfs: fix pagecache truncation prior to reflink ocfs2: remove ocfs2_reflink_remap_range ocfs2: support partial clone range and dedupe range ocfs2: fix pagecache truncation prior to reflink ocfs2: truncate page cache for clone destination file before remapping vfs: clean up generic_remap_file_range_prep return value vfs: hide file range comparison function vfs: enable remap callers that can handle short operations vfs: plumb remap flags through the vfs dedupe functions vfs: plumb remap flags through the vfs clone functions vfs: make remap_file_range functions take and return bytes completed vfs: remap helper should update destination inode metadata vfs: pass remap flags to generic_remap_checks vfs: pass remap flags to generic_remap_file_range_prep vfs: combine the clone and dedupe into a single remap_file_range ... --- c2aa1a444cab2c673650ada80a7dffc4345ce2e6 diff --cc Documentation/filesystems/porting index 321d74b73937,e6d4466268dd..cf43bc4dbf31 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@@ -623,13 -623,7 +623,18 @@@ in your dentry operations instead On success you get a new struct file sharing the mount/dentry with the original, on failure - ERR_PTR(). -- + [mandatory] + ->clone_file_range() and ->dedupe_file_range have been replaced with + ->remap_file_range(). See Documentation/filesystems/vfs.txt for more + information. ++-- +[recommended] + ->lookup() instances doing an equivalent of + if (IS_ERR(inode)) + return ERR_CAST(inode); + return d_splice_alias(inode, dentry); + don't need to bother with the check - d_splice_alias() will do the + right thing when given ERR_PTR(...) as inode. Moreover, passing NULL + inode to d_splice_alias() will also do the right thing (equivalent of + d_add(dentry, NULL); return NULL;), so that kind of special cases + also doesn't need a separate treatment. diff --cc fs/read_write.c index 5a2ee488c5d2,6b40a43edf18..bfcb4ced5664 --- a/fs/read_write.c +++ b/fs/read_write.c @@@ -1880,120 -2014,14 +2013,28 @@@ loff_t vfs_clone_file_range(struct fil } EXPORT_SYMBOL(vfs_clone_file_range); - /* - * Read a page's worth of file data into the page cache. Return the page - * locked. - */ - static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) - { - struct address_space *mapping; - struct page *page; - pgoff_t n; - - n = offset >> PAGE_SHIFT; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - lock_page(page); - return page; - } - - /* - * Compare extents of two files to see if they are the same. - * Caller must have locked both inodes to prevent write races. - */ - int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, - struct inode *dest, loff_t destoff, - loff_t len, bool *is_same) - { - loff_t src_poff; - loff_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - loff_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - if (cmp_len <= 0) - goto out_error; - - src_page = vfs_dedupe_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = vfs_dedupe_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - unlock_page(src_page); - put_page(src_page); - goto out_error; - } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); - - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; - - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); - unlock_page(dest_page); - unlock_page(src_page); - put_page(dest_page); - put_page(src_page); - - if (!same) - break; - - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; - return 0; - - out_error: - return error; - } - EXPORT_SYMBOL(vfs_dedupe_file_range_compare); - +/* Check whether we are allowed to dedupe the destination file */ +static bool allow_file_dedupe(struct file *file) +{ + if (capable(CAP_SYS_ADMIN)) + return true; + if (file->f_mode & FMODE_WRITE) + return true; + if (uid_eq(current_fsuid(), file_inode(file)->i_uid)) + return true; + if (!inode_permission(file_inode(file), MAY_WRITE)) + return true; + return false; +} + - int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, - struct file *dst_file, loff_t dst_pos, u64 len) + loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, + struct file *dst_file, loff_t dst_pos, + loff_t len, unsigned int remap_flags) { - s64 ret; + loff_t ret; + + WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | + REMAP_FILE_CAN_SHORTEN)); ret = mnt_want_write_file(dst_file); if (ret)