From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 2 Nov 2018 16:33:08 +0000 (-0700)
Subject: Merge tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux
X-Git-Tag: v4.20-rc1~23
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=c2aa1a444cab2c673650ada80a7dffc4345ce2e6;p=users%2Fhch%2Fmisc.git

Merge tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux

Pull vfs dedup fixes from Dave Chinner:
 "This reworks the vfs data cloning infrastructure.

  We discovered many issues with these interfaces late in the 4.19 cycle
  - the worst of them (data corruption, setuid stripping) were fixed for
  XFS in 4.19-rc8, but a larger rework of the infrastructure fixing all
  the problems was needed. That rework is the contents of this pull
  request.

  Rework the vfs_clone_file_range and vfs_dedupe_file_range
  infrastructure to use a common .remap_file_range method and supply
  generic bounds and sanity checking functions that are shared with the
  data write path. The current VFS infrastructure has problems with
  rlimit, LFS file sizes, file time stamps, maximum filesystem file
  sizes, stripping setuid bits, etc and so they are addressed in these
  commits.

  We also introduce the ability for the ->remap_file_range methods to
  return short clones so that clones for vfs_copy_file_range() don't get
  rejected if the entire range can't be cloned. It also allows
  filesystems to silently skip deduplication of partial EOF blocks if
  they are not capable of doing so without requiring errors to be thrown
  to userspace.

  Existing filesystems are converted to use the new remap_file_range
  method, and both XFS and ocfs2 are modified to make use of the new
  generic checking infrastructure"

* tag 'xfs-4.20-merge-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: (28 commits)
  xfs: remove [cm]time update from reflink calls
  xfs: remove xfs_reflink_remap_range
  xfs: remove redundant remap partial EOF block checks
  xfs: support returning partial reflink results
  xfs: clean up xfs_reflink_remap_blocks call site
  xfs: fix pagecache truncation prior to reflink
  ocfs2: remove ocfs2_reflink_remap_range
  ocfs2: support partial clone range and dedupe range
  ocfs2: fix pagecache truncation prior to reflink
  ocfs2: truncate page cache for clone destination file before remapping
  vfs: clean up generic_remap_file_range_prep return value
  vfs: hide file range comparison function
  vfs: enable remap callers that can handle short operations
  vfs: plumb remap flags through the vfs dedupe functions
  vfs: plumb remap flags through the vfs clone functions
  vfs: make remap_file_range functions take and return bytes completed
  vfs: remap helper should update destination inode metadata
  vfs: pass remap flags to generic_remap_checks
  vfs: pass remap flags to generic_remap_file_range_prep
  vfs: combine the clone and dedupe into a single remap_file_range
  ...
---

c2aa1a444cab2c673650ada80a7dffc4345ce2e6
diff --cc Documentation/filesystems/porting
index 321d74b73937,e6d4466268dd..cf43bc4dbf31
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@@ -623,13 -623,7 +623,18 @@@ in your dentry operations instead
  	On success you get a new struct file sharing the mount/dentry with the
  	original, on failure - ERR_PTR().
  --
+ [mandatory]
+ 	->clone_file_range() and ->dedupe_file_range have been replaced with
+ 	->remap_file_range().  See Documentation/filesystems/vfs.txt for more
+ 	information.
++--
 +[recommended]
 +	->lookup() instances doing an equivalent of
 +		if (IS_ERR(inode))
 +			return ERR_CAST(inode);
 +		return d_splice_alias(inode, dentry);
 +	don't need to bother with the check - d_splice_alias() will do the
 +	right thing when given ERR_PTR(...) as inode.  Moreover, passing NULL
 +	inode to d_splice_alias() will also do the right thing (equivalent of
 +	d_add(dentry, NULL); return NULL;), so that kind of special cases
 +	also doesn't need a separate treatment.
diff --cc fs/read_write.c
index 5a2ee488c5d2,6b40a43edf18..bfcb4ced5664
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@@ -1880,120 -2014,14 +2013,28 @@@ loff_t vfs_clone_file_range(struct fil
  }
  EXPORT_SYMBOL(vfs_clone_file_range);
  
- /*
-  * Read a page's worth of file data into the page cache.  Return the page
-  * locked.
-  */
- static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset)
- {
- 	struct address_space *mapping;
- 	struct page *page;
- 	pgoff_t n;
- 
- 	n = offset >> PAGE_SHIFT;
- 	mapping = inode->i_mapping;
- 	page = read_mapping_page(mapping, n, NULL);
- 	if (IS_ERR(page))
- 		return page;
- 	if (!PageUptodate(page)) {
- 		put_page(page);
- 		return ERR_PTR(-EIO);
- 	}
- 	lock_page(page);
- 	return page;
- }
- 
- /*
-  * Compare extents of two files to see if they are the same.
-  * Caller must have locked both inodes to prevent write races.
-  */
- int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
- 				  struct inode *dest, loff_t destoff,
- 				  loff_t len, bool *is_same)
- {
- 	loff_t src_poff;
- 	loff_t dest_poff;
- 	void *src_addr;
- 	void *dest_addr;
- 	struct page *src_page;
- 	struct page *dest_page;
- 	loff_t cmp_len;
- 	bool same;
- 	int error;
- 
- 	error = -EINVAL;
- 	same = true;
- 	while (len) {
- 		src_poff = srcoff & (PAGE_SIZE - 1);
- 		dest_poff = destoff & (PAGE_SIZE - 1);
- 		cmp_len = min(PAGE_SIZE - src_poff,
- 			      PAGE_SIZE - dest_poff);
- 		cmp_len = min(cmp_len, len);
- 		if (cmp_len <= 0)
- 			goto out_error;
- 
- 		src_page = vfs_dedupe_get_page(src, srcoff);
- 		if (IS_ERR(src_page)) {
- 			error = PTR_ERR(src_page);
- 			goto out_error;
- 		}
- 		dest_page = vfs_dedupe_get_page(dest, destoff);
- 		if (IS_ERR(dest_page)) {
- 			error = PTR_ERR(dest_page);
- 			unlock_page(src_page);
- 			put_page(src_page);
- 			goto out_error;
- 		}
- 		src_addr = kmap_atomic(src_page);
- 		dest_addr = kmap_atomic(dest_page);
- 
- 		flush_dcache_page(src_page);
- 		flush_dcache_page(dest_page);
- 
- 		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
- 			same = false;
- 
- 		kunmap_atomic(dest_addr);
- 		kunmap_atomic(src_addr);
- 		unlock_page(dest_page);
- 		unlock_page(src_page);
- 		put_page(dest_page);
- 		put_page(src_page);
- 
- 		if (!same)
- 			break;
- 
- 		srcoff += cmp_len;
- 		destoff += cmp_len;
- 		len -= cmp_len;
- 	}
- 
- 	*is_same = same;
- 	return 0;
- 
- out_error:
- 	return error;
- }
- EXPORT_SYMBOL(vfs_dedupe_file_range_compare);
- 
 +/* Check whether we are allowed to dedupe the destination file */
 +static bool allow_file_dedupe(struct file *file)
 +{
 +	if (capable(CAP_SYS_ADMIN))
 +		return true;
 +	if (file->f_mode & FMODE_WRITE)
 +		return true;
 +	if (uid_eq(current_fsuid(), file_inode(file)->i_uid))
 +		return true;
 +	if (!inode_permission(file_inode(file), MAY_WRITE))
 +		return true;
 +	return false;
 +}
 +
- int vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
- 			      struct file *dst_file, loff_t dst_pos, u64 len)
+ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
+ 				 struct file *dst_file, loff_t dst_pos,
+ 				 loff_t len, unsigned int remap_flags)
  {
- 	s64 ret;
+ 	loff_t ret;
+ 
+ 	WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
+ 				     REMAP_FILE_CAN_SHORTEN));
  
  	ret = mnt_want_write_file(dst_file);
  	if (ret)