 #include <linux/mm.h>
 #include <linux/dax.h>
+#include <linux/fs.h>
 
 struct xfs_failure_info {
        xfs_agblock_t           startblock;
        struct xfs_mount                *mp = cur->bc_mp;
        struct xfs_inode                *ip;
        struct xfs_failure_info         *notify = data;
+       struct address_space            *mapping;
+       pgoff_t                         pgoff;
+       unsigned long                   pgcnt;
        int                             error = 0;
 
        if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
            (rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+               /* Continue the query because this isn't a failure. */
+               if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+                       return 0;
                notify->want_shutdown = true;
                return 0;
        }
 
-       error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
-                                 xfs_failure_pgoff(mp, rec, notify),
-                                 xfs_failure_pgcnt(mp, rec, notify),
-                                 notify->mf_flags);
+       mapping = VFS_I(ip)->i_mapping;
+       pgoff = xfs_failure_pgoff(mp, rec, notify);
+       pgcnt = xfs_failure_pgcnt(mp, rec, notify);
+
+       /* Continue the rmap query if the inode isn't a dax file. */
+       if (dax_mapping(mapping))
+               error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
+                                         notify->mf_flags);
+
+       /* Invalidate the cache in dax pages. */
+       if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+               invalidate_inode_pages2_range(mapping, pgoff,
+                                             pgoff + pgcnt - 1);
+
        xfs_irele(ip);
        return error;
 }
 
+static int
+xfs_dax_notify_failure_freeze(
+       struct xfs_mount        *mp)
+{
+       struct super_block      *sb = mp->m_super;
+       int                     error;
+
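+       /* freeze_super() errors out (e.g. -EBUSY) if the fs is already frozen. */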
+       error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
+       if (error)
+               xfs_emerg(mp, "already frozen by kernel, err=%d", error);
+
+       return error;
+}
+
+static void
+xfs_dax_notify_failure_thaw(
+       struct xfs_mount        *mp,
+       bool                    kernel_frozen)
+{
+       struct super_block      *sb = mp->m_super;
+       int                     error;
+
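+       /* Thaw the kernel freeze only if this failure path acquired it. */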
+       if (kernel_frozen) {
+               error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
+               if (error)
+                       xfs_emerg(mp, "still frozen after notify failure, err=%d",
+                               error);
+       }
+
+       /*
+        * Also thaw the userspace freeze unconditionally, because the device
+        * is about to be removed immediately.
+        */
+       thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+}
+
 static int
 xfs_dax_notify_ddev_failure(
        struct xfs_mount        *mp,
        struct xfs_btree_cur    *cur = NULL;
        struct xfs_buf          *agf_bp = NULL;
        int                     error = 0;
+       bool                    kernel_frozen = false;
        xfs_fsblock_t           fsbno = XFS_DADDR_TO_FSB(mp, daddr);
        xfs_agnumber_t          agno = XFS_FSB_TO_AGNO(mp, fsbno);
        xfs_fsblock_t           end_fsbno = XFS_DADDR_TO_FSB(mp,
                                                             daddr + bblen - 1);
        xfs_agnumber_t          end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
 
+       if (mf_flags & MF_MEM_PRE_REMOVE) {
+               xfs_info(mp, "Device is about to be removed!");
+               /*
+                * Freeze the fs to prevent new mappings from being created.
+                * - Keep going if someone else already holds the kernel freeze.
+                * - Keep going on other errors too, because this device is
+                *   starting to fail.
+                * - If the kernel freeze is taken successfully here, thaw it
+                *   at the end as well.
+                */
+               kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
+       }
+
        error = xfs_trans_alloc_empty(mp, &tp);
        if (error)
-               return error;
+               goto out;
 
        for (; agno <= end_agno; agno++) {
                struct xfs_rmap_irec    ri_low = { };
        }
 
        xfs_trans_cancel(tp);
-       if (error || notify.want_shutdown) {
+
+       /*
+        * Shut down the filesystem with a force unmount in the pre-remove
+        * case, which cannot fail, so errors can be ignored.  Otherwise,
+        * shut down the filesystem with the CORRUPT flag if an error
+        * occurred or notify.want_shutdown was set during the RMAP query.
+        */
+       if (mf_flags & MF_MEM_PRE_REMOVE)
+               xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+       else if (error || notify.want_shutdown) {
                xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
                if (!error)
                        error = -EFSCORRUPTED;
        }
+
+out:
+       /* Thaw the fs if it was frozen above. */
+       if (mf_flags & MF_MEM_PRE_REMOVE)
+               xfs_dax_notify_failure_thaw(mp, kernel_frozen);
+
        return error;
 }
 
 
        if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
            mp->m_logdev_targp != mp->m_ddev_targp) {
+               /*
+                * In the pre-remove case the failure notification is attempting
+                * to trigger a force unmount.  The expectation is that the
+                * device is still present, but its removal is in progress and
+                * cannot be cancelled; proceed with accessing the log device.
+                */
+               if (mf_flags & MF_MEM_PRE_REMOVE)
+                       return 0;
                xfs_err(mp, "ondisk log corrupt, shutting down fs!");
                xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
                return -EFSCORRUPTED;
        ddev_start = mp->m_ddev_targp->bt_dax_part_off;
        ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
 
+       /* Notify failure on the whole device. */
+       if (offset == 0 && len == U64_MAX) {
+               offset = ddev_start;
+               len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
+       }
+
        /* Ignore the range out of filesystem area */
        if (offset + len - 1 < ddev_start)
                return -ENXIO;
 
  */
 static void collect_procs_fsdax(struct page *page,
                struct address_space *mapping, pgoff_t pgoff,
-               struct list_head *to_kill)
+               struct list_head *to_kill, bool pre_remove)
 {
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        i_mmap_lock_read(mapping);
        rcu_read_lock();
        for_each_process(tsk) {
-               struct task_struct *t = task_early_kill(tsk, true);
+               struct task_struct *t = tsk;
 
+               /*
+                * Search for all tasks while MF_MEM_PRE_REMOVE is set, because
+                * the current process may not be the one accessing the fsdax
+                * page.  Otherwise, search only for the current task.
+                */
+               if (!pre_remove)
+                       t = task_early_kill(tsk, true);
                if (!t)
                        continue;
                vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
        dax_entry_t cookie;
        struct page *page;
        size_t end = index + count;
+       bool pre_remove = mf_flags & MF_MEM_PRE_REMOVE;
 
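+       /* Handle as an action-required failure and kill tasks mapping the range. */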
        mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
 
                if (!page)
                        goto unlock;
 
-               SetPageHWPoison(page);
+               if (!pre_remove)
+                       SetPageHWPoison(page);
 
-               collect_procs_fsdax(page, mapping, index, &to_kill);
+               /*
+                * The pre_remove case is revoking access; the memory is still
+                * good and could theoretically be put back into service.
+                */
+               collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
                unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
                                index, mf_flags);
 unlock: