xfs_scrub: retest metadata across scrub groups after a repair

author Darrick J. Wong <djwong@kernel.org>

Wed, 3 Jul 2024 21:22:22 +0000 (14:22 -0700)

committer Christoph Hellwig <hch@lst.de>

Sun, 11 Aug 2024 06:35:55 +0000 (08:35 +0200)
author Darrick J. Wong <djwong@kernel.org>
Wed, 3 Jul 2024 21:22:22 +0000 (14:22 -0700)
committer Christoph Hellwig <hch@lst.de>
Sun, 11 Aug 2024 06:35:55 +0000 (08:35 +0200)
diff --git a/scrub/phase4.c b/scrub/phase4.c

index 88cb53aeac9076359477b3fa0bd1fd947b8bd8b9..c58e4aaabda639701c574bd7a83de8f1be22d551 100644 (file)
--- a/scrub/phase4.c
+++ b/scrub/phase4.c
@@ -42,6 +42,51 @@ struct repair_list_schedule {
         bool                            made_progress;
  };
  
+/*
+ * After a successful repair, schedule any additional revalidations needed in
+ * other scrub groups.  Called with rls->lock held; drops it during rechecks.
+ * Sets rls->aborted and returns the error code on failure, otherwise 0.
+ */
+static int
+revalidate_across_groups(
+       struct scrub_ctx                *ctx,
+       const struct action_item        *old_aitem,
+       struct repair_list_schedule     *rls)
+{
+       struct action_list              alist;
+       int                             error;
+
+       action_list_init(&alist);
+
+       error = action_item_schedule_revalidation(ctx, old_aitem, &alist);
+       if (error)
+               goto out_requeue;
+
+       if (action_list_empty(&alist))
+               return 0;
+
+       pthread_mutex_unlock(&rls->lock);
+       error = action_list_revalidate(ctx, &alist);
+       pthread_mutex_lock(&rls->lock);
+
+out_requeue:
+       /*
+        * Action items still attached to @alist at this point are either
+        * the result of finding new inconsistencies or an incomplete list
+        * after an operational error.  In the first case we need these new
+        * items to be processed; in the second case, we're going to exit the
+        * process.  Either way, pass the items back to the caller.
+        */
+       action_list_merge(&rls->requeue_list, &alist);
+
+       if (error) {
+               rls->aborted = true;
+               return error;
+       }
+
+       return 0;
+}
+
  /* Try to repair as many things on our list as we can. */
  static void
  repair_list_worker(
@@ -89,9 +134,16 @@ repair_list_worker(
                         action_list_add(&rls->requeue_list, aitem);
                         break;
                 case TR_REPAIRED:
+                       ret = revalidate_across_groups(ctx, aitem, rls);
+                       if (ret) {
+                               free(aitem);
+                               break;
+                       }
+
                         /*
                          * All repairs for this item completed.  Free the item,
-                        * and remember that progress was made.
+                        * and remember that progress was made, even if group
+                        * revalidation uncovered more issues.
                          */
                         rls->made_progress = true;
                         free(aitem);
diff --git a/scrub/repair.c b/scrub/repair.c

index e6906cbd37d0b32936ef23dc5c27bdef8605aeff..c2ef7b87685fc7a3dde739e223c473ab9ae0e721 100644 (file)
--- a/scrub/repair.c
+++ b/scrub/repair.c
@@ -43,6 +43,15 @@ static const unsigned int repair_deps[XFS_SCRUB_TYPE_NR] = {
                                           DEP(XFS_SCRUB_TYPE_PQUOTA),
         [XFS_SCRUB_TYPE_RTSUM]          = DEP(XFS_SCRUB_TYPE_RTBITMAP),
  };
+
+/*
+ * Data dependencies that cross scrub groups.  Indexed by the scrub type that
+ * was just repaired (e.g. rt bitmap); each entry is a bitmask of the scrub
+ * types to revalidate afterwards (e.g. rt summary).  Zero means no rechecks.
+ */
+static const unsigned int cross_group_recheck[XFS_SCRUB_TYPE_NR] = {
+       [XFS_SCRUB_TYPE_RTBITMAP]       = DEP(XFS_SCRUB_TYPE_RTSUM),
+};
  #undef DEP
  
  /*
@@ -632,6 +641,16 @@ action_list_add(
         list_add_tail(&aitem->list, &alist->list);
  }
  
+/* Unlink an action item from its current list and add it to @alist. */
+static void
+action_list_move(
+       struct action_list              *alist,
+       struct action_item              *aitem)
+{
+       list_del_init(&aitem->list);
+       action_list_add(alist, aitem);
+}
+
  /*
   * Try to repair a filesystem object and let the caller know what it should do
   * with the action item.  The caller must be able to requeue action items, so
@@ -895,3 +914,142 @@ repair_item_to_action_item(
         *aitemp = aitem;
         return 0;
  }
+
+/* Queue fs-group recheck items for every scrub type set in @recheck_mask. */
+static int
+schedule_cross_group_recheck(
+       struct scrub_ctx        *ctx,
+       unsigned int            recheck_mask,
+       struct action_list      *new_items)
+{
+       unsigned int            scrub_type;
+
+       foreach_scrub_type(scrub_type) {
+               struct action_item      *aitem;
+
+               if (!(recheck_mask & (1ULL << scrub_type)))
+                       continue;
+
+               switch (xfrog_scrubbers[scrub_type].group) {
+               case XFROG_SCRUB_GROUP_FS:
+                       /*
+                        * XXX gcc fortify gets confused on the memset in
+                        * scrub_item_init_fs if we hoist this into a helper.
+                        */
+                       aitem = malloc(sizeof(*aitem));
+                       if (!aitem) {
+                               int     error = errno;
+
+                               str_liberror(ctx, error,
+                                               _("creating repair revalidation action item"));
+                               return error;
+                       }
+
+                       INIT_LIST_HEAD(&aitem->list);
+                       scrub_item_init_fs(&aitem->sri);
+                       /* Set the flag after init so the memset keeps it. */
+                       aitem->sri.sri_revalidate = true;
+                       scrub_item_schedule(&aitem->sri, scrub_type);
+                       action_list_add(new_items, aitem);
+                       break;
+               default:
+                       /* We don't support any other groups yet. */
+                       assert(false);
+                       continue;
+               }
+       }
+
+       return 0;
+}
+
+/*
+ * Called once the repair described by @old_aitem has succeeded.  For every
+ * scrub type selected in that item, look up the metadata in other scrub
+ * groups that depends on it and check it again.  Objects that turn out to
+ * need repair are moved to @new_repairs with the revalidate flag cleared;
+ * clean ones are freed.  Returns 0 or the first error that was hit.
+ */
+int
+action_item_schedule_revalidation(
+       struct scrub_ctx                *ctx,
+       const struct action_item        *old_aitem,
+       struct action_list              *new_repairs)
+{
+       struct action_list              rechecks;
+       struct action_item              *cur, *next;
+       unsigned int                    type;
+       int                             error = 0;
+
+       /* Build the list of cross-group items that want another look. */
+       action_list_init(&rechecks);
+       foreach_scrub_type(type) {
+               unsigned int            deps = cross_group_recheck[type];
+
+               if (!deps || !(old_aitem->sri.sri_selected & (1ULL << type)))
+                       continue;
+
+               error = schedule_cross_group_recheck(ctx, deps, &rechecks);
+               if (error)
+                       goto out_free;
+       }
+       if (action_list_empty(&rechecks))
+               return 0;
+
+       /* Check each one; hand the broken ones to the caller's list. */
+       list_for_each_entry_safe(cur, next, &rechecks.list, list) {
+               unsigned int    nr_bad;
+
+               error = scrub_item_check(ctx, &cur->sri);
+               if (error)
+                       goto out_free;
+
+               nr_bad = repair_item_count_needsrepair(&cur->sri);
+               if (!nr_bad)
+                       continue;
+
+               /*
+                * More damage turned up.  This is now a regular repair item.
+                */
+               cur->sri.sri_revalidate = false;
+               action_list_move(new_repairs, cur);
+       }
+
+out_free:
+       /* Whatever is left was clean or never got checked; discard it. */
+       list_for_each_entry_safe(cur, next, &rechecks.list, list) {
+               list_del(&cur->list);
+               free(cur);
+       }
+
+       return error;
+}
+
+/*
+ * Rescan every item on @alist.  Clean items are unlinked and freed; items
+ * that still need repair stay on the list with the revalidate flag cleared.
+ * Stops at the first scrub error and returns it; otherwise returns 0.
+ */
+int
+action_list_revalidate(
+       struct scrub_ctx        *ctx,
+       struct action_list      *alist)
+{
+       struct action_item      *cur, *next;
+
+       list_for_each_entry_safe(cur, next, &alist->list, list) {
+               int             ret = scrub_item_check(ctx, &cur->sri);
+
+               if (ret)
+                       return ret;
+
+               if (repair_item_count_needsrepair(&cur->sri) > 0) {
+                       /* Still broken; leave it on the list. */
+                       cur->sri.sri_revalidate = false;
+               } else {
+                       list_del(&cur->list);
+                       free(cur);
+               }
+       }
+
+       return 0;
+}
diff --git a/scrub/repair.h b/scrub/repair.h

index ec4aa381a82d36a490d92e7cfe4ed6ff919f4cb5..96f621f124d7e8955359553a76b219380390d4af 100644 (file)
--- a/scrub/repair.h
+++ b/scrub/repair.h
@@ -50,6 +50,11 @@ enum tryrepair_outcome {
  int action_item_try_repair(struct scrub_ctx *ctx, struct action_item *aitem,
                 enum tryrepair_outcome *outcome);
  
+int action_item_schedule_revalidation(struct scrub_ctx *ctx,
+               const struct action_item *old_aitem,
+               struct action_list *new_repairs);
+int action_list_revalidate(struct scrub_ctx *ctx, struct action_list *alist);
+
  void repair_item_mustfix(struct scrub_item *sri, struct scrub_item *fix_now);
  
  /* Primary metadata is corrupt */
author	Darrick J. Wong <djwong@kernel.org>
	Wed, 3 Jul 2024 21:22:22 +0000 (14:22 -0700)
committer	Christoph Hellwig <hch@lst.de>
	Sun, 11 Aug 2024 06:35:55 +0000 (08:35 +0200)
scrub/phase4.c		patch \| blob \| history
scrub/repair.c		patch \| blob \| history
scrub/repair.h		patch \| blob \| history