]> www.infradead.org Git - users/hch/dma-mapping.git/commitdiff
xfs: log recovery should replay deferred ops in order
authorDarrick J. Wong <darrick.wong@oracle.com>
Wed, 22 Nov 2017 04:53:02 +0000 (20:53 -0800)
committerDarrick J. Wong <darrick.wong@oracle.com>
Mon, 27 Nov 2017 17:34:08 +0000 (09:34 -0800)
As part of testing log recovery with dm_log_writes, Amir Goldstein
discovered an error in the deferred ops recovery that lead to corruption
of the filesystem metadata if a reflink+rmap filesystem happened to shut
down midway through a CoW remap:

"This is what happens [after failed log recovery]:

"Phase 1 - find and verify superblock...
"Phase 2 - using internal log
"        - zero log...
"        - scan filesystem freespace and inode maps...
"        - found root inode chunk
"Phase 3 - for each AG...
"        - scan (but don't clear) agi unlinked lists...
"        - process known inodes and perform inode discovery...
"        - agno = 0
"data fork in regular inode 134 claims CoW block 376
"correcting nextents for inode 134
"bad data fork in inode 134
"would have cleared inode 134"

Hou Tao dissected the log contents of exactly such a crash:

"According to the implementation of xfs_defer_finish(), these ops should
be completed in the following sequence:

"Have been done:
"(1) CUI: Oper (160)
"(2) BUI: Oper (161)
"(3) CUD: Oper (194), for CUI Oper (160)
"(4) RUI A: Oper (197), free rmap [0x155, 2, -9]

"Should be done:
"(5) BUD: for BUI Oper (161)
"(6) RUI B: add rmap [0x155, 2, 137]
"(7) RUD: for RUI A
"(8) RUD: for RUI B

"Actually be done by xlog_recover_process_intents()
"(5) BUD: for BUI Oper (161)
"(6) RUI B: add rmap [0x155, 2, 137]
"(7) RUD: for RUI B
"(8) RUD: for RUI A

"So the rmap entry [0x155, 2, -9] for COW should be freed firstly,
then a new rmap entry [0x155, 2, 137] will be added. However, as we can see
from the log record in post_mount.log (generated after umount) and the trace
print, the new rmap entry [0x155, 2, 137] are added firstly, then the rmap
entry [0x155, 2, -9] are freed."

When reconstructing the internal log state from the log items found on
disk, it's required that deferred ops replay in exactly the same order
that they would have had the filesystem not gone down.  However,
replaying unfinished deferred ops can create /more/ deferred ops.  These
new deferred ops are finished in the wrong order.  This causes fs
corruption and replay crashes, so let's create a single defer_ops to
handle the subsequent ops created during replay, then use one single
transaction at the end of log recovery to ensure that everything is
replayed in the same order as they're supposed to be.

Reported-by: Amir Goldstein <amir73il@gmail.com>
Analyzed-by: Hou Tao <houtao1@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
fs/xfs/xfs_bmap_item.c
fs/xfs/xfs_bmap_item.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_refcount_item.c
fs/xfs/xfs_refcount_item.h

index dd136f7275e4a2efede5a7612de243ddd1349abf..e5fb008d75e899aaa26948ae5044e55e59ca6f11 100644 (file)
@@ -389,7 +389,8 @@ xfs_bud_init(
 int
 xfs_bui_recover(
        struct xfs_mount                *mp,
-       struct xfs_bui_log_item         *buip)
+       struct xfs_bui_log_item         *buip,
+       struct xfs_defer_ops            *dfops)
 {
        int                             error = 0;
        unsigned int                    bui_type;
@@ -404,9 +405,7 @@ xfs_bui_recover(
        xfs_exntst_t                    state;
        struct xfs_trans                *tp;
        struct xfs_inode                *ip = NULL;
-       struct xfs_defer_ops            dfops;
        struct xfs_bmbt_irec            irec;
-       xfs_fsblock_t                   firstfsb;
 
        ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
 
@@ -464,7 +463,6 @@ xfs_bui_recover(
 
        if (VFS_I(ip)->i_nlink == 0)
                xfs_iflags_set(ip, XFS_IRECOVERY);
-       xfs_defer_init(&dfops, &firstfsb);
 
        /* Process deferred bmap item. */
        state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
@@ -479,16 +477,16 @@ xfs_bui_recover(
                break;
        default:
                error = -EFSCORRUPTED;
-               goto err_dfops;
+               goto err_inode;
        }
        xfs_trans_ijoin(tp, ip, 0);
 
        count = bmap->me_len;
-       error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
+       error = xfs_trans_log_finish_bmap_update(tp, budp, dfops, type,
                        ip, whichfork, bmap->me_startoff,
                        bmap->me_startblock, &count, state);
        if (error)
-               goto err_dfops;
+               goto err_inode;
 
        if (count > 0) {
                ASSERT(type == XFS_BMAP_UNMAP);
@@ -496,16 +494,11 @@ xfs_bui_recover(
                irec.br_blockcount = count;
                irec.br_startoff = bmap->me_startoff;
                irec.br_state = state;
-               error = xfs_bmap_unmap_extent(tp->t_mountp, &dfops, ip, &irec);
+               error = xfs_bmap_unmap_extent(tp->t_mountp, dfops, ip, &irec);
                if (error)
-                       goto err_dfops;
+                       goto err_inode;
        }
 
-       /* Finish transaction, free inodes. */
-       error = xfs_defer_finish(&tp, &dfops);
-       if (error)
-               goto err_dfops;
-
        set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
        error = xfs_trans_commit(tp);
        xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -513,8 +506,6 @@ xfs_bui_recover(
 
        return error;
 
-err_dfops:
-       xfs_defer_cancel(&dfops);
 err_inode:
        xfs_trans_cancel(tp);
        if (ip) {
index c867daae4a3ce54c97055e133b478dd3baf6db61..24b354a2c83641487acfeb76a336a6476b9b98b3 100644 (file)
@@ -93,6 +93,7 @@ struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *,
                struct xfs_bui_log_item *);
 void xfs_bui_item_free(struct xfs_bui_log_item *);
 void xfs_bui_release(struct xfs_bui_log_item *);
-int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip);
+int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip,
+               struct xfs_defer_ops *dfops);
 
 #endif /* __XFS_BMAP_ITEM_H__ */
index 87b1c331f9ebfb7cefb708adc47b55890bf7ab9d..28d1abfe835eef3e9d87f7da1c7c805fef0488f4 100644 (file)
@@ -24,6 +24,7 @@
 #include "xfs_bit.h"
 #include "xfs_sb.h"
 #include "xfs_mount.h"
+#include "xfs_defer.h"
 #include "xfs_da_format.h"
 #include "xfs_da_btree.h"
 #include "xfs_inode.h"
@@ -4716,7 +4717,8 @@ STATIC int
 xlog_recover_process_cui(
        struct xfs_mount                *mp,
        struct xfs_ail                  *ailp,
-       struct xfs_log_item             *lip)
+       struct xfs_log_item             *lip,
+       struct xfs_defer_ops            *dfops)
 {
        struct xfs_cui_log_item         *cuip;
        int                             error;
@@ -4729,7 +4731,7 @@ xlog_recover_process_cui(
                return 0;
 
        spin_unlock(&ailp->xa_lock);
-       error = xfs_cui_recover(mp, cuip);
+       error = xfs_cui_recover(mp, cuip, dfops);
        spin_lock(&ailp->xa_lock);
 
        return error;
@@ -4756,7 +4758,8 @@ STATIC int
 xlog_recover_process_bui(
        struct xfs_mount                *mp,
        struct xfs_ail                  *ailp,
-       struct xfs_log_item             *lip)
+       struct xfs_log_item             *lip,
+       struct xfs_defer_ops            *dfops)
 {
        struct xfs_bui_log_item         *buip;
        int                             error;
@@ -4769,7 +4772,7 @@ xlog_recover_process_bui(
                return 0;
 
        spin_unlock(&ailp->xa_lock);
-       error = xfs_bui_recover(mp, buip);
+       error = xfs_bui_recover(mp, buip, dfops);
        spin_lock(&ailp->xa_lock);
 
        return error;
@@ -4805,6 +4808,46 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
        }
 }
 
+/* Take all the collected deferred ops and finish them in order. */
+static int
+xlog_finish_defer_ops(
+       struct xfs_mount        *mp,
+       struct xfs_defer_ops    *dfops)
+{
+       struct xfs_trans        *tp;
+       int64_t                 freeblks;
+       uint                    resblks;
+       int                     error;
+
+       /*
+        * We're finishing the defer_ops that accumulated as a result of
+        * recovering unfinished intent items during log recovery.  We
+        * reserve an itruncate transaction because it is the largest
+        * permanent transaction type.  Since we're the only user of the fs
+        * right now, take 93% (15/16) of the available free blocks.  Use
+        * weird math to avoid a 64-bit division.
+        */
+       freeblks = percpu_counter_sum(&mp->m_fdblocks);
+       if (freeblks <= 0)
+               return -ENOSPC;
+       resblks = min_t(int64_t, UINT_MAX, freeblks);
+       resblks = (resblks * 15) >> 4;
+       error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
+                       0, XFS_TRANS_RESERVE, &tp);
+       if (error)
+               return error;
+
+       error = xfs_defer_finish(&tp, dfops);
+       if (error)
+               goto out_cancel;
+
+       return xfs_trans_commit(tp);
+
+out_cancel:
+       xfs_trans_cancel(tp);
+       return error;
+}
+
 /*
  * When this is called, all of the log intent items which did not have
  * corresponding log done items should be in the AIL.  What we do now
@@ -4825,10 +4868,12 @@ STATIC int
 xlog_recover_process_intents(
        struct xlog             *log)
 {
-       struct xfs_log_item     *lip;
-       int                     error = 0;
+       struct xfs_defer_ops    dfops;
        struct xfs_ail_cursor   cur;
+       struct xfs_log_item     *lip;
        struct xfs_ail          *ailp;
+       xfs_fsblock_t           firstfsb;
+       int                     error = 0;
 #if defined(DEBUG) || defined(XFS_WARN)
        xfs_lsn_t               last_lsn;
 #endif
@@ -4839,6 +4884,7 @@ xlog_recover_process_intents(
 #if defined(DEBUG) || defined(XFS_WARN)
        last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
 #endif
+       xfs_defer_init(&dfops, &firstfsb);
        while (lip != NULL) {
                /*
                 * We're done when we see something other than an intent.
@@ -4859,6 +4905,12 @@ xlog_recover_process_intents(
                 */
                ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
 
+               /*
+                * NOTE: If your intent processing routine can create more
+                * deferred ops, you /must/ attach them to the dfops in this
+                * routine or else those subsequent intents will get
+                * replayed in the wrong order!
+                */
                switch (lip->li_type) {
                case XFS_LI_EFI:
                        error = xlog_recover_process_efi(log->l_mp, ailp, lip);
@@ -4867,10 +4919,12 @@ xlog_recover_process_intents(
                        error = xlog_recover_process_rui(log->l_mp, ailp, lip);
                        break;
                case XFS_LI_CUI:
-                       error = xlog_recover_process_cui(log->l_mp, ailp, lip);
+                       error = xlog_recover_process_cui(log->l_mp, ailp, lip,
+                                       &dfops);
                        break;
                case XFS_LI_BUI:
-                       error = xlog_recover_process_bui(log->l_mp, ailp, lip);
+                       error = xlog_recover_process_bui(log->l_mp, ailp, lip,
+                                       &dfops);
                        break;
                }
                if (error)
@@ -4880,6 +4934,11 @@ xlog_recover_process_intents(
 out:
        xfs_trans_ail_cursor_done(&cur);
        spin_unlock(&ailp->xa_lock);
+       if (error)
+               xfs_defer_cancel(&dfops);
+       else
+               error = xlog_finish_defer_ops(log->l_mp, &dfops);
+
        return error;
 }
 
index 8f2e2fac4255d63fc1dab2d268433e099aaf5aa8..3a55d6fc271b1e6d50aa6ce96c9e84b7c5886e43 100644 (file)
@@ -393,7 +393,8 @@ xfs_cud_init(
 int
 xfs_cui_recover(
        struct xfs_mount                *mp,
-       struct xfs_cui_log_item         *cuip)
+       struct xfs_cui_log_item         *cuip,
+       struct xfs_defer_ops            *dfops)
 {
        int                             i;
        int                             error = 0;
@@ -405,11 +406,9 @@ xfs_cui_recover(
        struct xfs_trans                *tp;
        struct xfs_btree_cur            *rcur = NULL;
        enum xfs_refcount_intent_type   type;
-       xfs_fsblock_t                   firstfsb;
        xfs_fsblock_t                   new_fsb;
        xfs_extlen_t                    new_len;
        struct xfs_bmbt_irec            irec;
-       struct xfs_defer_ops            dfops;
        bool                            requeue_only = false;
 
        ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags));
@@ -465,7 +464,6 @@ xfs_cui_recover(
                return error;
        cudp = xfs_trans_get_cud(tp, cuip);
 
-       xfs_defer_init(&dfops, &firstfsb);
        for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
                refc = &cuip->cui_format.cui_extents[i];
                refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
@@ -485,7 +483,7 @@ xfs_cui_recover(
                        new_len = refc->pe_len;
                } else
                        error = xfs_trans_log_finish_refcount_update(tp, cudp,
-                               &dfops, type, refc->pe_startblock, refc->pe_len,
+                               dfops, type, refc->pe_startblock, refc->pe_len,
                                &new_fsb, &new_len, &rcur);
                if (error)
                        goto abort_error;
@@ -497,21 +495,21 @@ xfs_cui_recover(
                        switch (type) {
                        case XFS_REFCOUNT_INCREASE:
                                error = xfs_refcount_increase_extent(
-                                               tp->t_mountp, &dfops, &irec);
+                                               tp->t_mountp, dfops, &irec);
                                break;
                        case XFS_REFCOUNT_DECREASE:
                                error = xfs_refcount_decrease_extent(
-                                               tp->t_mountp, &dfops, &irec);
+                                               tp->t_mountp, dfops, &irec);
                                break;
                        case XFS_REFCOUNT_ALLOC_COW:
                                error = xfs_refcount_alloc_cow_extent(
-                                               tp->t_mountp, &dfops,
+                                               tp->t_mountp, dfops,
                                                irec.br_startblock,
                                                irec.br_blockcount);
                                break;
                        case XFS_REFCOUNT_FREE_COW:
                                error = xfs_refcount_free_cow_extent(
-                                               tp->t_mountp, &dfops,
+                                               tp->t_mountp, dfops,
                                                irec.br_startblock,
                                                irec.br_blockcount);
                                break;
@@ -525,17 +523,12 @@ xfs_cui_recover(
        }
 
        xfs_refcount_finish_one_cleanup(tp, rcur, error);
-       error = xfs_defer_finish(&tp, &dfops);
-       if (error)
-               goto abort_defer;
        set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
        error = xfs_trans_commit(tp);
        return error;
 
 abort_error:
        xfs_refcount_finish_one_cleanup(tp, rcur, error);
-abort_defer:
-       xfs_defer_cancel(&dfops);
        xfs_trans_cancel(tp);
        return error;
 }
index 5b74dddfa64be728f74e7dd1f731983c1523ad1a..0e5327349a13ee5921808ed866ac02acc038d0df 100644 (file)
@@ -96,6 +96,7 @@ struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *,
                struct xfs_cui_log_item *);
 void xfs_cui_item_free(struct xfs_cui_log_item *);
 void xfs_cui_release(struct xfs_cui_log_item *);
-int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip);
+int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip,
+               struct xfs_defer_ops *dfops);
 
 #endif /* __XFS_REFCOUNT_ITEM_H__ */