www.infradead.org Git - users/jedix/linux-maple.git/commitdiff
bcachefs: Checksum errors get additional retries
author Kent Overstreet <kent.overstreet@linux.dev>
Sat, 8 Mar 2025 17:56:43 +0000 (12:56 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 16 Mar 2025 17:47:55 +0000 (13:47 -0400)
It's possible for checksum errors to be transient - e.g. a flaky
controller or cable - thus we need additional retries (besides retrying
from different replicas) before we can definitively return an error.

This is particularly important for the next patch, which will allow the
data move path to move extents with checksum errors - we don't want to
accidentally introduce bitrot due to a transient error!

- bch2_bkey_pick_read_device() is substantially reworked, and
  bch2_dev_io_failures is expanded to record more information about the
  type of failure (i.e. number of checksum errors).

  It now returns an error code that describes more precisely the reason
  for the failure - checksum error, io error, or offline device, instead
  of the previous generic "insufficient devices". This is important for
  the next patches that add poisoning, as we only want to poison extents
  when we've got real checksum errors (or perhaps IO errors?) - not
  because a device was offline.

- Add a new option and superblock field for the number of checksum
  retries.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/bcachefs_format.h
fs/bcachefs/btree_io.c
fs/bcachefs/errcode.h
fs/bcachefs/extents.c
fs/bcachefs/extents.h
fs/bcachefs/extents_types.h
fs/bcachefs/io_read.c
fs/bcachefs/opts.h
fs/bcachefs/super-io.c

index 7a5b0d211a82a8eb829838540c0dd6dba8467b4f..e96d877670204896c33875ab915dabe7788c99c1 100644 (file)
@@ -842,6 +842,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS,    struct bch_sb, flags[3], 28, 29);
 LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62);
 LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63);
+/* one free bit */
 LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32);
 LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33);
 LE64_BITMASK(BCH_SB_NOCOW,             struct bch_sb, flags[4], 33, 34);
@@ -861,6 +862,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
                                        struct bch_sb, flags[5], 48, 64);
 LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6],  0,  4);
 LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6],  4, 14);
+LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20);
 
 static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
 {
index cd792fee7ee3ff51b328c018b9f5692b9869ebfe..6abc9f17ea3cbfb1ce15331dfbac87dd2776502a 100644 (file)
@@ -1355,7 +1355,7 @@ start:
                        percpu_ref_put(&ca->io_ref);
                rb->have_ioref = false;
 
-               bch2_mark_io_failure(&failed, &rb->pick);
+               bch2_mark_io_failure(&failed, &rb->pick, false);
 
                can_retry = bch2_bkey_pick_read_device(c,
                                bkey_i_to_s_c(&b->key),
index afa16d58041eb083fb6513379959c1a01341b3fd..493cae4efc3757ba0e00de624b0e9c910b6b2916 100644 (file)
        x(EIO,                          stripe_reconstruct)                     \
        x(EIO,                          key_type_error)                         \
        x(EIO,                          extent_poisened)                        \
-       x(EIO,                          no_device_to_read_from)                 \
        x(EIO,                          missing_indirect_extent)                \
        x(EIO,                          invalidate_stripe_to_dev)               \
        x(EIO,                          no_encryption_key)                      \
        x(EIO,                          ec_block_read)                          \
        x(EIO,                          ec_block_write)                         \
        x(EIO,                          data_read)                              \
+       x(BCH_ERR_data_read,            no_device_to_read_from)                 \
+       x(BCH_ERR_data_read,            data_read_io_err)                       \
+       x(BCH_ERR_data_read,            data_read_csum_err)                     \
        x(BCH_ERR_data_read,            data_read_retry)                        \
        x(BCH_ERR_data_read_retry,      data_read_retry_avoid)                  \
        x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline)         \
index 032cd0bda017da70c6b513b55febf18066fa428f..04946d9911f59703cbe3c0ea046428386535de2b 100644 (file)
@@ -58,7 +58,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
 }
 
 void bch2_mark_io_failure(struct bch_io_failures *failed,
-                         struct extent_ptr_decoded *p)
+                         struct extent_ptr_decoded *p,
+                         bool csum_error)
 {
        struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev);
 
@@ -66,17 +67,16 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
                BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
 
                f = &failed->devs[failed->nr++];
-               f->dev          = p->ptr.dev;
-               f->idx          = p->idx;
-               f->nr_failed    = 1;
-               f->nr_retries   = 0;
-       } else if (p->idx != f->idx) {
-               f->idx          = p->idx;
-               f->nr_failed    = 1;
-               f->nr_retries   = 0;
-       } else {
-               f->nr_failed++;
+               memset(f, 0, sizeof(*f));
+               f->dev = p->ptr.dev;
        }
+
+       if (p->do_ec_reconstruct)
+               f->failed_ec = true;
+       else if (!csum_error)
+               f->failed_io = true;
+       else
+               f->failed_csum_nr++;
 }
 
 static inline u64 dev_latency(struct bch_dev *ca)
@@ -94,37 +94,30 @@ static inline int dev_failed(struct bch_dev *ca)
  */
 static inline bool ptr_better(struct bch_fs *c,
                              const struct extent_ptr_decoded p1,
-                             const struct extent_ptr_decoded p2)
+                             u64 p1_latency,
+                             struct bch_dev *ca1,
+                             const struct extent_ptr_decoded p2,
+                             u64 p2_latency)
 {
-       if (likely(!p1.idx && !p2.idx)) {
-               struct bch_dev *ca1 = bch2_dev_rcu(c, p1.ptr.dev);
-               struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
-
-               int failed_delta = dev_failed(ca1) - dev_failed(ca2);
-
-               if (failed_delta)
-                       return failed_delta < 0;
+       struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev);
 
-               u64 l1 = dev_latency(ca1);
-               u64 l2 = dev_latency(ca2);
+       int failed_delta = dev_failed(ca1) - dev_failed(ca2);
+       if (unlikely(failed_delta))
+               return failed_delta < 0;
 
-               /*
-                * Square the latencies, to bias more in favor of the faster
-                * device - we never want to stop issuing reads to the slower
-                * device altogether, so that we can update our latency numbers:
-                */
-               l1 *= l1;
-               l2 *= l2;
+       if (unlikely(bch2_force_reconstruct_read))
+               return p1.do_ec_reconstruct > p2.do_ec_reconstruct;
 
-               /* Pick at random, biased in favor of the faster device: */
+       if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct))
+               return p1.do_ec_reconstruct < p2.do_ec_reconstruct;
 
-               return bch2_get_random_u64_below(l1 + l2) > l1;
-       }
+       int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr;
+       if (unlikely(crc_retry_delta))
+               return crc_retry_delta < 0;
 
-       if (bch2_force_reconstruct_read)
-               return p1.idx > p2.idx;
+       /* Pick at random, biased in favor of the faster device: */
 
-       return p1.idx < p2.idx;
+       return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency;
 }
 
 /*
@@ -133,73 +126,109 @@ static inline bool ptr_better(struct bch_fs *c,
  * other devices, it will still pick a pointer from avoid.
  */
 int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
-                             struct bch_io_failures *failed,
-                             struct extent_ptr_decoded *pick,
-                             int dev)
+                              struct bch_io_failures *failed,
+                              struct extent_ptr_decoded *pick,
+                              int dev)
 {
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       struct bch_dev_io_failures *f;
-       int ret = 0;
+       bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false;
+       bool have_dirty_ptrs = false, have_pick = false;
 
        if (k.k->type == KEY_TYPE_error)
                return -BCH_ERR_key_type_error;
 
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+
        if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned))
                return -BCH_ERR_extent_poisened;
 
        rcu_read_lock();
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+       u64 pick_latency;
+
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               have_dirty_ptrs |= !p.ptr.cached;
+
                /*
                 * Unwritten extent: no need to actually read, treat it as a
                 * hole and return 0s:
                 */
                if (p.ptr.unwritten) {
-                       ret = 0;
-                       break;
+                       rcu_read_unlock();
+                       return 0;
                }
 
                /* Are we being asked to read from a specific device? */
                if (dev >= 0 && p.ptr.dev != dev)
                        continue;
 
-               /*
-                * If there are any dirty pointers it's an error if we can't
-                * read:
-                */
-               if (!ret && !p.ptr.cached)
-                       ret = -BCH_ERR_no_device_to_read_from;
-
                struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev);
 
                if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)))
                        continue;
 
-               f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
-               if (f)
-                       p.idx = f->nr_failed < f->nr_retries
-                               ? f->idx
-                               : f->idx + 1;
+               struct bch_dev_io_failures *f =
+                       unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL;
+               if (unlikely(f)) {
+                       p.crc_retry_nr     = f->failed_csum_nr;
+                       p.has_ec          &= ~f->failed_ec;
 
-               if (!p.idx && (!ca || !bch2_dev_is_online(ca)))
-                       p.idx++;
+                       if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
+                               have_io_errors  |= f->failed_io;
+                               have_io_errors  |= f->failed_ec;
+                       }
+                       have_csum_errors        |= !!f->failed_csum_nr;
 
-               if (!p.idx && p.has_ec && bch2_force_reconstruct_read)
-                       p.idx++;
+                       if (p.has_ec && (f->failed_io || f->failed_csum_nr))
+                               p.do_ec_reconstruct = true;
+                       else if (f->failed_io ||
+                                f->failed_csum_nr > c->opts.checksum_err_retry_nr)
+                               continue;
+               }
 
-               if (p.idx > (unsigned) p.has_ec)
-                       continue;
+               have_missing_devs |= ca && !bch2_dev_is_online(ca);
 
-               if (ret > 0 && !ptr_better(c, p, *pick))
-                       continue;
+               if (!ca || !bch2_dev_is_online(ca)) {
+                       if (!p.has_ec)
+                               continue;
+                       p.do_ec_reconstruct = true;
+               }
 
-               *pick = p;
-               ret = 1;
+               if (bch2_force_reconstruct_read && p.has_ec)
+                       p.do_ec_reconstruct = true;
+
+               u64 p_latency = dev_latency(ca);
+               /*
+                * Square the latencies, to bias more in favor of the faster
+                * device - we never want to stop issuing reads to the slower
+                * device altogether, so that we can update our latency numbers:
+                */
+               p_latency *= p_latency;
+
+               if (!have_pick ||
+                   ptr_better(c,
+                              p, p_latency, ca,
+                              *pick, pick_latency)) {
+                       *pick = p;
+                       pick_latency = p_latency;
+                       have_pick = true;
+               }
        }
        rcu_read_unlock();
 
-       return ret;
+       if (have_pick)
+               return 1;
+       if (!have_dirty_ptrs)
+               return 0;
+       if (have_missing_devs)
+               return -BCH_ERR_no_device_to_read_from;
+       if (have_csum_errors)
+               return -BCH_ERR_data_read_csum_err;
+       if (have_io_errors)
+               return -BCH_ERR_data_read_io_err;
+
+       WARN_ONCE(1, "unhandled error case in %s\n", __func__);
+       return -EINVAL;
 }
 
 /* KEY_TYPE_btree_ptr: */
index c50c4f353bab2814d045d60ee4cf66da42bdd9ab..e78a39e7e18f9a1f294421361940d6dfee2df4c1 100644 (file)
@@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
 ({                                                                     \
        __label__ out;                                                  \
                                                                        \
-       (_ptr).idx      = 0;                                            \
-       (_ptr).has_ec   = false;                                        \
+       (_ptr).has_ec                   = false;                        \
+       (_ptr).do_ec_reconstruct        = false;                        \
+       (_ptr).crc_retry_nr             = 0;                            \
                                                                        \
        __bkey_extent_entry_for_each_from(_entry, _end, _entry)         \
                switch (__extent_entry_type(_entry)) {                  \
@@ -401,7 +402,7 @@ out:                                                                        \
 struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
                                                 unsigned);
 void bch2_mark_io_failure(struct bch_io_failures *,
-                         struct extent_ptr_decoded *);
+                         struct extent_ptr_decoded *, bool);
 int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
                               struct bch_io_failures *,
                               struct extent_ptr_decoded *, int);
index 43d6c341eccabb86be8c5395fef5b558bb52004a..e51529dca4c2f13f17626010d7df759e779c2c57 100644 (file)
@@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked {
 };
 
 struct extent_ptr_decoded {
-       unsigned                        idx;
        bool                            has_ec;
+       bool                            do_ec_reconstruct;
+       u8                              crc_retry_nr;
        struct bch_extent_crc_unpacked  crc;
        struct bch_extent_ptr           ptr;
        struct bch_extent_stripe_ptr    ec;
@@ -31,10 +32,10 @@ struct bch_io_failures {
        u8                      nr;
        struct bch_dev_io_failures {
                u8              dev;
-               u8              idx;
-               u8              nr_failed;
-               u8              nr_retries;
-       }                       devs[BCH_REPLICAS_MAX];
+               unsigned        failed_csum_nr:6,
+                               failed_io:1,
+                               failed_ec:1;
+       }                       devs[BCH_REPLICAS_MAX + 1];
 };
 
 #endif /* _BCACHEFS_EXTENTS_TYPES_H */
index 887e3c9ac181b49789a7133664fd9cb7a110de84..aecc645e651632b632db059cf85489f56b6d5b43 100644 (file)
@@ -487,7 +487,8 @@ static void bch2_rbio_retry(struct work_struct *work)
                     bvec_iter_sectors(rbio->bvec_iter));
 
        if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid))
-               bch2_mark_io_failure(&failed, &rbio->pick);
+               bch2_mark_io_failure(&failed, &rbio->pick,
+                                    rbio->ret == -BCH_ERR_data_read_retry_csum_err);
 
        if (!rbio->split) {
                rbio->bio.bi_status     = 0;
@@ -991,7 +992,7 @@ retry_pick:
            ca &&
            unlikely(dev_ptr_stale(ca, &pick.ptr))) {
                read_from_stale_dirty_pointer(trans, ca, k, pick.ptr);
-               bch2_mark_io_failure(failed, &pick);
+               bch2_mark_io_failure(failed, &pick, false);
                percpu_ref_put(&ca->io_ref);
                goto retry_pick;
        }
@@ -1154,7 +1155,7 @@ retry_pick:
        else
                bch2_trans_unlock_long(trans);
 
-       if (!rbio->pick.idx) {
+       if (likely(!rbio->pick.do_ec_reconstruct)) {
                if (unlikely(!rbio->have_ioref)) {
                        struct printbuf buf = PRINTBUF;
                        bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
@@ -1215,7 +1216,8 @@ out:
                rbio = bch2_rbio_free(rbio);
 
                if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid))
-                       bch2_mark_io_failure(failed, &pick);
+                       bch2_mark_io_failure(failed, &pick,
+                                       ret == -BCH_ERR_data_read_retry_csum_err);
 
                return ret;
        }
index afb89d318d24e891b1c6e9befbd582dbd2d266ca..baa9c11acb1af71bf7ad915c3a35fc7d7553bac2 100644 (file)
@@ -186,6 +186,11 @@ enum fsck_err_opts {
          OPT_STR(__bch2_csum_opts),                                    \
          BCH_SB_DATA_CSUM_TYPE,        BCH_CSUM_OPT_crc32c,            \
          NULL,         NULL)                                           \
+       x(checksum_err_retry_nr,        u8,                             \
+         OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
+         OPT_UINT(0, 32),                                              \
+         BCH_SB_CSUM_ERR_RETRY_NR,     3,                              \
+         NULL,         NULL)                                           \
        x(compression,                  u8,                             \
          OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,            \
          OPT_FN(bch2_opt_compression),                                 \
index ee32d043414aecaaf2f96ff5d088e36f9ddaba69..f87e3bf33ec0c50c0066cf1d8d5deb3be3969c19 100644 (file)
@@ -457,6 +457,10 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
 
                if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
                        SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
+
+               if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags &&
+                   !BCH_SB_CSUM_ERR_RETRY_NR(sb))
+                       SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3);
        }
 
 #ifdef __KERNEL__