www.infradead.org Git - users/hch/misc.git/commitdiff
bcachefs: bch2_write_op_error() now prints info about data update
author: Kent Overstreet <kent.overstreet@linux.dev>
Mon, 10 Feb 2025 22:04:08 +0000 (17:04 -0500)
committer: Kent Overstreet <kent.overstreet@linux.dev>
Sat, 15 Mar 2025 01:02:14 +0000 (21:02 -0400)
A user has been seeing the "error verifying existing checksum while
rewriting existing data (memory corruption?)" error.

This generally indicates a hardware issue (and that may be the case
here), but it might also indicate a bug, in which case we need more
information to look for patterns.

Reported-by: Roland Vet <vet.roland@protonmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/compress.c
fs/bcachefs/error.c
fs/bcachefs/error.h
fs/bcachefs/io_write.c
fs/bcachefs/io_write.h

index 114bf2f3879f677a1d2952dd571ab60d214ddc77..31467f77930f859a0991f105e16cc6f1f8d2441e 100644 (file)
@@ -271,8 +271,8 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
        if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max ||
            crc->compressed_size << 9   > c->opts.encoded_extent_max) {
                struct printbuf buf = PRINTBUF;
-               bch2_write_op_error(&buf, op);
-               prt_printf(&buf, "error rewriting existing data: extent too big");
+               bch2_write_op_error(&buf, op, op->pos.offset,
+                                   "extent too big to decompress");
                bch_err_ratelimited(c, "%s", buf.buf);
                printbuf_exit(&buf);
                return -EIO;
@@ -283,8 +283,8 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op,
        if (__bio_uncompress(c, bio, data.b, *crc)) {
                if (!c->opts.no_data_io) {
                        struct printbuf buf = PRINTBUF;
-                       bch2_write_op_error(&buf, op);
-                       prt_printf(&buf, "error rewriting existing data: decompression error");
+                       bch2_write_op_error(&buf, op, op->pos.offset,
+                                           "decompression error");
                        bch_err_ratelimited(c, "%s", buf.buf);
                        printbuf_exit(&buf);
                }
index c8fc58fab958cd680e5a779adbdd418a49595faf..3f93a5a6bbfa2ad7737ce85cac66e7de75ae81e9 100644 (file)
@@ -580,3 +580,9 @@ int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printb
        prt_printf(out, " offset %llu: ", pos.offset << 8);
        return 0;
 }
+
+void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out,
+                                 struct bpos pos)
+{
+       bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
+}
index 76da0e88cee8b6b9e126d25ac63e43d311b09682..b3cc69f29fd99db74583a81fcc66dd3998a849cf 100644 (file)
@@ -243,5 +243,6 @@ int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subv
 void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64);
 
 int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos);
+void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos);
 
 #endif /* _BCACHEFS_ERROR_H */
index 076e39474610a22ee8a11e95b9b0ebef761d3716..738bdbfbdb149bf886fd7e7c5b6c6d33dabb2cf5 100644 (file)
@@ -396,29 +396,61 @@ static int bch2_write_index_default(struct bch_write_op *op)
 
 /* Writes */
 
-static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op,
-                                 u64 offset)
+void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
+                              struct bch_write_op *op, u64 offset, const char *fmt, ...)
 {
-       bch2_inum_offset_err_msg(op->c, out,
-                                (subvol_inum) { op->subvol, op->pos.inode, },
-                                offset << 9);
-       prt_printf(out, "write error%s: ",
-                  op->flags & BCH_WRITE_move ? "(internal move)" : "");
-}
+       if (op->subvol)
+               lockrestart_do(trans,
+                       bch2_inum_offset_err_msg_trans(trans, out,
+                                                      (subvol_inum) { op->subvol, op->pos.inode, },
+                                                      offset << 9));
+       else {
+               struct bpos pos = op->pos;
+               pos.offset = offset;
+               lockrestart_do(trans, bch2_inum_snap_offset_err_msg_trans(trans, out, pos));
+       }
 
-void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
-{
-       __bch2_write_op_error(out, op, op->pos.offset);
+       prt_str(out, "write error: ");
+
+       va_list args;
+       va_start(args, fmt);
+       prt_vprintf(out, fmt, args);
+       va_end(args);
+
+       if (op->flags & BCH_WRITE_move) {
+               struct data_update *u = container_of(op, struct data_update, op);
+
+               prt_printf(out, "\n  from internal move ");
+               bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k));
+       }
 }
 
-static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
-                                     struct bch_write_op *op, u64 offset)
+void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64 offset,
+                        const char *fmt, ...)
 {
-       bch2_inum_offset_err_msg_trans(trans, out,
-                                      (subvol_inum) { op->subvol, op->pos.inode, },
-                                      offset << 9);
-       prt_printf(out, "write error%s: ",
-                  op->flags & BCH_WRITE_move ? "(internal move)" : "");
+       if (op->subvol)
+               bch2_inum_offset_err_msg(op->c, out,
+                                        (subvol_inum) { op->subvol, op->pos.inode, },
+                                        offset << 9);
+       else {
+               struct bpos pos = op->pos;
+               pos.offset = offset;
+               bch2_inum_snap_offset_err_msg(op->c, out, pos);
+       }
+
+       prt_str(out, "write error: ");
+
+       va_list args;
+       va_start(args, fmt);
+       prt_vprintf(out, fmt, args);
+       va_end(args);
+
+       if (op->flags & BCH_WRITE_move) {
+               struct data_update *u = container_of(op, struct data_update, op);
+
+               prt_printf(out, "\n  from internal move ");
+               bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k));
+       }
 }
 
 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
@@ -561,8 +593,8 @@ static void __bch2_write_index(struct bch_write_op *op)
                        struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
 
                        struct printbuf buf = PRINTBUF;
-                       __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
-                       prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
+                       bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k),
+                                           "btree update error: %s", bch2_err_str(ret));
                        bch_err_ratelimited(c, "%s", buf.buf);
                        printbuf_exit(&buf);
                }
@@ -1114,8 +1146,8 @@ do_write:
 csum_err:
        {
                struct printbuf buf = PRINTBUF;
-               bch2_write_op_error(&buf, op);
-               prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+               bch2_write_op_error(&buf, op, op->pos.offset,
+                                   "error verifying existing checksum while rewriting existing data (memory corruption?)");
                bch_err_ratelimited(c, "%s", buf.buf);
                printbuf_exit(&buf);
        }
@@ -1211,8 +1243,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
                        struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
 
                        struct printbuf buf = PRINTBUF;
-                       bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k));
-                       prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
+                       bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k),
+                                                 "btree update error: %s", bch2_err_str(ret));
                        bch_err_ratelimited(c, "%s", buf.buf);
                        printbuf_exit(&buf);
                }
@@ -1379,8 +1411,8 @@ err:
 
        if (ret) {
                struct printbuf buf = PRINTBUF;
-               bch2_write_op_error(&buf, op);
-               prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
+               bch2_write_op_error(&buf, op, op->pos.offset,
+                                   "%s(): btree lookup error: %s", __func__, bch2_err_str(ret));
                bch_err_ratelimited(c, "%s", buf.buf);
                printbuf_exit(&buf);
                op->error = ret;
@@ -1502,8 +1534,8 @@ err:
                        if (unlikely(ret < 0)) {
                                if (!(op->flags & BCH_WRITE_alloc_nowait)) {
                                        struct printbuf buf = PRINTBUF;
-                                       bch2_write_op_error(&buf, op);
-                                       prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret));
+                                       bch2_write_op_error(&buf, op, op->pos.offset,
+                                                           "%s(): %s", __func__, bch2_err_str(ret));
                                        bch_err_ratelimited(c, "%s", buf.buf);
                                        printbuf_exit(&buf);
                                }
@@ -1634,8 +1666,8 @@ CLOSURE_CALLBACK(bch2_write)
 
        if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) {
                struct printbuf buf = PRINTBUF;
-               bch2_write_op_error(&buf, op);
-               prt_printf(&buf, "misaligned write");
+               bch2_write_op_error(&buf, op, op->pos.offset,
+                                   "misaligned write");
                printbuf_exit(&buf);
                op->error = -EIO;
                goto err;
index 02cca52be0bdd98b381b447d664e4dafa6505944..bf942566a8eba8e1d18eb48467d441aac60c3805 100644 (file)
@@ -20,7 +20,13 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw
 void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
                               enum bch_data_type, const struct bkey_i *, bool);
 
-void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op);
+__printf(5, 6)
+void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
+                              struct bch_write_op *op, u64, const char *, ...);
+
+__printf(4, 5)
+void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64,
+                        const char *, ...);
 
 #define BCH_WRITE_FLAGS()              \
        x(alloc_nowait)                 \