From 1ccbcd320577271c85d9a5bfbdd3394cb9baadb3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 17:04:08 -0500 Subject: [PATCH] bcachefs: bch2_write_op_error() now prints info about data update A user has been seeing the "error verifying existing checksum while rewriting existing data (memory corruption?)" error. This generally indicates a hardware issue (and that may be the case here), but it might also indicate a bug, in which case we need more information to look for patterns. Reported-by: Roland Vet Signed-off-by: Kent Overstreet --- fs/bcachefs/compress.c | 8 ++-- fs/bcachefs/error.c | 6 +++ fs/bcachefs/error.h | 1 + fs/bcachefs/io_write.c | 92 ++++++++++++++++++++++++++++-------------- fs/bcachefs/io_write.h | 8 +++- 5 files changed, 80 insertions(+), 35 deletions(-) diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 114bf2f3879f..31467f77930f 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -271,8 +271,8 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || crc->compressed_size << 9 > c->opts.encoded_extent_max) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error rewriting existing data: extent too big"); + bch2_write_op_error(&buf, op, op->pos.offset, + "extent too big to decompress"); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); return -EIO; @@ -283,8 +283,8 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, if (__bio_uncompress(c, bio, data.b, *crc)) { if (!c->opts.no_data_io) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error rewriting existing data: decompression error"); + bch2_write_op_error(&buf, op, op->pos.offset, + "decompression error"); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index c8fc58fab958..3f93a5a6bbfa 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -580,3 +580,9 @@ int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printb prt_printf(out, " offset %llu: ", pos.offset << 8); return 0; } + +void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out, + struct bpos pos) +{ + bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 76da0e88cee8..b3cc69f29fd9 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -243,5 +243,6 @@ int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subv void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); +void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos); #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 076e39474610..738bdbfbdb14 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -396,29 +396,61 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ -static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, - u64 offset) +void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_write_op *op, u64 offset, const char *fmt, ...) { - bch2_inum_offset_err_msg(op->c, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_move ? "(internal move)" : ""); -} + if (op->subvol) + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9)); + else { + struct bpos pos = op->pos; + pos.offset = offset; + lockrestart_do(trans, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); + } -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) -{ - __bch2_write_op_error(out, op, op->pos.offset); + prt_str(out, "write error: "); + + va_list args; + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); + + if (op->flags & BCH_WRITE_move) { + struct data_update *u = container_of(op, struct data_update, op); + + prt_printf(out, "\n from internal move "); + bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k)); + } } -static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_write_op *op, u64 offset) +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64 offset, + const char *fmt, ...) { - bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_move ? "(internal move)" : ""); + if (op->subvol) + bch2_inum_offset_err_msg(op->c, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + else { + struct bpos pos = op->pos; + pos.offset = offset; + bch2_inum_snap_offset_err_msg(op->c, out, pos); + } + + prt_str(out, "write error: "); + + va_list args; + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); + + if (op->flags & BCH_WRITE_move) { + struct data_update *u = container_of(op, struct data_update, op); + + prt_printf(out, "\n from internal move "); + bch2_bkey_val_to_text(out, op->c, bkey_i_to_s_c(u->k.k)); + } } void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -561,8 +593,8 @@ static void __bch2_write_index(struct bch_write_op *op) struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); struct printbuf buf = PRINTBUF; - __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); - prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } @@ -1114,8 +1146,8 @@ do_write: csum_err: { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + bch2_write_op_error(&buf, op, op->pos.offset, + "error verifying existing checksum while rewriting existing data (memory corruption?)"); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } @@ -1211,8 +1243,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); struct printbuf buf = PRINTBUF; - bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); - prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } @@ -1379,8 +1411,8 @@ err: if (ret) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); + bch2_write_op_error(&buf, op, op->pos.offset, + "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); op->error = ret; @@ -1502,8 +1534,8 @@ err: if (unlikely(ret < 0)) { if (!(op->flags & BCH_WRITE_alloc_nowait)) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); + bch2_write_op_error(&buf, op, op->pos.offset, + "%s(): %s", __func__, bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } @@ -1634,8 +1666,8 @@ CLOSURE_CALLBACK(bch2_write) if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "misaligned write"); + bch2_write_op_error(&buf, op, op->pos.offset, + "misaligned write"); printbuf_exit(&buf); op->error = -EIO; goto err; diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index 02cca52be0bd..bf942566a8eb 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -20,7 +20,13 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); +__printf(5, 6) +void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_write_op *op, u64, const char *, ...); + +__printf(4, 5) +void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, u64, + const char *, ...); #define BCH_WRITE_FLAGS() \ x(alloc_nowait) \ -- 2.50.1