From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Wed, 26 Feb 2025 23:44:23 +0000 (-0500)
Subject: bcachefs: Kick devices out after too many write IO errors
X-Git-Url: https://www.infradead.org/git/?a=commitdiff_plain;h=981e3801443f507d74e2dae5710452642c96e8e3;p=users%2Fjedix%2Flinux-maple.git

bcachefs: Kick devices out after too many write IO errors

We're improving our handling of write errors - we shouldn't write
degraded data just because a write failed once, we should retry it (on
other devices, if possible).

But for this to work, we need to kick devices out when they're only
returning errors - otherwise those retries will loop infinitely.

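This adds a configurable timeout, the write_error_timeout option (in
seconds, default 30) - if writes to a device keep failing for longer
than that, with no successful write in between, we'll set that device
read-only.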

In the future we should also implement more tracking and another knob
for an "allowed error rate", so that we can kick out drives that are
acting "unhealthy".

Another thing we'll want is a mechanism (likely in userspace) for
bringing a device back in after a transient error - perhaps a cable was
jiggled, or there was a controller reset.

After transient errors we also need a mechanism to walk (from the
journal) recent btree updates that weren't flushed to that device and
treat them as "degraded", since unflushed data may well not have been
written. Out of scope for this patch, but becoming relevant.

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---

diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index d2c3f59a668f..8abefc994016 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -536,6 +536,7 @@ struct bch_dev {
 	 */
 	struct bch_member_cpu	mi;
 	atomic64_t		errors[BCH_MEMBER_ERROR_NR];
+	unsigned long		write_errors_start;
 
 	__uuid_t		uuid;
 	char			name[BDEVNAME_SIZE];
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index a6cc817ccd87..7a5b0d211a82 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -860,6 +860,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT,	struct bch_sb, flags[5], 32, 48);
 LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
 					struct bch_sb, flags[5], 48, 64);
 LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS,	struct bch_sb, flags[6],  0,  4);
+LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6],  4, 14);
 
 static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
 {
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 3f93a5a6bbfa..6d68c89a49b2 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -54,25 +54,41 @@ void bch2_io_error_work(struct work_struct *work)
 {
 	struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
 	struct bch_fs *c = ca->fs;
-	bool dev;
+
+	/* XXX: if it's reads or checksums that are failing, set it to failed */
 
 	down_write(&c->state_lock);
-	dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
-				    BCH_FORCE_IF_DEGRADED);
-	if (dev
-	    ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
-				  BCH_FORCE_IF_DEGRADED)
-	    : bch2_fs_emergency_read_only(c))
+	unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
+
+	if (write_errors_start &&
+	    time_after(jiffies,
+		       write_errors_start + c->opts.write_error_timeout * HZ)) {
+		if (ca->mi.state >= BCH_MEMBER_STATE_ro)
+			goto out;
+
+		bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+						 BCH_FORCE_IF_DEGRADED);
+
 		bch_err(ca,
-			"too many IO errors, setting %s RO",
+			"writes erroring for %u seconds, setting %s ro",
+			c->opts.write_error_timeout,
 			dev ? "device" : "filesystem");
+		if (!dev)
+			bch2_fs_emergency_read_only(c);
+
+	}
+out:
 	up_write(&c->state_lock);
 }
 
 void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
 {
 	atomic64_inc(&ca->errors[type]);
-	//queue_work(system_long_wq, &ca->io_error_work);
+	/* Stamp the first failing write; bch2_io_error_work() reads it locklessly */
+	if (type == BCH_MEMBER_ERROR_write && !READ_ONCE(ca->write_errors_start))
+		WRITE_ONCE(ca->write_errors_start, jiffies);
+	/* The worker compares the stamp against opts.write_error_timeout */
+	queue_work(system_long_wq, &ca->io_error_work);
 }
 
 enum ask_yn {
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index a57b9f18d060..7d3f0e2a5fd6 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -226,8 +226,13 @@ static inline void bch2_account_io_success_fail(struct bch_dev *ca,
 						enum bch_member_error_type type,
 						bool success)
 {
-	if (!success)
+	if (likely(success)) {
+		if (type == BCH_MEMBER_ERROR_write &&
+		    ca->write_errors_start)
+			ca->write_errors_start = 0;
+	} else {
 		bch2_io_error(ca, type);
+	}
 }
 
 static inline void bch2_account_io_completion(struct bch_dev *ca,
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 071a92ec8a14..afb89d318d24 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -145,6 +145,11 @@ enum fsck_err_opts {
 	  OPT_STR(bch2_error_actions),					\
 	  BCH_SB_ERROR_ACTION,		BCH_ON_ERROR_fix_safe,		\
 	  NULL,		"Action to take on filesystem error")		\
+	x(write_error_timeout,		u16,				\
+	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
+	  OPT_UINT(1, 300),						\
+	  BCH_SB_WRITE_ERROR_TIMEOUT,	30,				\
+	  NULL,		"Number of seconds writes may keep failing before kicking out a device")\
 	x(metadata_replicas,		u8,				\
 	  OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,			\
 	  OPT_UINT(1, BCH_REPLICAS_MAX),				\
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 918e4e7704dd..ee32d043414a 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -454,6 +454,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb,
 
 		if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
 			SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
+
+		if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
+			SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
 	}
 
 #ifdef __KERNEL__