*/
struct bch_member_cpu mi;
atomic64_t errors[BCH_MEMBER_ERROR_NR];
+ unsigned long write_errors_start;
__uuid_t uuid;
char name[BDEVNAME_SIZE];
LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED,
struct bch_sb, flags[5], 48, 64);
LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4);
+LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14);
static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb)
{
{
struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
struct bch_fs *c = ca->fs;
- bool dev;
+
+ /* XXX: if it's reads or checksums that are failing, set it to failed */
down_write(&c->state_lock);
- dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED);
- if (dev
- ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
- BCH_FORCE_IF_DEGRADED)
- : bch2_fs_emergency_read_only(c))
+ unsigned long write_errors_start = READ_ONCE(ca->write_errors_start);
+
+ if (write_errors_start &&
+ time_after(jiffies,
+ write_errors_start + c->opts.write_error_timeout * HZ)) {
+ if (ca->mi.state >= BCH_MEMBER_STATE_ro)
+ goto out;
+
+ bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro,
+ BCH_FORCE_IF_DEGRADED);
+
bch_err(ca,
- "too many IO errors, setting %s RO",
+ "writes erroring for %u seconds, setting %s ro",
+ c->opts.write_error_timeout,
dev ? "device" : "filesystem");
+ if (!dev)
+ bch2_fs_emergency_read_only(c);
+
+ }
+out:
up_write(&c->state_lock);
}
void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type)
{
atomic64_inc(&ca->errors[type]);
- //queue_work(system_long_wq, &ca->io_error_work);
+
+ if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start)
+ ca->write_errors_start = jiffies;
+
+ queue_work(system_long_wq, &ca->io_error_work);
}
enum ask_yn {
enum bch_member_error_type type,
bool success)
{
- if (!success)
+ if (likely(success)) {
+ if (type == BCH_MEMBER_ERROR_write &&
+ ca->write_errors_start)
+ ca->write_errors_start = 0;
+ } else {
bch2_io_error(ca, type);
+ }
}
static inline void bch2_account_io_completion(struct bch_dev *ca,
OPT_STR(bch2_error_actions), \
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \
NULL, "Action to take on filesystem error") \
+ x(write_error_timeout, u16, \
+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_UINT(1, 300), \
+ BCH_SB_WRITE_ERROR_TIMEOUT, 30, \
+ NULL, "Number of seconds of consecutive write errors allowed before kicking out a device")\
x(metadata_replicas, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2)
SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true);
+
+ if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb))
+ SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30);
}
#ifdef __KERNEL__