diff options
Diffstat (limited to 'fs/bcachefs/error.c')
-rw-r--r-- | fs/bcachefs/error.c | 317 |
1 files changed, 234 insertions, 83 deletions
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 038da6a61f6b..baf5dfb32298 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -3,15 +3,24 @@ #include "btree_cache.h" #include "btree_iter.h" #include "error.h" -#include "fs-common.h" #include "journal.h" +#include "namei.h" #include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" #define FSCK_ERR_RATELIMIT_NR 10 -bool bch2_inconsistent_error(struct bch_fs *c) +void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) +{ + printbuf_indent_add_nextline(out, 2); + +#ifdef BCACHEFS_LOG_PREFIX + prt_printf(out, bch2_log_msg(c, "")); +#endif +} + +bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) { set_bit(BCH_FS_error, &c->flags); @@ -21,10 +30,11 @@ bool bch2_inconsistent_error(struct bch_fs *c) case BCH_ON_ERROR_fix_safe: case BCH_ON_ERROR_ro: if (bch2_fs_emergency_read_only(c)) - bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", - journal_cur_seq(&c->journal)); + prt_printf(out, "inconsistency detected - emergency read only at journal seq %llu\n", + journal_cur_seq(&c->journal)); return true; case BCH_ON_ERROR_panic: + bch2_print_string_as_lines_nonblocking(KERN_ERR, out->buf); panic(bch2_fmt(c, "panic after error")); return true; default: @@ -32,11 +42,66 @@ bool bch2_inconsistent_error(struct bch_fs *c) } } -int bch2_topology_error(struct bch_fs *c) +bool bch2_inconsistent_error(struct bch_fs *c) +{ + struct printbuf buf = PRINTBUF; + buf.atomic++; + + printbuf_indent_add_nextline(&buf, 2); + + bool ret = __bch2_inconsistent_error(c, &buf); + if (ret) + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + return ret; +} + +__printf(3, 0) +static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans, + const char *fmt, va_list args) +{ + struct printbuf buf = PRINTBUF; + buf.atomic++; + + bch2_log_msg_start(c, &buf); + + prt_vprintf(&buf, fmt, args); + prt_newline(&buf); + + if (trans) + bch2_trans_updates_to_text(&buf, trans); + bool ret = __bch2_inconsistent_error(c, &buf); + bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); + + printbuf_exit(&buf); + return ret; +} + +bool bch2_fs_inconsistent(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + bool ret = bch2_fs_trans_inconsistent(c, NULL, fmt, args); + va_end(args); + return ret; +} + +bool bch2_trans_inconsistent(struct btree_trans *trans, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + bool ret = bch2_fs_trans_inconsistent(trans->c, trans, fmt, args); + va_end(args); + return ret; +} + +int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) { + prt_printf(out, "btree topology error: "); + set_bit(BCH_FS_topology_error, &c->flags); if (!test_bit(BCH_FS_recovery_running, &c->flags)) { - bch2_inconsistent_error(c); + __bch2_inconsistent_error(c, out); return -BCH_ERR_btree_need_topology_repair; } else { return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: @@ -44,6 +109,24 @@ int bch2_topology_error(struct bch_fs *c) } } +int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...) +{ + struct printbuf buf = PRINTBUF; + + bch2_log_msg_start(c, &buf); + + va_list args; + va_start(args, fmt); + prt_vprintf(&buf, fmt, args); + va_end(args); + + int ret = __bch2_topology_error(c, &buf); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + + printbuf_exit(&buf); + return ret; +} + void bch2_fatal_error(struct bch_fs *c) { if (bch2_fs_emergency_read_only(c)) @@ -54,25 +137,41 @@ void bch2_io_error_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); struct bch_fs *c = ca->fs; - bool dev; + + /* XXX: if it's reads or checksums that are failing, set it to failed */ down_write(&c->state_lock); - dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED); - if (dev - ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED) - : bch2_fs_emergency_read_only(c)) + unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); + + if (write_errors_start && + time_after(jiffies, + write_errors_start + c->opts.write_error_timeout * HZ)) { + if (ca->mi.state >= BCH_MEMBER_STATE_ro) + goto out; + + bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED); + bch_err(ca, - "too many IO errors, setting %s RO", + "writes erroring for %u seconds, setting %s ro", + c->opts.write_error_timeout, dev ? "device" : "filesystem"); + if (!dev) + bch2_fs_emergency_read_only(c); + + } +out: up_write(&c->state_lock); } void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) { atomic64_inc(&ca->errors[type]); - //queue_work(system_long_wq, &ca->io_error_work); + + if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start) + ca->write_errors_start = jiffies; + + queue_work(system_long_wq, &ca->io_error_work); } enum ask_yn { @@ -168,7 +267,8 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) #endif -static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) +static struct fsck_err_state *fsck_err_get(struct bch_fs *c, + enum bch_sb_error_id id) { struct fsck_err_state *s; @@ -176,7 +276,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) return NULL; list_for_each_entry(s, &c->fsck_error_msgs, list) - if (s->fmt == fmt) { + if (s->id == id) { /* * move it to the head of the list: repeated fsck errors * are common @@ -194,7 +294,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) } INIT_LIST_HEAD(&s->list); - s->fmt = fmt; + s->id = id; list_add(&s->list, &c->fsck_error_msgs); return s; } @@ -244,15 +344,59 @@ static int do_fsck_ask_yn(struct bch_fs *c, return ask; } +static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c, + enum bch_sb_error_id id, const char *msg, + bool *repeat, bool *print, bool *suppress) +{ + bch2_sb_error_count(c, id); + + struct fsck_err_state *s = fsck_err_get(c, id); + if (s) { + /* + * We may be called multiple times for the same error on + * transaction restart - this memoizes instead of asking the user + * multiple times for the same error: + */ + if (s->last_msg && !strcmp(msg, s->last_msg)) { + *repeat = true; + *print = false; + return s; + } + + kfree(s->last_msg); + s->last_msg = kstrdup(msg, GFP_KERNEL); + + if (c->opts.ratelimit_errors && + s->nr >= FSCK_ERR_RATELIMIT_NR) { + if (s->nr == FSCK_ERR_RATELIMIT_NR) + *suppress = true; + else + *print = false; + } + + s->nr++; + } + return s; +} + +void __bch2_count_fsck_err(struct bch_fs *c, + enum bch_sb_error_id id, const char *msg, + bool *repeat, bool *print, bool *suppress) +{ + bch2_sb_error_count(c, id); + + mutex_lock(&c->fsck_error_msgs_lock); + count_fsck_err_locked(c, id, msg, repeat, print, suppress); + mutex_unlock(&c->fsck_error_msgs_lock); +} + int __bch2_fsck_err(struct bch_fs *c, struct btree_trans *trans, enum bch_fsck_flags flags, enum bch_sb_error_id err, const char *fmt, ...) { - struct fsck_err_state *s = NULL; va_list args; - bool print = true, suppressing = false, inconsistent = false, exiting = false; struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; const char *action_orig = "fix?", *action = action_orig; @@ -287,7 +431,12 @@ int __bch2_fsck_err(struct bch_fs *c, ? -BCH_ERR_fsck_fix : -BCH_ERR_fsck_ignore; - bch2_sb_error_count(c, err); + printbuf_indent_add_nextline(out, 2); + +#ifdef BCACHEFS_LOG_PREFIX + if (strncmp(fmt, "bcachefs", 8)) + prt_printf(out, bch2_log_msg(c, "")); +#endif va_start(args, fmt); prt_vprintf(out, fmt, args); @@ -307,42 +456,15 @@ int __bch2_fsck_err(struct bch_fs *c, } mutex_lock(&c->fsck_error_msgs_lock); - s = fsck_err_get(c, fmt); - if (s) { - /* - * We may be called multiple times for the same error on - * transaction restart - this memoizes instead of asking the user - * multiple times for the same error: - */ - if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { - ret = s->ret; - goto err_unlock; - } - - kfree(s->last_msg); - s->last_msg = kstrdup(buf.buf, GFP_KERNEL); - if (!s->last_msg) { - ret = -ENOMEM; - goto err_unlock; - } - - if (c->opts.ratelimit_errors && - !(flags & FSCK_NO_RATELIMIT) && - s->nr >= FSCK_ERR_RATELIMIT_NR) { - if (s->nr == FSCK_ERR_RATELIMIT_NR) - suppressing = true; - else - print = false; - } - - s->nr++; + bool repeat = false, print = true, suppress = false; + bool inconsistent = false, exiting = false; + struct fsck_err_state *s = + count_fsck_err_locked(c, err, buf.buf, &repeat, &print, &suppress); + if (repeat) { + ret = s->ret; + goto err_unlock; } -#ifdef BCACHEFS_LOG_PREFIX - if (!strncmp(fmt, "bcachefs:", 9)) - prt_printf(out, bch2_log_msg(c, "")); -#endif - if ((flags & FSCK_AUTOFIX) && (c->opts.errors == BCH_ON_ERROR_continue || c->opts.errors == BCH_ON_ERROR_fix_safe)) { @@ -361,6 +483,7 @@ int __bch2_fsck_err(struct bch_fs *c, !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); inconsistent = true; + print = true; ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { prt_str(out, ", "); @@ -419,24 +542,30 @@ int __bch2_fsck_err(struct bch_fs *c, print = true; } print: + prt_newline(out); + + if (inconsistent) + __bch2_inconsistent_error(c, out); + else if (exiting) + prt_printf(out, "Unable to continue, halting\n"); + else if (suppress) + prt_printf(out, "Ratelimiting new instances of previous error\n"); + if (print) { + /* possibly strip an empty line, from printbuf_indent_add */ + while (out->pos && out->buf[out->pos - 1] == ' ') + --out->pos; + printbuf_nul_terminate(out); + if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s\n", out->buf); + bch2_print(c, "%s", out->buf); else bch2_print_string_as_lines(KERN_ERR, out->buf); } - if (exiting) - bch_err(c, "Unable to continue, halting"); - else if (suppressing) - bch_err(c, "Ratelimiting new instances of previous error"); - if (s) s->ret = ret; - if (inconsistent) - bch2_inconsistent_error(c); - /* * We don't yet track whether the filesystem currently has errors, for * log_fsck_err()s: that would require us to track for every error type @@ -498,16 +627,14 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, prt_printf(&buf, " level=%u: ", from.level); bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\n "); + prt_newline(&buf); va_list args; va_start(args, fmt); prt_vprintf(&buf, fmt, args); va_end(args); - prt_str(&buf, ": delete?"); - - int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf); + int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf); printbuf_exit(&buf); return ret; } @@ -520,7 +647,7 @@ void bch2_flush_fsck_errs(struct bch_fs *c) list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { if (s->ratelimited && s->last_msg) - bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); + bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); list_del(&s->list); kfree(s->last_msg); @@ -530,35 +657,59 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_unlock(&c->fsck_error_msgs_lock); } -int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) +int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + subvol_inum inum, u64 offset) { u32 restart_count = trans->restart_count; int ret = 0; - /* XXX: we don't yet attempt to print paths when we don't know the subvol */ - if (inum.subvol) - ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); + if (inum.subvol) { + ret = bch2_inum_to_path(trans, inum, out); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + } if (!inum.subvol || ret) prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); + prt_printf(out, " offset %llu: ", offset); return trans_was_restarted(trans, restart_count); } -int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - subvol_inum inum, u64 offset) +void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, + subvol_inum inum, u64 offset) { - int ret = bch2_inum_err_msg_trans(trans, out, inum); - prt_printf(out, " offset %llu: ", offset); - return ret; + bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); } -void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) +int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bpos pos) { - bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); + struct bch_fs *c = trans->c; + int ret = 0; + + if (!bch2_snapshot_is_leaf(c, pos.snapshot)) + prt_str(out, "(multiple snapshots) "); + + subvol_inum inum = { + .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot), + .inum = pos.inode, + }; + + if (inum.subvol) { + ret = bch2_inum_to_path(trans, inum, out); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + } + + if (!inum.subvol || ret) + prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot); + + prt_printf(out, " offset %llu: ", pos.offset << 8); + return 0; } -void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, - subvol_inum inum, u64 offset) +void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out, + struct bpos pos) { - bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); + bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); } |