// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc/accounting.h"
#include "alloc/background.h"
#include "alloc/backpointers.h"
#include "alloc/check.h"
#include "alloc/lru.h"
#include "btree/check.h"
#include "btree/node_scan.h"
#include "data/copygc.h"
#include "data/ec.h"
#include "data/reconcile.h"
#include "fs/check.h"
#include "fs/inode.h"
#include "fs/logged_ops.h"
#include "journal/init.h"
#include "journal/journal.h"
#include "sb/io.h"
#include "snapshots/snapshot.h"
#include "snapshots/subvolume.h"
#include "init/recovery.h"
#include "init/passes.h"
#include "init/fs.h"

const char * const bch2_recovery_passes[] = {
#define x(_fn, ...)	#_fn,
	BCH_RECOVERY_PASSES()
#undef x
	NULL
};

static const u8 passes_to_stable_map[] = {
#define x(n, id, ...)	[BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
	BCH_RECOVERY_PASSES()
#undef x
};

static const u8 passes_from_stable_map[] = {
#define x(n, id, ...)	[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
	BCH_RECOVERY_PASSES()
#undef x
};

static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
{
	return passes_to_stable_map[pass];
}

u64 bch2_recovery_passes_to_stable(u64 v)
{
	u64 ret = 0;

	for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
		if (v & BIT_ULL(i))
			ret |= BIT_ULL(passes_to_stable_map[i]);
	return ret;
}

static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass)
{
	return pass < ARRAY_SIZE(passes_from_stable_map)
		? passes_from_stable_map[pass]
		: 0;
}

u64 bch2_recovery_passes_from_stable(u64 v)
{
	u64 ret = 0;

	for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++)
		if (v & BIT_ULL(i))
			ret |= BIT_ULL(passes_from_stable_map[i]);
	return ret;
}

static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f,
					    enum bch_validate_flags flags, struct printbuf *err)
{
	return 0;
}

static void bch2_sb_recovery_passes_to_text(struct printbuf *out,
					    struct bch_fs *c,
					    struct bch_sb *sb,
					    struct bch_sb_field *f)
{
	struct bch_sb_field_recovery_passes *r = field_to_type(f, recovery_passes);
	unsigned nr = recovery_passes_nr_entries(r);

	if (out->nr_tabstops < 1)
		printbuf_tabstop_push(out, 32);
	if (out->nr_tabstops < 2)
		printbuf_tabstop_push(out, 16);

	prt_printf(out, "Pass\tLast run\tLast runtime\n");

	for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) {
		if (!i->last_run)
			continue;

		unsigned idx = i - r->start;

		prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]);

		bch2_prt_datetime(out, le64_to_cpu(i->last_run));
		prt_tab(out);

		bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC);

		if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
			prt_str(out, " (no ratelimit)");

		prt_newline(out);
	}
}

static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c,
							       enum bch_recovery_pass pass)
{
	enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);

	lockdep_assert_held(&c->sb_lock);

	struct bch_sb_field_recovery_passes *r =
		bch2_sb_field_get(c->disk_sb.sb, recovery_passes);

	if (stable >= recovery_passes_nr_entries(r)) {
		unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64);

		r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s);
		if (!r) {
			bch_err(c, "error creating recovery_passes sb section");
			return NULL;
		}
	}

	return r->start + stable;
}
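/*
 * Record that @pass just finished successfully: clear its bit in the
 * superblock's recovery_passes_required mask and stamp last_run/last_runtime
 * in the recovery_passes section, which the ratelimiting helpers below use to
 * decide whether the pass needs to run again any time soon.
 */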
static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
					   enum bch_recovery_pass pass,
					   s64 start_time)
{
	guard(mutex)(&c->sb_lock);

	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
	__clear_bit_le64(bch2_recovery_pass_to_stable(pass),
			 ext->recovery_passes_required);

	struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
	if (e) {
		s64 end_time = ktime_get_real_seconds();

		e->last_run	= cpu_to_le64(end_time);
		e->last_runtime	= cpu_to_le32(max(0, end_time - start_time));
		SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
	}

	bch2_write_super(c);
}

void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass)
{
	guard(mutex)(&c->sb_lock);

	struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
	if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) {
		SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true);
		bch2_write_super(c);
	}
}

static bool bch2_recovery_pass_entry_get_locked(struct bch_fs *c,
						enum bch_recovery_pass pass,
						struct recovery_pass_entry *e)
{
	lockdep_assert_held(&c->sb_lock);

	struct bch_sb_field_recovery_passes *r =
		bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
	enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);

	bool found = stable < recovery_passes_nr_entries(r);
	if (found)
		*e = r->start[stable];
	return found;
}

static bool bch2_recovery_pass_want_ratelimit_locked(struct bch_fs *c,
						     enum bch_recovery_pass pass,
						     unsigned runtime_fraction)
{
	struct recovery_pass_entry e;

	if (!bch2_recovery_pass_entry_get_locked(c, pass, &e))
		return false;

	/*
	 * Ratelimit if the last runtime was more than 1/@runtime_fraction of
	 * the time since we last ran (i.e. 1% when callers pass 100):
	 */
	return !BCH_RECOVERY_PASS_NO_RATELIMIT(&e) &&
		(u64) le32_to_cpu(e.last_runtime) * runtime_fraction >
		ktime_get_real_seconds() - le64_to_cpu(e.last_run);
}

bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c,
				       enum bch_recovery_pass pass,
				       unsigned runtime_fraction)
{
	guard(mutex)(&c->sb_lock);
	return bch2_recovery_pass_want_ratelimit_locked(c, pass, runtime_fraction);
}

const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = {
	.validate	= bch2_sb_recovery_passes_validate,
	.to_text	= bch2_sb_recovery_passes_to_text
};

/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
static int bch2_recovery_pass_empty(struct bch_fs *c)
{
	return 0;
}

/*
 * Make sure the root inode is readable while we're still in recovery and can
 * rewind for repair:
 */
static int bch2_lookup_root_inode(struct bch_fs *c)
{
	subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM;
	struct bch_inode_unpacked inode_u;
	struct bch_subvolume subvol;

	CLASS(btree_trans, trans)(c);
	return lockrestart_do(trans,
		bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
		bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
}

struct recovery_pass {
	int		(*fn)(struct bch_fs *);
	const char	*name;
	unsigned	when;
	u64		depends;
};

static const struct recovery_pass recovery_passes[] = {
#define x(_fn, _id, _when, _depends) {	\
	.fn		= bch2_##_fn,	\
	.name		= #_fn,		\
	.when		= _when,	\
	.depends	= _depends,	\
},
	BCH_RECOVERY_PASSES()
#undef x
};

u64 bch2_recovery_passes_match(unsigned flags)
{
	u64 ret = 0;

	for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++)
		if (recovery_passes[i].when & flags)
			ret |= BIT_ULL(i);
	return ret;
}

u64 bch2_fsck_recovery_passes(void)
{
	return bch2_recovery_passes_match(PASS_FSCK);
}

/* Set of all passes that depend on @pass, transitively */
static u64 pass_dependents(enum bch_recovery_pass pass)
{
	u64 passes = BIT_ULL(pass);
	bool found;

	do {
		found = false;

		for (unsigned i = 0; i < BCH_RECOVERY_PASS_NR; i++)
			if (!(passes & BIT_ULL(i)) &&
			    (passes & recovery_passes[i].depends)) {
				passes |= BIT_ULL(i);
				found = true;
			}
	} while (found);

	return passes;
}

/* true if all passes can be run online */
static bool passes_online(u64 passes)
{
	return passes == (passes & bch2_recovery_passes_match(PASS_ONLINE));
}

/* Returns true if @pass and all of its scheduled dependents can run online */
static bool recovery_pass_should_defer(enum bch_recovery_pass pass, u64 passes)
{
	return passes_online(pass_dependents(pass) & passes);
}
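/*
 * Rewinding: if, while running pass N, we discover that an earlier pass M < N
 * is needed and hasn't completed during this recovery, we have to go back and
 * run it - along with any already-run passes after it - before continuing.
 */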
static bool recovery_pass_needs_rewind(struct bch_fs *c, enum bch_recovery_pass pass)
{
	struct bch_fs_recovery *r = &c->recovery;

	return test_bit(BCH_FS_in_recovery, &c->flags) &&
		r->current_pass > pass &&
		!(r->passes_complete & BIT_ULL(pass));
}

static bool recovery_pass_needs_set(struct bch_fs *c,
				    enum bch_recovery_pass pass,
				    enum bch_run_recovery_pass_flags *flags)
{
	struct bch_fs_recovery *r = &c->recovery;

	/*
	 * Never run scan_for_btree_nodes persistently: check_topology will run
	 * it if required
	 */
	if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
		*flags |= RUN_RECOVERY_PASS_nopersistent;

	if ((*flags & RUN_RECOVERY_PASS_ratelimit) &&
	    !bch2_recovery_pass_want_ratelimit_locked(c, pass, 100))
		*flags &= ~RUN_RECOVERY_PASS_ratelimit;

	/*
	 * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do
	 * anything if the pass has already run: a nopersistent request means we
	 * need a prior pass to run before we can continue repairing, and we
	 * don't expect that pass to fix the damage we already encountered.
	 *
	 * Otherwise, run_explicit_recovery_pass is called when we find damage,
	 * so the pass should run again even if it has already run:
	 */
	bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
	bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent);
	u64 already_running = persistent
		? c->sb.recovery_passes_required
		: r->current_passes;

	if (!(already_running & BIT_ULL(pass)))
		return true;

	if (!(*flags & RUN_RECOVERY_PASS_ratelimit) &&
	    (r->passes_ratelimiting & BIT_ULL(pass)))
		return true;

	return recovery_pass_needs_rewind(c, pass);
}

/*
 * For when we need to rewind recovery passes and run a pass we skipped:
 */
int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
				      struct printbuf *out,
				      enum bch_recovery_pass pass,
				      enum bch_run_recovery_pass_flags flags,
				      bool *write_sb)
{
	struct bch_fs_recovery *r = &c->recovery;

	lockdep_assert_held(&c->sb_lock);

	bch2_printbuf_make_room(out, 1024);
	guard(printbuf_atomic)(out);

	guard(spinlock_irq)(&r->lock);

	if (!recovery_pass_needs_set(c, pass, &flags))
		return 0;

	out->suppress = false;

	bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
	bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit;

	if (flags & RUN_RECOVERY_PASS_nopersistent) {
		r->scheduled_passes_ephemeral |= BIT_ULL(pass);
	} else {
		struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);

		*write_sb |= !__test_and_set_bit_le64(bch2_recovery_pass_to_stable(pass),
						      ext->recovery_passes_required);
	}

	if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
	    test_bit(BCH_FS_may_go_rw, &c->flags)) {
		prt_printf(out, "need recovery pass %s (%u), but already rw\n",
			   bch2_recovery_passes[pass], pass);
		return bch_err_throw(c, cannot_rewind_recovery);
	}

	if (ratelimit)
		r->passes_ratelimiting |= BIT_ULL(pass);
	else
		r->passes_ratelimiting &= ~BIT_ULL(pass);

	if (in_recovery && !ratelimit &&
	    !recovery_pass_should_defer(pass, r->current_passes)) {
		bool rewind = recovery_pass_needs_rewind(c, pass);

		prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n",
			   bch2_recovery_passes[pass], pass,
			   bch2_recovery_passes[r->current_pass], r->current_pass,
			   rewind ? " - rewinding" : "");

		r->current_passes |= BIT_ULL(pass);

		if (rewind) {
			r->rewound_to = r->rewound_to
				? min(r->rewound_to, pass)
				: pass;
			return bch_err_throw(c, restart_recovery);
		}
	} else {
		prt_printf(out, "scheduling recovery pass %s (%u)%s\n",
			   bch2_recovery_passes[pass], pass,
			   ratelimit ? " - ratelimiting" : "");

		const struct recovery_pass *p = recovery_passes + pass;

		if (!ratelimit && (p->when & PASS_ONLINE))
			bch2_run_async_recovery_passes(c);
	}

	return 0;
}
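/*
 * Locked wrapper around __bch2_run_explicit_recovery_pass(): takes sb_lock and
 * writes the superblock if the pass was newly marked as required there.
 *
 * Illustrative call site (a sketch only - the pass and flags are just an
 * example, not taken from real repair code):
 *
 *	struct printbuf buf = PRINTBUF;
 *	int ret = bch2_run_explicit_recovery_pass(c, &buf,
 *				BCH_RECOVERY_PASS_check_allocations, 0);
 *	...log buf.buf...
 *	printbuf_exit(&buf);
 *
 * During early recovery this may return -BCH_ERR_restart_recovery so that the
 * main loop rewinds to the requested pass; otherwise the pass either runs in
 * the background or is recorded to run on the next mount.
 */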
int bch2_run_explicit_recovery_pass(struct bch_fs *c,
				    struct printbuf *out,
				    enum bch_recovery_pass pass,
				    enum bch_run_recovery_pass_flags flags)
{
	/*
	 * With RUN_RECOVERY_PASS_ratelimit, recovery_pass_needs_set needs
	 * sb_lock
	 */
	if (!(flags & RUN_RECOVERY_PASS_ratelimit) &&
	    !recovery_pass_needs_set(c, pass, &flags))
		return 0;

	guard(mutex)(&c->sb_lock);

	bool write_sb = false;
	int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb);

	if (write_sb)
		bch2_write_super(c);
	return ret;
}

/*
 * Returns 0 if @pass has run recently, otherwise one of
 * -BCH_ERR_restart_recovery
 * -BCH_ERR_recovery_pass_will_run
 */
int bch2_require_recovery_pass(struct bch_fs *c,
			       struct printbuf *out,
			       enum bch_recovery_pass pass)
{
	if (test_bit(BCH_FS_in_recovery, &c->flags) &&
	    c->recovery.passes_complete & BIT_ULL(pass))
		return 0;

	guard(mutex)(&c->sb_lock);

	if (bch2_recovery_pass_want_ratelimit_locked(c, pass, 100))
		return 0;

	enum bch_run_recovery_pass_flags flags = 0;
	bool write_sb = false;
	int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb) ?:
		bch_err_throw(c, recovery_pass_will_run);

	if (write_sb)
		bch2_write_super(c);
	return ret;
}

static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
	struct bch_fs_recovery *r = &c->recovery;
	const struct recovery_pass *p = recovery_passes + pass;

	if (!(p->when & PASS_SILENT))
		bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
			   bch2_recovery_passes[pass]);

	s64 start_time = ktime_get_real_seconds();
	int ret = p->fn(c);
	if (ret) {
		bch_err(c, "%s(): error %s", p->name, bch2_err_str(ret));
		r->passes_failing |= BIT_ULL(pass);
		return ret;
	}

	if (!(p->when & PASS_SILENT))
		bch2_print(c, KERN_CONT " done\n");

	r->passes_failing = 0;

	if (!test_bit(BCH_FS_error, &c->flags))
		bch2_sb_recovery_pass_complete(c, pass, start_time);

	return 0;
}
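/*
 * Main recovery pass loop: run each scheduled pass in order (lowest pass
 * number first), flushing the journal after each one.  If a pass requests a
 * rewind via r->rewound_to, the passes from that point onward are re-added to
 * the work set; if a pass fails it's recorded in r->passes_failing and skipped
 * until another pass succeeds.
 */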
int bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, bool failfast)
{
	struct bch_fs_recovery *r = &c->recovery;
	int ret = 0;

	spin_lock_irq(&r->lock);

	if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
		orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC);

	/*
	 * A failed recovery pass will be retried after another pass succeeds -
	 * but not this iteration.
	 *
	 * This is because some passes depend on repair done by other passes: we
	 * may want to retry, but we don't want to loop on failing passes.
	 */
	orig_passes_to_run &= ~r->passes_failing;

	r->current_passes = orig_passes_to_run;

	enum bch_recovery_pass prev = 0;

	while (r->current_passes) {
		unsigned pass = __ffs64(r->current_passes);

		r->current_pass = pass;
		r->current_passes &= ~BIT_ULL(pass);
		r->scheduled_passes_ephemeral &= ~BIT_ULL(pass);
		spin_unlock_irq(&r->lock);

		int ret2 = bch2_run_recovery_pass(c, pass) ?:
			bch2_journal_flush(&c->journal);

		spin_lock_irq(&r->lock);

		if (r->rewound_to) {
			r->rewound_from = max(r->rewound_from, pass);

			/* Restore r->current_passes, going back to and including r->rewound_to */
			r->current_passes |= orig_passes_to_run & (~0ULL << r->rewound_to);
			r->rewound_to = 0;
		} else if (!ret2) {
			r->pass_done = max(r->pass_done, pass);
			r->passes_complete |= BIT_ULL(pass);
		} else {
			ret = ret2;
		}

		if (ret && failfast)
			break;

		if (prev <= BCH_RECOVERY_PASS_check_snapshots &&
		    pass > BCH_RECOVERY_PASS_check_snapshots) {
			bch2_copygc_wakeup(c);
			bch2_reconcile_wakeup(c);
		}

		prev = pass;
	}

	r->current_pass = 0;
	spin_unlock_irq(&r->lock);

	return ret;
}

static void bch2_async_recovery_passes_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, recovery.work);
	struct bch_fs_recovery *r = &c->recovery;

	if (mutex_trylock(&r->run_lock)) {
		bch2_run_recovery_passes(c,
				(c->sb.recovery_passes_required |
				 r->scheduled_passes_ephemeral) &
				~r->passes_ratelimiting &
				bch2_recovery_passes_match(PASS_ONLINE),
				false);
		mutex_unlock(&r->run_lock);
	}

	enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
}

void bch2_run_async_recovery_passes(struct bch_fs *c)
{
	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes))
		return;

	if (queue_work(system_long_wq, &c->recovery.work))
		return;

	enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
}
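/*
 * Called at mount time: build the set of passes to run from PASS_ALWAYS,
 * PASS_UNCLEAN (if the filesystem wasn't cleanly shut down) and PASS_FSCK (if
 * fsck was requested), plus anything requested via mount options or recorded
 * as required in the superblock.  When we're not doing a full fsck, passes
 * that can run online - and whose dependents can too - are deferred to the
 * background worker instead of delaying the mount.
 */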
int bch2_run_recovery_passes_startup(struct bch_fs *c, enum bch_recovery_pass from)
{
	struct bch_fs_recovery *r = &c->recovery;

	r->scheduled_passes_ephemeral = c->opts.recovery_passes;

	u64 passes =
		bch2_recovery_passes_match(PASS_ALWAYS) |
		(!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) |
		(c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) |
		c->opts.recovery_passes |
		c->sb.recovery_passes_required;

	if (c->opts.recovery_pass_last)
		passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1;

	/*
	 * We can't allow set_may_go_rw to be excluded; that would cause us to
	 * use the journal replay keys for updates where it's not expected.
	 */
	c->opts.recovery_passes_exclude &= ~BIT_ULL(BCH_RECOVERY_PASS_set_may_go_rw);
	passes &= ~c->opts.recovery_passes_exclude;

	passes &= ~(BIT_ULL(from) - 1);

	/*
	 * Defer passes that can be run online, and don't have dependents that
	 * can't be run online
	 */
	u64 defer = 0;
	if (!c->opts.fsck)
		for (unsigned i = 0; i < BCH_RECOVERY_PASS_NR; i++)
			if ((passes & BIT_ULL(i)) &&
			    recovery_pass_should_defer(i, passes)) {
				defer |= BIT_ULL(i);
				passes &= ~BIT_ULL(i);
			}

	scoped_guard(mutex, &r->run_lock)
		try(bch2_run_recovery_passes(c, passes, true));

	clear_bit(BCH_FS_in_recovery, &c->flags);

	if (defer) {
		CLASS(bch_log_msg_level, msg)(c, LOGLEVEL_notice);

		prt_printf(&msg.m, "Running the following recovery passes in the background:\n");
		prt_bitflags(&msg.m, bch2_recovery_passes, defer);

		r->scheduled_passes_ephemeral |= defer;
		bch2_run_async_recovery_passes(c);
	}

	return 0;
}

static void prt_passes(struct printbuf *out, const char *msg, u64 passes)
{
	prt_printf(out, "%s:\t", msg);
	prt_bitflags(out, bch2_recovery_passes, passes);
	prt_newline(out);
}

void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct bch_fs_recovery *r = &c->recovery;

	printbuf_tabstop_push(out, 32);
	prt_passes(out, "Scheduled (superblock)", c->sb.recovery_passes_required);
	prt_passes(out, "Scheduled (ephemeral)", r->scheduled_passes_ephemeral);
	prt_passes(out, "Completed", r->passes_complete);
	prt_passes(out, "Failing", r->passes_failing);

	if (r->current_pass) {
		prt_printf(out, "Currently running:\t%s (%u)\n",
			   bch2_recovery_passes[r->current_pass], r->current_pass);
		prt_passes(out, "Next", r->current_passes);

		if (test_bit(BCH_FS_in_recovery, &c->flags) &&
		    r->rewound_from)
			prt_printf(out, "Rewound from:\t%s (%u)\n",
				   bch2_recovery_passes[r->rewound_from], r->rewound_from);
	}
}

void bch2_fs_recovery_passes_init(struct bch_fs *c)
{
	spin_lock_init(&c->recovery.lock);
	mutex_init(&c->recovery.run_lock);
	INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work);
}