// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc/accounting.h"
#include "alloc/background.h"
#include "alloc/backpointers.h"
#include "alloc/check.h"
#include "alloc/lru.h"
#include "btree/check.h"
#include "btree/node_scan.h"
#include "data/copygc.h"
#include "data/ec.h"
#include "data/reconcile.h"
#include "fs/check.h"
#include "fs/inode.h"
#include "fs/logged_ops.h"
#include "journal/init.h"
#include "journal/journal.h"
#include "sb/io.h"
#include "snapshots/snapshot.h"
#include "snapshots/subvolume.h"
#include "init/recovery.h"
#include "init/passes.h"
#include "init/fs.h"

const char * const bch2_recovery_passes[] = {
#define x(_fn, ...)	#_fn,
	BCH_RECOVERY_PASSES()
#undef x
	NULL
};

static const u8 passes_to_stable_map[] = {
#define x(n, id, ...)	[BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
	BCH_RECOVERY_PASSES()
#undef x
};

static const u8 passes_from_stable_map[] = {
#define x(n, id, ...)	[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
	BCH_RECOVERY_PASSES()
#undef x
};

static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
{
	return passes_to_stable_map[pass];
}

u64 bch2_recovery_passes_to_stable(u64 v)
{
	u64 ret = 0;

	for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
		if (v & BIT_ULL(i))
			ret |= BIT_ULL(passes_to_stable_map[i]);
	return ret;
}

static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass)
{
	return pass < ARRAY_SIZE(passes_from_stable_map)
		? passes_from_stable_map[pass]
		: 0;
}

u64 bch2_recovery_passes_from_stable(u64 v)
{
	u64 ret = 0;

	for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++)
		if (v & BIT_ULL(i))
			ret |= BIT_ULL(passes_from_stable_map[i]);
	return ret;
}

static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f,
					    enum bch_validate_flags flags, struct printbuf *err)
{
	return 0;
}

static void bch2_sb_recovery_passes_to_text(struct printbuf *out,
					    struct bch_fs *c,
					    struct bch_sb *sb,
					    struct bch_sb_field *f)
{
	struct bch_sb_field_recovery_passes *r = field_to_type(f, recovery_passes);
	unsigned nr = recovery_passes_nr_entries(r);

	if (out->nr_tabstops < 1)
		printbuf_tabstop_push(out, 32);
	if (out->nr_tabstops < 2)
		printbuf_tabstop_push(out, 16);

	prt_printf(out, "Pass\tLast run\tLast runtime\n");

	for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) {
		if (!i->last_run)
			continue;

		unsigned idx = i - r->start;

		prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]);

		bch2_prt_datetime(out, le64_to_cpu(i->last_run));
		prt_tab(out);

		bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC);

		if (BCH_RECOVERY_PASS_NO_RATELIMIT(i))
			prt_str(out, " (no ratelimit)");

		prt_newline(out);
	}
}

static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c,
							       enum bch_recovery_pass pass)
{
	enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);

	lockdep_assert_held(&c->sb_lock);

	struct bch_sb_field_recovery_passes *r =
		bch2_sb_field_get(c->disk_sb.sb, recovery_passes);

	if (stable >= recovery_passes_nr_entries(r)) {
		unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64);

		r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s);
		if (!r) {
			bch_err(c, "error creating recovery_passes sb section");
			return NULL;
		}
	}

	return r->start + stable;
}
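/*
 * Record that @pass just finished successfully: clear its bit in the
 * superblock's recovery_passes_required mask and stamp last_run/last_runtime
 * in the recovery_passes section, which the ratelimiting helpers below use to
 * decide whether the pass needs to run again any time soon.
 */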
static void bch2_sb_recovery_pass_complete(struct bch_fs *c,
					   enum bch_recovery_pass pass,
					   s64 start_time)
{
	guard(mutex)(&c->sb_lock);

	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
	__clear_bit_le64(bch2_recovery_pass_to_stable(pass),
			 ext->recovery_passes_required);

	struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
	if (e) {
		s64 end_time = ktime_get_real_seconds();

		e->last_run	= cpu_to_le64(end_time);
		e->last_runtime	= cpu_to_le32(max(0, end_time - start_time));
		SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false);
	}

	bch2_write_super(c);
}

void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass)
{
	guard(mutex)(&c->sb_lock);

	struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass);
	if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) {
		SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, true);
		bch2_write_super(c);
	}
}

static bool bch2_recovery_pass_entry_get_locked(struct bch_fs *c,
						enum bch_recovery_pass pass,
						struct recovery_pass_entry *e)
{
	lockdep_assert_held(&c->sb_lock);

	struct bch_sb_field_recovery_passes *r =
		bch2_sb_field_get(c->disk_sb.sb, recovery_passes);
	enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass);

	bool found = stable < recovery_passes_nr_entries(r);
	if (found)
		*e = r->start[stable];
	return found;
}

static bool bch2_recovery_pass_want_ratelimit_locked(struct bch_fs *c,
						     enum bch_recovery_pass pass,
						     unsigned runtime_fraction)
{
	struct recovery_pass_entry e;

	if (!bch2_recovery_pass_entry_get_locked(c, pass, &e))
		return false;

	/*
	 * Ratelimit if the last runtime was more than 1/@runtime_fraction of
	 * the time since we last ran (i.e. 1% when callers pass 100):
	 */
	return !BCH_RECOVERY_PASS_NO_RATELIMIT(&e) &&
		(u64) le32_to_cpu(e.last_runtime) * runtime_fraction >
		ktime_get_real_seconds() - le64_to_cpu(e.last_run);
}

bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c,
				       enum bch_recovery_pass pass,
				       unsigned runtime_fraction)
{
	guard(mutex)(&c->sb_lock);
	return bch2_recovery_pass_want_ratelimit_locked(c, pass, runtime_fraction);
}

const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = {
	.validate	= bch2_sb_recovery_passes_validate,
	.to_text	= bch2_sb_recovery_passes_to_text
};

/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
static int bch2_recovery_pass_empty(struct bch_fs *c)
{
	return 0;
}

/*
 * Make sure the root inode is readable while we're still in recovery and can
 * rewind for repair:
 */
static int bch2_lookup_root_inode(struct bch_fs *c)
{
	subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM;
	struct bch_inode_unpacked inode_u;
	struct bch_subvolume subvol;

	CLASS(btree_trans, trans)(c);
	return lockrestart_do(trans,
		bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?:
		bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
}

struct recovery_pass {
	int		(*fn)(struct bch_fs *);
	const char	*name;
	unsigned	when;
	u64		depends;
};

static const struct recovery_pass recovery_passes[] = {
#define x(_fn, _id, _when, _depends) {	\
	.fn		= bch2_##_fn,	\
	.name		= #_fn,		\
	.when		= _when,	\
	.depends	= _depends,	\
},
	BCH_RECOVERY_PASSES()
#undef x
};

u64 bch2_recovery_passes_match(unsigned flags)
{
	u64 ret = 0;

	for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++)
		if (recovery_passes[i].when & flags)
			ret |= BIT_ULL(i);
	return ret;
}

u64 bch2_fsck_recovery_passes(void)
{
	return bch2_recovery_passes_match(PASS_FSCK);
}

/* Set of all passes that depend on @pass, transitively */
static u64 pass_dependents(enum bch_recovery_pass pass)
{
	u64 passes = BIT_ULL(pass);
	bool found;

	do {
		found = false;

		for (unsigned i = 0; i < BCH_RECOVERY_PASS_NR; i++)
			if (!(passes & BIT_ULL(i)) &&
			    (passes & recovery_passes[i].depends)) {
				passes |= BIT_ULL(i);
				found = true;
			}
	} while (found);

	return passes;
}

/* true if all passes can be run online */
static bool passes_online(u64 passes)
{
	return passes == (passes & bch2_recovery_passes_match(PASS_ONLINE));
}

/* Returns true if @pass and all of its scheduled dependents can run online */
static bool recovery_pass_should_defer(enum bch_recovery_pass pass, u64 passes)
{
	return passes_online(pass_dependents(pass) & passes);
}
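/*
 * Rewinding: if, while running pass N, we discover that an earlier pass M < N
 * is needed and hasn't completed during this recovery, we have to go back and
 * run it - along with any already-run passes after it - before continuing.
 */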
static bool recovery_pass_needs_rewind(struct bch_fs *c, enum bch_recovery_pass pass)
{
	struct bch_fs_recovery *r = &c->recovery;

	return test_bit(BCH_FS_in_recovery, &c->flags) &&
		r->current_pass > pass &&
		!(r->passes_complete & BIT_ULL(pass));
}

static bool recovery_pass_needs_set(struct bch_fs *c,
				    enum bch_recovery_pass pass,
				    enum bch_run_recovery_pass_flags *flags)
{
	struct bch_fs_recovery *r = &c->recovery;

	/*
	 * Never run scan_for_btree_nodes persistently: check_topology will run
	 * it if required
	 */
	if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
		*flags |= RUN_RECOVERY_PASS_nopersistent;

	if ((*flags & RUN_RECOVERY_PASS_ratelimit) &&
	    !bch2_recovery_pass_want_ratelimit_locked(c, pass, 100))
		*flags &= ~RUN_RECOVERY_PASS_ratelimit;

	/*
	 * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do
	 * anything if the pass has already run: a nopersistent request means we
	 * need a prior pass to run before we can continue repairing, and we
	 * don't expect that pass to fix the damage we already encountered.
	 *
	 * Otherwise, run_explicit_recovery_pass is called when we find damage,
	 * so the pass should run again even if it has already run:
	 */
	bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
	bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent);
	u64 already_running = persistent
		? c->sb.recovery_passes_required
		: r->current_passes;

	if (!(already_running & BIT_ULL(pass)))
		return true;

	if (!(*flags & RUN_RECOVERY_PASS_ratelimit) &&
	    (r->passes_ratelimiting & BIT_ULL(pass)))
		return true;

	return recovery_pass_needs_rewind(c, pass);
}

/*
 * For when we need to rewind recovery passes and run a pass we skipped:
 */
int __bch2_run_explicit_recovery_pass(struct bch_fs *c,
				      struct printbuf *out,
				      enum bch_recovery_pass pass,
				      enum bch_run_recovery_pass_flags flags,
				      bool *write_sb)
{
	struct bch_fs_recovery *r = &c->recovery;

	lockdep_assert_held(&c->sb_lock);

	bch2_printbuf_make_room(out, 1024);
	guard(printbuf_atomic)(out);

	guard(spinlock_irq)(&r->lock);

	if (!recovery_pass_needs_set(c, pass, &flags))
		return 0;

	out->suppress = false;

	bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
	bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit;

	if (flags & RUN_RECOVERY_PASS_nopersistent) {
		r->scheduled_passes_ephemeral |= BIT_ULL(pass);
	} else {
		struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);

		*write_sb |= !__test_and_set_bit_le64(bch2_recovery_pass_to_stable(pass),
						      ext->recovery_passes_required);
	}

	if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
	    test_bit(BCH_FS_may_go_rw, &c->flags)) {
		prt_printf(out, "need recovery pass %s (%u), but already rw\n",
			   bch2_recovery_passes[pass], pass);
		return bch_err_throw(c, cannot_rewind_recovery);
	}

	if (ratelimit)
		r->passes_ratelimiting |= BIT_ULL(pass);
	else
		r->passes_ratelimiting &= ~BIT_ULL(pass);

	if (in_recovery && !ratelimit &&
	    !recovery_pass_should_defer(pass, r->current_passes)) {
		bool rewind = recovery_pass_needs_rewind(c, pass);

		prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n",
			   bch2_recovery_passes[pass], pass,
			   bch2_recovery_passes[r->current_pass], r->current_pass,
			   rewind ? " - rewinding" : "");

		r->current_passes |= BIT_ULL(pass);

		if (rewind) {
			r->rewound_to = r->rewound_to
				? min(r->rewound_to, pass)
				: pass;
			return bch_err_throw(c, restart_recovery);
		}
	} else {
		prt_printf(out, "scheduling recovery pass %s (%u)%s\n",
			   bch2_recovery_passes[pass], pass,
			   ratelimit ? " - ratelimiting" : "");

		const struct recovery_pass *p = recovery_passes + pass;

		if (!ratelimit && (p->when & PASS_ONLINE))
			bch2_run_async_recovery_passes(c);
	}

	return 0;
}
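/*
 * Locked wrapper around __bch2_run_explicit_recovery_pass(): takes sb_lock and
 * writes the superblock if the pass was newly marked as required there.
 *
 * Illustrative call site (a sketch only - the pass and flags are just an
 * example, not taken from real repair code):
 *
 *	struct printbuf buf = PRINTBUF;
 *	int ret = bch2_run_explicit_recovery_pass(c, &buf,
 *				BCH_RECOVERY_PASS_check_allocations, 0);
 *	...log buf.buf...
 *	printbuf_exit(&buf);
 *
 * During early recovery this may return -BCH_ERR_restart_recovery so that the
 * main loop rewinds to the requested pass; otherwise the pass either runs in
 * the background or is recorded to run on the next mount.
 */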
int bch2_run_explicit_recovery_pass(struct bch_fs *c,
				    struct printbuf *out,
				    enum bch_recovery_pass pass,
				    enum bch_run_recovery_pass_flags flags)
{
	/*
	 * With RUN_RECOVERY_PASS_ratelimit, recovery_pass_needs_set needs
	 * sb_lock
	 */
	if (!(flags & RUN_RECOVERY_PASS_ratelimit) &&
	    !recovery_pass_needs_set(c, pass, &flags))
		return 0;

	guard(mutex)(&c->sb_lock);

	bool write_sb = false;
	int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb);

	if (write_sb)
		bch2_write_super(c);
	return ret;
}

/*
 * Returns 0 if @pass has run recently, otherwise one of
 * -BCH_ERR_restart_recovery
 * -BCH_ERR_recovery_pass_will_run
 */
int bch2_require_recovery_pass(struct bch_fs *c,
			       struct printbuf *out,
			       enum bch_recovery_pass pass)
{
	if (test_bit(BCH_FS_in_recovery, &c->flags) &&
	    c->recovery.passes_complete & BIT_ULL(pass))
		return 0;

	guard(mutex)(&c->sb_lock);

	if (bch2_recovery_pass_want_ratelimit_locked(c, pass, 100))
		return 0;

	enum bch_run_recovery_pass_flags flags = 0;
	bool write_sb = false;
	int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb) ?:
		bch_err_throw(c, recovery_pass_will_run);

	if (write_sb)
		bch2_write_super(c);
	return ret;
}

static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
	struct bch_fs_recovery *r = &c->recovery;
	const struct recovery_pass *p = recovery_passes + pass;

	if (!(p->when & PASS_SILENT))
		bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
			   bch2_recovery_passes[pass]);

	s64 start_time = ktime_get_real_seconds();
	int ret = p->fn(c);
	if (ret) {
		bch_err(c, "%s(): error %s", p->name, bch2_err_str(ret));
		r->passes_failing |= BIT_ULL(pass);
		return ret;
	}

	if (!(p->when & PASS_SILENT))
		bch2_print(c, KERN_CONT " done\n");

	r->passes_failing = 0;

	if (!test_bit(BCH_FS_error, &c->flags))
		bch2_sb_recovery_pass_complete(c, pass, start_time);

	return 0;
}
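/*
 * Main recovery pass loop: run each scheduled pass in order (lowest pass
 * number first), flushing the journal after each one.  If a pass requests a
 * rewind via r->rewound_to, the passes from that point onward are re-added to
 * the work set; if a pass fails it's recorded in r->passes_failing and skipped
 * until another pass succeeds.
 */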
int bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, bool failfast)
{
	struct bch_fs_recovery *r = &c->recovery;
	int ret = 0;

	spin_lock_irq(&r->lock);

	if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))
		orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC);

	/*
	 * A failed recovery pass will be retried after another pass succeeds -
	 * but not this iteration.
	 *
	 * This is because some passes depend on repair done by other passes: we
	 * may want to retry, but we don't want to loop on failing passes.
	 */
	orig_passes_to_run &= ~r->passes_failing;

	r->current_passes = orig_passes_to_run;

	enum bch_recovery_pass prev = 0;

	while (r->current_passes) {
		unsigned pass = __ffs64(r->current_passes);

		r->current_pass = pass;
		r->current_passes &= ~BIT_ULL(pass);
		r->scheduled_passes_ephemeral &= ~BIT_ULL(pass);
		spin_unlock_irq(&r->lock);

		int ret2 = bch2_run_recovery_pass(c, pass) ?:
			bch2_journal_flush(&c->journal);

		spin_lock_irq(&r->lock);

		if (r->rewound_to) {
			r->rewound_from = max(r->rewound_from, pass);

			/* Restore r->current_passes, going back to and including r->rewound_to */
			r->current_passes |= orig_passes_to_run & (~0ULL << r->rewound_to);
			r->rewound_to = 0;
		} else if (!ret2) {
			r->pass_done = max(r->pass_done, pass);
			r->passes_complete |= BIT_ULL(pass);
		} else {
			ret = ret2;
		}

		if (ret && failfast)
			break;

		if (prev <= BCH_RECOVERY_PASS_check_snapshots &&
		    pass > BCH_RECOVERY_PASS_check_snapshots) {
			bch2_copygc_wakeup(c);
			bch2_reconcile_wakeup(c);
		}

		prev = pass;
	}

	r->current_pass = 0;
	spin_unlock_irq(&r->lock);

	return ret;
}

static void bch2_async_recovery_passes_work(struct work_struct *work)
{
	struct bch_fs *c = container_of(work, struct bch_fs, recovery.work);
	struct bch_fs_recovery *r = &c->recovery;

	if (mutex_trylock(&r->run_lock)) {
		bch2_run_recovery_passes(c,
				(c->sb.recovery_passes_required |
				 r->scheduled_passes_ephemeral) &
				~r->passes_ratelimiting &
				bch2_recovery_passes_match(PASS_ONLINE),
				false);
		mutex_unlock(&r->run_lock);
	}

	enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
}

void bch2_run_async_recovery_passes(struct bch_fs *c)
{
	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes))
		return;

	if (queue_work(system_long_wq, &c->recovery.work))
		return;

	enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
}
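/*
 * Called at mount time: build the set of passes to run from PASS_ALWAYS,
 * PASS_UNCLEAN (if the filesystem wasn't cleanly shut down) and PASS_FSCK (if
 * fsck was requested), plus anything requested via mount options or recorded
 * as required in the superblock.  When we're not doing a full fsck, passes
 * that can run online - and whose dependents can too - are deferred to the
 * background worker instead of delaying the mount.
 */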
int bch2_run_recovery_passes_startup(struct bch_fs *c, enum bch_recovery_pass from)
{
	struct bch_fs_recovery *r = &c->recovery;

	r->scheduled_passes_ephemeral = c->opts.recovery_passes;

	u64 passes =
		bch2_recovery_passes_match(PASS_ALWAYS) |
		(!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) |
		(c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) |
		c->opts.recovery_passes |
		c->sb.recovery_passes_required;

	if (c->opts.recovery_pass_last)
		passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1;

	/*
	 * We can't allow set_may_go_rw to be excluded; that would cause us to
	 * use the journal replay keys for updates where it's not expected.
	 */
	c->opts.recovery_passes_exclude &= ~BIT_ULL(BCH_RECOVERY_PASS_set_may_go_rw);
	passes &= ~c->opts.recovery_passes_exclude;

	passes &= ~(BIT_ULL(from) - 1);

	/*
	 * Defer passes that can be run online, and don't have dependents that
	 * can't be run online
	 */
	u64 defer = 0;
	if (!c->opts.fsck)
		for (unsigned i = 0; i < BCH_RECOVERY_PASS_NR; i++)
			if ((passes & BIT_ULL(i)) &&
			    recovery_pass_should_defer(i, passes)) {
				defer |= BIT_ULL(i);
				passes &= ~BIT_ULL(i);
			}

	scoped_guard(mutex, &r->run_lock)
		try(bch2_run_recovery_passes(c, passes, true));

	clear_bit(BCH_FS_in_recovery, &c->flags);

	if (defer) {
		CLASS(bch_log_msg_level, msg)(c, LOGLEVEL_notice);

		prt_printf(&msg.m, "Running the following recovery passes in the background:\n");
		prt_bitflags(&msg.m, bch2_recovery_passes, defer);

		r->scheduled_passes_ephemeral |= defer;
		bch2_run_async_recovery_passes(c);
	}

	return 0;
}

static void prt_passes(struct printbuf *out, const char *msg, u64 passes)
{
	prt_printf(out, "%s:\t", msg);
	prt_bitflags(out, bch2_recovery_passes, passes);
	prt_newline(out);
}

void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct bch_fs_recovery *r = &c->recovery;

	printbuf_tabstop_push(out, 32);
	prt_passes(out, "Scheduled (superblock)", c->sb.recovery_passes_required);
	prt_passes(out, "Scheduled (ephemeral)", r->scheduled_passes_ephemeral);
	prt_passes(out, "Completed", r->passes_complete);
	prt_passes(out, "Failing", r->passes_failing);

	if (r->current_pass) {
		prt_printf(out, "Currently running:\t%s (%u)\n",
			   bch2_recovery_passes[r->current_pass], r->current_pass);
		prt_passes(out, "Next", r->current_passes);

		if (test_bit(BCH_FS_in_recovery, &c->flags) &&
		    r->rewound_from)
			prt_printf(out, "Rewound from:\t%s (%u)\n",
				   bch2_recovery_passes[r->rewound_from], r->rewound_from);
	}
}

void bch2_fs_recovery_passes_init(struct bch_fs *c)
{
	spin_lock_init(&c->recovery.lock);
	mutex_init(&c->recovery.run_lock);
	INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work);
}