Update bcachefs sources to 6e73711dc355 bcachefs: Read retries after checksum errors are now REQ_FUA

Kent Overstreet 2025-04-28 12:50:37 -04:00
parent e00e83b84d
commit 0589d9f3c3
43 changed files with 797 additions and 453 deletions

View File

@ -1 +1 @@
c9d875f9be1f853e747c9e00421c678b0adf73d2
6e73711dc3556f90eefa12d6cc7547d4b0eba5dc

View File

@ -484,7 +484,7 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
ret = bch2_trans_update(trans, &iter, &a->k_i, flags);
ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_);
bch2_trans_iter_exit(trans, &iter);
return unlikely(ret) ? ERR_PTR(ret) : a;
}
@ -2393,14 +2393,16 @@ bkey_err:
int bch2_fs_freespace_init(struct bch_fs *c)
{
int ret = 0;
bool doing_init = false;
if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image))
return 0;
/*
* We can crash during the device add path, so we need to check this on
* every mount:
*/
bool doing_init = false;
for_each_member_device(c, ca) {
if (ca->mi.freespace_initialized)
continue;
@ -2410,7 +2412,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
doing_init = true;
}
ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
if (ret) {
bch2_dev_put(ca);
bch_err_fn(c, ret);

View File

@ -1339,6 +1339,8 @@ alloc_done:
open_bucket_for_each(c, &req->wp->ptrs, ob, i)
req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free);
req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c));
BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX);
return 0;
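
The rounddown clamps the write point's remaining space to a whole number of filesystem blocks. With hypothetical numbers, assuming 4 KiB blocks so that block_sectors() == 8:

    unsigned sectors_free = 13;
    sectors_free = rounddown(sectors_free, 8);  /* -> 8 */

The next file's hunk is the consumer-side half of the same change: an open bucket is now dropped from the write point once it has fewer than block_sectors() sectors left, not only when it is completely empty.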

View File

@ -157,7 +157,9 @@ static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct writ
unsigned i;
open_bucket_for_each(c, &wp->ptrs, ob, i)
ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob);
ob_push(c, ob->sectors_free < block_sectors(c)
? &ptrs
: &keep, ob);
wp->ptrs = keep;
mutex_unlock(&wp->lock);

View File

@ -295,6 +295,16 @@ do { \
bch2_print(_c, __VA_ARGS__); \
} while (0)
#define bch2_print_str_ratelimited(_c, ...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
\
if (__ratelimit(&_rs)) \
bch2_print_str(_c, __VA_ARGS__); \
} while (0)
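
The new macro mirrors bch_info_ratelimited() just below, but prints an already-formatted (possibly multi-line) string, and each call site gets its own ratelimit state. Later hunks in this commit use it for multi-line read error reports:

    bch2_io_failures_to_text(&buf, c, &failed);
    bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);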
#define bch_info(c, fmt, ...) \
bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_info_ratelimited(c, fmt, ...) \
@ -834,6 +844,7 @@ struct bch_fs {
unsigned nsec_per_time_unit;
u64 features;
u64 compat;
u64 recovery_passes_required;
unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)];
u64 btrees_lost_data;
} sb;

View File

@ -350,20 +350,13 @@ again:
prt_char(&buf, ' ');
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO),
trans, btree_node_read_error,
"Topology repair: unreadable btree node at\n%s",
buf.buf)) {
if (bch2_err_matches(ret, EIO)) {
bch2_btree_node_evict(trans, cur_k.k);
cur = NULL;
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
if (ret)
break;
ret = bch2_btree_lost_data(c, b->c.btree_id);
if (ret)
break;
continue;
}
@ -525,9 +518,6 @@ int bch2_check_topology(struct bch_fs *c)
bch2_btree_id_to_text(&buf, i);
if (r->error) {
ret = bch2_btree_lost_data(c, i);
if (ret)
break;
reconstruct_root:
bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);

View File

@ -515,19 +515,23 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
struct bch_dev *ca,
bool print_pos,
struct btree *b, struct bset *i, struct bkey_packed *k,
unsigned offset, int write)
unsigned offset, int rw)
{
prt_printf(out, bch2_log_msg(c, "%s"),
write == READ
? "error validating btree node "
: "corrupt btree node before write ");
if (ca)
prt_printf(out, "on %s ", ca->name);
prt_printf(out, "at btree ");
bch2_btree_pos_to_text(out, c, b);
if (print_pos) {
prt_str(out, rw == READ
? "error validating btree node "
: "corrupt btree node before write ");
prt_printf(out, "at btree ");
bch2_btree_pos_to_text(out, c, b);
prt_newline(out);
}
prt_printf(out, "\nnode offset %u/%u",
if (ca)
prt_printf(out, "%s ", ca->name);
prt_printf(out, "node offset %u/%u",
b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
@ -538,75 +542,110 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_str(out, ": ");
}
__printf(10, 11)
__printf(11, 12)
static int __btree_err(int ret,
struct bch_fs *c,
struct bch_dev *ca,
struct btree *b,
struct bset *i,
struct bkey_packed *k,
int write,
bool have_retry,
int rw,
enum bch_sb_error_id err_type,
struct bch_io_failures *failed,
struct printbuf *err_msg,
const char *fmt, ...)
{
bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
if (c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
return -BCH_ERR_fsck_fix;
bool have_retry = false;
int ret2;
if (ca) {
bch2_mark_btree_validate_failure(failed, ca->dev_idx);
struct extent_ptr_decoded pick;
have_retry = !bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
failed, &pick, -1);
}
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
ret = -BCH_ERR_btree_node_read_err_fixable;
if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
ret = -BCH_ERR_btree_node_read_err_bad_node;
if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
bch2_sb_error_count(c, err_type);
bch2_sb_error_count(c, err_type);
bool print_deferred = err_msg &&
rw == READ &&
!(test_bit(BCH_FS_fsck_running, &c->flags) &&
c->opts.fix_errors == FSCK_FIX_ask);
struct printbuf out = PRINTBUF;
if (write != WRITE && ret != -BCH_ERR_btree_node_read_err_fixable) {
printbuf_indent_add_nextline(&out, 2);
#ifdef BCACHEFS_LOG_PREFIX
prt_printf(&out, bch2_log_msg(c, ""));
#endif
}
bch2_log_msg_start(c, &out);
btree_err_msg(&out, c, ca, b, i, k, b->written, write);
if (!print_deferred)
err_msg = &out;
btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw);
va_list args;
va_start(args, fmt);
prt_vprintf(&out, fmt, args);
prt_vprintf(err_msg, fmt, args);
va_end(args);
if (write == WRITE) {
if (print_deferred) {
prt_newline(err_msg);
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type);
if (ret2 != -BCH_ERR_fsck_fix &&
ret2 != -BCH_ERR_fsck_ignore) {
ret = ret2;
goto fsck_err;
}
if (!have_retry)
ret = -BCH_ERR_fsck_fix;
goto out;
case -BCH_ERR_btree_node_read_err_bad_node:
prt_str(&out, ", ");
ret = __bch2_topology_error(c, &out);
break;
}
goto out;
}
if (rw == WRITE) {
prt_str(&out, ", ");
ret = __bch2_inconsistent_error(c, &out)
? -BCH_ERR_fsck_errors_not_fixed
: 0;
silent = false;
goto print;
}
switch (ret) {
case -BCH_ERR_btree_node_read_err_fixable:
ret = !silent
? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
: -BCH_ERR_fsck_fix;
if (ret != -BCH_ERR_fsck_fix &&
ret != -BCH_ERR_fsck_ignore)
ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf);
if (ret2 != -BCH_ERR_fsck_fix &&
ret2 != -BCH_ERR_fsck_ignore) {
ret = ret2;
goto fsck_err;
ret = -BCH_ERR_fsck_fix;
}
if (!have_retry)
ret = -BCH_ERR_fsck_fix;
goto out;
case -BCH_ERR_btree_node_read_err_bad_node:
prt_str(&out, ", ");
ret = __bch2_topology_error(c, &out);
if (ret)
silent = false;
break;
case -BCH_ERR_btree_node_read_err_incompatible:
ret = -BCH_ERR_fsck_errors_not_fixed;
silent = false;
break;
}
if (!silent)
bch2_print_str(c, KERN_ERR, out.buf);
print:
bch2_print_str(c, KERN_ERR, out.buf);
out:
fsck_err:
printbuf_exit(&out);
@ -615,8 +654,9 @@ fsck_err:
#define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
({ \
int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \
int _ret = __btree_err(type, c, ca, b, i, k, write, \
BCH_FSCK_ERR_##_err_type, \
failed, err_msg, \
msg, ##__VA_ARGS__); \
\
if (_ret != -BCH_ERR_fsck_fix) { \
@ -624,7 +664,7 @@ fsck_err:
goto fsck_err; \
} \
\
*saw_error = true; \
true; \
})
#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
@ -682,8 +722,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
struct btree *b, struct bset *i,
unsigned offset, unsigned sectors,
int write, bool have_retry, bool *saw_error)
unsigned offset, unsigned sectors, int write,
struct bch_io_failures *failed,
struct printbuf *err_msg)
{
unsigned version = le16_to_cpu(i->version);
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
@ -896,7 +937,8 @@ static inline int btree_node_read_bkey_cmp(const struct btree *b,
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
struct bset *i, int write,
bool have_retry, bool *saw_error)
struct bch_io_failures *failed,
struct printbuf *err_msg)
{
unsigned version = le16_to_cpu(i->version);
struct bkey_packed *k, *prev = NULL;
@ -1009,7 +1051,9 @@ fsck_err:
}
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
struct btree *b, bool have_retry, bool *saw_error)
struct btree *b,
struct bch_io_failures *failed,
struct printbuf *err_msg)
{
struct btree_node_entry *bne;
struct sort_iter *iter;
@ -1022,7 +1066,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
u64 max_journal_seq = 0;
struct printbuf buf = PRINTBUF;
int ret = 0, retry_read = 0, write = READ;
int ret = 0, write = READ;
u64 start_time = local_clock();
b->version_ondisk = U16_MAX;
@ -1156,15 +1200,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
b->version_ondisk = min(b->version_ondisk,
le16_to_cpu(i->version));
ret = validate_bset(c, ca, b, i, b->written, sectors,
READ, have_retry, saw_error);
ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg);
if (ret)
goto fsck_err;
if (!b->written)
btree_node_set_format(b, b->data->format);
ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
ret = validate_bset_keys(c, b, i, READ, failed, err_msg);
if (ret)
goto fsck_err;
@ -1292,20 +1335,11 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (!ptr_written)
set_btree_node_need_rewrite(b);
out:
fsck_err:
mempool_free(iter, &c->fill_iter);
printbuf_exit(&buf);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
ret == -BCH_ERR_btree_node_read_err_must_retry) {
retry_read = 1;
} else {
set_btree_node_read_error(b);
bch2_btree_lost_data(c, b->c.btree_id);
}
goto out;
return ret;
}
static void btree_node_read_work(struct work_struct *work)
@ -1317,15 +1351,25 @@ static void btree_node_read_work(struct work_struct *work)
struct btree *b = rb->b;
struct bio *bio = &rb->bio;
struct bch_io_failures failed = { .nr = 0 };
int ret = 0;
struct printbuf buf = PRINTBUF;
bool saw_error = false;
bool retry = false;
bool can_retry;
bch2_log_msg_start(c, &buf);
prt_printf(&buf, "btree node read error at btree ");
bch2_btree_pos_to_text(&buf, c, b);
prt_newline(&buf);
goto start;
while (1) {
retry = true;
bch_info(c, "retrying read");
ret = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
&failed, &rb->pick, -1);
if (ret) {
set_btree_node_read_error(b);
break;
}
ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
rb->have_ioref = ca != NULL;
rb->start_time = local_clock();
@ -1343,60 +1387,59 @@ static void btree_node_read_work(struct work_struct *work)
bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
rb->start_time, !bio->bi_status);
start:
printbuf_reset(&buf);
bch2_btree_pos_to_text(&buf, c, b);
if (ca && bio->bi_status)
bch_err_dev_ratelimited(ca,
"btree read error %s for %s",
bch2_blk_status_to_str(bio->bi_status), buf.buf);
if (rb->have_ioref)
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
rb->have_ioref = false;
bch2_mark_io_failure(&failed, &rb->pick, false);
can_retry = bch2_bkey_pick_read_device(c,
bkey_i_to_s_c(&b->key),
&failed, &rb->pick, -1) > 0;
if (!bio->bi_status &&
!bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
if (retry)
bch_info(c, "retry success");
break;
if (bio->bi_status) {
bch2_mark_io_failure(&failed, &rb->pick, false);
continue;
}
saw_error = true;
ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
ret == -BCH_ERR_btree_node_read_err_must_retry)
continue;
if (!can_retry) {
if (ret)
set_btree_node_read_error(b);
bch2_btree_lost_data(c, b->c.btree_id);
break;
}
break;
}
bch2_io_failures_to_text(&buf, c, &failed);
if (btree_node_read_error(b))
bch2_btree_lost_data(c, &buf, b->c.btree_id);
/*
* only print retry success if we read from a replica with no errors
*/
if (btree_node_read_error(b))
prt_printf(&buf, "ret %s", bch2_err_str(ret));
else if (failed.nr) {
if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
prt_printf(&buf, "retry success");
else
prt_printf(&buf, "repair success");
}
if ((failed.nr ||
btree_node_need_rewrite(b)) &&
!btree_node_read_error(b) &&
c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
prt_printf(&buf, " (rewriting node)");
bch2_btree_node_rewrite_async(c, b);
}
prt_newline(&buf);
if (failed.nr)
bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
async_object_list_del(c, btree_read_bio, rb->list_idx);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
rb->start_time);
bio_put(&rb->bio);
if ((saw_error ||
btree_node_need_rewrite(b)) &&
!btree_node_read_error(b) &&
c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
if (saw_error) {
printbuf_reset(&buf);
bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
prt_str(&buf, " ");
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s",
__func__, buf.buf);
}
bch2_btree_node_rewrite_async(c, b);
}
printbuf_exit(&buf);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
@ -1480,12 +1523,13 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
struct btree *b = ra->b;
struct printbuf buf = PRINTBUF;
bool dump_bset_maps = false;
bool have_retry = false;
int ret = 0, best = -1, write = READ;
unsigned i, written = 0, written2 = 0;
__le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
bool _saw_error = false, *saw_error = &_saw_error;
struct printbuf *err_msg = NULL;
struct bch_io_failures *failed = NULL;
for (i = 0; i < ra->nr; i++) {
struct btree_node *bn = ra->buf[i];
@ -1578,14 +1622,19 @@ fsck_err:
if (best >= 0) {
memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL);
} else {
ret = -1;
}
if (ret) {
set_btree_node_read_error(b);
bch2_btree_lost_data(c, b->c.btree_id);
struct printbuf buf = PRINTBUF;
bch2_btree_lost_data(c, &buf, b->c.btree_id);
if (buf.pos)
bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
} else if (*saw_error)
bch2_btree_node_rewrite_async(c, b);
@ -1718,6 +1767,8 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
prt_str(&buf, "btree node read error: no device to read from\n at ");
bch2_btree_pos_to_text(&buf, c, b);
prt_newline(&buf);
bch2_btree_lost_data(c, &buf, b->c.btree_id);
bch_err_ratelimited(c, "%s", buf.buf);
if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
@ -1725,7 +1776,6 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bch2_fatal_error(c);
set_btree_node_read_error(b);
bch2_btree_lost_data(c, b->c.btree_id);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
printbuf_exit(&buf);
@ -2194,8 +2244,6 @@ static void btree_node_write_endio(struct bio *bio)
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
struct bset *i, unsigned sectors)
{
bool saw_error;
int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
(struct bkey_validate_context) {
.from = BKEY_VALIDATE_btree_node,
@ -2208,8 +2256,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
return ret;
}
ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?:
validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL);
if (ret) {
bch2_inconsistent_error(c);
dump_stack();

View File

@ -134,7 +134,9 @@ void bch2_btree_build_aux_trees(struct btree *);
void bch2_btree_init_next(struct btree_trans *, struct btree *);
int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
struct btree *, bool, bool *);
struct btree *,
struct bch_io_failures *,
struct printbuf *);
void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);

View File

@ -2577,7 +2577,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct
struct bpos end)
{
if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) &&
!bkey_eq(iter->pos, POS_MAX)) {
!bkey_eq(iter->pos, POS_MAX) &&
!((iter->flags & BTREE_ITER_is_extents) &&
iter->pos.offset == U64_MAX)) {
/*
* bkey_start_pos(), for extents, is not monotonically
* increasing until after filtering for snapshots:
@ -2602,7 +2605,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct
bch2_trans_verify_not_unlocked_or_in_restart(trans);
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode);
int ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {
@ -3123,6 +3126,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long
struct btree_transaction_stats *s = btree_trans_stats(trans);
if (new_bytes > s->max_mem) {
mutex_lock(&s->lock);
#ifdef CONFIG_BCACHEFS_DEBUG
darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr);
s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size,
@ -3134,6 +3138,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long
s->trans_kmalloc_trace.nr);
#endif
s->max_mem = new_bytes;
mutex_unlock(&s->lock);
}
if (trans->used_mempool) {

View File

@ -511,8 +511,9 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
return 0;
}
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
unsigned long ip)
{
kmsan_check_memory(k, bkey_bytes(&k->k));
@ -548,7 +549,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
path_idx = iter->key_cache_path;
}
return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_);
return bch2_trans_update_by_path(trans, path_idx, k, flags, ip);
}
int bch2_btree_insert_clone_trans(struct btree_trans *trans,

View File

@ -102,8 +102,16 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *
int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
enum btree_id, struct bpos);
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_iter_update_trigger_flags);
int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_iter_update_trigger_flags,
unsigned long);
static inline int __must_check
bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
{
return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_);
}
struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
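
This is the usual caller-IP threading pattern: the _ip variant takes the call site explicitly, and the old name becomes an inline wrapper passing _THIS_IP_, so tracing and debug output attribute the update to the real caller rather than to bch2_trans_update() itself. A minimal user-space sketch of the same shape, assuming GCC/clang extensions (names here are illustrative, not bcachefs API):

    #include <stdio.h>

    /* same trick as the kernel's _THIS_IP_: address of the current location */
    #define THIS_IP ({ __label__ __here; __here: (unsigned long)&&__here; })

    static int update_ip(int v, unsigned long ip)
    {
            printf("update(%d) called from %#lx\n", v, ip);
            return 0;
    }

    static inline int update(int v)
    {
            /* the wrapper is inlined, so THIS_IP lands at the call site */
            return update_ip(v, THIS_IP);
    }

    int main(void) { return update(42); }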

View File

@ -399,7 +399,7 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf
bool print = __bch2_count_fsck_err(c, id, buf);
int ret = bch2_run_explicit_recovery_pass_printbuf(c, buf,
int ret = bch2_run_explicit_recovery_pass_persistent(c, buf,
BCH_RECOVERY_PASS_check_allocations);
if (insert) {
@ -599,6 +599,13 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
}
struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr);
if (!bucket_valid(ca, bucket.offset)) {
if (insert) {
bch2_dev_bucket_missing(ca, bucket.offset);
ret = -BCH_ERR_trigger_pointer;
}
goto err;
}
if (flags & BTREE_TRIGGER_transactional) {
struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0);
@ -965,7 +972,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
bool print = bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf);
bch2_run_explicit_recovery_pass_printbuf(c, &buf,
bch2_run_explicit_recovery_pass_persistent(c, &buf,
BCH_RECOVERY_PASS_check_allocations);
if (print)
@ -1310,13 +1317,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
if (resize) {
bucket_gens->nbuckets = min(bucket_gens->nbuckets,
old_bucket_gens->nbuckets);
bucket_gens->nbuckets_minus_first =
bucket_gens->nbuckets - bucket_gens->first_bucket;
u64 copy = min(bucket_gens->nbuckets,
old_bucket_gens->nbuckets);
memcpy(bucket_gens->b,
old_bucket_gens->b,
bucket_gens->nbuckets);
sizeof(bucket_gens->b[0]) * copy);
}
rcu_assign_pointer(ca->bucket_gens, bucket_gens);

View File

@ -42,7 +42,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
struct btree_node *n_sorted = c->verify_data->data;
struct bset *sorted, *inmemory = &b->data->keys;
struct bio *bio;
bool failed = false, saw_error = false;
bool failed = false;
struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
BCH_DEV_READ_REF_btree_verify_replicas);
@ -66,7 +66,7 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
v->written = 0;
if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
if (bch2_btree_node_read_done(c, ca, v, NULL, NULL))
return false;
n_sorted = c->verify_data->data;

View File

@ -13,8 +13,8 @@
#include <linux/dcache.h>
static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
const struct qstr *str, struct qstr *out_cf)
int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
const struct qstr *str, struct qstr *out_cf)
{
*out_cf = (struct qstr) QSTR_INIT(NULL, 0);
@ -35,18 +35,6 @@ static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *
#endif
}
static inline int bch2_maybe_casefold(struct btree_trans *trans,
const struct bch_hash_info *info,
const struct qstr *str, struct qstr *out_cf)
{
if (likely(!info->cf_encoding)) {
*out_cf = *str;
return 0;
} else {
return bch2_casefold(trans, info, str, out_cf);
}
}
static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
@ -224,12 +212,19 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
struct qstr d_name = bch2_dirent_get_name(d);
prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
prt_printf(out, "%.*s", d_name.len, d_name.name);
if (d.v->d_casefold) {
struct qstr d_name = bch2_dirent_get_lookup_name(d);
prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name);
}
prt_str(out, " ->");
if (d.v->d_type != DT_SUBVOL)
prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum));
else
prt_printf(out, "%u -> %u",
prt_printf(out, " %u -> %u",
le32_to_cpu(d.v->d_parent_subvol),
le32_to_cpu(d.v->d_child_subvol));

View File

@ -23,6 +23,21 @@ struct bch_fs;
struct bch_hash_info;
struct bch_inode_info;
int bch2_casefold(struct btree_trans *, const struct bch_hash_info *,
const struct qstr *, struct qstr *);
static inline int bch2_maybe_casefold(struct btree_trans *trans,
const struct bch_hash_info *info,
const struct qstr *str, struct qstr *out_cf)
{
if (likely(!info->cf_encoding)) {
*out_cf = *str;
return 0;
} else {
return bch2_casefold(trans, info, str, out_cf);
}
}
struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d);
static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len)

View File

@ -175,6 +175,7 @@
x(0, backpointer_to_overwritten_btree_node) \
x(0, journal_reclaim_would_deadlock) \
x(EINVAL, fsck) \
x(BCH_ERR_fsck, fsck_ask) \
x(BCH_ERR_fsck, fsck_fix) \
x(BCH_ERR_fsck, fsck_delete_bkey) \
x(BCH_ERR_fsck, fsck_ignore) \

View File

@ -104,7 +104,7 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out)
__bch2_inconsistent_error(c, out);
return -BCH_ERR_btree_need_topology_repair;
} else {
return bch2_run_explicit_recovery_pass_printbuf(c, out, BCH_RECOVERY_PASS_check_topology) ?:
return bch2_run_explicit_recovery_pass_persistent(c, out, BCH_RECOVERY_PASS_check_topology) ?:
-BCH_ERR_btree_node_read_validate_error;
}
}
@ -393,6 +393,48 @@ bool __bch2_count_fsck_err(struct bch_fs *c,
return print && !repeat;
}
int bch2_fsck_err_opt(struct bch_fs *c,
enum bch_fsck_flags flags,
enum bch_sb_error_id err)
{
if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra)))
flags |= fsck_flags_extra[err];
if (test_bit(BCH_FS_fsck_running, &c->flags)) {
if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE)))
return -BCH_ERR_fsck_repair_unimplemented;
switch (c->opts.fix_errors) {
case FSCK_FIX_exit:
return -BCH_ERR_fsck_errors_not_fixed;
case FSCK_FIX_yes:
if (flags & FSCK_CAN_FIX)
return -BCH_ERR_fsck_fix;
fallthrough;
case FSCK_FIX_no:
if (flags & FSCK_CAN_IGNORE)
return -BCH_ERR_fsck_ignore;
return -BCH_ERR_fsck_errors_not_fixed;
case FSCK_FIX_ask:
if (flags & FSCK_AUTOFIX)
return -BCH_ERR_fsck_fix;
return -BCH_ERR_fsck_ask;
default:
BUG();
}
} else {
if ((flags & FSCK_AUTOFIX) &&
(c->opts.errors == BCH_ON_ERROR_continue ||
c->opts.errors == BCH_ON_ERROR_fix_safe))
return -BCH_ERR_fsck_fix;
if (c->opts.errors == BCH_ON_ERROR_continue &&
(flags & FSCK_CAN_IGNORE))
return -BCH_ERR_fsck_ignore;
return -BCH_ERR_fsck_errors_not_fixed;
}
}
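
bch2_fsck_err_opt() resolves an error's disposition purely from the options, with no printing or prompting; __btree_err() consumes it in the deferred-print path shown earlier in this commit:

    case -BCH_ERR_btree_node_read_err_fixable:
            ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type);
            if (ret2 != -BCH_ERR_fsck_fix &&
                ret2 != -BCH_ERR_fsck_ignore) {
                    ret = ret2;
                    goto fsck_err;
            }

Note that FSCK_FIX_ask yields -BCH_ERR_fsck_ask unless the error is AUTOFIX, which matches the print_deferred condition in that hunk: printing is only deferred when fsck is not going to prompt the user.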
int __bch2_fsck_err(struct bch_fs *c,
struct btree_trans *trans,
enum bch_fsck_flags flags,

View File

@ -80,6 +80,10 @@ bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbu
#define bch2_count_fsck_err(_c, _err, ...) \
__bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__)
int bch2_fsck_err_opt(struct bch_fs *,
enum bch_fsck_flags,
enum bch_sb_error_id);
__printf(5, 6) __cold
int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
enum bch_fsck_flags,

View File

@ -45,6 +45,49 @@ static void bch2_extent_crc_pack(union bch_extent_crc *,
struct bch_extent_crc_unpacked,
enum bch_extent_entry_type);
void bch2_io_failures_to_text(struct printbuf *out,
struct bch_fs *c,
struct bch_io_failures *failed)
{
static const char * const error_types[] = {
"io", "checksum", "ec reconstruct", NULL
};
for (struct bch_dev_io_failures *f = failed->devs;
f < failed->devs + failed->nr;
f++) {
unsigned errflags =
((!!f->failed_io) << 0) |
((!!f->failed_csum_nr) << 1) |
((!!f->failed_ec) << 2);
if (!errflags)
continue;
bch2_printbuf_make_room(out, 1024);
rcu_read_lock();
out->atomic++;
struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev);
if (ca)
prt_str(out, ca->name);
else
prt_printf(out, "(invalid device %u)", f->dev);
--out->atomic;
rcu_read_unlock();
prt_char(out, ' ');
if (is_power_of_2(errflags)) {
prt_bitflags(out, error_types, errflags);
prt_str(out, " error");
} else {
prt_str(out, "errors: ");
prt_bitflags(out, error_types, errflags);
}
prt_newline(out);
}
}
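
A worked example of the errflags encoding, with hypothetical failure counts: one IO failure and two checksum failures give

    unsigned errflags = ((!!1) << 0) |      /* failed_io */
                        ((!!2) << 1) |      /* failed_csum_nr */
                        ((!!0) << 2);       /* failed_ec */
    /* errflags == 3; !is_power_of_2(3), so the plural branch
     * prints "errors: io,checksum" */

whereas a lone checksum failure (errflags == 2) takes the is_power_of_2() branch and prints the singular "checksum error".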
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f,
unsigned dev)
{
@ -79,6 +122,22 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
f->failed_csum_nr++;
}
void bch2_mark_btree_validate_failure(struct bch_io_failures *failed,
unsigned dev)
{
struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev);
if (!f) {
BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
f = &failed->devs[failed->nr++];
memset(f, 0, sizeof(*f));
f->dev = dev;
}
f->failed_btree_validate = true;
}
static inline u64 dev_latency(struct bch_dev *ca)
{
return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX;
@ -179,6 +238,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) {
have_io_errors |= f->failed_io;
have_io_errors |= f->failed_btree_validate;
have_io_errors |= f->failed_ec;
}
have_csum_errors |= !!f->failed_csum_nr;
@ -186,6 +246,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
if (p.has_ec && (f->failed_io || f->failed_csum_nr))
p.do_ec_reconstruct = true;
else if (f->failed_io ||
f->failed_btree_validate ||
f->failed_csum_nr > c->opts.checksum_err_retry_nr)
continue;
}
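
Condensing the selection policy these two hunks add (a sketch over this commit's struct, not new API): a replica is skipped outright after an IO or btree-validation failure, while checksum failures are retried up to the checksum_err_retry_nr option:

    static bool skip_replica(const struct bch_dev_io_failures *f,
                             unsigned checksum_err_retry_nr)
    {
            return f->failed_io ||
                   f->failed_btree_validate ||
                   f->failed_csum_nr > checksum_err_retry_nr;
    }

When the pointer has erasure coding available, the same failures instead set p.do_ec_reconstruct, so the data is rebuilt rather than the device skipped.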

View File

@ -399,10 +399,13 @@ out: \
/* utility code common to all keys with pointers: */
void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *,
struct bch_io_failures *);
struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *,
unsigned);
void bch2_mark_io_failure(struct bch_io_failures *,
struct extent_ptr_decoded *, bool);
void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned);
int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
struct bch_io_failures *,
struct extent_ptr_decoded *, int);

View File

@ -34,6 +34,7 @@ struct bch_io_failures {
u8 dev;
unsigned failed_csum_nr:6,
failed_io:1,
failed_btree_validate:1,
failed_ec:1;
} devs[BCH_REPLICAS_MAX + 1];
};

View File

@ -53,7 +53,7 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum,
struct bch_subvolume *);
/* Set VFS inode flags from bcachefs inode: */
static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode)
{
static const __maybe_unused unsigned bch_flags_to_vfs[] = {
[__BCH_INODE_sync] = S_SYNC,
@ -64,8 +64,10 @@ static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode)
set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags);
if (inode->ei_inode.bi_casefold)
if (bch2_inode_casefold(c, &inode->ei_inode))
inode->v.i_flags |= S_CASEFOLD;
else
inode->v.i_flags &= ~S_CASEFOLD;
}
void bch2_inode_update_after_write(struct btree_trans *trans,
@ -96,7 +98,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
inode->ei_inode = *bi;
bch2_inode_flags_to_vfs(inode);
bch2_inode_flags_to_vfs(c, inode);
}
int __must_check bch2_write_inode(struct bch_fs *c,
@ -647,13 +649,18 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
const struct qstr *name)
{
struct bch_fs *c = trans->c;
struct btree_iter dirent_iter = {};
subvol_inum inum = {};
struct printbuf buf = PRINTBUF;
struct qstr lookup_name;
int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name);
if (ret)
return ERR_PTR(ret);
struct btree_iter dirent_iter = {};
struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
dir_hash_info, dir, name, 0);
int ret = bkey_err(k);
dir_hash_info, dir, &lookup_name, 0);
ret = bkey_err(k);
if (ret)
return ERR_PTR(ret);
@ -841,6 +848,9 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
*/
set_nlink(&inode->v, 0);
}
if (IS_CASEFOLDED(vdir))
d_invalidate(dentry);
err:
bch2_trans_put(trans);
bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode);
@ -1251,10 +1261,20 @@ static int bch2_tmpfile(struct mnt_idmap *idmap,
return finish_open_simple(file, 0);
}
struct bch_fiemap_extent {
struct bkey_buf kbuf;
unsigned flags;
};
static int bch2_fill_extent(struct bch_fs *c,
struct fiemap_extent_info *info,
struct bkey_s_c k, unsigned flags)
struct bch_fiemap_extent *fe)
{
struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k);
unsigned flags = fe->flags;
BUG_ON(!k.k->size);
if (bkey_extent_is_direct_data(k.k)) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@ -1307,110 +1327,223 @@ static int bch2_fill_extent(struct bch_fs *c,
}
}
/*
* Scan a range of an inode for data in pagecache.
*
* Intended to be retryable, so don't modify the output params until success is
* imminent.
*/
static int
bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end,
bool nonblock)
{
loff_t dstart, dend;
dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock);
if (dstart < 0)
return dstart;
if (dstart == *end) {
*start = dstart;
return 0;
}
dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock);
if (dend < 0)
return dend;
/* race */
BUG_ON(dstart == dend);
*start = dstart;
*end = dend;
return 0;
}
/*
* Scan a range of pagecache that corresponds to a file mapping hole in the
* extent btree. If data is found, fake up an extent key so it looks like a
* delalloc extent to the rest of the fiemap processing code.
*/
static int
bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode,
u64 start, u64 end, struct bch_fiemap_extent *cur)
{
struct bch_fs *c = trans->c;
struct bkey_i_extent *delextent;
struct bch_extent_ptr ptr = {};
loff_t dstart = start << 9, dend = end << 9;
int ret;
/*
* We hold btree locks here so we cannot block on folio locks without
* dropping trans locks first. Run a nonblocking scan for the common
* case of no folios over holes and fall back on failure.
*
* Note that dropping locks like this is technically racy against
* writeback inserting to the extent tree, but a non-sync fiemap scan is
* fundamentally racy with writeback anyways. Therefore, just report the
* range as delalloc regardless of whether we have to cycle trans locks.
*/
ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true);
if (ret == -EAGAIN)
ret = drop_locks_do(trans,
bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false));
if (ret < 0)
return ret;
/*
* Create a fake extent key in the buffer. We have to add a dummy extent
* pointer for the fill code to add an extent entry. It's explicitly
* zeroed to reflect delayed allocation (i.e. phys offset 0).
*/
bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64));
delextent = bkey_extent_init(cur->kbuf.k);
delextent->k.p = POS(inode->ei_inum.inum, dend >> 9);
delextent->k.size = (dend - dstart) >> 9;
bch2_bkey_append_ptr(&delextent->k_i, ptr);
cur->flags = FIEMAP_EXTENT_DELALLOC;
return 0;
}
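
The locking comment above describes a recurring bcachefs shape: try the operation nonblocking while btree locks are held, and only on -EAGAIN cycle the transaction locks and retry blocking. Reduced to its essentials (try_it stands in for bch2_fiemap_hole_pagecache(); drop_locks_do() is the existing helper used above, which runs the expression with the transaction's locks dropped):

    ret = try_it(..., true);                    /* nonblocking, under btree locks */
    if (ret == -EAGAIN)
            ret = drop_locks_do(trans,
                    try_it(..., false));        /* blocking retry */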
static int bch2_next_fiemap_extent(struct btree_trans *trans,
struct bch_inode_info *inode,
u64 start, u64 end,
struct bch_fiemap_extent *cur)
{
u32 snapshot;
int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot);
if (ret)
return ret;
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
SPOS(inode->ei_inum.inum, start, snapshot), 0);
struct bkey_s_c k =
bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end));
ret = bkey_err(k);
if (ret)
goto err;
ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur);
if (ret)
goto err;
struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k);
/*
* Does the pagecache or the btree take precedence?
*
* It _should_ be the pagecache, so that we correctly report delalloc
* extents when dirty in the pagecache (we're COW, after all).
*
* But we'd have to add per-sector writeback tracking to
* bch_folio_state, otherwise we report delalloc extents for clean
* cached data in the pagecache.
*
* We should do this, but even then fiemap won't report stable mappings:
* on bcachefs data moves around in the background (copygc, rebalance)
* and we don't provide a way for userspace to lock that out.
*/
if (k.k &&
bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)),
pagecache_start)) {
bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k);
bch2_cut_front(iter.pos, cur->kbuf.k);
bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k);
cur->flags = 0;
} else if (k.k) {
bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k);
}
if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) {
unsigned sectors = cur->kbuf.k->k.size;
s64 offset_into_extent = 0;
enum btree_id data_btree = BTREE_ID_extents;
ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent,
&cur->kbuf);
if (ret)
goto err;
struct bkey_i *k = cur->kbuf.k;
sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent);
bch2_cut_front(POS(k->k.p.inode,
bkey_start_offset(&k->k) + offset_into_extent),
k);
bch2_key_resize(&k->k, sectors);
k->k.p = iter.pos;
k->k.p.offset += k->k.size;
}
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
u64 start, u64 len)
{
struct bch_fs *c = vinode->i_sb->s_fs_info;
struct bch_inode_info *ei = to_bch_ei(vinode);
struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
struct bkey_buf cur, prev;
bool have_extent = false;
struct bch_fiemap_extent cur, prev;
int ret = 0;
ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC);
ret = fiemap_prep(&ei->v, info, start, &len, 0);
if (ret)
return ret;
struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
if (start + len < start)
return -EINVAL;
start >>= 9;
u64 end = (start + len) >> 9;
bch2_bkey_buf_init(&cur.kbuf);
bch2_bkey_buf_init(&prev.kbuf);
bkey_init(&prev.kbuf.k->k);
bch2_bkey_buf_init(&cur);
bch2_bkey_buf_init(&prev);
trans = bch2_trans_get(c);
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(ei->v.i_ino, start), 0);
while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
enum btree_id data_btree = BTREE_ID_extents;
bch2_trans_begin(trans);
u32 snapshot;
ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
while (start < end) {
ret = lockrestart_do(trans,
bch2_next_fiemap_extent(trans, ei, start, end, &cur));
if (ret)
continue;
goto err;
bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start);
BUG_ON(cur.kbuf.k->k.p.offset > end);
k = bch2_btree_iter_peek_max(trans, &iter, end);
ret = bkey_err(k);
if (ret)
continue;
if (!k.k)
if (bkey_start_offset(&cur.kbuf.k->k) == end)
break;
if (!bkey_extent_is_data(k.k) &&
k.k->type != KEY_TYPE_reservation) {
bch2_btree_iter_advance(trans, &iter);
continue;
}
start = cur.kbuf.k->k.p.offset;
s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
unsigned sectors = k.k->size - offset_into_extent;
bch2_bkey_buf_reassemble(&cur, c, k);
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &cur);
if (ret)
continue;
k = bkey_i_to_s_c(cur.k);
bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent);
bch2_cut_front(POS(k.k->p.inode,
bkey_start_offset(k.k) +
offset_into_extent),
cur.k);
bch2_key_resize(&cur.k->k, sectors);
cur.k->k.p = iter.pos;
cur.k->k.p.offset += cur.k->k.size;
if (have_extent) {
if (!bkey_deleted(&prev.kbuf.k->k)) {
bch2_trans_unlock(trans);
ret = bch2_fill_extent(c, info,
bkey_i_to_s_c(prev.k), 0);
ret = bch2_fill_extent(c, info, &prev);
if (ret)
break;
goto err;
}
bkey_copy(prev.k, cur.k);
have_extent = true;
bch2_btree_iter_set_pos(trans, &iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k);
prev.flags = cur.flags;
}
bch2_trans_iter_exit(trans, &iter);
if (!ret && have_extent) {
if (!bkey_deleted(&prev.kbuf.k->k)) {
bch2_trans_unlock(trans);
ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
FIEMAP_EXTENT_LAST);
prev.flags |= FIEMAP_EXTENT_LAST;
ret = bch2_fill_extent(c, info, &prev);
}
err:
bch2_trans_put(trans);
bch2_bkey_buf_exit(&cur, c);
bch2_bkey_buf_exit(&prev, c);
return ret < 0 ? ret : 0;
bch2_bkey_buf_exit(&cur.kbuf, c);
bch2_bkey_buf_exit(&prev.kbuf, c);
return bch2_err_class(ret < 0 ? ret : 0);
}
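
The rewritten loop runs one extent behind, so the final one can be flagged FIEMAP_EXTENT_LAST before being handed to userspace. Reduced to its shape (next() and fill() stand in for bch2_next_fiemap_extent() and bch2_fill_extent(); the real code uses bkey_deleted() on prev as the have_prev flag and copies with bch2_bkey_buf_copy()):

    bool have_prev = false;
    while (start < end && !next(&cur)) {
            if (have_prev)
                    fill(&prev, 0);
            copy(&prev, &cur);
            have_prev = true;
    }
    if (have_prev)
            fill(&prev, FIEMAP_EXTENT_LAST);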
static const struct vm_operations_struct bch_vm_ops = {
@ -1487,13 +1620,14 @@ static int bch2_fileattr_get(struct dentry *dentry,
struct fileattr *fa)
{
struct bch_inode_info *inode = to_bch_ei(d_inode(dentry));
struct bch_fs *c = inode->v.i_sb->s_fs_info;
fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags));
if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project))
fa->fsx_xflags |= FS_XFLAG_PROJINHERIT;
if (inode->ei_inode.bi_casefold)
if (bch2_inode_casefold(c, &inode->ei_inode))
fa->flags |= FS_CASEFOLD_FL;
fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ];
@ -1526,7 +1660,7 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans,
(s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags)
return -EINVAL;
if (s->casefold != bi->bi_casefold) {
if (s->casefold != bch2_inode_casefold(c, bi)) {
#ifdef CONFIG_UNICODE
int ret = 0;
/* Not supported on individual files. */
@ -1547,9 +1681,8 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans,
bch2_check_set_feature(c, BCH_FEATURE_casefolding);
bi->bi_casefold = s->casefold;
bi->bi_fields_set &= ~BIT(Inode_opt_casefold);
bi->bi_fields_set |= s->casefold << Inode_opt_casefold;
bi->bi_casefold = s->casefold + 1;
bi->bi_fields_set |= BIT(Inode_opt_casefold);
#else
printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n");
@ -2445,6 +2578,11 @@ got_sb:
if (ret)
goto err_put_super;
#ifdef CONFIG_UNICODE
sb->s_encoding = c->cf_encoding;
#endif
generic_set_sb_d_ops(sb);
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
bch_err_msg(c, ret, "mounting: error getting root inode");

View File

@ -243,6 +243,14 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k)
}
}
static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
/* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */
return bi->bi_casefold
? bi->bi_casefold - 1
: c->opts.casefold;
}
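
The bias stores an explicitly-set option as value + 1, leaving zero to mean "inherit the filesystem-wide option"; the fssetxattr hunk later in this commit stores bi_casefold = s->casefold + 1 to match. Decoded:

    /* bi_casefold == 0 -> unset: fall back to c->opts.casefold
     * bi_casefold == 1 -> casefolding explicitly off (0 + 1)
     * bi_casefold == 2 -> casefolding explicitly on  (1 + 1) */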
/* i_nlink: */
static inline unsigned nlink_bias(umode_t mode)

View File

@ -573,7 +573,6 @@ static void bch2_rbio_retry(struct work_struct *work)
.inum = rbio->read_pos.inode,
};
struct bch_io_failures failed = { .nr = 0 };
int orig_error = rbio->ret;
struct btree_trans *trans = bch2_trans_get(c);
@ -614,10 +613,11 @@ static void bch2_rbio_retry(struct work_struct *work)
if (ret) {
rbio->ret = ret;
rbio->bio.bi_status = BLK_STS_IOERR;
} else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace &&
orig_error != -BCH_ERR_data_read_ptr_stale_race &&
!failed.nr) {
}
if (failed.nr || ret) {
struct printbuf buf = PRINTBUF;
bch2_log_msg_start(c, &buf);
lockrestart_do(trans,
bch2_inum_offset_err_msg_trans(trans, &buf,
@ -625,9 +625,22 @@ static void bch2_rbio_retry(struct work_struct *work)
read_pos.offset << 9));
if (rbio->data_update)
prt_str(&buf, "(internal move) ");
prt_str(&buf, "successful retry");
bch_err_ratelimited(c, "%s", buf.buf);
prt_str(&buf, "data read error, ");
if (!ret)
prt_str(&buf, "successful retry");
else
prt_str(&buf, bch2_err_str(ret));
prt_newline(&buf);
if (!bkey_deleted(&sk.k->k)) {
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k));
prt_newline(&buf);
}
bch2_io_failures_to_text(&buf, c, &failed);
bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
@ -662,27 +675,6 @@ static void bch2_rbio_error(struct bch_read_bio *rbio,
}
}
static void bch2_read_io_err(struct work_struct *work)
{
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
struct bio *bio = &rbio->bio;
struct bch_fs *c = rbio->c;
struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
struct printbuf buf = PRINTBUF;
bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status));
if (ca)
bch_err_ratelimited(ca, "%s", buf.buf);
else
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
}
static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
struct bch_read_bio *rbio)
{
@ -746,31 +738,6 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
__bch2_rbio_narrow_crcs(trans, rbio));
}
static void bch2_read_csum_err(struct work_struct *work)
{
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
struct bch_fs *c = rbio->c;
struct bio *src = &rbio->bio;
struct bch_extent_crc_unpacked crc = rbio->pick.crc;
struct nonce nonce = extent_nonce(rbio->version, crc);
struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
struct printbuf buf = PRINTBUF;
bch2_read_err_msg(c, &buf, rbio, rbio->read_pos);
prt_str(&buf, "data ");
bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL;
if (ca)
bch_err_ratelimited(ca, "%s", buf.buf);
else
bch_err_ratelimited(c, "%s", buf.buf);
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
printbuf_exit(&buf);
}
static void bch2_read_decompress_err(struct work_struct *work)
{
struct bch_read_bio *rbio =
@ -931,7 +898,7 @@ out:
memalloc_nofs_restore(nofs_flags);
return;
csum_err:
bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR);
goto out;
decompression_err:
bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
@ -957,7 +924,7 @@ static void bch2_read_endio(struct bio *bio)
rbio->bio.bi_end_io = rbio->end_io;
if (unlikely(bio->bi_status)) {
bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq);
bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status);
return;
}
@ -1289,14 +1256,6 @@ retry_pick:
if (likely(!rbio->pick.do_ec_reconstruct)) {
if (unlikely(!rbio->have_ioref)) {
struct printbuf buf = PRINTBUF;
bch2_read_err_msg_trans(trans, &buf, rbio, read_pos);
prt_printf(&buf, "no device to read from:\n ");
bch2_bkey_val_to_text(&buf, c, k);
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);
bch2_rbio_error(rbio,
-BCH_ERR_data_read_retry_device_offline,
BLK_STS_IOERR);

View File

@ -214,18 +214,20 @@ void bch2_journal_space_available(struct journal *j)
j->can_discard = can_discard;
if (nr_online < metadata_replicas_required(c)) {
struct printbuf buf = PRINTBUF;
buf.atomic++;
prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
"rw journal devs:", nr_online, metadata_replicas_required(c));
if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) {
struct printbuf buf = PRINTBUF;
buf.atomic++;
prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n"
"rw journal devs:", nr_online, metadata_replicas_required(c));
rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
prt_printf(&buf, " %s", ca->name);
rcu_read_unlock();
rcu_read_lock();
for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
prt_printf(&buf, " %s", ca->name);
rcu_read_unlock();
bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
}
ret = -BCH_ERR_insufficient_journal_devices;
goto out;
}

View File

@ -675,7 +675,7 @@ root_err:
if (ret)
break;
if (bkey_ge(bkey_start_pos(k.k), end))
if (bkey_gt(bkey_start_pos(k.k), end))
break;
if (ctxt->stats)
@ -750,7 +750,8 @@ next:
if (ctxt->stats)
atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
next_nondata:
bch2_btree_iter_advance(trans, &iter);
if (!bch2_btree_iter_advance(trans, &iter))
break;
}
out:
bch2_trans_iter_exit(trans, &reflink_iter);

View File

@ -343,6 +343,9 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
bool ret = false;
for (id = 0; id < Inode_opt_nr; id++) {
if (!S_ISDIR(dst_u->bi_mode) && id == Inode_opt_casefold)
continue;
/* Skip attributes that were explicitly set on this inode */
if (dst_u->bi_fields_set & (1 << id))
continue;

View File

@ -33,7 +33,9 @@
#include <linux/sort.h>
#include <linux/stat.h>
int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
int bch2_btree_lost_data(struct bch_fs *c,
struct printbuf *msg,
enum btree_id btree)
{
u64 b = BIT_ULL(btree);
int ret = 0;
@ -42,32 +44,32 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
if (!(c->sb.btrees_lost_data & b)) {
struct printbuf buf = PRINTBUF;
bch2_btree_id_to_text(&buf, btree);
bch_err(c, "flagging btree %s lost data", buf.buf);
printbuf_exit(&buf);
prt_printf(msg, "flagging btree ");
bch2_btree_id_to_text(msg, btree);
prt_printf(msg, " lost data\n");
ext->btrees_lost_data |= cpu_to_le64(b);
}
/* Once we have runtime self healing for topology errors we won't need this: */
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_topology) ?: ret;
/* Btree node accounting will be off: */
__set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_allocations) ?: ret;
#ifdef CONFIG_BCACHEFS_DEBUG
/*
* These are much more minor, and don't need to be corrected right away,
* but in debug mode we want the next fsck run to be clean:
*/
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret;
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_lrus) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret;
#endif
switch (btree) {
case BTREE_ID_alloc:
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
@ -77,26 +79,30 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
goto out;
case BTREE_ID_backpointers:
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret;
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret;
goto out;
case BTREE_ID_need_discard:
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
goto out;
case BTREE_ID_freespace:
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
goto out;
case BTREE_ID_bucket_gens:
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
goto out;
case BTREE_ID_lru:
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_alloc_info) ?: ret;
goto out;
case BTREE_ID_accounting:
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_check_allocations) ?: ret;
goto out;
case BTREE_ID_snapshots:
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret;
goto out;
default:
ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret;
ret = __bch2_run_explicit_recovery_pass_persistent(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret;
goto out;
}
out:
@ -583,9 +589,6 @@ static int read_btree_roots(struct bch_fs *c)
buf.buf, bch2_err_str(ret))) {
if (btree_id_is_alloc(i))
r->error = 0;
ret = bch2_btree_lost_data(c, i);
BUG_ON(ret);
}
}
@ -734,6 +737,11 @@ int bch2_fs_recovery(struct bch_fs *c)
c->opts.read_only = true;
}
if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
bch_info(c, "filesystem is an unresized image file, mounting ro");
c->opts.read_only = true;
}
mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
bool write_sb = false;
@ -949,8 +957,6 @@ use_clean:
set_bit(BCH_FS_btree_running, &c->flags);
ret = bch2_sb_set_upgrade_extra(c);
ret = bch2_fs_resize_on_mount(c);
if (ret)
goto err;

View File

@ -2,7 +2,7 @@
#ifndef _BCACHEFS_RECOVERY_H
#define _BCACHEFS_RECOVERY_H
int bch2_btree_lost_data(struct bch_fs *, enum btree_id);
int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id);
void bch2_reconstruct_alloc(struct bch_fs *);
int bch2_journal_replay(struct bch_fs *);

View File

@ -141,13 +141,13 @@ static int __bch2_run_explicit_recovery_pass(struct printbuf *out,
if (pass < BCH_RECOVERY_PASS_set_may_go_rw &&
c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) {
if (print)
prt_printf(out, "need recovery pass %s (%u), but already rw",
prt_printf(out, "need recovery pass %s (%u), but already rw\n",
bch2_recovery_passes[pass], pass);
return -BCH_ERR_cannot_rewind_recovery;
}
if (print)
prt_printf(out, "running explicit recovery pass %s (%u), currently at %s (%u)",
prt_printf(out, "running explicit recovery pass %s (%u), currently at %s (%u)\n",
bch2_recovery_passes[pass], pass,
bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
@ -162,7 +162,7 @@ static int __bch2_run_explicit_recovery_pass(struct printbuf *out,
}
}
int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *c,
static int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *c,
struct printbuf *out,
enum bch_recovery_pass pass)
{
@ -193,32 +193,30 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c,
return ret;
}
int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c,
enum bch_recovery_pass pass)
int __bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
struct printbuf *out,
enum bch_recovery_pass pass)
{
lockdep_assert_held(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
__set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required);
return bch2_run_explicit_recovery_pass(c, pass);
return bch2_run_explicit_recovery_pass_printbuf(c, out, pass);
}
int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
struct printbuf *out,
enum bch_recovery_pass pass)
{
enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
if (c->sb.recovery_passes_required & BIT_ULL(pass))
return 0;
mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
if (!test_bit_le64(s, ext->recovery_passes_required)) {
__set_bit_le64(s, ext->recovery_passes_required);
bch2_write_super(c);
}
int ret = __bch2_run_explicit_recovery_pass_persistent(c, out, pass);
mutex_unlock(&c->sb_lock);
return bch2_run_explicit_recovery_pass(c, pass);
return ret;
}
static void bch2_clear_recovery_pass_required(struct bch_fs *c,

View File

@ -8,12 +8,12 @@ u64 bch2_recovery_passes_from_stable(u64 v);
u64 bch2_fsck_recovery_passes(void);
int bch2_run_explicit_recovery_pass_printbuf(struct bch_fs *,
struct printbuf *,
enum bch_recovery_pass);
int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass);
int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
int __bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf *,
enum bch_recovery_pass);
int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, struct printbuf *,
enum bch_recovery_pass);
int bch2_run_online_recovery_passes(struct bch_fs *);
int bch2_run_recovery_passes(struct bch_fs *);

View File

@ -20,6 +20,10 @@
* x(version, recovery_passes, errors...)
*/
#define UPGRADE_TABLE() \
x(snapshot_2, \
RECOVERY_PASS_ALL_FSCK, \
BCH_FSCK_ERR_subvol_root_wrong_bi_subvol, \
BCH_FSCK_ERR_subvol_not_master_and_not_snapshot) \
x(backpointers, \
RECOVERY_PASS_ALL_FSCK) \
x(inode_v3, \

View File

@ -46,7 +46,7 @@ enum bch_fsck_flags {
x(btree_node_unsupported_version, 34, 0) \
x(btree_node_bset_older_than_sb_min, 35, 0) \
x(btree_node_bset_newer_than_sb, 36, 0) \
x(btree_node_data_missing, 37, 0) \
x(btree_node_data_missing, 37, FSCK_AUTOFIX) \
x(btree_node_bset_after_end, 38, 0) \
x(btree_node_replicas_sectors_written_mismatch, 39, 0) \
x(btree_node_replicas_data_mismatch, 40, 0) \
@ -205,9 +205,9 @@ enum bch_fsck_flags {
x(snapshot_bad_depth, 184, 0) \
x(snapshot_bad_skiplist, 185, 0) \
x(subvol_pos_bad, 186, 0) \
x(subvol_not_master_and_not_snapshot, 187, 0) \
x(subvol_not_master_and_not_snapshot, 187, FSCK_AUTOFIX) \
x(subvol_to_missing_root, 188, 0) \
x(subvol_root_wrong_bi_subvol, 189, 0) \
x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \
x(bkey_in_missing_snapshot, 190, 0) \
x(inode_pos_inode_nonzero, 191, 0) \
x(inode_pos_blockdev_range, 192, 0) \

View File

@ -20,7 +20,7 @@ int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev)
bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf);
int ret = bch2_run_explicit_recovery_pass_printbuf(c, &buf,
int ret = bch2_run_explicit_recovery_pass_persistent(c, &buf,
BCH_RECOVERY_PASS_check_allocations);
if (print)
@ -35,9 +35,11 @@ void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev)
bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev);
}
void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket)
void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket)
{
bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset);
bch2_fs_inconsistent(ca->fs,
"pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)",
bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets);
}
#define x(t, n, ...) [n] = #t,
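The x(...) tables in the hunks above (UPGRADE_TABLE, the fsck-error list) and the `#define x(t, n, ...) [n] = #t,` expansion just above are the two halves of the x-macro pattern these files use: one list of entries, expanded once into enum constants and once into a name array. A self-contained sketch of the technique; the table contents here are invented for illustration:

#include <stdio.h>

#define ERR_TABLE()		\
	x(dir_loop,   4)	\
	x(bad_magic,  7)

enum err_code {
#define x(t, n) ERR_##t = n,
	ERR_TABLE()
#undef x
};

static const char * const err_names[] = {
#define x(t, n) [n] = #t,
	ERR_TABLE()
#undef x
};

int main(void)
{
	/* prints "7 -> bad_magic" */
	printf("%d -> %s\n", ERR_bad_magic, err_names[ERR_bad_magic]);
	return 0;
}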

View File

@ -258,20 +258,23 @@ static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev)
static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket)
{
struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode);
if (ca && !bucket_valid(ca, bucket.offset)) {
if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {
bch2_dev_put(ca);
ca = NULL;
}
return ca;
}
void bch2_dev_bucket_missing(struct bch_fs *, struct bpos);
void bch2_dev_bucket_missing(struct bch_dev *, u64);
static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket)
{
struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket);
if (!ca)
bch2_dev_bucket_missing(c, bucket);
struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode);
if (ca && unlikely(!bucket_valid(ca, bucket.offset))) {
bch2_dev_bucket_missing(ca, bucket.offset);
bch2_dev_put(ca);
ca = NULL;
}
return ca;
}
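bch2_dev_bucket_tryget() now open-codes the device lookup plus bucket-validity check so the error path can report which device and which valid range were involved. bucket_valid() itself isn't shown in this hunk; judging by the "valid range %u-%llu" message printed from ca->mi.first_bucket and ca->mi.nbuckets, the assumed check is membership in [first_bucket, nbuckets). A standalone sketch under that assumption, with simplified stand-in types:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct member_info { uint32_t first_bucket; uint64_t nbuckets; };

static bool bucket_valid(const struct member_info *mi, uint64_t b)
{
	return b >= mi->first_bucket && b < mi->nbuckets;
}

int main(void)
{
	struct member_info mi = { .first_bucket = 16, .nbuckets = 1024 };
	printf("%d %d\n", bucket_valid(&mi, 8), bucket_valid(&mi, 100)); /* 0 1 */
	return 0;
}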

View File

@ -1743,10 +1743,6 @@ int bch2_snapshots_read(struct bch_fs *c)
BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
test_bit(BCH_FS_may_go_rw, &c->flags));
if (bch2_err_matches(ret, EIO) ||
(c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
return ret;
}

View File

@ -33,7 +33,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
struct bch_hash_info {
u8 type;
struct unicode_map *cf_encoding;
/*
* For crc32 or crc64 string hashes the first key value of
* the siphash_key (k0) is used as the key.
@ -44,11 +44,10 @@ struct bch_hash_info {
static inline struct bch_hash_info
bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
/* XXX ick */
struct bch_hash_info info = {
.type = INODE_STR_HASH(bi),
#ifdef CONFIG_UNICODE
.cf_encoding = bi->bi_casefold ? c->cf_encoding : NULL,
.cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL,
#endif
.siphash_key = { .k0 = bi->bi_hash_seed }
};
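bch2_hash_info_init() now asks bch2_inode_casefold() whether to attach the casefold encoding instead of reading bi->bi_casefold directly. That helper isn't shown in this diff; a plausible reading is "per-inode setting when set, otherwise the filesystem-wide option". A hypothetical model (the +1 bias encoding "unset" is an assumption, not taken from this patch):

#include <stdbool.h>
#include <stdio.h>

struct fs_opts { bool casefold; };
struct inode_u { unsigned bi_casefold; };	/* 0 = unset, else option value + 1 (assumed) */

static bool inode_casefold(const struct fs_opts *opts, const struct inode_u *bi)
{
	return bi->bi_casefold ? bi->bi_casefold - 1 : opts->casefold;
}

int main(void)
{
	struct fs_opts opts = { .casefold = true };
	struct inode_u bi = { .bi_casefold = 0 };	/* unset: inherit fs option */
	printf("%d\n", inode_casefold(&opts, &bi));	/* 1 */
	return 0;
}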

View File

@ -23,7 +23,7 @@ static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid)
prt_printf(&buf, "missing subvolume %u", subvolid);
bool print = bch2_count_fsck_err(c, subvol_missing, &buf);
int ret = bch2_run_explicit_recovery_pass_printbuf(c, &buf,
int ret = bch2_run_explicit_recovery_pass_persistent(c, &buf,
BCH_RECOVERY_PASS_check_inodes);
if (print)
bch2_print_str(c, KERN_ERR, buf.buf);
@ -62,8 +62,8 @@ static int check_subvol(struct btree_trans *trans,
ret = bch2_snapshot_lookup(trans, snapid, &snapshot);
if (bch2_err_matches(ret, ENOENT))
bch_err(c, "subvolume %llu points to nonexistent snapshot %u",
k.k->p.offset, snapid);
return bch2_run_explicit_recovery_pass(c,
BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret;
if (ret)
return ret;

View File

@ -623,6 +623,9 @@ static void bch2_sb_update(struct bch_fs *c)
struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
if (ext) {
c->sb.recovery_passes_required =
bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);

View File

@ -214,6 +214,7 @@ static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
static int bch2_fs_init_rw(struct bch_fs *);
static int bch2_fs_resize_on_mount(struct bch_fs *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
@ -567,6 +568,10 @@ static void __bch2_fs_free(struct bch_fs *c)
for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_exit(&c->times[i]);
#ifdef CONFIG_UNICODE
utf8_unload(c->cf_encoding);
#endif
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
bch2_free_fsck_errs(c);
@ -898,25 +903,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts,
if (ret)
goto err;
#ifdef CONFIG_UNICODE
/* Default encoding until we can potentially have more as an option. */
c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
if (IS_ERR(c->cf_encoding)) {
printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
ret = -EINVAL;
goto err;
}
#else
if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
ret = -EINVAL;
goto err;
}
#endif
/* Compat: */
if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
!BCH_SB_JOURNAL_FLUSH_DELAY(sb))
@ -1002,6 +988,29 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts,
if (ret)
goto err;
#ifdef CONFIG_UNICODE
/* Default encoding until we can potentially have more as an option. */
c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING);
if (IS_ERR(c->cf_encoding)) {
printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u",
unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
ret = -EINVAL;
goto err;
}
bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u",
unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING),
unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING),
unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING));
#else
if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) {
printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n");
ret = -EINVAL;
goto err;
}
#endif
for (i = 0; i < c->sb.nr_devices; i++) {
if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
@ -1070,6 +1079,40 @@ static void print_mount_opts(struct bch_fs *c)
printbuf_exit(&p);
}
static bool bch2_fs_may_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned flags = 0;
switch (c->opts.degraded) {
case BCH_DEGRADED_very:
flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
break;
case BCH_DEGRADED_yes:
flags |= BCH_FORCE_IF_DEGRADED;
break;
default:
mutex_lock(&c->sb_lock);
for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) {
if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
ca = bch2_dev_locked(c, i);
if (!bch2_dev_is_online(ca) &&
(ca->mi.state == BCH_MEMBER_STATE_rw ||
ca->mi.state == BCH_MEMBER_STATE_ro)) {
mutex_unlock(&c->sb_lock);
return false;
}
}
mutex_unlock(&c->sb_lock);
break;
}
return bch2_have_enough_devs(c, c->online_devs, flags, true);
}
int bch2_fs_start(struct bch_fs *c)
{
time64_t now = ktime_get_real_seconds();
@ -1077,6 +1120,9 @@ int bch2_fs_start(struct bch_fs *c)
print_mount_opts(c);
if (!bch2_fs_may_start(c))
return -BCH_ERR_insufficient_devices_to_start;
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@ -1106,6 +1152,12 @@ int bch2_fs_start(struct bch_fs *c)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
ret = bch2_fs_resize_on_mount(c);
if (ret) {
up_write(&c->state_lock);
goto err;
}
rcu_read_lock();
for_each_online_member_rcu(c, ca)
if (ca->mi.state == BCH_MEMBER_STATE_rw)
@ -1593,40 +1645,6 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
}
}
static bool bch2_fs_may_start(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned flags = 0;
switch (c->opts.degraded) {
case BCH_DEGRADED_very:
flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST;
break;
case BCH_DEGRADED_yes:
flags |= BCH_FORCE_IF_DEGRADED;
break;
default:
mutex_lock(&c->sb_lock);
for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) {
if (!bch2_member_exists(c->disk_sb.sb, i))
continue;
ca = bch2_dev_locked(c, i);
if (!bch2_dev_is_online(ca) &&
(ca->mi.state == BCH_MEMBER_STATE_rw ||
ca->mi.state == BCH_MEMBER_STATE_ro)) {
mutex_unlock(&c->sb_lock);
return false;
}
}
mutex_unlock(&c->sb_lock);
break;
}
return bch2_have_enough_devs(c, c->online_devs, flags, true);
}
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
{
bch2_dev_io_ref_stop(ca, WRITE);
@ -2096,10 +2114,8 @@ err:
return ret;
}
int bch2_fs_resize_on_mount(struct bch_fs *c)
static int bch2_fs_resize_on_mount(struct bch_fs *c)
{
down_write(&c->state_lock);
for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
u64 old_nbuckets = ca->mi.nbuckets;
u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk),
@ -2138,9 +2154,6 @@ int bch2_fs_resize_on_mount(struct bch_fs *c)
}
}
}
bch2_recalc_capacity(c);
up_write(&c->state_lock);
return 0;
}
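bch2_fs_resize_on_mount() becomes static and stops taking state_lock itself, since bch2_fs_start() (earlier hunk) now holds the lock across the call. A minimal model of moving lock ownership to the caller, with pthread stand-ins for the kernel primitives and illustrative stub names:

#include <pthread.h>

static pthread_rwlock_t state_lock = PTHREAD_RWLOCK_INITIALIZER;

static int resize_on_mount(void)
{
	/* caller holds state_lock for write */
	return 0;
}

static int fs_start(void)
{
	pthread_rwlock_wrlock(&state_lock);
	int ret = resize_on_mount();
	pthread_rwlock_unlock(&state_lock);
	return ret;
}

int main(void)
{
	return fs_start();
}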
@ -2331,11 +2344,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
}
up_write(&c->state_lock);
if (!bch2_fs_may_start(c)) {
ret = -BCH_ERR_insufficient_devices_to_start;
goto err_print;
}
if (!c->opts.nostart) {
ret = bch2_fs_start(c);
if (ret)

View File

@ -29,7 +29,6 @@ int bch2_dev_add(struct bch_fs *, const char *);
int bch2_dev_online(struct bch_fs *, const char *);
int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
int bch2_fs_resize_on_mount(struct bch_fs *);
struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
bool bch2_fs_emergency_read_only(struct bch_fs *);

View File

@ -342,6 +342,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
*/
static int test_peek_end(struct bch_fs *c, u64 nr)
{
delete_test_keys(c);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;
@ -362,6 +364,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr)
static int test_peek_end_extents(struct bch_fs *c, u64 nr)
{
delete_test_keys(c);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter;
struct bkey_s_c k;

View File

@ -252,6 +252,16 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v)
bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
}
static bool string_is_spaces(const char *str)
{
while (*str) {
if (*str != ' ')
return false;
str++;
}
return true;
}
void bch2_print_string_as_lines(const char *prefix, const char *lines,
bool nonblocking)
{
@ -272,6 +282,9 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines,
while (*lines) {
p = strchrnul(lines, '\n');
if (!*p && string_is_spaces(lines))
break;
printk("%s%.*s\n", prefix, (int) (p - lines), lines);
if (!*p)
break;
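The new string_is_spaces() helper lets bch2_print_string_as_lines() drop a final line that is empty or all spaces instead of printing it. A standalone userspace model of the loop (strchrnul() is a GNU extension, hence _GNU_SOURCE):

#define _GNU_SOURCE	/* for strchrnul() */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool string_is_spaces(const char *str)
{
	while (*str) {
		if (*str != ' ')
			return false;
		str++;
	}
	return true;
}

static void print_lines(const char *prefix, const char *lines)
{
	while (*lines) {
		const char *p = strchrnul(lines, '\n');

		/* the new check: suppress a trailing whitespace-only line */
		if (!*p && string_is_spaces(lines))
			break;
		printf("%s%.*s\n", prefix, (int) (p - lines), lines);
		if (!*p)
			break;
		lines = p + 1;
	}
}

int main(void)
{
	/* prints "bcachefs: one" and "bcachefs: two"; the "   " tail is dropped */
	print_lines("bcachefs: ", "one\ntwo\n   ");
	return 0;
}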