Update bcachefs sources to 7307b739bbe5 bcachefs: Read retries after checksum errors are now REQ_FUA

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2025-04-19 00:11:55 -04:00
parent d681613129
commit 3b819fd0d7
16 changed files with 296 additions and 78 deletions

.bcachefs_revision

@@ -1 +1 @@
65456ba56b930afe6935d2dc2128ba12c1e5fa6f
7307b739bbe5f1d9415f5c1da070723b3fb5abbd

libbcachefs/bcachefs_format.h

@@ -366,6 +366,10 @@ static inline void bkey_init(struct bkey *k)
#define __BKEY_PADDED(key, pad) \
struct bkey_i key; __u64 key ## _pad[pad]
enum bch_bkey_type_flags {
BKEY_TYPE_strict_btree_checks = BIT(0),
};
/*
* - DELETED keys are used internally to mark keys that should be ignored but
* override keys in composition order. Their version number is ignored.
@@ -383,46 +387,46 @@ static inline void bkey_init(struct bkey *k)
*
* - WHITEOUT: for hash table btrees
*/
#define BCH_BKEY_TYPES() \
x(deleted, 0) \
x(whiteout, 1) \
x(error, 2) \
x(cookie, 3) \
x(hash_whiteout, 4) \
x(btree_ptr, 5) \
x(extent, 6) \
x(reservation, 7) \
x(inode, 8) \
x(inode_generation, 9) \
x(dirent, 10) \
x(xattr, 11) \
x(alloc, 12) \
x(quota, 13) \
x(stripe, 14) \
x(reflink_p, 15) \
x(reflink_v, 16) \
x(inline_data, 17) \
x(btree_ptr_v2, 18) \
x(indirect_inline_data, 19) \
x(alloc_v2, 20) \
x(subvolume, 21) \
x(snapshot, 22) \
x(inode_v2, 23) \
x(alloc_v3, 24) \
x(set, 25) \
x(lru, 26) \
x(alloc_v4, 27) \
x(backpointer, 28) \
x(inode_v3, 29) \
x(bucket_gens, 30) \
x(snapshot_tree, 31) \
x(logged_op_truncate, 32) \
x(logged_op_finsert, 33) \
x(accounting, 34) \
x(inode_alloc_cursor, 35)
#define BCH_BKEY_TYPES() \
x(deleted, 0, 0) \
x(whiteout, 1, 0) \
x(error, 2, 0) \
x(cookie, 3, 0) \
x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \
x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \
x(extent, 6, BKEY_TYPE_strict_btree_checks) \
x(reservation, 7, BKEY_TYPE_strict_btree_checks) \
x(inode, 8, BKEY_TYPE_strict_btree_checks) \
x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \
x(dirent, 10, BKEY_TYPE_strict_btree_checks) \
x(xattr, 11, BKEY_TYPE_strict_btree_checks) \
x(alloc, 12, BKEY_TYPE_strict_btree_checks) \
x(quota, 13, BKEY_TYPE_strict_btree_checks) \
x(stripe, 14, BKEY_TYPE_strict_btree_checks) \
x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \
x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \
x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \
x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \
x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \
x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \
x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \
x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \
x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \
x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \
x(set, 25, 0) \
x(lru, 26, BKEY_TYPE_strict_btree_checks) \
x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \
x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \
x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \
x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \
x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \
x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \
x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \
x(accounting, 34, BKEY_TYPE_strict_btree_checks) \
x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
#define x(name, nr, ...) KEY_TYPE_##name = nr,
BCH_BKEY_TYPES()
#undef x
KEY_TYPE_MAX,
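A minimal standalone sketch (illustrative, not bcachefs code) of the variadic X-macro pattern this change relies on: the type list gains a third "flags" column, and consumers that don't need it swallow the extra column with "...", so only the tables that want the flags have to change.

#include <stdio.h>

#define TYPE_LIST()                                     \
        x(deleted,  0,  0)                              \
        x(cookie,   3,  0)                              \
        x(extent,   6,  1 << 0) /* a "strict checks"-style flag */

enum demo_type {
#define x(name, nr, ...) DEMO_##name = nr,              /* flags column ignored */
        TYPE_LIST()
#undef x
};

static const unsigned demo_flags[] = {
#define x(name, nr, flags) [DEMO_##name] = flags,       /* flags column used */
        TYPE_LIST()
#undef x
};

int main(void)
{
        printf("extent flags: %u\n", demo_flags[DEMO_extent]); /* prints 1 */
        return 0;
}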

libbcachefs/bkey_methods.c

@@ -21,7 +21,7 @@
#include "xattr.h"
const char * const bch2_bkey_types[] = {
#define x(name, nr) #name,
#define x(name, nr, ...) #name,
BCH_BKEY_TYPES()
#undef x
NULL
@@ -115,7 +115,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_
})
const struct bkey_ops bch2_bkey_ops[] = {
#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
#define x(name, nr, ...) [KEY_TYPE_##name] = bch2_bkey_ops_##name,
BCH_BKEY_TYPES()
#undef x
};
@@ -155,6 +155,12 @@ static u64 bch2_key_types_allowed[] = {
#undef x
};
static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = {
#define x(name, nr, flags) [KEY_TYPE_##name] = flags,
BCH_BKEY_TYPES()
#undef x
};
const char *bch2_btree_node_type_str(enum btree_node_type type)
{
return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1);
@@ -177,8 +183,18 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k,
if (type >= BKEY_TYPE_NR)
return 0;
bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX &&
(type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) &&
enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX
? bch2_bkey_type_flags[k.k->type]
: 0;
bool strict_key_type_allowed =
(from.flags & BCH_VALIDATE_commit) ||
type == BKEY_TYPE_btree ||
(from.btree < BTREE_ID_NR &&
(bkey_flags & BKEY_TYPE_strict_btree_checks));
bkey_fsck_err_on(strict_key_type_allowed &&
k.k->type < KEY_TYPE_MAX &&
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)),
c, bkey_invalid_type_for_btree,
"invalid key type for btree %s (%s)",

libbcachefs/error.c

@@ -272,9 +272,6 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c,
{
struct fsck_err_state *s;
if (!test_bit(BCH_FS_fsck_running, &c->flags))
return NULL;
list_for_each_entry(s, &c->fsck_error_msgs, list)
if (s->id == id) {
/*
@@ -645,14 +642,14 @@ int __bch2_bkey_fsck_err(struct bch_fs *c,
return ret;
}
void bch2_flush_fsck_errs(struct bch_fs *c)
static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print)
{
struct fsck_err_state *s, *n;
mutex_lock(&c->fsck_error_msgs_lock);
list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) {
if (s->ratelimited && s->last_msg)
if (print && s->ratelimited && s->last_msg)
bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg);
list_del(&s->list);
@@ -663,6 +660,16 @@ void bch2_flush_fsck_errs(struct bch_fs *c)
mutex_unlock(&c->fsck_error_msgs_lock);
}
void bch2_flush_fsck_errs(struct bch_fs *c)
{
__bch2_flush_fsck_errs(c, true);
}
void bch2_free_fsck_errs(struct bch_fs *c)
{
__bch2_flush_fsck_errs(c, false);
}
int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out,
subvol_inum inum, u64 offset)
{
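Taken together with the fsck_err_get() hunk above (which drops the BCH_FS_fsck_running test, so error state can now accumulate outside of fsck) and the __bch2_fs_free() hunk below, this reads as a standard print/no-print refactor: one static helper with a behavior flag, two thin public wrappers. A compilable sketch of the pattern (editor's illustration, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct err_state {
        unsigned long   nr;
        const char      *last_msg;
};

static void __flush_errs(struct err_state *s, bool print)
{
        if (print && s->last_msg)
                printf("Saw %lu errors like:\n  %s\n", s->nr, s->last_msg);
        s->nr = 0;              /* state released either way */
        s->last_msg = NULL;
}

void flush_errs(struct err_state *s) { __flush_errs(s, true); }  /* normal path: report, then free */
void free_errs(struct err_state *s)  { __flush_errs(s, false); } /* teardown: free silently */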

libbcachefs/error.h

@@ -91,6 +91,7 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *,
_flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__)
void bch2_flush_fsck_errs(struct bch_fs *);
void bch2_free_fsck_errs(struct bch_fs *);
#define fsck_err_wrap(_do) \
({ \

libbcachefs/fsck.c

@@ -321,6 +321,31 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
return false;
/*
* Subvolume roots are special: older versions of subvolume roots may be
* disconnected, it's only the newest version that matters.
*
* We only keep a single dirent pointing to a subvolume root, i.e.
* older versions of snapshots will not have a different dirent pointing
* to the same subvolume root.
*
* This is because dirents that point to subvolumes are only visible in
* the parent subvolume - versioning is not needed - and keeping them
* around would break fsck, because when we're crossing subvolumes we
don't have a consistent snapshot ID to check the inode <-> dirent
* relationships.
*
* Thus, a subvolume root that's been renamed after a snapshot will have
* a disconnected older version - that's expected.
*
* Note that taking a snapshot always updates the root inode (to update
* the dirent backpointer), so a subvolume root inode with
* BCH_INODE_has_child_snapshot is never visible.
*/
if (inode->bi_subvol &&
(inode->bi_flags & BCH_INODE_has_child_snapshot))
return false;
return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
}
@@ -1007,6 +1032,23 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
if ((ret || dirent_points_to_inode_nowarn(d, inode)) &&
inode->bi_subvol &&
(inode->bi_flags & BCH_INODE_has_child_snapshot)) {
/* Older version of a renamed subvolume root: we won't have a
* correct dirent for it. That's expected, see
* inode_should_reattach().
*
* We don't clear the backpointer field when doing the rename
* because there might be arbitrarily many versions in older
* snapshots.
*/
inode->bi_dir = 0;
inode->bi_dir_offset = 0;
*write_inode = true;
goto out;
}
if (fsck_err_on(ret,
trans, inode_points_to_missing_dirent,
"inode points to missing dirent\n%s",
@@ -1027,7 +1069,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
inode->bi_dir_offset = 0;
*write_inode = true;
}
out:
ret = 0;
fsck_err:
bch2_trans_iter_exit(trans, &dirent_iter);

libbcachefs/journal.c

@@ -281,7 +281,24 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
BUG_ON(sectors > buf->sectors);
if (unlikely(sectors > buf->sectors)) {
struct printbuf err = PRINTBUF;
err.atomic++;
prt_printf(&err, "journal entry overran reserved space: %u > %u\n",
sectors, buf->sectors);
prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n",
le32_to_cpu(buf->data->u64s), buf->u64s_reserved,
j->cur_entry_u64s,
c->block_bits);
prt_printf(&err, "fatal error - emergency read only");
bch2_journal_halt_locked(j);
bch_err(c, "%s", err.buf);
printbuf_exit(&err);
return;
}
buf->sectors = sectors;
/*
@@ -1467,7 +1484,6 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
j->last_empty_seq = cur_seq - 1; /* to match j->seq */
spin_lock(&j->lock);
set_bit(JOURNAL_running, &j->flags);
j->last_flush_write = jiffies;
j->reservations.idx = journal_cur_seq(j);
@@ -1478,6 +1494,21 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
return 0;
}
void bch2_journal_set_replay_done(struct journal *j)
{
/*
* journal_space_available must happen before setting JOURNAL_running
* JOURNAL_running must happen before JOURNAL_replay_done
*/
spin_lock(&j->lock);
bch2_journal_space_available(j);
set_bit(JOURNAL_need_flush_write, &j->flags);
set_bit(JOURNAL_running, &j->flags);
set_bit(JOURNAL_replay_done, &j->flags);
spin_unlock(&j->lock);
}
/* init/exit: */
void bch2_dev_journal_exit(struct bch_dev *ca)
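A worked example (editor's sketch with illustrative numbers, not kernel code) of the size check that replaced the BUG_ON: the closed entry's size, computed in whole blocks and converted to sectors via block_bits, must not exceed the sectors reserved when the entry was opened; on overrun the journal is now halted and the filesystem goes emergency read-only instead of crashing.

#include <stdio.h>

int main(void)
{
        unsigned block_bits = 3;                        /* 4096-byte blocks = 8 x 512-byte sectors */
        unsigned blocks     = 3;                        /* blocks the closed entry occupies */
        unsigned sectors    = blocks << block_bits;     /* 24 sectors */
        unsigned reserved   = 16;                       /* buf->sectors reserved at open time */

        if (sectors > reserved)
                printf("journal entry overran reserved space: %u > %u\n",
                       sectors, reserved);
        return 0;
}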

libbcachefs/journal.h

@@ -437,12 +437,6 @@ static inline int bch2_journal_error(struct journal *j)
struct bch_dev;
static inline void bch2_journal_set_replay_done(struct journal *j)
{
BUG_ON(!test_bit(JOURNAL_running, &j->flags));
set_bit(JOURNAL_replay_done, &j->flags);
}
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *);
@@ -459,6 +453,7 @@ void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
int bch2_fs_journal_start(struct journal *, u64);
void bch2_journal_set_replay_done(struct journal *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);

libbcachefs/journal_reclaim.c

@@ -252,7 +252,10 @@ void bch2_journal_space_available(struct journal *j)
bch2_journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_sectors = !ret
? round_down(j->space[journal_space_discarded].next_entry,
block_sectors(c))
: 0;
j->cur_entry_error = ret;
if (!ret)
@@ -624,8 +627,6 @@ static u64 journal_seq_to_flush(struct journal *j)
/* Try to keep the journal at most half full: */
nr_buckets = ja->nr / 2;
nr_buckets = min(nr_buckets, ja->nr);
bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr;
seq_to_flush = max(seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
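The rounding added to bch2_journal_space_available() pairs with the overrun check above: journal entries are written out in whole blocks, so advertising a non-block-multiple of sectors as available could let an entry grow past its reservation. A small sketch with illustrative numbers (editor's example, not kernel code):

#include <stdio.h>

static unsigned round_down_to(unsigned n, unsigned d)
{
        return n - (n % d);     /* what round_down(n, d) yields for these inputs */
}

int main(void)
{
        unsigned block_sectors = 8;     /* 4096-byte blocks on 512-byte sectors */
        unsigned next_entry    = 21;    /* raw sectors available in the journal */

        /* cur_entry_sectors is now 16, not 21 */
        printf("cur_entry_sectors = %u\n", round_down_to(next_entry, block_sectors));
        return 0;
}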

libbcachefs/rebalance.c

@@ -705,3 +705,119 @@ void bch2_fs_rebalance_init(struct bch_fs *c)
{
bch2_pd_controller_init(&c->rebalance.pd);
}
static int check_rebalance_work_one(struct btree_trans *trans,
struct btree_iter *extent_iter,
struct btree_iter *rebalance_iter,
struct bkey_buf *last_flushed)
{
struct bch_fs *c = trans->c;
struct bkey_s_c extent_k, rebalance_k;
struct printbuf buf = PRINTBUF;
int ret = bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?:
bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter));
if (ret)
return ret;
if (!extent_k.k &&
extent_iter->btree_id == BTREE_ID_reflink &&
(!rebalance_k.k ||
rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) {
bch2_trans_iter_exit(trans, extent_iter);
bch2_trans_iter_init(trans, extent_iter,
BTREE_ID_extents, POS_MIN,
BTREE_ITER_prefetch|
BTREE_ITER_all_snapshots);
return -BCH_ERR_transaction_restart_nested;
}
if (!extent_k.k && !rebalance_k.k)
return 1;
int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX,
rebalance_k.k ? rebalance_k.k->p : SPOS_MAX);
struct bkey deleted;
bkey_init(&deleted);
if (cmp < 0) {
deleted.p = extent_k.k->p;
rebalance_k.k = &deleted;
} else if (cmp > 0) {
deleted.p = rebalance_k.k->p;
extent_k.k = &deleted;
}
bool should_have_rebalance =
bch2_bkey_sectors_need_rebalance(c, extent_k) != 0;
bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set;
if (should_have_rebalance != have_rebalance) {
ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed);
if (ret)
return ret;
bch2_bkey_val_to_text(&buf, c, extent_k);
}
if (fsck_err_on(!should_have_rebalance && have_rebalance,
trans, rebalance_work_incorrectly_set,
"rebalance work incorrectly set\n%s", buf.buf)) {
ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
extent_k.k->p, false);
if (ret)
goto err;
}
if (fsck_err_on(should_have_rebalance && !have_rebalance,
trans, rebalance_work_incorrectly_unset,
"rebalance work incorrectly unset\n%s", buf.buf)) {
ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
extent_k.k->p, true);
if (ret)
goto err;
}
if (cmp <= 0)
bch2_btree_iter_advance(trans, extent_iter);
if (cmp >= 0)
bch2_btree_iter_advance(trans, rebalance_iter);
err:
fsck_err:
printbuf_exit(&buf);
return ret;
}
int bch2_check_rebalance_work(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter rebalance_iter, extent_iter;
int ret = 0;
bch2_trans_iter_init(trans, &extent_iter,
BTREE_ID_reflink, POS_MIN,
BTREE_ITER_prefetch);
bch2_trans_iter_init(trans, &rebalance_iter,
BTREE_ID_rebalance_work, POS_MIN,
BTREE_ITER_prefetch);
struct bkey_buf last_flushed;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
while (!ret) {
bch2_trans_begin(trans);
ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
ret = 0;
}
bch2_bkey_buf_exit(&last_flushed, c);
bch2_trans_iter_exit(trans, &extent_iter);
bch2_trans_iter_exit(trans, &rebalance_iter);
bch2_trans_put(trans);
return ret < 0 ? ret : 0;
}
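check_rebalance_work_one() is a merge join: it walks the extents and rebalance_work btrees in lockstep, synthesizes a deleted key for whichever side is behind, compares "should have rebalance work" against "has rebalance work", and advances the side(s) that matched. A minimal standalone sketch of that pattern (editor's illustration, not bcachefs code):

#include <limits.h>
#include <stdio.h>

static void merge_join(const int *a, int na, const int *b, int nb)
{
        int i = 0, j = 0;

        while (i < na || j < nb) {
                int ka = i < na ? a[i] : INT_MAX;       /* SPOS_MAX analogue */
                int kb = j < nb ? b[j] : INT_MAX;
                int cmp = (ka > kb) - (ka < kb);

                if (cmp < 0)
                        printf("%d: extent without rebalance entry\n", ka);
                else if (cmp > 0)
                        printf("%d: rebalance entry without extent\n", kb);
                else
                        printf("%d: both present\n", ka);

                if (cmp <= 0)
                        i++;
                if (cmp >= 0)
                        j++;
        }
}

int main(void)
{
        int extents[]   = { 1, 3, 5 };
        int rebalance[] = { 3, 4 };

        merge_join(extents, 3, rebalance, 2);
        return 0;
}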

libbcachefs/rebalance.h

@@ -54,4 +54,6 @@ void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);
void bch2_fs_rebalance_init(struct bch_fs *);
int bch2_check_rebalance_work(struct bch_fs *);
#endif /* _BCACHEFS_REBALANCE_H */

libbcachefs/recovery.c

@@ -1149,13 +1149,13 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
set_bit(BCH_FS_accounting_replay_done, &c->flags);
bch2_journal_set_replay_done(&c->journal);
ret = bch2_fs_read_write_early(c);
if (ret)
goto err;
set_bit(BCH_FS_accounting_replay_done, &c->flags);
bch2_journal_set_replay_done(&c->journal);
for_each_member_device(c, ca) {
ret = bch2_dev_usage_init(ca, false);
if (ret) {

libbcachefs/recovery_passes_types.h

@@ -59,6 +59,7 @@
x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \
x(resume_logged_ops, 23, PASS_ALWAYS) \
x(delete_dead_inodes, 32, PASS_ALWAYS) \
x(fix_reflink_p, 33, 0) \

libbcachefs/sb-errors_format.h

@@ -318,7 +318,9 @@ enum bch_fsck_flags {
x(directory_size_mismatch, 303, FSCK_AUTOFIX) \
x(dirent_cf_name_too_big, 304, 0) \
x(dirent_stray_data_after_cf_name, 305, 0) \
x(MAX, 309, 0)
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
x(MAX, 311, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,

libbcachefs/snapshot.c

@@ -406,7 +406,7 @@ u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
u32 subvol = 0, s;
rcu_read_lock();
while (id) {
while (id && bch2_snapshot_exists(c, id)) {
s = snapshot_t(c, id)->subvol;
if (s && (!subvol || s < subvol))

libbcachefs/super.c

@@ -492,29 +492,28 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
clear_bit(BCH_FS_clean_shutdown, &c->flags);
/*
* First journal write must be a flush write: after a clean shutdown we
* don't read the journal, so the first journal write may end up
* overwriting whatever was there previously, and there must always be
* at least one non-flush write in the journal or recovery will fail:
*/
set_bit(JOURNAL_need_flush_write, &c->journal.flags);
set_bit(JOURNAL_running, &c->journal.flags);
__for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) {
bch2_dev_allocator_add(c, ca);
percpu_ref_reinit(&ca->io_ref[WRITE]);
}
bch2_recalc_capacity(c);
/*
* First journal write must be a flush write: after a clean shutdown we
* don't read the journal, so the first journal write may end up
* overwriting whatever was there previously, and there must always be
* at least one non-flush write in the journal or recovery will fail:
*/
spin_lock(&c->journal.lock);
set_bit(JOURNAL_need_flush_write, &c->journal.flags);
set_bit(JOURNAL_running, &c->journal.flags);
bch2_journal_space_available(&c->journal);
spin_unlock(&c->journal.lock);
ret = bch2_fs_mark_dirty(c);
if (ret)
goto err;
spin_lock(&c->journal.lock);
bch2_journal_space_available(&c->journal);
spin_unlock(&c->journal.lock);
ret = bch2_journal_reclaim_start(&c->journal);
if (ret)
goto err;
@@ -582,6 +581,7 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_find_btree_nodes_exit(&c->found_btree_nodes);
bch2_free_pending_node_rewrites(c);
bch2_free_fsck_errs(c);
bch2_fs_accounting_exit(c);
bch2_fs_sb_errors_exit(c);
bch2_fs_counters_exit(c);