Update bcachefs sources to efd3df255ba5 bcachefs: Btree node reads no longer kick off rewrites for degraded nodes

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Author: Kent Overstreet <kent.overstreet@linux.dev>
Date:   2025-11-24 03:19:07 -05:00
Parent: aabb371b11
Commit: e84b0fbfa1
9 changed files with 145 additions and 95 deletions

View File

@@ -1 +1 @@
fba763d22acfb4feaacc45e88803f8b90c9740aa
efd3df255ba56d795750510e79d8d79f7812a029

View File

@@ -350,12 +350,14 @@ do { \
#define bch_verbose(c, ...) bch_log(c, KERN_DEBUG, __VA_ARGS__)
#define bch_verbose_ratelimited(c, ...) bch_log_ratelimited(c, KERN_DEBUG, __VA_ARGS__)
#define bch_info_dev(ca, fmt, ...) \
bch2_print(c, KERN_INFO bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_notice_dev(ca, fmt, ...) \
bch2_print(c, KERN_NOTICE bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_err_dev(ca, fmt, ...) \
bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_dev_log(ca, loglevel, fmt, ...) \
bch2_print(ca->fs, loglevel bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_err_dev(ca, ...) bch_dev_log(ca, KERN_ERR, __VA_ARGS__)
#define bch_notice_dev(ca, ...) bch_dev_log(ca, KERN_NOTICE, __VA_ARGS__)
#define bch_info_dev(ca, ...) bch_dev_log(ca, KERN_INFO, __VA_ARGS__)
#define bch_verbose_dev(ca, ...) bch_dev_log(ca, KERN_DEBUG, __VA_ARGS__)
#define bch_err_dev_offset(ca, _offset, fmt, ...) \
bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
#define bch_err_inum(c, _inum, fmt, ...) \
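
As an aside, the consolidation pattern here is easy to see in a standalone sketch: one base macro owns the device prefix and loglevel, and each per-level wrapper just forwards its arguments through it. Plain printf and hypothetical names, not the bcachefs macros:

#include <stdio.h>

struct dev { const char *name; };

/* one base macro owns the formatting; wrappers only supply the level */
#define dev_log(dev, level, fmt, ...) \
    printf("%s %s: " fmt "\n", level, (dev)->name, ##__VA_ARGS__)

#define dev_err(dev, ...)    dev_log(dev, "ERR",    __VA_ARGS__)
#define dev_notice(dev, ...) dev_log(dev, "NOTICE", __VA_ARGS__)
#define dev_info(dev, ...)   dev_log(dev, "INFO",   __VA_ARGS__)

int main(void)
{
    struct dev d = { "sda" };
    dev_err(&d, "read error %d", -5);    /* prints: ERR sda: read error -5 */
    return 0;
}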

View File

@@ -623,6 +623,20 @@ fsck_err:
return ret;
}
static bool btree_node_degraded(struct bch_fs *c, struct btree *b)
{
guard(rcu)();
bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
if (ptr->dev == BCH_SB_MEMBER_INVALID)
continue;
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
if (!ca || ca->mi.state != BCH_MEMBER_STATE_rw)
return true;
}
return false;
}
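
guard(rcu)() above is the kernel's scope-based cleanup guard: the RCU read lock is dropped automatically on every exit path, including the early return true. A minimal standalone sketch of the mechanism, using a hypothetical lock (the cleanup attribute is a GCC/Clang extension):

#include <stdio.h>

static void lock(void)          { printf("lock\n"); }
static void unlock(int *unused) { (void) unused; printf("unlock\n"); }

/* declaring the guard takes the lock; the cleanup handler releases it
 * when the enclosing scope exits, on any path */
#define guard_lock() \
    __attribute__((cleanup(unlock))) int _guard = (lock(), 0)

int main(void)
{
    {
        guard_lock();
        printf("critical section\n");
    }    /* unlock runs here automatically */
    return 0;
}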
int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
struct btree *b,
struct bch_io_failures *failed,
@@ -912,43 +926,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
if (updated_range)
bch2_btree_node_drop_keys_outside_node(b);
/*
* XXX:
*
* We deadlock if too many btree updates require node rewrites while
* we're still in journal replay.
*
* This is because btree node rewrites generate more updates for the
* interior updates (alloc, backpointers), and if those updates touch
* new nodes and generate more rewrites - well, you see the problem.
*
* The biggest cause is that we don't use the btree write buffer for
* the backpointer updates - this needs some real thought on locking in
* order to fix.
*
* The problem with this workaround (not doing the rewrite for degraded
* nodes in journal replay) is that those degraded nodes persist, and we
* don't want that (this is a real bug when a btree node write completes
* with fewer replicas than we wanted and leaves a degraded node due to
* device _removal_, i.e. the device went away mid write).
*
* It's less of a bug here, but still a problem because we don't yet
* have a way of tracking degraded data - we need another index (all
* extents/btree nodes, by replicas entry) in order to fix this properly
* (re-replicate degraded data at the earliest possible time).
*/
if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
scoped_guard(rcu)
bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
set_btree_node_need_rewrite(b);
set_btree_node_need_rewrite_degraded(b);
}
}
}
if (!ptr_written) {
set_btree_node_need_rewrite(b);
set_btree_node_need_rewrite_ptr_written_zero(b);
@@ -1052,6 +1029,16 @@ start:
if (ret || failed.nr)
bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
/*
* Do this late; unlike other btree_node_need_rewrite() cases, if a node
* is merely degraded we should rewrite it before we update it, but we
* don't need to kick off an async rewrite now:
*/
if (btree_node_degraded(c, b)) {
set_btree_node_need_rewrite(b);
set_btree_node_need_rewrite_degraded(b);
}
async_object_list_del(c, btree_read_bio, rb->list_idx);
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
rb->start_time);

View File

@@ -210,8 +210,7 @@ int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bk
static inline int bch2_btree_write_buffer_insert_checks(struct bch_fs *c, enum btree_id btree,
struct bkey_i *k)
{
if (unlikely(!btree_type_uses_write_buffer(btree) ||
k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX))
if (unlikely(!btree_type_uses_write_buffer(btree)))
try(bch2_btree_write_buffer_insert_err(c, btree, k));
return 0;

View File

@@ -57,12 +57,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke
#endif
}
static int wb_key_seq_cmp(const void *_l, const void *_r)
static int wb_key_seq_cmp(const void *_l, const void *_r, const void *priv)
{
const struct btree_write_buffered_key *l = _l;
const struct btree_write_buffered_key *r = _r;
const struct btree_write_buffer_keys *keys = priv;
const struct wb_key_ref *l = _l;
const struct wb_key_ref *r = _r;
return cmp_int(l->journal_seq, r->journal_seq);
return cmp_int(wb_keys_idx(keys, l->idx)->journal_seq,
wb_keys_idx(keys, r->idx)->journal_seq);
}
/* Compare excluding idx, the low 24 bits: */
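
The switch from sort_nonatomic() to sort_r_nonatomic() further down is what the comparator's third argument is for: the sort now orders small wb_key_ref index entries, so the comparator needs the backing keys array, passed as priv, to look up each key's journal_seq. A standalone sketch of the same indirect sort, with glibc's qsort_r standing in for the kernel's sort_r_nonatomic():

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

struct rec { unsigned long seq; };

/* resolve index entries through the backing array passed as priv */
static int idx_seq_cmp(const void *_l, const void *_r, void *priv)
{
    const struct rec *recs = priv;
    const unsigned *l = _l, *r = _r;

    return (recs[*l].seq > recs[*r].seq) - (recs[*l].seq < recs[*r].seq);
}

int main(void)
{
    struct rec recs[] = { { 30 }, { 10 }, { 20 } };
    unsigned idx[]    = { 0, 1, 2 };

    qsort_r(idx, 3, sizeof(idx[0]), idx_seq_cmp, recs);

    for (int i = 0; i < 3; i++)
        printf("%u ", idx[i]);    /* prints: 1 2 0 */
    printf("\n");
    return 0;
}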
@@ -247,7 +249,7 @@ static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
if (!wb->inc.keys.nr)
return;
bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
bch2_journal_pin_add(j, wb_keys_start(&wb->inc)->journal_seq, &wb->flushing.pin,
bch2_btree_write_buffer_journal_flush);
darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
@@ -276,7 +278,7 @@ out:
if (!wb->inc.keys.nr)
bch2_journal_pin_drop(j, &wb->inc.pin);
else
bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
bch2_journal_pin_update(j, wb_keys_start(&wb->inc)->journal_seq, &wb->inc.pin,
bch2_btree_write_buffer_journal_flush);
if (j->watermark) {
@@ -326,12 +328,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
u64 start_time = local_clock();
u64 nr_flushing = wb->flushing.keys.nr;
for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
wb->sorted.data[i].idx = i;
wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
wb->sorted.nr = 0;
wb_keys_for_each(&wb->flushing, k) {
struct wb_key_ref *dst = &darray_top(wb->sorted);
wb->sorted.nr++;
dst->idx = (u64 *) k - wb->flushing.keys.data;
dst->btree = k->btree;
memcpy(&dst->pos, &k->k.k.p, sizeof(struct bpos));
}
wb->sorted.nr = wb->flushing.keys.nr;
/*
* We first sort so that we can detect and skip redundant updates, and
@@ -350,7 +355,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
wb_sort(wb->sorted.data, wb->sorted.nr);
darray_for_each(wb->sorted, i) {
struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
struct btree_write_buffered_key *k = wb_keys_idx(&wb->flushing, i->idx);
ret = bch2_btree_write_buffer_insert_checks(c, k->btree, &k->k);
if (unlikely(ret))
@@ -369,7 +374,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
if (i + 1 < &darray_top(wb->sorted) &&
wb_key_eq(i, i + 1)) {
struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
struct btree_write_buffered_key *n = wb_keys_idx(&wb->flushing, i[1].idx);
if (k->k.k.type == KEY_TYPE_accounting &&
n->k.k.type == KEY_TYPE_accounting)
@@ -439,23 +444,25 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
*/
trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
sort_nonatomic(wb->flushing.keys.data,
wb->flushing.keys.nr,
sizeof(wb->flushing.keys.data[0]),
wb_key_seq_cmp, NULL);
sort_r_nonatomic(wb->sorted.data,
wb->sorted.nr,
sizeof(wb->sorted.data[0]),
wb_key_seq_cmp, NULL,
&wb->flushing);
darray_for_each(wb->flushing.keys, i) {
if (!i->journal_seq)
darray_for_each(wb->sorted, i) {
struct btree_write_buffered_key *k = wb_keys_idx(&wb->flushing, i->idx);
if (!k->journal_seq)
continue;
if (!accounting_replay_done &&
i->k.k.type == KEY_TYPE_accounting) {
k->k.k.type == KEY_TYPE_accounting) {
could_not_insert++;
continue;
}
if (!could_not_insert)
bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
bch2_journal_pin_update(j, k->journal_seq, &wb->flushing.pin,
bch2_btree_write_buffer_journal_flush);
bch2_trans_begin(trans);
@@ -466,11 +473,11 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_journal_res,
btree_write_buffered_insert(trans, i));
btree_write_buffered_insert(trans, k));
if (ret)
goto err;
i->journal_seq = 0;
k->journal_seq = 0;
}
/*
@@ -492,12 +499,14 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
* distinct counters touched somehow was very large.
*/
if (could_not_insert) {
struct btree_write_buffered_key *dst = wb->flushing.keys.data;
struct btree_write_buffered_key *dst = wb_keys_start(&wb->flushing);
darray_for_each(wb->flushing.keys, i)
if (i->journal_seq)
*dst++ = *i;
wb->flushing.keys.nr = dst - wb->flushing.keys.data;
wb_keys_for_each_safe(&wb->flushing, i)
if (i->journal_seq) {
memmove_u64s_down(dst, i, wb_key_u64s(&i->k));
dst = wb_key_next(dst);
}
wb->flushing.keys.nr = (u64 *) dst - wb->flushing.keys.data;
}
}
err:
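
The compaction loop above walks variable-size keys with a saved next pointer and slides survivors down with memmove_u64s_down(); the helpers it relies on (wb_key_u64s(), wb_key_next(), wb_keys_for_each_safe()) are introduced in the write buffer header later in this diff. As a standalone illustration of the packed layout, here is a minimal sketch of variable-length records in a flat u64 buffer; the next pointer is computed before the memmove because the move may overwrite the current record. Types and names are illustrative, not bcachefs code:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* a record is two header words plus a variable-length u64 payload */
struct rec {
    uint64_t seq;      /* e.g. journal sequence; 0 = already inserted */
    uint64_t nr_u64s;  /* payload size, in u64s */
    uint64_t payload[];
};

static struct rec *rec_next(struct rec *r)
{
    return (struct rec *) ((uint64_t *) r + 2 + r->nr_u64s);
}

/* drop records with seq == 0, sliding survivors down; returns new end */
static uint64_t *compact(uint64_t *start, uint64_t *end)
{
    struct rec *dst = (struct rec *) start;

    for (struct rec *r = (struct rec *) start, *next;
         (uint64_t *) r != end;
         r = next) {
        next = rec_next(r);    /* saved before the move below */
        if (r->seq) {
            memmove(dst, r,
                    ((uint64_t *) next - (uint64_t *) r) * sizeof(uint64_t));
            dst = rec_next(dst);
        }
    }
    return (uint64_t *) dst;
}

int main(void)
{
    uint64_t buf[32], *end = buf;
    uint64_t seqs[] = { 10, 0, 30 }, sizes[] = { 1, 2, 1 };

    /* append three records of differing sizes; the middle one is dead */
    for (int i = 0; i < 3; i++) {
        struct rec *r = (struct rec *) end;
        r->seq     = seqs[i];
        r->nr_u64s = sizes[i];
        memset(r->payload, 0, sizes[i] * sizeof(uint64_t));
        end = (uint64_t *) rec_next(r);
    }

    end = compact(buf, end);

    for (struct rec *r = (struct rec *) buf;
         (uint64_t *) r != end;
         r = rec_next(r))
        printf("seq %llu, %llu payload u64s\n",
               (unsigned long long) r->seq,
               (unsigned long long) r->nr_u64s);
    /* prints: seq 10, 1 payload u64s / seq 30, 1 payload u64s */
    return 0;
}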
@@ -745,9 +754,10 @@ int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
enum btree_id btree, struct bkey_i *k)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
unsigned u64s = wb_key_u64s(k);
int ret;
retry:
ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
ret = darray_make_room_gfp(&dst->wb->keys, u64s, GFP_KERNEL);
if (!ret && dst->wb == &wb->flushing)
ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
@@ -766,15 +776,10 @@ retry:
dst->room = darray_room(dst->wb->keys);
if (dst->wb == &wb->flushing)
dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
BUG_ON(!dst->room);
BUG_ON(dst->room < u64s);
BUG_ON(!dst->seq);
struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
wb_k->journal_seq = dst->seq;
wb_k->btree = btree;
bkey_copy(&wb_k->k, k);
dst->wb->keys.nr++;
dst->room--;
bch2_journal_key_to_wb_reserved(c, dst, btree, k);
return 0;
}

View File

@@ -93,19 +93,63 @@ int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
struct journal_keys_to_wb *,
enum btree_id, struct bkey_i *);
static inline unsigned wb_key_u64s(const struct bkey_i *k)
{
return k->k.u64s + offsetof(struct btree_write_buffered_key, k) / sizeof(u64);
}
static inline struct btree_write_buffered_key *wb_keys_start(const struct btree_write_buffer_keys *keys)
{
return (struct btree_write_buffered_key *) &darray_first(keys->keys);
}
static inline struct btree_write_buffered_key *wb_keys_end(const struct btree_write_buffer_keys *keys)
{
return (struct btree_write_buffered_key *) &darray_top(keys->keys);
}
static inline struct btree_write_buffered_key *wb_keys_idx(const struct btree_write_buffer_keys *keys,
unsigned idx)
{
return (struct btree_write_buffered_key *) &keys->keys.data[idx];
}
static inline struct btree_write_buffered_key *wb_key_next(const struct btree_write_buffered_key *k)
{
return (struct btree_write_buffered_key *) ((u64 *) k + wb_key_u64s(&k->k));
}
#define wb_keys_for_each(_keys, _k) \
for (struct btree_write_buffered_key *_k = wb_keys_start(_keys); \
_k != wb_keys_end(_keys); \
_k = wb_key_next(_k))
#define wb_keys_for_each_safe(_keys, _k) \
for (struct btree_write_buffered_key *_next, *_k = wb_keys_start(_keys); \
_k != wb_keys_end(_keys) && (_next = wb_key_next(_k), true); \
_k = _next)
static inline void bch2_journal_key_to_wb_reserved(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
{
unsigned u64s = wb_key_u64s(k);
struct btree_write_buffered_key *wb_k = wb_keys_end(dst->wb);
wb_k->journal_seq = dst->seq;
wb_k->btree = btree;
bkey_copy(&wb_k->k, k);
dst->wb->keys.nr += u64s;
dst->room -= u64s;
}
static inline int __bch2_journal_key_to_wb(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
{
if (unlikely(!dst->room))
if (unlikely(dst->room < wb_key_u64s(k)))
return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
wb_k->journal_seq = dst->seq;
wb_k->btree = btree;
bkey_copy(&wb_k->k, k);
dst->wb->keys.nr++;
dst->room--;
bch2_journal_key_to_wb_reserved(c, dst, btree, k);
return 0;
}
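
Since keys are now variable-length, the fast-path check compares available room against the key's size in u64s rather than a simple entry count, with the slowpath growing the buffer and retrying. A minimal standalone sketch of that reserve-then-append split, assuming a growable flat u64 buffer (illustrative code, not the bcachefs API):

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct wbuf {
    uint64_t *data;
    size_t    nr, size;    /* used / allocated, in u64s */
};

/* slow path: grow the buffer until the entry fits, then append */
static int append_slowpath(struct wbuf *b, const uint64_t *entry, size_t u64s)
{
    size_t new_size = b->size ? b->size : 8;

    while (new_size - b->nr < u64s)
        new_size *= 2;

    uint64_t *d = realloc(b->data, new_size * sizeof(*d));
    if (!d)
        return -1;
    b->data = d;
    b->size = new_size;

    memcpy(b->data + b->nr, entry, u64s * sizeof(*entry));
    b->nr += u64s;
    return 0;
}

/* fast path: room is checked in u64s, because entries vary in size */
static int append(struct wbuf *b, const uint64_t *entry, size_t u64s)
{
    if (b->size - b->nr < u64s)
        return append_slowpath(b, entry, u64s);

    memcpy(b->data + b->nr, entry, u64s * sizeof(*entry));
    b->nr += u64s;
    return 0;
}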

View File

@@ -6,7 +6,6 @@
#include "journal/types.h"
#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
struct wb_key_ref {
union {
@@ -38,11 +37,13 @@ union {
struct btree_write_buffered_key {
enum btree_id btree:8;
u64 journal_seq:56;
/* BTREE_WRITE_BUFERED_VAL_U64s_MAX only applies to accounting keys */
__BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
};
struct btree_write_buffer_keys {
DARRAY(struct btree_write_buffered_key) keys;
darray_u64 keys;
struct journal_entry_pin pin;
struct mutex lock;
};

View File

@@ -2239,6 +2239,8 @@ static int check_reconcile_work_btrees(struct btree_trans *trans)
struct bch_fs *c = trans->c;
CLASS(disk_reservation, res)(c);
struct progress_indicator progress;
bch2_progress_init_inner(&progress, c, 0, ~0ULL);
for (enum btree_id btree = 0; btree < btree_id_nr_alive(c); btree++) {
if (!bch2_btree_id_root(c, btree)->b)
@@ -2252,6 +2254,7 @@ static int check_reconcile_work_btrees(struct btree_trans *trans)
try(for_each_btree_key_continue(trans, iter, 0, k, ({
bch2_disk_reservation_put(c, &res.r);
progress_update_iter(trans, &progress, &iter) ?:
check_reconcile_work_btree_key(trans, &iter, k) ?:
bch2_trans_commit(trans, &res.r, NULL, BCH_TRANS_COMMIT_no_enospc);
})));
@@ -2274,10 +2277,15 @@ static int check_reconcile_btree_bp(struct btree_trans *trans, struct bkey_s_c k
noinline_for_stack
static int check_reconcile_btree_bps(struct btree_trans *trans)
{
struct progress_indicator progress;
bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_reconcile_scan));
return for_each_btree_key_max(trans, iter, BTREE_ID_reconcile_scan,
POS(1, 0), POS(1, U64_MAX),
BTREE_ITER_prefetch, k,
check_reconcile_btree_bp(trans, k));
BTREE_ITER_prefetch, k, ({
progress_update_iter(trans, &progress, &iter) ?:
check_reconcile_btree_bp(trans, k);
}));
}
int bch2_check_reconcile_work(struct bch_fs *c)
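
The x ?: y chaining used in these iterator bodies is GCC's conditional with omitted middle operand: the expression yields x when x is nonzero (an error code), in which case y is not evaluated, so each step runs only if the previous one returned 0. A standalone illustration (GCC/Clang extension):

#include <stdio.h>

static int step1(void) { return 0; }   /* succeeds */
static int step2(void) { return -5; }  /* fails */
static int step3(void) { return 0; }   /* never evaluated below */

int main(void)
{
    /* the first nonzero return short-circuits the chain */
    int ret = step1() ?: step2() ?: step3();

    printf("ret = %d\n", ret);    /* prints: ret = -5 */
    return 0;
}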

View File

@@ -1233,7 +1233,11 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
ja->discard_idx = ja->dirty_idx_ondisk =
ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
if (!ret)
bch_verbose_dev(ca, "journal read done");
else
bch_err_dev(ca, "journal read error %s", bch2_err_str(ret));
kvfree(buf.data);
enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read);
closure_return(cl);