From e84b0fbfa1f9f9bbfed7ef5ead8aa929243c787a Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Mon, 24 Nov 2025 03:19:07 -0500
Subject: [PATCH] Update bcachefs sources to efd3df255ba5 bcachefs: Btree node
 reads no longer kick off rewrites for degraded nodes

Signed-off-by: Kent Overstreet
---
 .bcachefs_revision                     |  2 +-
 libbcachefs/bcachefs.h                 | 14 +++--
 libbcachefs/btree/read.c               | 61 ++++++++------------
 libbcachefs/btree/update.h             |  3 +-
 libbcachefs/btree/write_buffer.c       | 79 ++++++++++++++------------
 libbcachefs/btree/write_buffer.h       | 58 ++++++++++++++++---
 libbcachefs/btree/write_buffer_types.h |  5 +-
 libbcachefs/data/reconcile.c           | 12 +++-
 libbcachefs/journal/read.c             |  6 +-
 9 files changed, 145 insertions(+), 95 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 6991ff6c..fd9f6c94 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-fba763d22acfb4feaacc45e88803f8b90c9740aa
+efd3df255ba56d795750510e79d8d79f7812a029
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 04b0c09a..2b2a3c6e 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -350,12 +350,14 @@ do {									\
 #define bch_verbose(c, ...)		bch_log(c, KERN_DEBUG, __VA_ARGS__)
 #define bch_verbose_ratelimited(c, ...)	bch_log_ratelimited(c, KERN_DEBUG, __VA_ARGS__)
 
-#define bch_info_dev(ca, fmt, ...)					\
-	bch2_print(c, KERN_INFO bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
-#define bch_notice_dev(ca, fmt, ...)					\
-	bch2_print(c, KERN_NOTICE bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
-#define bch_err_dev(ca, fmt, ...)					\
-	bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+#define bch_dev_log(ca, loglevel, fmt, ...)				\
+	bch2_print(ca->fs, loglevel bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+
+#define bch_err_dev(ca, ...)		bch_dev_log(ca, KERN_ERR, __VA_ARGS__)
+#define bch_notice_dev(ca, ...)		bch_dev_log(ca, KERN_NOTICE, __VA_ARGS__)
+#define bch_info_dev(ca, ...)		bch_dev_log(ca, KERN_INFO, __VA_ARGS__)
+#define bch_verbose_dev(ca, ...)	bch_dev_log(ca, KERN_DEBUG, __VA_ARGS__)
+
 #define bch_err_dev_offset(ca, _offset, fmt, ...)			\
 	bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
 #define bch_err_inum(c, _inum, fmt, ...)				\
diff --git a/libbcachefs/btree/read.c b/libbcachefs/btree/read.c
index 1f6dea4d..8e5c30ac 100644
--- a/libbcachefs/btree/read.c
+++ b/libbcachefs/btree/read.c
@@ -623,6 +623,20 @@ fsck_err:
 	return ret;
 }
 
+static bool btree_node_degraded(struct bch_fs *c, struct btree *b)
+{
+	guard(rcu)();
+	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+		if (ptr->dev == BCH_SB_MEMBER_INVALID)
+			continue;
+
+		struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+		if (!ca || ca->mi.state != BCH_MEMBER_STATE_rw)
+			return true;
+	}
+	return false;
+}
+
 int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 			      struct btree *b,
 			      struct bch_io_failures *failed,
@@ -912,43 +926,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 	if (updated_range)
 		bch2_btree_node_drop_keys_outside_node(b);
 
-	/*
-	 * XXX:
-	 *
-	 * We deadlock if too many btree updates require node rewrites while
-	 * we're still in journal replay.
-	 *
-	 * This is because btree node rewrites generate more updates for the
-	 * interior updates (alloc, backpointers), and if those updates touch
-	 * new nodes and generate more rewrites - well, you see the problem.
-	 *
-	 * The biggest cause is that we don't use the btree write buffer (for
-	 * the backpointer updates - this needs some real thought on locking in
-	 * order to fix.
-	 *
-	 * The problem with this workaround (not doing the rewrite for degraded
-	 * nodes in journal replay) is that those degraded nodes persist, and we
-	 * don't want that (this is a real bug when a btree node write completes
-	 * with fewer replicas than we wanted and leaves a degraded node due to
-	 * device _removal_, i.e. the device went away mid write).
-	 *
-	 * It's less of a bug here, but still a problem because we don't yet
-	 * have a way of tracking degraded data - we another index (all
-	 * extents/btree nodes, by replicas entry) in order to fix properly
-	 * (re-replicate degraded data at the earliest possible time).
-	 */
-	if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
-		scoped_guard(rcu)
-			bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
-				struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
-
-				if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
-					set_btree_node_need_rewrite(b);
-					set_btree_node_need_rewrite_degraded(b);
-				}
-			}
-	}
-
 	if (!ptr_written) {
 		set_btree_node_need_rewrite(b);
 		set_btree_node_need_rewrite_ptr_written_zero(b);
@@ -1052,6 +1029,16 @@ start:
 	if (ret || failed.nr)
 		bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
 
+	/*
+	 * Do this late; unlike other btree_node_need_rewrite() cases if a node
+	 * is merely degraded we should rewrite it before we update it, but we
+	 * don't need to kick off an async rewrite now:
+	 */
+	if (btree_node_degraded(c, b)) {
+		set_btree_node_need_rewrite(b);
+		set_btree_node_need_rewrite_degraded(b);
+	}
+
 	async_object_list_del(c, btree_read_bio, rb->list_idx);
 	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
 			       rb->start_time);
diff --git a/libbcachefs/btree/update.h b/libbcachefs/btree/update.h
index 87bcda97..ad9ee8d4 100644
--- a/libbcachefs/btree/update.h
+++ b/libbcachefs/btree/update.h
@@ -210,8 +210,7 @@ int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum btree_id, struct bk
 static inline int bch2_btree_write_buffer_insert_checks(struct bch_fs *c, enum btree_id btree,
 							struct bkey_i *k)
 {
-	if (unlikely(!btree_type_uses_write_buffer(btree) ||
-		     k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX))
+	if (unlikely(!btree_type_uses_write_buffer(btree)))
 		try(bch2_btree_write_buffer_insert_err(c, btree, k));
 
 	return 0;
diff --git a/libbcachefs/btree/write_buffer.c b/libbcachefs/btree/write_buffer.c
index a18f117b..dee9eef5 100644
--- a/libbcachefs/btree/write_buffer.c
+++ b/libbcachefs/btree/write_buffer.c
@@ -57,12 +57,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke
 #endif
 }
 
-static int wb_key_seq_cmp(const void *_l, const void *_r)
+static int wb_key_seq_cmp(const void *_l, const void *_r, const void *priv)
 {
-	const struct btree_write_buffered_key *l = _l;
-	const struct btree_write_buffered_key *r = _r;
+	const struct btree_write_buffer_keys *keys = priv;
+	const struct wb_key_ref *l = _l;
+	const struct wb_key_ref *r = _r;
 
-	return cmp_int(l->journal_seq, r->journal_seq);
+	return cmp_int(wb_keys_idx(keys, l->idx)->journal_seq,
+		       wb_keys_idx(keys, r->idx)->journal_seq);
 }
 
 /* Compare excluding idx, the low 24 bits: */
@@ -227,7 +229,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
  */
 
 static int btree_write_buffered_insert(struct btree_trans *trans,
-					  struct btree_write_buffered_key *wb)
+				       struct btree_write_buffered_key *wb)
 {
 	CLASS(btree_iter, iter)(trans, wb->btree, bkey_start_pos(&wb->k.k),
 				BTREE_ITER_cached|BTREE_ITER_intent);
@@ -247,7 +249,7 @@ static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
 	if (!wb->inc.keys.nr)
 		return;
 
-	bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
+	bch2_journal_pin_add(j, wb_keys_start(&wb->inc)->journal_seq, &wb->flushing.pin,
 			     bch2_btree_write_buffer_journal_flush);
 
 	darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
@@ -276,7 +278,7 @@ out:
 	if (!wb->inc.keys.nr)
 		bch2_journal_pin_drop(j, &wb->inc.pin);
 	else
-		bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
+		bch2_journal_pin_update(j, wb_keys_start(&wb->inc)->journal_seq, &wb->inc.pin,
 					bch2_btree_write_buffer_journal_flush);
 
 	if (j->watermark) {
@@ -326,12 +328,15 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 	u64 start_time = local_clock();
 	u64 nr_flushing = wb->flushing.keys.nr;
 
-	for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
-		wb->sorted.data[i].idx = i;
-		wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
-		memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
+	wb->sorted.nr = 0;
+	wb_keys_for_each(&wb->flushing, k) {
+		struct wb_key_ref *dst = &darray_top(wb->sorted);
+		wb->sorted.nr++;
+
+		dst->idx = (u64 *) k - wb->flushing.keys.data;
+		dst->btree = k->btree;
+		memcpy(&dst->pos, &k->k.k.p, sizeof(struct bpos));
 	}
-	wb->sorted.nr = wb->flushing.keys.nr;
 
 	/*
 	 * We first sort so that we can detect and skip redundant updates, and
@@ -350,7 +355,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 	wb_sort(wb->sorted.data, wb->sorted.nr);
 
 	darray_for_each(wb->sorted, i) {
-		struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
+		struct btree_write_buffered_key *k = wb_keys_idx(&wb->flushing, i->idx);
 
 		ret = bch2_btree_write_buffer_insert_checks(c, k->btree, &k->k);
 		if (unlikely(ret))
@@ -369,7 +374,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 
 		if (i + 1 < &darray_top(wb->sorted) &&
 		    wb_key_eq(i, i + 1)) {
-			struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
+			struct btree_write_buffered_key *n = wb_keys_idx(&wb->flushing, i[1].idx);
 
 			if (k->k.k.type == KEY_TYPE_accounting &&
 			    n->k.k.type == KEY_TYPE_accounting)
@@ -439,23 +444,25 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 		 */
 		trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
 
-		sort_nonatomic(wb->flushing.keys.data,
-			       wb->flushing.keys.nr,
-			       sizeof(wb->flushing.keys.data[0]),
-			       wb_key_seq_cmp, NULL);
+		sort_r_nonatomic(wb->sorted.data,
+				 wb->sorted.nr,
+				 sizeof(wb->sorted.data[0]),
+				 wb_key_seq_cmp, NULL,
+				 &wb->flushing);
 
-		darray_for_each(wb->flushing.keys, i) {
-			if (!i->journal_seq)
+		darray_for_each(wb->sorted, i) {
+			struct btree_write_buffered_key *k = wb_keys_idx(&wb->flushing, i->idx);
+			if (!k->journal_seq)
 				continue;
 
 			if (!accounting_replay_done &&
-			    i->k.k.type == KEY_TYPE_accounting) {
+			    k->k.k.type == KEY_TYPE_accounting) {
 				could_not_insert++;
 				continue;
 			}
 
 			if (!could_not_insert)
-				bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+				bch2_journal_pin_update(j, k->journal_seq, &wb->flushing.pin,
 							bch2_btree_write_buffer_journal_flush);
 
 			bch2_trans_begin(trans);
@@ -466,11 +473,11 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 					BCH_TRANS_COMMIT_no_check_rw|
 					BCH_TRANS_COMMIT_no_enospc|
 					BCH_TRANS_COMMIT_no_journal_res ,
-					btree_write_buffered_insert(trans, i));
+					btree_write_buffered_insert(trans, k));
 			if (ret)
 				goto err;
 
-			i->journal_seq = 0;
+			k->journal_seq = 0;
 		}
 
 		/*
@@ -492,12 +499,14 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 		 * distinct counters touched somehow was very large.
 		 */
 		if (could_not_insert) {
-			struct btree_write_buffered_key *dst = wb->flushing.keys.data;
+			struct btree_write_buffered_key *dst = wb_keys_start(&wb->flushing);
 
-			darray_for_each(wb->flushing.keys, i)
-				if (i->journal_seq)
-					*dst++ = *i;
-			wb->flushing.keys.nr = dst - wb->flushing.keys.data;
+			wb_keys_for_each_safe(&wb->flushing, i)
+				if (i->journal_seq) {
+					memmove_u64s_down(dst, i, wb_key_u64s(&i->k));
+					dst = wb_key_next(dst);
+				}
+			wb->flushing.keys.nr = (u64 *) dst - wb->flushing.keys.data;
 		}
 	}
 err:
@@ -745,9 +754,10 @@ int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
 			     enum btree_id btree, struct bkey_i *k)
 {
 	struct btree_write_buffer *wb = &c->btree_write_buffer;
+	unsigned u64s = wb_key_u64s(k);
 	int ret;
 retry:
-	ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
+	ret = darray_make_room_gfp(&dst->wb->keys, u64s, GFP_KERNEL);
 	if (!ret && dst->wb == &wb->flushing)
 		ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
 
@@ -766,15 +776,10 @@ retry:
 	dst->room = darray_room(dst->wb->keys);
 	if (dst->wb == &wb->flushing)
 		dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
-	BUG_ON(!dst->room);
+	BUG_ON(dst->room < u64s);
 	BUG_ON(!dst->seq);
 
-	struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
-	wb_k->journal_seq	= dst->seq;
-	wb_k->btree		= btree;
-	bkey_copy(&wb_k->k, k);
-	dst->wb->keys.nr++;
-	dst->room--;
+	bch2_journal_key_to_wb_reserved(c, dst, btree, k);
 	return 0;
 }
 
diff --git a/libbcachefs/btree/write_buffer.h b/libbcachefs/btree/write_buffer.h
index ffc9b199..72fae5b5 100644
--- a/libbcachefs/btree/write_buffer.h
+++ b/libbcachefs/btree/write_buffer.h
@@ -93,19 +93,63 @@ int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
 			struct journal_keys_to_wb *,
 			enum btree_id, struct bkey_i *);
 
+static inline unsigned wb_key_u64s(const struct bkey_i *k)
+{
+	return k->k.u64s + offsetof(struct btree_write_buffered_key, k) / sizeof(u64);
+}
+
+static inline struct btree_write_buffered_key *wb_keys_start(const struct btree_write_buffer_keys *keys)
+{
+	return (struct btree_write_buffered_key *) &darray_first(keys->keys);
+}
+
+static inline struct btree_write_buffered_key *wb_keys_end(const struct btree_write_buffer_keys *keys)
+{
+	return (struct btree_write_buffered_key *) &darray_top(keys->keys);
+}
+
+static inline struct btree_write_buffered_key *wb_keys_idx(const struct btree_write_buffer_keys *keys,
+							   unsigned idx)
+{
+	return (struct btree_write_buffered_key *) &keys->keys.data[idx];
+}
+
+static inline struct btree_write_buffered_key *wb_key_next(const struct btree_write_buffered_key *k)
+{
+	return (struct btree_write_buffered_key *) ((u64 *) k + wb_key_u64s(&k->k));
+}
+
+#define wb_keys_for_each(_keys, _k)					\
+	for (struct btree_write_buffered_key *_k = wb_keys_start(_keys);\
+	     _k != wb_keys_end(_keys);					\
+	     _k = wb_key_next(_k))
+
+#define wb_keys_for_each_safe(_keys, _k)				\
+	for (struct btree_write_buffered_key *_next, *_k = wb_keys_start(_keys);\
+	     _k != wb_keys_end(_keys) && (_next = wb_key_next(_k), true);\
+	     _k = _next)
+
+static inline void bch2_journal_key_to_wb_reserved(struct bch_fs *c,
+						   struct journal_keys_to_wb *dst,
+						   enum btree_id btree, struct bkey_i *k)
+{
+	unsigned u64s = wb_key_u64s(k);
+	struct btree_write_buffered_key *wb_k = wb_keys_end(dst->wb);
+	wb_k->journal_seq	= dst->seq;
+	wb_k->btree		= btree;
+	bkey_copy(&wb_k->k, k);
+	dst->wb->keys.nr	+= u64s;
+	dst->room		-= u64s;
+}
+
 static inline int __bch2_journal_key_to_wb(struct bch_fs *c,
 					   struct journal_keys_to_wb *dst,
 					   enum btree_id btree, struct bkey_i *k)
 {
-	if (unlikely(!dst->room))
+	if (unlikely(dst->room < wb_key_u64s(k)))
 		return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
 
-	struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
-	wb_k->journal_seq	= dst->seq;
-	wb_k->btree		= btree;
-	bkey_copy(&wb_k->k, k);
-	dst->wb->keys.nr++;
-	dst->room--;
+	bch2_journal_key_to_wb_reserved(c, dst, btree, k);
 	return 0;
 }
 
diff --git a/libbcachefs/btree/write_buffer_types.h b/libbcachefs/btree/write_buffer_types.h
index cfb38cd5..1b13c80c 100644
--- a/libbcachefs/btree/write_buffer_types.h
+++ b/libbcachefs/btree/write_buffer_types.h
@@ -6,7 +6,6 @@
 #include "journal/types.h"
 
 #define BTREE_WRITE_BUFERED_VAL_U64s_MAX	4
-#define BTREE_WRITE_BUFERED_U64s_MAX		(BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
 
 struct wb_key_ref {
 union {
@@ -38,11 +37,13 @@ union {
 
 struct btree_write_buffered_key {
 	enum btree_id		btree:8;
 	u64			journal_seq:56;
+
+	/* BTREE_WRITE_BUFERED_VAL_U64s_MAX only applies to accounting keys */
 	__BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
 };
 
 struct btree_write_buffer_keys {
-	DARRAY(struct btree_write_buffered_key) keys;
+	darray_u64		keys;
 	struct journal_entry_pin pin;
 	struct mutex		lock;
 };
diff --git a/libbcachefs/data/reconcile.c b/libbcachefs/data/reconcile.c
index bd17cc64..c52d1c12 100644
--- a/libbcachefs/data/reconcile.c
+++ b/libbcachefs/data/reconcile.c
@@ -2239,6 +2239,8 @@ static int check_reconcile_work_btrees(struct btree_trans *trans)
 	struct bch_fs *c = trans->c;
 	CLASS(disk_reservation, res)(c);
+	struct progress_indicator progress;
+	bch2_progress_init_inner(&progress, c, 0, ~0ULL);
 
 	for (enum btree_id btree = 0; btree < btree_id_nr_alive(c); btree++) {
 		if (!bch2_btree_id_root(c, btree)->b)
@@ -2252,6 +2254,7 @@
 
 		try(for_each_btree_key_continue(trans, iter, 0, k, ({
 			bch2_disk_reservation_put(c, &res.r);
+			progress_update_iter(trans, &progress, &iter) ?:
 			check_reconcile_work_btree_key(trans, &iter, k) ?:
 			bch2_trans_commit(trans, &res.r, NULL, BCH_TRANS_COMMIT_no_enospc);
 		})));
@@ -2274,10 +2277,15 @@ static int check_reconcile_btree_bp(struct btree_trans *trans, struct bkey_s_c k
 noinline_for_stack
 static int check_reconcile_btree_bps(struct btree_trans *trans)
 {
+	struct progress_indicator progress;
+	bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_reconcile_scan));
+
 	return for_each_btree_key_max(trans, iter, BTREE_ID_reconcile_scan,
 				      POS(1, 0), POS(1, U64_MAX),
-				      BTREE_ITER_prefetch, k,
-				      check_reconcile_btree_bp(trans, k));
+				      BTREE_ITER_prefetch, k, ({
+			progress_update_iter(trans, &progress, &iter) ?:
+			check_reconcile_btree_bp(trans, k);
+	}));
 }
 
 int bch2_check_reconcile_work(struct bch_fs *c)
diff --git a/libbcachefs/journal/read.c b/libbcachefs/journal/read.c
index e7531f1e..3c2afae8 100644
--- a/libbcachefs/journal/read.c
+++ b/libbcachefs/journal/read.c
@@ -1233,7 +1233,11 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
 	ja->discard_idx = ja->dirty_idx_ondisk =
 		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
 out:
-	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
+	if (!ret)
+		bch_verbose_dev(ca, "journal read done");
+	else
+		bch_err_dev(ca, "journal read error %s", bch2_err_str(ret));
+
 	kvfree(buf.data);
 	enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read);
 	closure_return(cl);
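
Illustration (appended after the diff; not part of the applied patch): the
write buffer change above replaces the fixed-stride
DARRAY(struct btree_write_buffered_key) with a flat darray_u64, so buffered
keys are now packed back to back and each slot is sized individually by
wb_key_u64s() - which is also why the k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX
check could be dropped from bch2_btree_write_buffer_insert_checks(). Below is
a minimal, self-contained C sketch of that layout. wb_key_u64s() and
wb_key_next() mirror the helpers the patch adds to write_buffer.h, but
struct skey and struct wb_key are simplified stand-ins (a bare u64s count
instead of a real bkey_i), so read it as a sketch of the technique, not as
the kernel code.

/*
 * Sketch of the variable-length write buffer layout: keys are packed
 * back to back in a flat u64 buffer, each slot individually sized, and
 * walked with a wb_key_next()-style iterator.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* stand-in for struct bkey_i: just a size in u64s, incl. this header word */
struct skey {
	uint8_t		u64s;
};

/* stand-in for struct btree_write_buffered_key */
struct wb_key {
	uint64_t	journal_seq;
	struct skey	k;	/* key material continues in the buffer */
};

/* slot size in u64s: the wrapped key plus the journal_seq header word */
static unsigned wb_key_u64s(const struct skey *k)
{
	return k->u64s + offsetof(struct wb_key, k) / sizeof(uint64_t);
}

static struct wb_key *wb_key_next(struct wb_key *k)
{
	return (struct wb_key *) ((uint64_t *) k + wb_key_u64s(&k->k));
}

int main(void)
{
	uint64_t buf[64];
	unsigned nr = 0;	/* buffer usage in u64s, cf. keys.nr */

	/* append two keys of different sizes, back to back */
	for (uint8_t u64s = 1; u64s <= 2; u64s++) {
		struct wb_key *dst = (struct wb_key *) (buf + nr);

		dst->journal_seq = 100 + u64s;
		dst->k.u64s = u64s;
		nr += wb_key_u64s(&dst->k);
	}

	/* walk the buffer, advancing by each key's own size */
	for (struct wb_key *k = (struct wb_key *) buf;
	     k != (struct wb_key *) (buf + nr);
	     k = wb_key_next(k))
		printf("seq %llu, slot %u u64s\n",
		       (unsigned long long) k->journal_seq,
		       wb_key_u64s(&k->k));
	return 0;
}

The cost of giving up a fixed stride is visible in the patch itself: slots
can no longer be indexed directly, so wb_key_ref.idx becomes a u64 offset
into the buffer resolved through wb_keys_idx(), and the slowpath now sorts
the wb_key_ref array with sort_r_nonatomic(), passing the key buffer as the
priv argument to wb_key_seq_cmp().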