From fa1882de618f7a96449faef8aa4f90ab5f3a0380 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 23 Oct 2025 22:22:03 -0400 Subject: [PATCH] Update bcachefs sources to e3e6e947d0c9 bcachefs: Clear recovery_passes_required when initializing --- .bcachefs_revision | 2 +- libbcachefs/alloc/background.c | 36 ++++++----- libbcachefs/alloc/backpointers.c | 51 ++++++++-------- libbcachefs/alloc/backpointers.h | 8 +-- libbcachefs/alloc/check.c | 9 +-- libbcachefs/alloc/lru.c | 9 +-- libbcachefs/alloc/lru.h | 5 +- libbcachefs/alloc/replicas.c | 75 ++++++++++++++++++++++- libbcachefs/alloc/replicas.h | 4 +- libbcachefs/btree/bkey.h | 24 +++++--- libbcachefs/btree/commit.c | 49 ++++++++++----- libbcachefs/btree/interior.c | 34 +++++------ libbcachefs/btree/iter.c | 6 +- libbcachefs/btree/key_cache.c | 102 ++++++++++++++----------------- libbcachefs/btree/update.h | 1 + libbcachefs/btree/write.c | 5 +- libbcachefs/btree/write_buffer.c | 37 ++++++++--- libbcachefs/btree/write_buffer.h | 28 ++++++++- libbcachefs/data/ec.c | 13 ++-- libbcachefs/data/migrate.c | 9 +-- libbcachefs/data/move.c | 4 +- libbcachefs/data/rebalance.c | 7 ++- libbcachefs/data/update.c | 17 ++++-- libbcachefs/debug/trace.h | 10 +-- libbcachefs/init/dev.c | 3 +- libbcachefs/init/fs.c | 50 +++------------ libbcachefs/init/recovery.c | 8 +++ libbcachefs/journal/journal.c | 7 --- libbcachefs/journal/read.c | 8 +-- libbcachefs/journal/reclaim.c | 3 +- libbcachefs/journal/write.c | 4 +- libbcachefs/opts.c | 10 +++ libbcachefs/sb/errors_format.h | 3 +- libbcachefs/sb/io.c | 4 +- 34 files changed, 392 insertions(+), 253 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index b3f8497e..356a62c2 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -ed9ece3835e374b4740124c5c3597f42c3b6d354 +e3e6e947d0c9af7dce749a5d9a88ef5d6cc60311 diff --git a/libbcachefs/alloc/background.c b/libbcachefs/alloc/background.c index d753cca0..64b6029c 100644 --- a/libbcachefs/alloc/background.c +++ b/libbcachefs/alloc/background.c @@ -1234,7 +1234,7 @@ put_ref: static int invalidate_one_bp(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c_backpointer bp, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { CLASS(btree_iter_uninit, iter)(trans); struct bkey_s_c k = bkey_try(bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed)); @@ -1252,7 +1252,7 @@ static int invalidate_one_bucket_by_bps(struct btree_trans *trans, struct bch_dev *ca, struct bpos bucket, u8 gen, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket); struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket); @@ -1281,7 +1281,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *lru_iter, struct bkey_s_c lru_k, - struct bkey_buf *last_flushed, + struct wb_maybe_flush *last_flushed, s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; @@ -1364,8 +1364,8 @@ static void bch2_do_invalidates_work(struct work_struct *work) CLASS(btree_trans, trans)(c); int ret = 0; - struct bkey_buf last_flushed __cleanup(bch2_bkey_buf_exit); - bch2_bkey_buf_init(&last_flushed); + struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); + wb_maybe_flush_init(&last_flushed); ret = bch2_btree_write_buffer_tryflush(trans); if (ret) @@ -1398,6 +1398,7 @@ restart_err: if (ret) break; + wb_maybe_flush_inc(&last_flushed); bch2_btree_iter_advance(&iter); } bch2_trans_iter_exit(&iter); @@ -1593,13 +1594,22 @@ void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) { /* BCH_DATA_free == all rw devs */ - for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - if (rw && - (i == BCH_DATA_free || - (ca->mi.data_allowed & BIT(i)))) - set_bit(ca->dev_idx, c->rw_devs[i].d); - else - clear_bit(ca->dev_idx, c->rw_devs[i].d); + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) { + bool data_type_rw = rw; + + if (i != BCH_DATA_free && + !(ca->mi.data_allowed & BIT(i))) + data_type_rw = false; + + if ((i == BCH_DATA_journal || + i == BCH_DATA_btree) && + !ca->mi.durability) + data_type_rw = false; + + mod_bit(ca->dev_idx, c->rw_devs[i].d, data_type_rw); + } + + c->rw_devs_change_count++; } /* device goes ro: */ @@ -1610,8 +1620,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) /* First, remove device from allocation groups: */ bch2_dev_allocator_set_rw(c, ca, false); - c->rw_devs_change_count++; - /* * Capacity is calculated based off of devices in allocation groups: */ diff --git a/libbcachefs/alloc/backpointers.c b/libbcachefs/alloc/backpointers.c index 7bab9b6f..c030ad02 100644 --- a/libbcachefs/alloc/backpointers.c +++ b/libbcachefs/alloc/backpointers.c @@ -6,7 +6,6 @@ #include "alloc/backpointers.h" #include "btree/bbpos.h" -#include "btree/bkey_buf.h" #include "btree/cache.h" #include "btree/update.h" #include "btree/interior.h" @@ -187,7 +186,7 @@ static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, struct bkey_s_c visiting_k, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) @@ -197,7 +196,7 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, static int backpointer_target_not_found(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct bkey_s_c target_k, - struct bkey_buf *last_flushed, + struct wb_maybe_flush *last_flushed, bool commit) { struct bch_fs *c = trans->c; @@ -260,7 +259,7 @@ fsck_err: static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct btree_iter *iter, - struct bkey_buf *last_flushed, + struct wb_maybe_flush *last_flushed, bool commit) { struct bch_fs *c = trans->c; @@ -306,7 +305,7 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct btree_iter *iter, unsigned iter_flags, - struct bkey_buf *last_flushed, + struct wb_maybe_flush *last_flushed, bool commit) { struct bch_fs *c = trans->c; @@ -358,7 +357,7 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct btree_iter *iter, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true); } @@ -367,13 +366,13 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct btree_iter *iter, unsigned iter_flags, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true); } static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { if (k.k->type != KEY_TYPE_backpointer) return 0; @@ -415,11 +414,10 @@ fsck_err: int bch2_check_btree_backpointers(struct bch_fs *c) { struct progress_indicator_state progress; - bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_backpointers)); - struct bkey_buf last_flushed __cleanup(bch2_bkey_buf_exit); - bch2_bkey_buf_init(&last_flushed); + struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); + wb_maybe_flush_init(&last_flushed); CLASS(btree_trans, trans)(c); return for_each_btree_key_commit(trans, iter, @@ -431,9 +429,9 @@ int bch2_check_btree_backpointers(struct bch_fs *c) } struct extents_to_bp_state { - struct bpos bp_start; - struct bpos bp_end; - struct bkey_buf last_flushed; + struct bpos bp_start; + struct bpos bp_end; + struct wb_maybe_flush last_flushed; }; static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, @@ -790,6 +788,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, try(for_each_btree_key_continue(trans, iter, 0, k, ({ bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers") ?: + wb_maybe_flush_inc(&s->last_flushed) ?: check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); }))); @@ -825,11 +824,11 @@ static int data_type_to_alloc_counter(enum bch_data_type t) } static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos, - struct bkey_buf *last_flushed); + struct wb_maybe_flush *last_flushed); static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, bool *had_mismatch, - struct bkey_buf *last_flushed, + struct wb_maybe_flush *last_flushed, struct bpos *last_pos, unsigned *nr_iters) { @@ -1100,7 +1099,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) struct bpos last_pos = POS_MIN; unsigned nr_iters = 0; - bch2_bkey_buf_init(&s.last_flushed); + wb_maybe_flush_init(&s.last_flushed); ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, ({ @@ -1132,8 +1131,8 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) if ( bpos_eq(s.bp_start, POS_MIN) && !bpos_eq(s.bp_end, SPOS_MAX)) - bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", - __func__, btree_nodes_fit_in_ram(c)); + bch_info(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", + __func__, btree_nodes_fit_in_ram(c)); if (!bpos_eq(s.bp_start, POS_MIN) || !bpos_eq(s.bp_end, SPOS_MAX)) { @@ -1159,7 +1158,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); } err: - bch2_bkey_buf_exit(&s.last_flushed); + wb_maybe_flush_exit(&s.last_flushed); bch2_btree_cache_unpin(c); return ret; } @@ -1167,7 +1166,7 @@ err: static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans, struct bpos bucket, bool *had_mismatch, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { CLASS(btree_iter, alloc_iter)(trans, BTREE_ID_alloc, bucket, BTREE_ITER_cached); struct bkey_s_c k = bkey_try(bch2_btree_iter_peek_slot(&alloc_iter)); @@ -1182,7 +1181,7 @@ static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans, int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bch_dev *ca, u64 bucket, bool copygc, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { struct bch_fs *c = trans->c; bool had_mismatch; @@ -1215,7 +1214,7 @@ static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, struct bkey_s_c bp_k, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { if (bp_k.k->type != KEY_TYPE_backpointer) return 0; @@ -1237,7 +1236,7 @@ static int check_one_backpointer(struct btree_trans *trans, static int check_bucket_backpointers_to_extents(struct btree_trans *trans, struct bch_dev *ca, struct bpos bucket, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { u32 restart_count = trans->restart_count; @@ -1257,8 +1256,8 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) { - struct bkey_buf last_flushed __cleanup(bch2_bkey_buf_exit); - bch2_bkey_buf_init(&last_flushed); + struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); + wb_maybe_flush_init(&last_flushed); struct progress_indicator_state progress; bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); diff --git a/libbcachefs/alloc/backpointers.h b/libbcachefs/alloc/backpointers.h index 21cf4fa4..d9fb481d 100644 --- a/libbcachefs/alloc/backpointers.h +++ b/libbcachefs/alloc/backpointers.h @@ -174,14 +174,14 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, }; } -struct bkey_buf; +struct wb_maybe_flush; struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *, unsigned, struct bkey_buf *); + struct btree_iter *, unsigned, struct wb_maybe_flush *); struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, - struct btree_iter *, struct bkey_buf *); + struct btree_iter *, struct wb_maybe_flush *); int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64, - bool, struct bkey_buf *); + bool, struct wb_maybe_flush *); int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); diff --git a/libbcachefs/alloc/check.c b/libbcachefs/alloc/check.c index 50a8d0c4..6a9efe87 100644 --- a/libbcachefs/alloc/check.c +++ b/libbcachefs/alloc/check.c @@ -5,9 +5,9 @@ #include "alloc/check.h" #include "alloc/lru.h" -#include "btree/bkey_buf.h" #include "btree/cache.h" #include "btree/update.h" +#include "btree/write_buffer.h" #include "data/ec.h" @@ -619,7 +619,7 @@ bkey_err: static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, struct btree_iter *alloc_iter, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { struct bch_fs *c = trans->c; struct bch_alloc_v4 a_convert; @@ -670,8 +670,8 @@ fsck_err: int bch2_check_alloc_to_lru_refs(struct bch_fs *c) { - struct bkey_buf last_flushed __cleanup(bch2_bkey_buf_exit); - bch2_bkey_buf_init(&last_flushed); + struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); + wb_maybe_flush_init(&last_flushed); struct progress_indicator_state progress; bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_alloc)); @@ -681,6 +681,7 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ progress_update_iter(trans, &progress, &iter) ?: + wb_maybe_flush_inc(&last_flushed) ?: bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed); }))?: bch2_check_stripe_to_lru_refs(trans); } diff --git a/libbcachefs/alloc/lru.c b/libbcachefs/alloc/lru.c index f1df35c9..9fe222b1 100644 --- a/libbcachefs/alloc/lru.c +++ b/libbcachefs/alloc/lru.c @@ -80,7 +80,7 @@ int bch2_lru_check_set(struct btree_trans *trans, u64 dev_bucket, u64 time, struct bkey_s_c referring_k, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { struct bch_fs *c = trans->c; int ret = 0; @@ -168,7 +168,7 @@ static u64 bkey_lru_type_idx(struct bch_fs *c, static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { struct bch_fs *c = trans->c; CLASS(printbuf, buf1)(); @@ -202,8 +202,8 @@ fsck_err: int bch2_check_lrus(struct bch_fs *c) { - struct bkey_buf last_flushed __cleanup(bch2_bkey_buf_exit); - bch2_bkey_buf_init(&last_flushed); + struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); + wb_maybe_flush_init(&last_flushed); struct progress_indicator_state progress; bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_lru)); @@ -213,6 +213,7 @@ int bch2_check_lrus(struct bch_fs *c) BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ progress_update_iter(trans, &progress, &iter) ?: + wb_maybe_flush_inc(&last_flushed) ?: bch2_check_lru_key(trans, &iter, k, &last_flushed); })); } diff --git a/libbcachefs/alloc/lru.h b/libbcachefs/alloc/lru.h index d5a2620f..2a9e2bf4 100644 --- a/libbcachefs/alloc/lru.h +++ b/libbcachefs/alloc/lru.h @@ -72,8 +72,9 @@ static inline int bch2_lru_change(struct btree_trans *trans, int bch2_dev_remove_lrus(struct bch_fs *, struct bch_dev *); -struct bkey_buf; -int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *); +struct wb_maybe_flush; +int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, + struct wb_maybe_flush *); int bch2_check_lrus(struct bch_fs *); #endif /* _BCACHEFS_LRU_H */ diff --git a/libbcachefs/alloc/replicas.c b/libbcachefs/alloc/replicas.c index 462491de..ba1afb74 100644 --- a/libbcachefs/alloc/replicas.c +++ b/libbcachefs/alloc/replicas.c @@ -776,8 +776,8 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { /* Query replicas: */ -bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, - unsigned flags, struct printbuf *err) +bool bch2_can_read_fs_with_devs(struct bch_fs *c, struct bch_devs_mask devs, + unsigned flags, struct printbuf *err) { struct bch_replicas_entry_v1 *e; @@ -829,6 +829,77 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, return true; } +bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + unsigned flags, struct printbuf *err, + bool write) +{ + if (write) { + unsigned nr_have[BCH_DATA_NR]; + memset(nr_have, 0, sizeof(nr_have)); + + unsigned nr_online[BCH_DATA_NR]; + memset(nr_online, 0, sizeof(nr_online)); + + scoped_guard(rcu) + for_each_member_device_rcu(c, ca, &devs) { + if (!ca->mi.durability) + continue; + + bool online = ca->mi.state == BCH_MEMBER_STATE_rw && + test_bit(ca->dev_idx, devs.d); + + for (unsigned i = 0; i < BCH_DATA_NR; i++) { + nr_have[i] += ca->mi.data_allowed & BIT(i) ? ca->mi.durability : 0; + + if (online) + nr_online[i] += ca->mi.data_allowed & BIT(i) ? ca->mi.durability : 0; + } + } + + if (!nr_online[BCH_DATA_journal]) { + prt_printf(err, "No rw journal devices online\n"); + return false; + } + + if (!nr_online[BCH_DATA_btree]) { + prt_printf(err, "No rw btree devices online\n"); + return false; + } + + if (!nr_online[BCH_DATA_user]) { + prt_printf(err, "No rw user data devices online\n"); + return false; + } + + if (!(flags & BCH_FORCE_IF_METADATA_DEGRADED)) { + if (nr_online[BCH_DATA_journal] < nr_have[BCH_DATA_journal] && + nr_online[BCH_DATA_journal] < c->opts.metadata_replicas) { + prt_printf(err, "Insufficient rw journal devices (%u) online\n", + nr_online[BCH_DATA_journal]); + return false; + } + + if (nr_online[BCH_DATA_btree] < nr_have[BCH_DATA_btree] && + nr_online[BCH_DATA_btree] < c->opts.metadata_replicas) { + prt_printf(err, "Insufficient rw btree devices (%u) online\n", + nr_online[BCH_DATA_btree]); + return false; + } + } + + if (!(flags & BCH_FORCE_IF_DATA_DEGRADED)) { + if (nr_online[BCH_DATA_user] < nr_have[BCH_DATA_user] && + nr_online[BCH_DATA_user] < c->opts.data_replicas) { + prt_printf(err, "Insufficient rw user data devices (%u) online\n", + nr_online[BCH_DATA_user]); + return false; + } + } + } + + return bch2_can_read_fs_with_devs(c, devs, flags, err); +} + unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) { struct bch_sb_field_replicas *replicas; diff --git a/libbcachefs/alloc/replicas.h b/libbcachefs/alloc/replicas.h index d0938b0b..8565e58c 100644 --- a/libbcachefs/alloc/replicas.h +++ b/libbcachefs/alloc/replicas.h @@ -43,8 +43,10 @@ static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e, e->devs[0] = dev; } +bool bch2_can_read_fs_with_devs(struct bch_fs *, struct bch_devs_mask, + unsigned, struct printbuf *); bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, - unsigned, struct printbuf *); + unsigned, struct printbuf *, bool); unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); diff --git a/libbcachefs/btree/bkey.h b/libbcachefs/btree/bkey.h index e6bca7de..dd9b8629 100644 --- a/libbcachefs/btree/bkey.h +++ b/libbcachefs/btree/bkey.h @@ -188,14 +188,6 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) return bkey_gt(l, r) ? l : r; } -static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) -{ - return bpos_eq(l.k->p, r.k->p) && - l.k->size == r.k->size && - bkey_bytes(l.k) == bkey_bytes(r.k) && - !memcmp(l.v, r.v, bkey_val_bytes(l.k)); -} - void bch2_bpos_swab(struct bpos *); void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); @@ -205,6 +197,22 @@ static __always_inline int bversion_cmp(struct bversion l, struct bversion r) cmp_int(l.lo, r.lo); } +static __always_inline bool bversion_eq(struct bversion l, struct bversion r) +{ + return l.hi == r.hi && + l.lo == r.lo; +} + +static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) +{ + return l.k->u64s == r.k->u64s && + l.k->type == r.k->type && + bpos_eq(l.k->p, r.k->p) && + bversion_eq(l.k->bversion, r.k->bversion) && + l.k->size == r.k->size && + !memcmp(l.v, r.v, bkey_val_bytes(l.k)); +} + #define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) #define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) diff --git a/libbcachefs/btree/commit.c b/libbcachefs/btree/commit.c index 07e84835..c9a05eb6 100644 --- a/libbcachefs/btree/commit.c +++ b/libbcachefs/btree/commit.c @@ -67,11 +67,8 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert k = bkey_i_to_s_c(j_k); } - u = *k.k; - u.needs_whiteout = i->old_k.needs_whiteout; - - BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); - BUG_ON(i->old_v != k.v); + struct bkey_s_c old = { &i->old_k, i->old_v }; + BUG_ON(!bkey_and_val_eq(k, old)); #endif } @@ -692,14 +689,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, trans_for_each_update(trans, i) if (btree_node_type_has_atomic_triggers(i->bkey_type)) { ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_atomic|i->flags); - if (ret) - goto fatal_err; + if (bch2_fs_fatal_err_on(ret, c, "fatal error in transaction commit: %s", bch2_err_str(ret))) + return ret; } if (unlikely(c->gc_pos.phase)) { ret = bch2_trans_commit_run_gc_triggers(trans); - if (ret) - goto fatal_err; + if (bch2_fs_fatal_err_on(ret, c, "fatal error in transaction commit: %s", bch2_err_str(ret))) + return ret; } struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit }; @@ -716,7 +713,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (unlikely(ret)) { bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", trans->fn); - goto fatal_err; + bch2_sb_error_count(c, BCH_FSCK_ERR_validate_error_in_commit); + __WARN(); + return ret; } } @@ -728,7 +727,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, if (unlikely(ret)){ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", trans->fn, (void *) i->ip_allocated); - goto fatal_err; + bch2_sb_error_count(c, BCH_FSCK_ERR_validate_error_in_commit); + __WARN(); + return ret; } btree_insert_entry_checks(trans, i); } @@ -795,9 +796,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, } return 0; -fatal_err: - bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); - return ret; } static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) @@ -1055,6 +1053,29 @@ int __bch2_trans_commit(struct btree_trans *trans, enum bch_trans_commit_flags f if (ret) goto out_reset; + if (likely(!(flags & BCH_TRANS_COMMIT_no_skip_noops))) { + struct btree_insert_entry *dst = trans->updates; + trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + + /* + * We can't drop noop inode updates because fsync relies + * on grabbing the journal_seq of the latest update from + * the inode - and the journal_seq isn't updated until + * the atomic trigger: + */ + if (likely(i->bkey_type == BKEY_TYPE_inodes || + !bkey_and_val_eq(old, bkey_i_to_s_c(i->k)))) + *dst++ = *i; + else + bch2_path_put(trans, i->path, true); + } + trans->nr_updates = dst - trans->updates; + + if (!bch2_trans_has_updates(trans)) + goto out_reset; + } + if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) { if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) diff --git a/libbcachefs/btree/interior.c b/libbcachefs/btree/interior.c index edf5f1f2..111d9588 100644 --- a/libbcachefs/btree/interior.c +++ b/libbcachefs/btree/interior.c @@ -2390,28 +2390,17 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BTREE_TRIGGER_transactional)); } - CLASS(btree_iter_uninit, iter2)(trans); - struct btree *parent = btree_node_parent(btree_iter_path(trans, iter), b); - if (parent) { - bch2_trans_copy_iter(&iter2, iter); + if (!btree_node_is_root(c, b)) { + CLASS(btree_node_iter, parent_iter)(trans, + b->c.btree_id, + b->key.k.p, + 0, + b->c.level + 1, + BTREE_ITER_intent); - iter2.path = bch2_btree_path_make_mut(trans, iter2.path, - iter2.flags & BTREE_ITER_intent, - _THIS_IP_); - - struct btree_path *path2 = btree_iter_path(trans, &iter2); - BUG_ON(path2->level != b->c.level); - BUG_ON(!bpos_eq(path2->pos, new_key->k.p)); - - btree_path_set_level_up(trans, path2); - - trans->paths_sorted = false; - - try(bch2_btree_iter_traverse(&iter2)); - try(bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun)); + try(bch2_btree_iter_traverse(&parent_iter)); + try(bch2_trans_update(trans, &parent_iter, new_key, BTREE_TRIGGER_norun)); } else { - BUG_ON(!btree_node_is_root(c, b)); - struct jset_entry *e = errptr_try(bch2_trans_jset_entry_alloc(trans, jset_u64s(new_key->k.u64s))); @@ -2453,6 +2442,11 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite { struct btree_path *path = btree_iter_path(trans, iter); + /* + * Awkward - we can't rely on caller specifying BTREE_ITER_intent, and + * the commit will downgrade locks + */ + try(bch2_btree_path_upgrade(trans, path, b->c.level + 1)); path->intent_ref++; diff --git a/libbcachefs/btree/iter.c b/libbcachefs/btree/iter.c index a7d66d7f..caa14b96 100644 --- a/libbcachefs/btree/iter.c +++ b/libbcachefs/btree/iter.c @@ -1514,11 +1514,13 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) i->cached, (void *) i->ip_allocated); - prt_printf(buf, " old "); + guard(printbuf_indent)(buf); + + prt_printf(buf, "old "); bch2_bkey_val_to_text(buf, trans->c, old); prt_newline(buf); - prt_printf(buf, " new "); + prt_printf(buf, "new "); bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); prt_newline(buf); } diff --git a/libbcachefs/btree/key_cache.c b/libbcachefs/btree/key_cache.c index d3506f3f..0532e9a9 100644 --- a/libbcachefs/btree/key_cache.c +++ b/libbcachefs/btree/key_cache.c @@ -410,7 +410,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct bkey_cached *ck = NULL; - int ret; CLASS(btree_iter, b_iter)(trans, key.btree_id, key.pos, BTREE_ITER_slots| @@ -427,69 +426,56 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, if (!ck) return 0; - if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - if (evict) - goto evict; - return 0; + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + if (journal_seq && ck->journal.seq != journal_seq) + return 0; + + trans->journal_res.seq = ck->journal.seq; + + /* + * If we're at the end of the journal, we really want to free up space + * in the journal right away - we don't want to pin that old journal + * sequence number with a new btree node write, we want to re-journal + * the update + */ + if (ck->journal.seq == journal_last_seq(j)) + commit_flags |= BCH_WATERMARK_reclaim; + + if (ck->journal.seq != journal_last_seq(j) || + !journal_low_on_space(&c->journal)) + commit_flags |= BCH_TRANS_COMMIT_no_journal_res; + + struct bkey_s_c btree_k = bkey_try(bch2_btree_iter_peek_slot(&b_iter)); + + /* * Check that we're not violating cache coherency rules: */ + BUG_ON(bkey_deleted(btree_k.k)); + + try(bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_internal_snapshot_node| + BTREE_UPDATE_key_cache_reclaim| + BTREE_TRIGGER_norun)); + try(bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_skip_noops| + commit_flags)); + + bch2_journal_pin_drop(j, &ck->journal); + + struct btree_path *path = btree_iter_path(trans, &c_iter); + BUG_ON(!btree_node_locked(path, 0)); } - if (journal_seq && ck->journal.seq != journal_seq) - return 0; - - trans->journal_res.seq = ck->journal.seq; - - /* - * If we're at the end of the journal, we really want to free up space - * in the journal right away - we don't want to pin that old journal - * sequence number with a new btree node write, we want to re-journal - * the update - */ - if (ck->journal.seq == journal_last_seq(j)) - commit_flags |= BCH_WATERMARK_reclaim; - - if (ck->journal.seq != journal_last_seq(j) || - !journal_low_on_space(&c->journal)) - commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - - struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); - ret = bkey_err(btree_k); - if (ret) - goto err; - - /* * Check that we're not violating cache coherency rules: */ - BUG_ON(bkey_deleted(btree_k.k)); - - ret = bch2_trans_update(trans, &b_iter, ck->k, - BTREE_UPDATE_key_cache_reclaim| - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc| - commit_flags); -err: - bch2_fs_fatal_err_on(ret && - !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && - !bch2_journal_error(j), c, - "flushing key cache: %s", bch2_err_str(ret)); - if (ret) - goto out; - - bch2_journal_pin_drop(j, &ck->journal); - - struct btree_path *path = btree_iter_path(trans, &c_iter); - BUG_ON(!btree_node_locked(path, 0)); - if (!evict) { if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { clear_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_dec(&c->btree_key_cache.nr_dirty); } } else { + struct btree_path *path = btree_iter_path(trans, &c_iter); struct btree_path *path2; unsigned i; -evict: + trans_for_each_path(trans, path2, i) if (path2 != path) __bch2_btree_path_unlock(trans, path2); @@ -509,8 +495,8 @@ evict: six_unlock_intent(&ck->c.lock); } } -out: - return ret; + + return 0; } int bch2_btree_key_cache_journal_flush(struct journal *j, @@ -544,6 +530,10 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, ret = lockrestart_do(trans, btree_key_cache_flush_pos(trans, key, seq, BCH_TRANS_COMMIT_journal_reclaim, false)); + bch2_fs_fatal_err_on(ret && + !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && + !bch2_journal_error(j), c, + "flushing key cache: %s", bch2_err_str(ret)); unlock: srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); return ret; diff --git a/libbcachefs/btree/update.h b/libbcachefs/btree/update.h index 8797a445..af83ffdd 100644 --- a/libbcachefs/btree/update.h +++ b/libbcachefs/btree/update.h @@ -28,6 +28,7 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, x(no_check_rw, "don't attempt to take a ref on c->writes") \ x(no_journal_res, "don't take a journal reservation, instead " \ "pin journal entry referred to by trans->journal_res.seq") \ + x(no_skip_noops, "don't drop noop updates") \ x(journal_reclaim, "operation required for journal reclaim; may return error" \ "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\ x(skip_accounting_apply, "we're in journal replay - accounting updates have already been applied") diff --git a/libbcachefs/btree/write.c b/libbcachefs/btree/write.c index 0e86a7a8..f2f1ff17 100644 --- a/libbcachefs/btree/write.c +++ b/libbcachefs/btree/write.c @@ -109,7 +109,7 @@ static int btree_node_write_update_key(struct btree_trans *trans, bch2_bkey_drop_ptrs(bkey_i_to_s(n), p, entry, bch2_dev_list_has_dev(wbio->wbio.failed, p.ptr.dev)); - if (!bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&wbio->key))) + if (!bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(n))) return bch_err_throw(c, btree_node_write_all_failed); return bch2_btree_node_update_key(trans, &iter, b, n, @@ -127,7 +127,6 @@ static void btree_node_write_work(struct work_struct *work) struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; u64 start_time = wbio->start_time; - int ret = 0; bch2_btree_bounce_free(c, wbio->data_bytes, @@ -135,7 +134,7 @@ static void btree_node_write_work(struct work_struct *work) wbio->data); if (!wbio->wbio.first_btree_write || wbio->wbio.failed.nr) { - ret = bch2_trans_do(c, btree_node_write_update_key(trans, wbio, b)); + int ret = bch2_trans_do(c, btree_node_write_update_key(trans, wbio, b)); if (ret) { set_btree_node_noevict(b); diff --git a/libbcachefs/btree/write_buffer.c b/libbcachefs/btree/write_buffer.c index 25c97d3a..ac60e3ca 100644 --- a/libbcachefs/btree/write_buffer.c +++ b/libbcachefs/btree/write_buffer.c @@ -135,6 +135,7 @@ static noinline int wb_flush_one_slowpath(struct btree_trans *trans, BCH_TRANS_COMMIT_no_enospc| BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_journal_res| + BCH_TRANS_COMMIT_no_skip_noops| BCH_TRANS_COMMIT_journal_reclaim); } @@ -142,7 +143,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite struct btree_write_buffered_key *wb, bool *write_locked, bool *accounting_accumulated, - size_t *fast) + size_t *fast, size_t *noop) { struct btree_path *path; @@ -171,6 +172,21 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite path = btree_iter_path(trans, iter); + struct btree_path_level *l = path_l(path); + struct bkey_packed *old_p = bch2_btree_node_iter_peek_all(&l->iter, l->b); + if (old_p && bkey_cmp_left_packed(l->b, old_p, &wb->k.k.p)) + old_p = NULL; + + struct bkey old_u; + struct bkey_s_c old = old_p + ? bkey_disassemble(l->b, old_p, &old_u) + : bkey_s_c_null; + + if (old.k && bkey_and_val_eq(old, bkey_i_to_s_c(&wb->k))) { + (*noop)++; + return 0; + } + if (!*write_locked) { try(bch2_btree_node_lock_write(trans, path, &path->l[0].b->c)); @@ -282,7 +298,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; struct btree_iter iter = { NULL }; - size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; + size_t overwritten = 0, fast = 0, noop = 0, slowpath = 0, could_not_insert = 0; bool write_locked = false; bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); int ret = 0; @@ -394,7 +410,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) } ret = wb_flush_one(trans, &iter, k, &write_locked, - &accounting_accumulated, &fast); + &accounting_accumulated, &fast, &noop); if (!write_locked) bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); @@ -495,7 +511,7 @@ err: bch2_time_stats_update(&c->times[BCH_TIME_btree_write_buffer_flush], start_time); bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret)); - trace_write_buffer_flush(trans, nr_flushing, overwritten, fast); + trace_write_buffer_flush(trans, nr_flushing, overwritten, fast, noop); return ret; } @@ -642,11 +658,16 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) */ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, struct bkey_s_c referring_k, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *f) { struct bch_fs *c = trans->c; - if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { + if (f->seen_error && + f->nr_flushes > 32 && + f->nr_flushes * 8 > f->nr_done) + return 0; + + if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(f->last_flushed.k))) { if (trace_write_buffer_maybe_flush_enabled()) { CLASS(printbuf, buf)(); @@ -665,13 +686,15 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, try(bch2_btree_write_buffer_flush_sync(trans)); - bch2_bkey_buf_copy(last_flushed, tmp.k); + bch2_bkey_buf_copy(&f->last_flushed, tmp.k); + f->nr_flushes++; /* can we avoid the unconditional restart? */ trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_); return bch_err_throw(c, transaction_restart_write_buffer_flush); } + f->seen_error = true; return 0; } diff --git a/libbcachefs/btree/write_buffer.h b/libbcachefs/btree/write_buffer.h index 177cfda3..ffc9b199 100644 --- a/libbcachefs/btree/write_buffer.h +++ b/libbcachefs/btree/write_buffer.h @@ -3,6 +3,7 @@ #define _BCACHEFS_BTREE_WRITE_BUFFER_H #include "btree/bkey.h" +#include "btree/bkey_buf.h" #include "alloc/accounting.h" static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) @@ -25,8 +26,31 @@ bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *); int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); int bch2_btree_write_buffer_tryflush(struct btree_trans *); -struct bkey_buf; -int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct bkey_buf *); +struct wb_maybe_flush { + struct bkey_buf last_flushed; + u64 nr_flushes; + u64 nr_done; + bool seen_error; +}; + +static inline void wb_maybe_flush_exit(struct wb_maybe_flush *f) +{ + bch2_bkey_buf_exit(&f->last_flushed); +} + +static inline void wb_maybe_flush_init(struct wb_maybe_flush *f) +{ + memset(f, 0, sizeof(*f)); + bch2_bkey_buf_init(&f->last_flushed); +} + +static inline int wb_maybe_flush_inc(struct wb_maybe_flush *f) +{ + f->nr_done++; + return 0; +} + +int bch2_btree_write_buffer_maybe_flush(struct btree_trans *, struct bkey_s_c, struct wb_maybe_flush *); struct journal_keys_to_wb { struct btree_write_buffer_keys *wb; diff --git a/libbcachefs/data/ec.c b/libbcachefs/data/ec.c index 0d7a777e..5d62d98c 100644 --- a/libbcachefs/data/ec.c +++ b/libbcachefs/data/ec.c @@ -1035,7 +1035,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, struct bkey_s_c_backpointer bp, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_fs *c = trans->c; @@ -1123,8 +1123,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); - struct bkey_buf last_flushed __cleanup(bch2_bkey_buf_exit); - bch2_bkey_buf_init(&last_flushed); + struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); + wb_maybe_flush_init(&last_flushed); return for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, bucket_pos_to_bp_start(ca, bucket_pos), @@ -1142,6 +1142,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b if (bp.v->btree_id == BTREE_ID_stripes) continue; + wb_maybe_flush_inc(&last_flushed); ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, bp, &last_flushed); })); } @@ -2243,7 +2244,7 @@ int bch2_fs_ec_init(struct bch_fs *c) static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans, struct bkey_s_c k, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { if (k.k->type != KEY_TYPE_stripe) return 0; @@ -2258,8 +2259,8 @@ static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans, int bch2_check_stripe_to_lru_refs(struct btree_trans *trans) { - struct bkey_buf last_flushed __cleanup(bch2_bkey_buf_exit); - bch2_bkey_buf_init(&last_flushed); + struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); + wb_maybe_flush_init(&last_flushed); return for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, POS_MIN, BTREE_ITER_prefetch, k, diff --git a/libbcachefs/data/migrate.c b/libbcachefs/data/migrate.c index c89d2af0..1c7bbd8b 100644 --- a/libbcachefs/data/migrate.c +++ b/libbcachefs/data/migrate.c @@ -98,7 +98,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, static int bch2_dev_btree_drop_key(struct btree_trans *trans, struct bkey_s_c_backpointer bp, unsigned dev_idx, - struct bkey_buf *last_flushed, + struct wb_maybe_flush *last_flushed, unsigned flags, struct printbuf *err) { CLASS(btree_iter_uninit, iter)(trans); @@ -185,7 +185,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, } static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx, - struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed, + struct bkey_s_c_backpointer bp, struct wb_maybe_flush *last_flushed, unsigned flags, struct printbuf *err) { CLASS(btree_iter_uninit, iter)(trans); @@ -218,8 +218,8 @@ int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsig { CLASS(btree_trans, trans)(c); - struct bkey_buf last_flushed __cleanup(bch2_bkey_buf_exit); - bch2_bkey_buf_init(&last_flushed); + struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); + wb_maybe_flush_init(&last_flushed); return bch2_btree_write_buffer_flush_sync(trans) ?: for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, @@ -229,6 +229,7 @@ int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsig if (k.k->type != KEY_TYPE_backpointer) continue; + wb_maybe_flush_inc(&last_flushed); data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), &last_flushed, flags, err); diff --git a/libbcachefs/data/move.c b/libbcachefs/data/move.c index a4bc083c..72dcdf96 100644 --- a/libbcachefs/data/move.c +++ b/libbcachefs/data/move.c @@ -572,8 +572,8 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); - struct bkey_buf last_flushed __cleanup(bch2_bkey_buf_exit); - bch2_bkey_buf_init(&last_flushed); + struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); + wb_maybe_flush_init(&last_flushed); /* * We're not run in a context that handles transaction restarts: diff --git a/libbcachefs/data/rebalance.c b/libbcachefs/data/rebalance.c index 9dc68480..115e35f9 100644 --- a/libbcachefs/data/rebalance.c +++ b/libbcachefs/data/rebalance.c @@ -907,7 +907,7 @@ int bch2_fs_rebalance_init(struct bch_fs *c) static int check_rebalance_work_one(struct btree_trans *trans, struct btree_iter *extent_iter, struct btree_iter *rebalance_iter, - struct bkey_buf *last_flushed) + struct wb_maybe_flush *last_flushed) { struct bch_fs *c = trans->c; CLASS(printbuf, buf)(); @@ -983,8 +983,8 @@ int bch2_check_rebalance_work(struct bch_fs *c) CLASS(btree_iter, rebalance_iter)(trans, BTREE_ID_rebalance_work, POS_MIN, BTREE_ITER_prefetch); - struct bkey_buf last_flushed __cleanup(bch2_bkey_buf_exit); - bch2_bkey_buf_init(&last_flushed); + struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit); + wb_maybe_flush_init(&last_flushed); struct progress_indicator_state progress; bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_rebalance_work)); @@ -992,6 +992,7 @@ int bch2_check_rebalance_work(struct bch_fs *c) int ret = 0; while (!(ret = lockrestart_do(trans, progress_update_iter(trans, &progress, &rebalance_iter) ?: + wb_maybe_flush_inc(&last_flushed) ?: check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed)))) ; diff --git a/libbcachefs/data/update.c b/libbcachefs/data/update.c index fd7eda88..0076fbda 100644 --- a/libbcachefs/data/update.c +++ b/libbcachefs/data/update.c @@ -764,11 +764,20 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) } if (!nr_replicas) { - if (trace) { - prt_printf(&buf, "\nnr_replicas %u < %u", nr_replicas, m->op.nr_replicas); - trace_data_update_fail(c, buf.buf); + /* + * If it's a promote that's failing because the promote target + * is full - we expect that in normal operation; it'll still + * show up in io_read_nopromote and error_throw: + */ + if (m->opts.type != BCH_DATA_UPDATE_promote) { + if (trace) { + prt_printf(&buf, " - got replicas %u\n", nr_replicas); + bch2_data_update_to_text(&buf, m); + prt_printf(&buf, "\nret:\t%s\n", bch2_err_str(-BCH_ERR_data_update_fail_no_rw_devs)); + trace_data_update_fail(c, buf.buf); + } + count_event(c, data_update_fail); } - count_event(c, data_update_fail); return bch_err_throw(c, data_update_fail_no_rw_devs); } diff --git a/libbcachefs/debug/trace.h b/libbcachefs/debug/trace.h index bb64f5eb..c84bccd1 100644 --- a/libbcachefs/debug/trace.h +++ b/libbcachefs/debug/trace.h @@ -1246,23 +1246,25 @@ TRACE_EVENT(key_cache_fill, ); TRACE_EVENT(write_buffer_flush, - TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast), - TP_ARGS(trans, nr, skipped, fast), + TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t noop), + TP_ARGS(trans, nr, skipped, fast, noop), TP_STRUCT__entry( __field(size_t, nr ) __field(size_t, skipped ) __field(size_t, fast ) + __field(size_t, noop ) ), TP_fast_assign( __entry->nr = nr; __entry->skipped = skipped; __entry->fast = fast; + __entry->noop = noop; ), - TP_printk("flushed %zu skipped %zu fast %zu", - __entry->nr, __entry->skipped, __entry->fast) + TP_printk("flushed %zu skipped %zu fast %zu noop %zu", + __entry->nr, __entry->skipped, __entry->fast, __entry->noop) ); TRACE_EVENT(write_buffer_flush_sync, diff --git a/libbcachefs/init/dev.c b/libbcachefs/init/dev.c index 914c82f3..8671db2e 100644 --- a/libbcachefs/init/dev.c +++ b/libbcachefs/init/dev.c @@ -514,7 +514,8 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, new_online_devs = c->online_devs; __clear_bit(ca->dev_idx, new_online_devs.d); - return bch2_have_enough_devs(c, new_online_devs, flags, err); + return bch2_have_enough_devs(c, new_online_devs, flags, err, + test_bit(BCH_FS_rw, &c->flags)); default: BUG(); } diff --git a/libbcachefs/init/fs.c b/libbcachefs/init/fs.c index 829b71a4..2cdf1d41 100644 --- a/libbcachefs/init/fs.c +++ b/libbcachefs/init/fs.c @@ -451,8 +451,6 @@ bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) static int __bch2_fs_read_write(struct bch_fs *c, bool early) { - int ret; - BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) @@ -473,17 +471,9 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch_info(c, "going read-write"); - ret = bch2_fs_init_rw(c); - if (ret) - return ret; - - ret = bch2_sb_members_v2_init(c); - if (ret) - return ret; - - ret = bch2_fs_mark_dirty(c); - if (ret) - return ret; + try(bch2_fs_init_rw(c)); + try(bch2_sb_members_v2_init(c)); + try(bch2_fs_mark_dirty(c)); clear_bit(BCH_FS_clean_shutdown, &c->flags); @@ -518,22 +508,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) enumerated_ref_start(&c->writes); - ret = bch2_journal_reclaim_start(&c->journal); + int ret = bch2_journal_reclaim_start(&c->journal) ?: + bch2_copygc_start(c) ?: + bch2_rebalance_start(c); if (ret) { - bch_err_msg(c, ret, "error starting journal reclaim thread"); - goto err; - } - - ret = bch2_copygc_start(c); - if (ret) { - bch_err_msg(c, ret, "error starting copygc thread"); - goto err; - } - - ret = bch2_rebalance_start(c); - if (ret) { - bch_err_msg(c, ret, "error starting rebalance thread"); - goto err; + bch2_fs_read_only(c); + return ret; } bch2_do_discards(c); @@ -541,12 +521,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_do_stripe_deletes(c); bch2_do_pending_node_rewrites(c); return 0; -err: - if (test_bit(BCH_FS_rw, &c->flags)) - bch2_fs_read_only(c); - else - __bch2_fs_read_only(c); - return ret; } int bch2_fs_read_write(struct bch_fs *c) @@ -905,8 +879,6 @@ static bool check_version_upgrade(struct bch_fs *c) noinline_for_stack static int bch2_fs_opt_version_init(struct bch_fs *c) { - int ret = 0; - if (c->opts.norecovery) { c->opts.recovery_pass_last = c->opts.recovery_pass_last ? min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) @@ -974,9 +946,7 @@ static int bch2_fs_opt_version_init(struct bch_fs *c) if (!ext) return bch_err_throw(c, ENOSPC_sb); - ret = bch2_sb_members_v2_init(c); - if (ret) - return ret; + try(bch2_sb_members_v2_init(c)); __le64 now = cpu_to_le64(ktime_get_real_seconds()); scoped_guard(rcu) @@ -1370,7 +1340,7 @@ static bool bch2_fs_may_start(struct bch_fs *c) } CLASS(printbuf, err)(); - bool ret = bch2_have_enough_devs(c, c->online_devs, flags, &err); + bool ret = bch2_have_enough_devs(c, c->online_devs, flags, &err, !c->opts.read_only); if (!ret) bch2_print_str(c, KERN_ERR, err.buf); return ret; diff --git a/libbcachefs/init/recovery.c b/libbcachefs/init/recovery.c index 000124d1..ae927a49 100644 --- a/libbcachefs/init/recovery.c +++ b/libbcachefs/init/recovery.c @@ -368,6 +368,7 @@ int bch2_journal_replay(struct bch_fs *c) ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_skip_noops| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_skip_accounting_apply| BCH_TRANS_COMMIT_no_journal_res| @@ -400,6 +401,7 @@ int bch2_journal_replay(struct bch_fs *c) ret = c->journal.watermark ? -1 : commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_skip_noops| BCH_TRANS_COMMIT_journal_reclaim| BCH_TRANS_COMMIT_skip_accounting_apply| (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0), @@ -429,6 +431,7 @@ int bch2_journal_replay(struct bch_fs *c) ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_skip_noops| BCH_TRANS_COMMIT_skip_accounting_apply| (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim @@ -1082,6 +1085,11 @@ int bch2_fs_initialize(struct bch_fs *c) scoped_guard(mutex, &c->sb_lock) { SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + memset(ext->errors_silent, 0, sizeof(ext->errors_silent)); + memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required)); + bch2_write_super(c); } diff --git a/libbcachefs/journal/journal.c b/libbcachefs/journal/journal.c index df54b71c..91d80258 100644 --- a/libbcachefs/journal/journal.c +++ b/libbcachefs/journal/journal.c @@ -1147,14 +1147,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) } for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->mi.durability) - continue; - struct journal_device *ja = &ca->journal; - - if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) - continue; - if (!ja->nr) continue; diff --git a/libbcachefs/journal/read.c b/libbcachefs/journal/read.c index e516c3db..2a697a23 100644 --- a/libbcachefs/journal/read.c +++ b/libbcachefs/journal/read.c @@ -390,15 +390,15 @@ static int journal_validate_key(struct bch_fs *c, bch2_bkey_compat(from.level, from.btree, version, big_endian, write, NULL, bkey_to_packed(k)); - ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); - if (ret == -BCH_ERR_fsck_delete_bkey) { + if (journal_entry_err_on(ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from), + c, version, jset, entry, + journal_entry_bkey_bad_format, + "bkey validate error %s", bch2_err_str(ret))) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); journal_entry_null_range(vstruct_next(entry), next); return FSCK_DELETED_KEY; } - if (ret) - goto fsck_err; if (write) bch2_bkey_compat(from.level, from.btree, version, big_endian, diff --git a/libbcachefs/journal/reclaim.c b/libbcachefs/journal/reclaim.c index 9f9bef7b..0c9bb010 100644 --- a/libbcachefs/journal/reclaim.c +++ b/libbcachefs/journal/reclaim.c @@ -156,8 +156,7 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne (totalram_pages() * PAGE_SIZE) / 4 - j->dirty_entry_bytes); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->journal.nr || - !ca->mi.durability) + if (!ca->journal.nr) continue; min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); diff --git a/libbcachefs/journal/write.c b/libbcachefs/journal/write.c index 7b70156b..0e6deef8 100644 --- a/libbcachefs/journal/write.c +++ b/libbcachefs/journal/write.c @@ -74,9 +74,7 @@ static void __journal_write_alloc(struct journal *j, * Check that we can use this device, and aren't already using * it: */ - if (!ca->mi.durability || - ca->mi.state != BCH_MEMBER_STATE_rw || - !ja->nr || + if (!ja->nr || bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || sectors > ja->sectors_free) { enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 425a751c..535170e3 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -6,6 +6,7 @@ #include "bcachefs.h" #include "opts.h" +#include "alloc/background.h" #include "alloc/disk_groups.h" #include "data/compress.h" @@ -601,6 +602,15 @@ void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, bch2_write_super(c); } break; + case Opt_durability: + if (test_bit(BCH_FS_rw, &c->flags) && + ca && + bch2_dev_is_online(ca) && + ca->mi.state == BCH_MEMBER_STATE_rw) { + guard(rcu)(); + bch2_dev_allocator_set_rw(c, ca, true); + } + break; case Opt_version_upgrade: /* * XXX: in the future we'll likely want to do compatible diff --git a/libbcachefs/sb/errors_format.h b/libbcachefs/sb/errors_format.h index fbdb7b7d..42070f99 100644 --- a/libbcachefs/sb/errors_format.h +++ b/libbcachefs/sb/errors_format.h @@ -340,7 +340,8 @@ enum bch_fsck_flags { x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 329, 0) + x(validate_error_in_commit, 329, 0) \ + x(MAX, 330, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/sb/io.c b/libbcachefs/sb/io.c index 1bf35308..11842325 100644 --- a/libbcachefs/sb/io.c +++ b/libbcachefs/sb/io.c @@ -1171,13 +1171,13 @@ int bch2_write_super(struct bch_fs *c) nr_wrote = dev_mask_nr(&sb_written); can_mount_with_written = - bch2_have_enough_devs(c, sb_written, degraded_flags, NULL); + bch2_can_read_fs_with_devs(c, sb_written, degraded_flags, NULL); for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++) sb_written.d[i] = ~sb_written.d[i]; can_mount_without_written = - bch2_have_enough_devs(c, sb_written, degraded_flags, NULL); + bch2_can_read_fs_with_devs(c, sb_written, degraded_flags, NULL); /* * If we would be able to mount _without_ the devices we successfully