From 47360ea69f34b24cf2d95f65a81cbcc721049e1a Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sun, 28 Sep 2025 15:54:47 -0400
Subject: [PATCH] Update bcachefs sources to 446f76b78b1e bcachefs: Fix
 promote path nocow deadlock

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 .bcachefs_revision            |   2 +-
 libbcachefs/data_update.c     |  47 ++++++--
 libbcachefs/disk_accounting.c | 215 +++++++++++++++++++---------------
 libbcachefs/extents.c         |  33 ++++++
 libbcachefs/extents.h         |   1 +
 libbcachefs/journal.c         |   2 +-
 libbcachefs/nocow_locking.c   |   9 ++
 7 files changed, 197 insertions(+), 112 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index cbc331cf..7a32daaf 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-b0e2c6125656a3ef24d52fa60546dc30d2dd17f3
+446f76b78b1e368462bf5b4d772777ce444fe0a5
diff --git a/libbcachefs/data_update.c b/libbcachefs/data_update.c
index 6615c868..3b071f9a 100644
--- a/libbcachefs/data_update.c
+++ b/libbcachefs/data_update.c
@@ -35,7 +35,8 @@ static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 
 	bkey_for_each_ptr(ptrs, ptr)
-		bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
+		if (ptr->dev != BCH_SB_MEMBER_INVALID)
+			bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
 }
 
 static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
@@ -43,7 +44,8 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 
 	bkey_for_each_ptr(ptrs, ptr) {
-		if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
+		if (ptr->dev != BCH_SB_MEMBER_INVALID &&
+		    unlikely(!bch2_dev_tryget(c, ptr->dev))) {
 			bkey_for_each_ptr(ptrs, ptr2) {
 				if (ptr2 == ptr)
 					break;
@@ -86,15 +88,18 @@ static void trace_io_move_fail2(struct data_update *m,
 			 struct bkey_i *insert,
 			 const char *msg)
 {
-	struct bch_fs *c = m->op.c;
-	struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
-	CLASS(printbuf, buf)();
-	unsigned rewrites_found = 0;
-
 	if (!trace_io_move_fail_enabled())
 		return;
 
+	struct bch_fs *c = m->op.c;
+	struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+	unsigned rewrites_found = 0;
+
+	CLASS(printbuf, buf)();
+	printbuf_indent_add_nextline(&buf, 2);
+	prt_str(&buf, msg);
+	prt_newline(&buf);
 
 	if (insert) {
 		const union bch_extent_entry *entry;
@@ -117,17 +122,17 @@ static void trace_io_move_fail2(struct data_update *m,
 
 	bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
 
-	prt_str(&buf, "\nold: ");
+	prt_str_indented(&buf, "\nold: ");
 	bch2_bkey_val_to_text(&buf, c, old);
 
-	prt_str(&buf, "\nnew: ");
+	prt_str_indented(&buf, "\nnew: ");
 	bch2_bkey_val_to_text(&buf, c, new);
 
-	prt_str(&buf, "\nwrote: ");
+	prt_str_indented(&buf, "\nwrote: ");
 	bch2_bkey_val_to_text(&buf, c, wrote);
 
 	if (insert) {
-		prt_str(&buf, "\ninsert: ");
+		prt_str_indented(&buf, "\ninsert: ");
 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
 	}
 
@@ -263,6 +268,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 		bch2_cut_back(new->k.p, insert);
 		bch2_cut_back(insert->k.p, &new->k_i);
 
+		bch2_bkey_propagate_incompressible(insert, bkey_i_to_s_c(&new->k_i));
+
 		/*
 		 * @old: extent that we read from
 		 * @insert: key that we're going to update, initialized from
@@ -304,7 +311,9 @@ restart_drop_conflicting_replicas:
 	}
 
 	if (!bkey_val_u64s(&new->k)) {
-		trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
+		trace_io_move_fail2(m, k,
+				    bkey_i_to_s_c(bch2_keylist_front(&op->insert_keys)),
+				    insert, "new replicas conflicted:");
 		goto nowork;
 	}
 
@@ -745,7 +754,8 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 	struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
 
 	darray_for_each(m->op.devs_have, i)
-		__clear_bit(*i, devs.d);
+		if (*i != BCH_SB_MEMBER_INVALID)
+			__clear_bit(*i, devs.d);
 
 	CLASS(printbuf, buf)();
 
@@ -963,6 +973,16 @@ int bch2_data_update_init(struct btree_trans *trans,
 	if (c->opts.nocow_enabled) {
 		if (!bch2_bkey_nocow_trylock(c, ptrs, 0)) {
+			if (!ctxt) {
+				/* We're being called from the promote path:
+				 * there is a btree_trans on the stack that's
+				 * holding locks, but we don't have a pointer to
+				 * it. Ouch - this needs to be fixed.
+				 */
+				ret = bch_err_throw(c, nocow_lock_blocked);
+				goto out_put_dev_refs;
+			}
+
 			bool locked = false;
 
 			if (ctxt)
 				move_ctxt_wait_event(ctxt,
@@ -992,6 +1012,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 out_nocow_unlock:
 	if (c->opts.nocow_enabled)
 		bch2_bkey_nocow_unlock(c, k, 0);
+out_put_dev_refs:
 	bkey_put_dev_refs(c, k);
 out:
 	bch2_disk_reservation_put(c, &m->op.res);
diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c
index 22b2dbe8..19f2513b 100644
--- a/libbcachefs/disk_accounting.c
+++ b/libbcachefs/disk_accounting.c
@@ -786,102 +786,10 @@ static struct journal_key *accumulate_and_read_journal_accounting(struct btree_t
 	return ret ? ERR_PTR(ret) : next;
 }
 
-/*
- * At startup time, initialize the in memory accounting from the btree (and
- * journal)
- */
-int bch2_accounting_read(struct bch_fs *c)
+static int accounting_read_mem_fixups(struct btree_trans *trans)
 {
+	struct bch_fs *c = trans->c;
 	struct bch_accounting_mem *acc = &c->accounting;
-	CLASS(btree_trans, trans)(c);
-	CLASS(printbuf, buf)();
-
-	/*
-	 * We might run more than once if we rewind to start topology repair or
-	 * btree node scan - and those might cause us to get different results,
-	 * so we can't just skip if we've already run.
-	 *
-	 * Instead, zero out any accounting we have:
-	 */
-	scoped_guard(percpu_write, &c->mark_lock) {
-		darray_for_each(acc->k, e)
-			percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
-		for_each_member_device(c, ca)
-			percpu_memset(ca->usage, 0, sizeof(*ca->usage));
-		percpu_memset(c->usage, 0, sizeof(*c->usage));
-	}
-
-	struct journal_keys *keys = &c->journal_keys;
-	struct journal_key *jk = keys->data;
-
-	move_gap(keys, keys->nr);
-
-	while (jk < &darray_top(*keys) &&
-	       __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
-		jk++;
-
-	struct journal_key *end = jk;
-	while (end < &darray_top(*keys) &&
-	       __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
-		end++;
-
-	struct btree_iter iter;
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
-			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
-	iter.flags &= ~BTREE_ITER_with_journal;
-	int ret = for_each_btree_key_continue(trans, iter,
-				 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
-		if (k.k->type != KEY_TYPE_accounting)
-			continue;
-
-		while (jk < end &&
-		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
-			jk = accumulate_and_read_journal_accounting(trans, jk);
-
-		while (jk < end &&
-		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
-		       bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
-			jk->overwritten = true;
-			jk++;
-		}
-
-		if (jk < end &&
-		    __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
-			jk = accumulate_and_read_journal_accounting(trans, jk);
-
-		struct disk_accounting_pos acc_k;
-		bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
-		if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
-			break;
-
-		if (!bch2_accounting_is_mem(&acc_k)) {
-			struct disk_accounting_pos next_acc;
-			memset(&next_acc, 0, sizeof(next_acc));
-			next_acc.type = acc_k.type + 1;
-			struct bpos next = bpos_predecessor(disk_accounting_pos_to_bpos(&next_acc));
-			if (jk < end)
-				next = bpos_min(next, journal_key_k(c, jk)->k.p);
-
-			bch2_btree_iter_set_pos(&iter, next);
-			continue;
-		}
-
-		accounting_read_key(trans, k);
-	}));
-	bch2_trans_iter_exit(&iter);
-	if (ret)
-		return ret;
-
-	while (jk < end)
-		jk = accumulate_and_read_journal_accounting(trans, jk);
-
-	struct journal_key *dst = keys->data;
-	darray_for_each(*keys, i)
-		if (!i->overwritten)
-			*dst++ = *i;
-	keys->gap = keys->nr = dst - keys->data;
-
 	CLASS(printbuf, underflow_err)();
 
 	scoped_guard(percpu_write, &c->mark_lock) {
@@ -902,7 +810,7 @@ int bch2_accounting_read(struct bch_fs *c)
 		 * Remove it, so that if it's re-added it gets re-marked in the
 		 * superblock:
 		 */
-		ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
+		int ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
 			? -BCH_ERR_remove_disk_accounting_entry
 			: bch2_disk_accounting_validate_late(trans, &acc_k,
							     v, i->nr_counters);
@@ -984,7 +892,7 @@ int bch2_accounting_read(struct bch_fs *c)
 	if (underflow_err.pos) {
 		bool print = bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err);
 		unsigned pos = underflow_err.pos;
-		ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
+		int ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
 					BCH_RECOVERY_PASS_check_allocations, 0);
 		print |= underflow_err.pos != pos;
 
@@ -994,7 +902,120 @@ int bch2_accounting_read(struct bch_fs *c)
 			return ret;
 	}
 
-	return ret;
+	return 0;
+}
+
+/*
+ * At startup time, initialize the in memory accounting from the btree (and
+ * journal)
+ */
+int bch2_accounting_read(struct bch_fs *c)
+{
+	struct bch_accounting_mem *acc = &c->accounting;
+	CLASS(btree_trans, trans)(c);
+	CLASS(printbuf, buf)();
+
+	/*
+	 * We might run more than once if we rewind to start topology repair or
+	 * btree node scan - and those might cause us to get different results,
+	 * so we can't just skip if we've already run.
+	 *
+	 * Instead, zero out any accounting we have:
+	 */
+	scoped_guard(percpu_write, &c->mark_lock) {
+		darray_for_each(acc->k, e)
+			percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
+		for_each_member_device(c, ca)
+			percpu_memset(ca->usage, 0, sizeof(*ca->usage));
+		percpu_memset(c->usage, 0, sizeof(*c->usage));
+	}
+
+	struct journal_keys *keys = &c->journal_keys;
+	struct journal_key *jk = keys->data;
+
+	move_gap(keys, keys->nr);
+
+	/* Find the range of accounting keys from the journal: */
+
+	while (jk < &darray_top(*keys) &&
+	       __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
+		jk++;
+
+	struct journal_key *end = jk;
+	while (end < &darray_top(*keys) &&
+	       __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
+		end++;
+
+	/*
+	 * Iterate over btree and journal accounting simultaneously:
+	 *
+	 * We want to drop unneeded accounting deltas early - deltas that are
+	 * older than the btree accounting key version, and once we've dropped
+	 * old accounting deltas we can accumulate and compact deltas for the
+	 * same key:
+	 */
+
+	struct btree_iter iter;
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
+			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
+	iter.flags &= ~BTREE_ITER_with_journal;
+	int ret = for_each_btree_key_continue(trans, iter,
+				 BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
+		if (k.k->type != KEY_TYPE_accounting)
+			continue;
+
+		while (jk < end &&
+		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
+			jk = accumulate_and_read_journal_accounting(trans, jk);
+
+		while (jk < end &&
+		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
+		       bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
+			jk->overwritten = true;
+			jk++;
+		}
+
+		if (jk < end &&
+		    __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
+			jk = accumulate_and_read_journal_accounting(trans, jk);
+
+		struct disk_accounting_pos acc_k;
+		bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+		if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+			break;
+
+		if (!bch2_accounting_is_mem(&acc_k)) {
+			struct disk_accounting_pos next_acc;
+			memset(&next_acc, 0, sizeof(next_acc));
+			next_acc.type = acc_k.type + 1;
+			struct bpos next = disk_accounting_pos_to_bpos(&next_acc);
+			if (jk < end)
+				next = bpos_min(next, journal_key_k(c, jk)->k.p);
+
+			/* for_each_btree_key() will still advance iterator: */
+			bch2_btree_iter_set_pos(&iter, bpos_predecessor(next));
+			continue;
+		}
+
+		accounting_read_key(trans, k);
+	}));
+	bch2_trans_iter_exit(&iter);
+	if (ret)
+		return ret;
+
+	while (jk < end)
+		jk = accumulate_and_read_journal_accounting(trans, jk);
+
+	bch2_trans_unlock(trans);
+
+	struct journal_key *dst = keys->data;
+	darray_for_each(*keys, i)
+		if (!i->overwritten)
+			*dst++ = *i;
+	keys->gap = keys->nr = dst - keys->data;
+
+	return accounting_read_mem_fixups(trans);
 }
 
 int bch2_dev_usage_remove(struct bch_fs *c, struct bch_dev *ca)
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 3274ba42..42e89776 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -783,6 +783,39 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k)
 	return false;
 }
 
+void bch2_bkey_propagate_incompressible(struct bkey_i *dst, struct bkey_s_c src)
+{
+	if (!bch2_bkey_is_incompressible(src))
+		return;
+
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(dst));
+	union bch_extent_entry *entry;
+
+	/*
+	 * XXX: if some data actually is compressed, we want
+	 * bch_extent_rebalance.wont_recompress_smaller
+	 */
+
+	bkey_extent_entry_for_each(ptrs, entry) {
+		switch (extent_entry_type(entry)) {
+		case BCH_EXTENT_ENTRY_crc32:
+			if (entry->crc32.compression_type == BCH_COMPRESSION_TYPE_none)
+				entry->crc32.compression_type = BCH_COMPRESSION_TYPE_incompressible;
+			break;
+		case BCH_EXTENT_ENTRY_crc64:
+			if (entry->crc64.compression_type == BCH_COMPRESSION_TYPE_none)
+				entry->crc64.compression_type = BCH_COMPRESSION_TYPE_incompressible;
+			break;
+		case BCH_EXTENT_ENTRY_crc128:
+			if (entry->crc128.compression_type == BCH_COMPRESSION_TYPE_none)
+				entry->crc128.compression_type = BCH_COMPRESSION_TYPE_incompressible;
+			break;
+		default:
+			break;
+		}
+	}
+}
+
 unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
 {
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index 1ea9752b..1140aa2c 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -599,6 +599,7 @@ unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
 unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
 bool bch2_bkey_is_incompressible(struct bkey_s_c);
+void bch2_bkey_propagate_incompressible(struct bkey_i *, struct bkey_s_c);
 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
 
 unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 9058df47..18f99fac 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -698,7 +698,7 @@ static unsigned max_dev_latency(struct bch_fs *c)
 	u64 nsecs = 0;
 
 	guard(rcu)();
-	for_each_rw_member_rcu(c, ca)
+	for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
 		nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
 
 	return nsecs_to_jiffies(nsecs);
diff --git a/libbcachefs/nocow_locking.c b/libbcachefs/nocow_locking.c
index c8907070..73e14299 100644
--- a/libbcachefs/nocow_locking.c
+++ b/libbcachefs/nocow_locking.c
@@ -95,6 +95,9 @@ void bch2_bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k, int flags)
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 
 	bkey_for_each_ptr(ptrs, ptr) {
+		if (ptr->dev == BCH_SB_MEMBER_INVALID)
+			continue;
+
 		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
 		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
 
@@ -105,6 +108,9 @@ void bch2_bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k, int flags)
 bool bch2_bkey_nocow_trylock(struct bch_fs *c, struct bkey_ptrs_c ptrs, int flags)
 {
 	bkey_for_each_ptr(ptrs, ptr) {
+		if (ptr->dev == BCH_SB_MEMBER_INVALID)
+			continue;
+
 		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
 		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
 
@@ -144,6 +150,9 @@ void bch2_bkey_nocow_lock(struct bch_fs *c, struct bkey_ptrs_c ptrs, int flags)
 	darray_init(&buckets);
 
 	bkey_for_each_ptr(ptrs, ptr) {
+		if (ptr->dev == BCH_SB_MEMBER_INVALID)
+			continue;
+
 		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
 		u64 b = bucket_to_u64(PTR_BUCKET_POS(ca, ptr));
 		struct nocow_lock_bucket *l =