Update bcachefs sources to 446f76b78b1e bcachefs: Fix promote path nocow deadlock

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2025-09-28 15:54:47 -04:00
parent bb78070ade
commit 47360ea69f
7 changed files with 197 additions and 112 deletions
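
The deadlock being fixed: the promote path reaches bch2_data_update_init() without a moving context (ctxt == NULL) while a btree_trans on the stack is already holding btree locks, so blocking on a contended nocow bucket lock can deadlock against a writer that holds the nocow lock while waiting on those btree locks. The fix below fails the promote with nocow_lock_blocked instead of blocking. A minimal user-space sketch of the same trylock-or-bail pattern, with pthread mutexes standing in for the btree and nocow locks (all names here are hypothetical, not bcachefs APIs):

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t btree_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t nocow_lock = PTHREAD_MUTEX_INITIALIZER;

    /*
     * Promote-path stand-in: we already hold btree_lock, so blocking on
     * nocow_lock could deadlock (ABBA) - trylock and bail instead.
     */
    static int promote(void)
    {
            int ret = 0;

            pthread_mutex_lock(&btree_lock);

            if (pthread_mutex_trylock(&nocow_lock)) {
                    ret = -EAGAIN;  /* plays the role of nocow_lock_blocked */
                    goto out;
            }

            /* ... do the promote under both locks ... */

            pthread_mutex_unlock(&nocow_lock);
    out:
            pthread_mutex_unlock(&btree_lock);
            return ret;
    }

    int main(void)
    {
            printf("promote: %d\n", promote());
            return 0;
    }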

View File

@@ -1 +1 @@
-b0e2c6125656a3ef24d52fa60546dc30d2dd17f3
+446f76b78b1e368462bf5b4d772777ce444fe0a5

View File

@@ -35,7 +35,8 @@ static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k)
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 
 	bkey_for_each_ptr(ptrs, ptr)
-		bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
+		if (ptr->dev != BCH_SB_MEMBER_INVALID)
+			bch2_dev_put(bch2_dev_have_ref(c, ptr->dev));
 }
 
 static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
@@ -43,7 +44,8 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k)
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 
 	bkey_for_each_ptr(ptrs, ptr) {
-		if (unlikely(!bch2_dev_tryget(c, ptr->dev))) {
+		if (ptr->dev != BCH_SB_MEMBER_INVALID &&
+		    unlikely(!bch2_dev_tryget(c, ptr->dev))) {
 			bkey_for_each_ptr(ptrs, ptr2) {
 				if (ptr2 == ptr)
 					break;
@@ -86,15 +88,18 @@ static void trace_io_move_fail2(struct data_update *m,
 				struct bkey_i *insert,
 				const char *msg)
 {
-	struct bch_fs *c = m->op.c;
-	struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
-	CLASS(printbuf, buf)();
-	unsigned rewrites_found = 0;
-
 	if (!trace_io_move_fail_enabled())
 		return;
 
+	struct bch_fs *c = m->op.c;
+	struct bkey_s_c old = bkey_i_to_s_c(m->k.k);
+	unsigned rewrites_found = 0;
+
+	CLASS(printbuf, buf)();
+	printbuf_indent_add_nextline(&buf, 2);
+
 	prt_str(&buf, msg);
+	prt_newline(&buf);
 
 	if (insert) {
 		const union bch_extent_entry *entry;
@@ -117,17 +122,17 @@
 	bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts);
 
-	prt_str(&buf, "\nold: ");
+	prt_str_indented(&buf, "\nold: ");
 	bch2_bkey_val_to_text(&buf, c, old);
 
-	prt_str(&buf, "\nnew: ");
+	prt_str_indented(&buf, "\nnew: ");
 	bch2_bkey_val_to_text(&buf, c, new);
 
-	prt_str(&buf, "\nwrote: ");
+	prt_str_indented(&buf, "\nwrote: ");
 	bch2_bkey_val_to_text(&buf, c, wrote);
 
 	if (insert) {
-		prt_str(&buf, "\ninsert: ");
+		prt_str_indented(&buf, "\ninsert: ");
 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
 	}
@@ -263,6 +268,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
 		bch2_cut_back(new->k.p, insert);
 		bch2_cut_back(insert->k.p, &new->k_i);
 
+		bch2_bkey_propagate_incompressible(insert, bkey_i_to_s_c(&new->k_i));
+
 		/*
 		 * @old: extent that we read from
 		 * @insert: key that we're going to update, initialized from
@@ -304,7 +311,9 @@ restart_drop_conflicting_replicas:
 		}
 
 		if (!bkey_val_u64s(&new->k)) {
-			trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:");
+			trace_io_move_fail2(m, k,
+					    bkey_i_to_s_c(bch2_keylist_front(&op->insert_keys)),
+					    insert, "new replicas conflicted:");
 			goto nowork;
 		}
@@ -745,7 +754,8 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
 	struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target);
 
 	darray_for_each(m->op.devs_have, i)
-		__clear_bit(*i, devs.d);
+		if (*i != BCH_SB_MEMBER_INVALID)
+			__clear_bit(*i, devs.d);
 
 	CLASS(printbuf, buf)();
@@ -963,6 +973,16 @@ int bch2_data_update_init(struct btree_trans *trans,
 	if (c->opts.nocow_enabled) {
 		if (!bch2_bkey_nocow_trylock(c, ptrs, 0)) {
+			if (!ctxt) {
+				/* We're being called from the promote path:
+				 * there is a btree_trans on the stack that's
+				 * holding locks, but we don't have a pointer to
+				 * it. Ouch - this needs to be fixed.
+				 */
+				ret = bch_err_throw(c, nocow_lock_blocked);
+				goto out_put_dev_refs;
+			}
+
 			bool locked = false;
 
 			if (ctxt)
 				move_ctxt_wait_event(ctxt,
@@ -992,6 +1012,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 out_nocow_unlock:
 	if (c->opts.nocow_enabled)
 		bch2_bkey_nocow_unlock(c, k, 0);
+out_put_dev_refs:
 	bkey_put_dev_refs(c, k);
 out:
 	bch2_disk_reservation_put(c, &m->op.res);
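
The new out_put_dev_refs label above extends the function's staged error unwind: the promote-path bailout has already taken device refs but never took the nocow locks, so it must enter the unwind one stage below out_nocow_unlock. A generic, runnable sketch of the idiom, with hypothetical stand-in resources (malloc for the refs, a pthread mutex for the lock):

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Each acquisition gets its own label; an error jumps to the label
     * that releases exactly what has been taken so far. */
    static int update_init(int fail_stage)
    {
            int ret = 0;

            void *refs = malloc(64);        /* stage 1: take refs */
            if (!refs)
                    return -ENOMEM;

            if (fail_stage == 1) {
                    ret = -EAGAIN;
                    goto out_put_refs;      /* lock never taken: skip unlock */
            }

            pthread_mutex_lock(&lock);      /* stage 2: take the lock */

            if (fail_stage == 2) {
                    ret = -EIO;
                    goto out_unlock;        /* undo everything taken so far */
            }

            /* ... work under the lock ... */
    out_unlock:
            pthread_mutex_unlock(&lock);
    out_put_refs:
            free(refs);
            return ret;
    }

    int main(void)
    {
            printf("%d %d\n", update_init(1), update_init(0));
            return 0;
    }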

View File

@@ -786,102 +786,10 @@ static struct journal_key *accumulate_and_read_journal_accounting(struct btree_trans *trans,
 	return ret ? ERR_PTR(ret) : next;
 }
 
-/*
- * At startup time, initialize the in memory accounting from the btree (and
- * journal)
- */
-int bch2_accounting_read(struct bch_fs *c)
+static int accounting_read_mem_fixups(struct btree_trans *trans)
 {
+	struct bch_fs *c = trans->c;
 	struct bch_accounting_mem *acc = &c->accounting;
-	CLASS(btree_trans, trans)(c);
 	CLASS(printbuf, buf)();
 
-	/*
-	 * We might run more than once if we rewind to start topology repair or
-	 * btree node scan - and those might cause us to get different results,
-	 * so we can't just skip if we've already run.
-	 *
-	 * Instead, zero out any accounting we have:
-	 */
-	scoped_guard(percpu_write, &c->mark_lock) {
-		darray_for_each(acc->k, e)
-			percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
-		for_each_member_device(c, ca)
-			percpu_memset(ca->usage, 0, sizeof(*ca->usage));
-		percpu_memset(c->usage, 0, sizeof(*c->usage));
-	}
-
-	struct journal_keys *keys = &c->journal_keys;
-	struct journal_key *jk = keys->data;
-	move_gap(keys, keys->nr);
-
-	while (jk < &darray_top(*keys) &&
-	       __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
-		jk++;
-
-	struct journal_key *end = jk;
-	while (end < &darray_top(*keys) &&
-	       __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
-		end++;
-
-	struct btree_iter iter;
-	bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
-			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
-	iter.flags &= ~BTREE_ITER_with_journal;
-	int ret = for_each_btree_key_continue(trans, iter,
-				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
-		if (k.k->type != KEY_TYPE_accounting)
-			continue;
-
-		while (jk < end &&
-		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
-			jk = accumulate_and_read_journal_accounting(trans, jk);
-
-		while (jk < end &&
-		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
-		       bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
-			jk->overwritten = true;
-			jk++;
-		}
-
-		if (jk < end &&
-		    __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
-			jk = accumulate_and_read_journal_accounting(trans, jk);
-
-		struct disk_accounting_pos acc_k;
-		bpos_to_disk_accounting_pos(&acc_k, k.k->p);
-
-		if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
-			break;
-
-		if (!bch2_accounting_is_mem(&acc_k)) {
-			struct disk_accounting_pos next_acc;
-			memset(&next_acc, 0, sizeof(next_acc));
-			next_acc.type = acc_k.type + 1;
-			struct bpos next = bpos_predecessor(disk_accounting_pos_to_bpos(&next_acc));
-			if (jk < end)
-				next = bpos_min(next, journal_key_k(c, jk)->k.p);
-
-			bch2_btree_iter_set_pos(&iter, next);
-			continue;
-		}
-
-		accounting_read_key(trans, k);
-	}));
-	bch2_trans_iter_exit(&iter);
-	if (ret)
-		return ret;
-
-	while (jk < end)
-		jk = accumulate_and_read_journal_accounting(trans, jk);
-
-	struct journal_key *dst = keys->data;
-	darray_for_each(*keys, i)
-		if (!i->overwritten)
-			*dst++ = *i;
-	keys->gap = keys->nr = dst - keys->data;
-
 	CLASS(printbuf, underflow_err)();
 
 	scoped_guard(percpu_write, &c->mark_lock) {
@@ -902,7 +810,7 @@ int bch2_accounting_read(struct bch_fs *c)
 		 * Remove it, so that if it's re-added it gets re-marked in the
 		 * superblock:
 		 */
-		ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
+		int ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters)
 			? -BCH_ERR_remove_disk_accounting_entry
 			: bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters);
@@ -984,7 +892,7 @@ int bch2_accounting_read(struct bch_fs *c)
 	if (underflow_err.pos) {
 		bool print = bch2_count_fsck_err(c, accounting_key_underflow, &underflow_err);
 		unsigned pos = underflow_err.pos;
-		ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
+		int ret = bch2_run_explicit_recovery_pass(c, &underflow_err,
 						      BCH_RECOVERY_PASS_check_allocations, 0);
 		print |= underflow_err.pos != pos;
@@ -994,7 +902,120 @@ int bch2_accounting_read(struct bch_fs *c)
 			return ret;
 	}
 
-	return ret;
+	return 0;
+}
+
+/*
+ * At startup time, initialize the in memory accounting from the btree (and
+ * journal)
+ */
+int bch2_accounting_read(struct bch_fs *c)
+{
+	struct bch_accounting_mem *acc = &c->accounting;
+	CLASS(btree_trans, trans)(c);
+	CLASS(printbuf, buf)();
+
+	/*
+	 * We might run more than once if we rewind to start topology repair or
+	 * btree node scan - and those might cause us to get different results,
+	 * so we can't just skip if we've already run.
+	 *
+	 * Instead, zero out any accounting we have:
+	 */
+	scoped_guard(percpu_write, &c->mark_lock) {
+		darray_for_each(acc->k, e)
+			percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters);
+		for_each_member_device(c, ca)
+			percpu_memset(ca->usage, 0, sizeof(*ca->usage));
+		percpu_memset(c->usage, 0, sizeof(*c->usage));
+	}
+
+	struct journal_keys *keys = &c->journal_keys;
+	struct journal_key *jk = keys->data;
+	move_gap(keys, keys->nr);
+
+	/* Find the range of accounting keys from the journal: */
+	while (jk < &darray_top(*keys) &&
+	       __journal_key_cmp(c, BTREE_ID_accounting, 0, POS_MIN, jk) > 0)
+		jk++;
+
+	struct journal_key *end = jk;
+	while (end < &darray_top(*keys) &&
+	       __journal_key_cmp(c, BTREE_ID_accounting, 0, SPOS_MAX, end) > 0)
+		end++;
+
+	/*
+	 * Iterate over btree and journal accounting simultaneously:
+	 *
+	 * We want to drop unneeded accounting deltas early - deltas that are
+	 * older than the btree accounting key version, and once we've dropped
+	 * old accounting deltas we can accumulate and compact deltas for the
+	 * same key:
+	 */
+	struct btree_iter iter;
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN,
+			     BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
+	iter.flags &= ~BTREE_ITER_with_journal;
+	int ret = for_each_btree_key_continue(trans, iter,
+				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({
+		if (k.k->type != KEY_TYPE_accounting)
+			continue;
+
+		while (jk < end &&
+		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) > 0)
+			jk = accumulate_and_read_journal_accounting(trans, jk);
+
+		while (jk < end &&
+		       __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0 &&
+		       bversion_cmp(journal_key_k(c, jk)->k.bversion, k.k->bversion) <= 0) {
+			jk->overwritten = true;
+			jk++;
+		}
+
+		if (jk < end &&
+		    __journal_key_cmp(c, BTREE_ID_accounting, 0, k.k->p, jk) == 0)
+			jk = accumulate_and_read_journal_accounting(trans, jk);
+
+		struct disk_accounting_pos acc_k;
+		bpos_to_disk_accounting_pos(&acc_k, k.k->p);
+
+		if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR)
+			break;
+
+		if (!bch2_accounting_is_mem(&acc_k)) {
+			struct disk_accounting_pos next_acc;
+			memset(&next_acc, 0, sizeof(next_acc));
+			next_acc.type = acc_k.type + 1;
+			struct bpos next = disk_accounting_pos_to_bpos(&next_acc);
+			if (jk < end)
+				next = bpos_min(next, journal_key_k(c, jk)->k.p);
+
+			/* for_each_btree_key() will still advance iterator: */
+			bch2_btree_iter_set_pos(&iter, bpos_predecessor(next));
+			continue;
+		}
+
+		accounting_read_key(trans, k);
+	}));
+	bch2_trans_iter_exit(&iter);
+	if (ret)
+		return ret;
+
+	while (jk < end)
+		jk = accumulate_and_read_journal_accounting(trans, jk);
+
+	bch2_trans_unlock(trans);
+
+	struct journal_key *dst = keys->data;
+	darray_for_each(*keys, i)
+		if (!i->overwritten)
+			*dst++ = *i;
+	keys->gap = keys->nr = dst - keys->data;
+
+	return accounting_read_mem_fixups(trans);
 }
 
 int bch2_dev_usage_remove(struct bch_fs *c, struct bch_dev *ca)
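
The rewritten read path above merge-joins two sorted streams: accounting keys from the btree and accounting deltas from the journal. As the comment in the new bch2_accounting_read() says, journal deltas that are not newer than the btree key's version are dropped (they are already folded into the btree key), and remaining deltas for the same position are accumulated. A self-contained sketch of that three-step merge; the flattened key type is purely illustrative, not the real bcachefs layout:

    #include <stdio.h>

    /* Illustrative flattened key: both streams sorted by pos, each entry
     * carrying the version it was created at. */
    struct acct_key { int pos; int version; long delta; };

    /*
     * Same shape as the walk in the new bch2_accounting_read(): for each
     * btree key, (1) apply journal deltas at earlier positions, (2) drop
     * journal deltas at the same position that are not newer than the
     * btree key, (3) accumulate the newer ones at that position.
     */
    static void merge(const struct acct_key *btree, int n_btree,
                      const struct acct_key *journal, int n_journal)
    {
            int j = 0;

            for (int b = 0; b < n_btree; b++) {
                    while (j < n_journal && journal[j].pos < btree[b].pos) {
                            printf("apply delta %+ld at pos %d\n",
                                   journal[j].delta, journal[j].pos);
                            j++;
                    }

                    while (j < n_journal &&
                           journal[j].pos == btree[b].pos &&
                           journal[j].version <= btree[b].version) {
                            printf("drop stale delta at pos %d\n", journal[j].pos);
                            j++;
                    }

                    while (j < n_journal && journal[j].pos == btree[b].pos) {
                            printf("apply delta %+ld at pos %d\n",
                                   journal[j].delta, journal[j].pos);
                            j++;
                    }

                    printf("read btree key at pos %d (version %d)\n",
                           btree[b].pos, btree[b].version);
            }

            while (j < n_journal) {
                    printf("apply trailing delta at pos %d\n", journal[j].pos);
                    j++;
            }
    }

    int main(void)
    {
            const struct acct_key btree[]   = { { 1, 5, 0 }, { 3, 2, 0 } };
            const struct acct_key journal[] = { { 0, 9, 1 }, { 1, 4, 1 }, { 3, 7, 1 } };

            merge(btree, 2, journal, 3);
            return 0;
    }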

View File

@@ -783,6 +783,39 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k)
 	return false;
 }
 
+void bch2_bkey_propagate_incompressible(struct bkey_i *dst, struct bkey_s_c src)
+{
+	if (!bch2_bkey_is_incompressible(src))
+		return;
+
+	struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(dst));
+	union bch_extent_entry *entry;
+
+	/*
+	 * XXX: if some data actually is compressed, we want
+	 * bch_extent_rebalance.wont_recompress_smaller
+	 */
+	bkey_extent_entry_for_each(ptrs, entry) {
+		switch (extent_entry_type(entry)) {
+		case BCH_EXTENT_ENTRY_crc32:
+			if (entry->crc32.compression_type == BCH_COMPRESSION_TYPE_none)
+				entry->crc32.compression_type = BCH_COMPRESSION_TYPE_incompressible;
+			break;
+		case BCH_EXTENT_ENTRY_crc64:
+			if (entry->crc64.compression_type == BCH_COMPRESSION_TYPE_none)
+				entry->crc64.compression_type = BCH_COMPRESSION_TYPE_incompressible;
+			break;
+		case BCH_EXTENT_ENTRY_crc128:
+			if (entry->crc128.compression_type == BCH_COMPRESSION_TYPE_none)
+				entry->crc128.compression_type = BCH_COMPRESSION_TYPE_incompressible;
+			break;
+		default:
+			break;
+		}
+	}
+}
+
 unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
 {
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k)
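
bch2_bkey_propagate_incompressible() above carries the "incompressible" marker from a source extent to the still-uncompressed crc entries of a rewritten one, so later rewrites don't burn CPU re-attempting compression on data already known not to compress. A reduced model of the same sticky-flag propagation; the struct and enum here are simplified stand-ins, not the real bch_extent_crc layout:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-ins for the per-replica crc entries: */
    enum compression { COMP_NONE, COMP_LZ4, COMP_INCOMPRESSIBLE };

    struct extent {
            enum compression comp[4];
            int nr;
    };

    static bool extent_is_incompressible(const struct extent *e)
    {
            for (int i = 0; i < e->nr; i++)
                    if (e->comp[i] == COMP_INCOMPRESSIBLE)
                            return true;
            return false;
    }

    /* Mirror of the new helper: if the source extent was found to be
     * incompressible, upgrade the destination's uncompressed entries so
     * future rewrites skip recompression attempts. */
    static void propagate_incompressible(struct extent *dst, const struct extent *src)
    {
            if (!extent_is_incompressible(src))
                    return;

            for (int i = 0; i < dst->nr; i++)
                    if (dst->comp[i] == COMP_NONE)
                            dst->comp[i] = COMP_INCOMPRESSIBLE;
    }

    int main(void)
    {
            struct extent src = { .comp = { COMP_INCOMPRESSIBLE }, .nr = 1 };
            struct extent dst = { .comp = { COMP_NONE, COMP_LZ4 }, .nr = 2 };

            propagate_incompressible(&dst, &src);
            printf("dst.comp[0] = %d (2 == incompressible)\n", dst.comp[0]);
            return 0;
    }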

View File

@@ -599,6 +599,7 @@ unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
 unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
 bool bch2_bkey_is_incompressible(struct bkey_s_c);
+void bch2_bkey_propagate_incompressible(struct bkey_i *, struct bkey_s_c);
 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
 
 unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);

View File

@@ -698,7 +698,7 @@ static unsigned max_dev_latency(struct bch_fs *c)
 	u64 nsecs = 0;
 
 	guard(rcu)();
-	for_each_rw_member_rcu(c, ca)
+	for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal])
 		nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration);
 
 	return nsecs_to_jiffies(nsecs);

View File

@@ -95,6 +95,9 @@ void bch2_bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k, int flags)
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 
 	bkey_for_each_ptr(ptrs, ptr) {
+		if (ptr->dev == BCH_SB_MEMBER_INVALID)
+			continue;
+
 		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
 		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
@@ -105,6 +108,9 @@ void bch2_bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k, int flags)
 bool bch2_bkey_nocow_trylock(struct bch_fs *c, struct bkey_ptrs_c ptrs, int flags)
 {
 	bkey_for_each_ptr(ptrs, ptr) {
+		if (ptr->dev == BCH_SB_MEMBER_INVALID)
+			continue;
+
 		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
 		struct bpos bucket = PTR_BUCKET_POS(ca, ptr);
@@ -144,6 +150,9 @@ void bch2_bkey_nocow_lock(struct bch_fs *c, struct bkey_ptrs_c ptrs, int flags)
 	darray_init(&buckets);
 
 	bkey_for_each_ptr(ptrs, ptr) {
+		if (ptr->dev == BCH_SB_MEMBER_INVALID)
+			continue;
+
 		struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
 		u64 b = bucket_to_u64(PTR_BUCKET_POS(ca, ptr));
 		struct nocow_lock_bucket *l =
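
All three nocow-locking hunks above, like the data_update.c hunks earlier, add the same guard: an extent pointer whose dev is BCH_SB_MEMBER_INVALID refers to a device that no longer exists, so it must be skipped before bch2_dev_have_ref() is called on it. A toy version of the sentinel-skip loop; the constant and names are hypothetical:

    #include <stdio.h>

    #define MEMBER_INVALID 255      /* stand-in for BCH_SB_MEMBER_INVALID */

    struct ptr { unsigned dev; };

    int main(void)
    {
            const char *devs[] = { "dev0", "dev1" };
            struct ptr ptrs[] = { { 0 }, { MEMBER_INVALID }, { 1 } };

            /* Same shape as the bkey_for_each_ptr() loops in this commit:
             * skip the sentinel before resolving the device. */
            for (unsigned i = 0; i < sizeof(ptrs) / sizeof(ptrs[0]); i++) {
                    if (ptrs[i].dev == MEMBER_INVALID)
                            continue;
                    printf("locking bucket on %s\n", devs[ptrs[i].dev]);
            }
            return 0;
    }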