Update bcachefs sources to 3b80552e7057 bcachefs: __wait_for_freeing_inode: Switch to wait_bit_queue_entry

Kent Overstreet 2024-10-09 17:00:30 -04:00
parent f6e4cd2ef7
commit a9e1597adc
15 changed files with 248 additions and 78 deletions

View File

@@ -1 +1 @@
895d5e67afdccd7c9262630f5e7972721885284b
3b80552e70573764bbf38b89c58749aef9dd8753

View File

@@ -464,10 +464,10 @@ do { \
while (1) { \
closure_wait(waitlist, &cl); \
if (_cond) { \
_t = max(1, _until - jiffies); \
_t = max_t(long, 1L, _until - jiffies); \
break; \
} \
_t = max(0, _until - jiffies); \
_t = max_t(long, 0L, _until - jiffies); \
if (!_t) \
break; \
closure_sync_timeout(&cl, _t); \
@@ -485,7 +485,7 @@ do { \
({ \
unsigned long _until = jiffies + _timeout; \
(_cond) \
? max(1, _until - jiffies) \
? max_t(long, 1L, _until - jiffies) \
: __closure_wait_event_timeout(waitlist, _cond, _until);\
})
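The max() -> max_t(long, ...) conversion matters because jiffies arithmetic is unsigned: once _until is in the past, _until - jiffies wraps around to a huge unsigned value instead of going negative, so max(1, ...) never clamps it and the closure waits nearly forever. A minimal userspace sketch of the failure mode (plain C with illustrative variable names, not kernel code):

    #include <stdio.h>

    int main(void)
    {
            unsigned long jiffies = 1000, until = 990; /* deadline already passed */

            /* Unsigned subtraction wraps to ~ULONG_MAX - 9, not -10: */
            unsigned long broken = until - jiffies;

            /* What max_t(long, 1L, ...) does: compare as signed, then clamp: */
            long t = (long)(until - jiffies);
            long fixed = t > 1 ? t : 1;

            printf("broken=%lu fixed=%ld\n", broken, fixed);
            return 0;
    }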

View File

@@ -1224,17 +1224,20 @@ int bch2_gc_gens(struct bch_fs *c)
u64 b, start_time = local_clock();
int ret;
/*
* Ideally we would be using state_lock and not gc_gens_lock here, but that
* introduces a deadlock in the RO path - we currently take the state
* lock at the start of going RO, thus the gc thread may get stuck:
*/
if (!mutex_trylock(&c->gc_gens_lock))
return 0;
trace_and_count(c, gc_gens_start, c);
down_read(&c->state_lock);
/*
* We have to use trylock here. Otherwise, we would
* introduce a deadlock in the RO path - we take the
* state lock at the start of going RO.
*/
if (!down_read_trylock(&c->state_lock)) {
mutex_unlock(&c->gc_gens_lock);
return 0;
}
for_each_member_device(c, ca) {
struct bucket_gens *gens = bucket_gens(ca);
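The reshuffled locking avoids a lock-order inversion: going read-only takes state_lock at the outset, so if the GC thread then blocked on state_lock while holding gc_gens_lock and the RO path in turn waited for GC to finish, neither could make progress. A minimal sketch of the trylock-and-back-off pattern, with POSIX locks standing in for the kernel mutex/rwsem (illustrative names, not the bcachefs API):

    #include <pthread.h>

    static pthread_mutex_t gc_gens_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_rwlock_t state_lock = PTHREAD_RWLOCK_INITIALIZER;

    static int gc_gens_demo(void)
    {
            if (pthread_mutex_trylock(&gc_gens_lock))
                    return 0;               /* gc already running elsewhere */

            if (pthread_rwlock_tryrdlock(&state_lock)) {
                    /* fs is going RO; back off rather than deadlock */
                    pthread_mutex_unlock(&gc_gens_lock);
                    return 0;
            }

            /* ... collect bucket generations here ... */

            pthread_rwlock_unlock(&state_lock);
            pthread_mutex_unlock(&gc_gens_lock);
            return 0;
    }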

View File

@@ -1838,10 +1838,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
struct btree_trans *trans = bch2_trans_get(c);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
/* we don't need transaction context anymore after we got the lock. */
bch2_trans_put(trans);
__btree_node_write_done(c, b);
six_unlock_read(&b->c.lock);
bch2_trans_put(trans);
}
static void btree_node_write_work(struct work_struct *work)

View File

@@ -144,7 +144,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
!(ret = bkey_err(old_k)) &&
bkey_eq(old_pos, old_k.k->p)) {
struct bpos whiteout_pos =
SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))

View File

@@ -242,6 +242,14 @@ void bch2_accounting_swab(struct bkey_s k)
*p = swab64(*p);
}
static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
struct disk_accounting_pos acc)
{
unsafe_memcpy(r, &acc.replicas,
replicas_entry_bytes(&acc.replicas),
"variable length struct");
}
static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
{
struct disk_accounting_pos acc_k;
@@ -249,9 +257,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_replicas:
unsafe_memcpy(r, &acc_k.replicas,
replicas_entry_bytes(&acc_k.replicas),
"variable length struct");
__accounting_to_replicas(r, acc_k);
return true;
default:
return false;
@@ -608,6 +614,81 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
return ret;
}
static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
struct disk_accounting_pos acc,
u64 *v, unsigned nr)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
int ret = 0, invalid_dev = -1;
switch (acc.type) {
case BCH_DISK_ACCOUNTING_replicas: {
struct bch_replicas_padded r;
__accounting_to_replicas(&r.e, acc);
for (unsigned i = 0; i < r.e.nr_devs; i++)
if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
!bch2_dev_exists(c, r.e.devs[i])) {
invalid_dev = r.e.devs[i];
goto invalid_device;
}
/*
* All replicas entry checks except for invalid device are done
* in bch2_accounting_validate
*/
BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
trans, accounting_replicas_not_marked,
"accounting not marked in superblock replicas\n %s",
(printbuf_reset(&buf),
bch2_accounting_key_to_text(&buf, &acc),
buf.buf))) {
/*
* We're not RW yet and still single threaded, dropping
* and retaking lock is ok:
*/
percpu_up_write(&c->mark_lock);
ret = bch2_mark_replicas(c, &r.e);
if (ret)
goto fsck_err;
percpu_down_write(&c->mark_lock);
}
break;
}
case BCH_DISK_ACCOUNTING_dev_data_type:
if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
invalid_dev = acc.dev_data_type.dev;
goto invalid_device;
}
break;
}
fsck_err:
printbuf_exit(&buf);
return ret;
invalid_device:
if (fsck_err(trans, accounting_to_invalid_device,
"accounting entry points to invalid device %i\n %s",
invalid_dev,
(printbuf_reset(&buf),
bch2_accounting_key_to_text(&buf, &acc),
buf.buf))) {
for (unsigned i = 0; i < nr; i++)
v[i] = -v[i];
ret = commit_do(trans, NULL, NULL, 0,
bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
-BCH_ERR_remove_disk_accounting_entry;
} else {
ret = -BCH_ERR_remove_disk_accounting_entry;
}
goto fsck_err;
}
/*
* At startup time, initialize the in memory accounting from the btree (and
* journal)
@@ -666,44 +747,42 @@ int bch2_accounting_read(struct bch_fs *c)
}
keys->gap = keys->nr = dst - keys->data;
percpu_down_read(&c->mark_lock);
for (unsigned i = 0; i < acc->k.nr; i++) {
percpu_down_write(&c->mark_lock);
unsigned i = 0;
while (i < acc->k.nr) {
unsigned idx = inorder_to_eytzinger0(i, acc->k.nr);
struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos);
u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters))
continue;
struct bch_replicas_padded r;
if (!accounting_to_replicas(&r.e, acc->k.data[i].pos))
continue;
bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false);
/*
* If the replicas entry is invalid it'll get cleaned up by
* check_allocations:
* If the entry counters are zeroed, it should be treated as
* nonexistent - it might point to an invalid device.
*
* Remove it, so that if it's re-added it gets re-marked in the
* superblock:
*/
if (bch2_replicas_entry_validate(&r.e, c, &buf))
ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters)
? -BCH_ERR_remove_disk_accounting_entry
: bch2_disk_accounting_validate_late(trans, acc_k,
v, acc->k.data[idx].nr_counters);
if (ret == -BCH_ERR_remove_disk_accounting_entry) {
free_percpu(acc->k.data[idx].v[0]);
free_percpu(acc->k.data[idx].v[1]);
darray_remove_item(&acc->k, &acc->k.data[idx]);
eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, NULL);
ret = 0;
continue;
struct disk_accounting_pos k;
bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
trans, accounting_replicas_not_marked,
"accounting not marked in superblock replicas\n %s",
(printbuf_reset(&buf),
bch2_accounting_key_to_text(&buf, &k),
buf.buf))) {
/*
* We're not RW yet and still single threaded, dropping
* and retaking lock is ok:
*/
percpu_up_read(&c->mark_lock);
ret = bch2_mark_replicas(c, &r.e);
if (ret)
goto fsck_err;
percpu_down_read(&c->mark_lock);
}
if (ret)
goto fsck_err;
i++;
}
preempt_disable();
@@ -742,7 +821,7 @@ int bch2_accounting_read(struct bch_fs *c)
}
preempt_enable();
fsck_err:
percpu_up_read(&c->mark_lock);
percpu_up_write(&c->mark_lock);
err:
printbuf_exit(&buf);
bch2_trans_put(trans);
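Two details of the rewritten bch2_accounting_read() loop are worth noting: mark_lock is now taken for write (the loop may drop entries, not just read them), and after darray_remove_item() plus the eytzinger0_sort() the loop continues without i++, so whatever entry slid into position i gets validated too. A simplified flat-array sketch of that remove-while-iterating shape (hypothetical entry type, plain array instead of an eytzinger-ordered one):

    #include <stddef.h>
    #include <string.h>

    struct demo_entry { int key; long counters[4]; };

    static int all_zero(const struct demo_entry *e)
    {
            for (size_t j = 0; j < 4; j++)
                    if (e->counters[j])
                            return 0;
            return 1; /* zeroed counters: treat the entry as nonexistent */
    }

    static void prune(struct demo_entry *v, size_t *nr)
    {
            size_t i = 0;

            while (i < *nr) {
                    if (all_zero(&v[i])) {
                            memmove(&v[i], &v[i + 1],
                                    (*nr - i - 1) * sizeof(v[0]));
                            (*nr)--;
                            continue; /* re-examine whatever is now at i */
                    }
                    i++;
            }
    }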

View File

@@ -900,7 +900,7 @@ err:
bch2_bkey_val_to_text(&msgbuf, c, orig_k);
bch_err_ratelimited(c,
"error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
printbuf_exit(&msgbuf);
ret = -BCH_ERR_stripe_reconstruct;
goto out;
}

View File

@@ -268,7 +268,8 @@
x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem) \
x(0, invalid_snapshot_node) \
x(0, option_needs_open_fs)
x(0, option_needs_open_fs) \
x(0, remove_disk_accounting_entry)
enum bch_errcode {
BCH_ERR_START = 2048,
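The new remove_disk_accounting_entry code slots into an x-macro table: each x(class, name) row expands to a BCH_ERR_* enum constant (and, elsewhere, to its name string), which is why adding an error code is a one-line change. A trimmed-down sketch of how that expansion works (demo identifiers, not the full bcachefs list):

    #define DEMO_ERRCODES()                                 \
            x(0, invalid_snapshot_node)                     \
            x(0, option_needs_open_fs)                      \
            x(0, remove_disk_accounting_entry)

    enum demo_errcode {
            DEMO_ERR_START = 2048,
    #define x(class, err) DEMO_ERR_##err,
            DEMO_ERRCODES()
    #undef x
            DEMO_ERR_MAX
    };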

View File

@@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
return a.subvol == b.subvol && a.inum == b.inum;
}
static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
{
const subvol_inum *inum = data;
return jhash(&inum->inum, sizeof(inum->inum), seed);
}
static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
{
const struct bch_inode_info *inode = data;
return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
}
static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{
@@ -170,32 +184,93 @@ static const struct rhashtable_params bch2_vfs_inodes_params = {
.head_offset = offsetof(struct bch_inode_info, hash),
.key_offset = offsetof(struct bch_inode_info, ei_inum),
.key_len = sizeof(subvol_inum),
.hashfn = bch2_vfs_inode_hash_fn,
.obj_hashfn = bch2_vfs_inode_obj_hash_fn,
.obj_cmpfn = bch2_vfs_inode_cmp_fn,
.automatic_shrinking = true,
};
static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
{
return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
}
struct bch_fs *c = trans->c;
struct rhashtable *ht = &c->vfs_inodes_table;
subvol_inum inum = (subvol_inum) { .inum = p.offset };
DARRAY(u32) subvols;
int ret = 0;
bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
{
if (!test_bit(BCH_FS_started, &c->flags))
return false;
subvol_inum inum = {
.subvol = snapshot_t(c, p.snapshot)->subvol,
.inum = p.offset,
};
darray_init(&subvols);
restart_from_top:
/* snapshot tree interior node, can't safely delete while online (yet) */
if (!inum.subvol) {
bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
return true;
/*
* Tweaked version of __rhashtable_lookup(); we need to get a list of
* subvolumes in which the given inode number is open.
*
* For this to work, we don't include the subvolume ID in the key that
* we hash - all inodes with the same inode number regardless of
* subvolume will hash to the same slot.
*
* This will be less than ideal if the same file is ever open
* simultaneously in many different snapshots:
*/
rcu_read_lock();
struct rhash_lock_head __rcu *const *bkt;
struct rhash_head *he;
unsigned int hash;
struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
restart:
hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params);
bkt = rht_bucket(tbl, hash);
do {
struct bch_inode_info *inode;
rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
if (inode->ei_inum.inum == inum.inum) {
ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
GFP_NOWAIT|__GFP_NOWARN);
if (ret) {
rcu_read_unlock();
ret = darray_make_room(&subvols, 1);
if (ret)
goto err;
subvols.nr = 0;
goto restart_from_top;
}
}
}
/* An object might have been moved to a different hash chain,
* while we walk along it - better check and retry.
*/
} while (he != RHT_NULLS_MARKER(bkt));
/* Ensure we see any new tables. */
smp_rmb();
tbl = rht_dereference_rcu(tbl->future_tbl, ht);
if (unlikely(tbl))
goto restart;
rcu_read_unlock();
darray_for_each(subvols, i) {
u32 snap;
ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
if (ret)
goto err;
ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
if (ret)
break;
}
err:
darray_exit(&subvols);
return ret;
}
return __bch2_inode_hash_find(c, inum) != NULL;
static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{
return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
}
static void __wait_on_freeing_inode(struct bch_fs *c,
@@ -203,7 +278,8 @@ static void __wait_on_freeing_inode(struct bch_fs *c,
subvol_inum inum)
{
wait_queue_head_t *wq;
DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
struct wait_bit_queue_entry wait;
wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->v.i_lock);
@@ -271,7 +347,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
&inode->ei_inum,
&inode->hash,
bch2_vfs_inodes_params))) {
old = bch2_inode_hash_find(c, trans, inode->ei_inum);
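The key trick in the new lookup code is that bch2_vfs_inode_hash_fn() hashes only the inode number and never the subvolume, so every snapshot's version of an inode lands in the same rhashtable bucket chain, and bch2_inode_or_descendents_is_open() can gather all open subvolumes in a single chain walk. A self-contained illustration of that property (toy mix function standing in for jhash):

    #include <stdint.h>
    #include <stdio.h>

    struct subvol_inum_demo { uint32_t subvol; uint64_t inum; };

    static uint32_t mix64(uint64_t x, uint32_t seed)
    {
            x ^= seed;
            x *= 0x9e3779b97f4a7c15ULL; /* any decent mixer works for the demo */
            return (uint32_t)(x >> 32);
    }

    /* Like bch2_vfs_inode_hash_fn(): the subvol field is ignored. */
    static uint32_t hash_key(const struct subvol_inum_demo *k, uint32_t seed)
    {
            return mix64(k->inum, seed);
    }

    int main(void)
    {
            struct subvol_inum_demo a = { .subvol = 1, .inum = 42 };
            struct subvol_inum_demo b = { .subvol = 7, .inum = 42 };

            /* Same hash, therefore same bucket chain: */
            printf("%u == %u\n", hash_key(&a, 0xdead), hash_key(&b, 0xdead));
            return 0;
    }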

View File

@@ -146,6 +146,8 @@ struct bch_inode_info *
__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
int bch2_fs_quota_transfer(struct bch_fs *,
struct bch_inode_info *,
struct bch_qid,
@@ -179,8 +181,6 @@ void bch2_inode_update_after_write(struct btree_trans *,
int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
inode_set_fn, void *, unsigned);
bool bch2_inode_is_open(struct bch_fs *c, struct bpos p);
int bch2_setattr_nonsize(struct mnt_idmap *,
struct bch_inode_info *,
struct iattr *);
@@ -198,7 +198,7 @@ int bch2_vfs_init(void);
#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
static inline bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) { return false; }
static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
snapshot_id_list *s) {}

View File

@@ -1182,7 +1182,7 @@ static int check_inode(struct btree_trans *trans,
if (ret)
u.bi_flags |= BCH_INODE_has_child_snapshot;
else
u.bi_flags &= !BCH_INODE_has_child_snapshot;
u.bi_flags &= ~BCH_INODE_has_child_snapshot;
do_update = true;
}
ret = 0;
@@ -1213,7 +1213,11 @@ static int check_inode(struct btree_trans *trans,
if (ret)
goto err;
} else {
if (fsck_err_on(!bch2_inode_is_open(c, k.k->p),
ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
if (ret < 0)
goto err;
if (fsck_err_on(!ret,
trans, inode_unlinked_and_not_open,
"inode %llu%u unlinked and not open",
u.bi_inum, u.bi_snapshot)) {
@@ -1221,6 +1225,7 @@ static int check_inode(struct btree_trans *trans,
bch_err_msg(c, ret, "in fsck deleting inode");
goto err_noprint;
}
ret = 0;
}
}
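The check_inode() fix above is the classic logical-not/bitwise-not mixup: !BCH_INODE_has_child_snapshot evaluates to 0, so `u.bi_flags &= !...` wiped every flag, while `&= ~...` clears only the intended bit. A standalone demonstration (made-up flag values):

    #include <stdio.h>

    #define HAS_CHILD_SNAPSHOT (1u << 5) /* illustrative bit position */

    int main(void)
    {
            unsigned flags = 0xff;

            unsigned buggy = flags & !HAS_CHILD_SNAPSHOT; /* == 0: all flags gone */
            unsigned fixed = flags & ~HAS_CHILD_SNAPSHOT; /* == 0xdf: one bit cleared */

            printf("buggy=%#x fixed=%#x\n", buggy, fixed);
            return 0;
    }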

View File

@@ -1244,8 +1244,9 @@ next_parent:
if (!unlinked)
return 0;
if (bch2_inode_is_open(trans->c, pos))
return 0;
ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
if (ret)
return ret < 0 ? ret : 0;
ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
if (ret)
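lockrestart_do() runs a btree-transaction expression and transparently retries it when it fails with a transaction-restart error, so the caller above only has to distinguish "open" (positive), "not open" (zero), and real errors (negative). A rough sketch of that retry wrapper's shape (simplified stand-in; the real macro also restarts the transaction with bch2_trans_begin()):

    /* Hypothetical error code standing in for BCH_ERR_transaction_restart: */
    #define DEMO_ERR_transaction_restart 2100

    static int lockrestart_do_demo(int (*fn)(void *arg), void *arg)
    {
            int ret;

            do {
                    /* real code: bch2_trans_begin(trans) before each retry */
                    ret = fn(arg);
            } while (ret == -DEMO_ERR_transaction_restart);

            return ret;
    }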

View File

@@ -427,7 +427,9 @@ void bch2_opt_to_text(struct printbuf *out,
prt_printf(out, "%lli", v);
break;
case BCH_OPT_STR:
if (flags & OPT_SHOW_FULL_LIST)
if (v < opt->min || v >= opt->max - 1)
prt_printf(out, "(invalid option %lli)", v);
else if (flags & OPT_SHOW_FULL_LIST)
prt_string_option(out, opt->choices, v);
else
prt_str(out, opt->choices[v]);
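The added range check treats the on-disk option value as untrusted: it must fall inside [min, max - 1) before being used to index choices[] (in bcachefs's option tables, max counts the array's terminating NULL entry, hence the max - 1). A userspace sketch of the same guard with a hypothetical choices table:

    #include <stdio.h>

    static const char * const choices[] = { "none", "lz4", "gzip", "zstd", NULL };

    static void print_opt(long long v, long long min, long long max)
    {
            if (v < min || v >= max - 1) /* max - 1: never index the NULL slot */
                    printf("(invalid option %lli)\n", v);
            else
                    printf("%s\n", choices[v]);
    }

    int main(void)
    {
            print_opt(2, 0, 5); /* gzip */
            print_opt(9, 0, 5); /* (invalid option 9) */
            return 0;
    }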

View File

@@ -291,6 +291,7 @@ enum bch_fsck_flags {
x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
x(accounting_mismatch, 272, FSCK_AUTOFIX) \
x(accounting_replicas_not_marked, 273, 0) \
x(accounting_to_invalid_device, 289, 0) \
x(invalid_btree_id, 274, 0) \
x(alloc_key_io_time_bad, 275, 0) \
x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
@@ -300,7 +301,7 @@ enum bch_fsck_flags {
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
x(MAX, 289, 0)
x(MAX, 290, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,

View File

@@ -1120,12 +1120,12 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
prt_bdevname(&buf, fs->bdev);
prt_char(&buf, ' ');
bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
prt_newline(&buf);
prt_bdevname(&buf, sb->bdev);
prt_char(&buf, ' ');
bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
prt_newline(&buf);
if (!opts->no_splitbrain_check)