diff --git a/.bcachefs_revision b/.bcachefs_revision
index 417d5b15..9f6715bd 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-895d5e67afdccd7c9262630f5e7972721885284b
+3b80552e70573764bbf38b89c58749aef9dd8753
diff --git a/include/linux/closure.h b/include/linux/closure.h
index c24d4aaf..880fe85e 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -464,10 +464,10 @@ do { \
 	while (1) { \
 		closure_wait(waitlist, &cl); \
 		if (_cond) { \
-			_t = max(1, _until - jiffies); \
+			_t = max_t(long, 1L, _until - jiffies); \
 			break; \
 		} \
-		_t = max(0, _until - jiffies); \
+		_t = max_t(long, 0L, _until - jiffies); \
 		if (!_t) \
 			break; \
 		closure_sync_timeout(&cl, _t); \
@@ -485,7 +485,7 @@ do { \
 ({ \
 	unsigned long _until = jiffies + _timeout; \
 	(_cond) \
-		? max(1, _until - jiffies) \
+		? max_t(long, 1L, _until - jiffies) \
 		: __closure_wait_event_timeout(waitlist, _cond, _until);\
 })
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 771154e3..94bbd850 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -1224,17 +1224,20 @@ int bch2_gc_gens(struct bch_fs *c)
 	u64 b, start_time = local_clock();
 	int ret;
 
-	/*
-	 * Ideally we would be using state_lock and not gc_gens_lock here, but that
-	 * introduces a deadlock in the RO path - we currently take the state
-	 * lock at the start of going RO, thus the gc thread may get stuck:
-	 */
 	if (!mutex_trylock(&c->gc_gens_lock))
 		return 0;
 
 	trace_and_count(c, gc_gens_start, c);
-	down_read(&c->state_lock);
+
+	/*
+	 * We have to use trylock here. Otherwise, we would
+	 * introduce a deadlock in the RO path - we take the
+	 * state lock at the start of going RO.
+	 */
+	if (!down_read_trylock(&c->state_lock)) {
+		mutex_unlock(&c->gc_gens_lock);
+		return 0;
+	}
 
 	for_each_member_device(c, ca) {
 		struct bucket_gens *gens = bucket_gens(ca);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 1c1448b5..cf933409 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1838,10 +1838,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
 	struct btree_trans *trans = bch2_trans_get(c);
 
 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+
+	/* we don't need transaction context anymore after we got the lock. */
+	bch2_trans_put(trans);
 	__btree_node_write_done(c, b);
 	six_unlock_read(&b->c.lock);
-
-	bch2_trans_put(trans);
 }
 
 static void btree_node_write_work(struct work_struct *work)
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index 39fc7778..514df618 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -144,7 +144,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
 	       !(ret = bkey_err(old_k)) &&
 	       bkey_eq(old_pos, old_k.k->p)) {
 		struct bpos whiteout_pos =
-			SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
+			SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
 
 		if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
 		    snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
diff --git a/libbcachefs/disk_accounting.c b/libbcachefs/disk_accounting.c
index 9f3133e3..e309fb78 100644
--- a/libbcachefs/disk_accounting.c
+++ b/libbcachefs/disk_accounting.c
@@ -242,6 +242,14 @@ void bch2_accounting_swab(struct bkey_s k)
 		*p = swab64(*p);
 }
 
+static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
+					    struct disk_accounting_pos acc)
+{
+	unsafe_memcpy(r, &acc.replicas,
+		      replicas_entry_bytes(&acc.replicas),
+		      "variable length struct");
+}
+
 static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
 {
 	struct disk_accounting_pos acc_k;
@@ -249,9 +257,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
 
 	switch (acc_k.type) {
 	case BCH_DISK_ACCOUNTING_replicas:
-		unsafe_memcpy(r, &acc_k.replicas,
-			      replicas_entry_bytes(&acc_k.replicas),
-			      "variable length struct");
+		__accounting_to_replicas(r, acc_k);
 		return true;
 	default:
 		return false;
@@ -608,6 +614,81 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
 	return ret;
 }
 
+static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
+					      struct disk_accounting_pos acc,
+					      u64 *v, unsigned nr)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0, invalid_dev = -1;
+
+	switch (acc.type) {
+	case BCH_DISK_ACCOUNTING_replicas: {
+		struct bch_replicas_padded r;
+		__accounting_to_replicas(&r.e, acc);
+
+		for (unsigned i = 0; i < r.e.nr_devs; i++)
+			if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
+			    !bch2_dev_exists(c, r.e.devs[i])) {
+				invalid_dev = r.e.devs[i];
+				goto invalid_device;
+			}
+
+		/*
+		 * All replicas entry checks except for invalid device are done
+		 * in bch2_accounting_validate
+		 */
+		BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
+
+		if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
+				trans, accounting_replicas_not_marked,
+				"accounting not marked in superblock replicas\n %s",
+				(printbuf_reset(&buf),
+				 bch2_accounting_key_to_text(&buf, &acc),
+				 buf.buf))) {
+			/*
+			 * We're not RW yet and still single threaded, dropping
+			 * and retaking lock is ok:
+			 */
+			percpu_up_write(&c->mark_lock);
+			ret = bch2_mark_replicas(c, &r.e);
+			if (ret)
+				goto fsck_err;
+			percpu_down_write(&c->mark_lock);
+		}
+		break;
+	}
+
+	case BCH_DISK_ACCOUNTING_dev_data_type:
+		if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
+			invalid_dev = acc.dev_data_type.dev;
+			goto invalid_device;
+		}
+		break;
+	}
+
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+invalid_device:
+	if (fsck_err(trans, accounting_to_invalid_device,
+		     "accounting entry points to invalid device %i\n %s",
+		     invalid_dev,
+		     (printbuf_reset(&buf),
+		      bch2_accounting_key_to_text(&buf, &acc),
+		      buf.buf))) {
+		for (unsigned i = 0; i < nr; i++)
+			v[i] = -v[i];
+
+		ret = commit_do(trans, NULL, NULL, 0,
+				bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
+			-BCH_ERR_remove_disk_accounting_entry;
+	} else {
+		ret = -BCH_ERR_remove_disk_accounting_entry;
+	}
+	goto fsck_err;
+}
+
 /*
  * At startup time, initialize the in memory accounting from the btree (and
  * journal)
@@ -666,44 +747,42 @@ int bch2_accounting_read(struct bch_fs *c)
 	}
 
 	keys->gap = keys->nr = dst - keys->data;
 
-	percpu_down_read(&c->mark_lock);
-	for (unsigned i = 0; i < acc->k.nr; i++) {
+	percpu_down_write(&c->mark_lock);
+	unsigned i = 0;
+	while (i < acc->k.nr) {
+		unsigned idx = inorder_to_eytzinger0(i, acc->k.nr);
+
+		struct disk_accounting_pos acc_k;
+		bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos);
+
 		u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
-		bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
-
-		if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters))
-			continue;
-
-		struct bch_replicas_padded r;
-		if (!accounting_to_replicas(&r.e, acc->k.data[i].pos))
-			continue;
+		bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false);
 
 		/*
-		 * If the replicas entry is invalid it'll get cleaned up by
-		 * check_allocations:
+		 * If the entry counters are zeroed, it should be treated as
+		 * nonexistent - it might point to an invalid device.
+		 *
+		 * Remove it, so that if it's re-added it gets re-marked in the
+		 * superblock:
 		 */
-		if (bch2_replicas_entry_validate(&r.e, c, &buf))
+		ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters)
+			? -BCH_ERR_remove_disk_accounting_entry
+			: bch2_disk_accounting_validate_late(trans, acc_k,
+							v, acc->k.data[idx].nr_counters);
+
+		if (ret == -BCH_ERR_remove_disk_accounting_entry) {
+			free_percpu(acc->k.data[idx].v[0]);
+			free_percpu(acc->k.data[idx].v[1]);
+			darray_remove_item(&acc->k, &acc->k.data[idx]);
+			eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+					accounting_pos_cmp, NULL);
+			ret = 0;
 			continue;
-
-		struct disk_accounting_pos k;
-		bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
-
-		if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
-				trans, accounting_replicas_not_marked,
-				"accounting not marked in superblock replicas\n %s",
-				(printbuf_reset(&buf),
-				 bch2_accounting_key_to_text(&buf, &k),
-				 buf.buf))) {
-			/*
-			 * We're not RW yet and still single threaded, dropping
-			 * and retaking lock is ok:
-			 */
-			percpu_up_read(&c->mark_lock);
-			ret = bch2_mark_replicas(c, &r.e);
-			if (ret)
-				goto fsck_err;
-			percpu_down_read(&c->mark_lock);
 		}
+
+		if (ret)
+			goto fsck_err;
+		i++;
 	}
 
 	preempt_disable();
@@ -742,7 +821,7 @@ int bch2_accounting_read(struct bch_fs *c)
 	}
 	preempt_enable();
 fsck_err:
-	percpu_up_read(&c->mark_lock);
+	percpu_up_write(&c->mark_lock);
 err:
 	printbuf_exit(&buf);
 	bch2_trans_put(trans);
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index f4fc4f08..1587c6e1 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -900,7 +900,7 @@ err:
 		bch2_bkey_val_to_text(&msgbuf, c, orig_k);
 		bch_err_ratelimited(c,
 			"error doing reconstruct read: %s\n %s", msg, msgbuf.buf);
-		printbuf_exit(&msgbuf);
+		printbuf_exit(&msgbuf);
 		ret = -BCH_ERR_stripe_reconstruct;
 		goto out;
 	}
diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h
index 60b7875a..64926351 100644
--- a/libbcachefs/errcode.h
+++ b/libbcachefs/errcode.h
@@ -268,7 +268,8 @@
 	x(BCH_ERR_nopromote, nopromote_no_writes) \
 	x(BCH_ERR_nopromote, nopromote_enomem) \
 	x(0, invalid_snapshot_node) \
-	x(0, option_needs_open_fs)
+	x(0, option_needs_open_fs) \
+	x(0, remove_disk_accounting_entry)
 
 enum bch_errcode {
 	BCH_ERR_START = 2048,
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 23cae92d..857175f4 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
 	return a.subvol == b.subvol && a.inum == b.inum;
 }
 
+static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
+{
+	const subvol_inum *inum = data;
+
+	return jhash(&inum->inum, sizeof(inum->inum), seed);
+}
+
+static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
+{
+	const struct bch_inode_info *inode = data;
+
+	return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
+}
+
 static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
 				 const void *obj)
 {
@@ -170,32 +184,93 @@ static const struct rhashtable_params bch2_vfs_inodes_params = {
 	.head_offset = offsetof(struct bch_inode_info, hash),
 	.key_offset = offsetof(struct bch_inode_info, ei_inum),
 	.key_len = sizeof(subvol_inum),
+	.hashfn = bch2_vfs_inode_hash_fn,
+	.obj_hashfn = bch2_vfs_inode_obj_hash_fn,
 	.obj_cmpfn = bch2_vfs_inode_cmp_fn,
 	.automatic_shrinking = true,
 };
 
-static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
 {
-	return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
-}
+	struct bch_fs *c = trans->c;
+	struct rhashtable *ht = &c->vfs_inodes_table;
+	subvol_inum inum = (subvol_inum) { .inum = p.offset };
+	DARRAY(u32) subvols;
+	int ret = 0;
 
-bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
-{
 	if (!test_bit(BCH_FS_started, &c->flags))
 		return false;
 
-	subvol_inum inum = {
-		.subvol = snapshot_t(c, p.snapshot)->subvol,
-		.inum = p.offset,
-	};
+	darray_init(&subvols);
+restart_from_top:
 
-	/* snapshot tree interior node, can't safely delete while online (yet) */
-	if (!inum.subvol) {
-		bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
-		return true;
+	/*
+	 * Tweaked version of __rhashtable_lookup(); we need to get a list of
+	 * subvolumes in which the given inode number is open.
+	 *
+	 * For this to work, we don't include the subvolume ID in the key that
+	 * we hash - all inodes with the same inode number regardless of
+	 * subvolume will hash to the same slot.
+	 *
+	 * This will be less than ideal if the same file is ever open
+	 * simultaneously in many different snapshots:
+	 */
+	rcu_read_lock();
+	struct rhash_lock_head __rcu *const *bkt;
+	struct rhash_head *he;
+	unsigned int hash;
+	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+restart:
+	hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params);
+	bkt = rht_bucket(tbl, hash);
+	do {
+		struct bch_inode_info *inode;
+
+		rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
+			if (inode->ei_inum.inum == inum.inum) {
+				ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
+						      GFP_NOWAIT|__GFP_NOWARN);
+				if (ret) {
+					rcu_read_unlock();
+					ret = darray_make_room(&subvols, 1);
+					if (ret)
+						goto err;
+					subvols.nr = 0;
+					goto restart_from_top;
+				}
+			}
+		}
+		/* An object might have been moved to a different hash chain,
+		 * while we walk along it - better check and retry.
+		 */
+	} while (he != RHT_NULLS_MARKER(bkt));
+
+	/* Ensure we see any new tables. */
+	smp_rmb();
+
+	tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+	if (unlikely(tbl))
+		goto restart;
+	rcu_read_unlock();
+
+	darray_for_each(subvols, i) {
+		u32 snap;
+		ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
+		if (ret)
+			goto err;
+
+		ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
+		if (ret)
+			break;
 	}
+err:
+	darray_exit(&subvols);
+	return ret;
+}
 
-	return __bch2_inode_hash_find(c, inum) != NULL;
+static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+{
+	return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
 }
 
 static void __wait_on_freeing_inode(struct bch_fs *c,
@@ -203,7 +278,8 @@ static void __wait_on_freeing_inode(struct bch_fs *c,
 				    subvol_inum inum)
 {
 	wait_queue_head_t *wq;
-	DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+	struct wait_bit_queue_entry wait;
 
+	wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
 	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
 	spin_unlock(&inode->v.i_lock);
@@ -271,7 +347,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
 
 	set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
-	if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
+	if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
+					&inode->ei_inum,
 					&inode->hash,
 					bch2_vfs_inodes_params))) {
 		old = bch2_inode_hash_find(c, trans, inode->ei_inum);
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index 40dbd577..59f9f7ae 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -146,6 +146,8 @@ struct bch_inode_info *
 __bch2_create(struct mnt_idmap *, struct bch_inode_info *,
 	      struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
 
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
+
 int bch2_fs_quota_transfer(struct bch_fs *,
 			   struct bch_inode_info *,
 			   struct bch_qid,
@@ -179,8 +181,6 @@ void bch2_inode_update_after_write(struct btree_trans *,
 int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
 				  inode_set_fn, void *, unsigned);
 
-bool bch2_inode_is_open(struct bch_fs *c, struct bpos p);
-
 int bch2_setattr_nonsize(struct mnt_idmap *,
 			 struct bch_inode_info *,
 			 struct iattr *);
@@ -198,7 +198,7 @@ int bch2_vfs_init(void);
 
 #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
 
-static inline bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) { return false; }
+static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
 
 static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
 					       snapshot_id_list *s) {}
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index f0eb17e6..a1087fd2 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -1182,7 +1182,7 @@ static int check_inode(struct btree_trans *trans,
 			if (ret)
 				u.bi_flags |= BCH_INODE_has_child_snapshot;
 			else
-				u.bi_flags &= !BCH_INODE_has_child_snapshot;
+				u.bi_flags &= ~BCH_INODE_has_child_snapshot;
 			do_update = true;
 		}
 		ret = 0;
@@ -1213,7 +1213,11 @@ static int check_inode(struct btree_trans *trans,
 			if (ret)
 				goto err;
 		} else {
-			if (fsck_err_on(!bch2_inode_is_open(c, k.k->p),
+			ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
+			if (ret < 0)
+				goto err;
+
+			if (fsck_err_on(!ret,
 					trans, inode_unlinked_and_not_open,
 					"inode %llu%u unlinked and not open",
 					u.bi_inum, u.bi_snapshot)) {
@@ -1221,6 +1225,7 @@ static int check_inode(struct btree_trans *trans,
 				bch_err_msg(c, ret, "in fsck deleting inode");
 				goto err_noprint;
 			}
+			ret = 0;
 		}
 	}
 
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 9d6040d4..2c037e84 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -1244,8 +1244,9 @@ next_parent:
 	if (!unlinked)
 		return 0;
 
-	if (bch2_inode_is_open(trans->c, pos))
-		return 0;
+	ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
+	if (ret)
+		return ret < 0 ? ret : 0;
 
 	ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
 	if (ret)
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 232be8a4..84097235 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -427,7 +427,9 @@ void bch2_opt_to_text(struct printbuf *out,
 		prt_printf(out, "%lli", v);
 		break;
 	case BCH_OPT_STR:
-		if (flags & OPT_SHOW_FULL_LIST)
+		if (v < opt->min || v >= opt->max - 1)
+			prt_printf(out, "(invalid option %lli)", v);
+		else if (flags & OPT_SHOW_FULL_LIST)
 			prt_string_option(out, opt->choices, v);
 		else
 			prt_str(out, opt->choices[v]);
diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h
index 4cdddf15..62ad25f9 100644
--- a/libbcachefs/sb-errors_format.h
+++ b/libbcachefs/sb-errors_format.h
@@ -291,6 +291,7 @@ enum bch_fsck_flags {
 	x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
 	x(accounting_mismatch, 272, FSCK_AUTOFIX) \
 	x(accounting_replicas_not_marked, 273, 0) \
+	x(accounting_to_invalid_device, 289, 0) \
 	x(invalid_btree_id, 274, 0) \
 	x(alloc_key_io_time_bad, 275, 0) \
 	x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
@@ -300,7 +301,7 @@ enum bch_fsck_flags {
 	x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
 	x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
 	x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
-	x(MAX, 289, 0)
+	x(MAX, 290, 0)
 
 enum bch_sb_error_id {
 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 32cc7a4b..873e4be7 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -1120,12 +1120,12 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
 		prt_bdevname(&buf, fs->bdev);
 		prt_char(&buf, ' ');
-		bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
+		bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
 		prt_newline(&buf);
 
 		prt_bdevname(&buf, sb->bdev);
 		prt_char(&buf, ' ');
-		bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
+		bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
 		prt_newline(&buf);
 
 		if (!opts->no_splitbrain_check)