From 5dbc29215cd595751b9e18cfd105f3a437a2a9e1 Mon Sep 17 00:00:00 2001
From: Alexander Miroshnichenko
Date: Mon, 4 Aug 2025 07:38:37 +0300
Subject: [PATCH] sys-kernel/hardened-kernel: update bcachefs patches

Signed-off-by: Alexander Miroshnichenko
---
 ...6.15-backport-patches-to-prepare-for.patch | 392 +
 ...tches-from-master-branch-03-Aug-2025.patch | 51930 ++++++++++++++++
 ...rs-on-case-folding-capable-filesyste.patch | 177 +
 3 files changed, 52499 insertions(+)
 create mode 100644 sys-kernel/hardened-kernel/files/linux-6.15/1190_bcachefs-revert-6.15-backport-patches-to-prepare-for.patch
 create mode 100644 sys-kernel/hardened-kernel/files/linux-6.15/1191_bcachefs-patches-from-master-branch-03-Aug-2025.patch
 create mode 100644 sys-kernel/hardened-kernel/files/linux-6.15/1194_ovl-support-layers-on-case-folding-capable-filesyste.patch

diff --git a/sys-kernel/hardened-kernel/files/linux-6.15/1190_bcachefs-revert-6.15-backport-patches-to-prepare-for.patch b/sys-kernel/hardened-kernel/files/linux-6.15/1190_bcachefs-revert-6.15-backport-patches-to-prepare-for.patch
new file mode 100644
index 0000000..ee93686
--- /dev/null
+++ b/sys-kernel/hardened-kernel/files/linux-6.15/1190_bcachefs-revert-6.15-backport-patches-to-prepare-for.patch
@@ -0,0 +1,392 @@
+From 73c097f81f25b59e97de37f326918bd2119ae26e Mon Sep 17 00:00:00 2001
+From: Alexander Miroshnichenko
+Date: Sun, 3 Aug 2025 19:50:52 +0300
+Subject: [PATCH] bcachefs: revert 6.15 backport patches to prepare for applying
+ master patches
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 8bit
+
+Signed-off-by: Alexander Miroshnichenko
+---
+ fs/bcachefs/dirent.c | 12 +++++-
+ fs/bcachefs/dirent.h | 4 +-
+ fs/bcachefs/errcode.h | 2 -
+ fs/bcachefs/fs.c | 8 +---
+ fs/bcachefs/fsck.c | 8 ----
+ fs/bcachefs/inode.c | 77 ++++++++++++----------------
+ fs/bcachefs/namei.c | 4 +-
+ fs/bcachefs/sb-errors_format.h | 4 +-
+ fs/bcachefs/subvolume.c | 19 ++-------
+ 9 files changed, 46 insertions(+), 92 deletions(-)
+
+diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
+index 901230ca4a75..a51195088227 100644
+--- a/fs/bcachefs/dirent.c
++++ b/fs/bcachefs/dirent.c
+@@ -395,8 +395,8 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
+ }
+ 
+ int bch2_dirent_rename(struct btree_trans *trans,
+- subvol_inum src_dir, struct bch_hash_info *src_hash,
+- subvol_inum dst_dir, struct bch_hash_info *dst_hash,
++ subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size,
++ subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size,
+ const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset,
+ const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
+ enum bch_rename_mode mode)
+@@ -535,6 +535,14 @@ int bch2_dirent_rename(struct btree_trans *trans,
+ new_src->v.d_type == DT_SUBVOL)
+ new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
+
++ if (old_dst.k)
++ *dst_dir_i_size -= bkey_bytes(old_dst.k);
++ *src_dir_i_size -= bkey_bytes(old_src.k);
++
++ if (mode == BCH_RENAME_EXCHANGE)
++ *src_dir_i_size += bkey_bytes(&new_src->k);
++ *dst_dir_i_size += bkey_bytes(&new_dst->k);
++
+ ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
+ if (ret)
+ goto out;
+diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
+index 999b895fa28a..d3e7ae669575 100644
+--- a/fs/bcachefs/dirent.h
++++ b/fs/bcachefs/dirent.h
+@@ -80,8 +80,8 @@ enum bch_rename_mode {
+ };
+ 
+ int bch2_dirent_rename(struct btree_trans *,
+- subvol_inum, struct bch_hash_info *,
+- 
subvol_inum, struct bch_hash_info *, ++ subvol_inum, struct bch_hash_info *, u64 *, ++ subvol_inum, struct bch_hash_info *, u64 *, + const struct qstr *, subvol_inum *, u64 *, + const struct qstr *, subvol_inum *, u64 *, + enum bch_rename_mode); +diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h +index 346766299cb3..d9ebffa5b3a2 100644 +--- a/fs/bcachefs/errcode.h ++++ b/fs/bcachefs/errcode.h +@@ -209,8 +209,6 @@ + x(EINVAL, remove_would_lose_data) \ + x(EINVAL, no_resize_with_buckets_nouse) \ + x(EINVAL, inode_unpack_error) \ +- x(EINVAL, inode_not_unlinked) \ +- x(EINVAL, inode_has_child_snapshot) \ + x(EINVAL, varint_decode_error) \ + x(EINVAL, erasure_coding_found_btree_node) \ + x(EOPNOTSUPP, may_not_use_incompat_feature) \ +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 8a47ce3467e8..47f1a64c5c8d 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -2181,13 +2181,7 @@ static void bch2_evict_inode(struct inode *vinode) + KEY_TYPE_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +- int ret = bch2_inode_rm(c, inode_inum(inode)); +- if (ret && !bch2_err_matches(ret, EROFS)) { +- bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu", +- inode->ei_inum.subvol, +- inode->ei_inum.inum); +- bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm); +- } ++ bch2_inode_rm(c, inode_inum(inode)); + + /* + * If we are deleting, we need it present in the vfs hash table +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index bf117f2225d8..aaf187085276 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1183,14 +1183,6 @@ static int check_inode(struct btree_trans *trans, + ret = 0; + } + +- if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size, +- trans, inode_dir_has_nonzero_i_size, +- "directory %llu:%u with nonzero i_size %lli", +- u.bi_inum, u.bi_snapshot, u.bi_size)) { +- u.bi_size = 0; +- do_update = true; +- } +- + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) + goto err; +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 845efd429d13..490b85841de9 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -38,7 +38,6 @@ static const char * const bch2_inode_flag_strs[] = { + #undef x + + static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); +-static int may_delete_deleted_inum(struct btree_trans *, subvol_inum); + + static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; + +@@ -1049,23 +1048,19 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) + u32 snapshot; + int ret; + +- ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum)); +- if (ret) +- goto err2; +- + /* + * If this was a directory, there shouldn't be any real dirents left - + * but there could be whiteouts (from hash collisions) that we should + * delete: + * +- * XXX: the dirent code ideally would delete whiteouts when they're no ++ * XXX: the dirent could ideally would delete whiteouts when they're no + * longer needed + */ + ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: + bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: + bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); + if (ret) +- goto err2; ++ goto err; + retry: + bch2_trans_begin(trans); + +@@ -1347,8 +1342,10 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) + delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); + } + +-static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, +- bool from_deleted_inodes) ++static 
int may_delete_deleted_inode(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos pos, ++ bool *need_another_pass) + { + struct bch_fs *c = trans->c; + struct btree_iter inode_iter; +@@ -1363,13 +1360,11 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, + return ret; + + ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; +- if (fsck_err_on(from_deleted_inodes && ret, ++ if (fsck_err_on(!bkey_is_inode(k.k), + trans, deleted_inode_missing, + "nonexistent inode %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; +- if (ret) +- goto out; + + ret = bch2_inode_unpack(k, &inode); + if (ret) +@@ -1377,8 +1372,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, + + if (S_ISDIR(inode.bi_mode)) { + ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); +- if (fsck_err_on(from_deleted_inodes && +- bch2_err_matches(ret, ENOTEMPTY), ++ if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), + trans, deleted_inode_is_dir, + "non empty directory %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) +@@ -1387,25 +1381,17 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, + goto out; + } + +- ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : -BCH_ERR_inode_not_unlinked; +- if (fsck_err_on(from_deleted_inodes && ret, ++ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), + trans, deleted_inode_not_unlinked, + "non-deleted inode %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; +- if (ret) +- goto out; +- +- ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot) +- ? 0 : -BCH_ERR_inode_has_child_snapshot; + +- if (fsck_err_on(from_deleted_inodes && ret, ++ if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot, + trans, deleted_inode_has_child_snapshots, + "inode with child snapshots %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; +- if (ret) +- goto out; + + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) +@@ -1422,28 +1408,19 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, + if (ret) + goto out; + } +- +- if (!from_deleted_inodes) { +- ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: +- -BCH_ERR_inode_has_child_snapshot; +- goto out; +- } +- + goto delete; + + } + +- if (from_deleted_inodes) { +- if (test_bit(BCH_FS_clean_recovery, &c->flags) && +- !fsck_err(trans, deleted_inode_but_clean, +- "filesystem marked as clean but have deleted inode %llu:%u", +- pos.offset, pos.snapshot)) { +- ret = 0; +- goto out; +- } +- +- ret = 1; ++ if (test_bit(BCH_FS_clean_recovery, &c->flags) && ++ !fsck_err(trans, deleted_inode_but_clean, ++ "filesystem marked as clean but have deleted inode %llu:%u", ++ pos.offset, pos.snapshot)) { ++ ret = 0; ++ goto out; + } ++ ++ ret = 1; + out: + fsck_err: + bch2_trans_iter_exit(trans, &inode_iter); +@@ -1454,19 +1431,12 @@ static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, + goto out; + } + +-static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum) +-{ +- u32 snapshot; +- +- return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: +- may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false); +-} +- + int bch2_delete_dead_inodes(struct bch_fs *c) + { + struct btree_trans *trans = bch2_trans_get(c); ++ bool need_another_pass; + int ret; +- ++again: + /* + * if we ran check_inodes() unlinked inodes will have 
already been + * cleaned up but the write buffer will be out of sync; therefore we +@@ -1476,6 +1446,8 @@ int bch2_delete_dead_inodes(struct bch_fs *c) + if (ret) + goto err; + ++ need_another_pass = false; ++ + /* + * Weird transaction restart handling here because on successful delete, + * bch2_inode_rm_snapshot() will return a nested transaction restart, +@@ -1485,7 +1457,7 @@ int bch2_delete_dead_inodes(struct bch_fs *c) + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ +- ret = may_delete_deleted_inode(trans, k.k->p, true); ++ ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); + if (ret > 0) { + bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", + k.k->p.offset, k.k->p.snapshot); +@@ -1506,6 +1478,9 @@ int bch2_delete_dead_inodes(struct bch_fs *c) + + ret; + })); ++ ++ if (!ret && need_another_pass) ++ goto again; + err: + bch2_trans_put(trans); + return ret; +diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c +index 413fb60cff43..9136a9097789 100644 +--- a/fs/bcachefs/namei.c ++++ b/fs/bcachefs/namei.c +@@ -418,8 +418,8 @@ int bch2_rename_trans(struct btree_trans *trans, + } + + ret = bch2_dirent_rename(trans, +- src_dir, &src_hash, +- dst_dir, &dst_hash, ++ src_dir, &src_hash, &src_dir_u->bi_size, ++ dst_dir, &dst_hash, &dst_dir_u->bi_size, + src_name, &src_inum, &src_offset, + dst_name, &dst_inum, &dst_offset, + mode); +diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h +index 9387f6092fe9..4036a20c6adc 100644 +--- a/fs/bcachefs/sb-errors_format.h ++++ b/fs/bcachefs/sb-errors_format.h +@@ -232,7 +232,6 @@ enum bch_fsck_flags { + x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \ + x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \ + x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \ +- x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \ + x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ + x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ + x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ +@@ -244,7 +243,6 @@ enum bch_fsck_flags { + x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \ + x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \ + x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \ +- x(vfs_bad_inode_rm, 320, 0) \ + x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ + x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ + x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ +@@ -330,7 +328,7 @@ enum bch_fsck_flags { + x(dirent_stray_data_after_cf_name, 305, 0) \ + x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ + x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ +- x(MAX, 321, 0) ++ x(MAX, 319, 0) + + enum bch_sb_error_id { + #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n,
+diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
+index bc6009a71284..d0209f7658bb 100644
+--- a/fs/bcachefs/subvolume.c
++++ b/fs/bcachefs/subvolume.c
+@@ -6,7 +6,6 @@
+ #include "errcode.h"
+ #include "error.h"
+ #include "fs.h"
+-#include "inode.h"
+ #include "recovery_passes.h"
+ #include "snapshot.h"
+ #include "subvolume.h"
+@@ -114,20 +113,10 @@ static int check_subvol(struct btree_trans *trans,
+ "subvolume %llu points to missing subvolume root %llu:%u",
+ k.k->p.offset, le64_to_cpu(subvol.v->inode),
+ le32_to_cpu(subvol.v->snapshot))) {
+- /*
+- * Recreate - any contents that are still disconnected
+- * will then get reattached under lost+found
+- */
+- bch2_inode_init_early(c, &inode);
+- bch2_inode_init_late(&inode, bch2_current_time(c),
+- 0, 0, S_IFDIR|0700, 0, NULL);
+- inode.bi_inum = le64_to_cpu(subvol.v->inode);
+- inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
+- inode.bi_subvol = k.k->p.offset;
+- inode.bi_parent_subvol = le32_to_cpu(subvol.v->fs_path_parent);
+- ret = __bch2_fsck_write_inode(trans, &inode);
+- if (ret)
+- goto err;
++ ret = bch2_subvolume_delete(trans, iter->pos.offset);
++ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
++ ret = ret ?: -BCH_ERR_transaction_restart_nested;
++ goto err;
+ }
+ } else {
+ goto err;
+-- 
+2.49.1
+
diff --git a/sys-kernel/hardened-kernel/files/linux-6.15/1191_bcachefs-patches-from-master-branch-03-Aug-2025.patch b/sys-kernel/hardened-kernel/files/linux-6.15/1191_bcachefs-patches-from-master-branch-03-Aug-2025.patch
new file mode 100644
index 0000000..0533eff
--- /dev/null
+++ b/sys-kernel/hardened-kernel/files/linux-6.15/1191_bcachefs-patches-from-master-branch-03-Aug-2025.patch
@@ -0,0 +1,51930 @@
+From 62f17daf8197fca8fed0545b78a06891df3ce90d Mon Sep 17 00:00:00 2001
+From: Alexander Miroshnichenko
+Date: Sun, 3 Aug 2025 20:16:04 +0300
+Subject: [PATCH] bcachefs: patches from master branch on 03-Aug-2025
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 8bit
+
+Signed-off-by: Alexander Miroshnichenko
+---
+ .../filesystems/bcachefs/casefolding.rst | 18 +
+ .../filesystems/bcachefs/future/idle_work.rst | 78 +
+ Documentation/filesystems/bcachefs/index.rst | 7 +
+ fs/bcachefs/Kconfig | 8 +
+ fs/bcachefs/Makefile | 4 +
+ fs/bcachefs/acl.c | 29 +-
+ fs/bcachefs/alloc_background.c | 763 +++++----
+ fs/bcachefs/alloc_background.h | 19 +-
+ fs/bcachefs/alloc_foreground.c | 835 +++++-----
+ fs/bcachefs/alloc_foreground.h | 86 +-
+ fs/bcachefs/alloc_types.h | 16 -
+ fs/bcachefs/async_objs.c | 141 ++
+ fs/bcachefs/async_objs.h | 45 +
+ fs/bcachefs/async_objs_types.h | 25 +
+ fs/bcachefs/backpointers.c | 439 +++---
+ fs/bcachefs/backpointers.h | 17 +-
+ fs/bcachefs/bcachefs.h | 360 +++--
+ fs/bcachefs/bcachefs_format.h | 30 +-
+ fs/bcachefs/bkey.c | 51 +-
+ fs/bcachefs/bkey.h | 4 +-
+ fs/bcachefs/bkey_methods.c | 2 +-
+ fs/bcachefs/bset.c | 124 +-
+ fs/bcachefs/bset.h | 22 +-
+ fs/bcachefs/btree_cache.c | 280 ++--
+ fs/bcachefs/btree_cache.h | 20 +
+ fs/bcachefs/btree_gc.c | 346 +++--
+ fs/bcachefs/btree_gc.h | 3 +-
+ fs/bcachefs/btree_io.c | 567 ++++---
+ fs/bcachefs/btree_io.h | 12 +-
+ fs/bcachefs/btree_iter.c | 818 +++++-----
+ fs/bcachefs/btree_iter.h | 456 +++---
+ fs/bcachefs/btree_journal_iter.c | 119 +-
+ fs/bcachefs/btree_journal_iter_types.h | 5 +-
+ fs/bcachefs/btree_key_cache.c | 147 +-
+ fs/bcachefs/btree_locking.c | 267 ++--
+ fs/bcachefs/btree_locking.h | 78 +-
+ fs/bcachefs/btree_node_scan.c | 142 +-
+ fs/bcachefs/btree_node_scan.h | 2 
+- + fs/bcachefs/btree_trans_commit.c | 202 ++- + fs/bcachefs/btree_types.h | 102 +- + fs/bcachefs/btree_update.c | 405 +++-- + fs/bcachefs/btree_update.h | 198 ++- + fs/bcachefs/btree_update_interior.c | 560 ++++--- + fs/bcachefs/btree_update_interior.h | 22 +- + fs/bcachefs/btree_write_buffer.c | 100 +- + fs/bcachefs/btree_write_buffer.h | 5 + + fs/bcachefs/buckets.c | 426 +++--- + fs/bcachefs/buckets.h | 12 +- + fs/bcachefs/buckets_waiting_for_journal.c | 31 +- + fs/bcachefs/chardev.c | 162 +- + fs/bcachefs/checksum.c | 66 +- + fs/bcachefs/checksum.h | 2 + + fs/bcachefs/clock.c | 64 +- + fs/bcachefs/clock.h | 1 + + fs/bcachefs/compress.c | 53 +- + fs/bcachefs/compress.h | 36 +- + fs/bcachefs/darray.h | 59 +- + fs/bcachefs/data_update.c | 411 +++-- + fs/bcachefs/data_update.h | 15 + + fs/bcachefs/debug.c | 212 +-- + fs/bcachefs/debug.h | 20 +- + fs/bcachefs/dirent.c | 253 ++- + fs/bcachefs/dirent.h | 25 +- + fs/bcachefs/disk_accounting.c | 382 ++--- + fs/bcachefs/disk_accounting.h | 27 +- + fs/bcachefs/disk_groups.c | 167 +- + fs/bcachefs/ec.c | 630 ++++---- + fs/bcachefs/ec.h | 11 +- + fs/bcachefs/ec_types.h | 7 +- + fs/bcachefs/enumerated_ref.c | 142 ++ + fs/bcachefs/enumerated_ref.h | 66 + + fs/bcachefs/enumerated_ref_types.h | 19 + + fs/bcachefs/errcode.c | 7 +- + fs/bcachefs/errcode.h | 42 +- + fs/bcachefs/error.c | 240 +-- + fs/bcachefs/error.h | 27 +- + fs/bcachefs/extent_update.c | 87 +- + fs/bcachefs/extent_update.h | 2 +- + fs/bcachefs/extents.c | 217 ++- + fs/bcachefs/extents.h | 6 + + fs/bcachefs/extents_types.h | 1 + + fs/bcachefs/fast_list.c | 168 ++ + fs/bcachefs/fast_list.h | 41 + + fs/bcachefs/fs-io-buffered.c | 119 +- + fs/bcachefs/fs-io-direct.c | 33 +- + fs/bcachefs/fs-io-pagecache.c | 57 +- + fs/bcachefs/fs-io.c | 177 +-- + fs/bcachefs/fs-io.h | 19 +- + fs/bcachefs/fs-ioctl.c | 43 +- + fs/bcachefs/fs.c | 316 ++-- + fs/bcachefs/fsck.c | 1361 +++++++++-------- + fs/bcachefs/fsck.h | 6 + + fs/bcachefs/inode.c | 389 +++-- + fs/bcachefs/inode.h | 49 +- + fs/bcachefs/inode_format.h | 7 +- + fs/bcachefs/io_misc.c | 112 +- + fs/bcachefs/io_misc.h | 2 + + fs/bcachefs/io_read.c | 558 ++++--- + fs/bcachefs/io_read.h | 44 +- + fs/bcachefs/io_write.c | 177 ++- + fs/bcachefs/io_write.h | 28 - + fs/bcachefs/io_write_types.h | 32 + + fs/bcachefs/journal.c | 455 +++--- + fs/bcachefs/journal.h | 13 +- + fs/bcachefs/journal_io.c | 603 +++++--- + fs/bcachefs/journal_io.h | 8 + + fs/bcachefs/journal_reclaim.c | 293 ++-- + fs/bcachefs/journal_sb.c | 2 +- + fs/bcachefs/journal_seq_blacklist.c | 68 +- + fs/bcachefs/journal_seq_blacklist.h | 4 + + fs/bcachefs/journal_types.h | 2 - + fs/bcachefs/logged_ops.c | 16 +- + fs/bcachefs/logged_ops.h | 2 +- + fs/bcachefs/lru.c | 54 +- + fs/bcachefs/migrate.c | 144 +- + fs/bcachefs/migrate.h | 3 +- + fs/bcachefs/move.c | 589 ++++--- + fs/bcachefs/move.h | 31 +- + fs/bcachefs/move_types.h | 8 +- + fs/bcachefs/movinggc.c | 251 ++- + fs/bcachefs/movinggc.h | 5 +- + fs/bcachefs/namei.c | 439 ++++-- + fs/bcachefs/namei.h | 7 + + fs/bcachefs/nocow_locking.c | 14 +- + fs/bcachefs/nocow_locking.h | 2 +- + fs/bcachefs/opts.c | 199 ++- + fs/bcachefs/opts.h | 56 +- + fs/bcachefs/printbuf.h | 12 + + fs/bcachefs/progress.c | 6 +- + fs/bcachefs/progress.h | 3 + + fs/bcachefs/quota.c | 103 +- + fs/bcachefs/rcu_pending.c | 22 +- + fs/bcachefs/rebalance.c | 330 ++-- + fs/bcachefs/rebalance.h | 14 +- + fs/bcachefs/rebalance_types.h | 6 + + fs/bcachefs/recovery.c | 373 +++-- + fs/bcachefs/recovery.h | 3 +- + fs/bcachefs/recovery_passes.c | 666 ++++++-- + 
fs/bcachefs/recovery_passes.h | 47 +- + fs/bcachefs/recovery_passes_format.h | 106 ++ + fs/bcachefs/recovery_passes_types.h | 93 +- + fs/bcachefs/reflink.c | 163 +- + fs/bcachefs/replicas.c | 182 +-- + fs/bcachefs/sb-clean.c | 36 +- + fs/bcachefs/sb-counters_format.h | 11 + + fs/bcachefs/sb-downgrade.c | 33 +- + fs/bcachefs/sb-errors.c | 67 +- + fs/bcachefs/sb-errors.h | 1 + + fs/bcachefs/sb-errors_format.h | 46 +- + fs/bcachefs/sb-members.c | 276 ++-- + fs/bcachefs/sb-members.h | 134 +- + fs/bcachefs/sb-members_format.h | 8 +- + fs/bcachefs/sb-members_types.h | 1 + + fs/bcachefs/six.c | 28 +- + fs/bcachefs/snapshot.c | 982 +++++++----- + fs/bcachefs/snapshot.h | 140 +- + fs/bcachefs/snapshot_format.h | 4 +- + fs/bcachefs/snapshot_types.h | 57 + + fs/bcachefs/str_hash.c | 390 +++-- + fs/bcachefs/str_hash.h | 83 +- + fs/bcachefs/subvolume.c | 338 ++-- + fs/bcachefs/subvolume.h | 25 +- + fs/bcachefs/subvolume_types.h | 27 - + fs/bcachefs/super-io.c | 169 +- + fs/bcachefs/super-io.h | 1 + + fs/bcachefs/super.c | 1160 ++++++++------ + fs/bcachefs/super.h | 10 +- + fs/bcachefs/sysfs.c | 273 +++- + fs/bcachefs/tests.c | 340 ++-- + fs/bcachefs/thread_with_file.c | 52 +- + fs/bcachefs/time_stats.c | 7 +- + fs/bcachefs/trace.h | 398 ++--- + fs/bcachefs/util.c | 75 +- + fs/bcachefs/util.h | 27 +- + fs/bcachefs/xattr.c | 81 +- + fs/bcachefs/xattr.h | 4 +- + fs/bcachefs/xattr_format.h | 4 +- + 177 files changed, 15223 insertions(+), 11107 deletions(-) + create mode 100644 Documentation/filesystems/bcachefs/future/idle_work.rst + create mode 100644 fs/bcachefs/async_objs.c + create mode 100644 fs/bcachefs/async_objs.h + create mode 100644 fs/bcachefs/async_objs_types.h + create mode 100644 fs/bcachefs/enumerated_ref.c + create mode 100644 fs/bcachefs/enumerated_ref.h + create mode 100644 fs/bcachefs/enumerated_ref_types.h + create mode 100644 fs/bcachefs/fast_list.c + create mode 100644 fs/bcachefs/fast_list.h + create mode 100644 fs/bcachefs/recovery_passes_format.h + create mode 100644 fs/bcachefs/snapshot_types.h + +diff --git a/Documentation/filesystems/bcachefs/casefolding.rst b/Documentation/filesystems/bcachefs/casefolding.rst +index ba5de97d155f..871a38f557e8 100644 +--- a/Documentation/filesystems/bcachefs/casefolding.rst ++++ b/Documentation/filesystems/bcachefs/casefolding.rst +@@ -88,3 +88,21 @@ This would fail if negative dentry's were cached. + + This is slightly suboptimal, but could be fixed in future with some vfs work. + ++ ++References ++---------- ++ ++(from Peter Anvin, on the list) ++ ++It is worth noting that Microsoft has basically declared their ++"recommended" case folding (upcase) table to be permanently frozen (for ++new filesystem instances in the case where they use an on-disk ++translation table created at format time.) As far as I know they have ++never supported anything other than 1:1 conversion of BMP code points, ++nor normalization. 
++ ++The exFAT specification enumerates the full recommended upcase table, ++although in a somewhat annoying format (basically a hex dump of ++compressed data): ++ ++https://learn.microsoft.com/en-us/windows/win32/fileio/exfat-specification +diff --git a/Documentation/filesystems/bcachefs/future/idle_work.rst b/Documentation/filesystems/bcachefs/future/idle_work.rst +new file mode 100644 +index 000000000000..59a332509dcd +--- /dev/null ++++ b/Documentation/filesystems/bcachefs/future/idle_work.rst +@@ -0,0 +1,78 @@ ++Idle/background work classes design doc: ++ ++Right now, our behaviour at idle isn't ideal, it was designed for servers that ++would be under sustained load, to keep pending work at a "medium" level, to ++let work build up so we can process it in more efficient batches, while also ++giving headroom for bursts in load. ++ ++But for desktops or mobile - scenarios where work is less sustained and power ++usage is more important - we want to operate differently, with a "rush to ++idle" so the system can go to sleep. We don't want to be dribbling out ++background work while the system should be idle. ++ ++The complicating factor is that there are a number of background tasks, which ++form a heirarchy (or a digraph, depending on how you divide it up) - one ++background task may generate work for another. ++ ++Thus proper idle detection needs to model this heirarchy. ++ ++- Foreground writes ++- Page cache writeback ++- Copygc, rebalance ++- Journal reclaim ++ ++When we implement idle detection and rush to idle, we need to be careful not ++to disturb too much the existing behaviour that works reasonably well when the ++system is under sustained load (or perhaps improve it in the case of ++rebalance, which currently does not actively attempt to let work batch up). ++ ++SUSTAINED LOAD REGIME ++--------------------- ++ ++When the system is under continuous load, we want these jobs to run ++continuously - this is perhaps best modelled with a P/D controller, where ++they'll be trying to keep a target value (i.e. fragmented disk space, ++available journal space) roughly in the middle of some range. ++ ++The goal under sustained load is to balance our ability to handle load spikes ++without running out of x resource (free disk space, free space in the ++journal), while also letting some work accumululate to be batched (or become ++unnecessary). ++ ++For example, we don't want to run copygc too aggressively, because then it ++will be evacuating buckets that would have become empty (been overwritten or ++deleted) anyways, and we don't want to wait until we're almost out of free ++space because then the system will behave unpredicably - suddenly we're doing ++a lot more work to service each write and the system becomes much slower. ++ ++IDLE REGIME ++----------- ++ ++When the system becomes idle, we should start flushing our pending work ++quicker so the system can go to sleep. ++ ++Note that the definition of "idle" depends on where in the heirarchy a task ++is - a task should start flushing work more quickly when the task above it has ++stopped generating new work. ++ ++e.g. rebalance should start flushing more quickly when page cache writeback is ++idle, and journal reclaim should only start flushing more quickly when both ++copygc and rebalance are idle. ++ ++It's important to let work accumulate when more work is still incoming and we ++still have room, because flushing is always more efficient if we let it batch ++up. 
New writes may overwrite data before rebalance moves it, and tasks may be ++generating more updates for the btree nodes that journal reclaim needs to flush. ++ ++On idle, how much work we do at each interval should be proportional to the ++length of time we have been idle for. If we're idle only for a short duration, ++we shouldn't flush everything right away; the system might wake up and start ++generating new work soon, and flushing immediately might end up doing a lot of ++work that would have been unnecessary if we'd allowed things to batch more. ++ ++To summarize, we will need: ++ ++ - A list of classes for background tasks that generate work, which will ++ include one "foreground" class. ++ - Tracking for each class - "Am I doing work, or have I gone to sleep?" ++ - And each class should check the class above it when deciding how much work to issue. +diff --git a/Documentation/filesystems/bcachefs/index.rst b/Documentation/filesystems/bcachefs/index.rst +index 3864d0ae89c1..e5c4c2120b93 100644 +--- a/Documentation/filesystems/bcachefs/index.rst ++++ b/Documentation/filesystems/bcachefs/index.rst +@@ -29,3 +29,10 @@ At this moment, only a few of these are described here. + + casefolding + errorcodes ++ ++Future design ++------------- ++.. toctree:: ++ :maxdepth: 1 ++ ++ future/idle_work +diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig +index 07709b0d7688..8cb2b9d5da96 100644 +--- a/fs/bcachefs/Kconfig ++++ b/fs/bcachefs/Kconfig +@@ -103,6 +103,14 @@ config BCACHEFS_PATH_TRACEPOINTS + Enable extra tracepoints for debugging btree_path operations; we don't + normally want these enabled because they happen at very high rates. + ++config BCACHEFS_TRANS_KMALLOC_TRACE ++ bool "Trace bch2_trans_kmalloc() calls" ++ depends on BCACHEFS_FS ++ ++config BCACHEFS_ASYNC_OBJECT_LISTS ++ bool "Keep async objects on fast_lists for debugfs visibility" ++ depends on BCACHEFS_FS && DEBUG_FS ++ + config MEAN_AND_VARIANCE_UNIT_TEST + tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS + depends on KUNIT +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +index 9af65079374f..93c8ee5425c8 100644 +--- a/fs/bcachefs/Makefile ++++ b/fs/bcachefs/Makefile +@@ -35,11 +35,13 @@ bcachefs-y := \ + disk_accounting.o \ + disk_groups.o \ + ec.o \ ++ enumerated_ref.o \ + errcode.o \ + error.o \ + extents.o \ + extent_update.o \ + eytzinger.o \ ++ fast_list.o \ + fs.o \ + fs-ioctl.o \ + fs-io.o \ +@@ -97,6 +99,8 @@ bcachefs-y := \ + varint.o \ + xattr.o + ++bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o ++ + obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o + + # Silence "note: xyz changed in GCC X.X" messages +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index d03adc36100e..3befa1f36e72 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -138,8 +138,8 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, + + acl = allocate_dropping_locks(trans, ret, + posix_acl_alloc(count, _gfp)); +- if (!acl) +- return ERR_PTR(-ENOMEM); ++ if (!acl && !ret) ++ ret = bch_err_throw(trans->c, ENOMEM_acl); + if (ret) { + kfree(acl); + return ERR_PTR(ret); +@@ -273,13 +273,13 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); +- struct btree_iter iter = {}; ++ struct btree_iter iter = { NULL }; + struct posix_acl *acl = NULL; + + if 
(rcu) + return ERR_PTR(-ECHILD); + +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + retry: + bch2_trans_begin(trans); + +@@ -303,8 +303,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) + if (!IS_ERR_OR_NULL(acl)) + set_cached_acl(&inode->v, type, acl); + +- bch2_trans_iter_exit(trans, &iter); +- bch2_trans_put(trans); ++ bch2_trans_iter_exit(&iter); + return acl; + } + +@@ -344,14 +343,14 @@ int bch2_set_acl(struct mnt_idmap *idmap, + { + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; +- struct btree_iter inode_iter = {}; ++ struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; + struct posix_acl *acl; + umode_t mode; + int ret; + +- mutex_lock(&inode->ei_update_lock); +- struct btree_trans *trans = bch2_trans_get(c); ++ guard(mutex)(&inode->ei_update_lock); ++ CLASS(btree_trans, trans)(c); + retry: + bch2_trans_begin(trans); + acl = _acl; +@@ -380,22 +379,18 @@ int bch2_set_acl(struct mnt_idmap *idmap, + ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + btree_err: +- bch2_trans_iter_exit(trans, &inode_iter); ++ bch2_trans_iter_exit(&inode_iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (unlikely(ret)) +- goto err; ++ return ret; + + bch2_inode_update_after_write(trans, inode, &inode_u, + ATTR_CTIME|ATTR_MODE); + + set_cached_acl(&inode->v, type, acl); +-err: +- bch2_trans_put(trans); +- mutex_unlock(&inode->ei_update_lock); +- +- return ret; ++ return 0; + } + + int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, +@@ -436,7 +431,7 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, + *new_acl = acl; + acl = NULL; + err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + if (!IS_ERR_OR_NULL(acl)) + kfree(acl); + return ret; +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index 94ea9e49aec4..3fc728efbf5c 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -17,10 +17,11 @@ + #include "debug.h" + #include "disk_accounting.h" + #include "ec.h" ++#include "enumerated_ref.h" + #include "error.h" + #include "lru.h" ++#include "progress.h" + #include "recovery.h" +-#include "trace.h" + #include "varint.h" + + #include +@@ -308,7 +309,8 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, + "data type inconsistency"); + + bkey_fsck_err_on(!a.io_time[READ] && +- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, ++ !(c->recovery.passes_to_run & ++ BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)), + c, alloc_key_cached_but_read_time_zero, + "cached bucket with read_time == 0"); + break; +@@ -335,11 +337,11 @@ void bch2_alloc_v4_swab(struct bkey_s k) + a->stripe_sectors = swab32(a->stripe_sectors); + } + +-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) ++static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k, ++ const struct bch_alloc_v4 *a) + { +- struct bch_alloc_v4 _a; +- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); +- struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL; ++ struct bch_dev *ca = c ? 
bch2_dev_tryget_noerror(c, k.k->p.inode) : NULL; + + prt_newline(out); + printbuf_indent_add(out, 2); +@@ -348,11 +350,14 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c + bch2_prt_data_type(out, a->data_type); + prt_newline(out); + prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty); +- prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); ++ if (bkey_val_bytes(k.k) > offsetof(struct bch_alloc_v4, journal_seq_empty)) ++ prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); ++ + prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); +- prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); ++ if (bkey_val_bytes(k.k) > offsetof(struct bch_alloc_v4, stripe_sectors)) ++ prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); + prt_printf(out, "cached_sectors %u\n", a->cached_sectors); + prt_printf(out, "stripe %u\n", a->stripe); + prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); +@@ -367,12 +372,25 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c + bch2_dev_put(ca); + } + ++void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bch_alloc_v4 _a; ++ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); ++ ++ __bch2_alloc_v4_to_text(out, c, k, a); ++} ++ ++void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) ++{ ++ __bch2_alloc_v4_to_text(out, c, k, bkey_s_c_to_alloc_v4(k).v); ++} ++ + void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) + { + if (k.k->type == KEY_TYPE_alloc_v4) { + void *src, *dst; + +- *out = *bkey_s_c_to_alloc_v4(k).v; ++ bkey_val_copy(out, bkey_s_c_to_alloc_v4(k)); + + src = alloc_v4_backpointers(out); + SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); +@@ -455,13 +473,14 @@ struct bkey_i_alloc_v4 * + bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) + { +- struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, +- BTREE_ITER_with_updates| +- BTREE_ITER_cached| +- BTREE_ITER_intent); ++ bch2_trans_iter_init(trans, iter, BTREE_ID_alloc, pos, ++ BTREE_ITER_with_updates| ++ BTREE_ITER_cached| ++ BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + int ret = bkey_err(k); + if (unlikely(ret)) +- return ERR_PTR(ret); ++ goto err; + + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); + ret = PTR_ERR_OR_ZERO(a); +@@ -469,7 +488,7 @@ bch2_trans_start_alloc_update_noupdate(struct btree_trans *trans, struct btree_i + goto err; + return a; + err: +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + return ERR_PTR(ret); + } + +@@ -477,14 +496,24 @@ __flatten + struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, struct bpos pos, + enum btree_iter_update_trigger_flags flags) + { +- struct btree_iter iter; +- struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); +- int ret = PTR_ERR_OR_ZERO(a); +- if (ret) ++ CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, pos, ++ BTREE_ITER_with_updates| ++ BTREE_ITER_cached| ++ BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); ++ int ret = bkey_err(k); ++ if (unlikely(ret)) + return ERR_PTR(ret); + +- ret = bch2_trans_update(trans, &iter, &a->k_i, 
flags); +- bch2_trans_iter_exit(trans, &iter); ++ if ((void *) k.v >= trans->mem && ++ (void *) k.v < trans->mem + trans->mem_top) ++ return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v); ++ ++ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); ++ if (IS_ERR(a)) ++ return a; ++ ++ ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_); + return unlikely(ret) ? ERR_PTR(ret) : a; + } + +@@ -537,11 +566,11 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke + + int bch2_bucket_gens_init(struct bch_fs *c) + { +- struct btree_trans *trans = bch2_trans_get(c); + struct bkey_i_bucket_gens g; + bool have_bucket_gens_key = false; + int ret; + ++ CLASS(btree_trans, trans)(c); + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_prefetch, k, ({ + /* +@@ -581,17 +610,14 @@ int bch2_bucket_gens_init(struct bch_fs *c) + BCH_TRANS_COMMIT_no_enospc, + bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + +- bch2_trans_put(trans); +- +- bch_err_fn(c, ret); + return ret; + } + + int bch2_alloc_read(struct bch_fs *c) + { +- down_read(&c->state_lock); ++ guard(rwsem_read)(&c->state_lock); + +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + struct bch_dev *ca = NULL; + int ret; + +@@ -610,7 +636,7 @@ int bch2_alloc_read(struct bch_fs *c) + * bch2_check_alloc_key() which runs later: + */ + if (!ca) { +- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); ++ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + +@@ -631,17 +657,17 @@ int bch2_alloc_read(struct bch_fs *c) + * bch2_check_alloc_key() which runs later: + */ + if (!ca) { +- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); ++ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + + if (k.k->p.offset < ca->mi.first_bucket) { +- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket)); ++ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); + continue; + } + + if (k.k->p.offset >= ca->mi.nbuckets) { +- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); ++ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + continue; + } + +@@ -652,10 +678,6 @@ int bch2_alloc_read(struct bch_fs *c) + } + + bch2_dev_put(ca); +- bch2_trans_put(trans); +- +- up_read(&c->state_lock); +- bch_err_fn(c, ret); + return ret; + } + +@@ -671,7 +693,7 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans, + ? BCH_FSCK_ERR_need_discard_key_wrong + : BCH_FSCK_ERR_freespace_key_wrong; + enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_bkey_val_to_text(&buf, c, alloc_k); + +@@ -680,11 +702,9 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans, + set ? 
"" : "un", + bch2_btree_id_str(btree), + buf.buf); +- if (ret == -BCH_ERR_fsck_ignore || +- ret == -BCH_ERR_fsck_errors_not_fixed) ++ if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) || ++ bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed)) + ret = 0; +- +- printbuf_exit(&buf); + return ret; + } + +@@ -720,8 +740,8 @@ static int bch2_bucket_do_index(struct btree_trans *trans, + return 0; + } + +- struct btree_iter iter; +- struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent); ++ CLASS(btree_iter, iter)(trans, btree, pos, BTREE_ITER_intent); ++ struct bkey_s_c old = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(old); + if (ret) + return ret; +@@ -731,30 +751,25 @@ static int bch2_bucket_do_index(struct btree_trans *trans, + trans, alloc_k, set, + btree == BTREE_ID_need_discard, false); + +- ret = bch2_btree_bit_mod_iter(trans, &iter, set); ++ return bch2_btree_bit_mod_iter(trans, &iter, set); + fsck_err: +- bch2_trans_iter_exit(trans, &iter); + return ret; + } + + static noinline int bch2_bucket_gen_update(struct btree_trans *trans, + struct bpos bucket, u8 gen) + { +- struct btree_iter iter; +- unsigned offset; +- struct bpos pos = alloc_gens_pos(bucket, &offset); +- struct bkey_i_bucket_gens *g; +- struct bkey_s_c k; +- int ret; +- +- g = bch2_trans_kmalloc(trans, sizeof(*g)); +- ret = PTR_ERR_OR_ZERO(g); ++ struct bkey_i_bucket_gens *g = bch2_trans_kmalloc(trans, sizeof(*g)); ++ int ret = PTR_ERR_OR_ZERO(g); + if (ret) + return ret; + +- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, +- BTREE_ITER_intent| +- BTREE_ITER_with_updates); ++ unsigned offset; ++ struct bpos pos = alloc_gens_pos(bucket, &offset); ++ ++ CLASS(btree_iter, iter)(trans, BTREE_ID_bucket_gens, pos, ++ BTREE_ITER_intent|BTREE_ITER_with_updates); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + return ret; +@@ -769,7 +784,7 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans, + g->v.gens[offset] = gen; + + ret = bch2_trans_update(trans, &iter, &g->k_i, 0); +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -832,12 +847,12 @@ int bch2_trigger_alloc(struct btree_trans *trans, + enum btree_iter_update_trigger_flags flags) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + +- struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); ++ CLASS(bch2_dev_bucket_tryget, ca)(c, new.k->p); + if (!ca) +- return -BCH_ERR_trigger_alloc; ++ return bch_err_throw(c, trigger_alloc); + + struct bch_alloc_v4 old_a_convert; + const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); +@@ -851,7 +866,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, + struct bkey_i_alloc_v4 *new_ka = bch2_alloc_to_v4_mut_inlined(trans, new.s_c); + ret = PTR_ERR_OR_ZERO(new_ka); + if (unlikely(ret)) +- goto err; ++ return ret; + new_a = &new_ka->v; + } + +@@ -885,7 +900,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, + ret = bch2_bucket_do_index(trans, ca, old, old_a, false) ?: + bch2_bucket_do_index(trans, ca, new.s_c, new_a, true); + if (ret) +- goto err; ++ return ret; + } + + if (new_a->data_type == BCH_DATA_cached && +@@ -897,7 +912,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, + alloc_lru_idx_read(*old_a), + alloc_lru_idx_read(*new_a)); + if (ret) +- goto err; ++ return ret; + + ret = bch2_lru_change(trans, + BCH_LRU_BUCKET_FRAGMENTATION, +@@ -905,26 +920,17 @@ int bch2_trigger_alloc(struct 
btree_trans *trans, + alloc_lru_idx_fragmentation(*old_a, ca), + alloc_lru_idx_fragmentation(*new_a, ca)); + if (ret) +- goto err; ++ return ret; + + if (old_a->gen != new_a->gen) { + ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); + if (ret) +- goto err; +- } +- +- if ((flags & BTREE_TRIGGER_bucket_invalidate) && +- old_a->cached_sectors) { +- ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, +- -((s64) old_a->cached_sectors), +- flags & BTREE_TRIGGER_gc); +- if (ret) +- goto err; ++ return ret; + } + + ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); + if (ret) +- goto err; ++ return ret; + } + + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { +@@ -975,19 +981,16 @@ int bch2_trigger_alloc(struct btree_trans *trans, + if (bch2_fs_fatal_err_on(ret, c, + "setting bucket_needs_journal_commit: %s", + bch2_err_str(ret))) +- goto err; ++ return ret; + } + } + + if (new_a->gen != old_a->gen) { +- rcu_read_lock(); ++ guard(rcu)(); + u8 *gen = bucket_gen(ca, new.k->p.offset); +- if (unlikely(!gen)) { +- rcu_read_unlock(); ++ if (unlikely(!gen)) + goto invalid_bucket; +- } + *gen = new_a->gen; +- rcu_read_unlock(); + } + + #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) +@@ -1013,36 +1016,28 @@ int bch2_trigger_alloc(struct btree_trans *trans, + } + + if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { +- rcu_read_lock(); ++ guard(rcu)(); + struct bucket *g = gc_bucket(ca, new.k->p.offset); +- if (unlikely(!g)) { +- rcu_read_unlock(); ++ if (unlikely(!g)) + goto invalid_bucket; +- } + g->gen_valid = 1; + g->gen = new_a->gen; +- rcu_read_unlock(); + } +-err: + fsck_err: +- printbuf_exit(&buf); +- bch2_dev_put(ca); + return ret; + invalid_bucket: + bch2_fs_inconsistent(c, "reference to invalid bucket\n%s", + (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); +- ret = -BCH_ERR_trigger_alloc; +- goto err; ++ return bch_err_throw(c, trigger_alloc); + } + + /* + * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for + * extents style btrees, but works on non-extents btrees: + */ +-static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter, +- struct bpos end, struct bkey *hole) ++static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) + { +- struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + + if (bkey_err(k)) + return k; +@@ -1053,9 +1048,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct bt + struct btree_iter iter2; + struct bpos next; + +- bch2_trans_copy_iter(trans, &iter2, iter); ++ bch2_trans_copy_iter(&iter2, iter); + +- struct btree_path *path = btree_iter_path(trans, iter); ++ struct btree_path *path = btree_iter_path(iter->trans, iter); + if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) + end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); + +@@ -1065,9 +1060,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct bt + * btree node min/max is a closed interval, upto takes a half + * open interval: + */ +- k = bch2_btree_iter_peek_max(trans, &iter2, end); ++ k = bch2_btree_iter_peek_max(&iter2, end); + next = iter2.pos; +- bch2_trans_iter_exit(trans, &iter2); ++ bch2_trans_iter_exit(&iter2); + + BUG_ON(next.offset >= iter->pos.offset + U32_MAX); + +@@ -1097,25 +1092,23 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos 
*buck + bucket->offset = 0; + } + +- rcu_read_lock(); ++ guard(rcu)(); + *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); + if (*ca) { + *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); + bch2_dev_get(*ca); + } +- rcu_read_unlock(); + + return *ca != NULL; + } + +-static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bch_dev **ca, struct bkey *hole) ++static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, ++ struct bch_dev **ca, struct bkey *hole) + { +- struct bch_fs *c = trans->c; ++ struct bch_fs *c = iter->trans->c; + struct bkey_s_c k; + again: +- k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole); ++ k = bch2_get_key_or_hole(iter, POS_MAX, hole); + if (bkey_err(k)) + return k; + +@@ -1128,7 +1121,7 @@ static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *tran + if (!next_bucket(c, ca, &hole_start)) + return bkey_s_c_null; + +- bch2_btree_iter_set_pos(trans, iter, hole_start); ++ bch2_btree_iter_set_pos(iter, hole_start); + goto again; + } + +@@ -1152,10 +1145,10 @@ int bch2_check_alloc_key(struct btree_trans *trans, + const struct bch_alloc_v4 *a; + unsigned gens_offset; + struct bkey_s_c k; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + +- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_k.k->p); ++ CLASS(bch2_dev_bucket_tryget_noerror, ca)(c, alloc_k.k->p); + if (fsck_err_on(!ca, + trans, alloc_key_to_missing_dev_bucket, + "alloc key for invalid device:bucket %llu:%llu", +@@ -1165,43 +1158,43 @@ int bch2_check_alloc_key(struct btree_trans *trans, + return ret; + + if (!ca->mi.freespace_initialized) +- goto out; ++ return 0; + + a = bch2_alloc_to_v4(alloc_k, &a_convert); + +- bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p); +- k = bch2_btree_iter_peek_slot(trans, discard_iter); ++ bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); ++ k = bch2_btree_iter_peek_slot(discard_iter); + ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + bool is_discarded = a->data_type == BCH_DATA_need_discard; + if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded, + trans, alloc_k, !is_discarded, true, true)) { + ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded); + if (ret) +- goto err; ++ return ret; + } + +- bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); +- k = bch2_btree_iter_peek_slot(trans, freespace_iter); ++ bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); ++ k = bch2_btree_iter_peek_slot(freespace_iter); + ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + bool is_free = a->data_type == BCH_DATA_free; + if (need_discard_or_freespace_err_on(!!k.k->type != is_free, + trans, alloc_k, !is_free, false, true)) { + ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free); + if (ret) +- goto err; ++ return ret; + } + +- bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); +- k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); ++ bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); ++ k = bch2_btree_iter_peek_slot(bucket_gens_iter); + ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), + trans, bucket_gens_key_wrong, +@@ -1214,7 +1207,7 @@ int bch2_check_alloc_key(struct btree_trans *trans, + + ret = PTR_ERR_OR_ZERO(g); + if (ret) +- goto err; ++ return ret; + + if 
(k.k->type == KEY_TYPE_bucket_gens) { + bkey_reassemble(&g->k_i, k); +@@ -1227,13 +1220,9 @@ int bch2_check_alloc_key(struct btree_trans *trans, + + ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); + if (ret) +- goto err; ++ return ret; + } +-out: +-err: + fsck_err: +- bch2_dev_put(ca); +- printbuf_exit(&buf); + return ret; + } + +@@ -1245,18 +1234,18 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, + struct btree_iter *freespace_iter) + { + struct bkey_s_c k; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret; + + if (!ca->mi.freespace_initialized) + return 0; + +- bch2_btree_iter_set_pos(trans, freespace_iter, start); ++ bch2_btree_iter_set_pos(freespace_iter, start); + +- k = bch2_btree_iter_peek_slot(trans, freespace_iter); ++ k = bch2_btree_iter_peek_slot(freespace_iter); + ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + *end = bkey_min(k.k->p, *end); + +@@ -1269,10 +1258,9 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, + end->offset)) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); +- + ret = PTR_ERR_OR_ZERO(update); + if (ret) +- goto err; ++ return ret; + + bkey_init(&update->k); + update->k.type = KEY_TYPE_set; +@@ -1283,11 +1271,9 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, + + ret = bch2_trans_update(trans, freespace_iter, update, 0); + if (ret) +- goto err; ++ return ret; + } +-err: + fsck_err: +- printbuf_exit(&buf); + return ret; + } + +@@ -1298,16 +1284,16 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, + struct btree_iter *bucket_gens_iter) + { + struct bkey_s_c k; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + unsigned i, gens_offset, gens_end_offset; + int ret; + +- bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); ++ bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); + +- k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); ++ k = bch2_btree_iter_peek_slot(bucket_gens_iter); + ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + if (bkey_cmp(alloc_gens_pos(start, &gens_offset), + alloc_gens_pos(*end, &gens_end_offset))) +@@ -1333,23 +1319,20 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, + + if (need_update) { + struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); +- + ret = PTR_ERR_OR_ZERO(u); + if (ret) +- goto err; ++ return ret; + + memcpy(u, &g, sizeof(g)); + + ret = bch2_trans_update(trans, bucket_gens_iter, u, 0); + if (ret) +- goto err; ++ return ret; + } + } + + *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); +-err: + fsck_err: +- printbuf_exit(&buf); + return ret; + } + +@@ -1361,17 +1344,17 @@ struct check_discard_freespace_key_async { + + static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos) + { +- struct btree_iter iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0); ++ CLASS(btree_iter, iter)(trans, pos.btree, pos.pos, 0); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + return ret; + + u8 gen; + ret = k.k->type != KEY_TYPE_set +- ? bch2_check_discard_freespace_key(trans, &iter, &gen, false) ++ ? 
__bch2_check_discard_freespace_key(trans, &iter, &gen, FSCK_ERR_SILENT) + : 0; +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -1381,18 +1364,21 @@ static void check_discard_freespace_key_work(struct work_struct *work) + container_of(work, struct check_discard_freespace_key_async, work); + + bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); +- bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key); ++ enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key); + kfree(w); + } + +-int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, +- bool async_repair) ++int __bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, ++ enum bch_fsck_flags fsck_flags) + { + struct bch_fs *c = trans->c; + enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard + ? BCH_DATA_need_discard + : BCH_DATA_free; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); ++ ++ bool async_repair = fsck_flags & FSCK_ERR_NO_LOG; ++ fsck_flags |= FSCK_CAN_FIX|FSCK_CAN_IGNORE; + + struct bpos bucket = iter->pos; + bucket.offset &= ~(~0ULL << 56); +@@ -1407,9 +1393,10 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite + return ret; + + if (!bch2_dev_bucket_exists(c, bucket)) { +- if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket, +- "entry in %s btree for nonexistant dev:bucket %llu:%llu", +- bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) ++ if (__fsck_err(trans, fsck_flags, ++ need_discard_freespace_key_to_invalid_dev_bucket, ++ "entry in %s btree for nonexistant dev:bucket %llu:%llu", ++ bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) + goto delete; + ret = 1; + goto out; +@@ -1421,7 +1408,8 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite + if (a->data_type != state || + (state == BCH_DATA_free && + genbits != alloc_freespace_genbits(*a))) { +- if (fsck_err(trans, need_discard_freespace_key_bad, ++ if (__fsck_err(trans, fsck_flags, ++ need_discard_freespace_key_bad, + "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + bch2_btree_id_str(iter->btree_id), +@@ -1437,16 +1425,15 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite + *gen = a->gen; + out: + fsck_err: +- bch2_set_btree_iter_dontneed(trans, &alloc_iter); +- bch2_trans_iter_exit(trans, &alloc_iter); +- printbuf_exit(&buf); ++ bch2_set_btree_iter_dontneed(&alloc_iter); ++ bch2_trans_iter_exit(&alloc_iter); + return ret; + delete: + if (!async_repair) { + ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc) ?: +- -BCH_ERR_transaction_restart_commit; ++ bch_err_throw(c, transaction_restart_commit); + goto out; + } else { + /* +@@ -1458,7 +1445,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite + if (!w) + goto out; + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) { ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) { + kfree(w); + goto out; + } +@@ -1467,14 +1454,16 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite + w->c = c; + w->pos = BBPOS(iter->btree_id, iter->pos); + queue_work(c->write_ref_wq, &w->work); ++ ++ ret = 1; /* don't 
allocate from this bucket */ + goto out; + } + } + +-static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) ++static int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter) + { + u8 gen; +- int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false); ++ int ret = __bch2_check_discard_freespace_key(trans, iter, &gen, 0); + return ret < 0 ? ret : 0; + } + +@@ -1494,19 +1483,19 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, + u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; + u64 b; + bool need_update = false; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + BUG_ON(k.k->type != KEY_TYPE_bucket_gens); + bkey_reassemble(&g.k_i, k); + +- struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); ++ CLASS(bch2_dev_tryget_noerror, ca)(c, k.k->p.inode); + if (!ca) { + if (fsck_err(trans, bucket_gens_to_invalid_dev, + "bucket_gens key for invalid device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) +- ret = bch2_btree_delete_at(trans, iter, 0); +- goto out; ++ return bch2_btree_delete_at(trans, iter, 0); ++ return 0; + } + + if (fsck_err_on(end <= ca->mi.first_bucket || +@@ -1514,8 +1503,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, + trans, bucket_gens_to_invalid_buckets, + "bucket_gens key for invalid buckets:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { +- ret = bch2_btree_delete_at(trans, iter, 0); +- goto out; ++ return bch2_btree_delete_at(trans, iter, 0); + } + + for (b = start; b < ca->mi.first_bucket; b++) +@@ -1536,30 +1524,29 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, + + if (need_update) { + struct bkey_i *u = bch2_trans_kmalloc(trans, sizeof(g)); +- + ret = PTR_ERR_OR_ZERO(u); + if (ret) +- goto out; ++ return ret; + + memcpy(u, &g, sizeof(g)); +- ret = bch2_trans_update(trans, iter, u, 0); ++ return bch2_trans_update(trans, iter, u, 0); + } +-out: + fsck_err: +- bch2_dev_put(ca); +- printbuf_exit(&buf); + return ret; + } + + int bch2_check_alloc_info(struct bch_fs *c) + { +- struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; + struct bch_dev *ca = NULL; + struct bkey hole; + struct bkey_s_c k; + int ret = 0; + ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_alloc)); ++ ++ CLASS(btree_trans, trans)(c); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_prefetch); + bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, +@@ -1574,7 +1561,7 @@ int bch2_check_alloc_info(struct bch_fs *c) + + bch2_trans_begin(trans); + +- k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole); ++ k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole); + ret = bkey_err(k); + if (ret) + goto bkey_err; +@@ -1582,6 +1569,8 @@ int bch2_check_alloc_info(struct bch_fs *c) + if (!k.k) + break; + ++ progress_update_iter(trans, &progress, &iter); ++ + if (k.k->type) { + next = bpos_nosnap_successor(k.k->p); + +@@ -1612,67 +1601,63 @@ int bch2_check_alloc_info(struct bch_fs *c) + if (ret) + goto bkey_err; + +- bch2_btree_iter_set_pos(trans, &iter, next); ++ bch2_btree_iter_set_pos(&iter, next); + bkey_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + } +- bch2_trans_iter_exit(trans, &bucket_gens_iter); +- bch2_trans_iter_exit(trans, &freespace_iter); +- bch2_trans_iter_exit(trans, 
&discard_iter); +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&bucket_gens_iter); ++ bch2_trans_iter_exit(&freespace_iter); ++ bch2_trans_iter_exit(&discard_iter); ++ bch2_trans_iter_exit(&iter); + bch2_dev_put(ca); + ca = NULL; + + if (ret < 0) +- goto err; ++ return ret; + + ret = for_each_btree_key(trans, iter, + BTREE_ID_need_discard, POS_MIN, + BTREE_ITER_prefetch, k, +- bch2_check_discard_freespace_key_fsck(trans, &iter)); ++ bch2_check_discard_freespace_key(trans, &iter)); + if (ret) +- goto err; ++ return ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_prefetch); + while (1) { + bch2_trans_begin(trans); +- k = bch2_btree_iter_peek(trans, &iter); ++ k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + + ret = bkey_err(k) ?: +- bch2_check_discard_freespace_key_fsck(trans, &iter); ++ bch2_check_discard_freespace_key(trans, &iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + if (ret) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_bkey_val_to_text(&buf, c, k); +- + bch_err(c, "while checking %s", buf.buf); +- printbuf_exit(&buf); + break; + } + +- bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos)); ++ bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + } +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + if (ret) +- goto err; ++ return ret; + + ret = for_each_btree_key_commit(trans, iter, + BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_check_bucket_gens_key(trans, &iter, k)); +-err: +- bch2_trans_put(trans); +- bch_err_fn(c, ret); ++ + return ret; + } + +@@ -1684,10 +1669,10 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + struct bkey_s_c alloc_k; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret; + +- alloc_k = bch2_btree_iter_peek(trans, alloc_iter); ++ alloc_k = bch2_btree_iter_peek(alloc_iter); + if (!alloc_k.k) + return 0; + +@@ -1695,7 +1680,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, + if (ret) + return ret; + +- struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode); ++ CLASS(bch2_dev_tryget_noerror, ca)(c, alloc_k.k->p.inode); + if (!ca) + return 0; + +@@ -1707,96 +1692,84 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, + bucket_to_u64(alloc_k.k->p), + lru_idx, alloc_k, last_flushed); + if (ret) +- goto err; ++ return ret; + } + +- if (a->data_type != BCH_DATA_cached) +- goto err; ++ if (a->data_type == BCH_DATA_cached) { ++ if (fsck_err_on(!a->io_time[READ], ++ trans, alloc_key_cached_but_read_time_zero, ++ "cached bucket with read_time 0\n%s", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { ++ struct bkey_i_alloc_v4 *a_mut = ++ bch2_alloc_to_v4_mut(trans, alloc_k); ++ ret = PTR_ERR_OR_ZERO(a_mut); ++ if (ret) ++ return ret; + +- if (fsck_err_on(!a->io_time[READ], +- trans, alloc_key_cached_but_read_time_zero, +- "cached bucket with read_time 0\n%s", +- (printbuf_reset(&buf), +- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { +- struct bkey_i_alloc_v4 *a_mut = +- bch2_alloc_to_v4_mut(trans, alloc_k); +- ret = PTR_ERR_OR_ZERO(a_mut); +- if (ret) +- goto err; ++ a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); ++ ret = bch2_trans_update(trans, alloc_iter, ++ &a_mut->k_i, BTREE_TRIGGER_norun); ++ if (ret) ++ return ret; + +- 
a_mut->v.io_time[READ] = bch2_current_io_time(c, READ); +- ret = bch2_trans_update(trans, alloc_iter, +- &a_mut->k_i, BTREE_TRIGGER_norun); +- if (ret) +- goto err; ++ a = &a_mut->v; ++ } + +- a = &a_mut->v; ++ ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, ++ bucket_to_u64(alloc_k.k->p), ++ a->io_time[READ], ++ alloc_k, last_flushed); + } +- +- ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, +- bucket_to_u64(alloc_k.k->p), +- a->io_time[READ], +- alloc_k, last_flushed); +- if (ret) +- goto err; +-err: + fsck_err: +- bch2_dev_put(ca); +- printbuf_exit(&buf); + return ret; + } + + int bch2_check_alloc_to_lru_refs(struct bch_fs *c) + { + struct bkey_buf last_flushed; +- + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_alloc)); ++ ++ CLASS(btree_trans, trans)(c); ++ int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_prefetch, k, +- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?: +- bch2_check_stripe_to_lru_refs(c); ++ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ progress_update_iter(trans, &progress, &iter); ++ bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed); ++ }))?: bch2_check_stripe_to_lru_refs(trans); + + bch2_bkey_buf_exit(&last_flushed, c); +- bch_err_fn(c, ret); + return ret; + } + + static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) + { +- int ret; ++ struct bch_fs *c = ca->fs; + +- mutex_lock(&ca->discard_buckets_in_flight_lock); +- darray_for_each(ca->discard_buckets_in_flight, i) +- if (i->bucket == bucket) { +- ret = -BCH_ERR_EEXIST_discard_in_flight_add; +- goto out; +- } ++ guard(mutex)(&ca->discard_buckets_in_flight_lock); ++ struct discard_in_flight *i = ++ darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); ++ if (i) ++ return bch_err_throw(c, EEXIST_discard_in_flight_add); + +- ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { ++ return darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { + .in_progress = in_progress, + .bucket = bucket, + })); +-out: +- mutex_unlock(&ca->discard_buckets_in_flight_lock); +- return ret; + } + + static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) + { +- mutex_lock(&ca->discard_buckets_in_flight_lock); +- darray_for_each(ca->discard_buckets_in_flight, i) +- if (i->bucket == bucket) { +- BUG_ON(!i->in_progress); +- darray_remove_item(&ca->discard_buckets_in_flight, i); +- goto found; +- } +- BUG(); +-found: +- mutex_unlock(&ca->discard_buckets_in_flight_lock); ++ guard(mutex)(&ca->discard_buckets_in_flight_lock); ++ struct discard_in_flight *i = ++ darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); ++ BUG_ON(!i || !i->in_progress); ++ ++ darray_remove_item(&ca->discard_buckets_in_flight, i); + } + + struct discard_buckets_state { +@@ -1806,19 +1779,6 @@ struct discard_buckets_state { + u64 discarded; + }; + +-/* +- * This is needed because discard is both a filesystem option and a device +- * option, and mount options are supposed to apply to that mount and not be +- * persisted, i.e. if it's set as a mount option we can't propagate it to the +- * device. 
+- */ +-static inline bool discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) +-{ +- return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) +- ? c->opts.discard +- : ca->mi.discard; +-} +- + static int bch2_discard_one_bucket(struct btree_trans *trans, + struct bch_dev *ca, + struct btree_iter *need_discard_iter, +@@ -1828,16 +1788,12 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct bpos pos = need_discard_iter->pos; +- struct btree_iter iter = {}; +- struct bkey_s_c k; +- struct bkey_i_alloc_v4 *a; +- struct printbuf buf = PRINTBUF; + bool discard_locked = false; + int ret = 0; + + if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { + s->open++; +- goto out; ++ return 0; + } + + u64 seq_ready = bch2_bucket_journal_seq_ready(&c->buckets_waiting_for_journal, +@@ -1845,30 +1801,29 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, + if (seq_ready > c->journal.flushed_seq_ondisk) { + if (seq_ready > c->journal.flushing_seq) + s->need_journal_commit++; +- goto out; ++ return 0; + } + +- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, +- need_discard_iter->pos, +- BTREE_ITER_cached); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, need_discard_iter->pos, BTREE_ITER_cached); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) +- goto out; ++ return ret; + +- a = bch2_alloc_to_v4_mut(trans, k); ++ struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) +- goto out; ++ return ret; + + if (a->v.data_type != BCH_DATA_need_discard) { + if (need_discard_or_freespace_err(trans, k, true, true, true)) { + ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); + if (ret) +- goto out; ++ return ret; + goto commit; + } + +- goto out; ++ return 0; + } + + if (!fastpath) { +@@ -1882,7 +1837,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, + s->discarded++; + *discard_pos_done = iter.pos; + +- if (discard_opt_enabled(c, ca) && !c->opts.nochanges) { ++ if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree +@@ -1921,8 +1876,6 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, + discard_in_flight_remove(ca, iter.pos.offset); + if (!ret) + s->seen++; +- bch2_trans_iter_exit(trans, &iter); +- printbuf_exit(&buf); + return ret; + } + +@@ -1952,26 +1905,26 @@ static void bch2_do_discards_work(struct work_struct *work) + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, + bch2_err_str(ret)); + +- percpu_ref_put(&ca->io_ref[WRITE]); +- bch2_write_ref_put(c, BCH_WRITE_REF_discard); ++ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); + } + + void bch2_dev_do_discards(struct bch_dev *ca) + { + struct bch_fs *c = ca->fs; + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard)) + return; + +- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) ++ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards)) + goto put_write_ref; + + if (queue_work(c->write_ref_wq, &ca->discard_work)) + return; + +- percpu_ref_put(&ca->io_ref[WRITE]); ++ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); + put_write_ref: +- bch2_write_ref_put(c, BCH_WRITE_REF_discard); ++ 
enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); + } + + void bch2_do_discards(struct bch_fs *c) +@@ -1986,9 +1939,8 @@ static int bch2_do_discards_fast_one(struct btree_trans *trans, + struct bpos *discard_pos_done, + struct discard_buckets_state *s) + { +- struct btree_iter need_discard_iter; +- struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter, +- BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); ++ CLASS(btree_iter, need_discard_iter)(trans, BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); ++ struct bkey_s_c discard_k = bch2_btree_iter_peek_slot(&need_discard_iter); + int ret = bkey_err(discard_k); + if (ret) + return ret; +@@ -1997,12 +1949,10 @@ static int bch2_do_discards_fast_one(struct btree_trans *trans, + trans, discarding_bucket_not_in_need_discard_btree, + "attempting to discard bucket %u:%llu not in need_discard btree", + ca->dev_idx, bucket)) +- goto out; ++ return 0; + +- ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); +-out: ++ return bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); + fsck_err: +- bch2_trans_iter_exit(trans, &need_discard_iter); + return ret; + } + +@@ -2019,17 +1969,16 @@ static void bch2_do_discards_fast_work(struct work_struct *work) + bool got_bucket = false; + u64 bucket; + +- mutex_lock(&ca->discard_buckets_in_flight_lock); +- darray_for_each(ca->discard_buckets_in_flight, i) { +- if (i->in_progress) +- continue; ++ scoped_guard(mutex, &ca->discard_buckets_in_flight_lock) ++ darray_for_each(ca->discard_buckets_in_flight, i) { ++ if (i->in_progress) ++ continue; + +- got_bucket = true; +- bucket = i->bucket; +- i->in_progress = true; +- break; +- } +- mutex_unlock(&ca->discard_buckets_in_flight_lock); ++ got_bucket = true; ++ bucket = i->bucket; ++ i->in_progress = true; ++ break; ++ } + + if (!got_bucket) + break; +@@ -2047,8 +1996,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) + trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + + bch2_trans_put(trans); +- percpu_ref_put(&ca->io_ref[WRITE]); +- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); ++ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); + } + + static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) +@@ -2058,18 +2007,18 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) + if (discard_in_flight_add(ca, bucket, false)) + return; + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast)) + return; + +- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) ++ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast)) + goto put_ref; + + if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) + return; + +- percpu_ref_put(&ca->io_ref[WRITE]); ++ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); + put_ref: +- bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); + } + + static int invalidate_one_bp(struct btree_trans *trans, +@@ -2096,7 +2045,7 @@ static int invalidate_one_bp(struct btree_trans *trans, + + bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx); + err: +- bch2_trans_iter_exit(trans, &extent_iter); ++ bch2_trans_iter_exit(&extent_iter); + return ret; + } + +@@ -2137,9 
+2086,8 @@ static int invalidate_one_bucket(struct btree_trans *trans, + s64 *nr_to_invalidate) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); +- struct btree_iter alloc_iter = {}; + int ret = 0; + + if (*nr_to_invalidate <= 0) +@@ -2150,52 +2098,53 @@ static int invalidate_one_bucket(struct btree_trans *trans, + "lru key points to nonexistent device:bucket %llu:%llu", + bucket.inode, bucket.offset)) + return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); +- goto out; ++ return 0; + } + + if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) + return 0; + +- struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, +- BTREE_ID_alloc, bucket, +- BTREE_ITER_cached); +- ret = bkey_err(alloc_k); +- if (ret) +- return ret; ++ { ++ CLASS(btree_iter, alloc_iter)(trans, BTREE_ID_alloc, bucket, BTREE_ITER_cached); ++ struct bkey_s_c alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(alloc_k); ++ if (ret) ++ return ret; + +- struct bch_alloc_v4 a_convert; +- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); ++ struct bch_alloc_v4 a_convert; ++ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); + +- /* We expect harmless races here due to the btree write buffer: */ +- if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a)) +- goto out; ++ /* We expect harmless races here due to the btree write buffer: */ ++ if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a)) ++ return 0; + +- /* +- * Impossible since alloc_lru_idx_read() only returns nonzero if the +- * bucket is supposed to be on the cached bucket LRU (i.e. +- * BCH_DATA_cached) +- * +- * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 +- */ +- BUG_ON(a->data_type != BCH_DATA_cached); +- BUG_ON(a->dirty_sectors); ++ /* ++ * Impossible since alloc_lru_idx_read() only returns nonzero if the ++ * bucket is supposed to be on the cached bucket LRU (i.e. 
++ * BCH_DATA_cached) ++ * ++ * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 ++ */ ++ BUG_ON(a->data_type != BCH_DATA_cached); ++ BUG_ON(a->dirty_sectors); + +- if (!a->cached_sectors) +- bch_err(c, "invalidating empty bucket, confused"); ++ if (!a->cached_sectors) { ++ bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset, ++ true, last_flushed); ++ return 0; ++ } + +- unsigned cached_sectors = a->cached_sectors; +- u8 gen = a->gen; ++ unsigned cached_sectors = a->cached_sectors; ++ u8 gen = a->gen; + +- ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed); +- if (ret) +- goto out; ++ ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed); ++ if (ret) ++ return ret; + +- trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); +- --*nr_to_invalidate; +-out: ++ trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); ++ --*nr_to_invalidate; ++ } + fsck_err: +- bch2_trans_iter_exit(trans, &alloc_iter); +- printbuf_exit(&buf); + return ret; + } + +@@ -2204,9 +2153,9 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter + { + struct bkey_s_c k; + again: +- k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); ++ k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + if (!k.k && !*wrapped) { +- bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0)); ++ bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); + *wrapped = true; + goto again; + } +@@ -2218,7 +2167,7 @@ static void bch2_do_invalidates_work(struct work_struct *work) + { + struct bch_dev *ca = container_of(work, struct bch_dev, invalidate_work); + struct bch_fs *c = ca->fs; +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + int ret = 0; + + struct bkey_buf last_flushed; +@@ -2256,32 +2205,31 @@ static void bch2_do_invalidates_work(struct work_struct *work) + if (ret) + break; + +- bch2_btree_iter_advance(trans, &iter); ++ bch2_btree_iter_advance(&iter); + } +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + err: +- bch2_trans_put(trans); +- percpu_ref_put(&ca->io_ref[WRITE]); + bch2_bkey_buf_exit(&last_flushed, c); +- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); ++ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); + } + + void bch2_dev_do_invalidates(struct bch_dev *ca) + { + struct bch_fs *c = ca->fs; + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate)) + return; + +- if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) ++ if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates)) + goto put_ref; + + if (queue_work(c->write_ref_wq, &ca->invalidate_work)) + return; + +- percpu_ref_put(&ca->io_ref[WRITE]); ++ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); + put_ref: +- bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); + } + + void bch2_do_invalidates(struct bch_fs *c) +@@ -2293,18 +2241,17 @@ void bch2_do_invalidates(struct bch_fs *c) + int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + u64 bucket_start, u64 bucket_end) + { +- struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter; + struct bkey_s_c k; + struct bkey hole; + struct bpos end = 
POS(ca->dev_idx, bucket_end); +- struct bch_member *m; + unsigned long last_updated = jiffies; + int ret; + + BUG_ON(bucket_start > bucket_end); + BUG_ON(bucket_end > ca->mi.nbuckets); + ++ CLASS(btree_trans, trans)(c); + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + POS(ca->dev_idx, max_t(u64, ca->mi.first_bucket, bucket_start)), + BTREE_ITER_prefetch); +@@ -2326,7 +2273,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + break; + } + +- k = bch2_get_key_or_hole(trans, &iter, end, &hole); ++ k = bch2_get_key_or_hole(&iter, end, &hole); + ret = bkey_err(k); + if (ret) + goto bkey_err; +@@ -2345,7 +2292,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + if (ret) + goto bkey_err; + +- bch2_btree_iter_advance(trans, &iter); ++ bch2_btree_iter_advance(&iter); + } else { + struct bkey_i *freespace; + +@@ -2365,7 +2312,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + if (ret) + goto bkey_err; + +- bch2_btree_iter_set_pos(trans, &iter, k.k->p); ++ bch2_btree_iter_set_pos(&iter, k.k->p); + } + bkey_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +@@ -2374,32 +2321,32 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + break; + } + +- bch2_trans_iter_exit(trans, &iter); +- bch2_trans_put(trans); ++ bch2_trans_iter_exit(&iter); + + if (ret < 0) { + bch_err_msg(ca, ret, "initializing free space"); + return ret; + } + +- mutex_lock(&c->sb_lock); +- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); +- SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); +- mutex_unlock(&c->sb_lock); ++ scoped_guard(mutex, &c->sb_lock) { ++ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); ++ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); ++ } + + return 0; + } + + int bch2_fs_freespace_init(struct bch_fs *c) + { +- int ret = 0; +- bool doing_init = false; ++ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) ++ return 0; + + /* + * We can crash during the device add path, so we need to check this on + * every mount: + */ + ++ bool doing_init = false; + for_each_member_device(c, ca) { + if (ca->mi.freespace_initialized) + continue; +@@ -2409,7 +2356,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) + doing_init = true; + } + +- ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); ++ int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); + if (ret) { + bch2_dev_put(ca); + bch_err_fn(c, ret); +@@ -2418,9 +2365,8 @@ int bch2_fs_freespace_init(struct bch_fs *c) + } + + if (doing_init) { +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + bch2_write_super(c); +- mutex_unlock(&c->sb_lock); + bch_verbose(c, "done initializing freespace"); + } + +@@ -2439,8 +2385,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) + * We clear the LRU and need_discard btrees first so that we don't race + * with bch2_do_invalidates() and bch2_do_discards() + */ +- ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?: +- bch2_btree_delete_range(c, BTREE_ID_lru, start, end, ++ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + BTREE_TRIGGER_norun, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, + BTREE_TRIGGER_norun, NULL) ?: +@@ -2480,7 +2425,7 @@ static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + out: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -2503,15 +2448,15 @@ 
void bch2_recalc_capacity(struct bch_fs *c) + + lockdep_assert_held(&c->state_lock); + +- for_each_online_member(c, ca) { +- struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; ++ guard(rcu)(); ++ for_each_member_device_rcu(c, ca, NULL) { ++ struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev); ++ if (bdev) ++ ra_pages += bdev->bd_disk->bdi->ra_pages; + +- ra_pages += bdi->ra_pages; +- } +- +- bch2_set_ra_pages(c, ra_pages); ++ if (ca->mi.state != BCH_MEMBER_STATE_rw) ++ continue; + +- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { + u64 dev_reserve = 0; + + /* +@@ -2549,6 +2494,8 @@ void bch2_recalc_capacity(struct bch_fs *c) + ca->mi.bucket_size); + } + ++ bch2_set_ra_pages(c, ra_pages); ++ + gc_reserve = c->opts.gc_reserve_bytes + ? c->opts.gc_reserve_bytes >> 9 + : div64_u64(capacity * c->opts.gc_reserve_percent, 100); +@@ -2570,7 +2517,8 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) + { + u64 ret = U64_MAX; + +- for_each_rw_member(c, ca) ++ guard(rcu)(); ++ for_each_rw_member_rcu(c, ca) + ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); + return ret; + } +@@ -2578,19 +2526,31 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) + static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) + { + struct open_bucket *ob; +- bool ret = false; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { +- spin_lock(&ob->lock); +- if (ob->valid && !ob->on_partial_list && +- ob->dev == ca->dev_idx) +- ret = true; +- spin_unlock(&ob->lock); ++ scoped_guard(spinlock, &ob->lock) { ++ if (ob->valid && !ob->on_partial_list && ++ ob->dev == ca->dev_idx) ++ return true; ++ } + } + +- return ret; ++ return false; ++} ++ ++void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) ++{ ++ /* BCH_DATA_free == all rw devs */ ++ ++ for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ if (rw && ++ (i == BCH_DATA_free || ++ (ca->mi.data_allowed & BIT(i)))) ++ set_bit(ca->dev_idx, c->rw_devs[i].d); ++ else ++ clear_bit(ca->dev_idx, c->rw_devs[i].d); + } + + /* device goes ro: */ +@@ -2599,9 +2559,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) + lockdep_assert_held(&c->state_lock); + + /* First, remove device from allocation groups: */ +- +- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) +- clear_bit(ca->dev_idx, c->rw_devs[i].d); ++ bch2_dev_allocator_set_rw(c, ca, false); + + c->rw_devs_change_count++; + +@@ -2635,10 +2593,7 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) + { + lockdep_assert_held(&c->state_lock); + +- for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) +- if (ca->mi.data_allowed & (1 << i)) +- set_bit(ca->dev_idx, c->rw_devs[i].d); +- ++ bch2_dev_allocator_set_rw(c, ca, true); + c->rw_devs_change_count++; + } + +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 34b3d6ac4fbb..c2e8482fbbe6 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -13,11 +13,9 @@ + + static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) + { +- rcu_read_lock(); ++ guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); +- bool ret = ca && bucket_valid(ca, pos.offset); +- rcu_read_unlock(); +- return ret; ++ return ca && bucket_valid(ca, pos.offset); + } + + static inline u64 bucket_to_u64(struct bpos bucket) +@@ -253,6 +251,7 @@ int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); + void 
bch2_alloc_v4_swab(struct bkey_s); + void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ + .key_validate = bch2_alloc_v1_validate, \ +@@ -277,7 +276,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ + .key_validate = bch2_alloc_v4_validate, \ +- .val_to_text = bch2_alloc_to_text, \ ++ .val_to_text = bch2_alloc_v4_to_text, \ + .swab = bch2_alloc_v4_swab, \ + .trigger = bch2_trigger_alloc, \ + .min_val_size = 48, \ +@@ -310,7 +309,14 @@ int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); + +-int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); ++int __bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, ++ enum bch_fsck_flags); ++ ++static inline int bch2_check_discard_freespace_key_async(struct btree_trans *trans, struct btree_iter *iter, u8 *gen) ++{ ++ return __bch2_check_discard_freespace_key(trans, iter, gen, FSCK_ERR_NO_LOG); ++} ++ + int bch2_check_alloc_info(struct bch_fs *); + int bch2_check_alloc_to_lru_refs(struct bch_fs *); + void bch2_dev_do_discards(struct bch_dev *); +@@ -350,6 +356,7 @@ int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); + void bch2_recalc_capacity(struct bch_fs *); + u64 bch2_min_rw_member_capacity(struct bch_fs *); + ++void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool); + void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); + void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); + +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 7ec022e9361a..0a5b3d31d52c 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -69,10 +69,9 @@ const char * const bch2_watermarks[] = { + + void bch2_reset_alloc_cursors(struct bch_fs *c) + { +- rcu_read_lock(); ++ guard(rcu)(); + for_each_member_device_rcu(c, ca, NULL) + memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor)); +- rcu_read_unlock(); + } + + static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) +@@ -107,20 +106,20 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) + return; + } + +- spin_lock(&ob->lock); +- ob->valid = false; +- ob->data_type = 0; +- spin_unlock(&ob->lock); ++ scoped_guard(spinlock, &ob->lock) { ++ ob->valid = false; ++ ob->data_type = 0; ++ } + +- spin_lock(&c->freelist_lock); +- bch2_open_bucket_hash_remove(c, ob); ++ scoped_guard(spinlock, &c->freelist_lock) { ++ bch2_open_bucket_hash_remove(c, ob); + +- ob->freelist = c->open_buckets_freelist; +- c->open_buckets_freelist = ob - c->open_buckets; ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; + +- c->open_buckets_nr_free++; +- ca->nr_open_buckets--; +- spin_unlock(&c->freelist_lock); ++ c->open_buckets_nr_free++; ++ ca->nr_open_buckets--; ++ } + + closure_wake_up(&c->open_buckets_wait); + } +@@ -154,7 +153,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) + + static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) + { +- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) ++ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs)) + 
return false; + + return bch2_is_superblock_bucket(ca, b); +@@ -165,26 +164,25 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) + BUG_ON(c->open_buckets_partial_nr >= + ARRAY_SIZE(c->open_buckets_partial)); + +- spin_lock(&c->freelist_lock); +- rcu_read_lock(); +- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; +- rcu_read_unlock(); ++ scoped_guard(spinlock, &c->freelist_lock) { ++ guard(rcu)(); ++ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; + +- ob->on_partial_list = true; +- c->open_buckets_partial[c->open_buckets_partial_nr++] = +- ob - c->open_buckets; +- spin_unlock(&c->freelist_lock); ++ ob->on_partial_list = true; ++ c->open_buckets_partial[c->open_buckets_partial_nr++] = ++ ob - c->open_buckets; ++ } + + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); + } + + static inline bool may_alloc_bucket(struct bch_fs *c, +- struct bpos bucket, +- struct bucket_alloc_state *s) ++ struct alloc_request *req, ++ struct bpos bucket) + { + if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { +- s->skipped_open++; ++ req->counters.skipped_open++; + return false; + } + +@@ -193,60 +191,59 @@ static inline bool may_alloc_bucket(struct bch_fs *c, + bucket.inode, bucket.offset); + if (journal_seq_ready > c->journal.flushed_seq_ondisk) { + if (journal_seq_ready > c->journal.flushing_seq) +- s->need_journal_commit++; +- s->skipped_need_journal_commit++; ++ req->counters.need_journal_commit++; ++ req->counters.skipped_need_journal_commit++; + return false; + } + + if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { +- s->skipped_nocow++; ++ req->counters.skipped_nocow++; + return false; + } + + return true; + } + +-static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, ++ struct alloc_request *req, + u64 bucket, u8 gen, +- enum bch_watermark watermark, +- struct bucket_alloc_state *s, + struct closure *cl) + { ++ struct bch_dev *ca = req->ca; ++ + if (unlikely(is_superblock_bucket(c, ca, bucket))) + return NULL; + + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { +- s->skipped_nouse++; ++ req->counters.skipped_nouse++; + return NULL; + } + +- spin_lock(&c->freelist_lock); ++ guard(spinlock)(&c->freelist_lock); + +- if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { ++ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); +- spin_unlock(&c->freelist_lock); +- return ERR_PTR(-BCH_ERR_open_buckets_empty); ++ return ERR_PTR(bch_err_throw(c, open_buckets_empty)); + } + + /* Recheck under lock: */ + if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { +- spin_unlock(&c->freelist_lock); +- s->skipped_open++; ++ req->counters.skipped_open++; + return NULL; + } + + struct open_bucket *ob = bch2_open_bucket_alloc(c); + +- spin_lock(&ob->lock); +- ob->valid = true; +- ob->sectors_free = ca->mi.bucket_size; +- ob->dev = ca->dev_idx; +- ob->gen = gen; +- ob->bucket = bucket; +- spin_unlock(&ob->lock); ++ scoped_guard(spinlock, &ob->lock) { ++ ob->valid = true; ++ ob->sectors_free = ca->mi.bucket_size; ++ ob->dev = ca->dev_idx; ++ ob->gen = gen; ++ ob->bucket = bucket; ++ } + + ca->nr_open_buckets++; + bch2_open_bucket_hash_add(c, ob); +@@ -254,30 +251,28 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct 
bch_dev * + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false); + track_event_change(&c->times[BCH_TIME_blocked_allocate], false); + +- spin_unlock(&c->freelist_lock); + return ob; + } + +-static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, +- enum bch_watermark watermark, +- struct bucket_alloc_state *s, ++static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, ++ struct alloc_request *req, + struct btree_iter *freespace_iter, + struct closure *cl) + { + struct bch_fs *c = trans->c; + u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); + +- if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) ++ if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b))) + return NULL; + + u8 gen; +- int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); ++ int ret = bch2_check_discard_freespace_key_async(trans, freespace_iter, &gen); + if (ret < 0) + return ERR_PTR(ret); + if (ret) + return NULL; + +- return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); ++ return __try_alloc_bucket(c, req, b, gen, cl); + } + + /* +@@ -285,17 +280,15 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc + */ + static noinline struct open_bucket * + bch2_bucket_alloc_early(struct btree_trans *trans, +- struct bch_dev *ca, +- enum bch_watermark watermark, +- struct bucket_alloc_state *s, ++ struct alloc_request *req, + struct closure *cl) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter, citer; +- struct bkey_s_c k, ck; ++ struct bch_dev *ca = req->ca; ++ struct bkey_s_c k; + struct open_bucket *ob = NULL; + u64 first_bucket = ca->mi.first_bucket; +- u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; ++ u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; + u64 alloc_start = max(first_bucket, *dev_alloc_cursor); + u64 alloc_cursor = alloc_start; + int ret; +@@ -312,24 +305,24 @@ bch2_bucket_alloc_early(struct btree_trans *trans, + again: + for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), + BTREE_ITER_slots, k, ret) { +- u64 bucket = k.k->p.offset; ++ u64 bucket = alloc_cursor = k.k->p.offset; + + if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) + break; + +- if (s->btree_bitmap != BTREE_BITMAP_ANY && +- s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, ++ if (req->btree_bitmap != BTREE_BITMAP_ANY && ++ req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { +- if (s->btree_bitmap == BTREE_BITMAP_YES && ++ if (req->btree_bitmap == BTREE_BITMAP_YES && + bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) + break; + + bucket = sector_to_bucket(ca, + round_up(bucket_to_sector(ca, bucket) + 1, + 1ULL << ca->mi.btree_bitmap_shift)); +- bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket)); +- s->buckets_seen++; +- s->skipped_mi_btree_bitmap++; ++ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket)); ++ req->counters.buckets_seen++; ++ req->counters.skipped_mi_btree_bitmap++; + continue; + } + +@@ -339,30 +332,23 @@ bch2_bucket_alloc_early(struct btree_trans *trans, + continue; + + /* now check the cached key to serialize concurrent allocs of the bucket */ +- ck = bch2_bkey_get_iter(trans, &citer, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached); ++ CLASS(btree_iter, citer)(trans, BTREE_ID_alloc, k.k->p, BTREE_ITER_cached|BTREE_ITER_nopreserve); ++ struct bkey_s_c ck = bch2_btree_iter_peek_slot(&citer); + ret = bkey_err(ck); + if 
(ret) + break; + + a = bch2_alloc_to_v4(ck, &a_convert); +- if (a->data_type != BCH_DATA_free) +- goto next; +- +- s->buckets_seen++; ++ if (a->data_type == BCH_DATA_free) { ++ req->counters.buckets_seen++; + +- ob = may_alloc_bucket(c, k.k->p, s) +- ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, +- watermark, s, cl) +- : NULL; +-next: +- bch2_set_btree_iter_dontneed(trans, &citer); +- bch2_trans_iter_exit(trans, &citer); +- if (ob) +- break; ++ ob = may_alloc_bucket(c, req, k.k->p) ++ ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl) ++ : NULL; ++ if (ob) ++ break; ++ } + } +- bch2_trans_iter_exit(trans, &iter); +- +- alloc_cursor = iter.pos.offset; + + if (!ob && ret) + ob = ERR_PTR(ret); +@@ -378,15 +364,13 @@ bch2_bucket_alloc_early(struct btree_trans *trans, + } + + static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, +- struct bch_dev *ca, +- enum bch_watermark watermark, +- struct bucket_alloc_state *s, +- struct closure *cl) ++ struct alloc_request *req, ++ struct closure *cl) + { +- struct btree_iter iter; ++ struct bch_dev *ca = req->ca; + struct bkey_s_c k; + struct open_bucket *ob = NULL; +- u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; ++ u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); + u64 alloc_cursor = alloc_start; + int ret; +@@ -402,13 +386,13 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + iter.k.size = iter.k.p.offset - iter.pos.offset; + + while (iter.k.size) { +- s->buckets_seen++; ++ req->counters.buckets_seen++; + + u64 bucket = iter.pos.offset & ~(~0ULL << 56); +- if (s->btree_bitmap != BTREE_BITMAP_ANY && +- s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, ++ if (req->btree_bitmap != BTREE_BITMAP_ANY && ++ req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { +- if (s->btree_bitmap == BTREE_BITMAP_YES && ++ if (req->btree_bitmap == BTREE_BITMAP_YES && + bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) + goto fail; + +@@ -417,16 +401,16 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + 1ULL << ca->mi.btree_bitmap_shift)); + alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); + +- bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor)); +- s->skipped_mi_btree_bitmap++; ++ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); ++ req->counters.skipped_mi_btree_bitmap++; + goto next; + } + +- ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); ++ ob = try_alloc_bucket(trans, req, &iter, cl); + if (ob) { + if (!IS_ERR(ob)) + *dev_alloc_cursor = iter.pos.offset; +- bch2_set_btree_iter_dontneed(trans, &iter); ++ bch2_set_btree_iter_dontneed(&iter); + break; + } + +@@ -438,7 +422,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + break; + } + fail: +- bch2_trans_iter_exit(trans, &iter); + + BUG_ON(ob && ret); + +@@ -453,33 +436,30 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + return ob; + } + +-static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, +- enum bch_watermark watermark, +- enum bch_data_type data_type, ++static noinline void trace_bucket_alloc2(struct bch_fs *c, ++ struct alloc_request *req, + struct closure *cl, +- struct bch_dev_usage *usage, +- struct bucket_alloc_state *s, + struct open_bucket *ob) + { +- struct 
printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + printbuf_tabstop_push(&buf, 24); + +- prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx); +- prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]); +- prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]); ++ prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx); ++ prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]); ++ prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]); + prt_printf(&buf, "blocking\t%u\n", cl != NULL); +- prt_printf(&buf, "free\t%llu\n", usage->buckets[BCH_DATA_free]); +- prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark)); +- prt_printf(&buf, "copygc_wait\t%lu/%lli\n", ++ prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]); ++ prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark)); ++ prt_printf(&buf, "copygc_wait\t%llu/%lli\n", + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); +- prt_printf(&buf, "seen\t%llu\n", s->buckets_seen); +- prt_printf(&buf, "open\t%llu\n", s->skipped_open); +- prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit); +- prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow); +- prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse); +- prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap); ++ prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen); ++ prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open); ++ prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit); ++ prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow); ++ prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse); ++ prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap); + + if (!IS_ERR(ob)) { + prt_printf(&buf, "allocated\t%llu\n", ob->bucket); +@@ -488,54 +468,48 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, + prt_printf(&buf, "err\t%s\n", bch2_err_str(PTR_ERR(ob))); + trace_bucket_alloc_fail(c, buf.buf); + } +- +- printbuf_exit(&buf); + } + + /** + * bch2_bucket_alloc_trans - allocate a single bucket from a specific device + * @trans: transaction object +- * @ca: device to allocate from +- * @watermark: how important is this allocation? +- * @data_type: BCH_DATA_journal, btree, user... ++ * @req: state for the entire allocation + * @cl: if not NULL, closure to be used to wait if buckets not available + * @nowait: if true, do not wait for buckets to become available +- * @usage: for secondarily also returning the current device usage + * + * Returns: an open_bucket on success, or an ERR_PTR() on failure. 
+ */ + static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, +- struct bch_dev *ca, +- enum bch_watermark watermark, +- enum bch_data_type data_type, +- struct closure *cl, +- bool nowait, +- struct bch_dev_usage *usage) ++ struct alloc_request *req, ++ struct closure *cl, ++ bool nowait) + { + struct bch_fs *c = trans->c; ++ struct bch_dev *ca = req->ca; + struct open_bucket *ob = NULL; + bool freespace = READ_ONCE(ca->mi.freespace_initialized); + u64 avail; +- struct bucket_alloc_state s = { +- .btree_bitmap = data_type == BCH_DATA_btree, +- }; + bool waiting = nowait; ++ ++ req->btree_bitmap = req->data_type == BCH_DATA_btree; ++ memset(&req->counters, 0, sizeof(req->counters)); + again: +- bch2_dev_usage_read_fast(ca, usage); +- avail = dev_buckets_free(ca, *usage, watermark); ++ bch2_dev_usage_read_fast(ca, &req->usage); ++ avail = dev_buckets_free(ca, req->usage, req->watermark); + +- if (usage->buckets[BCH_DATA_need_discard] > avail) ++ if (req->usage.buckets[BCH_DATA_need_discard] > ++ min(avail, ca->mi.nbuckets >> 7)) + bch2_dev_do_discards(ca); + +- if (usage->buckets[BCH_DATA_need_gc_gens] > avail) ++ if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail) + bch2_gc_gens_async(c); + +- if (should_invalidate_buckets(ca, *usage)) ++ if (should_invalidate_buckets(ca, req->usage)) + bch2_dev_do_invalidates(ca); + + if (!avail) { +- if (watermark > BCH_WATERMARK_normal && +- c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) ++ if (req->watermark > BCH_WATERMARK_normal && ++ c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) + goto alloc; + + if (cl && !waiting) { +@@ -546,7 +520,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + + track_event_change(&c->times[BCH_TIME_blocked_allocate], true); + +- ob = ERR_PTR(-BCH_ERR_freelist_empty); ++ ob = ERR_PTR(bch_err_throw(c, freelist_empty)); + goto err; + } + +@@ -554,27 +528,27 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + closure_wake_up(&c->freelist_wait); + alloc: + ob = likely(freespace) +- ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) +- : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); ++ ? bch2_bucket_alloc_freelist(trans, req, cl) ++ : bch2_bucket_alloc_early(trans, req, cl); + +- if (s.need_journal_commit * 2 > avail) ++ if (req->counters.need_journal_commit * 2 > avail) + bch2_journal_flush_async(&c->journal, NULL); + +- if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { +- s.btree_bitmap = BTREE_BITMAP_ANY; ++ if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) { ++ req->btree_bitmap = BTREE_BITMAP_ANY; + goto alloc; + } + +- if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { ++ if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) { + freespace = false; + goto alloc; + } + err: + if (!ob) +- ob = ERR_PTR(-BCH_ERR_no_buckets_found); ++ ob = ERR_PTR(bch_err_throw(c, no_buckets_found)); + + if (!IS_ERR(ob)) +- ob->data_type = data_type; ++ ob->data_type = req->data_type; + + if (!IS_ERR(ob)) + count_event(c, bucket_alloc); +@@ -584,7 +558,7 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + if (!IS_ERR(ob) + ? 
trace_bucket_alloc_enabled() + : trace_bucket_alloc_fail_enabled()) +- trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob); ++ trace_bucket_alloc2(c, req, cl, ob); + + return ob; + } +@@ -594,12 +568,16 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum bch_data_type data_type, + struct closure *cl) + { +- struct bch_dev_usage usage; + struct open_bucket *ob; ++ struct alloc_request req = { ++ .watermark = watermark, ++ .data_type = data_type, ++ .ca = ca, ++ }; + +- bch2_trans_do(c, +- PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, +- data_type, cl, false, &usage))); ++ CLASS(btree_trans, trans)(c); ++ lockrestart_do(trans, ++ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false))); + return ob; + } + +@@ -611,18 +589,18 @@ static int __dev_stripe_cmp(struct dev_stripe_state *stripe, + + #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) + +-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, +- struct dev_stripe_state *stripe, +- struct bch_devs_mask *devs) ++void bch2_dev_alloc_list(struct bch_fs *c, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs, ++ struct dev_alloc_list *ret) + { +- struct dev_alloc_list ret = { .nr = 0 }; +- unsigned i; ++ ret->nr = 0; + ++ unsigned i; + for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) +- ret.data[ret.nr++] = i; ++ ret->data[ret->nr++] = i; + +- bubble_sort(ret.data, ret.nr, dev_stripe_cmp); +- return ret; ++ bubble_sort(ret->data, ret->nr, dev_stripe_cmp); + } + + static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */ +@@ -693,64 +671,53 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, + } + + static int add_new_bucket(struct bch_fs *c, +- struct open_buckets *ptrs, +- struct bch_devs_mask *devs_may_alloc, +- unsigned nr_replicas, +- unsigned *nr_effective, +- bool *have_cache, +- struct open_bucket *ob) ++ struct alloc_request *req, ++ struct open_bucket *ob) + { + unsigned durability = ob_dev(c, ob)->mi.durability; + +- BUG_ON(*nr_effective >= nr_replicas); ++ BUG_ON(req->nr_effective >= req->nr_replicas); + +- __clear_bit(ob->dev, devs_may_alloc->d); +- *nr_effective += durability; +- *have_cache |= !durability; ++ __clear_bit(ob->dev, req->devs_may_alloc.d); ++ req->nr_effective += durability; ++ req->have_cache |= !durability; + +- ob_push(c, ptrs, ob); ++ ob_push(c, &req->ptrs, ob); + +- if (*nr_effective >= nr_replicas) ++ if (req->nr_effective >= req->nr_replicas) + return 1; + if (ob->ec) + return 1; + return 0; + } + +-int bch2_bucket_alloc_set_trans(struct btree_trans *trans, +- struct open_buckets *ptrs, +- struct dev_stripe_state *stripe, +- struct bch_devs_mask *devs_may_alloc, +- unsigned nr_replicas, +- unsigned *nr_effective, +- bool *have_cache, +- enum bch_write_flags flags, +- enum bch_data_type data_type, +- enum bch_watermark watermark, +- struct closure *cl) ++inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans, ++ struct alloc_request *req, ++ struct dev_stripe_state *stripe, ++ struct closure *cl) + { + struct bch_fs *c = trans->c; +- int ret = -BCH_ERR_insufficient_devices; ++ int ret = 0; ++ ++ BUG_ON(req->nr_effective >= req->nr_replicas); + +- BUG_ON(*nr_effective >= nr_replicas); ++ bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted); + +- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); +- darray_for_each(devs_sorted, i) { +- struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); +- if (!ca) ++ 
darray_for_each(req->devs_sorted, i) { ++ req->ca = bch2_dev_tryget_noerror(c, *i); ++ if (!req->ca) + continue; + +- if (!ca->mi.durability && *have_cache) { +- bch2_dev_put(ca); ++ if (!req->ca->mi.durability && req->have_cache) { ++ bch2_dev_put(req->ca); + continue; + } + +- struct bch_dev_usage usage; +- struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, +- cl, flags & BCH_WRITE_alloc_nowait, &usage); ++ struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl, ++ req->flags & BCH_WRITE_alloc_nowait); + if (!IS_ERR(ob)) +- bch2_dev_stripe_increment_inlined(ca, stripe, &usage); +- bch2_dev_put(ca); ++ bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage); ++ bch2_dev_put(req->ca); + + if (IS_ERR(ob)) { + ret = PTR_ERR(ob); +@@ -759,15 +726,16 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + continue; + } + +- if (add_new_bucket(c, ptrs, devs_may_alloc, +- nr_replicas, nr_effective, +- have_cache, ob)) { +- ret = 0; ++ ret = add_new_bucket(c, req, ob); ++ if (ret) + break; +- } + } + +- return ret; ++ if (ret == 1) ++ return 0; ++ if (ret) ++ return ret; ++ return bch_err_throw(c, insufficient_devices); + } + + /* Allocate from stripes: */ +@@ -779,35 +747,28 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + */ + + static int bucket_alloc_from_stripe(struct btree_trans *trans, +- struct open_buckets *ptrs, +- struct write_point *wp, +- struct bch_devs_mask *devs_may_alloc, +- u16 target, +- unsigned nr_replicas, +- unsigned *nr_effective, +- bool *have_cache, +- enum bch_watermark watermark, +- enum bch_write_flags flags, +- struct closure *cl) ++ struct alloc_request *req, ++ struct closure *cl) + { + struct bch_fs *c = trans->c; + int ret = 0; + +- if (nr_replicas < 2) ++ if (req->nr_replicas < 2) + return 0; + +- if (ec_open_bucket(c, ptrs)) ++ if (ec_open_bucket(c, &req->ptrs)) + return 0; + + struct ec_stripe_head *h = +- bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); ++ bch2_ec_stripe_head_get(trans, req, 0, cl); + if (IS_ERR(h)) + return PTR_ERR(h); + if (!h) + return 0; + +- struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); +- darray_for_each(devs_sorted, i) ++ bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted); ++ ++ darray_for_each(req->devs_sorted, i) + for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + if (!h->s->blocks[ec_idx]) + continue; +@@ -818,9 +779,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, + ob->ec = h->s; + ec_stripe_new_get(h->s, STRIPE_REF_io); + +- ret = add_new_bucket(c, ptrs, devs_may_alloc, +- nr_replicas, nr_effective, +- have_cache, ob); ++ ret = add_new_bucket(c, req, ob); + goto out; + } + } +@@ -832,86 +791,67 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, + /* Sector allocator */ + + static bool want_bucket(struct bch_fs *c, +- struct write_point *wp, +- struct bch_devs_mask *devs_may_alloc, +- bool *have_cache, bool ec, ++ struct alloc_request *req, + struct open_bucket *ob) + { + struct bch_dev *ca = ob_dev(c, ob); + +- if (!test_bit(ob->dev, devs_may_alloc->d)) ++ if (!test_bit(ob->dev, req->devs_may_alloc.d)) + return false; + +- if (ob->data_type != wp->data_type) ++ if (ob->data_type != req->wp->data_type) + return false; + + if (!ca->mi.durability && +- (wp->data_type == BCH_DATA_btree || ec || *have_cache)) ++ (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache)) + return false; + +- if (ec != (ob->ec 
!= NULL)) ++ if (req->ec != (ob->ec != NULL)) + return false; + + return true; + } + + static int bucket_alloc_set_writepoint(struct bch_fs *c, +- struct open_buckets *ptrs, +- struct write_point *wp, +- struct bch_devs_mask *devs_may_alloc, +- unsigned nr_replicas, +- unsigned *nr_effective, +- bool *have_cache, +- bool ec) ++ struct alloc_request *req) + { +- struct open_buckets ptrs_skip = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + int ret = 0; + +- open_bucket_for_each(c, &wp->ptrs, ob, i) { +- if (!ret && want_bucket(c, wp, devs_may_alloc, +- have_cache, ec, ob)) +- ret = add_new_bucket(c, ptrs, devs_may_alloc, +- nr_replicas, nr_effective, +- have_cache, ob); ++ req->scratch_ptrs.nr = 0; ++ ++ open_bucket_for_each(c, &req->wp->ptrs, ob, i) { ++ if (!ret && want_bucket(c, req, ob)) ++ ret = add_new_bucket(c, req, ob); + else +- ob_push(c, &ptrs_skip, ob); ++ ob_push(c, &req->scratch_ptrs, ob); + } +- wp->ptrs = ptrs_skip; ++ req->wp->ptrs = req->scratch_ptrs; + + return ret; + } + + static int bucket_alloc_set_partial(struct bch_fs *c, +- struct open_buckets *ptrs, +- struct write_point *wp, +- struct bch_devs_mask *devs_may_alloc, +- unsigned nr_replicas, +- unsigned *nr_effective, +- bool *have_cache, bool ec, +- enum bch_watermark watermark) ++ struct alloc_request *req) + { +- int i, ret = 0; +- + if (!c->open_buckets_partial_nr) + return 0; + +- spin_lock(&c->freelist_lock); ++ guard(spinlock)(&c->freelist_lock); + + if (!c->open_buckets_partial_nr) +- goto unlock; ++ return 0; + +- for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { ++ for (int i = c->open_buckets_partial_nr - 1; i >= 0; --i) { + struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; + +- if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { ++ if (want_bucket(c, req, ob)) { + struct bch_dev *ca = ob_dev(c, ob); +- struct bch_dev_usage usage; + u64 avail; + +- bch2_dev_usage_read_fast(ca, &usage); +- avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets; ++ bch2_dev_usage_read_fast(ca, &req->usage); ++ avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets; + if (!avail) + continue; + +@@ -920,78 +860,54 @@ static int bucket_alloc_set_partial(struct bch_fs *c, + i); + ob->on_partial_list = false; + +- rcu_read_lock(); +- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; +- rcu_read_unlock(); ++ scoped_guard(rcu) ++ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; + +- ret = add_new_bucket(c, ptrs, devs_may_alloc, +- nr_replicas, nr_effective, +- have_cache, ob); ++ int ret = add_new_bucket(c, req, ob); + if (ret) +- break; ++ return ret; + } + } +-unlock: +- spin_unlock(&c->freelist_lock); +- return ret; ++ ++ return 0; + } + + static int __open_bucket_add_buckets(struct btree_trans *trans, +- struct open_buckets *ptrs, +- struct write_point *wp, +- struct bch_devs_list *devs_have, +- u16 target, +- bool erasure_code, +- unsigned nr_replicas, +- unsigned *nr_effective, +- bool *have_cache, +- enum bch_watermark watermark, +- enum bch_write_flags flags, +- struct closure *_cl) ++ struct alloc_request *req, ++ struct closure *_cl) + { + struct bch_fs *c = trans->c; +- struct bch_devs_mask devs; + struct open_bucket *ob; + struct closure *cl = NULL; + unsigned i; + int ret; + +- devs = target_rw_devs(c, wp->data_type, target); ++ req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target); + + /* Don't allocate from devices we already have pointers to: */ +- darray_for_each(*devs_have, i) +- __clear_bit(*i, devs.d); ++ 
darray_for_each(*req->devs_have, i) ++ __clear_bit(*i, req->devs_may_alloc.d); + +- open_bucket_for_each(c, ptrs, ob, i) +- __clear_bit(ob->dev, devs.d); ++ open_bucket_for_each(c, &req->ptrs, ob, i) ++ __clear_bit(ob->dev, req->devs_may_alloc.d); + +- ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, +- nr_replicas, nr_effective, +- have_cache, erasure_code); ++ ret = bucket_alloc_set_writepoint(c, req); + if (ret) + return ret; + +- ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, +- nr_replicas, nr_effective, +- have_cache, erasure_code, watermark); ++ ret = bucket_alloc_set_partial(c, req); + if (ret) + return ret; + +- if (erasure_code) { +- ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, +- target, +- nr_replicas, nr_effective, +- have_cache, +- watermark, flags, _cl); ++ if (req->ec) { ++ ret = bucket_alloc_from_stripe(trans, req, _cl); + } else { + retry_blocking: + /* + * Try nonblocking first, so that if one device is full we'll try from + * other devices: + */ +- ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, +- nr_replicas, nr_effective, have_cache, +- flags, wp->data_type, watermark, cl); ++ ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl); + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && +@@ -1005,38 +921,27 @@ static int __open_bucket_add_buckets(struct btree_trans *trans, + } + + static int open_bucket_add_buckets(struct btree_trans *trans, +- struct open_buckets *ptrs, +- struct write_point *wp, +- struct bch_devs_list *devs_have, +- u16 target, +- unsigned erasure_code, +- unsigned nr_replicas, +- unsigned *nr_effective, +- bool *have_cache, +- enum bch_watermark watermark, +- enum bch_write_flags flags, +- struct closure *cl) ++ struct alloc_request *req, ++ struct closure *cl) + { + int ret; + +- if (erasure_code && !ec_open_bucket(trans->c, ptrs)) { +- ret = __open_bucket_add_buckets(trans, ptrs, wp, +- devs_have, target, erasure_code, +- nr_replicas, nr_effective, have_cache, +- watermark, flags, cl); ++ if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) { ++ ret = __open_bucket_add_buckets(trans, req, cl); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_operation_blocked) || + bch2_err_matches(ret, BCH_ERR_freelist_empty) || + bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + return ret; +- if (*nr_effective >= nr_replicas) ++ if (req->nr_effective >= req->nr_replicas) + return 0; + } + +- ret = __open_bucket_add_buckets(trans, ptrs, wp, +- devs_have, target, false, +- nr_replicas, nr_effective, have_cache, +- watermark, flags, cl); ++ bool ec = false; ++ swap(ec, req->ec); ++ ret = __open_bucket_add_buckets(trans, req, cl); ++ swap(ec, req->ec); ++ + return ret < 0 ? 
ret : 0; + } + +@@ -1060,23 +965,18 @@ static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, + return ob->ec != NULL; + } else if (ca) { + bool drop = ob->dev == ca->dev_idx; +- struct open_bucket *ob2; +- unsigned i; + + if (!drop && ob->ec) { +- unsigned nr_blocks; +- +- mutex_lock(&ob->ec->lock); +- nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; ++ guard(mutex)(&ob->ec->lock); ++ unsigned nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; + +- for (i = 0; i < nr_blocks; i++) { ++ for (unsigned i = 0; i < nr_blocks; i++) { + if (!ob->ec->blocks[i]) + continue; + +- ob2 = c->open_buckets + ob->ec->blocks[i]; ++ struct open_bucket *ob2 = c->open_buckets + ob->ec->blocks[i]; + drop |= ob2->dev == ca->dev_idx; + } +- mutex_unlock(&ob->ec->lock); + } + + return drop; +@@ -1092,14 +992,13 @@ static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, + struct open_bucket *ob; + unsigned i; + +- mutex_lock(&wp->lock); ++ guard(mutex)(&wp->lock); + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (should_drop_bucket(ob, c, ca, ec)) + bch2_open_bucket_put(c, ob); + else + ob_push(c, &ptrs, ob); + wp->ptrs = ptrs; +- mutex_unlock(&wp->lock); + } + + void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, +@@ -1115,40 +1014,37 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, + bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point); + bch2_writepoint_stop(c, ca, ec, &c->btree_write_point); + +- mutex_lock(&c->btree_reserve_cache_lock); +- while (c->btree_reserve_cache_nr) { +- struct btree_alloc *a = +- &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ scoped_guard(mutex, &c->btree_reserve_cache_lock) ++ while (c->btree_reserve_cache_nr) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + +- bch2_open_buckets_put(c, &a->ob); +- } +- mutex_unlock(&c->btree_reserve_cache_lock); ++ bch2_open_buckets_put(c, &a->ob); ++ } + +- spin_lock(&c->freelist_lock); + i = 0; +- while (i < c->open_buckets_partial_nr) { +- struct open_bucket *ob = +- c->open_buckets + c->open_buckets_partial[i]; +- +- if (should_drop_bucket(ob, c, ca, ec)) { +- --c->open_buckets_partial_nr; +- swap(c->open_buckets_partial[i], +- c->open_buckets_partial[c->open_buckets_partial_nr]); +- +- ob->on_partial_list = false; +- +- rcu_read_lock(); +- bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; +- rcu_read_unlock(); +- +- spin_unlock(&c->freelist_lock); +- bch2_open_bucket_put(c, ob); +- spin_lock(&c->freelist_lock); +- } else { +- i++; ++ scoped_guard(spinlock, &c->freelist_lock) ++ while (i < c->open_buckets_partial_nr) { ++ struct open_bucket *ob = ++ c->open_buckets + c->open_buckets_partial[i]; ++ ++ if (should_drop_bucket(ob, c, ca, ec)) { ++ --c->open_buckets_partial_nr; ++ swap(c->open_buckets_partial[i], ++ c->open_buckets_partial[c->open_buckets_partial_nr]); ++ ++ ob->on_partial_list = false; ++ ++ scoped_guard(rcu) ++ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; ++ ++ spin_unlock(&c->freelist_lock); ++ bch2_open_bucket_put(c, ob); ++ spin_lock(&c->freelist_lock); ++ } else { ++ i++; ++ } + } +- } +- spin_unlock(&c->freelist_lock); + + bch2_ec_stop_dev(c, ca); + } +@@ -1167,14 +1063,11 @@ static struct write_point *__writepoint_find(struct hlist_head *head, + { + struct write_point *wp; + +- rcu_read_lock(); ++ guard(rcu)(); + hlist_for_each_entry_rcu(wp, head, node) + if (wp->write_point == write_point) +- goto out; +- wp = NULL; +-out: +- rcu_read_unlock(); +- return wp; ++ return 
wp; ++ return NULL; + } + + static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) +@@ -1185,7 +1078,7 @@ static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) + return stranded * factor > free; + } + +-static bool try_increase_writepoints(struct bch_fs *c) ++static noinline bool try_increase_writepoints(struct bch_fs *c) + { + struct write_point *wp; + +@@ -1198,29 +1091,24 @@ static bool try_increase_writepoints(struct bch_fs *c) + return true; + } + +-static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) ++static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) + { + struct bch_fs *c = trans->c; + struct write_point *wp; + struct open_bucket *ob; + unsigned i; + +- mutex_lock(&c->write_points_hash_lock); +- if (c->write_points_nr < old_nr) { +- mutex_unlock(&c->write_points_hash_lock); +- return true; +- } +- +- if (c->write_points_nr == 1 || +- !too_many_writepoints(c, 8)) { +- mutex_unlock(&c->write_points_hash_lock); +- return false; +- } ++ scoped_guard(mutex, &c->write_points_hash_lock) { ++ if (c->write_points_nr < old_nr) ++ return true; + +- wp = c->write_points + --c->write_points_nr; ++ if (c->write_points_nr == 1 || ++ !too_many_writepoints(c, 8)) ++ return false; + +- hlist_del_rcu(&wp->node); +- mutex_unlock(&c->write_points_hash_lock); ++ wp = c->write_points + --c->write_points_nr; ++ hlist_del_rcu(&wp->node); ++ } + + bch2_trans_mutex_lock_norelock(trans, &wp->lock); + open_bucket_for_each(c, &wp->ptrs, ob, i) +@@ -1289,26 +1177,26 @@ static struct write_point *writepoint_find(struct btree_trans *trans, + + static noinline void + deallocate_extra_replicas(struct bch_fs *c, +- struct open_buckets *ptrs, +- struct open_buckets *ptrs_no_use, +- unsigned extra_replicas) ++ struct alloc_request *req) + { +- struct open_buckets ptrs2 = { 0 }; + struct open_bucket *ob; ++ unsigned extra_replicas = req->nr_effective - req->nr_replicas; + unsigned i; + +- open_bucket_for_each(c, ptrs, ob, i) { ++ req->scratch_ptrs.nr = 0; ++ ++ open_bucket_for_each(c, &req->ptrs, ob, i) { + unsigned d = ob_dev(c, ob)->mi.durability; + + if (d && d <= extra_replicas) { + extra_replicas -= d; +- ob_push(c, ptrs_no_use, ob); ++ ob_push(c, &req->wp->ptrs, ob); + } else { +- ob_push(c, &ptrs2, ob); ++ ob_push(c, &req->scratch_ptrs, ob); + } + } + +- *ptrs = ptrs2; ++ req->ptrs = req->scratch_ptrs; + } + + /* +@@ -1327,51 +1215,53 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, + struct write_point **wp_ret) + { + struct bch_fs *c = trans->c; +- struct write_point *wp; + struct open_bucket *ob; +- struct open_buckets ptrs; +- unsigned nr_effective, write_points_nr; +- bool have_cache; +- int ret; ++ unsigned write_points_nr; + int i; + ++ struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req)); ++ int ret = PTR_ERR_OR_ZERO(req); ++ if (unlikely(ret)) ++ return ret; ++ + if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) + erasure_code = false; + ++ req->nr_replicas = nr_replicas; ++ req->target = target; ++ req->ec = erasure_code; ++ req->watermark = watermark; ++ req->flags = flags; ++ req->devs_have = devs_have; ++ + BUG_ON(!nr_replicas || !nr_replicas_required); + retry: +- ptrs.nr = 0; +- nr_effective = 0; +- write_points_nr = c->write_points_nr; +- have_cache = false; ++ req->ptrs.nr = 0; ++ req->nr_effective = 0; ++ req->have_cache = false; ++ write_points_nr = c->write_points_nr; + +- *wp_ret = wp = writepoint_find(trans, write_point.v); ++ *wp_ret = req->wp 
= writepoint_find(trans, write_point.v); ++ ++ req->data_type = req->wp->data_type; + + ret = bch2_trans_relock(trans); + if (ret) + goto err; + + /* metadata may not allocate on cache devices: */ +- if (wp->data_type != BCH_DATA_user) +- have_cache = true; ++ if (req->data_type != BCH_DATA_user) ++ req->have_cache = true; + + if (target && !(flags & BCH_WRITE_only_specified_devs)) { +- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, +- target, erasure_code, +- nr_replicas, &nr_effective, +- &have_cache, watermark, +- flags, NULL); ++ ret = open_bucket_add_buckets(trans, req, NULL); + if (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto alloc_done; + + /* Don't retry from all devices if we're out of open buckets: */ + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { +- int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, +- target, erasure_code, +- nr_replicas, &nr_effective, +- &have_cache, watermark, +- flags, cl); ++ int ret2 = open_bucket_add_buckets(trans, req, cl); + if (!ret2 || + bch2_err_matches(ret2, BCH_ERR_transaction_restart) || + bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { +@@ -1384,45 +1274,38 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, + * Only try to allocate cache (durability = 0 devices) from the + * specified target: + */ +- have_cache = true; ++ req->have_cache = true; ++ req->target = 0; + +- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, +- 0, erasure_code, +- nr_replicas, &nr_effective, +- &have_cache, watermark, +- flags, cl); ++ ret = open_bucket_add_buckets(trans, req, cl); + } else { +- ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, +- target, erasure_code, +- nr_replicas, &nr_effective, +- &have_cache, watermark, +- flags, cl); ++ ret = open_bucket_add_buckets(trans, req, cl); + } + alloc_done: +- BUG_ON(!ret && nr_effective < nr_replicas); ++ BUG_ON(!ret && req->nr_effective < req->nr_replicas); + +- if (erasure_code && !ec_open_bucket(c, &ptrs)) ++ if (erasure_code && !ec_open_bucket(c, &req->ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + + if (ret == -BCH_ERR_insufficient_devices && +- nr_effective >= nr_replicas_required) ++ req->nr_effective >= nr_replicas_required) + ret = 0; + + if (ret) + goto err; + +- if (nr_effective > nr_replicas) +- deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas); ++ if (req->nr_effective > req->nr_replicas) ++ deallocate_extra_replicas(c, req); + + /* Free buckets we didn't use: */ +- open_bucket_for_each(c, &wp->ptrs, ob, i) ++ open_bucket_for_each(c, &req->wp->ptrs, ob, i) + open_bucket_free_unused(c, ob); + +- wp->ptrs = ptrs; ++ req->wp->ptrs = req->ptrs; + +- wp->sectors_free = UINT_MAX; ++ req->wp->sectors_free = UINT_MAX; + +- open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ open_bucket_for_each(c, &req->wp->ptrs, ob, i) { + /* + * Ensure proper write alignment - either due to misaligned + * bucket sizes (from buggy bcachefs-tools), or writes that mix +@@ -1436,58 +1319,44 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, + + ob->sectors_free = max_t(int, 0, ob->sectors_free - align); + +- wp->sectors_free = min(wp->sectors_free, ob->sectors_free); ++ req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free); + } + +- wp->sectors_free = rounddown(wp->sectors_free, block_sectors(c)); ++ req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c)); + + /* Did alignment use up space in an open_bucket? 
*/ +- if (unlikely(!wp->sectors_free)) { +- bch2_alloc_sectors_done(c, wp); ++ if (unlikely(!req->wp->sectors_free)) { ++ bch2_alloc_sectors_done(c, req->wp); + goto retry; + } + +- BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); ++ BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX); + + return 0; + err: +- open_bucket_for_each(c, &wp->ptrs, ob, i) +- if (ptrs.nr < ARRAY_SIZE(ptrs.v)) +- ob_push(c, &ptrs, ob); ++ open_bucket_for_each(c, &req->wp->ptrs, ob, i) ++ if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v)) ++ ob_push(c, &req->ptrs, ob); + else + open_bucket_free_unused(c, ob); +- wp->ptrs = ptrs; ++ req->wp->ptrs = req->ptrs; + +- mutex_unlock(&wp->lock); ++ mutex_unlock(&req->wp->lock); + + if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && + try_decrease_writepoints(trans, write_points_nr)) + goto retry; + + if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) +- ret = -BCH_ERR_bucket_alloc_blocked; ++ ret = bch_err_throw(c, bucket_alloc_blocked); + + if (cl && !(flags & BCH_WRITE_alloc_nowait) && + bch2_err_matches(ret, BCH_ERR_freelist_empty)) +- ret = -BCH_ERR_bucket_alloc_blocked; ++ ret = bch_err_throw(c, bucket_alloc_blocked); + + return ret; + } + +-struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) +-{ +- struct bch_dev *ca = ob_dev(c, ob); +- +- return (struct bch_extent_ptr) { +- .type = 1 << BCH_EXTENT_ENTRY_ptr, +- .gen = ob->gen, +- .dev = ob->dev, +- .offset = bucket_to_sector(ca, ob->bucket) + +- ca->mi.bucket_size - +- ob->sectors_free, +- }; +-} +- + void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, + struct bkey_i *k, unsigned sectors, + bool cached) +@@ -1573,35 +1442,25 @@ void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct ope + void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca) + { +- struct open_bucket *ob; +- +- out->atomic++; ++ guard(printbuf_atomic)(out); + +- for (ob = c->open_buckets; ++ for (struct open_bucket *ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { +- spin_lock(&ob->lock); ++ guard(spinlock)(&ob->lock); + if (ob->valid && (!ca || ob->dev == ca->dev_idx)) + bch2_open_bucket_to_text(out, c, ob); +- spin_unlock(&ob->lock); + } +- +- --out->atomic; + } + + void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c) + { +- unsigned i; +- +- out->atomic++; +- spin_lock(&c->freelist_lock); ++ guard(printbuf_atomic)(out); ++ guard(spinlock)(&c->freelist_lock); + +- for (i = 0; i < c->open_buckets_partial_nr; i++) ++ for (unsigned i = 0; i < c->open_buckets_partial_nr; i++) + bch2_open_bucket_to_text(out, c, + c->open_buckets + c->open_buckets_partial[i]); +- +- spin_unlock(&c->freelist_lock); +- --out->atomic; + } + + static const char * const bch2_write_point_states[] = { +@@ -1617,6 +1476,8 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, + struct open_bucket *ob; + unsigned i; + ++ guard(mutex)(&wp->lock); ++ + prt_printf(out, "%lu: ", wp->write_point); + prt_human_readable_u64(out, wp->sectors_allocated << 9); + +@@ -1720,7 +1581,7 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) + + static noinline void bch2_print_allocator_stuck(struct bch_fs *c) + { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + prt_printf(&buf, "Allocator stuck? 
Waited for %u seconds\n", + c->opts.allocator_stuck_timeout); +@@ -1731,12 +1592,17 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + +- for_each_online_member(c, ca) { +- prt_printf(&buf, "Dev %u:\n", ca->dev_idx); +- printbuf_indent_add(&buf, 2); +- bch2_dev_alloc_debug_to_text(&buf, ca); +- printbuf_indent_sub(&buf, 2); +- prt_newline(&buf); ++ bch2_printbuf_make_room(&buf, 4096); ++ ++ scoped_guard(rcu) { ++ guard(printbuf_atomic)(&buf); ++ for_each_online_member_rcu(c, ca) { ++ prt_printf(&buf, "Dev %u:\n", ca->dev_idx); ++ printbuf_indent_add(&buf, 2); ++ bch2_dev_alloc_debug_to_text(&buf, ca); ++ printbuf_indent_sub(&buf, 2); ++ prt_newline(&buf); ++ } + } + + prt_printf(&buf, "Copygc debug:\n"); +@@ -1750,8 +1616,7 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) + bch2_journal_debug_to_text(&buf, &c->journal); + printbuf_indent_sub(&buf, 2); + +- bch2_print_string_as_lines(KERN_ERR, buf.buf); +- printbuf_exit(&buf); ++ bch2_print_str(c, KERN_ERR, buf.buf); + } + + static inline unsigned allocator_wait_timeout(struct bch_fs *c) +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index 4c1e33cf57c0..02aef66859c3 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -3,8 +3,10 @@ + #define _BCACHEFS_ALLOC_FOREGROUND_H + + #include "bcachefs.h" ++#include "buckets.h" + #include "alloc_types.h" + #include "extents.h" ++#include "io_write_types.h" + #include "sb-members.h" + + #include +@@ -23,9 +25,57 @@ struct dev_alloc_list { + u8 data[BCH_SB_MEMBERS_MAX]; + }; + +-struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, +- struct dev_stripe_state *, +- struct bch_devs_mask *); ++struct alloc_request { ++ unsigned nr_replicas; ++ unsigned target; ++ bool ec; ++ enum bch_watermark watermark; ++ enum bch_write_flags flags; ++ enum bch_data_type data_type; ++ struct bch_devs_list *devs_have; ++ struct write_point *wp; ++ ++ /* These fields are used primarily by open_bucket_add_buckets */ ++ struct open_buckets ptrs; ++ unsigned nr_effective; /* sum of @ptrs durability */ ++ bool have_cache; /* have we allocated from a 0 durability dev */ ++ struct bch_devs_mask devs_may_alloc; ++ ++ /* bch2_bucket_alloc_set_trans(): */ ++ struct dev_alloc_list devs_sorted; ++ struct bch_dev_usage usage; ++ ++ /* bch2_bucket_alloc_trans(): */ ++ struct bch_dev *ca; ++ ++ enum { ++ BTREE_BITMAP_NO, ++ BTREE_BITMAP_YES, ++ BTREE_BITMAP_ANY, ++ } btree_bitmap; ++ ++ struct { ++ u64 buckets_seen; ++ u64 skipped_open; ++ u64 skipped_need_journal_commit; ++ u64 need_journal_commit; ++ u64 skipped_nocow; ++ u64 skipped_nouse; ++ u64 skipped_mi_btree_bitmap; ++ } counters; ++ ++ unsigned scratch_nr_replicas; ++ unsigned scratch_nr_effective; ++ bool scratch_have_cache; ++ enum bch_data_type scratch_data_type; ++ struct open_buckets scratch_ptrs; ++ struct bch_devs_mask scratch_devs_may_alloc; ++}; ++ ++void bch2_dev_alloc_list(struct bch_fs *, ++ struct dev_stripe_state *, ++ struct bch_devs_mask *, ++ struct dev_alloc_list *); + void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); + + static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) +@@ -160,24 +210,16 @@ static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucke + + static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) + { +- bool ret; +- + if (bch2_bucket_is_open(c, dev, bucket)) + return true; + 
+- spin_lock(&c->freelist_lock); +- ret = bch2_bucket_is_open(c, dev, bucket); +- spin_unlock(&c->freelist_lock); +- +- return ret; ++ guard(spinlock)(&c->freelist_lock); ++ return bch2_bucket_is_open(c, dev, bucket); + } + + enum bch_write_flags; +-int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, +- struct dev_stripe_state *, struct bch_devs_mask *, +- unsigned, unsigned *, bool *, enum bch_write_flags, +- enum bch_data_type, enum bch_watermark, +- struct closure *); ++int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, ++ struct dev_stripe_state *, struct closure *); + + int bch2_alloc_sectors_start_trans(struct btree_trans *, + unsigned, unsigned, +@@ -189,7 +231,19 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *, + struct closure *, + struct write_point **); + +-struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); ++static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct bch_dev *ca = ob_dev(c, ob); ++ ++ return (struct bch_extent_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_ptr, ++ .gen = ob->gen, ++ .dev = ob->dev, ++ .offset = bucket_to_sector(ca, ob->bucket) + ++ ca->mi.bucket_size - ++ ob->sectors_free, ++ }; ++} + + /* + * Append pointers to the space we just allocated to @k, and mark @sectors space +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +index 8f79f46c2a78..e7becdf22cba 100644 +--- a/fs/bcachefs/alloc_types.h ++++ b/fs/bcachefs/alloc_types.h +@@ -8,22 +8,6 @@ + #include "clock_types.h" + #include "fifo.h" + +-struct bucket_alloc_state { +- enum { +- BTREE_BITMAP_NO, +- BTREE_BITMAP_YES, +- BTREE_BITMAP_ANY, +- } btree_bitmap; +- +- u64 buckets_seen; +- u64 skipped_open; +- u64 skipped_need_journal_commit; +- u64 need_journal_commit; +- u64 skipped_nocow; +- u64 skipped_nouse; +- u64 skipped_mi_btree_bitmap; +-}; +- + #define BCH_WATERMARKS() \ + x(stripe) \ + x(normal) \ +diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c +new file mode 100644 +index 000000000000..ad04e5f0f056 +--- /dev/null ++++ b/fs/bcachefs/async_objs.c +@@ -0,0 +1,141 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Async obj debugging: keep asynchronous objects on (very fast) lists, make ++ * them visibile in debugfs: ++ */ ++ ++#include "bcachefs.h" ++#include "async_objs.h" ++#include "btree_io.h" ++#include "debug.h" ++#include "io_read.h" ++#include "io_write.h" ++ ++#include ++ ++static void promote_obj_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ void *obj) ++{ ++ bch2_promote_op_to_text(out, c, obj); ++} ++ ++static void rbio_obj_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ void *obj) ++{ ++ bch2_read_bio_to_text(out, c, obj); ++} ++ ++static void write_op_obj_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ void *obj) ++{ ++ bch2_write_op_to_text(out, obj); ++} ++ ++static void btree_read_bio_obj_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ void *obj) ++{ ++ struct btree_read_bio *rbio = obj; ++ bch2_btree_read_bio_to_text(out, rbio); ++} ++ ++static void btree_write_bio_obj_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ void *obj) ++{ ++ struct btree_write_bio *wbio = obj; ++ bch2_bio_to_text(out, &wbio->wbio.bio); ++} ++ ++static int bch2_async_obj_list_open(struct inode *inode, struct file *file) ++{ ++ struct async_obj_list *list = inode->i_private; ++ struct dump_iter *i; ++ ++ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); ++ if (!i) ++ return -ENOMEM; ++ ++ file->private_data = 
i; ++ i->from = POS_MIN; ++ i->iter = 0; ++ i->c = container_of(list, struct bch_fs, async_objs[list->idx]); ++ i->list = list; ++ i->buf = PRINTBUF; ++ return 0; ++} ++ ++static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct async_obj_list *list = i->list; ++ ssize_t ret = 0; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ struct genradix_iter iter; ++ void *obj; ++ fast_list_for_each_from(&list->list, iter, obj, i->iter) { ++ ret = bch2_debugfs_flush_buf(i); ++ if (ret) ++ return ret; ++ ++ if (!i->size) ++ break; ++ ++ list->obj_to_text(&i->buf, i->c, obj); ++ i->iter = iter.pos; ++ } ++ ++ if (i->buf.allocation_failure) ++ ret = -ENOMEM; ++ ++ if (!ret) ++ ret = bch2_debugfs_flush_buf(i); ++ ++ return ret ?: i->ret; ++} ++ ++static const struct file_operations async_obj_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_async_obj_list_open, ++ .release = bch2_dump_release, ++ .read = bch2_async_obj_list_read, ++}; ++ ++void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) ++{ ++ c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir); ++ ++#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \ ++ &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops); ++ BCH_ASYNC_OBJ_LISTS() ++#undef x ++} ++ ++void bch2_fs_async_obj_exit(struct bch_fs *c) ++{ ++ for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) ++ fast_list_exit(&c->async_objs[i].list); ++} ++ ++int bch2_fs_async_obj_init(struct bch_fs *c) ++{ ++ for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) { ++ if (fast_list_init(&c->async_objs[i].list)) ++ return -BCH_ERR_ENOMEM_async_obj_init; ++ c->async_objs[i].idx = i; ++ } ++ ++#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text; ++ BCH_ASYNC_OBJ_LISTS() ++#undef x ++ ++ return 0; ++} +diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h +new file mode 100644 +index 000000000000..451db4c51fb2 +--- /dev/null ++++ b/fs/bcachefs/async_objs.h +@@ -0,0 +1,45 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ASYNC_OBJS_H ++#define _BCACHEFS_ASYNC_OBJS_H ++ ++#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS ++static inline void __async_object_list_del(struct fast_list *head, unsigned *idx) ++{ ++ fast_list_remove(head, *idx); ++ *idx = 0; ++} ++ ++static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx) ++{ ++ int ret = fast_list_add(head, obj); ++ *idx = ret > 0 ? ret : 0; ++ return ret < 0 ? 
ret : 0; ++} ++ ++#define async_object_list_del(_c, _list, idx) \ ++ __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, &idx) ++ ++#define async_object_list_add(_c, _list, obj, idx) \ ++ __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx) ++ ++void bch2_fs_async_obj_debugfs_init(struct bch_fs *); ++void bch2_fs_async_obj_exit(struct bch_fs *); ++int bch2_fs_async_obj_init(struct bch_fs *); ++ ++#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ ++ ++#define async_object_list_del(_c, _n, idx) do {} while (0) ++ ++static inline int __async_object_list_add(void) ++{ ++ return 0; ++} ++#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add() ++ ++static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {} ++static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {} ++static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; } ++ ++#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ ++ ++#endif /* _BCACHEFS_ASYNC_OBJS_H */ +diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h +new file mode 100644 +index 000000000000..ed262c874ad0 +--- /dev/null ++++ b/fs/bcachefs/async_objs_types.h +@@ -0,0 +1,25 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H ++#define _BCACHEFS_ASYNC_OBJS_TYPES_H ++ ++#define BCH_ASYNC_OBJ_LISTS() \ ++ x(promote) \ ++ x(rbio) \ ++ x(write_op) \ ++ x(btree_read_bio) \ ++ x(btree_write_bio) ++ ++enum bch_async_obj_lists { ++#define x(n) BCH_ASYNC_OBJ_LIST_##n, ++ BCH_ASYNC_OBJ_LISTS() ++#undef x ++ BCH_ASYNC_OBJ_NR ++}; ++ ++struct async_obj_list { ++ struct fast_list list; ++ void (*obj_to_text)(struct printbuf *, struct bch_fs *, void *); ++ unsigned idx; ++}; ++ ++#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */ +diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c +index 5f195d2280a4..45d3db41225a 100644 +--- a/fs/bcachefs/backpointers.c ++++ b/fs/bcachefs/backpointers.c +@@ -12,9 +12,20 @@ + #include "disk_accounting.h" + #include "error.h" + #include "progress.h" ++#include "recovery_passes.h" + + #include + ++static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64); ++ ++static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) ++{ ++ return (struct bbpos) { ++ .btree = bp.btree_id, ++ .pos = bp.pos, ++ }; ++} ++ + int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) + { +@@ -37,17 +48,19 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke + { + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + +- rcu_read_lock(); +- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); +- if (ca) { +- u32 bucket_offset; +- struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); +- rcu_read_unlock(); ++ struct bch_dev *ca; ++ u32 bucket_offset; ++ struct bpos bucket; ++ scoped_guard(rcu) { ++ ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); ++ if (ca) ++ bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); ++ } ++ ++ if (ca) + prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); +- } else { +- rcu_read_unlock(); ++ else + prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); +- } + + bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); + prt_str(out, " data_type="); +@@ -95,7 +108,9 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, + bool insert) + { 
+ struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); ++ bool will_check = c->recovery.passes_to_run & ++ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); + int ret = 0; + + if (insert) { +@@ -110,9 +125,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); +- +- bch_err(c, "%s", buf.buf); +- } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { ++ } else if (!will_check) { + prt_printf(&buf, "backpointer not found when deleting\n"); + printbuf_indent_add(&buf, 2); + +@@ -128,12 +141,11 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, + bch2_bkey_val_to_text(&buf, c, orig_k); + } + +- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers && +- __bch2_inconsistent_error(c, &buf)) +- ret = -BCH_ERR_erofs_unfixed_errors; ++ if (!will_check && __bch2_inconsistent_error(c, &buf)) ++ ret = bch_err_throw(c, erofs_unfixed_errors); + +- bch_err(c, "%s", buf.buf); +- printbuf_exit(&buf); ++ if (buf.buf) ++ bch_err(c, "%s", buf.buf); + return ret; + } + +@@ -142,12 +154,10 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, + struct bkey_i_backpointer *bp, + bool insert) + { +- struct btree_iter bp_iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, +- bp->k.p, +- BTREE_ITER_intent| +- BTREE_ITER_slots| +- BTREE_ITER_with_updates); ++ CLASS(btree_iter, bp_iter)(trans, BTREE_ID_backpointers, bp->k.p, ++ BTREE_ITER_intent| ++ BTREE_ITER_with_updates); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&bp_iter); + int ret = bkey_err(k); + if (ret) + return ret; +@@ -158,7 +168,7 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, + memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { + ret = backpointer_mod_err(trans, orig_k, bp, k, insert); + if (ret) +- goto err; ++ return ret; + } + + if (!insert) { +@@ -166,15 +176,12 @@ int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, + set_bkey_val_u64s(&bp->k, 0); + } + +- ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); +-err: +- bch2_trans_iter_exit(trans, &bp_iter); +- return ret; ++ return bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); + } + + static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) + { +- return (likely(!bch2_backpointers_no_use_write_buffer) ++ return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) + ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) + : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); +@@ -184,7 +191,7 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, + struct bkey_s_c visiting_k, + struct bkey_buf *last_flushed) + { +- return likely(!bch2_backpointers_no_use_write_buffer) ++ return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) + ? 
bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) + : 0; + } +@@ -196,7 +203,7 @@ static int backpointer_target_not_found(struct btree_trans *trans, + bool commit) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + /* +@@ -232,7 +239,7 @@ static int backpointer_target_not_found(struct btree_trans *trans, + "%s", buf.buf)) { + ret = bch2_backpointer_del(trans, bp.k->p); + if (ret || !commit) +- goto out; ++ return ret; + + /* + * Normally, on transaction commit from inside a transaction, +@@ -250,9 +257,7 @@ static int backpointer_target_not_found(struct btree_trans *trans, + */ + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + } +-out: + fsck_err: +- printbuf_exit(&buf); + return ret; + } + +@@ -272,7 +277,7 @@ static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, + 0, + bp.v->level - 1, + 0); +- struct btree *b = bch2_btree_iter_peek_node(trans, iter); ++ struct btree *b = bch2_btree_iter_peek_node(iter); + if (IS_ERR_OR_NULL(b)) + goto err; + +@@ -283,14 +288,14 @@ static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, + return b; + + if (btree_node_will_make_reachable(b)) { +- b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); ++ b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node)); + } else { + int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), + last_flushed, commit); + b = ret ? ERR_PTR(ret) : NULL; + } + err: +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + return b; + } + +@@ -312,9 +317,9 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, + 0, + bp.v->level, + iter_flags); +- struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) { +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + return k; + } + +@@ -334,7 +339,7 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, + extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) + return k; + +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + + if (!bp.v->level) { + int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit); +@@ -374,44 +379,42 @@ static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, st + return 0; + + struct bch_fs *c = trans->c; +- struct btree_iter alloc_iter = {}; +- struct bkey_s_c alloc_k; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + struct bpos bucket; + if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { + ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + if (ret) +- goto out; ++ return ret; + + if (fsck_err(trans, backpointer_to_missing_device, + "backpointer for missing device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_backpointer_del(trans, k.k->p); +- goto out; ++ return ret; + } + +- alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); +- ret = bkey_err(alloc_k); +- if (ret) +- goto out; +- +- if (alloc_k.k->type != KEY_TYPE_alloc_v4) { +- ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); ++ { ++ CLASS(btree_iter, alloc_iter)(trans, BTREE_ID_alloc, bucket, 0); ++ struct bkey_s_c alloc_k = bch2_btree_iter_peek_slot(&alloc_iter); ++ ret = bkey_err(alloc_k); + if (ret) +- goto out; ++ return ret; + +- if (fsck_err(trans, backpointer_to_missing_alloc, +- 
"backpointer for nonexistent alloc key: %llu:%llu:0\n%s", +- alloc_iter.pos.inode, alloc_iter.pos.offset, +- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) +- ret = bch2_backpointer_del(trans, k.k->p); ++ if (alloc_k.k->type != KEY_TYPE_alloc_v4) { ++ ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); ++ if (ret) ++ return ret; ++ ++ if (fsck_err(trans, backpointer_to_missing_alloc, ++ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", ++ alloc_iter.pos.inode, alloc_iter.pos.offset, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ++ ret = bch2_backpointer_del(trans, k.k->p); ++ } + } +-out: + fsck_err: +- bch2_trans_iter_exit(trans, &alloc_iter); +- printbuf_exit(&buf); + return ret; + } + +@@ -422,14 +425,13 @@ int bch2_check_btree_backpointers(struct bch_fs *c) + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, ++ CLASS(btree_trans, trans)(c); ++ int ret = for_each_btree_key_commit(trans, iter, + BTREE_ID_backpointers, POS_MIN, 0, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); ++ bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed)); + + bch2_bkey_buf_exit(&last_flushed, c); +- bch_err_fn(c, ret); + return ret; + } + +@@ -459,7 +461,7 @@ static int check_extent_checksum(struct btree_trans *trans, + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + void *data_buf = NULL; + struct bio *bio = NULL; + size_t bytes; +@@ -478,7 +480,8 @@ static int check_extent_checksum(struct btree_trans *trans, + + bytes = p.crc.compressed_size << 9; + +- struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); ++ struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ, ++ BCH_DEV_READ_REF_check_extent_checksums); + if (!ca) + return false; + +@@ -515,8 +518,8 @@ static int check_extent_checksum(struct btree_trans *trans, + if (bio) + bio_put(bio); + kvfree(data_buf); +- percpu_ref_put(&ca->io_ref[READ]); +- printbuf_exit(&buf); ++ enumerated_ref_put(&ca->io_ref[READ], ++ BCH_DEV_READ_REF_check_extent_checksums); + return ret; + } + +@@ -527,32 +530,30 @@ static int check_bp_exists(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct btree_iter other_extent_iter = {}; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + if (bpos_lt(bp->k.p, s->bp_start) || + bpos_gt(bp->k.p, s->bp_end)) + return 0; + +- struct btree_iter bp_iter; +- struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); ++ CLASS(btree_iter, bp_iter)(trans, BTREE_ID_backpointers, bp->k.p, 0); ++ struct bkey_s_c bp_k = bch2_btree_iter_peek_slot(&bp_iter); + int ret = bkey_err(bp_k); + if (ret) +- goto err; ++ return ret; + + if (bp_k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { + ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); + if (ret) +- goto err; ++ return ret; + + goto check_existing_bp; + } + out: + err: + fsck_err: +- bch2_trans_iter_exit(trans, &other_extent_iter); +- bch2_trans_iter_exit(trans, &bp_iter); +- printbuf_exit(&buf); ++ bch2_trans_iter_exit(&other_extent_iter); + return ret; + check_existing_bp: + /* Do we have a backpointer for a different extent? 
*/ +@@ -579,6 +580,7 @@ static int check_bp_exists(struct btree_trans *trans, + bkey_for_each_ptr(other_extent_ptrs, ptr) + if (ptr->dev == bp->k.p.inode && + dev_ptr_stale_rcu(ca, ptr)) { ++ rcu_read_unlock(); + ret = drop_dev_and_update(trans, other_bp.v->btree_id, + other_extent, bp->k.p.inode); + if (ret) +@@ -636,7 +638,7 @@ static int check_bp_exists(struct btree_trans *trans, + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, other_extent); + bch_err(c, "%s", buf.buf); +- ret = -BCH_ERR_fsck_repair_unimplemented; ++ ret = bch_err_throw(c, fsck_repair_unimplemented); + goto err; + missing: + printbuf_reset(&buf); +@@ -667,24 +669,32 @@ static int check_extent_to_backpointers(struct btree_trans *trans, + if (p.ptr.dev == BCH_SB_MEMBER_INVALID) + continue; + +- rcu_read_lock(); +- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); +- bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); +- bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); ++ bool empty; ++ { ++ /* scoped_guard() is a loop, so it breaks continue */ ++ guard(rcu)(); ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); ++ if (!ca) ++ continue; + +- bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)); +- rcu_read_unlock(); ++ if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) ++ continue; + +- if ((check || empty) && !stale) { +- struct bkey_i_backpointer bp; +- bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); ++ u64 b = PTR_BUCKET_NR(ca, &p.ptr); ++ if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) ++ continue; + +- int ret = check +- ? check_bp_exists(trans, s, &bp, k) +- : bch2_bucket_backpointer_mod(trans, k, &bp, true); +- if (ret) +- return ret; ++ empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); + } ++ ++ struct bkey_i_backpointer bp; ++ bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); ++ ++ int ret = !empty ++ ? check_bp_exists(trans, s, &bp, k) ++ : bch2_bucket_backpointer_mod(trans, k, &bp, true); ++ if (ret) ++ return ret; + } + + return 0; +@@ -703,13 +713,13 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, + retry: + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, + 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); +- b = bch2_btree_iter_peek_node(trans, &iter); ++ b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + if (b != btree_node_root(c, b)) { +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + goto retry; + } + +@@ -718,18 +728,10 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, + k = bkey_i_to_s_c(&b->key); + ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); + err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +-static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) +-{ +- return (struct bbpos) { +- .btree = bp.btree_id, +- .pos = bp.pos, +- }; +-} +- + static u64 mem_may_pin_bytes(struct bch_fs *c) + { + struct sysinfo i; +@@ -788,6 +790,13 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, + return ret; + } + ++static inline int bch2_fs_going_ro(struct bch_fs *c) ++{ ++ return test_bit(BCH_FS_going_ro, &c->flags) ++ ? 
-EROFS ++ : 0; ++} ++ + static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, + struct extents_to_bp_state *s) + { +@@ -815,9 +824,11 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, + + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); ++ bch2_fs_going_ro(c) ?: + check_extent_to_backpointers(trans, s, btree_id, level, k) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + })); ++ bch2_trans_iter_exit(&iter); + if (ret) + return ret; + +@@ -854,6 +865,7 @@ static int data_type_to_alloc_counter(enum bch_data_type t) + static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); + + static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, ++ bool *had_mismatch, + struct bkey_buf *last_flushed) + { + struct bch_fs *c = trans->c; +@@ -861,6 +873,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); + bool need_commit = false; + ++ *had_mismatch = false; ++ + if (a->data_type == BCH_DATA_sb || + a->data_type == BCH_DATA_journal || + a->data_type == BCH_DATA_parity) +@@ -869,11 +883,10 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b + u32 sectors[ALLOC_SECTORS_NR]; + memset(sectors, 0, sizeof(sectors)); + +- struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); ++ CLASS(bch2_dev_bucket_tryget_noerror, ca)(trans->c, alloc_k.k->p); + if (!ca) + return 0; + +- struct btree_iter iter; + struct bkey_s_c bp_k; + int ret = 0; + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, +@@ -889,7 +902,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b + bp.v->pad)) { + ret = bch2_backpointer_del(trans, bp_k.k->p); + if (ret) +- break; ++ return ret; + + need_commit = true; + continue; +@@ -904,14 +917,13 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b + + sectors[alloc_counter] += bp.v->bucket_len; + }; +- bch2_trans_iter_exit(trans, &iter); + if (ret) +- goto err; ++ return ret; + + if (need_commit) { + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) +- goto err; ++ return ret; + } + + if (sectors[ALLOC_dirty] != a->dirty_sectors || +@@ -920,27 +932,31 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { + ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); + if (ret) +- goto err; ++ return ret; + } + + if (sectors[ALLOC_dirty] > a->dirty_sectors || + sectors[ALLOC_cached] > a->cached_sectors || + sectors[ALLOC_stripe] > a->stripe_sectors) { +- ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: +- -BCH_ERR_transaction_restart_nested; +- goto err; ++ return check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: ++ bch_err_throw(c, transaction_restart_nested); + } + +- if (!sectors[ALLOC_dirty] && +- !sectors[ALLOC_stripe] && +- !sectors[ALLOC_cached]) +- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); +- else +- __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); ++ bool empty = (sectors[ALLOC_dirty] + ++ sectors[ALLOC_stripe] + ++ sectors[ALLOC_cached]) == 0; ++ ++ ret = 
bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch, ++ alloc_k.k->p.offset) ?: ++ (empty ++ ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty, ++ alloc_k.k->p.offset) ++ : 0); ++ ++ *had_mismatch = true; + } +-err: +- bch2_dev_put(ca); +- return ret; ++ ++ return 0; + } + + static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) +@@ -949,7 +965,7 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) + case KEY_TYPE_btree_ptr_v2: { + bool ret = false; + +- rcu_read_lock(); ++ guard(rcu)(); + struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; + while (pos.inode <= k.k->p.inode) { + if (pos.inode >= c->sb.nr_devices) +@@ -960,8 +976,14 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) + goto next; + + struct bpos bucket = bp_pos_to_bucket(ca, pos); +- bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, +- ca->mi.nbuckets, bucket.offset); ++ u64 next = ca->mi.nbuckets; ++ ++ unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets); ++ if (bitmap) ++ next = min_t(u64, next, ++ find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset)); ++ ++ bucket.offset = next; + if (bucket.offset == ca->mi.nbuckets) + goto next; + +@@ -971,7 +993,6 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) + next: + pos = SPOS(pos.inode + 1, 0, 0); + } +- rcu_read_unlock(); + + return ret; + } +@@ -987,7 +1008,7 @@ static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, + { + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); +- struct btree *b = bch2_btree_iter_peek_node(trans, &iter); ++ struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; +@@ -995,7 +1016,7 @@ static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, + if (b) + bch2_node_pin(trans->c, b); + err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -1031,6 +1052,7 @@ static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, + + bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); + })); ++ bch2_trans_iter_exit(&iter); + if (ret) + return ret; + +@@ -1060,6 +1082,7 @@ static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, + + ret; + })); ++ bch2_trans_iter_exit(&iter); + if (ret) + return ret; + +@@ -1070,29 +1093,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) + { + int ret = 0; + +- /* +- * Can't allow devices to come/go/resize while we have bucket bitmaps +- * allocated +- */ +- down_read(&c->state_lock); +- +- for_each_member_device(c, ca) { +- BUG_ON(ca->bucket_backpointer_mismatches); +- ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), +- sizeof(unsigned long), +- GFP_KERNEL); +- ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), +- sizeof(unsigned long), +- GFP_KERNEL); +- if (!ca->bucket_backpointer_mismatches || +- !ca->bucket_backpointer_empty) { +- bch2_dev_put(ca); +- ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; +- goto err_free_bitmaps; +- } +- } +- +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + struct extents_to_bp_state s = { .bp_start = POS_MIN }; + + bch2_bkey_buf_init(&s.last_flushed); +@@ -1100,23 +1101,24 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) + + ret = for_each_btree_key(trans, iter, 
BTREE_ID_alloc, + POS_MIN, BTREE_ITER_prefetch, k, ({ +- check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); ++ bool had_mismatch; ++ bch2_fs_going_ro(c) ?: ++ check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed); + })); + if (ret) + goto err; + +- u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; ++ u64 nr_buckets = 0, nr_mismatches = 0; + for_each_member_device(c, ca) { + nr_buckets += ca->mi.nbuckets; +- nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); +- nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); ++ nr_mismatches += ca->bucket_backpointer_mismatch.nr; + } + +- if (!nr_mismatches && !nr_empty) ++ if (!nr_mismatches) + goto err; + + bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", +- nr_mismatches + nr_empty, nr_buckets); ++ nr_mismatches, nr_buckets); + + while (1) { + ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); +@@ -1130,7 +1132,7 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) + + if (!bpos_eq(s.bp_start, POS_MIN) || + !bpos_eq(s.bp_end, SPOS_MAX)) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + prt_str(&buf, "check_extents_to_backpointers(): "); + bch2_bpos_to_text(&buf, s.bp_start); +@@ -1138,7 +1140,6 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) + bch2_bpos_to_text(&buf, s.bp_end); + + bch_verbose(c, "%s", buf.buf); +- printbuf_exit(&buf); + } + + ret = bch2_check_extents_to_backpointers_pass(trans, &s); +@@ -1147,23 +1148,63 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) + + s.bp_start = bpos_successor(s.bp_end); + } ++ ++ for_each_member_device(c, ca) { ++ bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); ++ bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); ++ } + err: +- bch2_trans_put(trans); + bch2_bkey_buf_exit(&s.last_flushed, c); + bch2_btree_cache_unpin(c); +-err_free_bitmaps: +- for_each_member_device(c, ca) { +- kvfree(ca->bucket_backpointer_empty); +- ca->bucket_backpointer_empty = NULL; +- kvfree(ca->bucket_backpointer_mismatches); +- ca->bucket_backpointer_mismatches = NULL; +- } +- +- up_read(&c->state_lock); +- bch_err_fn(c, ret); + return ret; + } + ++static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans, ++ struct bpos bucket, ++ bool *had_mismatch, ++ struct bkey_buf *last_flushed) ++{ ++ CLASS(btree_iter, alloc_iter)(trans, BTREE_ID_alloc, bucket, BTREE_ITER_cached); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&alloc_iter); ++ int ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ return check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed); ++} ++ ++int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans, ++ struct bch_dev *ca, u64 bucket, ++ bool copygc, ++ struct bkey_buf *last_flushed) ++{ ++ struct bch_fs *c = trans->c; ++ bool had_mismatch; ++ int ret = lockrestart_do(trans, ++ check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket), ++ &had_mismatch, last_flushed)); ++ if (ret || !had_mismatch) ++ return ret; ++ ++ u64 nr = ca->bucket_backpointer_mismatch.nr; ++ u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0; ++ ++ CLASS(printbuf, buf)(); ++ __bch2_log_msg_start(ca->name, &buf); ++ ++ prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n", ++ bucket, nr, ca->mi.nbuckets); ++ ++ bch2_run_explicit_recovery_pass(c, &buf, ++ BCH_RECOVERY_PASS_check_extents_to_backpointers, ++ nr < allowed ? 
RUN_RECOVERY_PASS_ratelimit : 0); ++ ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ return 0; ++} ++ ++/* backpointers -> extents */ ++ + static int check_one_backpointer(struct btree_trans *trans, + struct bbpos start, + struct bbpos end, +@@ -1188,7 +1229,7 @@ static int check_one_backpointer(struct btree_trans *trans, + if (ret) + return ret; + +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -1235,7 +1276,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, + + int bch2_check_backpointers_to_extents(struct bch_fs *c) + { +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; + int ret; + +@@ -1255,7 +1296,7 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) + + if (bbpos_cmp(start, BBPOS_MIN) || + bbpos_cmp(end, BBPOS_MAX)) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + prt_str(&buf, "check_backpointers_to_extents(): "); + bch2_bbpos_to_text(&buf, start); +@@ -1263,7 +1304,6 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) + bch2_bbpos_to_text(&buf, end); + + bch_verbose(c, "%s", buf.buf); +- printbuf_exit(&buf); + } + + ret = bch2_check_backpointers_to_extents_pass(trans, start, end); +@@ -1272,10 +1312,53 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) + + start = bbpos_successor(end); + } +- bch2_trans_put(trans); + + bch2_btree_cache_unpin(c); +- +- bch_err_fn(c, ret); + return ret; + } ++ ++static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit) ++{ ++ scoped_guard(mutex, &b->lock) { ++ if (!b->buckets) { ++ b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), ++ sizeof(unsigned long), GFP_KERNEL); ++ if (!b->buckets) ++ return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); ++ } ++ ++ b->nr += !__test_and_set_bit(bit, b->buckets); ++ } ++ ++ return 0; ++} ++ ++int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b, ++ u64 old_size, u64 new_size) ++{ ++ scoped_guard(mutex, &b->lock) { ++ if (!b->buckets) ++ return 0; ++ ++ unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size), ++ sizeof(unsigned long), GFP_KERNEL); ++ if (!n) ++ return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); ++ ++ memcpy(n, b->buckets, ++ BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long)); ++ kvfree(b->buckets); ++ b->buckets = n; ++ } ++ ++ return 0; ++} ++ ++void bch2_bucket_bitmap_free(struct bucket_bitmap *b) ++{ ++ mutex_lock(&b->lock); ++ kvfree(b->buckets); ++ b->buckets = NULL; ++ b->nr = 0; ++ mutex_unlock(&b->lock); ++} +diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h +index 16575dbc5736..7e71afee1ac0 100644 +--- a/fs/bcachefs/backpointers.h ++++ b/fs/bcachefs/backpointers.h +@@ -53,11 +53,10 @@ static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, + + static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) + { +- rcu_read_lock(); ++ guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); + if (ca) + *bucket = bp_pos_to_bucket(ca, bp_pos); +- rcu_read_unlock(); + return ca != NULL; + } + +@@ -102,7 +101,7 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, + struct bkey_i_backpointer *bp, + bool insert) + { +- if (unlikely(bch2_backpointers_no_use_write_buffer)) ++ if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)) + return 
bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); + + if (!insert) { +@@ -182,8 +181,20 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_b + struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, + struct btree_iter *, struct bkey_buf *); + ++int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64, ++ bool, struct bkey_buf *); ++ + int bch2_check_btree_backpointers(struct bch_fs *); + int bch2_check_extents_to_backpointers(struct bch_fs *); + int bch2_check_backpointers_to_extents(struct bch_fs *); + ++static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i) ++{ ++ unsigned long *bitmap = READ_ONCE(b->buckets); ++ return bitmap && test_bit(i, bitmap); ++} ++ ++int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64); ++void bch2_bucket_bitmap_free(struct bucket_bitmap *); ++ + #endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index 75f7408da173..cdf593c59922 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -183,6 +183,16 @@ + #define pr_fmt(fmt) "%s() " fmt "\n", __func__ + #endif + ++#ifdef CONFIG_BCACHEFS_DEBUG ++#define ENUMERATED_REF_DEBUG ++#endif ++ ++#ifndef dynamic_fault ++#define dynamic_fault(...) 0 ++#endif ++ ++#define race_fault(...) dynamic_fault("bcachefs:race") ++ + #include + #include + #include +@@ -209,24 +219,40 @@ + #include "btree_journal_iter_types.h" + #include "disk_accounting_types.h" + #include "errcode.h" ++#include "fast_list.h" + #include "fifo.h" + #include "nocow_locking_types.h" + #include "opts.h" +-#include "recovery_passes_types.h" + #include "sb-errors_types.h" + #include "seqmutex.h" ++#include "snapshot_types.h" + #include "time_stats.h" + #include "util.h" + +-#ifdef CONFIG_BCACHEFS_DEBUG +-#define BCH_WRITE_REF_DEBUG +-#endif +- +-#ifndef dynamic_fault +-#define dynamic_fault(...) 0 +-#endif ++#include "alloc_types.h" ++#include "async_objs_types.h" ++#include "btree_gc_types.h" ++#include "btree_types.h" ++#include "btree_node_scan_types.h" ++#include "btree_write_buffer_types.h" ++#include "buckets_types.h" ++#include "buckets_waiting_for_journal_types.h" ++#include "clock_types.h" ++#include "disk_groups_types.h" ++#include "ec_types.h" ++#include "enumerated_ref_types.h" ++#include "journal_types.h" ++#include "keylist_types.h" ++#include "quota_types.h" ++#include "rebalance_types.h" ++#include "recovery_passes_types.h" ++#include "replicas_types.h" ++#include "sb-members_types.h" ++#include "subvolume_types.h" ++#include "super_types.h" ++#include "thread_with_file_types.h" + +-#define race_fault(...) dynamic_fault("bcachefs:race") ++#include "trace.h" + + #define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]) + +@@ -269,7 +295,7 @@ do { \ + + #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") + +-void bch2_print_str(struct bch_fs *, const char *); ++void bch2_print_str(struct bch_fs *, const char *, const char *); + + __printf(2, 3) + void bch2_print_opts(struct bch_opts *, const char *, ...); +@@ -293,19 +319,31 @@ do { \ + bch2_print(_c, __VA_ARGS__); \ + } while (0) + +-#define bch_info(c, fmt, ...) \ +- bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) +-#define bch_info_ratelimited(c, fmt, ...) \ +- bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) +-#define bch_notice(c, fmt, ...) 
\
+- bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
+-#define bch_warn(c, fmt, ...) \
+- bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+-#define bch_warn_ratelimited(c, fmt, ...) \
+- bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+-
+-#define bch_err(c, fmt, ...) \
+- bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
++#define bch2_print_str_ratelimited(_c, ...) \
++do { \
++ static DEFINE_RATELIMIT_STATE(_rs, \
++ DEFAULT_RATELIMIT_INTERVAL, \
++ DEFAULT_RATELIMIT_BURST); \
++ \
++ if (__ratelimit(&_rs)) \
++ bch2_print_str(_c, __VA_ARGS__); \
++} while (0)
++
++#define bch_log(c, loglevel, fmt, ...) \
++ bch2_print(c, loglevel bch2_fmt(c, fmt), ##__VA_ARGS__)
++#define bch_log_ratelimited(c, loglevel, fmt, ...) \
++ bch2_print_ratelimited(c, loglevel bch2_fmt(c, fmt), ##__VA_ARGS__)
++
++#define bch_err(c, ...) bch_log(c, KERN_ERR, __VA_ARGS__)
++#define bch_err_ratelimited(c, ...) bch_log_ratelimited(c, KERN_ERR, __VA_ARGS__)
++#define bch_warn(c, ...) bch_log(c, KERN_WARNING, __VA_ARGS__)
++#define bch_warn_ratelimited(c, ...) bch_log_ratelimited(c, KERN_WARNING, __VA_ARGS__)
++#define bch_notice(c, ...) bch_log(c, KERN_NOTICE, __VA_ARGS__)
++#define bch_info(c, ...) bch_log(c, KERN_INFO, __VA_ARGS__)
++#define bch_info_ratelimited(c, ...) bch_log_ratelimited(c, KERN_INFO, __VA_ARGS__)
++#define bch_verbose(c, ...) bch_log(c, KERN_DEBUG, __VA_ARGS__)
++#define bch_verbose_ratelimited(c, ...) bch_log_ratelimited(c, KERN_DEBUG, __VA_ARGS__)
++
+ #define bch_err_dev(ca, fmt, ...) \
+ bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+ #define bch_err_dev_offset(ca, _offset, fmt, ...) \
+@@ -315,8 +353,6 @@ do { \
+ #define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
+ bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+-#define bch_err_ratelimited(c, fmt, ...) \
+- bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+ #define bch_err_dev_ratelimited(ca, fmt, ...) \
+ bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+ #define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
+@@ -350,23 +386,13 @@ do { \
+ ##__VA_ARGS__, bch2_err_str(_ret)); \
+ } while (0)
+
+-#define bch_verbose(c, fmt, ...) \
+-do { \
+- if ((c)->opts.verbose) \
+- bch_info(c, fmt, ##__VA_ARGS__); \
+-} while (0)
+-
+-#define bch_verbose_ratelimited(c, fmt, ...) \
+-do { \
+- if ((c)->opts.verbose) \
+- bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \
+-} while (0)
++static inline int __bch2_err_trace(struct bch_fs *c, int err)
++{
++ trace_error_throw(c, err, _THIS_IP_);
++ return err;
++}
+
+-#define pr_verbose_init(opts, fmt, ...) \
+-do { \
+- if (opt_get(opts, verbose)) \
+- pr_info(fmt, ##__VA_ARGS__); \
+-} while (0)
++#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err)
+
+ /* Parameters that are useful for debugging, but should always be compiled in: */
+ #define BCH_DEBUG_PARAMS_ALWAYS() \
+@@ -390,17 +416,20 @@ do { \
+ "compare them") \
+ BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \
+ "Don't use the write buffer for backpointers, enabling "\
+- "extra runtime checks")
+-
+-/* Parameters that should only be compiled in debug mode: */
+-#define BCH_DEBUG_PARAMS_DEBUG() \
+- BCH_DEBUG_PARAM(expensive_debug_checks, \
+- "Enables various runtime debugging checks that " \
+- "significantly affect performance") \
++ "extra runtime checks") \
++ BCH_DEBUG_PARAM(debug_check_btree_locking, \
++ "Enable additional asserts for btree locking") \
+ BCH_DEBUG_PARAM(debug_check_iterators, \
+ "Enables extra verification for btree iterators") \
++ BCH_DEBUG_PARAM(debug_check_bset_lookups, \
++ "Enables extra verification for bset lookups") \
+ BCH_DEBUG_PARAM(debug_check_btree_accounting, \
+ "Verify btree accounting for keys within a node") \
++ BCH_DEBUG_PARAM(debug_check_bkey_unpack, \
++ "Enables extra verification for bkey unpack")
++
++/* Parameters that should only be compiled in debug mode: */
++#define BCH_DEBUG_PARAMS_DEBUG() \
+ BCH_DEBUG_PARAM(journal_seq_verify, \
+ "Store the journal sequence number in the version " \
+ "number of every btree key, and verify that btree " \
+@@ -427,15 +456,9 @@ do { \
+ #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
+ #endif
+
+-#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name;
+-BCH_DEBUG_PARAMS()
+-#undef BCH_DEBUG_PARAM
+-
+-#ifndef CONFIG_BCACHEFS_DEBUG
+-#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name;
+-BCH_DEBUG_PARAMS_DEBUG()
++#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name;
++BCH_DEBUG_PARAMS_ALL()
+ #undef BCH_DEBUG_PARAM
+-#endif
+
+ #define BCH_TIME_STATS() \
+ x(btree_node_mem_alloc) \
+@@ -443,6 +466,7 @@ BCH_DEBUG_PARAMS_DEBUG()
+ x(btree_node_compact) \
+ x(btree_node_merge) \
+ x(btree_node_sort) \
++ x(btree_node_get) \
+ x(btree_node_read) \
+ x(btree_node_read_done) \
+ x(btree_node_write) \
+@@ -450,6 +474,10 @@ BCH_DEBUG_PARAMS_DEBUG()
+ x(btree_interior_update_total) \
+ x(btree_gc) \
+ x(data_write) \
++ x(data_write_to_submit) \
++ x(data_write_to_queue) \
++ x(data_write_to_btree_update) \
++ x(data_write_btree_update) \
+ x(data_read) \
+ x(data_promote) \
+ x(journal_flush_write) \
+@@ -472,26 +500,6 @@ enum bch_time_stats {
+ BCH_TIME_STAT_NR
+ };
+
+-#include "alloc_types.h"
+-#include "btree_gc_types.h"
+-#include "btree_types.h"
+-#include "btree_node_scan_types.h"
+-#include "btree_write_buffer_types.h"
+-#include "buckets_types.h"
+-#include "buckets_waiting_for_journal_types.h"
+-#include "clock_types.h"
+-#include "disk_groups_types.h"
+-#include "ec_types.h"
+-#include "journal_types.h"
+-#include "keylist_types.h"
+-#include "quota_types.h"
+-#include "rebalance_types.h"
+-#include "replicas_types.h"
+-#include "sb-members_types.h"
+-#include "subvolume_types.h"
+-#include "super_types.h"
+-#include "thread_with_file_types.h"
+-
+ /* Number of nodes btree coalesce will try to coalesce at once */
+ #define GC_MERGE_NODES 4U
+
+@@ -514,6 +522,57 @@ struct discard_in_flight {
+ u64 bucket:63;
+ };
+
++#define BCH_DEV_READ_REFS() \
++ x(bch2_online_devs) \
++ x(trans_mark_dev_sbs) \
++ x(read_fua_test) \
++ 
x(sb_field_resize) \ ++ x(write_super) \ ++ x(journal_read) \ ++ x(fs_journal_alloc) \ ++ x(fs_resize_on_mount) \ ++ x(btree_node_read) \ ++ x(btree_node_read_all_replicas) \ ++ x(btree_node_scrub) \ ++ x(btree_node_write) \ ++ x(btree_node_scan) \ ++ x(btree_verify_replicas) \ ++ x(btree_node_ondisk_to_text) \ ++ x(io_read) \ ++ x(check_extent_checksums) \ ++ x(ec_block) ++ ++enum bch_dev_read_ref { ++#define x(n) BCH_DEV_READ_REF_##n, ++ BCH_DEV_READ_REFS() ++#undef x ++ BCH_DEV_READ_REF_NR, ++}; ++ ++#define BCH_DEV_WRITE_REFS() \ ++ x(journal_write) \ ++ x(journal_do_discards) \ ++ x(dev_do_discards) \ ++ x(discard_one_bucket_fast) \ ++ x(do_invalidates) \ ++ x(nocow_flush) \ ++ x(io_write) \ ++ x(ec_block) \ ++ x(ec_bucket_zero) ++ ++enum bch_dev_write_ref { ++#define x(n) BCH_DEV_WRITE_REF_##n, ++ BCH_DEV_WRITE_REFS() ++#undef x ++ BCH_DEV_WRITE_REF_NR, ++}; ++ ++struct bucket_bitmap { ++ unsigned long *buckets; ++ u64 nr; ++ struct mutex lock; ++}; ++ + struct bch_dev { + struct kobject kobj; + #ifdef CONFIG_BCACHEFS_DEBUG +@@ -524,8 +583,7 @@ struct bch_dev { + struct percpu_ref ref; + #endif + struct completion ref_completion; +- struct percpu_ref io_ref[2]; +- struct completion io_ref_completion[2]; ++ struct enumerated_ref io_ref[2]; + + struct bch_fs *fs; + +@@ -559,8 +617,8 @@ struct bch_dev { + u8 *oldest_gen; + unsigned long *buckets_nouse; + +- unsigned long *bucket_backpointer_mismatches; +- unsigned long *bucket_backpointer_empty; ++ struct bucket_bitmap bucket_backpointer_mismatch; ++ struct bucket_bitmap bucket_backpointer_empty; + + struct bch_dev_usage_full __percpu + *usage; +@@ -572,10 +630,6 @@ struct bch_dev { + unsigned nr_partial_buckets; + unsigned nr_btree_reserve; + +- size_t inc_gen_needs_gc; +- size_t inc_gen_really_needs_gc; +- size_t buckets_waiting_on_journal; +- + struct work_struct invalidate_work; + struct work_struct discard_work; + struct mutex discard_buckets_in_flight_lock; +@@ -614,14 +668,15 @@ struct bch_dev { + x(accounting_replay_done) \ + x(may_go_rw) \ + x(rw) \ ++ x(rw_init_done) \ + x(was_rw) \ + x(stopping) \ + x(emergency_ro) \ + x(going_ro) \ + x(write_disable_complete) \ + x(clean_shutdown) \ +- x(recovery_running) \ +- x(fsck_running) \ ++ x(in_recovery) \ ++ x(in_fsck) \ + x(initial_gc_unfixed) \ + x(need_delete_dead_snapshots) \ + x(error) \ +@@ -648,8 +703,10 @@ struct btree_transaction_stats { + struct bch2_time_stats lock_hold_times; + struct mutex lock; + unsigned nr_max_paths; +- unsigned journal_entries_size; + unsigned max_mem; ++#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE ++ darray_trans_kmalloc_trace trans_kmalloc_trace; ++#endif + char *max_paths_text; + }; + +@@ -670,9 +727,6 @@ struct btree_trans_buf { + struct btree_trans *trans; + }; + +-#define BCACHEFS_ROOT_SUBVOL_INUM \ +- ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) +- + #define BCH_WRITE_REFS() \ + x(journal) \ + x(trans) \ +@@ -694,7 +748,9 @@ struct btree_trans_buf { + x(snapshot_delete_pagecache) \ + x(sysfs) \ + x(btree_write_buffer) \ +- x(btree_node_scrub) ++ x(btree_node_scrub) \ ++ x(async_recovery_passes) \ ++ x(ioctl_data) + + enum bch_write_ref { + #define x(n) BCH_WRITE_REF_##n, +@@ -728,11 +784,7 @@ struct bch_fs { + struct rw_semaphore state_lock; + + /* Counts outstanding writes, for clean transition to read-only */ +-#ifdef BCH_WRITE_REF_DEBUG +- atomic_long_t writes[BCH_WRITE_REF_NR]; +-#else +- struct percpu_ref writes; +-#endif ++ struct enumerated_ref writes; + /* + * Certain operations are only allowed in single threaded mode, 
during + * recovery, and we want to assert that this is the case: +@@ -749,6 +801,7 @@ struct bch_fs { + struct work_struct read_only_work; + + struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; ++ struct bch_devs_mask devs_removed; + + struct bch_accounting_mem accounting; + +@@ -762,6 +815,8 @@ struct bch_fs { + struct bch_disk_groups_cpu __rcu *disk_groups; + + struct bch_opts opts; ++ unsigned loglevel; ++ unsigned prev_loglevel; + + /* Updated by bch2_sb_update():*/ + struct { +@@ -776,6 +831,7 @@ struct bch_fs { + + u8 nr_devices; + u8 clean; ++ bool multi_device; /* true if we've ever had more than one device */ + + u8 encryption_type; + +@@ -785,15 +841,14 @@ struct bch_fs { + unsigned nsec_per_time_unit; + u64 features; + u64 compat; ++ u64 recovery_passes_required; + unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)]; + u64 btrees_lost_data; + } sb; +- DARRAY(enum bcachefs_metadata_version) +- incompat_versions_requested; + +-#ifdef CONFIG_UNICODE ++ unsigned long incompat_versions_requested[BITS_TO_LONGS(BCH_VERSION_MINOR(bcachefs_metadata_version_current))]; ++ + struct unicode_map *cf_encoding; +-#endif + + struct bch_sb_handle disk_sb; + +@@ -809,7 +864,7 @@ struct bch_fs { + struct mutex snapshot_table_lock; + struct rw_semaphore snapshot_create_lock; + +- struct work_struct snapshot_delete_work; ++ struct snapshot_delete snapshot_delete; + struct work_struct snapshot_wait_for_pagecache_and_delete_work; + snapshot_id_list snapshots_unlinked; + struct mutex snapshots_unlinked_lock; +@@ -874,7 +929,7 @@ struct bch_fs { + struct btree_write_buffer btree_write_buffer; + + struct workqueue_struct *btree_update_wq; +- struct workqueue_struct *btree_io_complete_wq; ++ struct workqueue_struct *btree_write_complete_wq; + /* copygc needs its own workqueue for index updates.. */ + struct workqueue_struct *copygc_wq; + /* +@@ -885,6 +940,7 @@ struct bch_fs { + struct workqueue_struct *write_ref_wq; + + /* ALLOCATION */ ++ struct bch_devs_mask online_devs; + struct bch_devs_mask rw_devs[BCH_DATA_NR]; + unsigned long rw_devs_change_count; + +@@ -979,6 +1035,10 @@ struct bch_fs { + nocow_locks; + struct rhashtable promote_table; + ++#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS ++ struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR]; ++#endif ++ + mempool_t compression_bounce[2]; + mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; + size_t zstd_workspace_size; +@@ -1048,25 +1108,12 @@ struct bch_fs { + /* RECOVERY */ + u64 journal_replay_seq_start; + u64 journal_replay_seq_end; +- /* +- * Two different uses: +- * "Has this fsck pass?" - i.e. should this type of error be an +- * emergency read-only +- * And, in certain situations fsck will rewind to an earlier pass: used +- * for signaling to the toplevel code which pass we want to run now. 
+- */ +- enum bch_recovery_pass curr_recovery_pass; +- enum bch_recovery_pass next_recovery_pass; +- /* bitmask of recovery passes that we actually ran */ +- u64 recovery_passes_complete; +- /* never rewinds version of curr_recovery_pass */ +- enum bch_recovery_pass recovery_pass_done; +- spinlock_t recovery_pass_lock; +- struct semaphore online_fsck_mutex; ++ struct bch_fs_recovery recovery; + + /* DEBUG JUNK */ + struct dentry *fs_debug_dir; + struct dentry *btree_debug_dir; ++ struct dentry *async_obj_dir; + struct btree_debug btree_debug[BTREE_ID_NR]; + struct btree *verify_data; + struct btree_node *verify_ondisk; +@@ -1108,54 +1155,6 @@ struct bch_fs { + + extern struct wait_queue_head bch2_read_only_wait; + +-static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) +-{ +-#ifdef BCH_WRITE_REF_DEBUG +- atomic_long_inc(&c->writes[ref]); +-#else +- percpu_ref_get(&c->writes); +-#endif +-} +- +-static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) +-{ +-#ifdef BCH_WRITE_REF_DEBUG +- return !test_bit(BCH_FS_going_ro, &c->flags) && +- atomic_long_inc_not_zero(&c->writes[ref]); +-#else +- return percpu_ref_tryget(&c->writes); +-#endif +-} +- +-static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) +-{ +-#ifdef BCH_WRITE_REF_DEBUG +- return !test_bit(BCH_FS_going_ro, &c->flags) && +- atomic_long_inc_not_zero(&c->writes[ref]); +-#else +- return percpu_ref_tryget_live(&c->writes); +-#endif +-} +- +-static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) +-{ +-#ifdef BCH_WRITE_REF_DEBUG +- long v = atomic_long_dec_return(&c->writes[ref]); +- +- BUG_ON(v < 0); +- if (v) +- return; +- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) +- if (atomic_long_read(&c->writes[i])) +- return; +- +- set_bit(BCH_FS_write_disable_complete, &c->flags); +- wake_up(&bch2_read_only_wait); +-#else +- percpu_ref_put(&c->writes); +-#endif +-} +- + static inline bool bch2_ro_ref_tryget(struct bch_fs *c) + { + if (test_bit(BCH_FS_stopping, &c->flags)) +@@ -1166,7 +1165,7 @@ static inline bool bch2_ro_ref_tryget(struct bch_fs *c) + + static inline void bch2_ro_ref_put(struct bch_fs *c) + { +- if (refcount_dec_and_test(&c->ro_ref)) ++ if (c && refcount_dec_and_test(&c->ro_ref)) + wake_up(&c->ro_ref_wait); + } + +@@ -1256,4 +1255,33 @@ static inline unsigned data_replicas_required(struct bch_fs *c) + #define BKEY_PADDED_ONSTACK(key, pad) \ + struct { struct bkey_i key; __u64 key ## _pad[pad]; } + ++/* ++ * This is needed because discard is both a filesystem option and a device ++ * option, and mount options are supposed to apply to that mount and not be ++ * persisted, i.e. if it's set as a mount option we can't propagate it to the ++ * device. ++ */ ++static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) ++{ ++ return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) ++ ? 
c->opts.discard ++ : ca->mi.discard; ++} ++ ++static inline int bch2_fs_casefold_enabled(struct bch_fs *c) ++{ ++ if (!IS_ENABLED(CONFIG_UNICODE)) ++ return bch_err_throw(c, no_casefolding_without_utf8); ++ if (c->opts.casefold_disabled) ++ return bch_err_throw(c, casefolding_disabled); ++ return 0; ++} ++ ++static inline const char *strip_bch2(const char *msg) ++{ ++ if (!strncmp("bch2_", msg, 5)) ++ return msg + 5; ++ return msg; ++} ++ + #endif /* _BCACHEFS_H */ +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index d6e4a496f02b..b4a04df5ea95 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -497,7 +497,8 @@ struct bch_sb_field { + x(members_v2, 11) \ + x(errors, 12) \ + x(ext, 13) \ +- x(downgrade, 14) ++ x(downgrade, 14) \ ++ x(recovery_passes, 15) + + #include "alloc_background_format.h" + #include "dirent_format.h" +@@ -510,6 +511,7 @@ struct bch_sb_field { + #include "logged_ops_format.h" + #include "lru_format.h" + #include "quota_format.h" ++#include "recovery_passes_format.h" + #include "reflink_format.h" + #include "replicas_format.h" + #include "snapshot_format.h" +@@ -695,7 +697,10 @@ struct bch_sb_field_ext { + x(stripe_backpointers, BCH_VERSION(1, 22)) \ + x(stripe_lru, BCH_VERSION(1, 23)) \ + x(casefolding, BCH_VERSION(1, 24)) \ +- x(extent_flags, BCH_VERSION(1, 25)) ++ x(extent_flags, BCH_VERSION(1, 25)) \ ++ x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \ ++ x(fast_device_removal, BCH_VERSION(1, 27)) \ ++ x(inode_has_case_insensitive, BCH_VERSION(1, 28)) + + enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, +@@ -846,7 +851,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); + LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); + LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); + LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); +-/* one free bit */ ++LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64); + LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); + LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); + LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); +@@ -867,7 +872,9 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, + LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); + LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); + LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); ++LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22); + LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23); ++LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24); + + static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) + { +@@ -922,7 +929,9 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u + x(alloc_v2, 17) \ + x(extents_across_btree_nodes, 18) \ + x(incompat_version_field, 19) \ +- x(casefolding, 20) ++ x(casefolding, 20) \ ++ x(no_alloc_info, 21) \ ++ x(small_image, 22) + + #define BCH_SB_FEATURES_ALWAYS \ + (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ +@@ -989,6 +998,19 @@ enum bch_error_actions { + BCH_ON_ERROR_NR + }; + ++#define BCH_DEGRADED_ACTIONS() \ ++ x(ask, 0) \ ++ x(yes, 1) \ ++ x(very, 2) \ ++ x(no, 3) ++ ++enum bch_degraded_actions { ++#define x(t, n) BCH_DEGRADED_##t = n, ++ BCH_DEGRADED_ACTIONS() ++#undef x ++ BCH_DEGRADED_ACTIONS_NR ++}; ++ + #define 
BCH_STR_HASH_TYPES() \ + x(crc32c, 0) \ + x(crc64, 1) \ +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +index 995ba32e9b6e..67e39f835b96 100644 +--- a/fs/bcachefs/bkey.c ++++ b/fs/bcachefs/bkey.c +@@ -47,11 +47,9 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out, + } + } + +-#ifdef CONFIG_BCACHEFS_DEBUG +- +-static void bch2_bkey_pack_verify(const struct bkey_packed *packed, +- const struct bkey *unpacked, +- const struct bkey_format *format) ++static void __bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) + { + struct bkey tmp; + +@@ -95,11 +93,13 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, + } + } + +-#else + static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, +- const struct bkey *unpacked, +- const struct bkey_format *format) {} +-#endif ++ const struct bkey *unpacked, ++ const struct bkey_format *format) ++{ ++ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) ++ __bch2_bkey_pack_verify(packed, unpacked, format); ++} + + struct pack_state { + const struct bkey_format *format; +@@ -398,7 +398,6 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) + return ret; + } + +-#ifdef CONFIG_BCACHEFS_DEBUG + static bool bkey_packed_successor(struct bkey_packed *out, + const struct btree *b, + struct bkey_packed k) +@@ -455,7 +454,6 @@ static bool bkey_format_has_too_big_fields(const struct bkey_format *f) + + return false; + } +-#endif + + /* + * Returns a packed key that compares <= in +@@ -472,9 +470,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, + const struct bkey_format *f = &b->format; + struct pack_state state = pack_state_init(f, out); + u64 *w = out->_data; +-#ifdef CONFIG_BCACHEFS_DEBUG + struct bpos orig = in; +-#endif + bool exact = true; + unsigned i; + +@@ -527,18 +523,18 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, + out->format = KEY_FORMAT_LOCAL_BTREE; + out->type = KEY_TYPE_deleted; + +-#ifdef CONFIG_BCACHEFS_DEBUG +- if (exact) { +- BUG_ON(bkey_cmp_left_packed(b, out, &orig)); +- } else { +- struct bkey_packed successor; ++ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { ++ if (exact) { ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig)); ++ } else { ++ struct bkey_packed successor; + +- BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); +- BUG_ON(bkey_packed_successor(&successor, b, *out) && +- bkey_cmp_left_packed(b, &successor, &orig) < 0 && +- !bkey_format_has_too_big_fields(f)); ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); ++ BUG_ON(bkey_packed_successor(&successor, b, *out) && ++ bkey_cmp_left_packed(b, &successor, &orig) < 0 && ++ !bkey_format_has_too_big_fields(f)); ++ } + } +-#endif + + return exact ? 
BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; + } +@@ -627,14 +623,11 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) + } + } + +-#ifdef CONFIG_BCACHEFS_DEBUG +- { +- struct printbuf buf = PRINTBUF; +- ++ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { ++ CLASS(printbuf, buf)(); + BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); +- printbuf_exit(&buf); + } +-#endif ++ + return ret; + } + +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index 054e2d5e8448..3ccd521c190a 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) + static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) + { + return bpos_eq(l.k->p, r.k->p) && ++ l.k->size == r.k->size && + bkey_bytes(l.k) == bkey_bytes(r.k) && + !memcmp(l.v, r.v, bkey_val_bytes(l.k)); + } +@@ -397,8 +398,7 @@ __bkey_unpack_key_format_checked(const struct btree *b, + compiled_unpack_fn unpack_fn = b->aux_data; + unpack_fn(dst, src); + +- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && +- bch2_expensive_debug_checks) { ++ if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { + struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); + + BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index 00d05ccfaf73..fcd8c82cba4f 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -356,7 +356,7 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) + return ops->key_merge && + bch2_bkey_maybe_mergable(l.k, r.k) && + (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && +- !bch2_key_merging_disabled && ++ !static_branch_unlikely(&bch2_key_merging_disabled) && + ops->key_merge(c, l, r); + } + +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +index 9a4a83d6fd2d..72698c0d9f0e 100644 +--- a/fs/bcachefs/bset.c ++++ b/fs/bcachefs/bset.c +@@ -58,7 +58,7 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, + struct bkey_packed *_k, *_n; + struct bkey uk, n; + struct bkey_s_c k; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + if (!i->u64s) + return; +@@ -97,8 +97,6 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b, + if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p)) + printk(KERN_ERR "Duplicate keys\n"); + } +- +- printbuf_exit(&buf); + } + + void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) +@@ -113,7 +111,7 @@ void bch2_dump_btree_node_iter(struct btree *b, + struct btree_node_iter *iter) + { + struct btree_node_iter_set *set; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + printk(KERN_ERR "btree node iter with %u/%u sets:\n", + __btree_node_iter_used(iter), b->nsets); +@@ -128,8 +126,6 @@ void bch2_dump_btree_node_iter(struct btree *b, + printk(KERN_ERR "set %zu key %u: %s\n", + t - b->set, set->k, buf.buf); + } +- +- printbuf_exit(&buf); + } + + struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) +@@ -144,8 +140,6 @@ struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) + return nr; + } + +-#ifdef CONFIG_BCACHEFS_DEBUG +- + void __bch2_verify_btree_nr_keys(struct btree *b) + { + struct btree_nr_keys nr = bch2_btree_node_count_keys(b); +@@ -153,7 +147,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) + BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); + } + +-static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, ++static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, + struct btree 
*b) + { + struct btree_node_iter iter = *_iter; +@@ -190,8 +184,8 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, + } + } + +-void bch2_btree_node_iter_verify(struct btree_node_iter *iter, +- struct btree *b) ++void __bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) + { + struct btree_node_iter_set *set, *s2; + struct bkey_packed *k, *p; +@@ -237,8 +231,8 @@ void bch2_btree_node_iter_verify(struct btree_node_iter *iter, + } + } + +-void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, +- struct bkey_packed *insert, unsigned clobber_u64s) ++static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, ++ struct bkey_packed *insert, unsigned clobber_u64s) + { + struct bset_tree *t = bch2_bkey_to_bset(b, where); + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); +@@ -285,12 +279,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + #endif + } + +-#else +- +-static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, +- struct btree *b) {} ++static inline void bch2_verify_insert_pos(struct btree *b, ++ struct bkey_packed *where, ++ struct bkey_packed *insert, ++ unsigned clobber_u64s) ++{ ++ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) ++ __bch2_verify_insert_pos(b, where, insert, clobber_u64s); ++} + +-#endif + + /* Auxiliary search trees */ + +@@ -361,23 +358,6 @@ static struct bkey_float *bkey_float(const struct btree *b, + return ro_aux_tree_base(b, t)->f + idx; + } + +-static void bset_aux_tree_verify(struct btree *b) +-{ +-#ifdef CONFIG_BCACHEFS_DEBUG +- for_each_bset(b, t) { +- if (t->aux_data_offset == U16_MAX) +- continue; +- +- BUG_ON(t != b->set && +- t[-1].aux_data_offset == U16_MAX); +- +- BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); +- BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); +- BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); +- } +-#endif +-} +- + void bch2_btree_keys_init(struct btree *b) + { + unsigned i; +@@ -495,15 +475,11 @@ static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, + }; + } + +-static void bch2_bset_verify_rw_aux_tree(struct btree *b, +- struct bset_tree *t) ++static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t) + { + struct bkey_packed *k = btree_bkey_first(b, t); + unsigned j = 0; + +- if (!bch2_expensive_debug_checks) +- return; +- + BUG_ON(bset_has_ro_aux_tree(t)); + + if (!bset_has_rw_aux_tree(t)) +@@ -530,6 +506,58 @@ static void bch2_bset_verify_rw_aux_tree(struct btree *b, + } + } + ++static inline void bch2_bset_verify_rw_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) ++ __bch2_bset_verify_rw_aux_tree(b, t); ++} ++ ++static void __bset_aux_tree_verify_ro(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *k = btree_bkey_first(b, t); ++ ++ eytzinger1_for_each(j, t->size - 1) { ++ while (tree_to_bkey(b, t, j) > k && ++ k != btree_bkey_last(b, t)) ++ k = bkey_p_next(k); ++ ++ BUG_ON(tree_to_bkey(b, t, j) != k); ++ } ++} ++ ++static void __bset_aux_tree_verify(struct btree *b) ++{ ++ for_each_bset(b, t) { ++ if (t->aux_data_offset == U16_MAX) ++ continue; ++ ++ BUG_ON(t != b->set && ++ t[-1].aux_data_offset == U16_MAX); ++ ++ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); ++ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); ++ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); ++ ++ switch 
(bset_aux_tree_type(t)) { ++ case BSET_RO_AUX_TREE: ++ __bset_aux_tree_verify_ro(b, t); ++ break; ++ case BSET_RW_AUX_TREE: ++ __bch2_bset_verify_rw_aux_tree(b, t); ++ break; ++ default: ++ break; ++ } ++ } ++} ++ ++static inline void bset_aux_tree_verify(struct btree *b) ++{ ++ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) ++ __bset_aux_tree_verify(b); ++} ++ + /* returns idx of first entry >= offset: */ + static unsigned rw_aux_tree_bsearch(struct btree *b, + struct bset_tree *t, +@@ -869,7 +897,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, + k = p; + } + +- if (bch2_expensive_debug_checks) { ++ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { + BUG_ON(ret >= orig_k); + + for (i = ret +@@ -1195,7 +1223,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, + bkey_iter_pos_cmp(b, m, search) < 0) + m = bkey_p_next(m); + +- if (bch2_expensive_debug_checks) { ++ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); + + BUG_ON(prev && +@@ -1435,9 +1463,9 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, + void bch2_btree_node_iter_advance(struct btree_node_iter *iter, + struct btree *b) + { +- if (bch2_expensive_debug_checks) { +- bch2_btree_node_iter_verify(iter, b); +- bch2_btree_node_iter_next_check(iter, b); ++ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { ++ __bch2_btree_node_iter_verify(iter, b); ++ __bch2_btree_node_iter_next_check(iter, b); + } + + __bch2_btree_node_iter_advance(iter, b); +@@ -1453,8 +1481,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, + struct btree_node_iter_set *set; + unsigned end = 0; + +- if (bch2_expensive_debug_checks) +- bch2_btree_node_iter_verify(iter, b); ++ bch2_btree_node_iter_verify(iter, b); + + for_each_bset(b, t) { + k = bch2_bkey_prev_all(b, t, +@@ -1489,8 +1516,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, + iter->data[0].k = __btree_node_key_to_offset(b, prev); + iter->data[0].end = end; + +- if (bch2_expensive_debug_checks) +- bch2_btree_node_iter_verify(iter, b); ++ bch2_btree_node_iter_verify(iter, b); + return prev; + } + +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +index 6953d55b72cc..a15ecf9d006e 100644 +--- a/fs/bcachefs/bset.h ++++ b/fs/bcachefs/bset.h +@@ -517,27 +517,19 @@ void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); + void bch2_dump_btree_node(struct bch_fs *, struct btree *); + void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); + +-#ifdef CONFIG_BCACHEFS_DEBUG +- + void __bch2_verify_btree_nr_keys(struct btree *); +-void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); +-void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, +- struct bkey_packed *, unsigned); +- +-#else ++void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); + +-static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} + static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, +- struct btree *b) {} +-static inline void bch2_verify_insert_pos(struct btree *b, +- struct bkey_packed *where, +- struct bkey_packed *insert, +- unsigned clobber_u64s) {} +-#endif ++ struct btree *b) ++{ ++ if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) ++ __bch2_btree_node_iter_verify(iter, b); ++} + + static inline void bch2_verify_btree_nr_keys(struct btree *b) + { +- if 
(bch2_debug_check_btree_accounting) ++ if (static_branch_unlikely(&bch2_debug_check_btree_accounting)) + __bch2_verify_btree_nr_keys(b); + } + +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 899891295797..9261ad043564 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -15,14 +15,9 @@ + + #include + #include ++#include + #include + +-#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ +-do { \ +- if (shrinker_counter) \ +- bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \ +-} while (0) +- + const char * const bch2_btree_node_flags[] = { + "typebit", + "typebit", +@@ -83,15 +78,14 @@ void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b) + { + struct btree_cache *bc = &c->btree_cache; + +- mutex_lock(&bc->lock); +- __bch2_btree_node_to_freelist(bc, b); +- mutex_unlock(&bc->lock); ++ scoped_guard(mutex, &bc->lock) ++ __bch2_btree_node_to_freelist(bc, b); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + } + +-static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) ++void __btree_node_data_free(struct btree *b) + { + BUG_ON(!list_empty(&b->list)); + BUG_ON(btree_node_hashed(b)); +@@ -118,16 +112,17 @@ static void __btree_node_data_free(struct btree_cache *bc, struct btree *b) + munmap(b->aux_data, btree_aux_data_bytes(b)); + #endif + b->aux_data = NULL; +- +- btree_node_to_freedlist(bc, b); + } + + static void btree_node_data_free(struct btree_cache *bc, struct btree *b) + { + BUG_ON(list_empty(&b->list)); + list_del_init(&b->list); ++ ++ __btree_node_data_free(b); ++ + --bc->nr_freeable; +- __btree_node_data_free(bc, b); ++ btree_node_to_freedlist(bc, b); + } + + static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, +@@ -155,7 +150,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) + + b->data = kvmalloc(btree_buf_bytes(b), gfp); + if (!b->data) +- return -BCH_ERR_ENOMEM_btree_node_mem_alloc; ++ return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); + #ifdef __KERNEL__ + b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); + #else +@@ -168,7 +163,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) + if (!b->aux_data) { + kvfree(b->data); + b->data = NULL; +- return -BCH_ERR_ENOMEM_btree_node_mem_alloc; ++ return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); + } + + return 0; +@@ -191,10 +186,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) + + struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) + { +- struct btree_cache *bc = &c->btree_cache; +- struct btree *b; +- +- b = __btree_node_mem_alloc(c, GFP_KERNEL); ++ struct btree *b = __btree_node_mem_alloc(c, GFP_KERNEL); + if (!b) + return NULL; + +@@ -204,8 +196,6 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) + } + + bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); +- +- __bch2_btree_node_to_freelist(bc, b); + return b; + } + +@@ -224,14 +214,13 @@ void bch2_node_pin(struct bch_fs *c, struct btree *b) + { + struct btree_cache *bc = &c->btree_cache; + +- mutex_lock(&bc->lock); +- if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { ++ guard(mutex)(&bc->lock); ++ if (!btree_node_is_root(c, b) && !btree_node_pinned(b)) { + set_btree_node_pinned(b); + list_move(&b->list, &bc->live[1].list); + bc->live[0].nr--; + bc->live[1].nr++; + } +- mutex_unlock(&bc->lock); + } + + void bch2_btree_cache_unpin(struct bch_fs *c) +@@ -239,7 +228,7 @@ void bch2_btree_cache_unpin(struct bch_fs *c) + struct btree_cache *bc 
= &c->btree_cache; + struct btree *b, *n; + +- mutex_lock(&bc->lock); ++ guard(mutex)(&bc->lock); + c->btree_cache.pinned_nodes_mask[0] = 0; + c->btree_cache.pinned_nodes_mask[1] = 0; + +@@ -249,8 +238,6 @@ void bch2_btree_cache_unpin(struct bch_fs *c) + bc->live[0].nr++; + bc->live[1].nr--; + } +- +- mutex_unlock(&bc->lock); + } + + /* Btree in memory cache - hash table */ +@@ -305,11 +292,8 @@ int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, + b->c.level = level; + b->c.btree_id = id; + +- mutex_lock(&bc->lock); +- int ret = __bch2_btree_node_hash_insert(bc, b); +- mutex_unlock(&bc->lock); +- +- return ret; ++ guard(mutex)(&bc->lock); ++ return __bch2_btree_node_hash_insert(bc, b); + } + + void bch2_btree_node_update_key_early(struct btree_trans *trans, +@@ -326,7 +310,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans, + + b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); + if (!IS_ERR_OR_NULL(b)) { +- mutex_lock(&c->btree_cache.lock); ++ guard(mutex)(&c->btree_cache.lock); + + __bch2_btree_node_hash_remove(&c->btree_cache, b); + +@@ -334,7 +318,6 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans, + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + +- mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&b->c.lock); + } + +@@ -350,115 +333,119 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, + return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); + } + +-/* +- * this version is for btree nodes that have already been freed (we're not +- * reaping a real btree node) +- */ +-static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) ++static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, ++ bool flush, bool locked) + { + struct btree_cache *bc = &c->btree_cache; +- int ret = 0; + + lockdep_assert_held(&bc->lock); +-wait_on_io: +- if (b->flags & ((1U << BTREE_NODE_dirty)| +- (1U << BTREE_NODE_read_in_flight)| ++ ++ if (btree_node_noevict(b)) { ++ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; ++ return bch_err_throw(c, ENOMEM_btree_node_reclaim); ++ } ++ if (btree_node_write_blocked(b)) { ++ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; ++ return bch_err_throw(c, ENOMEM_btree_node_reclaim); ++ } ++ if (btree_node_will_make_reachable(b)) { ++ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; ++ return bch_err_throw(c, ENOMEM_btree_node_reclaim); ++ } ++ ++ if (btree_node_dirty(b)) { ++ if (!flush) { ++ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; ++ return bch_err_throw(c, ENOMEM_btree_node_reclaim); ++ } ++ ++ if (locked) { ++ /* ++ * Using the underscore version because we don't want to compact ++ * bsets after the write, since this node is about to be evicted ++ * - unless btree verify mode is enabled, since it runs out of ++ * the post write cleanup: ++ */ ++ if (static_branch_unlikely(&bch2_verify_btree_ondisk)) ++ bch2_btree_node_write(c, b, SIX_LOCK_intent, ++ BTREE_WRITE_cache_reclaim); ++ else ++ __bch2_btree_node_write(c, b, ++ BTREE_WRITE_cache_reclaim); ++ } ++ } ++ ++ if (b->flags & ((1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) { +- if (btree_node_dirty(b)) +- BTREE_CACHE_NOT_FREED_INCREMENT(dirty); +- else if (btree_node_read_in_flight(b)) +- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); ++ if (btree_node_read_in_flight(b)) ++ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; + else if 
(btree_node_write_in_flight(b)) +- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); +- return -BCH_ERR_ENOMEM_btree_node_reclaim; ++ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; ++ return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + ++ if (locked) ++ return -EINTR; ++ + /* XXX: waiting on IO with btree cache lock held */ + bch2_btree_node_wait_on_read(b); + bch2_btree_node_wait_on_write(b); + } + ++ return 0; ++} ++ ++/* ++ * this version is for btree nodes that have already been freed (we're not ++ * reaping a real btree node) ++ */ ++static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ int ret = 0; ++ ++ lockdep_assert_held(&bc->lock); ++retry_unlocked: ++ ret = __btree_node_reclaim_checks(c, b, flush, false); ++ if (ret) ++ return ret; ++ + if (!six_trylock_intent(&b->c.lock)) { +- BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); +- return -BCH_ERR_ENOMEM_btree_node_reclaim; ++ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; ++ return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + + if (!six_trylock_write(&b->c.lock)) { +- BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); +- goto out_unlock_intent; ++ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; ++ six_unlock_intent(&b->c.lock); ++ return bch_err_throw(c, ENOMEM_btree_node_reclaim); + } + + /* recheck under lock */ +- if (b->flags & ((1U << BTREE_NODE_read_in_flight)| +- (1U << BTREE_NODE_write_in_flight))) { +- if (!flush) { +- if (btree_node_read_in_flight(b)) +- BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); +- else if (btree_node_write_in_flight(b)) +- BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); +- goto out_unlock; +- } ++ ret = __btree_node_reclaim_checks(c, b, flush, true); ++ if (ret) { + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +- goto wait_on_io; +- } +- +- if (btree_node_noevict(b)) { +- BTREE_CACHE_NOT_FREED_INCREMENT(noevict); +- goto out_unlock; +- } +- if (btree_node_write_blocked(b)) { +- BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); +- goto out_unlock; +- } +- if (btree_node_will_make_reachable(b)) { +- BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); +- goto out_unlock; ++ if (ret == -EINTR) ++ goto retry_unlocked; ++ return ret; + } + +- if (btree_node_dirty(b)) { +- if (!flush) { +- BTREE_CACHE_NOT_FREED_INCREMENT(dirty); +- goto out_unlock; +- } +- /* +- * Using the underscore version because we don't want to compact +- * bsets after the write, since this node is about to be evicted +- * - unless btree verify mode is enabled, since it runs out of +- * the post write cleanup: +- */ +- if (bch2_verify_btree_ondisk) +- bch2_btree_node_write(c, b, SIX_LOCK_intent, +- BTREE_WRITE_cache_reclaim); +- else +- __bch2_btree_node_write(c, b, +- BTREE_WRITE_cache_reclaim); +- +- six_unlock_write(&b->c.lock); +- six_unlock_intent(&b->c.lock); +- goto wait_on_io; +- } +-out: + if (b->hash_val && !ret) +- trace_and_count(c, btree_cache_reap, c, b); +- return ret; +-out_unlock: +- six_unlock_write(&b->c.lock); +-out_unlock_intent: +- six_unlock_intent(&b->c.lock); +- ret = -BCH_ERR_ENOMEM_btree_node_reclaim; +- goto out; ++ trace_btree_node(c, b, btree_cache_reap); ++ ++ return 0; + } + +-static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) ++static int btree_node_reclaim(struct bch_fs *c, struct btree *b) + { +- return __btree_node_reclaim(c, b, false, shrinker_counter); ++ return __btree_node_reclaim(c, b, false); + } + + static int 
btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) + { +- return __btree_node_reclaim(c, b, true, false); ++ return __btree_node_reclaim(c, b, true); + } + + static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, +@@ -476,7 +463,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + unsigned long ret = SHRINK_STOP; + bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; + +- if (bch2_btree_shrinker_disabled) ++ if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) + return SHRINK_STOP; + + mutex_lock(&bc->lock); +@@ -490,7 +477,10 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + * IO can always make forward progress: + */ + can_free = btree_cache_can_free(list); +- nr = min_t(unsigned long, nr, can_free); ++ if (nr > can_free) { ++ bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free; ++ nr = can_free; ++ } + + i = 0; + list_for_each_entry_safe(b, t, &bc->freeable, list) { +@@ -506,7 +496,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + if (touched >= nr) + goto out; + +- if (!btree_node_reclaim(c, b, true)) { ++ if (!btree_node_reclaim(c, b)) { + btree_node_data_free(bc, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +@@ -521,10 +511,11 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + if (btree_node_accessed(b)) { + clear_btree_node_accessed(b); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; +- --touched;; +- } else if (!btree_node_reclaim(c, b, true)) { ++ --touched; ++ } else if (!btree_node_reclaim(c, b)) { + __bch2_btree_node_hash_remove(bc, b); +- __btree_node_data_free(bc, b); ++ __btree_node_data_free(b); ++ btree_node_to_freedlist(bc, b); + + freed++; + bc->nr_freed++; +@@ -569,12 +560,25 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, + { + struct btree_cache_list *list = shrink->private_data; + +- if (bch2_btree_shrinker_disabled) ++ if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) + return 0; + + return btree_cache_can_free(list); + } + ++static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) ++{ ++ struct btree_cache_list *list = shrink->private_data; ++ struct btree_cache *bc = container_of(list, struct btree_cache, live[list->idx]); ++ ++ char *cbuf; ++ size_t buflen = seq_buf_get_buf(s, &cbuf); ++ struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); ++ ++ bch2_btree_cache_to_text(&out, bc); ++ seq_buf_commit(s, out.pos); ++} ++ + void bch2_fs_btree_cache_exit(struct bch_fs *c) + { + struct btree_cache *bc = &c->btree_cache; +@@ -652,9 +656,12 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) + + bch2_recalc_btree_reserve(c); + +- for (i = 0; i < bc->nr_reserve; i++) +- if (!__bch2_btree_node_mem_alloc(c)) ++ for (i = 0; i < bc->nr_reserve; i++) { ++ struct btree *b = __bch2_btree_node_mem_alloc(c); ++ if (!b) + goto err; ++ __bch2_btree_node_to_freelist(bc, b); ++ } + + list_splice_init(&bc->live[0].list, &bc->freeable); + +@@ -666,6 +673,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) + bc->live[0].shrink = shrink; + shrink->count_objects = bch2_btree_cache_count; + shrink->scan_objects = bch2_btree_cache_scan; ++ shrink->to_text = bch2_btree_cache_shrinker_to_text; + shrink->seeks = 2; + shrink->private_data = &bc->live[0]; + shrinker_register(shrink); +@@ -676,13 +684,14 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) + bc->live[1].shrink = shrink; + shrink->count_objects = bch2_btree_cache_count; + 
shrink->scan_objects = bch2_btree_cache_scan;
++ shrink->to_text = bch2_btree_cache_shrinker_to_text;
+ shrink->seeks = 8;
+ shrink->private_data = &bc->live[1];
+ shrinker_register(shrink);
+
+ return 0;
+ err:
+- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
++ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
+ }
+
+ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
+@@ -727,7 +736,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
+
+ if (!cl) {
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
+- return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
++ return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock);
+ }
+
+ closure_wait(&bc->alloc_wait, cl);
+@@ -741,7 +750,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure
+ }
+
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
+- return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
++ return bch_err_throw(c, btree_cache_cannibalize_lock_blocked);
+
+ success:
+ trace_and_count(c, btree_cache_cannibalize_lock, trans);
+@@ -755,7 +764,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c)
+
+ for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++)
+ list_for_each_entry_reverse(b, &bc->live[i].list, list)
+- if (!btree_node_reclaim(c, b, false))
++ if (!btree_node_reclaim(c, b))
+ return b;
+
+ while (1) {
+@@ -790,7 +799,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
+ * disk node. Check the freed list before allocating a new one:
+ */
+ list_for_each_entry(b, freed, list)
+- if (!btree_node_reclaim(c, b, false)) {
++ if (!btree_node_reclaim(c, b)) {
+ list_del_init(&b->list);
+ goto got_node;
+ }
+@@ -817,7 +826,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
+ * the list. 
Check if there's any freed nodes there: + */ + list_for_each_entry(b2, &bc->freeable, list) +- if (!btree_node_reclaim(c, b2, false)) { ++ if (!btree_node_reclaim(c, b2)) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + +@@ -913,20 +922,18 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, + } + + if (unlikely(!bkey_is_btree_ptr(&k->k))) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf); +- printbuf_exit(&buf); + return ERR_PTR(ret); + } + + if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf); +- printbuf_exit(&buf); + return ERR_PTR(ret); + } + +@@ -977,7 +984,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, + + /* Unlock before doing IO: */ + six_unlock_intent(&b->c.lock); +- bch2_trans_unlock_noassert(trans); ++ bch2_trans_unlock(trans); + + bch2_btree_node_read(trans, b, sync); + +@@ -1001,11 +1008,10 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, + + static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) + { +- struct printbuf buf = PRINTBUF; +- +- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) ++ if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) + return; + ++ CLASS(printbuf, buf)(); + prt_printf(&buf, + "btree node header doesn't match ptr: "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); +@@ -1021,8 +1027,6 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) + bch2_bpos_to_text(&buf, b->data->max_key); + + bch2_fs_topology_error(c, "%s", buf.buf); +- +- printbuf_exit(&buf); + } + + static inline void btree_check_header(struct bch_fs *c, struct btree *b) +@@ -1492,9 +1496,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc + + prt_btree_cache_line(out, c, "live:", bc->live[0].nr); + prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); +- prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable); ++ prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve); ++ prt_btree_cache_line(out, c, "freed:", bc->nr_freeable); + prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); +- prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); ++ prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? 
"held" : "not held"); + prt_newline(out); + + for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { +@@ -1505,6 +1510,7 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc + } + + prt_newline(out); ++ prt_printf(out, "counters since mount:\n"); + prt_printf(out, "freed:\t%zu\n", bc->nr_freed); + prt_printf(out, "not freed:\n"); + +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index ca3c1b145330..035b2cb25077 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -30,6 +30,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *, enum btree_id, unsig + void bch2_btree_cache_cannibalize_unlock(struct btree_trans *); + int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *); + ++void __btree_node_data_free(struct btree *); + struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); + struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); + +@@ -143,6 +144,14 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) + return r ? r->b : NULL; + } + ++static inline bool btree_node_is_root(struct bch_fs *c, struct btree *b) ++{ ++ struct btree *root = btree_node_root(c, b); ++ ++ BUG_ON(b != root && b->c.level >= root->c.level); ++ return b == root; ++} ++ + const char *bch2_btree_id_str(enum btree_id); /* avoid */ + void bch2_btree_id_to_text(struct printbuf *, enum btree_id); + void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned); +@@ -153,4 +162,15 @@ void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btr + void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); + void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); + ++#define trace_btree_node(_c, _b, event) \ ++do { \ ++ if (trace_##event##_enabled()) { \ ++ CLASS(printbuf, buf)(); \ ++ printbuf_indent_add(&buf, 2); \ ++ bch2_btree_pos_to_text(&buf, c, b); \ ++ trace_##event(c, buf.buf); \ ++ } \ ++ count_event(c, event); \ ++} while (0); ++ + #endif /* _BCACHEFS_BTREE_CACHE_H */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 37b69d89341f..6b91649688da 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -22,6 +22,7 @@ + #include "debug.h" + #include "disk_accounting.h" + #include "ec.h" ++#include "enumerated_ref.h" + #include "error.h" + #include "extents.h" + #include "journal.h" +@@ -43,10 +44,6 @@ + #include + #include + +-#define DROP_THIS_NODE 10 +-#define DROP_PREV_NODE 11 +-#define DID_FILL_FROM_SCAN 12 +- + /* + * Returns true if it's a btree we can easily reconstruct, or otherwise won't + * cause data loss if it's missing: +@@ -94,11 +91,10 @@ static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k) + + static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) + { +- preempt_disable(); ++ guard(preempt)(); + write_seqcount_begin(&c->gc_pos_lock); + c->gc_pos = new_pos; + write_seqcount_end(&c->gc_pos_lock); +- preempt_enable(); + } + + static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +@@ -137,19 +133,18 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) + int ret; + + if (c->opts.verbose) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, " -> "); + bch2_bpos_to_text(&buf, new_min); + + bch_info(c, "%s(): %s", __func__, buf.buf); +- printbuf_exit(&buf); + } + + new = 
kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); + if (!new) +- return -BCH_ERR_ENOMEM_gc_repair_key; ++ return bch_err_throw(c, ENOMEM_gc_repair_key); + + btree_ptr_to_v2(b, new); + b->data->min_key = new_min; +@@ -173,14 +168,13 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) + int ret; + + if (c->opts.verbose) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, " -> "); + bch2_bpos_to_text(&buf, new_max); + + bch_info(c, "%s(): %s", __func__, buf.buf); +- printbuf_exit(&buf); + } + + ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); +@@ -189,7 +183,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) + + new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); + if (!new) +- return -BCH_ERR_ENOMEM_gc_repair_key; ++ return bch_err_throw(c, ENOMEM_gc_repair_key); + + btree_ptr_to_v2(b, new); + b->data->max_key = new_max; +@@ -204,13 +198,12 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) + + bch2_btree_node_drop_keys_outside_node(b); + +- mutex_lock(&c->btree_cache.lock); ++ guard(mutex)(&c->btree_cache.lock); + __bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, &new->k_i); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); +- mutex_unlock(&c->btree_cache.lock); + return 0; + } + +@@ -222,7 +215,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * + struct bpos expected_start = !prev + ? b->data->min_key + : bpos_successor(prev->key.k.p); +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && +@@ -252,10 +245,10 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * + expected_start, + bpos_predecessor(cur->data->min_key)); + if (ret) +- goto err; ++ return ret; + + *pulled_from_scan = cur->data->min_key; +- ret = DID_FILL_FROM_SCAN; ++ ret = bch_err_throw(c, topology_repair_did_fill_from_scan); + } else { + if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, + "btree node with incorrect min_key%s", buf.buf)) +@@ -266,7 +259,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * + if (bpos_ge(prev->data->min_key, cur->data->min_key)) { /* fully? */ + if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_next_node, + "btree node overwritten by next node%s", buf.buf)) +- ret = DROP_PREV_NODE; ++ ret = bch_err_throw(c, topology_repair_drop_prev_node); + } else { + if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, + "btree node with incorrect max_key%s", buf.buf)) +@@ -277,7 +270,7 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * + if (bpos_ge(expected_start, cur->data->max_key)) { /* fully? 
*/ + if (mustfix_fsck_err(trans, btree_node_topology_overwritten_by_prev_node, + "btree node overwritten by prev node%s", buf.buf)) +- ret = DROP_THIS_NODE; ++ ret = bch_err_throw(c, topology_repair_drop_this_node); + } else { + if (mustfix_fsck_err(trans, btree_node_topology_bad_min_key, + "btree node with incorrect min_key%s", buf.buf)) +@@ -285,6 +278,39 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * + } + } + } ++fsck_err: ++ return ret; ++} ++ ++static int btree_check_root_boundaries(struct btree_trans *trans, struct btree *b) ++{ ++ struct bch_fs *c = trans->c; ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && ++ !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, ++ b->data->min_key)); ++ ++ prt_str(&buf, " at "); ++ bch2_btree_pos_to_text(&buf, c, b); ++ ++ if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), ++ trans, btree_node_topology_bad_root_min_key, ++ "btree root with incorrect min_key%s", buf.buf)) { ++ ret = set_node_min(c, b, POS_MIN); ++ if (ret) ++ goto err; ++ } ++ ++ if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), ++ trans, btree_node_topology_bad_root_max_key, ++ "btree root with incorrect min_key%s", buf.buf)) { ++ ret = set_node_max(c, b, SPOS_MAX); ++ if (ret) ++ goto err; ++ } ++ + err: + fsck_err: + printbuf_exit(&buf); +@@ -295,7 +321,7 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, + struct btree *child, struct bpos *pulled_from_scan) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + if (bpos_eq(child->key.k.p, b->key.k.p)) +@@ -316,17 +342,15 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, + ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0, + bpos_successor(child->key.k.p), b->key.k.p); + if (ret) +- goto err; ++ return ret; + + *pulled_from_scan = b->key.k.p; +- ret = DID_FILL_FROM_SCAN; ++ ret = bch_err_throw(c, topology_repair_did_fill_from_scan); + } else { + ret = set_node_max(c, child, b->key.k.p); + } + } +-err: + fsck_err: +- printbuf_exit(&buf); + return ret; + } + +@@ -339,7 +363,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct + struct bkey_buf prev_k, cur_k; + struct btree *prev = NULL, *cur = NULL; + bool have_child, new_pass = false; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + if (!b->c.level) +@@ -370,20 +394,13 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); + +- if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), +- trans, btree_node_read_error, +- "Topology repair: unreadable btree node at\n%s", +- buf.buf)) { ++ if (bch2_err_matches(ret, EIO)) { + bch2_btree_node_evict(trans, cur_k.k); + cur = NULL; + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + if (ret) + break; +- +- ret = bch2_btree_lost_data(c, b->c.btree_id); +- if (ret) +- break; + continue; + } + +@@ -403,13 +420,17 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct + continue; + } + +- ret = btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan); +- if (ret == DID_FILL_FROM_SCAN) { ++ ret = lockrestart_do(trans, ++ btree_check_node_boundaries(trans, b, prev, cur, pulled_from_scan)); ++ if (ret && !bch2_err_matches(ret, BCH_ERR_topology_repair)) ++ goto err; ++ ++ if 
(bch2_err_matches(ret, BCH_ERR_topology_repair_did_fill_from_scan)) { + new_pass = true; + ret = 0; + } + +- if (ret == DROP_THIS_NODE) { ++ if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_this_node)) { + six_unlock_read(&cur->c.lock); + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, +@@ -424,7 +445,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct + six_unlock_read(&prev->c.lock); + prev = NULL; + +- if (ret == DROP_PREV_NODE) { ++ if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_prev_node)) { + bch_info(c, "dropped prev node"); + bch2_btree_node_evict(trans, prev_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, +@@ -444,8 +465,9 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct + + if (!ret && !IS_ERR_OR_NULL(prev)) { + BUG_ON(cur); +- ret = btree_repair_node_end(trans, b, prev, pulled_from_scan); +- if (ret == DID_FILL_FROM_SCAN) { ++ ret = lockrestart_do(trans, ++ btree_repair_node_end(trans, b, prev, pulled_from_scan)); ++ if (bch2_err_matches(ret, BCH_ERR_topology_repair_did_fill_from_scan)) { + new_pass = true; + ret = 0; + } +@@ -486,7 +508,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct + six_unlock_read(&cur->c.lock); + cur = NULL; + +- if (ret == DROP_THIS_NODE) { ++ if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_this_node)) { + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); +@@ -504,10 +526,16 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + ++ /* ++ * XXX: we're not passing the trans object here because we're not set up ++ * to handle a transaction restart - this code needs to be rewritten ++ * when we start doing online topology repair ++ */ ++ bch2_trans_unlock_long(trans); + if (mustfix_fsck_err_on(!have_child, +- trans, btree_node_topology_interior_node_empty, ++ c, btree_node_topology_interior_node_empty, + "empty interior btree node at %s", buf.buf)) +- ret = DROP_THIS_NODE; ++ ret = bch_err_throw(c, topology_repair_drop_this_node); + err: + fsck_err: + if (!IS_ERR_OR_NULL(prev)) +@@ -524,78 +552,99 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct + + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); +- printbuf_exit(&buf); ++ if (!bch2_err_matches(ret, BCH_ERR_topology_repair)) ++ bch_err_fn(c, ret); + return ret; + } + +-int bch2_check_topology(struct bch_fs *c) ++static int bch2_check_root(struct btree_trans *trans, enum btree_id btree, ++ bool *reconstructed_root) + { +- struct btree_trans *trans = bch2_trans_get(c); +- struct bpos pulled_from_scan = POS_MIN; +- struct printbuf buf = PRINTBUF; ++ struct bch_fs *c = trans->c; ++ struct btree_root *r = bch2_btree_id_root(c, btree); ++ CLASS(printbuf, buf)(); + int ret = 0; + +- bch2_trans_srcu_unlock(trans); ++ bch2_btree_id_to_text(&buf, btree); + +- for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { +- struct btree_root *r = bch2_btree_id_root(c, i); +- bool reconstructed_root = false; ++ if (r->error) { ++ bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); + +- printbuf_reset(&buf); +- bch2_btree_id_to_text(&buf, i); ++ ret = bch2_btree_has_scanned_nodes(c, btree); ++ if (ret < 0) ++ goto err; + +- if (r->error) { +- ret = bch2_btree_lost_data(c, i); +- if (ret) +- break; 
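The hunks above retire the bare DROP_THIS_NODE/DROP_PREV_NODE/DID_FILL_FROM_SCAN integers in favour of real error codes thrown with bch_err_throw() and tested with bch2_err_matches(), so one check can cover the whole topology_repair family. Below is a minimal userspace model of that class-matching idea; the names, numeric range, and matching rule are invented for illustration (the real tables live in fs/bcachefs/errcode.h):

#include <stdbool.h>
#include <stdio.h>

/* Invented code space: a "class" code followed by its members. */
enum {
	ERR_topology_repair = 1000,
	ERR_topology_repair_drop_this_node,
	ERR_topology_repair_drop_prev_node,
	ERR_topology_repair_did_fill_from_scan,
	ERR_topology_repair_end,
};

/* Sketch of bch2_err_matches(): a code matches itself and its class. */
static bool err_matches(int ret, int class)
{
	int e = -ret;

	if (class == ERR_topology_repair)
		return e >= ERR_topology_repair && e < ERR_topology_repair_end;
	return e == class;
}

int main(void)
{
	int ret = -ERR_topology_repair_drop_prev_node;

	/* One test covers every repair-specific signal: */
	printf("is repair signal: %d\n", err_matches(ret, ERR_topology_repair));
	printf("is drop_this_node: %d\n",
	       err_matches(ret, ERR_topology_repair_drop_this_node));
	return 0;
}

The benefit over magic ints is that these values propagate safely through code that only understands negative errnos, instead of being mistaken for success or a real errno.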
+-reconstruct_root:
+- bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf);
++ if (!ret) {
++ __fsck_err(trans,
++ FSCK_CAN_FIX|(!btree_id_important(btree) ? FSCK_AUTOFIX : 0),
++ btree_root_unreadable_and_scan_found_nothing,
++ "no nodes found for btree %s, continue?", buf.buf);
+
+ r->alive = false;
+ r->error = 0;
++ bch2_btree_root_alloc_fake_trans(trans, btree, 0);
++ } else {
++ r->alive = false;
++ r->error = 0;
++ bch2_btree_root_alloc_fake_trans(trans, btree, 1);
+
+- if (!bch2_btree_has_scanned_nodes(c, i)) {
+- __fsck_err(trans,
+- FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0),
+- btree_root_unreadable_and_scan_found_nothing,
+- "no nodes found for btree %s, continue?", buf.buf);
+- bch2_btree_root_alloc_fake_trans(trans, i, 0);
+- } else {
+- bch2_btree_root_alloc_fake_trans(trans, i, 1);
+- bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+- ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
+- if (ret)
+- break;
+- }
+-
+- reconstructed_root = true;
++ bch2_shoot_down_journal_keys(c, btree, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
++ ret = bch2_get_scanned_nodes(c, btree, 0, POS_MIN, SPOS_MAX);
++ if (ret)
++ return ret;
+ }
+
++ *reconstructed_root = true;
++ }
++err:
++fsck_err:
++ bch_err_fn(c, ret);
++ return ret;
++}
++
++int bch2_check_topology(struct bch_fs *c)
++{
++ CLASS(btree_trans, trans)(c);
++ struct bpos pulled_from_scan = POS_MIN;
++ int ret = 0;
++
++ bch2_trans_srcu_unlock(trans);
++
++ for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
++ bool reconstructed_root = false;
++recover:
++ ret = lockrestart_do(trans, bch2_check_root(trans, i, &reconstructed_root));
++ if (ret)
++ break;
++
++ struct btree_root *r = bch2_btree_id_root(c, i);
+ struct btree *b = r->b;
+
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+- ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
++ ret = btree_check_root_boundaries(trans, b) ?:
++ bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
+ six_unlock_read(&b->c.lock);
+
+- if (ret == DROP_THIS_NODE) {
+- mutex_lock(&c->btree_cache.lock);
+- bch2_btree_node_hash_remove(&c->btree_cache, b);
+- mutex_unlock(&c->btree_cache.lock);
++ if (bch2_err_matches(ret, BCH_ERR_topology_repair_drop_this_node)) {
++ scoped_guard(mutex, &c->btree_cache.lock)
++ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ r->b = NULL;
+
+- if (!reconstructed_root)
+- goto reconstruct_root;
++ if (!reconstructed_root) {
++ r->error = -EIO;
++ goto recover;
++ }
+
++ CLASS(printbuf, buf)();
++ bch2_btree_id_to_text(&buf, i);
+ bch_err(c, "empty btree root %s", buf.buf);
+ bch2_btree_root_alloc_fake_trans(trans, i, 0);
+ r->alive = false;
+ ret = 0;
+ }
+ }
+-fsck_err:
+- printbuf_exit(&buf);
+- bch2_trans_put(trans);
++
+ return ret;
+ }
+
+@@ -622,13 +671,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
+
+ struct bkey deleted = KEY(0, 0, 0);
+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
+- struct printbuf buf = PRINTBUF;
++ CLASS(printbuf, buf)();
+ int ret = 0;
+
+ deleted.p = k.k->p;
+
+ if (initial) {
+- BUG_ON(bch2_journal_seq_verify &&
++ BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) &&
+ k.k->bversion.lo > atomic64_read(&c->journal.seq));
+
+ if (fsck_err_on(btree_id != BTREE_ID_accounting &&
+@@ -646,10 +695,9 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf))) {
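bch2_check_root() above is invoked through lockrestart_do(), which reruns the body whenever the btree transaction signals a restart. A toy model of that retry loop, written as a self-contained userspace program (the constant and the trans_begin() side effects are stand-ins for the real BCH_ERR_transaction_restart machinery in fs/bcachefs/btree_iter.h):

#include <stdio.h>

/* Stand-in for -BCH_ERR_transaction_restart. */
#define ERR_TRANSACTION_RESTART (-100)

struct trans { int attempt; };

static void trans_begin(struct trans *t)
{
	t->attempt++;	/* the real bch2_trans_begin() also drops locks */
}

/* Sketch of lockrestart_do(trans, expr): rerun expr until it stops
 * asking for a restart, beginning the transaction anew each time. */
#define lockrestart_do(t, expr)					\
({								\
	int _ret;						\
	do {							\
		trans_begin(t);					\
		_ret = (expr);					\
	} while (_ret == ERR_TRANSACTION_RESTART);		\
	_ret;							\
})

static int check_root(struct trans *t)
{
	return t->attempt < 3 ? ERR_TRANSACTION_RESTART : 0;
}

int main(void)
{
	struct trans t = { 0 };
	int ret = lockrestart_do(&t, check_root(&t));

	printf("ret=%d after %d attempts\n", ret, t.attempt);
	return 0;
}

Wrapping the whole root check this way is what lets the rewritten bch2_check_topology() treat a restart as invisible: only a genuine error escapes the loop.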
+- mutex_lock(&c->sb_lock);
++ guard(mutex)(&c->sb_lock);
+ bch2_dev_btree_bitmap_mark(c, k);
+ bch2_write_super(c);
+- mutex_unlock(&c->sb_lock);
+ }
+
+ /*
+@@ -664,7 +712,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
+ if (ret)
+ goto out;
+
+- if (trans->nr_updates) {
++ if (bch2_trans_has_updates(trans)) {
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
+@@ -674,7 +722,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
+ BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags);
+ out:
+ fsck_err:
+- printbuf_exit(&buf);
+ bch_err_fn(c, ret);
+ return ret;
+ }
+@@ -702,6 +749,7 @@ static int bch2_gc_btree(struct btree_trans *trans,
+ gc_pos_set(c, gc_pos_btree(btree, level, k.k->p));
+ bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial);
+ }));
++ bch2_trans_iter_exit(&iter);
+ if (ret)
+ goto err;
+ }
+@@ -714,13 +762,13 @@ static int bch2_gc_btree(struct btree_trans *trans,
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN,
+ 0, bch2_btree_id_root(c, btree)->b->c.level, 0);
+- struct btree *b = bch2_btree_iter_peek_node(trans, &iter);
++ struct btree *b = bch2_btree_iter_peek_node(&iter);
+ ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ goto err_root;
+
+ if (b != btree_node_root(c, b)) {
+- bch2_trans_iter_exit(trans, &iter);
++ bch2_trans_iter_exit(&iter);
+ goto retry_root;
+ }
+
+@@ -728,7 +776,7 @@ static int bch2_gc_btree(struct btree_trans *trans,
+ struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+ ret = bch2_gc_mark_key(trans, btree, b->c.level + 1, NULL, NULL, k, initial);
+ err_root:
+- bch2_trans_iter_exit(trans, &iter);
++ bch2_trans_iter_exit(&iter);
+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
+ err:
+ bch_err_fn(c, ret);
+@@ -742,8 +790,8 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
+
+ static int bch2_gc_btrees(struct bch_fs *c)
+ {
+- struct btree_trans *trans = bch2_trans_get(c);
+- struct printbuf buf = PRINTBUF;
++ CLASS(btree_trans, trans)(c);
++ CLASS(printbuf, buf)();
+ int ret = 0;
+
+ struct progress_indicator_state progress;
+@@ -763,8 +811,6 @@ static int bch2_gc_btrees(struct bch_fs *c)
+ ret = bch2_gc_btree(trans, &progress, btree, true);
+ }
+
+- printbuf_exit(&buf);
+- bch2_trans_put(trans);
+ bch_err_fn(c, ret);
+ return ret;
+ }
+@@ -916,16 +962,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
+
+ static int bch2_gc_alloc_done(struct bch_fs *c)
+ {
++ CLASS(btree_trans, trans)(c);
+ int ret = 0;
+
+ for_each_member_device(c, ca) {
+- ret = bch2_trans_run(c,
+- for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
++ ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ POS(ca->dev_idx, ca->mi.nbuckets - 1),
+ BTREE_ITER_slots|BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+- bch2_alloc_write_key(trans, &iter, ca, k)));
++ bch2_alloc_write_key(trans, &iter, ca, k));
+ if (ret) {
+ bch2_dev_put(ca);
+ break;
+@@ -944,7 +990,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c)
+ ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL);
+ if (ret) {
+ bch2_dev_put(ca);
+- ret = -BCH_ERR_ENOMEM_gc_alloc_start;
++ ret = bch_err_throw(c, ENOMEM_gc_alloc_start);
+ break;
+ }
+ }
+@@ -958,7 +1004,7 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans,
+ struct bkey_s_c k)
+ {
+ struct bch_fs *c = trans->c;
+- struct printbuf buf = PRINTBUF;
++ CLASS(printbuf, buf)();
+ const struct bch_stripe *s;
+ struct gc_stripe *m;
+ bool bad = false;
+@@ -1003,18 +1049,17 @@ static int bch2_gc_write_stripes_key(struct btree_trans *trans,
+ ret = bch2_trans_update(trans, iter, &new->k_i, 0);
+ }
+ fsck_err:
+- printbuf_exit(&buf);
+ return ret;
+ }
+
+ static int bch2_gc_stripes_done(struct bch_fs *c)
+ {
+- return bch2_trans_run(c,
+- for_each_btree_key_commit(trans, iter,
++ CLASS(btree_trans, trans)(c);
++ return for_each_btree_key_commit(trans, iter,
+ BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_prefetch, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+- bch2_gc_write_stripes_key(trans, &iter, k)));
++ bch2_gc_write_stripes_key(trans, &iter, k));
+ }
+
+ /**
+@@ -1043,8 +1088,8 @@ int bch2_check_allocations(struct bch_fs *c)
+ {
+ int ret;
+
+- down_read(&c->state_lock);
+- down_write(&c->gc_lock);
++ guard(rwsem_read)(&c->state_lock);
++ guard(rwsem_write)(&c->gc_lock);
+
+ bch2_btree_interior_updates_flush(c);
+
+@@ -1073,22 +1118,21 @@ int bch2_check_allocations(struct bch_fs *c)
+ bch2_gc_stripes_done(c) ?:
+ bch2_gc_reflink_done(c);
+ out:
+- percpu_down_write(&c->mark_lock);
+- /* Indicates that gc is no longer in progress: */
+- __gc_pos_set(c, gc_phase(GC_PHASE_not_running));
+-
+- bch2_gc_free(c);
+- percpu_up_write(&c->mark_lock);
+-
+- up_write(&c->gc_lock);
+- up_read(&c->state_lock);
++ scoped_guard(percpu_write, &c->mark_lock) {
++ /* Indicates that gc is no longer in progress: */
++ __gc_pos_set(c, gc_phase(GC_PHASE_not_running));
++ bch2_gc_free(c);
++ }
+
+ /*
+ * At startup, allocations can happen directly instead of via the
+ * allocator thread - issue wakeup in case they blocked on gc_lock:
+ */
+ closure_wake_up(&c->freelist_wait);
+- bch_err_fn(c, ret);
++
++ if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags))
++ bch2_sb_members_clean_deleted(c);
++
+ return ret;
+ }
+
+@@ -1098,42 +1142,41 @@ static int gc_btree_gens_key(struct btree_trans *trans,
+ {
+ struct bch_fs *c = trans->c;
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+- struct bkey_i *u;
+- int ret;
+
+ if (unlikely(test_bit(BCH_FS_going_ro, &c->flags)))
+ return -EROFS;
+
+- rcu_read_lock();
+- bkey_for_each_ptr(ptrs, ptr) {
+- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+- if (!ca)
+- continue;
++ bool too_stale = false;
++ scoped_guard(rcu) {
++ bkey_for_each_ptr(ptrs, ptr) {
++ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
++ if (!ca)
++ continue;
+
+- if (dev_ptr_stale(ca, ptr) > 16) {
+- rcu_read_unlock();
+- goto update;
++ too_stale |= dev_ptr_stale(ca, ptr) > 16;
+ }
++
++ if (!too_stale)
++ bkey_for_each_ptr(ptrs, ptr) {
++ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
++ if (!ca)
++ continue;
++
++ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
++ if (gen_after(*gen, ptr->gen))
++ *gen = ptr->gen;
++ }
+ }
+
+- bkey_for_each_ptr(ptrs, ptr) {
+- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+- if (!ca)
+- continue;
++ if (too_stale) {
++ struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0);
++ int ret = PTR_ERR_OR_ZERO(u);
++ if (ret)
++ return ret;
+
+- u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
+- if (gen_after(*gen, ptr->gen))
+- *gen = ptr->gen;
++ bch2_extent_normalize(c, bkey_i_to_s(u));
+ }
+- rcu_read_unlock();
+- return 0;
+-update:
+- u = bch2_bkey_make_mut(trans, iter, &k, 0);
+- ret = PTR_ERR_OR_ZERO(u);
+- if (ret)
+- return ret;
+
+- bch2_extent_normalize(c, bkey_i_to_s(u));
+ return 0;
+ }
+
+@@ -1186,7 +1229,7 @@ int bch2_gc_gens(struct bch_fs *c)
+ ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL);
+ if (!ca->oldest_gen) {
+ bch2_dev_put(ca);
+- ret = -BCH_ERR_ENOMEM_gc_gens;
++ ret = bch_err_throw(c, ENOMEM_gc_gens);
+ goto err;
+ }
+
+@@ -1222,7 +1265,7 @@ int bch2_gc_gens(struct bch_fs *c)
+ BCH_TRANS_COMMIT_no_enospc, ({
+ ca = bch2_dev_iterate(c, ca, k.k->p.inode);
+ if (!ca) {
+- bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0));
++ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+ bch2_alloc_write_oldest_gen(trans, ca, &iter, k);
+@@ -1256,26 +1299,21 @@ static void bch2_gc_gens_work(struct work_struct *work)
+ {
+ struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work);
+ bch2_gc_gens(c);
+- bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
+ }
+
+ void bch2_gc_gens_async(struct bch_fs *c)
+ {
+- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) &&
++ if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) &&
+ !queue_work(c->write_ref_wq, &c->gc_gens_work))
+- bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens);
++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens);
+ }
+
+-void bch2_fs_btree_gc_exit(struct bch_fs *c)
+-{
+-}
+-
+-int bch2_fs_btree_gc_init(struct bch_fs *c)
++void bch2_fs_btree_gc_init_early(struct bch_fs *c)
+ {
+ seqcount_init(&c->gc_pos_lock);
+ INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work);
+
+ init_rwsem(&c->gc_lock);
+ mutex_init(&c->gc_gens_lock);
+- return 0;
+ }
+diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h
+index 9693a90a48a2..ec77662369a2 100644
+--- a/fs/bcachefs/btree_gc.h
++++ b/fs/bcachefs/btree_gc.h
+@@ -83,7 +83,6 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *);
+ int bch2_gc_gens(struct bch_fs *);
+ void bch2_gc_gens_async(struct bch_fs *);
+
+-void bch2_fs_btree_gc_exit(struct bch_fs *);
+-int bch2_fs_btree_gc_init(struct bch_fs *);
++void bch2_fs_btree_gc_init_early(struct bch_fs *);
+
+ #endif /* _BCACHEFS_BTREE_GC_H */
+diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
+index 60782f3e5aec..8a03cd75a64f 100644
+--- a/fs/bcachefs/btree_io.c
++++ b/fs/bcachefs/btree_io.c
+@@ -1,6 +1,7 @@
+ // SPDX-License-Identifier: GPL-2.0
+
+ #include "bcachefs.h"
++#include "async_objs.h"
+ #include "bkey_buf.h"
+ #include "bkey_methods.h"
+ #include "bkey_sort.h"
+@@ -13,6 +14,7 @@
+ #include "buckets.h"
+ #include "checksum.h"
+ #include "debug.h"
++#include "enumerated_ref.h"
+ #include "error.h"
+ #include "extents.h"
+ #include "io_write.h"
+@@ -22,8 +24,15 @@
+ #include "super-io.h"
+ #include "trace.h"
+
++#include
+ #include
+
++#ifdef CONFIG_BCACHEFS_DEBUG
++static unsigned bch2_btree_read_corrupt_ratio;
++module_param_named(btree_read_corrupt_ratio, bch2_btree_read_corrupt_ratio, uint, 0644);
++MODULE_PARM_DESC(btree_read_corrupt_ratio, "");
++#endif
++
+ static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn)
+ {
+ bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn));
+@@ -514,19 +523,23 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
+
+ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
+ struct bch_dev *ca,
++ bool print_pos,
+ struct btree *b, struct bset *i, struct bkey_packed *k,
+- unsigned offset, int write)
++ unsigned offset, int rw)
+ {
+- prt_printf(out, bch2_log_msg(c, "%s"),
+- write == READ
+- ? "error validating btree node "
+- : "corrupt btree node before write ");
++ if (print_pos) {
++ prt_str(out, rw == READ
++ ? "error validating btree node "
++ : "corrupt btree node before write ");
++ prt_printf(out, "at btree ");
++ bch2_btree_pos_to_text(out, c, b);
++ prt_newline(out);
++ }
++
+ if (ca)
+- prt_printf(out, "on %s ", ca->name);
+- prt_printf(out, "at btree ");
+- bch2_btree_pos_to_text(out, c, b);
++ prt_printf(out, "%s ", ca->name);
+
+- prt_printf(out, "\nnode offset %u/%u",
++ prt_printf(out, "node offset %u/%u",
+ b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)));
+ if (i)
+ prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
+@@ -537,93 +550,127 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
+ prt_str(out, ": ");
+ }
+
+-__printf(10, 11)
++__printf(11, 12)
+ static int __btree_err(int ret,
+ struct bch_fs *c,
+ struct bch_dev *ca,
+ struct btree *b,
+ struct bset *i,
+ struct bkey_packed *k,
+- int write,
+- bool have_retry,
++ int rw,
+ enum bch_sb_error_id err_type,
++ struct bch_io_failures *failed,
++ struct printbuf *err_msg,
+ const char *fmt, ...)
+ {
+- bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes;
++ if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
++ return ret == -BCH_ERR_btree_node_read_err_fixable
++ ? bch_err_throw(c, fsck_fix)
++ : ret;
++
++ bool have_retry = false;
++ int ret2;
++
++ if (ca) {
++ bch2_mark_btree_validate_failure(failed, ca->dev_idx);
++
++ struct extent_ptr_decoded pick;
++ have_retry = bch2_bkey_pick_read_device(c,
++ bkey_i_to_s_c(&b->key),
++ failed, &pick, -1) == 1;
++ }
+
+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry)
+- ret = -BCH_ERR_btree_node_read_err_fixable;
++ ret = bch_err_throw(c, btree_node_read_err_fixable);
+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry)
+- ret = -BCH_ERR_btree_node_read_err_bad_node;
++ ret = bch_err_throw(c, btree_node_read_err_bad_node);
+
+- if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable)
+- bch2_sb_error_count(c, err_type);
++ bch2_sb_error_count(c, err_type);
+
+- struct printbuf out = PRINTBUF;
+- if (write != WRITE && ret != -BCH_ERR_btree_node_read_err_fixable) {
+- printbuf_indent_add_nextline(&out, 2);
+-#ifdef BCACHEFS_LOG_PREFIX
+- prt_printf(&out, bch2_log_msg(c, ""));
+-#endif
+- }
++ bool print_deferred = err_msg &&
++ rw == READ &&
++ !(test_bit(BCH_FS_in_fsck, &c->flags) &&
++ c->opts.fix_errors == FSCK_FIX_ask);
++
++ CLASS(printbuf, out)();
++ bch2_log_msg_start(c, &out);
+
+- btree_err_msg(&out, c, ca, b, i, k, b->written, write);
++ if (!print_deferred)
++ err_msg = &out;
++
++ btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw);
+
+ va_list args;
+ va_start(args, fmt);
+- prt_vprintf(&out, fmt, args);
++ prt_vprintf(err_msg, fmt, args);
+ va_end(args);
+
+- if (write == WRITE) {
++ if (print_deferred) {
++ prt_newline(err_msg);
++
++ switch (ret) {
++ case -BCH_ERR_btree_node_read_err_fixable:
++ ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type);
++ if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
++ !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
++ ret = ret2;
++ goto fsck_err;
++ }
++
++ if (!have_retry)
++ ret = bch_err_throw(c, fsck_fix);
++ return ret;
++ case -BCH_ERR_btree_node_read_err_bad_node:
++ prt_str(&out, ", ");
++ break;
++ }
++
++ return ret;
++ }
++
++ if (rw == WRITE) {
+ prt_str(&out, ", ");
+ ret = __bch2_inconsistent_error(c, &out)
+ ? -BCH_ERR_fsck_errors_not_fixed
+ : 0;
+- silent = false;
++ goto print;
+ }
+
+ switch (ret) {
+ case -BCH_ERR_btree_node_read_err_fixable:
+- ret = !silent
+- ? __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf)
+- : -BCH_ERR_fsck_fix;
+- if (ret != -BCH_ERR_fsck_fix &&
+- ret != -BCH_ERR_fsck_ignore)
++ ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf);
++ if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) &&
++ !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) {
++ ret = ret2;
+ goto fsck_err;
+- ret = -BCH_ERR_fsck_fix;
+- goto out;
++ }
++
++ if (!have_retry)
++ ret = bch_err_throw(c, fsck_fix);
++ return ret;
+ case -BCH_ERR_btree_node_read_err_bad_node:
+ prt_str(&out, ", ");
+- ret = __bch2_topology_error(c, &out);
+- if (ret)
+- silent = false;
+- break;
+- case -BCH_ERR_btree_node_read_err_incompatible:
+- ret = -BCH_ERR_fsck_errors_not_fixed;
+- silent = false;
+ break;
+ }
+-
+- if (!silent)
+- bch2_print_string_as_lines(KERN_ERR, out.buf);
+-out:
++print:
++ bch2_print_str(c, KERN_ERR, out.buf);
+ fsck_err:
+- printbuf_exit(&out);
+ return ret;
+ }
+
+ #define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \
+ ({ \
+- int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \
++ int _ret = __btree_err(type, c, ca, b, i, k, write, \
+ BCH_FSCK_ERR_##_err_type, \
++ failed, err_msg, \
+ msg, ##__VA_ARGS__); \
+ \
+- if (_ret != -BCH_ERR_fsck_fix) { \
++ if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \
+ ret = _ret; \
+ goto fsck_err; \
+ } \
+ \
+- *saw_error = true; \
++ true; \
+ })
+
+ #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false)
+@@ -681,13 +728,13 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
+
+ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
+ struct btree *b, struct bset *i,
+- unsigned offset, unsigned sectors,
+- int write, bool have_retry, bool *saw_error)
++ unsigned offset, int write,
++ struct bch_io_failures *failed,
++ struct printbuf *err_msg)
+ {
+ unsigned version = le16_to_cpu(i->version);
+- unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
+- struct printbuf buf1 = PRINTBUF;
+- struct printbuf buf2 = PRINTBUF;
++ CLASS(printbuf, buf1)();
++ CLASS(printbuf, buf2)();
+ int ret = 0;
+
+ btree_err_on(!bch2_version_compatible(version),
+@@ -698,16 +745,21 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
+ BCH_VERSION_MAJOR(version),
+ BCH_VERSION_MINOR(version));
+
+- if (btree_err_on(version < c->sb.version_min,
++ if (c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes &&
++ btree_err_on(version < c->sb.version_min,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, NULL, b, i, NULL,
+ btree_node_bset_older_than_sb_min,
+ "bset version %u older than superblock version_min %u",
+ version, c->sb.version_min)) {
+- mutex_lock(&c->sb_lock);
+- c->disk_sb.sb->version_min = cpu_to_le16(version);
+- bch2_write_super(c);
+- mutex_unlock(&c->sb_lock);
++ if (bch2_version_compatible(version)) {
++ guard(mutex)(&c->sb_lock);
++ c->disk_sb.sb->version_min = cpu_to_le16(version);
++ bch2_write_super(c);
++ } else {
++ /* We have no idea what's going on: */
++ i->version = cpu_to_le16(c->sb.version);
++ }
+ }
+
+ if (btree_err_on(BCH_VERSION_MAJOR(version) >
+@@ -717,10 +769,9 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
+ btree_node_bset_newer_than_sb,
+ "bset version %u newer than superblock version %u",
+ version, c->sb.version)) {
+- mutex_lock(&c->sb_lock);
++ guard(mutex)(&c->sb_lock);
+ c->disk_sb.sb->version = cpu_to_le16(version);
+ bch2_write_super(c);
+- mutex_unlock(&c->sb_lock);
+ }
+
+ btree_err_on(BSET_SEPARATE_WHITEOUTS(i),
+@@ -729,15 +780,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
+ btree_node_unsupported_version,
+ "BSET_SEPARATE_WHITEOUTS no longer supported");
+
+- if (!write &&
+- btree_err_on(offset + sectors > (ptr_written ?: btree_sectors(c)),
+- -BCH_ERR_btree_node_read_err_fixable,
+- c, ca, b, i, NULL,
+- bset_past_end_of_btree_node,
+- "bset past end of btree node (offset %u len %u but written %zu)",
+- offset, sectors, ptr_written ?: btree_sectors(c)))
+- i->u64s = 0;
+-
+ btree_err_on(offset && !i->u64s,
+ -BCH_ERR_btree_node_read_err_fixable,
+ c, ca, b, i, NULL,
+@@ -829,8 +871,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
+ &bn->format);
+ }
+ fsck_err:
+- printbuf_exit(&buf2);
+- printbuf_exit(&buf1);
+ return ret;
+ }
+
+@@ -895,11 +935,12 @@ static inline int btree_node_read_bkey_cmp(const struct btree *b,
+
+ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
+ struct bset *i, int write,
+- bool have_retry, bool *saw_error)
++ struct bch_io_failures *failed,
++ struct printbuf *err_msg)
+ {
+ unsigned version = le16_to_cpu(i->version);
+ struct bkey_packed *k, *prev = NULL;
+- struct printbuf buf = PRINTBUF;
++ CLASS(printbuf, buf)();
+ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
+ int ret = 0;
+@@ -1001,14 +1042,16 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
+ le16_add_cpu(&i->u64s, -next_good_key);
+ memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k);
+ set_btree_node_need_rewrite(b);
++ set_btree_node_need_rewrite_error(b);
+ }
+ fsck_err:
+- printbuf_exit(&buf);
+ return ret;
+ }
+
+ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+- struct btree *b, bool have_retry, bool *saw_error)
++ struct btree *b,
++ struct bch_io_failures *failed,
++ struct printbuf *err_msg)
+ {
+ struct btree_node_entry *bne;
+ struct sort_iter *iter;
+@@ -1018,11 +1061,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+ bool used_mempool, blacklisted;
+ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
+- unsigned u64s;
+ unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key));
+ u64 max_journal_seq = 0;
+- struct printbuf buf = PRINTBUF;
+- int ret = 0, retry_read = 0, write = READ;
++ CLASS(printbuf, buf)();
++ int ret = 0, write = READ;
+ u64 start_time = local_clock();
+
+ b->version_ondisk = U16_MAX;
+@@ -1099,6 +1141,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+ "unknown checksum type %llu", BSET_CSUM_TYPE(i));
+
+ if (first) {
++ sectors = vstruct_sectors(b->data, c->block_bits);
++ if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
++ -BCH_ERR_btree_node_read_err_fixable,
++ c, ca, b, i, NULL,
++ bset_past_end_of_btree_node,
++ "bset past end of btree node (offset %u len %u but written %zu)",
++ b->written, sectors, ptr_written ?: btree_sectors(c)))
++ i->u64s = 0;
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
+ bool csum_bad = bch2_crc_cmp(b->data->csum, csum);
+@@ -1126,9 +1176,15 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+ c, NULL, b, NULL, NULL,
+ btree_node_unsupported_version,
+ "btree node does not have NEW_EXTENT_OVERWRITE set");
+-
+- sectors = vstruct_sectors(b->data, c->block_bits);
+ } else {
++ sectors = vstruct_sectors(bne, c->block_bits);
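A large share of the conversions in this stretch replace manual acquire/release pairs (PRINTBUF/printbuf_exit, mutex_lock/unlock, rcu_read_lock/unlock) with the kernel's scope-based CLASS()/guard()/scoped_guard() helpers from include/linux/cleanup.h. Those helpers are built on the compiler's cleanup attribute; here is a small userspace sketch of the same mechanism, with invented names (it needs GCC or Clang for __attribute__((cleanup))):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct printbuf { char *buf; };

static void printbuf_exit(struct printbuf *pb)
{
	free(pb->buf);
	pb->buf = NULL;
}

/* Rough equivalent of CLASS(printbuf, name)(): the destructor runs
 * automatically when the variable leaves scope, on every exit path. */
#define CLASS_printbuf(name) \
	__attribute__((cleanup(printbuf_exit))) struct printbuf name = { 0 }

static int demo(int fail)
{
	CLASS_printbuf(buf);

	buf.buf = strdup("btree node header doesn't match ptr");
	if (!buf.buf || fail)
		return -1;	/* no explicit printbuf_exit() needed */

	puts(buf.buf);
	return 0;		/* freed here as well */
}

int main(void)
{
	demo(0);
	demo(1);
	return 0;
}

The payoff is visible throughout the diff: every "goto out"/"printbuf_exit" pair disappears, and early returns can no longer leak a buffer or a lock.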
++ if (btree_err_on(b->written + sectors > (ptr_written ?: btree_sectors(c)),
++ -BCH_ERR_btree_node_read_err_fixable,
++ c, ca, b, i, NULL,
++ bset_past_end_of_btree_node,
++ "bset past end of btree node (offset %u len %u but written %zu)",
++ b->written, sectors, ptr_written ?: btree_sectors(c)))
++ i->u64s = 0;
+ if (good_csum_type) {
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+ bool csum_bad = bch2_crc_cmp(bne->csum, csum);
+@@ -1149,22 +1205,19 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+ "decrypting btree node: %s", bch2_err_str(ret)))
+ goto fsck_err;
+ }
+-
+- sectors = vstruct_sectors(bne, c->block_bits);
+ }
+
+ b->version_ondisk = min(b->version_ondisk,
+ le16_to_cpu(i->version));
+
+- ret = validate_bset(c, ca, b, i, b->written, sectors,
+- READ, have_retry, saw_error);
++ ret = validate_bset(c, ca, b, i, b->written, READ, failed, err_msg);
+ if (ret)
+ goto fsck_err;
+
+ if (!b->written)
+ btree_node_set_format(b, b->data->format);
+
+- ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error);
++ ret = validate_bset_keys(c, b, i, READ, failed, err_msg);
+ if (ret)
+ goto fsck_err;
+
+@@ -1225,29 +1278,23 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+ sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
+ sorted->keys.u64s = 0;
+
+- set_btree_bset(b, b->set, &b->data->keys);
+-
+ b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter);
+ memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0,
+ btree_buf_bytes(b) -
+ sizeof(struct btree_node) -
+ b->nr.live_u64s * sizeof(u64));
+
+- u64s = le16_to_cpu(sorted->keys.u64s);
++ b->data->keys.u64s = sorted->keys.u64s;
+ *sorted = *b->data;
+- sorted->keys.u64s = cpu_to_le16(u64s);
+ swap(sorted, b->data);
+ set_btree_bset(b, b->set, &b->data->keys);
+ b->nsets = 1;
+ b->data->keys.journal_seq = cpu_to_le64(max_journal_seq);
+
+- BUG_ON(b->nr.live_u64s != u64s);
++ BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s));
+
+ btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
+
+- if (updated_range)
+- bch2_btree_node_drop_keys_outside_node(b);
+-
+ i = &b->data->keys;
+ for (k = i->start; k != vstruct_last(i);) {
+ struct bkey tmp;
+@@ -1255,7 +1302,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+
+ ret = btree_node_bkey_val_validate(c, b, u.s_c, READ);
+ if (ret == -BCH_ERR_fsck_delete_bkey ||
+- (bch2_inject_invalid_keys &&
++ (static_branch_unlikely(&bch2_inject_invalid_keys) &&
+ !bversion_cmp(u.k->bversion, MAX_VERSION))) {
+ btree_keys_account_key_drop(&b->nr, 0, k);
+
+@@ -1264,6 +1311,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+ (u64 *) vstruct_end(i) - (u64 *) k);
+ set_btree_bset_end(b, b->set);
+ set_btree_node_need_rewrite(b);
++ set_btree_node_need_rewrite_error(b);
+ continue;
+ }
+ if (ret)
+@@ -1284,31 +1332,54 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
+
+ btree_node_reset_sib_u64s(b);
+
+- rcu_read_lock();
+- bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+- struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
++ if (updated_range)
++ bch2_btree_node_drop_keys_outside_node(b);
+
+- if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw)
+- set_btree_node_need_rewrite(b);
++ /*
++ * XXX:
++ *
++ * We deadlock if too many btree updates require node rewrites while
++ * we're still in journal replay.
++ *
++ * This is because btree node rewrites generate more updates for the
++ * interior updates (alloc, backpointers), and if those updates touch
++ * new nodes and generate more rewrites - well, you see the problem.
++ *
++ * The biggest cause is that we don't use the btree write buffer (for
++ * the backpointer updates - this needs some real thought on locking in
++ * order to fix.
++ *
++ * The problem with this workaround (not doing the rewrite for degraded
++ * nodes in journal replay) is that those degraded nodes persist, and we
++ * don't want that (this is a real bug when a btree node write completes
++ * with fewer replicas than we wanted and leaves a degraded node due to
++ * device _removal_, i.e. the device went away mid write).
++ *
++ * It's less of a bug here, but still a problem because we don't yet
++ * have a way of tracking degraded data - we another index (all
++ * extents/btree nodes, by replicas entry) in order to fix properly
++ * (re-replicate degraded data at the earliest possible time).
++ */
++ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay)) {
++ scoped_guard(rcu)
++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
++ struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev);
++
++ if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) {
++ set_btree_node_need_rewrite(b);
++ set_btree_node_need_rewrite_degraded(b);
++ }
++ }
+ }
+- rcu_read_unlock();
+
+- if (!ptr_written)
++ if (!ptr_written) {
+ set_btree_node_need_rewrite(b);
+-out:
++ set_btree_node_need_rewrite_ptr_written_zero(b);
++ }
++fsck_err:
+ mempool_free(iter, &c->fill_iter);
+- printbuf_exit(&buf);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
+- return retry_read;
+-fsck_err:
+- if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
+- ret == -BCH_ERR_btree_node_read_err_must_retry) {
+- retry_read = 1;
+- } else {
+- set_btree_node_read_error(b);
+- bch2_btree_lost_data(c, b->c.btree_id);
+- }
+- goto out;
++ return ret;
+ }
+
+ static void btree_node_read_work(struct work_struct *work)
+@@ -1320,16 +1391,24 @@ static void btree_node_read_work(struct work_struct *work)
+ struct btree *b = rb->b;
+ struct bio *bio = &rb->bio;
+ struct bch_io_failures failed = { .nr = 0 };
+- struct printbuf buf = PRINTBUF;
+- bool saw_error = false;
+- bool retry = false;
+- bool can_retry;
++ int ret = 0;
++
++ CLASS(printbuf, buf)();
++ bch2_log_msg_start(c, &buf);
++
++ prt_printf(&buf, "btree node read error at btree ");
++ bch2_btree_pos_to_text(&buf, c, b);
++ prt_newline(&buf);
+
+ goto start;
+ while (1) {
+- retry = true;
+- bch_info(c, "retrying read");
+- ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ);
++ ret = bch2_bkey_pick_read_device(c,
++ bkey_i_to_s_c(&b->key),
++ &failed, &rb->pick, -1);
++ if (ret <= 0)
++ break;
++
++ ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
+ rb->have_ioref = ca != NULL;
+ rb->start_time = local_clock();
+ bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
+@@ -1346,60 +1425,58 @@ static void btree_node_read_work(struct work_struct *work)
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read,
+ rb->start_time, !bio->bi_status);
+ start:
+- printbuf_reset(&buf);
+- bch2_btree_pos_to_text(&buf, c, b);
+-
+- if (ca && bio->bi_status)
+- bch_err_dev_ratelimited(ca,
+- "btree read error %s for %s",
+- bch2_blk_status_to_str(bio->bi_status), buf.buf);
+ if (rb->have_ioref)
+- percpu_ref_put(&ca->io_ref[READ]);
++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read);
+ rb->have_ioref = false;
+
+- bch2_mark_io_failure(&failed, &rb->pick, false);
+-
+- can_retry = bch2_bkey_pick_read_device(c,
+- bkey_i_to_s_c(&b->key),
+- &failed, &rb->pick, -1) > 0;
+-
+- if (!bio->bi_status &&
+- !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) {
+- if (retry)
+- bch_info(c, "retry success");
+- break;
++ if (bio->bi_status) {
++ bch2_mark_io_failure(&failed, &rb->pick, false);
++ continue;
+ }
+
+- saw_error = true;
++ memset(&bio->bi_iter, 0, sizeof(bio->bi_iter));
++ bio->bi_iter.bi_size = btree_buf_bytes(b);
++
++ bch2_maybe_corrupt_bio(bio, bch2_btree_read_corrupt_ratio);
+
+- if (!can_retry) {
+- set_btree_node_read_error(b);
+- bch2_btree_lost_data(c, b->c.btree_id);
++ ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf);
++ if (ret != -BCH_ERR_btree_node_read_err_want_retry &&
++ ret != -BCH_ERR_btree_node_read_err_must_retry)
+ break;
+- }
+ }
+
+- bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
+- rb->start_time);
+- bio_put(&rb->bio);
++ bch2_io_failures_to_text(&buf, c, &failed);
++
++ /*
++ * only print retry success if we read from a replica with no errors
++ */
++ if (ret) {
++ set_btree_node_read_error(b);
++ bch2_btree_lost_data(c, &buf, b->c.btree_id);
++ prt_printf(&buf, "ret %s", bch2_err_str(ret));
++ } else if (failed.nr) {
++ if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev))
++ prt_printf(&buf, "retry success");
++ else
++ prt_printf(&buf, "repair success");
++ }
+
+- if ((saw_error ||
++ if ((failed.nr ||
+ btree_node_need_rewrite(b)) &&
+ !btree_node_read_error(b) &&
+- c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
+- if (saw_error) {
+- printbuf_reset(&buf);
+- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level);
+- prt_str(&buf, " ");
+- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+- bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s",
+- __func__, buf.buf);
+- }
+-
++ c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
++ prt_printf(&buf, " (rewriting node)");
+ bch2_btree_node_rewrite_async(c, b);
+ }
++ prt_newline(&buf);
++
++ if (failed.nr)
++ bch2_print_str_ratelimited(c, KERN_ERR, buf.buf);
+
+- printbuf_exit(&buf);
++ async_object_list_del(c, btree_read_bio, rb->list_idx);
++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
++ rb->start_time);
++ bio_put(&rb->bio);
+ clear_btree_node_read_in_flight(b);
+ smp_mb__after_atomic();
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+@@ -1419,6 +1496,11 @@ static void btree_node_read_endio(struct bio *bio)
+ queue_work(c->btree_read_complete_wq, &rb->work);
+ }
+
++void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio)
++{
++ bch2_bio_to_text(out, &rbio->bio);
++}
++
+ struct btree_node_read_all {
+ struct closure cl;
+ struct bch_fs *c;
+@@ -1476,14 +1558,15 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
+ closure_type(ra, struct btree_node_read_all, cl);
+ struct bch_fs *c = ra->c;
+ struct btree *b = ra->b;
+- struct printbuf buf = PRINTBUF;
++ CLASS(printbuf, buf)();
+ bool dump_bset_maps = false;
+- bool have_retry = false;
+ int ret = 0, best = -1, write = READ;
+ unsigned i, written = 0, written2 = 0;
+ __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
+ bool _saw_error = false, *saw_error = &_saw_error;
++ struct printbuf *err_msg = NULL;
++ struct bch_io_failures *failed = NULL;
+
+ for (i = 0; i < ra->nr; i++) {
+ struct btree_node *bn = ra->buf[i];
+@@ -1576,14 +1659,18 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
+
+ if (best >= 0) {
+ memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
+- ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
++ ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL);
+ } else {
+ ret = -1;
+ }
+
+ if (ret) {
+ set_btree_node_read_error(b);
+- bch2_btree_lost_data(c, b->c.btree_id);
++
++ CLASS(printbuf, buf)();
++ bch2_btree_lost_data(c, &buf, b->c.btree_id);
++ if (buf.pos)
++ bch_err(c, "%s", buf.buf);
+ } else if (*saw_error)
+ bch2_btree_node_rewrite_async(c, b);
+
+@@ -1594,7 +1681,6 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
+
+ closure_debug_destroy(&ra->cl);
+ kfree(ra);
+- printbuf_exit(&buf);
+
+ clear_btree_node_read_in_flight(b);
+ smp_mb__after_atomic();
+@@ -1612,7 +1698,8 @@ static void btree_node_read_all_replicas_endio(struct bio *bio)
+ struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev);
+
+ bch2_latency_acct(ca, rb->start_time, READ);
+- percpu_ref_put(&ca->io_ref[READ]);
++ enumerated_ref_put(&ca->io_ref[READ],
++ BCH_DEV_READ_REF_btree_node_read_all_replicas);
+ }
+
+ ra->err[rb->idx] = bio->bi_status;
+@@ -1634,7 +1721,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
+
+ ra = kzalloc(sizeof(*ra), GFP_NOFS);
+ if (!ra)
+- return -BCH_ERR_ENOMEM_btree_node_read_all_replicas;
++ return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas);
+
+ closure_init(&ra->cl, NULL);
+ ra->c = c;
+@@ -1652,7 +1739,8 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
+
+ i = 0;
+ bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
+- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
++ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
++ BCH_DEV_READ_REF_btree_node_read_all_replicas);
+ struct btree_read_bio *rb =
+ container_of(ra->bio[i], struct btree_read_bio, bio);
+ rb->c = c;
+@@ -1701,9 +1789,9 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
+ struct bio *bio;
+ int ret;
+
+- trace_and_count(c, btree_node_read, trans, b);
++ trace_btree_node(c, b, btree_node_read);
+
+- if (bch2_verify_all_btree_replicas &&
++ if (static_branch_unlikely(&bch2_verify_all_btree_replicas) &&
+ !btree_node_read_all_replicas(c, b, sync))
+ return;
+
+@@ -1711,26 +1799,33 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
+ NULL, &pick, -1);
+
+ if (ret <= 0) {
+- struct printbuf buf = PRINTBUF;
++ bool ratelimit = true;
++ CLASS(printbuf, buf)();
++ bch2_log_msg_start(c, &buf);
+
+ prt_str(&buf, "btree node read error: no device to read from\n at ");
+ bch2_btree_pos_to_text(&buf, c, b);
+- bch_err_ratelimited(c, "%s", buf.buf);
++ prt_newline(&buf);
++ bch2_btree_lost_data(c, &buf, b->c.btree_id);
+
+- if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
+- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
+- bch2_fatal_error(c);
++ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
++ bch2_fs_emergency_read_only2(c, &buf))
++ ratelimit = false;
++
++ static DEFINE_RATELIMIT_STATE(rs,
++ DEFAULT_RATELIMIT_INTERVAL,
++ DEFAULT_RATELIMIT_BURST);
++ if (!ratelimit || __ratelimit(&rs))
++ bch2_print_str(c, KERN_ERR, buf.buf);
+
+ set_btree_node_read_error(b);
+- bch2_btree_lost_data(c, b->c.btree_id);
+ clear_btree_node_read_in_flight(b);
+ smp_mb__after_atomic();
+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+- printbuf_exit(&buf);
+ return;
+ }
+
+- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
++ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read);
+
+ bio = bio_alloc_bioset(NULL,
+ buf_pages(b->data, btree_buf_bytes(b)),
+@@ -1749,6 +1844,8 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
+ bio->bi_end_io = btree_node_read_endio;
+ bch2_bio_map(bio, b->data, btree_buf_bytes(b));
+
++ async_object_list_add(c, btree_read_bio, rb, &rb->list_idx);
++
+ if (rb->have_ioref) {
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
+ bio_sectors(bio));
+@@ -1801,11 +1898,10 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
+ bch2_btree_node_read(trans, b, true);
+
+ if (btree_node_read_error(b)) {
+- mutex_lock(&c->btree_cache.lock);
+- bch2_btree_node_hash_remove(&c->btree_cache, b);
+- mutex_unlock(&c->btree_cache.lock);
++ scoped_guard(mutex, &c->btree_cache.lock)
++ bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+- ret = -BCH_ERR_btree_node_read_error;
++ ret = bch_err_throw(c, btree_node_read_error);
+ goto err;
+ }
+
+@@ -1820,7 +1916,8 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
+ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
+ const struct bkey_i *k, unsigned level)
+ {
+- return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level));
++ CLASS(btree_trans, trans)(c);
++ return __bch2_btree_root_read(trans, id, k, level);
+ }
+
+ struct btree_node_scrub {
+@@ -1899,43 +1996,26 @@ static void btree_node_scrub_work(struct work_struct *work)
+ {
+ struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work);
+ struct bch_fs *c = scrub->c;
+- struct printbuf err = PRINTBUF;
++ CLASS(printbuf, err)();
+
+ __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level,
+ bkey_i_to_s_c(scrub->key.k));
+ prt_newline(&err);
+
+ if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) {
+- struct btree_trans *trans = bch2_trans_get(c);
+-
+- struct btree_iter iter;
+- bch2_trans_node_iter_init(trans, &iter, scrub->btree,
+- scrub->key.k->k.p, 0, scrub->level - 1, 0);
+-
+- struct btree *b;
+- int ret = lockrestart_do(trans,
+- PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, &iter)));
+- if (ret)
+- goto err;
+-
+- if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) {
+- bch_err(c, "error validating btree node during scrub on %s at btree %s",
+- scrub->ca->name, err.buf);
+-
+- ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
+- }
+-err:
+- bch2_trans_iter_exit(trans, &iter);
+- bch2_trans_begin(trans);
+- bch2_trans_put(trans);
++ int ret = bch2_trans_do(c,
++ bch2_btree_node_rewrite_key(trans, scrub->btree, scrub->level - 1,
++ scrub->key.k, 0));
++ if (!bch2_err_matches(ret, ENOENT) &&
++ !bch2_err_matches(ret, EROFS))
++ bch_err_fn_ratelimited(c, ret);
+ }
+
+- printbuf_exit(&err);
+- bch2_bkey_buf_exit(&scrub->key, c);;
++ bch2_bkey_buf_exit(&scrub->key, c);
+ btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf);
+- percpu_ref_put(&scrub->ca->io_ref[READ]);
++ enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
+ kfree(scrub);
+- bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
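The reworked btree_node_read_work() above turns the old ad-hoc retry into a clean loop: record each failed replica in bch_io_failures, ask the pick function for the next replica that hasn't failed, and stop when none remain. A toy userspace model of that control flow (device count, the pick heuristic, and which replica "succeeds" are all invented for illustration):

#include <stdbool.h>
#include <stdio.h>

#define NR_REPLICAS 3

static bool failed[NR_REPLICAS];

/* Sketch of bch2_bkey_pick_read_device(): skip replicas already
 * recorded as failed, return -1 once every replica has failed. */
static int pick_read_device(void)
{
	for (int i = 0; i < NR_REPLICAS; i++)
		if (!failed[i])
			return i;
	return -1;
}

static bool read_replica(int dev)
{
	return dev == 2;	/* pretend only replica 2 is good */
}

int main(void)
{
	int dev;

	while ((dev = pick_read_device()) >= 0) {
		if (read_replica(dev)) {
			printf("read succeeded from replica %d\n", dev);
			return 0;
		}
		printf("replica %d failed, marking and retrying\n", dev);
		failed[dev] = true;
	}
	printf("no device to read from\n");
	return 1;
}

Centralizing the "which replica next?" decision in the pick function is what lets the real code also distinguish "retry success" from "repair success" afterwards, by checking whether the replica that finally served the read had any recorded failures.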
+ }
+
+ static void btree_node_scrub_endio(struct bio *bio)
+@@ -1954,17 +2034,18 @@ int bch2_btree_node_scrub(struct btree_trans *trans,
+
+ struct bch_fs *c = trans->c;
+
+- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_node_scrub))
+- return -BCH_ERR_erofs_no_writes;
++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub))
++ return bch_err_throw(c, erofs_no_writes);
+
+ struct extent_ptr_decoded pick;
+ int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev);
+ if (ret <= 0)
+ goto err;
+
+- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ);
++ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ,
++ BCH_DEV_READ_REF_btree_node_scrub);
+ if (!ca) {
+- ret = -BCH_ERR_device_offline;
++ ret = bch_err_throw(c, device_offline);
+ goto err;
+ }
+
+@@ -2002,9 +2083,9 @@ int bch2_btree_node_scrub(struct btree_trans *trans,
+ return 0;
+ err_free:
+ btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf);
+- percpu_ref_put(&ca->io_ref[READ]);
++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub);
+ err:
+- bch2_write_ref_put(c, BCH_WRITE_REF_btree_node_scrub);
++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub);
+ return ret;
+ }
+
+@@ -2101,7 +2182,7 @@ static void btree_node_write_work(struct work_struct *work)
+ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
+
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
+- ret = -BCH_ERR_btree_node_write_all_failed;
++ ret = bch_err_throw(c, btree_node_write_all_failed);
+ goto err;
+ }
+
+@@ -2110,7 +2191,8 @@ static void btree_node_write_work(struct work_struct *work)
+
+ }
+ } else {
+- ret = bch2_trans_do(c,
++ CLASS(btree_trans, trans)(c);
++ ret = lockrestart_do(trans,
+ bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
+ BCH_WATERMARK_interior_updates|
+ BCH_TRANS_COMMIT_journal_reclaim|
+@@ -2121,6 +2203,7 @@ static void btree_node_write_work(struct work_struct *work)
+ goto err;
+ }
+ out:
++ async_object_list_del(c, btree_write_bio, wbio->list_idx);
+ bio_put(&wbio->wbio.bio);
+ btree_node_write_done(c, b, start_time);
+ return;
+@@ -2128,11 +2211,10 @@ static void btree_node_write_work(struct work_struct *work)
+ set_btree_node_noevict(b);
+
+ if (!bch2_err_matches(ret, EROFS)) {
+- struct printbuf buf = PRINTBUF;
++ CLASS(printbuf, buf)();
+ prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret));
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+- printbuf_exit(&buf);
+ }
+ goto out;
+ }
+@@ -2151,13 +2233,12 @@ static void btree_node_write_endio(struct bio *bio)
+ wbio->submit_time, !bio->bi_status);
+
+ if (ca && bio->bi_status) {
+- struct printbuf buf = PRINTBUF;
+- buf.atomic++;
++ CLASS(printbuf, buf)();
++ guard(printbuf_atomic)(&buf);
+ prt_printf(&buf, "btree write error: %s\n ",
+ bch2_blk_status_to_str(bio->bi_status));
+ bch2_btree_pos_to_text(&buf, c, b);
+ bch_err_dev_ratelimited(ca, "%s", buf.buf);
+- printbuf_exit(&buf);
+ }
+
+ if (bio->bi_status) {
+@@ -2172,7 +2253,8 @@ static void btree_node_write_endio(struct bio *bio)
+ * btree writes yet (due to device removal/ro):
+ */
+ if (wbio->have_ioref)
+- percpu_ref_put(&ca->io_ref[READ]);
++ enumerated_ref_put(&ca->io_ref[READ],
++ BCH_DEV_READ_REF_btree_node_write);
+
+ if (parent) {
+ bio_put(bio);
+@@ -2184,14 +2266,12 @@ static void btree_node_write_endio(struct bio *bio)
+ smp_mb__after_atomic();
+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner);
+ INIT_WORK(&wb->work, btree_node_write_work);
+- queue_work(c->btree_io_complete_wq, &wb->work);
++ queue_work(c->btree_write_complete_wq, &wb->work);
+ }
+
+ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+- struct bset *i, unsigned sectors)
++ struct bset *i)
+ {
+- bool saw_error;
+-
+ int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key),
+ (struct bkey_validate_context) {
+ .from = BKEY_VALIDATE_btree_node,
+@@ -2204,8 +2284,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+ return ret;
+ }
+
+- ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?:
+- validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error);
++ ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?:
++ validate_bset(c, NULL, b, i, b->written, WRITE, NULL, NULL);
+ if (ret) {
+ bch2_inconsistent_error(c);
+ dump_stack();
+@@ -2398,7 +2478,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
+
+ /* if we're going to be encrypting, check metadata validity first: */
+ if (validate_before_checksum &&
+- validate_bset_for_write(c, b, i, sectors_to_write))
++ validate_bset_for_write(c, b, i))
+ goto err;
+
+ ret = bset_encrypt(c, i, b->written << 9);
+@@ -2415,7 +2495,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
+
+ /* if we're not encrypting, check metadata after checksumming: */
+ if (!validate_before_checksum &&
+- validate_bset_for_write(c, b, i, sectors_to_write))
++ validate_bset_for_write(c, b, i))
+ goto err;
+
+ /*
+@@ -2440,11 +2520,26 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
+ c->opts.nochanges)
+ goto err;
+
+- trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write);
++ if (trace_btree_node_write_enabled()) {
++ CLASS(printbuf, buf)();
++ printbuf_indent_add(&buf, 2);
++ prt_printf(&buf, "offset %u sectors %u bytes %u\n",
++ b->written,
++ sectors_to_write,
++ bytes_to_write);
++ bch2_btree_pos_to_text(&buf, c, b);
++ trace_btree_node_write(c, buf.buf);
++ }
++ count_event(c, btree_node_write);
++
++ /*
++ * blk-wbt.c throttles all writes except those that have both REQ_SYNC
++ * and REQ_IDLE set...
++ */
+
+ wbio = container_of(bio_alloc_bioset(NULL,
+ buf_pages(data, sectors_to_write << 9),
+- REQ_OP_WRITE|REQ_META,
++ REQ_OP_WRITE|REQ_META|REQ_SYNC|REQ_IDLE,
+ GFP_NOFS,
+ &c->btree_bio),
+ struct btree_write_bio, wbio.bio);
+@@ -2472,6 +2567,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags)
+ atomic64_inc(&c->btree_write_stats[type].nr);
+ atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes);
+
++ async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx);
++
+ INIT_WORK(&wbio->work, btree_write_submit);
+ queue_work(c->btree_write_submit_wq, &wbio->work);
+ return;
+diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
+index dbf76d22c660..30a5180532c8 100644
+--- a/fs/bcachefs/btree_io.h
++++ b/fs/bcachefs/btree_io.h
+@@ -41,6 +41,9 @@ struct btree_read_bio {
+ u64 start_time;
+ unsigned have_ioref:1;
+ unsigned idx:7;
++#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
++ unsigned list_idx;
++#endif
+ struct extent_ptr_decoded pick;
+ struct work_struct work;
+ struct bio bio;
+@@ -53,6 +56,9 @@ struct btree_write_bio {
+ unsigned data_bytes;
+ unsigned sector_offset;
+ u64 start_time;
++#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS
++ unsigned list_idx;
++#endif
+ struct bch_write_bio wbio;
+ };
+
+@@ -128,11 +134,15 @@ void bch2_btree_build_aux_trees(struct btree *);
+ void bch2_btree_init_next(struct btree_trans *, struct btree *);
+
+ int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
+- struct btree *, bool, bool *);
++ struct btree *,
++ struct bch_io_failures *,
++ struct printbuf *);
+ void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
+ int bch2_btree_root_read(struct bch_fs *, enum btree_id,
+ const struct bkey_i *, unsigned);
+
++void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *);
++
+ int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, unsigned);
+
+diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
+index ac5f2046550d..a67babf69d39 100644
+--- a/fs/bcachefs/btree_iter.c
++++ b/fs/bcachefs/btree_iter.c
+@@ -16,6 +16,7 @@
+ #include "journal_io.h"
+ #include "replicas.h"
+ #include "snapshot.h"
++#include "super.h"
+ #include "trace.h"
+
+ #include
+@@ -114,11 +115,9 @@ static inline bool btree_path_pos_in_node(struct btree_path *path,
+ !btree_path_pos_after_node(path, b);
+ }
+
+-/* Btree iterator: */
++/* Debug: */
+
+-#ifdef CONFIG_BCACHEFS_DEBUG
+-
+-static void bch2_btree_path_verify_cached(struct btree_trans *trans,
++static void __bch2_btree_path_verify_cached(struct btree_trans *trans,
+ struct btree_path *path)
+ {
+ struct bkey_cached *ck;
+@@ -135,7 +134,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans,
+ btree_node_unlock(trans, path, 0);
+ }
+
+-static void bch2_btree_path_verify_level(struct btree_trans *trans,
++static void __bch2_btree_path_verify_level(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+ {
+ struct btree_path_level *l;
+@@ -147,16 +146,13 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
+ struct printbuf buf3 = PRINTBUF;
+ const char *msg;
+
+- if (!bch2_debug_check_iterators)
+- return;
+-
+ l = &path->l[level];
+ tmp = l->iter;
+ locked = btree_node_locked(path, level);
+
+ if (path->cached) {
+ if (!level)
+- bch2_btree_path_verify_cached(trans, path);
++ __bch2_btree_path_verify_cached(trans, path);
+ return;
+ }
+
+@@ -217,7 +213,7 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans,
msg, level, buf1.buf, buf2.buf, buf3.buf); + } + +-static void bch2_btree_path_verify(struct btree_trans *trans, ++static void __bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) + { + struct bch_fs *c = trans->c; +@@ -229,23 +225,25 @@ static void bch2_btree_path_verify(struct btree_trans *trans, + break; + } + +- bch2_btree_path_verify_level(trans, path, i); ++ __bch2_btree_path_verify_level(trans, path, i); + } + +- bch2_btree_path_verify_locks(path); ++ bch2_btree_path_verify_locks(trans, path); + } + +-void bch2_trans_verify_paths(struct btree_trans *trans) ++void __bch2_trans_verify_paths(struct btree_trans *trans) + { + struct btree_path *path; + unsigned iter; + + trans_for_each_path(trans, path, iter) +- bch2_btree_path_verify(trans, path); ++ __bch2_btree_path_verify(trans, path); + } + +-static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter) ++static void __bch2_btree_iter_verify(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; ++ + BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); + + BUG_ON((iter->flags & BTREE_ITER_is_extents) && +@@ -256,11 +254,11 @@ static void bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter + !btree_type_has_snapshot_field(iter->btree_id)); + + if (iter->update_path) +- bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); +- bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); ++ __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); ++ __bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); + } + +-static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) ++static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) + { + BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && + !iter->pos.snapshot); +@@ -274,15 +272,9 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) + bkey_gt(iter->pos, iter->k.p))); + } + +-static int bch2_btree_iter_verify_ret(struct btree_trans *trans, +- struct btree_iter *iter, struct bkey_s_c k) ++static int __bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) + { +- struct btree_iter copy; +- struct bkey_s_c prev; +- int ret = 0; +- +- if (!bch2_debug_check_iterators) +- return 0; ++ struct btree_trans *trans = iter->trans; + + if (!(iter->flags & BTREE_ITER_filter_snapshots)) + return 0; +@@ -294,16 +286,16 @@ static int bch2_btree_iter_verify_ret(struct btree_trans *trans, + iter->snapshot, + k.k->p.snapshot)); + +- bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, +- BTREE_ITER_nopreserve| +- BTREE_ITER_all_snapshots); +- prev = bch2_btree_iter_prev(trans, ©); ++ CLASS(btree_iter, copy)(trans, iter->btree_id, iter->pos, ++ BTREE_ITER_nopreserve| ++ BTREE_ITER_all_snapshots); ++ struct bkey_s_c prev = bch2_btree_iter_prev(©); + if (!prev.k) +- goto out; ++ return 0; + +- ret = bkey_err(prev); ++ int ret = bkey_err(prev); + if (ret) +- goto out; ++ return ret; + + if (bkey_eq(prev.k->p, k.k->p) && + bch2_snapshot_is_ancestor(trans->c, iter->snapshot, +@@ -319,12 +311,11 @@ static int bch2_btree_iter_verify_ret(struct btree_trans *trans, + iter->snapshot, + buf1.buf, buf2.buf); + } +-out: +- bch2_trans_iter_exit(trans, ©); +- return ret; ++ ++ return 0; + } + +-void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, ++void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos) + { + 
bch2_trans_verify_not_unlocked_or_in_restart(trans); +@@ -357,19 +348,39 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf); + } + +-#else +- + static inline void bch2_btree_path_verify_level(struct btree_trans *trans, +- struct btree_path *path, unsigned l) {} ++ struct btree_path *path, unsigned l) ++{ ++ if (static_branch_unlikely(&bch2_debug_check_iterators)) ++ __bch2_btree_path_verify_level(trans, path, l); ++} ++ + static inline void bch2_btree_path_verify(struct btree_trans *trans, +- struct btree_path *path) {} +-static inline void bch2_btree_iter_verify(struct btree_trans *trans, +- struct btree_iter *iter) {} +-static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} +-static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter, +- struct bkey_s_c k) { return 0; } ++ struct btree_path *path) ++{ ++ if (static_branch_unlikely(&bch2_debug_check_iterators)) ++ __bch2_btree_path_verify(trans, path); ++} + +-#endif ++static inline void bch2_btree_iter_verify(struct btree_iter *iter) ++{ ++ if (static_branch_unlikely(&bch2_debug_check_iterators)) ++ __bch2_btree_iter_verify(iter); ++} ++ ++static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) ++{ ++ if (static_branch_unlikely(&bch2_debug_check_iterators)) ++ __bch2_btree_iter_verify_entry_exit(iter); ++} ++ ++static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ return static_branch_unlikely(&bch2_debug_check_iterators) ++ ? __bch2_btree_iter_verify_ret(iter, k) ++ : 0; ++} + + /* Btree path: fixups after btree updates */ + +@@ -523,7 +534,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, + __bch2_btree_node_iter_fix(path, b, node_iter, t, + where, clobber_u64s, new_u64s); + +- if (bch2_debug_check_iterators) ++ if (static_branch_unlikely(&bch2_debug_check_iterators)) + bch2_btree_node_iter_verify(node_iter, b); + } + +@@ -631,6 +642,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str + + trans_for_each_update(trans, i) + if (!i->cached && ++ !i->key_cache_flushing && + i->level == b->c.level && + i->btree_id == b->c.btree_id && + bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && +@@ -876,8 +888,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, + + static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, + struct btree_path *path, +- unsigned flags, +- struct bkey_buf *out) ++ enum btree_iter_update_trigger_flags flags) + { + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); +@@ -889,7 +900,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, + + k = bch2_btree_and_journal_iter_peek(&jiter); + if (!k.k) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); +@@ -897,11 +908,10 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, + bch2_btree_pos_to_text(&buf, c, l->b); + + ret = bch2_fs_topology_error(c, "%s", buf.buf); +- printbuf_exit(&buf); + goto err; + } + +- bch2_bkey_buf_reassemble(out, c, k); ++ bkey_reassemble(&trans->btree_path_down, k); + + if ((flags & BTREE_ITER_prefetch) && + c->opts.btree_node_prefetch) +@@ -912,9 +922,25 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, + return ret; + } + ++static 
noinline_for_stack int btree_node_missing_err(struct btree_trans *trans, ++ struct btree_path *path) ++{ ++ struct bch_fs *c = trans->c; ++ CLASS(printbuf, buf)(); ++ ++ prt_str(&buf, "node not found at pos "); ++ bch2_bpos_to_text(&buf, path->pos); ++ prt_str(&buf, " within parent node "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key)); ++ ++ bch2_fs_fatal_error(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ return bch_err_throw(c, btree_need_topology_repair); ++} ++ + static __always_inline int btree_path_down(struct btree_trans *trans, + struct btree_path *path, +- unsigned flags, ++ enum btree_iter_update_trigger_flags flags, + unsigned long trace_ip) + { + struct bch_fs *c = trans->c; +@@ -922,51 +948,38 @@ static __always_inline int btree_path_down(struct btree_trans *trans, + struct btree *b; + unsigned level = path->level - 1; + enum six_lock_type lock_type = __btree_lock_want(path, level); +- struct bkey_buf tmp; + int ret; + + EBUG_ON(!btree_node_locked(path, path->level)); + +- bch2_bkey_buf_init(&tmp); +- + if (unlikely(trans->journal_replay_not_finished)) { +- ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); ++ ret = btree_node_iter_and_journal_peek(trans, path, flags); + if (ret) +- goto err; ++ return ret; + } else { + struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b); +- if (!k) { +- struct printbuf buf = PRINTBUF; +- +- prt_str(&buf, "node not found at pos "); +- bch2_bpos_to_text(&buf, path->pos); +- prt_str(&buf, " within parent node "); +- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key)); +- +- bch2_fs_fatal_error(c, "%s", buf.buf); +- printbuf_exit(&buf); +- ret = -BCH_ERR_btree_need_topology_repair; +- goto err; +- } ++ if (unlikely(!k)) ++ return btree_node_missing_err(trans, path); + +- bch2_bkey_buf_unpack(&tmp, c, l->b, k); ++ bch2_bkey_unpack(l->b, &trans->btree_path_down, k); + +- if ((flags & BTREE_ITER_prefetch) && ++ if (unlikely((flags & BTREE_ITER_prefetch)) && + c->opts.btree_node_prefetch) { + ret = btree_path_prefetch(trans, path); + if (ret) +- goto err; ++ return ret; + } + } + +- b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); ++ b = bch2_btree_node_get(trans, path, &trans->btree_path_down, ++ level, lock_type, trace_ip); + ret = PTR_ERR_OR_ZERO(b); + if (unlikely(ret)) +- goto err; ++ return ret; + +- if (likely(!trans->journal_replay_not_finished && +- tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && +- unlikely(b != btree_node_mem_ptr(tmp.k))) ++ if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) && ++ likely(!trans->journal_replay_not_finished && ++ trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2)) + btree_node_mem_ptr_set(trans, path, level + 1, b); + + if (btree_node_read_locked(path, level + 1)) +@@ -977,10 +990,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans, + path->level = level; + bch2_btree_path_level_init(trans, path, b); + +- bch2_btree_path_verify_locks(path); +-err: +- bch2_bkey_buf_exit(&tmp, c); +- return ret; ++ bch2_btree_path_verify_locks(trans, path); ++ return 0; + } + + static int bch2_btree_path_traverse_all(struct btree_trans *trans) +@@ -992,7 +1003,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) + int ret = 0; + + if (trans->in_traverse_all) +- return -BCH_ERR_transaction_restart_in_traverse_all; ++ return bch_err_throw(c, transaction_restart_in_traverse_all); + + trans->in_traverse_all = true; + retry_all: +@@ -1089,7 +1100,7 @@ static void 
btree_path_set_level_down(struct btree_trans *trans, + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(trans, path, l); + +- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_path_verify(trans, path); + } + +@@ -1137,7 +1148,7 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, + */ + int bch2_btree_path_traverse_one(struct btree_trans *trans, + btree_path_idx_t path_idx, +- unsigned flags, ++ enum btree_iter_update_trigger_flags flags, + unsigned long trace_ip) + { + struct btree_path *path = &trans->paths[path_idx]; +@@ -1287,7 +1298,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, + if (unlikely(path->cached)) { + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); +- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); + goto out; + } + +@@ -1316,7 +1327,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, + } + + if (unlikely(level != path->level)) { +- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); ++ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); + __bch2_btree_path_unlock(trans, path); + } + out: +@@ -1385,45 +1396,45 @@ static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_p + + void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) + { +- struct btree_path *path = trans->paths + path_idx, *dup; ++ struct btree_path *path = trans->paths + path_idx, *dup = NULL; + + if (!__btree_path_put(trans, path, intent)) + return; + ++ if (!path->preserve && !path->should_be_locked) ++ goto free; ++ + dup = path->preserve + ? have_path_at_pos(trans, path) + : have_node_at_pos(trans, path); +- +- trace_btree_path_free(trans, path_idx, dup); +- +- if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) ++ if (!dup) + return; + +- if (path->should_be_locked && !trans->restarted) { +- if (!dup) +- return; +- ++ /* ++ * If we need this path locked, the duplicate also has to be locked ++ * before we free this one: ++ */ + if (path->should_be_locked && ++ !dup->should_be_locked && ++ !trans->restarted) { + if (!(trans->locked + ?
bch2_btree_path_relock_norestart(trans, dup) + : bch2_btree_path_can_relock(trans, dup))) + return; +- } + +- if (dup) { +- dup->preserve |= path->preserve; +- dup->should_be_locked |= path->should_be_locked; ++ dup->should_be_locked = true; + } + +- __bch2_path_free(trans, path_idx); +-} +- +-static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path, +- bool intent) +-{ +- if (!__btree_path_put(trans, trans->paths + path, intent)) +- return; ++ BUG_ON(path->should_be_locked && ++ !trans->restarted && ++ trans->locked && ++ !btree_node_locked(dup, dup->level)); + +- __bch2_path_free(trans, path); ++ path->should_be_locked = false; ++ dup->preserve |= path->preserve; ++free: ++ trace_btree_path_free(trans, path_idx, dup); ++ __bch2_path_free(trans, path_idx); + } + + void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) +@@ -1436,7 +1447,7 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_ + static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) + { + #ifdef CONFIG_BCACHEFS_DEBUG +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_prt_backtrace(&buf, &trans->last_restarted_trace); + panic("in transaction restart: %s, last restarted by\n%s", + bch2_err_str(trans->restarted), +@@ -1485,7 +1496,7 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) + prt_newline(buf); + } + +- for (struct jset_entry *e = trans->journal_entries; ++ for (struct jset_entry *e = btree_trans_journal_entries_start(trans); + e != btree_trans_journal_entries_top(trans); + e = vstruct_next(e)) { + bch2_journal_entry_to_text(buf, trans->c, e); +@@ -1586,13 +1597,13 @@ void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) + static noinline __cold + void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) + { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(trans->c, &buf); + + __bch2_trans_paths_to_text(&buf, trans, nosort); + bch2_trans_updates_to_text(&buf, trans); + +- bch2_print_str(trans->c, buf.buf); +- printbuf_exit(&buf); ++ bch2_print_str(trans->c, KERN_ERR, buf.buf); + } + + noinline __cold +@@ -1605,22 +1616,19 @@ noinline __cold + static void bch2_trans_update_max_paths(struct btree_trans *trans) + { + struct btree_transaction_stats *s = btree_trans_stats(trans); +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths); + + bch2_trans_paths_to_text(&buf, trans); + + if (!buf.allocation_failure) { +- mutex_lock(&s->lock); ++ guard(mutex)(&s->lock); + if (nr > s->nr_max_paths) { + s->nr_max_paths = nr; + swap(s->max_paths_text, buf.buf); + } +- mutex_unlock(&s->lock); + } + +- printbuf_exit(&buf); +- + trans->nr_paths_max = nr; + } + +@@ -1628,11 +1636,10 @@ noinline __cold + int __bch2_btree_trans_too_many_iters(struct btree_trans *trans) + { + if (trace_trans_restart_too_many_iters_enabled()) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_trans_paths_to_text(&buf, trans); + trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf); +- printbuf_exit(&buf); + } + + count_event(trans->c, trans_restart_too_many_iters); +@@ -1722,7 +1729,8 @@ static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans, + btree_path_idx_t bch2_path_get(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, +- unsigned flags,
unsigned long ip) ++ enum btree_iter_update_trigger_flags flags, ++ unsigned long ip) + { + struct btree_path *path; + bool cached = flags & BTREE_ITER_cached; +@@ -1735,6 +1743,10 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, + + btree_trans_sort_paths(trans); + ++ if (intent) ++ locks_want = max(locks_want, level + 1); ++ locks_want = min(locks_want, BTREE_MAX_DEPTH); ++ + trans_for_each_path_inorder(trans, path, iter) { + if (__btree_path_cmp(path, + btree_id, +@@ -1749,7 +1761,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, + if (path_pos && + trans->paths[path_pos].cached == cached && + trans->paths[path_pos].btree_id == btree_id && +- trans->paths[path_pos].level == level) { ++ trans->paths[path_pos].level == level && ++ bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) { + trace_btree_path_get(trans, trans->paths + path_pos, &pos); + + __btree_path_get(trans, trans->paths + path_pos, intent); +@@ -1781,9 +1794,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, + if (!(flags & BTREE_ITER_nopreserve)) + path->preserve = true; + +- if (path->intent_ref) +- locks_want = max(locks_want, level + 1); +- + /* + * If the path has locks_want greater than requested, we don't downgrade + * it here - on transaction restart because btree node split needs to +@@ -1792,10 +1802,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, + * a successful transaction commit. + */ + +- locks_want = min(locks_want, BTREE_MAX_DEPTH); +- if (locks_want > path->locks_want) +- bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); +- + return path_idx; + } + +@@ -1855,8 +1861,10 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * + return (struct bkey_s_c) { u, NULL }; + } + +-void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) ++void bch2_set_btree_iter_dontneed(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; ++ + if (!iter->path || trans->restarted) + return; + +@@ -1868,14 +1876,17 @@ void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter * + /* Btree iterators: */ + + int __must_check +-__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) ++__bch2_btree_iter_traverse(struct btree_iter *iter) + { +- return bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + } + + int __must_check +-bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) ++bch2_btree_iter_traverse(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; ++ int ret; ++ + bch2_trans_verify_not_unlocked_or_in_restart(trans); + + iter->path = bch2_btree_path_set_pos(trans, iter->path, +@@ -1883,7 +1894,7 @@ bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) + iter->flags & BTREE_ITER_intent, + btree_iter_ip_allocated(iter)); + +- int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + if (ret) + return ret; + +@@ -1895,14 +1906,14 @@ bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) + + /* Iterate across nodes (leaf and interior nodes) */ + +-struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans, +- struct btree_iter *iter) ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; + struct
btree *b = NULL; + int ret; + + EBUG_ON(trans->paths[iter->path].cached); +- bch2_btree_iter_verify(trans, iter); ++ bch2_btree_iter_verify(iter); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (ret) +@@ -1924,7 +1935,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans, + btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); + out: + bch2_btree_iter_verify_entry_exit(iter); +- bch2_btree_iter_verify(trans, iter); ++ bch2_btree_iter_verify(iter); + + return b; + err: +@@ -1933,26 +1944,26 @@ struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans, + } + + /* Only kept for -tools */ +-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans, +- struct btree_iter *iter) ++struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) + { + struct btree *b; + +- while (b = bch2_btree_iter_peek_node(trans, iter), ++ while (b = bch2_btree_iter_peek_node(iter), + bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) +- bch2_trans_begin(trans); ++ bch2_trans_begin(iter->trans); + + return b; + } + +-struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter) ++struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; + struct btree *b = NULL; + int ret; + + EBUG_ON(trans->paths[iter->path].cached); + bch2_trans_verify_not_unlocked_or_in_restart(trans); +- bch2_btree_iter_verify(trans, iter); ++ bch2_btree_iter_verify(iter); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (ret) +@@ -1967,6 +1978,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ + + /* got to end? */ + if (!btree_path_node(path, path->level + 1)) { ++ path->should_be_locked = false; + btree_path_set_level_up(trans, path); + return NULL; + } +@@ -1978,12 +1990,12 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ + bch2_btree_path_downgrade(trans, path); + + if (!bch2_btree_node_relock(trans, path, path->level + 1)) { ++ trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); ++ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + __bch2_btree_path_unlock(trans, path); + path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); + path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); +- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); +- trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); +- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); ++ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); + goto err; + } + +@@ -2025,7 +2037,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ + EBUG_ON(btree_iter_path(trans, iter)->uptodate); + out: + bch2_btree_iter_verify_entry_exit(iter); +- bch2_btree_iter_verify(trans, iter); ++ bch2_btree_iter_verify(iter); + + return b; + err: +@@ -2035,7 +2047,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_ + } + + /* Iterate across keys (in leaf nodes only) */ + +-inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter) ++inline bool bch2_btree_iter_advance(struct btree_iter *iter) + { + struct bpos pos = iter->k.p; + bool ret = !(iter->flags & BTREE_ITER_all_snapshots +@@ -2044,11 +2056,11 @@ inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter + + if
(ret && !(iter->flags & BTREE_ITER_is_extents)) + pos = bkey_successor(iter, pos); +- bch2_btree_iter_set_pos(trans, iter, pos); ++ bch2_btree_iter_set_pos(iter, pos); + return ret; + } + +-inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter) ++inline bool bch2_btree_iter_rewind(struct btree_iter *iter) + { + struct bpos pos = bkey_start_pos(&iter->k); + bool ret = !(iter->flags & BTREE_ITER_all_snapshots +@@ -2057,20 +2069,20 @@ inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter + + if (ret && !(iter->flags & BTREE_ITER_is_extents)) + pos = bkey_predecessor(iter, pos); +- bch2_btree_iter_set_pos(trans, iter, pos); ++ bch2_btree_iter_set_pos(iter, pos); + return ret; + } + + static noinline + void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter, +- struct bkey_s_c *k) ++ struct bpos search_key, struct bkey_s_c *k) + { + struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key; + + trans_for_each_update(trans, i) + if (!i->key_cache_already_flushed && + i->btree_id == iter->btree_id && +- bpos_le(i->k->k.p, iter->pos) && ++ bpos_le(i->k->k.p, search_key) && + bpos_ge(i->k->k.p, k->k ? k->k->p : end)) { + iter->k = i->k->k; + *k = bkey_i_to_s_c(i->k); +@@ -2079,6 +2091,7 @@ void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_ + + static noinline + void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter, ++ struct bpos search_key, + struct bkey_s_c *k) + { + struct btree_path *path = btree_iter_path(trans, iter); +@@ -2087,7 +2100,7 @@ void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter + trans_for_each_update(trans, i) + if (!i->key_cache_already_flushed && + i->btree_id == iter->btree_id && +- bpos_ge(i->k->k.p, path->pos) && ++ bpos_ge(i->k->k.p, search_key) && + bpos_le(i->k->k.p, k->k ? k->k->p : end)) { + iter->k = i->k->k; + *k = bkey_i_to_s_c(i->k); +@@ -2109,13 +2122,14 @@ void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_ + + static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, + struct btree_iter *iter, ++ struct bpos search_pos, + struct bpos end_pos) + { + struct btree_path *path = btree_iter_path(trans, iter); + + return bch2_journal_keys_peek_max(trans->c, iter->btree_id, + path->level, +- path->pos, ++ search_pos, + end_pos, + &iter->journal_idx); + } +@@ -2125,7 +2139,7 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, + struct btree_iter *iter) + { + struct btree_path *path = btree_iter_path(trans, iter); +- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos); ++ struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos, path->pos); + + if (k) { + iter->k = k->k; +@@ -2138,11 +2152,12 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, + static noinline + void btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, ++ struct bpos search_key, + struct bkey_s_c *k) + { + struct btree_path *path = btree_iter_path(trans, iter); + struct bkey_i *next_journal = +- bch2_btree_journal_peek(trans, iter, ++ bch2_btree_journal_peek(trans, iter, search_key, + k->k ?
k->k->p : path_l(path)->b->key.k.p); + if (next_journal) { + iter->k = next_journal->k; +@@ -2152,13 +2167,14 @@ void btree_trans_peek_journal(struct btree_trans *trans, + + static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, + struct btree_iter *iter, ++ struct bpos search_key, + struct bpos end_pos) + { + struct btree_path *path = btree_iter_path(trans, iter); + + return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, + path->level, +- path->pos, ++ search_key, + end_pos, + &iter->journal_idx); + } +@@ -2166,12 +2182,13 @@ static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, + static noinline + void btree_trans_peek_prev_journal(struct btree_trans *trans, + struct btree_iter *iter, ++ struct bpos search_key, + struct bkey_s_c *k) + { + struct btree_path *path = btree_iter_path(trans, iter); + struct bkey_i *next_journal = +- bch2_btree_journal_peek_prev(trans, iter, +- k->k ? k->k->p : path_l(path)->b->key.k.p); ++ bch2_btree_journal_peek_prev(trans, iter, search_key, ++ k->k ? k->k->p : path_l(path)->b->data->min_key); + + if (next_journal) { + iter->k = next_journal->k; +@@ -2184,9 +2201,9 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans, + * bkey_s_c_null: + */ + static noinline +-struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter, +- struct bpos pos) ++struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) + { ++ struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + struct bkey u; + struct bkey_s_c k; +@@ -2232,14 +2249,14 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btr + return k; + } + +-static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter, +- struct bpos search_key) ++static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) + { ++ struct btree_trans *trans = iter->trans; + struct bkey_s_c k, k2; + int ret; + + EBUG_ON(btree_iter_path(trans, iter)->cached); +- bch2_btree_iter_verify(trans, iter); ++ bch2_btree_iter_verify(iter); + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, +@@ -2249,7 +2266,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ +- bch2_btree_iter_set_pos(trans, iter, iter->pos); ++ bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + break; + } +@@ -2259,7 +2276,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct + + if (unlikely(!l->b)) { + /* No btree nodes at requested level: */ +- bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); ++ bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + break; + } +@@ -2270,20 +2287,21 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct + + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && + k.k && +- (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { ++ !bkey_deleted(k.k) && ++ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + k = k2; + if (bkey_err(k)) { +- bch2_btree_iter_set_pos(trans, iter, iter->pos); ++ bch2_btree_iter_set_pos(iter, iter->pos); + break; + } + } + + if (unlikely(iter->flags & BTREE_ITER_with_journal)) +- btree_trans_peek_journal(trans, iter, &k); ++ btree_trans_peek_journal(trans, iter, search_key, &k);
+ + if (unlikely((iter->flags & BTREE_ITER_with_updates) && + trans->nr_updates)) +- bch2_btree_trans_peek_updates(trans, iter, &k); ++ bch2_btree_trans_peek_updates(trans, iter, search_key, &k); + + if (k.k && bkey_deleted(k.k)) { + /* +@@ -2306,28 +2324,41 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct + search_key = bpos_successor(l->b->key.k.p); + } else { + /* End of btree: */ +- bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); ++ bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + break; + } + } + +- bch2_btree_iter_verify(trans, iter); ++ bch2_btree_iter_verify(iter); ++ ++ if (trace___btree_iter_peek_enabled()) { ++ CLASS(printbuf, buf)(); ++ ++ int ret = bkey_err(k); ++ if (ret) ++ prt_str(&buf, bch2_err_str(ret)); ++ else if (k.k) ++ bch2_bkey_val_to_text(&buf, trans->c, k); ++ else ++ prt_str(&buf, "(null)"); ++ trace___btree_iter_peek(trans->c, buf.buf); ++ } ++ + return k; + } + + /** + * bch2_btree_iter_peek_max() - returns first key greater than or equal to + * iterator's current position +- * @trans: btree transaction object + * @iter: iterator to peek from + * @end: search limit: returns keys less than or equal to @end + * + * Returns: key if found, or an error extractable with bkey_err(). + */ +-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter, +- struct bpos end) ++struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end) + { ++ struct btree_trans *trans = iter->trans; + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_s_c k; + struct bpos iter_pos = iter->pos; +@@ -2344,13 +2375,12 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree + } + + if (iter->update_path) { +- bch2_path_put_nokeep(trans, iter->update_path, +- iter->flags & BTREE_ITER_intent); ++ bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); + iter->update_path = 0; + } + + while (1) { +- k = __bch2_btree_iter_peek(trans, iter, search_key); ++ k = __bch2_btree_iter_peek(iter, search_key); + if (unlikely(!k.k)) + goto end; + if (unlikely(bkey_err(k))) +@@ -2374,8 +2404,8 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree + + if (iter->update_path && + !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { +- bch2_path_put_nokeep(trans, iter->update_path, +- iter->flags & BTREE_ITER_intent); ++ bch2_path_put(trans, iter->update_path, ++ iter->flags & BTREE_ITER_intent); + iter->update_path = 0; + } + +@@ -2421,7 +2451,7 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree + } + + if (bkey_whiteout(k.k) && +- !(iter->flags & BTREE_ITER_key_cache_fill)) { ++ !(iter->flags & BTREE_ITER_nofilter_whiteouts)) { + search_key = bkey_successor(iter, k.k->p); + continue; + } +@@ -2464,17 +2494,30 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree + if (!(iter->flags & BTREE_ITER_all_snapshots)) + iter->pos.snapshot = iter->snapshot; + +- ret = bch2_btree_iter_verify_ret(trans, iter, k); ++ ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) { +- bch2_btree_iter_set_pos(trans, iter, iter->pos); ++ bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + } + + bch2_btree_iter_verify_entry_exit(iter); + ++ if (trace_btree_iter_peek_max_enabled()) { ++ CLASS(printbuf, buf)(); ++ ++ int ret = bkey_err(k); ++ if (ret) ++ prt_str(&buf, bch2_err_str(ret)); ++ else if (k.k) ++ bch2_bkey_val_to_text(&buf, trans->c, k); ++
else ++ prt_str(&buf, "(null)"); ++ trace_btree_iter_peek_max(trans->c, buf.buf); ++ } ++ + return k; + end: +- bch2_btree_iter_set_pos(trans, iter, end); ++ bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out_no_locked; + } +@@ -2482,25 +2525,24 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree + /** + * bch2_btree_iter_next() - returns first key greater than iterator's current + * position +- * @trans: btree transaction object + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable with bkey_err(). + */ +-struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter) ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + { +- if (!bch2_btree_iter_advance(trans, iter)) ++ if (!bch2_btree_iter_advance(iter)) + return bkey_s_c_null; + +- return bch2_btree_iter_peek(trans, iter); ++ return bch2_btree_iter_peek(iter); + } + +-static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter, +- struct bpos search_key) ++static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key) + { ++ struct btree_trans *trans = iter->trans; + struct bkey_s_c k, k2; + +- bch2_btree_iter_verify(trans, iter); ++ bch2_btree_iter_verify(iter); + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, +@@ -2510,7 +2552,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, st + int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ +- bch2_btree_iter_set_pos(trans, iter, iter->pos); ++ bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + break; + } +@@ -2520,7 +2562,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, st + + if (unlikely(!l->b)) { + /* No btree nodes at requested level: */ +- bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); ++ bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + break; + } +@@ -2536,20 +2578,21 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, st + + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && + k.k && +- (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { ++ !bkey_deleted(k.k) && ++ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + k = k2; + if (bkey_err(k2)) { +- bch2_btree_iter_set_pos(trans, iter, iter->pos); ++ bch2_btree_iter_set_pos(iter, iter->pos); + break; + } + } + + if (unlikely(iter->flags & BTREE_ITER_with_journal)) +- btree_trans_peek_prev_journal(trans, iter, &k); ++ btree_trans_peek_prev_journal(trans, iter, search_key, &k); + + if (unlikely((iter->flags & BTREE_ITER_with_updates) && + trans->nr_updates)) +- bch2_btree_trans_peek_prev_updates(trans, iter, &k); ++ bch2_btree_trans_peek_prev_updates(trans, iter, search_key, &k); + + if (likely(k.k && !bkey_deleted(k.k))) { + break; +@@ -2560,27 +2603,25 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, st + search_key = bpos_predecessor(path->l[0].b->data->min_key); + } else { + /* Start of btree: */ +- bch2_btree_iter_set_pos(trans, iter, POS_MIN); ++ bch2_btree_iter_set_pos(iter, POS_MIN); + k = bkey_s_c_null; + break; + } + } + +- bch2_btree_iter_verify(trans, iter); ++ bch2_btree_iter_verify(iter); + return k; + } + + /** + * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to + * iterator's current position
+- * @trans: btree transaction object + * @iter: iterator to peek from + * @end: search limit: returns keys greater than or equal to @end + * + * Returns: key if found, or an error extractable with bkey_err(). + */ +-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter, +- struct bpos end) ++struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) + { + if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && + !bkey_eq(iter->pos, POS_MAX) && +@@ -2595,7 +2636,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct + * real visible extents - easiest to just use peek_slot() (which + * internally uses peek() for extents) + */ +- struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) + return k; + +@@ -2605,6 +2646,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct + return k; + } + ++ struct btree_trans *trans = iter->trans; + struct bpos search_key = iter->pos; + struct bkey_s_c k; + btree_path_idx_t saved_path = 0; +@@ -2620,7 +2662,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct + } + + while (1) { +- k = __bch2_btree_iter_peek_prev(trans, iter, search_key); ++ k = __bch2_btree_iter_peek_prev(iter, search_key); + if (unlikely(!k.k)) + goto end; + if (unlikely(bkey_err(k))) +@@ -2634,7 +2676,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct + * the last possible snapshot overwrite, return + * it: + */ +- bch2_path_put_nokeep(trans, iter->path, ++ bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_intent); + iter->path = saved_path; + saved_path = 0; +@@ -2664,8 +2706,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct + * our previous saved candidate: + */ + if (saved_path) { +- bch2_path_put_nokeep(trans, saved_path, +- iter->flags & BTREE_ITER_intent); ++ bch2_path_put(trans, saved_path, ++ iter->flags & BTREE_ITER_intent); + saved_path = 0; + } + +@@ -2702,19 +2744,32 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct + } + + /* Extents can straddle iter->pos: */ +- iter->pos = bpos_min(iter->pos, k.k->p);; ++ iter->pos = bpos_min(iter->pos, k.k->p); + + if (iter->flags & BTREE_ITER_filter_snapshots) + iter->pos.snapshot = iter->snapshot; + out_no_locked: + if (saved_path) +- bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent); ++ bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent); + + bch2_btree_iter_verify_entry_exit(iter); +- bch2_btree_iter_verify(trans, iter); ++ bch2_btree_iter_verify(iter); ++ ++ if (trace_btree_iter_peek_prev_min_enabled()) { ++ CLASS(printbuf, buf)(); ++ ++ int ret = bkey_err(k); ++ if (ret) ++ prt_str(&buf, bch2_err_str(ret)); ++ else if (k.k) ++ bch2_bkey_val_to_text(&buf, trans->c, k); ++ else ++ prt_str(&buf, "(null)"); ++ trace_btree_iter_peek_prev_min(trans->c, buf.buf); ++ } + return k; + end: +- bch2_btree_iter_set_pos(trans, iter, end); ++ bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out_no_locked; + } +@@ -2722,27 +2777,27 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct + /** + * bch2_btree_iter_prev() - returns first key less than iterator's current + * position +- * @trans: btree transaction object + * @iter: iterator to peek from + * + * Returns: key if found, or an error extractable
with bkey_err(). + */ +-struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter) ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) + { +- if (!bch2_btree_iter_rewind(trans, iter)) ++ if (!bch2_btree_iter_rewind(iter)) + return bkey_s_c_null; + +- return bch2_btree_iter_peek_prev(trans, iter); ++ return bch2_btree_iter_peek_prev(iter); + } + +-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter) ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; + struct bpos search_key; +- struct bkey_s_c k; ++ struct bkey_s_c k, k2; + int ret; + + bch2_trans_verify_not_unlocked_or_in_restart(trans); +- bch2_btree_iter_verify(trans, iter); ++ bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); + +@@ -2755,10 +2810,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre + /* extents can't span inode numbers: */ + if ((iter->flags & BTREE_ITER_is_extents) && + unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { +- if (iter->pos.inode == KEY_INODE_MAX) +- return bkey_s_c_null; ++ if (iter->pos.inode == KEY_INODE_MAX) { ++ k = bkey_s_c_null; ++ goto out2; ++ } + +- bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); ++ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + } + + search_key = btree_iter_search_key(iter); +@@ -2773,8 +2830,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre + } + + struct btree_path *path = btree_iter_path(trans, iter); +- if (unlikely(!btree_path_node(path, path->level))) +- return bkey_s_c_null; ++ if (unlikely(!btree_path_node(path, path->level))) { ++ k = bkey_s_c_null; ++ goto out2; ++ } + + btree_path_set_should_be_locked(trans, path); + +@@ -2793,21 +2852,22 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre + (k = btree_trans_peek_slot_journal(trans, iter)).k) + goto out; + +- if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && +- (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) { +- if (!bkey_err(k)) +- iter->k = *k.k; +- /* We're not returning a key from iter->path: */ +- goto out; +- } +- + k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); + if (unlikely(!k.k)) + goto out; + ++ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && ++ !bkey_deleted(k.k) && ++ (k2 = btree_trans_peek_key_cache(iter, iter->pos)).k) { ++ k = k2; ++ if (bkey_err(k)) ++ goto out; ++ iter->k = *k.k; ++ } ++ + if (unlikely(k.k->type == KEY_TYPE_whiteout && + (iter->flags & BTREE_ITER_filter_snapshots) && +- !(iter->flags & BTREE_ITER_key_cache_fill))) ++ !(iter->flags & BTREE_ITER_nofilter_whiteouts))) + iter->k.type = KEY_TYPE_deleted; + } else { + struct bpos next; +@@ -2821,21 +2881,21 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre + if (iter->flags & BTREE_ITER_intent) { + struct btree_iter iter2; + +- bch2_trans_copy_iter(trans, &iter2, iter); +- k = bch2_btree_iter_peek_max(trans, &iter2, end); ++ bch2_trans_copy_iter(&iter2, iter); ++ k = bch2_btree_iter_peek_max(&iter2, end); + + if (k.k && !bkey_err(k)) { + swap(iter->key_cache_path, iter2.key_cache_path); + iter->k = iter2.k; + k.k = &iter->k; + } +- bch2_trans_iter_exit(trans, &iter2); ++ bch2_trans_iter_exit(&iter2); + } else { + struct bpos pos = iter->pos; + +- k =
bch2_btree_iter_peek_max(trans, iter, end); ++ k = bch2_btree_iter_peek_max(iter, end); + if (unlikely(bkey_err(k))) +- bch2_btree_iter_set_pos(trans, iter, pos); ++ bch2_btree_iter_set_pos(iter, pos); + else + iter->pos = pos; + } +@@ -2864,39 +2924,52 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btre + } + out: + bch2_btree_iter_verify_entry_exit(iter); +- bch2_btree_iter_verify(trans, iter); +- ret = bch2_btree_iter_verify_ret(trans, iter, k); ++ bch2_btree_iter_verify(iter); ++ ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) +- return bkey_s_c_err(ret); ++ k = bkey_s_c_err(ret); ++out2: ++ if (trace_btree_iter_peek_slot_enabled()) { ++ CLASS(printbuf, buf)(); ++ ++ int ret = bkey_err(k); ++ if (ret) ++ prt_str(&buf, bch2_err_str(ret)); ++ else if (k.k) ++ bch2_bkey_val_to_text(&buf, trans->c, k); ++ else ++ prt_str(&buf, "(null)"); ++ trace_btree_iter_peek_slot(trans->c, buf.buf); ++ } + + return k; + } + +-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter) ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) + { +- if (!bch2_btree_iter_advance(trans, iter)) ++ if (!bch2_btree_iter_advance(iter)) + return bkey_s_c_null; + +- return bch2_btree_iter_peek_slot(trans, iter); ++ return bch2_btree_iter_peek_slot(iter); + } + +-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter) ++struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) + { +- if (!bch2_btree_iter_rewind(trans, iter)) ++ if (!bch2_btree_iter_rewind(iter)) + return bkey_s_c_null; + +- return bch2_btree_iter_peek_slot(trans, iter); ++ return bch2_btree_iter_peek_slot(iter); + } + + /* Obsolete, but still used by rust wrapper in -tools */ +-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter) ++struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) + { + struct bkey_s_c k; + +- while (btree_trans_too_many_iters(trans) || +- (k = bch2_btree_iter_peek_type(trans, iter, iter->flags), ++ while (btree_trans_too_many_iters(iter->trans) || ++ (k = bch2_btree_iter_peek_type(iter, iter->flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) +- bch2_trans_begin(trans); ++ bch2_trans_begin(iter->trans); + + return k; + } +@@ -2929,7 +3002,7 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) + struct btree_path *path, *prev = NULL; + struct trans_for_each_path_inorder_iter iter; + +- if (!bch2_debug_check_iterators) ++ if (!static_branch_unlikely(&bch2_debug_check_iterators)) + return; + + trans_for_each_path_inorder(trans, path, iter) { +@@ -3028,10 +3101,12 @@ static inline void btree_path_list_add(struct btree_trans *trans, + btree_trans_verify_sorted_refs(trans); + } + +-void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) ++void bch2_trans_iter_exit(struct btree_iter *iter) + { ++ struct btree_trans *trans = iter->trans; ++ + if (iter->update_path) +- bch2_path_put_nokeep(trans, iter->update_path, ++ bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_intent); + if (iter->path) + bch2_path_put(trans, iter->path, +@@ -3042,16 +3117,18 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) + iter->path = 0; + iter->update_path = 0; + iter->key_cache_path = 0; ++ iter->trans = NULL; + } + + void bch2_trans_iter_init_outlined(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_id
btree_id, struct bpos pos, +- unsigned flags) ++ enum btree_iter_update_trigger_flags flags, ++ unsigned long ip) + { + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, + bch2_btree_iter_flags(trans, btree_id, 0, flags), +- _RET_IP_); ++ ip); + } + + void bch2_trans_node_iter_init(struct btree_trans *trans, +@@ -3060,7 +3137,7 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, + struct bpos pos, + unsigned locks_want, + unsigned depth, +- unsigned flags) ++ enum btree_iter_update_trigger_flags flags) + { + flags |= BTREE_ITER_not_extents; + flags |= BTREE_ITER_snapshot_field; +@@ -3081,9 +3158,10 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, + BUG_ON(iter->min_depth != depth); + } + +-void bch2_trans_copy_iter(struct btree_trans *trans, +- struct btree_iter *dst, struct btree_iter *src) ++void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) + { ++ struct btree_trans *trans = src->trans; ++ + *dst = *src; + #ifdef TRACK_PATH_ALLOCATED + dst->ip_allocated = _RET_IP_; +@@ -3095,7 +3173,19 @@ void bch2_trans_copy_iter(struct btree_trans *trans, + dst->key_cache_path = 0; + } + +-void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) ++#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE ++void bch2_trans_kmalloc_trace_to_text(struct printbuf *out, ++ darray_trans_kmalloc_trace *trace) ++{ ++ printbuf_tabstops_reset(out); ++ printbuf_tabstop_push(out, 60); ++ ++ darray_for_each(*trace, i) ++ prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes); ++} ++#endif ++ ++void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip) + { + struct bch_fs *c = trans->c; + unsigned new_top = trans->mem_top + size; +@@ -3105,74 +3195,75 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) + void *new_mem; + void *p; + +- WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); ++ if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { ++#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(c, &buf); ++ prt_printf(&buf, "bump allocator exceeded BTREE_TRANS_MEM_MAX (%u)\n", ++ BTREE_TRANS_MEM_MAX); ++ ++ bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); ++ bch2_print_str(c, KERN_ERR, buf.buf); ++#endif ++ } + + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (ret) + return ERR_PTR(ret); + + struct btree_transaction_stats *s = btree_trans_stats(trans); +- s->max_mem = max(s->max_mem, new_bytes); +- +- if (trans->used_mempool) { +- if (trans->mem_bytes >= new_bytes) +- goto out_change_top; +- +- /* No more space from mempool item, need malloc new one */ +- new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN); +- if (unlikely(!new_mem)) { +- bch2_trans_unlock(trans); +- +- new_mem = kmalloc(new_bytes, GFP_KERNEL); +- if (!new_mem) +- return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); ++ if (new_bytes > s->max_mem) { ++ guard(mutex)(&s->lock); ++#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE ++ darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr); ++ s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size, ++ trans->trans_kmalloc_trace.nr); ++ ++ memcpy(s->trans_kmalloc_trace.data, ++ trans->trans_kmalloc_trace.data, ++ sizeof(s->trans_kmalloc_trace.data[0]) * ++ s->trans_kmalloc_trace.nr); ++#endif ++ s->max_mem = new_bytes; ++ } + +- ret = bch2_trans_relock(trans); +- if (ret) { +- kfree(new_mem); +- return ERR_PTR(ret); +- } +- } +- memcpy(new_mem, trans->mem, trans->mem_top); +- trans->used_mempool = false; +- mempool_free(trans->mem,
&c->btree_trans_mem_pool); +- goto out_new_mem; ++ if (trans->used_mempool || new_bytes > BTREE_TRANS_MEM_MAX) { ++ EBUG_ON(trans->mem_bytes >= new_bytes); ++ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); + } + +- new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); +- if (unlikely(!new_mem)) { +- bch2_trans_unlock(trans); ++ if (old_bytes) { ++ trans->realloc_bytes_required = new_bytes; ++ trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); ++ return ERR_PTR(btree_trans_restart_ip(trans, ++ BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); ++ } + +- new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); +- if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { +- new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); +- new_bytes = BTREE_TRANS_MEM_MAX; +- memcpy(new_mem, trans->mem, trans->mem_top); +- trans->used_mempool = true; +- kfree(trans->mem); +- } ++ EBUG_ON(trans->mem); ++ EBUG_ON(trans->mem_bytes); ++ EBUG_ON(trans->mem_top); ++ EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX); ++ ++ bool lock_dropped = false; ++ new_mem = allocate_dropping_locks_norelock(trans, lock_dropped, kmalloc(new_bytes, _gfp)); ++ if (!new_mem) { ++ new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); ++ new_bytes = BTREE_TRANS_MEM_MAX; ++ trans->used_mempool = true; ++ } + +- if (!new_mem) +- return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); ++ EBUG_ON(!new_mem); + +- trans->mem = new_mem; +- trans->mem_bytes = new_bytes; ++ trans->mem = new_mem; ++ trans->mem_bytes = new_bytes; + ++ if (unlikely(lock_dropped)) { + ret = bch2_trans_relock(trans); + if (ret) + return ERR_PTR(ret); + } +-out_new_mem: +- trans->mem = new_mem; +- trans->mem_bytes = new_bytes; + +- if (old_bytes) { +- trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); +- return ERR_PTR(btree_trans_restart_ip(trans, +- BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); +- } +-out_change_top: +- p = trans->mem + trans->mem_top; ++ p = trans->mem; + trans->mem_top += size; + memset(p, 0, size); + return p; +@@ -3231,7 +3322,30 @@ u32 bch2_trans_begin(struct btree_trans *trans) + + trans->restart_count++; + trans->mem_top = 0; +- trans->journal_entries = NULL; ++ ++ if (unlikely(trans->restarted == BCH_ERR_transaction_restart_mem_realloced)) { ++ unsigned new_bytes = trans->realloc_bytes_required; ++ EBUG_ON(new_bytes > BTREE_TRANS_MEM_MAX); ++ EBUG_ON(!trans->mem); ++ EBUG_ON(!trans->mem_bytes); ++ ++ bool lock_dropped = false; ++ void *new_mem = allocate_dropping_locks_norelock(trans, lock_dropped, ++ krealloc(trans->mem, new_bytes, _gfp)); ++ (void)lock_dropped; ++ ++ if (!new_mem) { ++ new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); ++ new_bytes = BTREE_TRANS_MEM_MAX; ++ trans->used_mempool = true; ++ kfree(trans->mem); ++ } ++ ++ EBUG_ON(!new_mem); ++ ++ trans->mem = new_mem; ++ trans->mem_bytes = new_bytes; ++ } + + trans_for_each_path(trans, path, i) { + path->should_be_locked = false; +@@ -3285,6 +3399,10 @@ u32 bch2_trans_begin(struct btree_trans *trans) + } + #endif + ++#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE ++ trans->trans_kmalloc_trace.nr = 0; ++#endif ++ + trans_set_locked(trans, false); + + if (trans->restarted) { +@@ -3385,7 +3503,6 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) + } + + trans->nr_paths_max = s->nr_max_paths; +- trans->journal_entries_size = s->journal_entries_size; + } + + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); +@@ -3397,29 +3514,44 @@ struct btree_trans
*__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) + return trans; + } + +-static void check_btree_paths_leaked(struct btree_trans *trans) +-{ + #ifdef CONFIG_BCACHEFS_DEBUG +- struct bch_fs *c = trans->c; ++ ++static bool btree_paths_leaked(struct btree_trans *trans) ++{ + struct btree_path *path; + unsigned i; + + trans_for_each_path(trans, path, i) + if (path->ref) +- goto leaked; +- return; +-leaked: +- bch_err(c, "btree paths leaked from %s!", trans->fn); +- trans_for_each_path(trans, path, i) +- if (path->ref) +- printk(KERN_ERR " btree %s %pS\n", +- bch2_btree_id_str(path->btree_id), +- (void *) path->ip_allocated); +- /* Be noisy about this: */ +- bch2_fatal_error(c); +-#endif ++ return true; ++ return false; + } + ++static void check_btree_paths_leaked(struct btree_trans *trans) ++{ ++ if (btree_paths_leaked(trans)) { ++ struct bch_fs *c = trans->c; ++ struct btree_path *path; ++ unsigned i; ++ ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(c, &buf); ++ ++ prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn); ++ trans_for_each_path(trans, path, i) ++ if (path->ref) ++ prt_printf(&buf, "btree %s %pS\n", ++ bch2_btree_id_str(path->btree_id), ++ (void *) path->ip_allocated); ++ ++ bch2_fs_emergency_read_only2(c, &buf); ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ } ++} ++#else ++static inline void check_btree_paths_leaked(struct btree_trans *trans) {} ++#endif ++ + void bch2_trans_put(struct btree_trans *trans) + __releases(&c->btree_trans_barrier) + { +@@ -3454,6 +3586,9 @@ void bch2_trans_put(struct btree_trans *trans) + #ifdef CONFIG_BCACHEFS_DEBUG + darray_exit(&trans->last_restarted_trace); + #endif ++#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE ++ darray_exit(&trans->trans_kmalloc_trace); ++#endif + + unsigned long *paths_allocated = trans->paths_allocated; + trans->paths_allocated = NULL; +@@ -3500,13 +3635,12 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, + struct btree_bkey_cached_common *b) + { + struct six_lock_count c = six_lock_counts(&b->lock); +- struct task_struct *owner; + pid_t pid; + +- rcu_read_lock(); +- owner = READ_ONCE(b->lock.owner); +- pid = owner ? owner->pid : 0; +- rcu_read_unlock(); ++ scoped_guard(rcu) { ++ struct task_struct *owner = READ_ONCE(b->lock.owner); ++ pid = owner ? owner->pid : 0; ++ } + + prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); + bch2_btree_id_to_text(out, b->btree_id); +@@ -3535,12 +3669,12 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) + prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn); + + /* trans->paths is rcu protected vs. 
freeing */ +- rcu_read_lock(); +- out->atomic++; ++ guard(rcu)(); ++ guard(printbuf_atomic)(out); + + struct btree_path *paths = rcu_dereference(trans->paths); + if (!paths) +- goto out; ++ return; + + unsigned long *paths_allocated = trans_paths_allocated(paths); + +@@ -3576,9 +3710,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) + bch2_btree_bkey_cached_common_to_text(out, b); + prt_newline(out); + } +-out: +- --out->atomic; +- rcu_read_unlock(); + } + + void bch2_fs_btree_iter_exit(struct bch_fs *c) +@@ -3608,6 +3739,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) + for (s = c->btree_transaction_stats; + s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); + s++) { ++#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE ++ darray_exit(&s->trans_kmalloc_trace); ++#endif + kfree(s->max_paths_text); + bch2_time_stats_exit(&s->lock_hold_times); + } +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 9d2cccf5d21a..b117cb5d7f94 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -46,9 +46,11 @@ static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path + return --path->ref == 0; + } + +-static inline void btree_path_set_dirty(struct btree_path *path, ++static inline void btree_path_set_dirty(struct btree_trans *trans, ++ struct btree_path *path, + enum btree_path_uptodate u) + { ++ BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); + path->uptodate = max_t(unsigned, path->uptodate, u); + } + +@@ -233,12 +235,14 @@ bch2_btree_path_set_pos(struct btree_trans *trans, + + int __must_check bch2_btree_path_traverse_one(struct btree_trans *, + btree_path_idx_t, +- unsigned, unsigned long); ++ enum btree_iter_update_trigger_flags, ++ unsigned long); + + static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *); + + static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, +- btree_path_idx_t path, unsigned flags) ++ btree_path_idx_t path, ++ enum btree_iter_update_trigger_flags flags) + { + bch2_trans_verify_not_unlocked_or_in_restart(trans); + +@@ -249,7 +253,9 @@ static inline int __must_check bch2_btree_path_traverse(struct btree_trans *tran + } + + btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, +- unsigned, unsigned, unsigned, unsigned long); ++ unsigned, unsigned, ++ enum btree_iter_update_trigger_flags, ++ unsigned long); + btree_path_idx_t bch2_path_get_unlocked_mut(struct btree_trans *, enum btree_id, + unsigned, struct bpos); + +@@ -285,14 +291,23 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex + : __bch2_trans_mutex_lock(trans, lock); + } + +-#ifdef CONFIG_BCACHEFS_DEBUG +-void bch2_trans_verify_paths(struct btree_trans *); +-void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); +-#else +-static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} +-static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, +- struct bpos pos) {} +-#endif ++/* Debug: */ ++ ++void __bch2_trans_verify_paths(struct btree_trans *); ++void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); ++ ++static inline void bch2_trans_verify_paths(struct btree_trans *trans) ++{ ++ if (static_branch_unlikely(&bch2_debug_check_iterators)) ++ __bch2_trans_verify_paths(trans); ++} ++ ++static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree, ++ struct 
bpos pos) ++{ ++ if (static_branch_unlikely(&bch2_debug_check_iterators)) ++ __bch2_assert_pos_locked(trans, btree, pos); ++} + + void bch2_btree_path_fix_key_modified(struct btree_trans *trans, + struct btree *, struct bkey_packed *); +@@ -393,37 +408,36 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct + void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); + void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); + +-int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); +-int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); ++int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); ++int __must_check bch2_btree_iter_traverse(struct btree_iter *); + +-struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *); +-struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *); +-struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *); ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *); ++struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); ++struct btree *bch2_btree_iter_next_node(struct btree_iter *); + +-struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos); +-struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos); ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + +-static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans, +- struct btree_iter *iter) ++static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + { +- return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX); ++ return bch2_btree_iter_peek_max(iter, SPOS_MAX); + } + +-struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos); ++struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos); + +-static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter) ++static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + { +- return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN); ++ return bch2_btree_iter_peek_prev_min(iter, POS_MIN); + } + +-struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); + +-struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *); +-struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *); +-struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); + +-bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *); +-bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *); ++bool bch2_btree_iter_advance(struct btree_iter *); ++bool bch2_btree_iter_rewind(struct btree_iter *); + + static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + { +@@ -434,9 +448,10 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo + iter->k.size = 0; + } + +-static inline void 
bch2_btree_iter_set_pos(struct btree_trans *trans, +- struct btree_iter *iter, struct bpos new_pos) ++static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) + { ++ struct btree_trans *trans = iter->trans; ++ + if (unlikely(iter->update_path)) + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_intent); +@@ -454,22 +469,21 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it + iter->pos = bkey_start_pos(&iter->k); + } + +-static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans, +- struct btree_iter *iter, u32 snapshot) ++static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) + { + struct bpos pos = iter->pos; + + iter->snapshot = snapshot; + pos.snapshot = snapshot; +- bch2_btree_iter_set_pos(trans, iter, pos); ++ bch2_btree_iter_set_pos(iter, pos); + } + +-void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); ++void bch2_trans_iter_exit(struct btree_iter *); + +-static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, +- unsigned btree_id, +- unsigned level, +- unsigned flags) ++static inline enum btree_iter_update_trigger_flags ++bch2_btree_iter_flags(struct btree_trans *trans, ++ unsigned btree_id, unsigned level, ++ enum btree_iter_update_trigger_flags flags) + { + if (level || !btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_cached; +@@ -497,15 +511,16 @@ static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, + + static inline void bch2_trans_iter_init_common(struct btree_trans *trans, + struct btree_iter *iter, +- unsigned btree_id, struct bpos pos, ++ enum btree_id btree, struct bpos pos, + unsigned locks_want, + unsigned depth, +- unsigned flags, ++ enum btree_iter_update_trigger_flags flags, + unsigned long ip) + { ++ iter->trans = trans; + iter->update_path = 0; + iter->key_cache_path = 0; +- iter->btree_id = btree_id; ++ iter->btree_id = btree; + iter->min_depth = 0; + iter->flags = flags; + iter->snapshot = pos.snapshot; +@@ -515,99 +530,156 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, + #ifdef CONFIG_BCACHEFS_DEBUG + iter->ip_allocated = ip; + #endif +- iter->path = bch2_path_get(trans, btree_id, iter->pos, +- locks_want, depth, flags, ip); ++ iter->path = bch2_path_get(trans, btree, iter->pos, locks_want, depth, flags, ip); + } + + void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, +- enum btree_id, struct bpos, unsigned); ++ enum btree_id, struct bpos, ++ enum btree_iter_update_trigger_flags, ++ unsigned long ip); + + static inline void bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, +- unsigned btree_id, struct bpos pos, +- unsigned flags) ++ enum btree_id btree, struct bpos pos, ++ enum btree_iter_update_trigger_flags flags) + { +- if (__builtin_constant_p(btree_id) && ++ if (__builtin_constant_p(btree) && + __builtin_constant_p(flags)) +- bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, +- bch2_btree_iter_flags(trans, btree_id, 0, flags), +- _THIS_IP_); ++ bch2_trans_iter_init_common(trans, iter, btree, pos, 0, 0, ++ bch2_btree_iter_flags(trans, btree, 0, flags), ++ _RET_IP_); + else +- bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); ++ bch2_trans_iter_init_outlined(trans, iter, btree, pos, flags, _RET_IP_); + } + ++#define bch2_trans_iter_class_init(_trans, _btree, _pos, _flags) \ ++({ \ ++ struct btree_iter iter; \ ++ bch2_trans_iter_init(_trans, &iter, (_btree), 
(_pos), (_flags)); \ ++ iter; \ ++}) ++ ++DEFINE_CLASS(btree_iter, struct btree_iter, ++ bch2_trans_iter_exit(&_T), ++ bch2_trans_iter_class_init(trans, btree, pos, flags), ++ struct btree_trans *trans, ++ enum btree_id btree, struct bpos pos, ++ enum btree_iter_update_trigger_flags flags); ++ + void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos, +- unsigned, unsigned, unsigned); +-void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *); ++ unsigned, unsigned, ++ enum btree_iter_update_trigger_flags); + +-void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *); ++void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); + +-void *__bch2_trans_kmalloc(struct btree_trans *, size_t); ++void bch2_set_btree_iter_dontneed(struct btree_iter *); + +-/** +- * bch2_trans_kmalloc - allocate memory for use by the current transaction +- * +- * Must be called after bch2_trans_begin, which on second and further calls +- * frees all memory allocated in this transaction +- */ +-static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) ++#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE ++void bch2_trans_kmalloc_trace_to_text(struct printbuf *, ++ darray_trans_kmalloc_trace *); ++#endif ++ ++void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long); ++ ++static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size, ++ unsigned long ip) ++{ ++#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE ++ darray_push(&trans->trans_kmalloc_trace, ++ ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size })); ++#endif ++} ++ ++static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size, ++ unsigned long ip) + { + size = roundup(size, 8); + ++ bch2_trans_kmalloc_trace(trans, size, ip); ++ + if (likely(trans->mem_top + size <= trans->mem_bytes)) { + void *p = trans->mem + trans->mem_top; + + trans->mem_top += size; +- memset(p, 0, size); + return p; + } else { +- return __bch2_trans_kmalloc(trans, size); ++ return __bch2_trans_kmalloc(trans, size, ip); + } + } + +-static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) ++static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size, ++ unsigned long ip) + { +- size = round_up(size, 8); ++ size = roundup(size, 8); ++ ++ bch2_trans_kmalloc_trace(trans, size, ip); + + if (likely(trans->mem_top + size <= trans->mem_bytes)) { + void *p = trans->mem + trans->mem_top; + + trans->mem_top += size; ++ memset(p, 0, size); + return p; + } else { +- return __bch2_trans_kmalloc(trans, size); ++ return __bch2_trans_kmalloc(trans, size, ip); + } + } + ++/** ++ * bch2_trans_kmalloc - allocate memory for use by the current transaction ++ * ++ * Must be called after bch2_trans_begin, which on second and further calls ++ * frees all memory allocated in this transaction ++ */ ++static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) ++{ ++ return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_); ++} ++ ++static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) ++{ ++ return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_); ++} ++ + static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, + struct btree_iter *iter, +- unsigned btree_id, struct bpos pos, +- unsigned flags, unsigned type) ++ enum btree_id btree, struct bpos pos, ++ enum 
btree_iter_update_trigger_flags flags, ++ enum bch_bkey_type type) + { + struct bkey_s_c k; + +- bch2_trans_iter_init(trans, iter, btree_id, pos, flags); +- k = bch2_btree_iter_peek_slot(trans, iter); ++ bch2_trans_iter_init(trans, iter, btree, pos, flags); ++ k = bch2_btree_iter_peek_slot(iter); + + if (!bkey_err(k) && type && k.k->type != type) +- k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); ++ k = bkey_s_c_err(bch_err_throw(trans->c, ENOENT_bkey_type_mismatch)); + if (unlikely(bkey_err(k))) +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + return k; + } + + static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, + struct btree_iter *iter, +- unsigned btree_id, struct bpos pos, +- unsigned flags) ++ enum btree_id btree, struct bpos pos, ++ enum btree_iter_update_trigger_flags flags) + { +- return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0); ++ return __bch2_bkey_get_iter(trans, iter, btree, pos, flags, 0); + } + +-#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ +- bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ +- _btree_id, _pos, _flags, KEY_TYPE_##_type)) ++static inline struct bkey_s_c __bch2_bkey_get_typed(struct btree_iter *iter, ++ enum bch_bkey_type type) ++{ ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ++ ++ if (!bkey_err(k) && type && k.k->type != type) ++ k = bkey_s_c_err(bch_err_throw(iter->trans->c, ENOENT_bkey_type_mismatch)); ++ return k; ++} ++ ++#define bch2_bkey_get_typed(_iter, _type) \ ++ bkey_s_c_to_##_type(__bch2_bkey_get_typed(_iter, KEY_TYPE_##_type)) + + static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k) + { +@@ -624,18 +696,16 @@ do { \ + } while (0) + + static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, +- unsigned btree_id, struct bpos pos, +- unsigned flags, unsigned type, ++ enum btree_id btree, struct bpos pos, ++ enum btree_iter_update_trigger_flags flags, ++ enum bch_bkey_type type, + unsigned val_size, void *val) + { +- struct btree_iter iter; +- struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); ++ CLASS(btree_iter, iter)(trans, btree, pos, flags); ++ struct bkey_s_c k = __bch2_bkey_get_typed(&iter, type); + int ret = bkey_err(k); +- if (!ret) { ++ if (!ret) + __bkey_val_copy(val, val_size, k); +- bch2_trans_iter_exit(trans, &iter); +- } +- + return ret; + } + +@@ -658,17 +728,17 @@ u32 bch2_trans_begin(struct btree_trans *); + int _ret3 = 0; \ + do { \ + _ret3 = lockrestart_do((_trans), ({ \ +- struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\ ++ struct btree *_b = bch2_btree_iter_peek_node(&_iter); \ + if (!_b) \ + break; \ + \ + PTR_ERR_OR_ZERO(_b) ?: (_do); \ + })) ?: \ + lockrestart_do((_trans), \ +- PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\ ++ PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \ + } while (!_ret3); \ + \ +- bch2_trans_iter_exit((_trans), &(_iter)); \ ++ bch2_trans_iter_exit(&(_iter)); \ + _ret3; \ + }) + +@@ -677,34 +747,31 @@ u32 bch2_trans_begin(struct btree_trans *); + __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + 0, 0, _flags, _b, _do) + +-static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans, +- struct btree_iter *iter, +- unsigned flags) ++static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, ++ enum btree_iter_update_trigger_flags flags) + { +- return flags & BTREE_ITER_slots ? 
bch2_btree_iter_peek_slot(trans, iter) : +- bch2_btree_iter_peek_prev(trans, iter); ++ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : ++ bch2_btree_iter_peek_prev(iter); + } + +-static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans, +- struct btree_iter *iter, +- unsigned flags) ++static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, ++ enum btree_iter_update_trigger_flags flags) + { +- return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) : +- bch2_btree_iter_peek(trans, iter); ++ return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : ++ bch2_btree_iter_peek(iter); + } + +-static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans, +- struct btree_iter *iter, ++static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter, + struct bpos end, +- unsigned flags) ++ enum btree_iter_update_trigger_flags flags) + { + if (!(flags & BTREE_ITER_slots)) +- return bch2_btree_iter_peek_max(trans, iter, end); ++ return bch2_btree_iter_peek_max(iter, end); + + if (bkey_gt(iter->pos, end)) + return bkey_s_c_null; + +- return bch2_btree_iter_peek_slot(trans, iter); ++ return bch2_btree_iter_peek_slot(iter); + } + + int __bch2_btree_trans_too_many_iters(struct btree_trans *); +@@ -760,7 +827,7 @@ transaction_restart: \ + if (!_ret2) \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + \ +- _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ ++ _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ + }) + + #define for_each_btree_key_max_continue(_trans, _iter, \ +@@ -771,62 +838,52 @@ transaction_restart: \ + \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ +- (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \ ++ (_k) = bch2_btree_iter_peek_max_type(&(_iter), \ + _end, (_flags)); \ + if (!(_k).k) \ + break; \ + \ + bkey_err(_k) ?: (_do); \ + })); \ +- } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ ++ } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ + \ +- bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ + }) + + #define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ + for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) + +-#define for_each_btree_key_max(_trans, _iter, _btree_id, \ +- _start, _end, _flags, _k, _do) \ +-({ \ +- bch2_trans_begin(trans); \ +- \ +- struct btree_iter _iter; \ +- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ +- (_start), (_flags)); \ +- \ +- for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\ ++#define for_each_btree_key_max(_trans, _iter, _btree_id, \ ++ _start, _end, _flags, _k, _do) \ ++({ \ ++ bch2_trans_begin(trans); \ ++ \ ++ CLASS(btree_iter, _iter)((_trans), (_btree_id), (_start), (_flags)); \ ++ for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do); \ + }) + +-#define for_each_btree_key(_trans, _iter, _btree_id, \ +- _start, _flags, _k, _do) \ +- for_each_btree_key_max(_trans, _iter, _btree_id, _start, \ +- SPOS_MAX, _flags, _k, _do) ++#define for_each_btree_key(_trans, _iter, _btree_id, _start, _flags, _k, _do) \ ++ for_each_btree_key_max(_trans, _iter, _btree_id, _start, SPOS_MAX, _flags, _k, _do) + +-#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ +- _start, _flags, _k, _do) \ +-({ \ +- struct btree_iter _iter; \ +- struct bkey_s_c _k; \ +- int _ret3 = 0; \ +- \ +- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ +- (_start), (_flags)); \ +- \ +- 
do { \ +- _ret3 = lockrestart_do(_trans, ({ \ +- (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \ +- (_flags)); \ +- if (!(_k).k) \ +- break; \ +- \ +- bkey_err(_k) ?: (_do); \ +- })); \ +- } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \ +- \ +- bch2_trans_iter_exit((_trans), &(_iter)); \ +- _ret3; \ ++#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _do) \ ++({ \ ++ int _ret3 = 0; \ ++ \ ++ CLASS(btree_iter, iter)((_trans), (_btree_id), (_start), (_flags)); \ ++ \ ++ do { \ ++ _ret3 = lockrestart_do(_trans, ({ \ ++ struct bkey_s_c _k = \ ++ bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ ++ if (!(_k).k) \ ++ break; \ ++ \ ++ bkey_err(_k) ?: (_do); \ ++ })); \ ++ } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \ ++ \ ++ _ret3; \ + }) + + #define for_each_btree_key_commit(_trans, _iter, _btree_id, \ +@@ -853,38 +910,36 @@ transaction_restart: \ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + +-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, +- struct btree_iter *); +- +-#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ +- _start, _end, _flags, _k, _ret) \ +- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ +- (_start), (_flags)); \ +- (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\ +- !((_ret) = bkey_err(_k)) && (_k).k; \ +- bch2_btree_iter_advance(_trans, &(_iter))) +- +-#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\ +- for (; \ +- (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \ +- !((_ret) = bkey_err(_k)) && (_k).k; \ +- bch2_btree_iter_advance(_trans, &(_iter))) +- +-#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ +- _start, _flags, _k, _ret) \ +- for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ ++struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); ++ ++#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ ++ _start, _end, _flags, _k, _ret) \ ++ for (CLASS(btree_iter, _iter)((_trans), (_btree_id), (_start), (_flags)); \ ++ (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _ret) \ ++ for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start, \ + SPOS_MAX, _flags, _k, _ret) + +-#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ +- _start, _flags, _k, _ret) \ +- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ +- (_start), (_flags)); \ +- (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \ +- !((_ret) = bkey_err(_k)) && (_k).k; \ +- bch2_btree_iter_rewind(_trans, &(_iter))) ++#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret) \ ++ for (; \ ++ (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_advance(&(_iter))) ++ ++#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ ++ for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) + +-#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \ +- for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret) ++#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ ++ 
_start, _flags, _k, _ret) \ ++ for (CLASS(btree_iter, _iter)((_trans), (_btree_id), \ ++ (_start), (_flags)); \ ++ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ bch2_btree_iter_rewind(&(_iter))) + + /* + * This should not be used in a fastpath, without first trying _do in +@@ -922,16 +977,20 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, + _p; \ + }) + +-#define bch2_trans_run(_c, _do) \ ++#define allocate_dropping_locks_norelock(_trans, _lock_dropped, _do) \ + ({ \ +- struct btree_trans *trans = bch2_trans_get(_c); \ +- int _ret = (_do); \ +- bch2_trans_put(trans); \ +- _ret; \ ++ gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ ++ typeof(_do) _p = _do; \ ++ _lock_dropped = false; \ ++ if (unlikely(!_p)) { \ ++ bch2_trans_unlock(_trans); \ ++ _lock_dropped = true; \ ++ _gfp = GFP_KERNEL; \ ++ _p = _do; \ ++ } \ ++ _p; \ + }) + +-#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) +- + struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); + void bch2_trans_put(struct btree_trans *); + +@@ -949,6 +1008,33 @@ unsigned bch2_trans_get_fn_idx(const char *); + __bch2_trans_get(_c, trans_fn_idx); \ + }) + ++/* ++ * We don't use DEFINE_CLASS() because using a function for the constructor ++ * breaks bch2_trans_get()'s use of __func__ ++ */ ++typedef struct btree_trans * class_btree_trans_t; ++static inline void class_btree_trans_destructor(struct btree_trans **p) ++{ ++ struct btree_trans *trans = *p; ++ bch2_trans_put(trans); ++} ++ ++#define class_btree_trans_constructor(_c) bch2_trans_get(_c) ++ ++/* deprecated, prefer CLASS(btree_trans) */ ++#define bch2_trans_run(_c, _do) \ ++({ \ ++ CLASS(btree_trans, trans)(_c); \ ++ (_do); \ ++}) ++ ++/* deprecated, prefer CLASS(btree_trans) */ ++#define bch2_trans_do(_c, _do) \ ++({ \ ++ CLASS(btree_trans, trans)(_c); \ ++ lockrestart_do(trans, _do); \ ++}) ++ + void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); + + void bch2_fs_btree_iter_exit(struct bch_fs *); +diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c +index ade3b5addd75..24f2fbe84ad7 100644 +--- a/fs/bcachefs/btree_journal_iter.c ++++ b/fs/bcachefs/btree_journal_iter.c +@@ -5,6 +5,7 @@ + #include "bset.h" + #include "btree_cache.h" + #include "btree_journal_iter.h" ++#include "disk_accounting.h" + #include "journal_io.h" + + #include +@@ -137,12 +138,15 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b + struct journal_key *k; + + BUG_ON(*idx > keys->nr); ++ ++ if (!keys->nr) ++ return NULL; + search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + +- while (*idx && +- __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { ++ while (*idx < keys->nr && ++ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx)) >= 0) { + (*idx)++; + iters++; + if (iters == 10) { +@@ -151,18 +155,23 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b + } + } + ++ if (*idx == keys->nr) ++ --(*idx); ++ + struct bkey_i *ret = NULL; + rcu_read_lock(); /* for overwritten_ranges */ + +- while ((k = *idx < keys->nr ? 
idx_to_key(keys, *idx) : NULL)) { ++ while (true) { ++ k = idx_to_key(keys, *idx); + if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) + break; + + if (k->overwritten) { + if (k->overwritten_range) +- *idx = rcu_dereference(k->overwritten_range)->start - 1; +- else +- *idx -= 1; ++ *idx = rcu_dereference(k->overwritten_range)->start; ++ if (!*idx) ++ break; ++ --(*idx); + continue; + } + +@@ -171,6 +180,8 @@ struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id b + break; + } + ++ if (!*idx) ++ break; + --(*idx); + iters++; + if (iters == 10) { +@@ -268,12 +279,23 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + + if (idx < keys->size && + journal_key_cmp(&n, &keys->data[idx]) == 0) { ++ struct bkey_i *o = keys->data[idx].k; ++ ++ if (k->k.type == KEY_TYPE_accounting && ++ o->k.type == KEY_TYPE_accounting) { ++ if (!keys->data[idx].allocated) ++ goto insert; ++ ++ bch2_accounting_accumulate(bkey_i_to_accounting(k), ++ bkey_i_to_s_c_accounting(o)); ++ } ++ + if (keys->data[idx].allocated) + kfree(keys->data[idx].k); + keys->data[idx] = n; + return 0; + } +- ++insert: + if (idx > keys->gap) + idx -= keys->size - keys->nr; + +@@ -292,7 +314,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + if (!new_keys.data) { + bch_err(c, "%s: error allocating new key array (size %zu)", + __func__, new_keys.size); +- return -BCH_ERR_ENOMEM_journal_key_insert; ++ return bch_err_throw(c, ENOMEM_journal_key_insert); + } + + /* Since @keys was full, there was no gap: */ +@@ -331,7 +353,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + + n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); + if (!n) +- return -BCH_ERR_ENOMEM_journal_key_insert; ++ return bch_err_throw(c, ENOMEM_journal_key_insert); + + bkey_copy(n, k); + ret = bch2_journal_key_insert_take(c, id, level, n); +@@ -440,9 +462,8 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + keys->data[idx].level == level && + bpos_eq(keys->data[idx].k->k.p, pos) && + !keys->data[idx].overwritten) { +- mutex_lock(&keys->overwrite_lock); ++ guard(mutex)(&keys->overwrite_lock); + __bch2_journal_key_overwritten(keys, idx); +- mutex_unlock(&keys->overwrite_lock); + } + } + +@@ -457,11 +478,9 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) + + static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) + { +- struct bkey_s_c ret = bkey_s_c_null; +- + journal_iter_verify(iter); + +- rcu_read_lock(); ++ guard(rcu)(); + while (iter->idx < iter->keys->size) { + struct journal_key *k = iter->keys->data + iter->idx; + +@@ -470,19 +489,16 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) + break; + BUG_ON(cmp); + +- if (!k->overwritten) { +- ret = bkey_i_to_s_c(k->k); +- break; +- } ++ if (!k->overwritten) ++ return bkey_i_to_s_c(k->k); + + if (k->overwritten_range) + iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); + else + bch2_journal_iter_advance(iter); + } +- rcu_read_unlock(); + +- return ret; ++ return bkey_s_c_null; + } + + static void bch2_journal_iter_exit(struct journal_iter *iter) +@@ -646,10 +662,11 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) + { + const struct journal_key *l = _l; + const struct journal_key *r = _r; ++ int rewind = l->rewind && r->rewind ? 
-1 : 1; + + return journal_key_cmp(l, r) ?: +- cmp_int(l->journal_seq, r->journal_seq) ?: +- cmp_int(l->journal_offset, r->journal_offset); ++ ((cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->journal_offset, r->journal_offset)) * rewind); + } + + void bch2_journal_keys_put(struct bch_fs *c) +@@ -718,6 +735,8 @@ int bch2_journal_keys_sort(struct bch_fs *c) + struct journal_keys *keys = &c->journal_keys; + size_t nr_read = 0; + ++ u64 rewind_seq = c->opts.journal_rewind ?: U64_MAX; ++ + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + +@@ -726,28 +745,43 @@ int bch2_journal_keys_sort(struct bch_fs *c) + + cond_resched(); + +- for_each_jset_key(k, entry, &i->j) { +- struct journal_key n = (struct journal_key) { +- .btree_id = entry->btree_id, +- .level = entry->level, +- .k = k, +- .journal_seq = le64_to_cpu(i->j.seq), +- .journal_offset = k->_data - i->j._data, +- }; +- +- if (darray_push(keys, n)) { +- __journal_keys_sort(keys); +- +- if (keys->nr * 8 > keys->size * 7) { +- bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", +- keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); +- return -BCH_ERR_ENOMEM_journal_keys_sort; ++ vstruct_for_each(&i->j, entry) { ++ bool rewind = !entry->level && ++ !btree_id_is_alloc(entry->btree_id) && ++ le64_to_cpu(i->j.seq) >= rewind_seq; ++ ++ if (entry->type != (rewind ++ ? BCH_JSET_ENTRY_overwrite ++ : BCH_JSET_ENTRY_btree_keys)) ++ continue; ++ ++ if (!rewind && le64_to_cpu(i->j.seq) < c->journal_replay_seq_start) ++ continue; ++ ++ jset_entry_for_each_key(entry, k) { ++ struct journal_key n = (struct journal_key) { ++ .btree_id = entry->btree_id, ++ .level = entry->level, ++ .rewind = rewind, ++ .k = k, ++ .journal_seq = le64_to_cpu(i->j.seq), ++ .journal_offset = k->_data - i->j._data, ++ }; ++ ++ if (darray_push(keys, n)) { ++ __journal_keys_sort(keys); ++ ++ if (keys->nr * 8 > keys->size * 7) { ++ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", ++ keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); ++ return bch_err_throw(c, ENOMEM_journal_keys_sort); ++ } ++ ++ BUG_ON(darray_push(keys, n)); + } + +- BUG_ON(darray_push(keys, n)); ++ nr_read++; + } +- +- nr_read++; + } + } + +@@ -780,7 +814,7 @@ void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree, + void bch2_journal_keys_dump(struct bch_fs *c) + { + struct journal_keys *keys = &c->journal_keys; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + pr_info("%zu keys:", keys->nr); + +@@ -794,7 +828,6 @@ void bch2_journal_keys_dump(struct bch_fs *c) + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); + pr_err("%s", buf.buf); + } +- printbuf_exit(&buf); + } + + void bch2_fs_journal_keys_init(struct bch_fs *c) +diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h +index 8b773823704f..86aacb254fb2 100644 +--- a/fs/bcachefs/btree_journal_iter_types.h ++++ b/fs/bcachefs/btree_journal_iter_types.h +@@ -11,8 +11,9 @@ struct journal_key { + u32 journal_offset; + enum btree_id btree_id:8; + unsigned level:8; +- bool allocated; +- bool overwritten; ++ bool allocated:1; ++ bool overwritten:1; ++ bool rewind:1; + struct journal_key_range_overwritten __rcu * + overwritten_range; + struct bkey_i *k; +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 669825f89cdd..e3336ab27ccc 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ 
b/fs/bcachefs/btree_key_cache.c +@@ -13,6 +13,7 @@ + #include "trace.h" + + #include ++#include + + static inline bool btree_uses_pcpu_readers(enum btree_id id) + { +@@ -101,8 +102,8 @@ static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu + kmem_cache_free(bch2_key_cache, ck); + } + +-static void bkey_cached_free(struct btree_key_cache *bc, +- struct bkey_cached *ck) ++static inline void bkey_cached_free_noassert(struct btree_key_cache *bc, ++ struct bkey_cached *ck) + { + kfree(ck->k); + ck->k = NULL; +@@ -116,6 +117,19 @@ static void bkey_cached_free(struct btree_key_cache *bc, + this_cpu_inc(*bc->nr_pending); + } + ++static void bkey_cached_free(struct btree_trans *trans, ++ struct btree_key_cache *bc, ++ struct bkey_cached *ck) ++{ ++ /* ++ * we'll hit strange issues in the SRCU code if we aren't holding an ++ * SRCU read lock... ++ */ ++ EBUG_ON(!trans->srcu_held); ++ ++ bkey_cached_free_noassert(bc, ck); ++} ++ + static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) + { + gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; +@@ -174,27 +188,23 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k + static struct bkey_cached * + bkey_cached_reuse(struct btree_key_cache *c) + { +- struct bucket_table *tbl; ++ ++ guard(rcu)(); ++ struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table); + struct rhash_head *pos; + struct bkey_cached *ck; +- unsigned i; + +- rcu_read_lock(); +- tbl = rht_dereference_rcu(c->table.tbl, &c->table); +- for (i = 0; i < tbl->size; i++) ++ for (unsigned i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bkey_cached_lock_for_evict(ck)) { + if (bkey_cached_evict(c, ck)) +- goto out; ++ return ck; + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + } + } +- ck = NULL; +-out: +- rcu_read_unlock(); +- return ck; ++ return NULL; + } + + static int btree_key_cache_create(struct btree_trans *trans, +@@ -229,7 +239,7 @@ static int btree_key_cache_create(struct btree_trans *trans, + if (unlikely(!ck)) { + bch_err(c, "error allocating memory for key cache item, btree %s", + bch2_btree_id_str(ck_path->btree_id)); +- return -BCH_ERR_ENOMEM_btree_key_cache_create; ++ return bch_err_throw(c, ENOMEM_btree_key_cache_create); + } + } + +@@ -244,11 +254,13 @@ static int btree_key_cache_create(struct btree_trans *trans, + + struct bkey_i *new_k = allocate_dropping_locks(trans, ret, + kmalloc(key_u64s * sizeof(u64), _gfp)); +- if (unlikely(!new_k)) { ++ if (unlikely(!new_k && !ret)) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_id_str(ck->key.btree_id), key_u64s); +- ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; +- } else if (ret) { ++ ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill); ++ } ++ ++ if (unlikely(ret)) { + kfree(new_k); + goto err; + } +@@ -281,7 +293,7 @@ static int btree_key_cache_create(struct btree_trans *trans, + ck_path->uptodate = BTREE_ITER_UPTODATE; + return 0; + err: +- bkey_cached_free(bc, ck); ++ bkey_cached_free(trans, bc, ck); + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); + + return ret; +@@ -291,13 +303,12 @@ static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans + struct btree_path *ck_path, + struct bkey_s_c k) + { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_bpos_to_text(&buf, ck_path->pos); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, trans->c, 
k); + trace_key_cache_fill(trans, buf.buf); +- printbuf_exit(&buf); + } + + static noinline int btree_key_cache_fill(struct btree_trans *trans, +@@ -312,19 +323,17 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, + } + + struct bch_fs *c = trans->c; +- struct btree_iter iter; +- struct bkey_s_c k; +- int ret; + +- bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, +- BTREE_ITER_intent| +- BTREE_ITER_key_cache_fill| +- BTREE_ITER_cached_nofill); ++ CLASS(btree_iter, iter)(trans, ck_path->btree_id, ck_path->pos, ++ BTREE_ITER_intent| ++ BTREE_ITER_nofilter_whiteouts| ++ BTREE_ITER_key_cache_fill| ++ BTREE_ITER_cached_nofill); + iter.flags &= ~BTREE_ITER_with_journal; +- k = bch2_btree_iter_peek_slot(trans, &iter); +- ret = bkey_err(k); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); ++ int ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + /* Recheck after btree lookup, before allocating: */ + ck_path = trans->paths + ck_path_idx; +@@ -334,15 +343,13 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, + + ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); + if (ret) +- goto err; ++ return ret; + + if (trace_key_cache_fill_enabled()) + do_trace_key_cache_fill(trans, ck_path, k); + out: + /* We're not likely to need this iterator again: */ +- bch2_set_btree_iter_dontneed(trans, &iter); +-err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_set_btree_iter_dontneed(&iter); + return ret; + } + +@@ -398,7 +405,7 @@ int bch2_btree_path_traverse_cached(struct btree_trans *trans, + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(ret); + } +- } else { ++ } else if (!(flags & BTREE_ITER_cached_nofill)) { + BUG_ON(path->uptodate); + BUG_ON(!path->nodes_locked); + } +@@ -414,35 +421,34 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; +- struct btree_iter c_iter, b_iter; + struct bkey_cached *ck = NULL; + int ret; + +- bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, +- BTREE_ITER_slots| +- BTREE_ITER_intent| +- BTREE_ITER_all_snapshots); +- bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, +- BTREE_ITER_cached| +- BTREE_ITER_intent); ++ CLASS(btree_iter, b_iter)(trans, key.btree_id, key.pos, ++ BTREE_ITER_slots| ++ BTREE_ITER_intent| ++ BTREE_ITER_all_snapshots); ++ CLASS(btree_iter, c_iter)(trans, key.btree_id, key.pos, ++ BTREE_ITER_cached| ++ BTREE_ITER_intent); + b_iter.flags &= ~BTREE_ITER_with_key_cache; + +- ret = bch2_btree_iter_traverse(trans, &c_iter); ++ ret = bch2_btree_iter_traverse(&c_iter); + if (ret) +- goto out; ++ return ret; + + ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b; + if (!ck) +- goto out; ++ return 0; + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + if (evict) + goto evict; +- goto out; ++ return 0; + } + + if (journal_seq && ck->journal.seq != journal_seq) +- goto out; ++ return 0; + + trans->journal_res.seq = ck->journal.seq; + +@@ -459,7 +465,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + !test_bit(JOURNAL_space_low, &c->journal.flags)) + commit_flags |= BCH_TRANS_COMMIT_no_journal_res; + +- struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter); ++ struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); + ret = bkey_err(btree_k); + if (ret) + goto err; +@@ -511,15 +517,13 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + + mark_btree_node_locked_noreset(path, 0, 
BTREE_NODE_UNLOCKED); + if (bkey_cached_evict(&c->btree_key_cache, ck)) { +- bkey_cached_free(&c->btree_key_cache, ck); ++ bkey_cached_free(trans, &c->btree_key_cache, ck); + } else { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + } + } + out: +- bch2_trans_iter_exit(trans, &b_iter); +- bch2_trans_iter_exit(trans, &c_iter); + return ret; + } + +@@ -530,10 +534,10 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, + struct bkey_cached *ck = + container_of(pin, struct bkey_cached, journal); + struct bkey_cached_key key; +- struct btree_trans *trans = bch2_trans_get(c); + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + int ret = 0; + ++ CLASS(btree_trans, trans)(c); + btree_node_lock_nopath_nofail(trans, &ck->c, SIX_LOCK_read); + key = ck->key; + +@@ -556,8 +560,6 @@ int bch2_btree_key_cache_journal_flush(struct journal *j, + BCH_TRANS_COMMIT_journal_reclaim, false)); + unlock: + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); +- +- bch2_trans_put(trans); + return ret; + } + +@@ -571,6 +573,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, + bool kick_reclaim = false; + + BUG_ON(insert->k.u64s > ck->u64s); ++ BUG_ON(bkey_deleted(&insert->k)); + + bkey_copy(ck->k, insert); + +@@ -625,7 +628,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, + } + + bkey_cached_evict(bc, ck); +- bkey_cached_free(bc, ck); ++ bkey_cached_free(trans, bc, ck); + + mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); + +@@ -633,10 +636,17 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, + unsigned i; + trans_for_each_path(trans, path2, i) + if (path2->l[0].b == (void *) ck) { ++ /* ++ * It's safe to clear should_be_locked here because ++ * we're evicting from the key cache, and we still have ++ * the underlying btree locked: filling into the key ++ * cache would require taking a write lock on the btree ++ * node ++ */ ++ path2->should_be_locked = false; + __bch2_btree_path_unlock(trans, path2); + path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); +- path2->should_be_locked = false; +- btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE); ++ btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE); + } + + bch2_trans_verify_locks(trans); +@@ -693,7 +703,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + } else if (!bkey_cached_lock_for_evict(ck)) { + bc->skipped_lock_fail++; + } else if (bkey_cached_evict(bc, ck)) { +- bkey_cached_free(bc, ck); ++ bkey_cached_free_noassert(bc, ck); + bc->freed++; + freed++; + } else { +@@ -799,6 +809,18 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) + { + } + ++static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) ++{ ++ struct bch_fs *c = shrink->private_data; ++ struct btree_key_cache *bc = &c->btree_key_cache; ++ char *cbuf; ++ size_t buflen = seq_buf_get_buf(s, &cbuf); ++ struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); ++ ++ bch2_btree_key_cache_to_text(&out, bc); ++ seq_buf_commit(s, out.pos); ++} ++ + int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) + { + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); +@@ -806,23 +828,24 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) + + bc->nr_pending = alloc_percpu(size_t); + if (!bc->nr_pending) +- return -BCH_ERR_ENOMEM_fs_btree_cache_init; ++ return bch_err_throw(c, ENOMEM_fs_btree_cache_init); + + if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || + 
rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free))
+- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
++ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
+
+ if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params))
+- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
++ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
+
+ bc->table_init_done = true;
+
+ shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
+ if (!shrink)
+- return -BCH_ERR_ENOMEM_fs_btree_cache_init;
++ return bch_err_throw(c, ENOMEM_fs_btree_cache_init);
+ bc->shrink = shrink;
+ shrink->count_objects = bch2_btree_key_cache_count;
+ shrink->scan_objects = bch2_btree_key_cache_scan;
++ shrink->to_text = bch2_btree_key_cache_shrinker_to_text;
+ shrink->batch = 1 << 14;
+ shrink->seeks = 0;
+ shrink->private_data = c;
+diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
+index 94eb2b73a843..38c5643e8a78 100644
+--- a/fs/bcachefs/btree_locking.c
++++ b/fs/bcachefs/btree_locking.c
+@@ -1,6 +1,7 @@
+ // SPDX-License-Identifier: GPL-2.0
+
+ #include "bcachefs.h"
++#include "btree_cache.h"
+ #include "btree_locking.h"
+ #include "btree_types.h"
+
+@@ -158,13 +159,11 @@ static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans
+ count_event(c, trans_restart_would_deadlock);
+
+ if (trace_trans_restart_would_deadlock_enabled()) {
+- struct printbuf buf = PRINTBUF;
++ CLASS(printbuf, buf)();
++ guard(printbuf_atomic)(&buf);
+
+- buf.atomic++;
+ print_cycle(&buf, g);
+-
+ trace_trans_restart_would_deadlock(trans, buf.buf);
+- printbuf_exit(&buf);
+ }
+ }
+
+@@ -193,6 +192,29 @@ static int btree_trans_abort_preference(struct btree_trans *trans)
+ return 3;
+ }
+
++static noinline __noreturn void break_cycle_fail(struct lock_graph *g)
++{
++ CLASS(printbuf, buf)();
++ guard(printbuf_atomic)(&buf);
++
++ prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
++
++ for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) {
++ struct btree_trans *trans = i->trans;
++
++ bch2_btree_trans_to_text(&buf, trans);
++
++ prt_printf(&buf, "backtrace:\n");
++ printbuf_indent_add(&buf, 2);
++ bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
++ printbuf_indent_sub(&buf, 2);
++ prt_newline(&buf);
++ }
++
++ bch2_print_str(g->g->trans->c, KERN_ERR, buf.buf);
++ BUG();
++}
++
+ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
+ struct trans_waiting_for_lock *from)
+ {
+@@ -218,28 +240,8 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle,
+ }
+ }
+
+- if (unlikely(!best)) {
+- struct printbuf buf = PRINTBUF;
+- buf.atomic++;
+-
+- prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks"));
+-
+- for (i = g->g; i < g->g + g->nr; i++) {
+- struct btree_trans *trans = i->trans;
+-
+- bch2_btree_trans_to_text(&buf, trans);
+-
+- prt_printf(&buf, "backtrace:\n");
+- printbuf_indent_add(&buf, 2);
+- bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
+- printbuf_indent_sub(&buf, 2);
+- prt_newline(&buf);
+- }
+-
+- bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf);
+- printbuf_exit(&buf);
+- BUG();
+- }
++ if (unlikely(!best))
++ break_cycle_fail(g);
+
+ ret = abort_lock(g, abort);
+ out:
+@@ -254,15 +256,14 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans,
+ struct printbuf *cycle)
+ {
+ struct btree_trans *orig_trans = g->g->trans;
+- struct trans_waiting_for_lock *i;
+
+- for (i = g->g; i < g->g + g->nr; i++)
++ for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++)
+ if (i->trans == trans) {
+ closure_put(&trans->ref);
+ return break_cycle(g, cycle, i);
+ }
+
+- if (g->nr == ARRAY_SIZE(g->g)) {
++ if (unlikely(g->nr == ARRAY_SIZE(g->g))) {
+ closure_put(&trans->ref);
+
+ if (orig_trans->lock_may_not_fail)
+@@ -307,7 +308,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
+ lock_graph_down(&g, trans);
+
+ /* trans->paths is rcu protected vs. freeing */
+- rcu_read_lock();
++ guard(rcu)();
+ if (cycle)
+ cycle->atomic++;
+ next:
+@@ -405,7 +406,6 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
+ out:
+ if (cycle)
+ --cycle->atomic;
+- rcu_read_unlock();
+ return ret;
+ }
+
+@@ -450,13 +450,13 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+
+ /* relock */
+
+-static inline bool btree_path_get_locks(struct btree_trans *trans,
+- struct btree_path *path,
+- bool upgrade,
+- struct get_locks_fail *f)
++static int btree_path_get_locks(struct btree_trans *trans,
++ struct btree_path *path,
++ bool upgrade,
++ struct get_locks_fail *f,
++ int restart_err)
+ {
+ unsigned l = path->level;
+- int fail_idx = -1;
+
+ do {
+ if (!btree_path_node(path, l))
+@@ -464,39 +464,49 @@ static inline bool btree_path_get_locks(struct btree_trans *trans,
+
+ if (!(upgrade
+ ? bch2_btree_node_upgrade(trans, path, l)
+- : bch2_btree_node_relock(trans, path, l))) {
+- fail_idx = l;
+-
+- if (f) {
+- f->l = l;
+- f->b = path->l[l].b;
+- }
+- }
++ : bch2_btree_node_relock(trans, path, l)))
++ goto err;
+
+ l++;
+ } while (l < path->locks_want);
+
++ if (path->uptodate == BTREE_ITER_NEED_RELOCK)
++ path->uptodate = BTREE_ITER_UPTODATE;
++
++ return path->uptodate < BTREE_ITER_NEED_RELOCK ? 0 : -1;
++err:
++ if (f) {
++ f->l = l;
++ f->b = path->l[l].b;
++ }
++
++ /*
++ * Do transaction restart before unlocking, so we don't pop
++ * should_be_locked asserts
++ */
++ if (restart_err) {
++ btree_trans_restart(trans, restart_err);
++ } else if (path->should_be_locked && !trans->restarted) {
++ if (upgrade)
++ path->locks_want = l;
++ return -1;
++ }
++
++ __bch2_btree_path_unlock(trans, path);
++ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
++
+ /*
+ * When we fail to get a lock, we have to ensure that any child nodes
+ * can't be relocked so bch2_btree_path_traverse has to walk back up to
+ * the node that we failed to relock:
+ */
+- if (fail_idx >= 0) {
+- __bch2_btree_path_unlock(trans, path);
+- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
+-
+- do {
+- path->l[fail_idx].b = upgrade
+- ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
+- : ERR_PTR(-BCH_ERR_no_btree_node_relock);
+- --fail_idx;
+- } while (fail_idx >= 0);
+- }
+-
+- if (path->uptodate == BTREE_ITER_NEED_RELOCK)
+- path->uptodate = BTREE_ITER_UPTODATE;
++ do {
++ path->l[l].b = upgrade
++ ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade)
++ : ERR_PTR(-BCH_ERR_no_btree_node_relock);
++ } while (l--);
+
+- return path->uptodate < BTREE_ITER_NEED_RELOCK;
++ return -restart_err ?: -1;
+ }
+
+ bool __bch2_btree_node_relock(struct btree_trans *trans,
+@@ -583,7 +593,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
+ l++) {
+ if (!bch2_btree_node_relock(trans, path, l)) {
+ __bch2_btree_path_unlock(trans, path);
+- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
++ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
+ trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent);
+ }
+@@ -595,9 +605,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
+ __flatten
+ bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
+ {
+- struct get_locks_fail f;
+-
+- bool ret = btree_path_get_locks(trans, path, false, &f);
++ bool ret = !btree_path_get_locks(trans, path, false, NULL, 0);
+ bch2_trans_verify_locks(trans);
+ return ret;
+ }
+@@ -613,27 +621,37 @@ int __bch2_btree_path_relock(struct btree_trans *trans,
+ return 0;
+ }
+
+-bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans,
+- struct btree_path *path,
+- unsigned new_locks_want,
+- struct get_locks_fail *f)
++bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
++ struct btree_path *path,
++ unsigned new_locks_want)
+ {
+- EBUG_ON(path->locks_want >= new_locks_want);
+-
+ path->locks_want = new_locks_want;
+
+- bool ret = btree_path_get_locks(trans, path, true, f);
+- bch2_trans_verify_locks(trans);
++ /*
++ * If we need it locked, we can't touch it. Otherwise, we can return
++ * success - bch2_path_get() will use this path, and it'll just be
++ * retraversed:
++ */
++ bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) ||
++ !path->should_be_locked;
++
++ bch2_btree_path_verify_locks(trans, path);
+ return ret;
+ }
+
+-bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+- struct btree_path *path,
+- unsigned new_locks_want,
+- struct get_locks_fail *f)
++int __bch2_btree_path_upgrade(struct btree_trans *trans,
++ struct btree_path *path,
++ unsigned new_locks_want)
+ {
+- bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f);
+- if (ret)
++ unsigned old_locks = path->nodes_locked;
++ unsigned old_locks_want = path->locks_want;
++
++ path->locks_want = max_t(unsigned, path->locks_want, new_locks_want);
++
++ struct get_locks_fail f = {};
++ int ret = btree_path_get_locks(trans, path, true, &f,
++ BCH_ERR_transaction_restart_upgrade);
++ if (!ret)
+ goto out;
+
+ /*
+@@ -665,9 +683,29 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
+ linked->btree_id == path->btree_id &&
+ linked->locks_want < new_locks_want) {
+ linked->locks_want = new_locks_want;
+- btree_path_get_locks(trans, linked, true, NULL);
++ btree_path_get_locks(trans, linked, true, NULL, 0);
+ }
+ }
++
++ count_event(trans->c, trans_restart_upgrade);
++ if (trace_trans_restart_upgrade_enabled()) {
++ CLASS(printbuf, buf)();
++
++ prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_);
++ prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id));
++ bch2_bpos_to_text(&buf, path->pos);
++ prt_printf(&buf, "locks want %u -> %u level %u\n",
++ old_locks_want, new_locks_want, f.l);
++ prt_printf(&buf, "nodes_locked %x -> %x\n",
++ old_locks, path->nodes_locked);
++ prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) :
++ !f.b ? "(null)" : "(node)");
++ prt_printf(&buf, "path seq %u node seq %u\n",
++ IS_ERR_OR_NULL(f.b) ? 0 : f.b->c.lock.seq,
++ path->l[f.l].lock_seq);
++
++ trace_trans_restart_upgrade(trans->c, buf.buf);
++ }
+ out:
+ bch2_trans_verify_locks(trans);
+ return ret;
+@@ -699,7 +737,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
+ }
+ }
+
+- bch2_btree_path_verify_locks(path);
++ bch2_btree_path_verify_locks(trans, path);
+
+ trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
+ }
+@@ -728,17 +766,19 @@ static inline void __bch2_trans_unlock(struct btree_trans *trans)
+ __bch2_btree_path_unlock(trans, path);
+ }
+
+-static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
+- struct get_locks_fail *f, bool trace)
++static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path,
++ struct get_locks_fail *f, bool trace, ulong ip)
+ {
+ if (!trace)
+ goto out;
+
+ if (trace_trans_restart_relock_enabled()) {
+- struct printbuf buf = PRINTBUF;
++ CLASS(printbuf, buf)();
+
+ bch2_bpos_to_text(&buf, path->pos);
+- prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq);
++ prt_printf(&buf, " %s l=%u seq=%u node seq=",
++ bch2_btree_id_str(path->btree_id),
++ f->l, path->l[f->l].lock_seq);
+ if (IS_ERR_OR_NULL(f->b)) {
+ prt_str(&buf, bch2_err_str(PTR_ERR(f->b)));
+ } else {
+@@ -752,18 +792,16 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str
+ prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+ }
+
+- trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+- printbuf_exit(&buf);
++ trace_trans_restart_relock(trans, ip, buf.buf);
+ }
+
+ count_event(trans->c, trans_restart_relock);
+ out:
+ __bch2_trans_unlock(trans);
+ bch2_trans_verify_locks(trans);
+- return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
+ }
+
+-static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
++static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace, ulong ip)
+ {
+ bch2_trans_verify_locks(trans);
+
+@@ -777,10 +815,14 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
+
+ trans_for_each_path(trans, path, i) {
+ struct get_locks_fail f;
++ int ret;
+
+ if (path->should_be_locked &&
+- !btree_path_get_locks(trans, path, false, &f))
+- return bch2_trans_relock_fail(trans, path, &f, trace);
++ (ret = btree_path_get_locks(trans, path, false, &f,
++ BCH_ERR_transaction_restart_relock))) {
++ bch2_trans_relock_fail(trans, path, &f, trace, ip);
++ return ret;
++ }
+ }
+
+ trans_set_locked(trans, true);
+@@ -791,26 +833,19 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace)
+
+ int bch2_trans_relock(struct btree_trans *trans)
+ {
+- return __bch2_trans_relock(trans, true);
++ return __bch2_trans_relock(trans, true, _RET_IP_);
+ }
+
+ int bch2_trans_relock_notrace(struct btree_trans *trans)
+ {
+- return __bch2_trans_relock(trans, false);
++ return __bch2_trans_relock(trans, false, _RET_IP_);
+ }
+
+-void bch2_trans_unlock_noassert(struct btree_trans *trans)
++void bch2_trans_unlock(struct btree_trans *trans)
+ {
+- __bch2_trans_unlock(trans);
+-
+ trans_set_unlocked(trans);
+-}
+
+-void bch2_trans_unlock(struct btree_trans *trans)
+-{
+ __bch2_trans_unlock(trans);
+-
+- trans_set_unlocked(trans);
+ }
+
+ void bch2_trans_unlock_long(struct btree_trans *trans)
+@@ -842,32 +877,28 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans,
+
+ /* Debug */
+
+-#ifdef CONFIG_BCACHEFS_DEBUG
+-
+-void bch2_btree_path_verify_locks(struct btree_path *path)
++void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path)
+ {
+- /*
+- * A path may be uptodate and yet have nothing locked if and only if
+- * there is no node at path->level, which generally means we were
+- * iterating over all nodes and got to the end of the btree
+- */
+- BUG_ON(path->uptodate == BTREE_ITER_UPTODATE &&
+- btree_path_node(path, path->level) &&
+- !path->nodes_locked);
++ if (!path->nodes_locked && btree_path_node(path, path->level)) {
++ /*
++ * A path may be uptodate and yet have nothing locked if and only if
++ * there is no node at path->level, which generally means we were
++ * iterating over all nodes and got to the end of the btree
++ */
++ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE);
++ BUG_ON(path->should_be_locked && trans->locked && !trans->restarted);
++ }
+
+ if (!path->nodes_locked)
+ return;
+
+ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) {
+ int want = btree_lock_want(path, l);
+- int have = btree_node_locked_type(path, l);
++ int have = btree_node_locked_type_nowrite(path, l);
+
+ BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED);
+
+- BUG_ON(is_btree_node(path, l) &&
+- (want == BTREE_NODE_UNLOCKED ||
+- have != BTREE_NODE_WRITE_LOCKED) &&
+- want != have);
++ BUG_ON(is_btree_node(path, l) && want != have);
+
+ BUG_ON(btree_node_locked(path, l) &&
+ path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock));
+@@ -885,7 +916,7 @@ static bool bch2_trans_locked(struct btree_trans *trans)
+ return false;
+ }
+
+-void bch2_trans_verify_locks(struct btree_trans *trans)
++void __bch2_trans_verify_locks(struct btree_trans *trans)
+ {
+ if (!trans->locked) {
+ BUG_ON(bch2_trans_locked(trans));
+@@ -896,7 +927,5 @@ void bch2_trans_verify_locks(struct btree_trans *trans)
+ unsigned i;
+
+ trans_for_each_path(trans, path, i)
+- bch2_btree_path_verify_locks(path);
++ __bch2_btree_path_verify_locks(trans, path);
+ }
+-
+-#endif
+diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
+index b33ab7af8440..f2173a3316f4 100644
+--- a/fs/bcachefs/btree_locking.h
++++ b/fs/bcachefs/btree_locking.h
+@@ -15,7 +15,6 @@
+
+ void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp);
+
+-void bch2_trans_unlock_noassert(struct btree_trans *);
+ void bch2_trans_unlock_write(struct btree_trans *);
+
+ static inline bool is_btree_node(struct btree_path *path, unsigned l)
+@@ -44,6 +43,15 @@ static inline int btree_node_locked_type(struct btree_path *path,
+ return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3);
+ }
+
++static inline int btree_node_locked_type_nowrite(struct btree_path *path,
++ unsigned level)
++{
++ int have = btree_node_locked_type(path, level);
++ return have == BTREE_NODE_WRITE_LOCKED
++ ? BTREE_NODE_INTENT_LOCKED
++ : have;
++}
++
+ static inline bool btree_node_write_locked(struct btree_path *path, unsigned l)
+ {
+ return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED;
+@@ -152,7 +160,7 @@ static inline int btree_path_highest_level_locked(struct btree_path *path)
+ static inline void __bch2_btree_path_unlock(struct btree_trans *trans,
+ struct btree_path *path)
+ {
+- btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK);
++ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK);
+
+ while (path->nodes_locked)
+ btree_node_unlock(trans, path, btree_path_lowest_level_locked(path));
+@@ -367,8 +375,8 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
+ struct btree_path *path, unsigned level)
+ {
+ EBUG_ON(btree_node_locked(path, level) &&
+- !btree_node_write_locked(path, level) &&
+- btree_node_locked_type(path, level) != __btree_lock_want(path, level));
++ btree_node_locked_type_nowrite(path, level) !=
++ __btree_lock_want(path, level));
+
+ return likely(btree_node_locked(path, level)) ||
+ (!IS_ERR_OR_NULL(path->l[level].b) &&
+@@ -377,31 +385,29 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
+
+ /* upgrade */
+
+-bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
+- struct btree_path *, unsigned,
+- struct get_locks_fail *);
++bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned);
+
+-bool __bch2_btree_path_upgrade(struct btree_trans *,
+- struct btree_path *, unsigned,
+- struct get_locks_fail *);
++static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans,
++ struct btree_path *path,
++ unsigned new_locks_want)
++{
++ return new_locks_want > path->locks_want
++ ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want)
++ : true;
++}
++
++int __bch2_btree_path_upgrade(struct btree_trans *,
++ struct btree_path *, unsigned);
+
+ static inline int bch2_btree_path_upgrade(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned new_locks_want)
+ {
+- struct get_locks_fail f = {};
+- unsigned old_locks_want = path->locks_want;
+-
+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
+
+- if (path->locks_want < new_locks_want
+- ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f)
+- : path->nodes_locked)
+- return 0;
+-
+- trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path,
+- old_locks_want, new_locks_want, &f);
+- return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade);
++ return likely(path->locks_want >= new_locks_want && path->nodes_locked)
++ ? 0
++ : __bch2_btree_path_upgrade(trans, path, new_locks_want);
+ }
+
+ /* misc: */
+@@ -411,8 +417,10 @@ static inline void btree_path_set_should_be_locked(struct btree_trans *trans, st
+ EBUG_ON(!btree_node_locked(path, path->level));
+ EBUG_ON(path->uptodate);
+
+- path->should_be_locked = true;
+- trace_btree_path_should_be_locked(trans, path);
++ if (!path->should_be_locked) {
++ path->should_be_locked = true;
++ trace_btree_path_should_be_locked(trans, path);
++ }
+ }
+
+ static inline void __btree_path_set_level_up(struct btree_trans *trans,
+@@ -427,7 +435,7 @@ static inline void btree_path_set_level_up(struct btree_trans *trans,
+ struct btree_path *path)
+ {
+ __btree_path_set_level_up(trans, path, path->level++);
+- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
++ btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE);
+ }
+
+ /* debug */
+@@ -439,12 +447,20 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *,
+
+ int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *);
+
+-#ifdef CONFIG_BCACHEFS_DEBUG
+-void bch2_btree_path_verify_locks(struct btree_path *);
+-void bch2_trans_verify_locks(struct btree_trans *);
+-#else
+-static inline void bch2_btree_path_verify_locks(struct btree_path *path) {}
+-static inline void bch2_trans_verify_locks(struct btree_trans *trans) {}
+-#endif
++void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *);
++void __bch2_trans_verify_locks(struct btree_trans *);
++
++static inline void bch2_btree_path_verify_locks(struct btree_trans *trans,
++ struct btree_path *path)
++{
++ if (static_branch_unlikely(&bch2_debug_check_btree_locking))
++ __bch2_btree_path_verify_locks(trans, path);
++}
++
++static inline void bch2_trans_verify_locks(struct btree_trans *trans)
++{
++ if (static_branch_unlikely(&bch2_debug_check_btree_locking))
++ __bch2_trans_verify_locks(trans);
++}
+
+ #endif /* _BCACHEFS_BTREE_LOCKING_H */
+diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
+index 86acf037590c..4b7b5ca74ba1 100644
+--- a/fs/bcachefs/btree_node_scan.c
++++ b/fs/bcachefs/btree_node_scan.c
+@@ -65,49 +65,6 @@ static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_n
+ memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
+ }
+
+-static inline u64 bkey_journal_seq(struct bkey_s_c k)
+-{
+- switch (k.k->type) {
+- case KEY_TYPE_inode_v3:
+- return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_journal_seq);
+- default:
+- return 0;
+- }
+-}
+-
+-static bool found_btree_node_is_readable(struct btree_trans *trans,
+- struct found_btree_node *f)
+-{
+- struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
+-
+- found_btree_node_to_key(&tmp.k, f);
+-
+- struct btree *b = bch2_btree_node_get_noiter(trans, &tmp.k, f->btree_id, f->level, false);
+- bool ret = !IS_ERR_OR_NULL(b);
+- if (!ret)
+- return ret;
+-
+- f->sectors_written = b->written;
+- f->journal_seq = le64_to_cpu(b->data->keys.journal_seq);
+-
+- struct bkey_s_c k;
+- struct bkey unpacked;
+- struct btree_node_iter iter;
+- for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
+- f->journal_seq = max(f->journal_seq, bkey_journal_seq(k));
+-
+- six_unlock_read(&b->c.lock);
+-
+- /*
+- * We might update this node's range; if that happens, we need the node
+- * to be re-read so the read path can trim keys that are no longer in
+- * this node
+- */
+- if (b != btree_node_root(trans->c, b))
+- bch2_btree_node_evict(trans, &tmp.k);
+- return ret;
+-}
+-
+ static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
+ {
+ const struct found_btree_node *l = _l;
+@@ -159,17 +116,17 @@ static const struct min_heap_callbacks found_btree_node_heap_cbs = {
+ };
+
+ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
+- struct bio *bio, struct btree_node *bn, u64 offset)
++ struct btree *b, struct bio *bio, u64 offset)
+ {
+ struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
++ struct btree_node *bn = b->data;
+
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+ bio->bi_iter.bi_sector = offset;
+- bch2_bio_map(bio, bn, PAGE_SIZE);
++ bch2_bio_map(bio, b->data, c->opts.block_size);
+
+ u64 submit_time = local_clock();
+ submit_bio_wait(bio);
+-
+ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
+
+ if (bio->bi_status) {
+@@ -217,18 +174,37 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
+ };
+ rcu_read_unlock();
+
+- if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+- mutex_lock(&f->lock);
++ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
++ bio->bi_iter.bi_sector = offset;
++ bch2_bio_map(bio, b->data, c->opts.btree_node_size);
++
++ submit_time = local_clock();
++ submit_bio_wait(bio);
++ bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status);
++
++ found_btree_node_to_key(&b->key, &n);
++
++ CLASS(printbuf, buf)();
++ if (!bch2_btree_node_read_done(c, ca, b, NULL, &buf)) {
++ /* read_done will swap out b->data for another buffer */
++ bn = b->data;
++ /*
++ * Grab journal_seq here because we want the max journal_seq of
++ * any bset; read_done sorts down to a single set and picks the
++ * max journal_seq
++ */
++ n.journal_seq = le64_to_cpu(bn->keys.journal_seq);
++ n.sectors_written = b->written;
++
++ guard(mutex)(&f->lock);
+ if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
+ bch_err(c, "try_read_btree_node() can't handle endian conversion");
+ f->ret = -EINVAL;
+- goto unlock;
++ return;
+ }
+
+ if (darray_push(&f->nodes, n))
+ f->ret = -ENOMEM;
+-unlock:
+- mutex_unlock(&f->lock);
+ }
+ }
+
+@@ -237,12 +213,20 @@ static int read_btree_nodes_worker(void *p)
+ struct find_btree_nodes_worker *w = p;
+ struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
+ struct bch_dev *ca = w->ca;
+- void *buf = (void *) __get_free_page(GFP_KERNEL);
+- struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
+ unsigned long last_print = jiffies;
++ struct btree *b = NULL;
++ struct bio *bio = NULL;
+
+- if (!buf || !bio) {
+- bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
++ b = __bch2_btree_node_mem_alloc(c);
++ if (!b) {
++ bch_err(c, "read_btree_nodes_worker: error allocating buf");
++ w->f->ret = -ENOMEM;
++ goto err;
++ }
++
++ bio = bio_alloc(NULL, buf_pages(b->data, c->opts.btree_node_size), 0, GFP_KERNEL);
++ if (!bio) {
++ bch_err(c, "read_btree_nodes_worker: error allocating bio");
+ w->f->ret = -ENOMEM;
+ goto err;
+ }
+@@ -266,12 +250,14 @@ static int read_btree_nodes_worker(void *p)
+ !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
+ continue;
+
+- try_read_btree_node(w->f, ca, bio, buf, sector);
++ try_read_btree_node(w->f, ca, b, bio, sector);
+ }
+ err:
++ if (b)
++ __btree_node_data_free(b);
++ kfree(b);
+ bio_put(bio);
+- free_page((unsigned long) buf);
+- percpu_ref_put(&ca->io_ref[READ]);
++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
+ closure_put(w->cl);
+ kfree(w);
+ return 0;
+@@ -284,14 +270,17 @@ static int read_btree_nodes(struct find_btree_nodes *f)
+ int ret = 0;
+
+ closure_init_stack(&cl);
++ CLASS(printbuf, buf)();
++
++ prt_printf(&buf, "scanning for btree nodes on");
+
+- for_each_online_member(c, ca) {
++ for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) {
+ if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
+ continue;
+
+ struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
+ if (!w) {
+- percpu_ref_put(&ca->io_ref[READ]);
++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
+ ret = -ENOMEM;
+ goto err;
+ }
+@@ -303,16 +292,20 @@ static int read_btree_nodes(struct find_btree_nodes *f)
+ struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+ ret = PTR_ERR_OR_ZERO(t);
+ if (ret) {
+- percpu_ref_put(&ca->io_ref[READ]);
++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
+ kfree(w);
+ bch_err_msg(c, ret, "starting kthread");
+ break;
+ }
+
++ prt_printf(&buf, " %s", ca->name);
++
+ closure_get(&cl);
+- percpu_ref_get(&ca->io_ref[READ]);
++ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan);
+ wake_up_process(t);
+ }
++
++ bch_notice(c, "%s", buf.buf);
+ err:
+ while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2))
+ ;
+@@ -363,6 +356,8 @@ static int handle_overwrites(struct bch_fs *c,
+ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL);
+ }
+ }
++
++ cond_resched();
+ }
+
+ return 0;
+@@ -371,7 +366,7 @@ static int handle_overwrites(struct bch_fs *c,
+ int bch2_scan_for_btree_nodes(struct bch_fs *c)
+ {
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+- struct printbuf buf = PRINTBUF;
++ CLASS(printbuf, buf)();
+ found_btree_nodes nodes_heap = {};
+ size_t dst;
+ int ret = 0;
+@@ -395,7 +390,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+- bch2_print_string_as_lines(KERN_INFO, buf.buf);
++ bch2_print_str(c, KERN_INFO, buf.buf);
+ }
+
+ sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+@@ -424,7 +419,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+- bch2_print_string_as_lines(KERN_INFO, buf.buf);
++ bch2_print_str(c, KERN_INFO, buf.buf);
+ }
+
+ swap(nodes_heap, f->nodes);
+@@ -470,7 +465,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
+ printbuf_reset(&buf);
+ prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
+ found_btree_nodes_to_text(&buf, c, f->nodes);
+- bch2_print_string_as_lines(KERN_INFO, buf.buf);
++ bch2_print_str(c, KERN_INFO, buf.buf);
+ } else {
+ bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr);
+ }
+@@ -478,7 +473,6 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c)
+ eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+ err:
+ darray_exit(&nodes_heap);
+- printbuf_exit(&buf);
+ return ret;
+ }
+
+@@ -519,8 +513,12 @@ bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
+ return false;
+ }
+
+-bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
++int bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
+ {
++ int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
++ if (ret)
++ return ret;
++
+ struct found_btree_node search = {
+ .btree_id = btree,
+ .level = 0,
+@@ -541,12 +539,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
+
+ struct find_btree_nodes *f = &c->found_btree_nodes;
+
+- int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
++ int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+ if (ret)
+ return ret;
+
+ if (c->opts.verbose) {
+- struct printbuf buf = PRINTBUF;
++ CLASS(printbuf, buf)();
+
+ prt_str(&buf, "recovery ");
+ bch2_btree_id_level_to_text(&buf, btree, level);
+@@ -556,7 +554,6 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
+ bch2_bpos_to_text(&buf, node_max);
+
+ bch_info(c, "%s(): %s", __func__, buf.buf);
+- printbuf_exit(&buf);
+ }
+
+ struct found_btree_node search = {
+@@ -580,10 +577,9 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
+ found_btree_node_to_key(&tmp.k, &n);
+
+ if (c->opts.verbose) {
+- struct printbuf buf = PRINTBUF;
++ CLASS(printbuf, buf)();
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
+ bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
+- printbuf_exit(&buf);
+ }
+
+ BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k),
+diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
+index 08687b209787..66e6f9ed19d0 100644
+--- a/fs/bcachefs/btree_node_scan.h
++++ b/fs/bcachefs/btree_node_scan.h
+@@ -4,7 +4,7 @@
+
+ int bch2_scan_for_btree_nodes(struct bch_fs *);
+ bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
+-bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
++int bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
+ int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
+ void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
+
+diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
+index 7d7e52ddde02..8b94a8156fbf 100644
+--- a/fs/bcachefs/btree_trans_commit.c
++++ b/fs/bcachefs/btree_trans_commit.c
+@@ -11,6 +11,7 @@
+ #include "btree_write_buffer.h"
+ #include "buckets.h"
+ #include "disk_accounting.h"
++#include "enumerated_ref.h"
+ #include "errcode.h"
+ #include "error.h"
+ #include "journal.h"
+@@ -20,6 +21,7 @@
+ #include "snapshot.h"
+
+ #include
++#include
+
+ static const char * const trans_commit_flags_strs[] = {
+#define x(n, ...) #n,
+@@ -44,6 +46,9 @@ void bch2_trans_commit_flags_to_text(struct printbuf *out, enum bch_trans_commit
+ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i)
+ {
+ #ifdef CONFIG_BCACHEFS_DEBUG
++ if (i->key_cache_flushing)
++ return;
++
+ struct bch_fs *c = trans->c;
+ struct bkey u;
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
+@@ -230,10 +235,10 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct btree_write *w = container_of(pin, struct btree_write, journal);
+ struct btree *b = container_of(w, struct btree, writes[i]);
+- struct btree_trans *trans = bch2_trans_get(c);
+ unsigned long old, new;
+ unsigned idx = w - b->writes;
+
++ CLASS(btree_trans, trans)(c);
+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
+
+ old = READ_ONCE(b->flags);
+@@ -252,8 +257,6 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+
+ btree_node_write_if_need(trans, b, SIX_LOCK_read);
+ six_unlock_read(&b->c.lock);
+-
+- bch2_trans_put(trans);
+ return 0;
+ }
+
+@@ -335,6 +338,9 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
+
+ BUG_ON(!bpos_eq(i->k->k.p, path->pos));
+ BUG_ON(i->cached != path->cached);
++ BUG_ON(i->cached &&
++ !i->key_cache_already_flushed &&
++ bkey_deleted(&i->k->k));
+ BUG_ON(i->level != path->level);
+ BUG_ON(i->btree_id != path->btree_id);
+ BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id));
+@@ -366,14 +372,15 @@ static noinline void journal_transaction_name(struct btree_trans *trans)
+ struct jset_entry_log *l =
+ container_of(entry, struct jset_entry_log, entry);
+
+- strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64));
++ memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64),
++ trans->fn, strlen(trans->fn), 0);
+ }
+
+ static inline int btree_key_can_insert(struct btree_trans *trans,
+ struct btree *b, unsigned u64s)
+ {
+ if (!bch2_btree_node_insert_fits(b, u64s))
+- return -BCH_ERR_btree_insert_btree_node_full;
++ return bch_err_throw(trans->c, btree_insert_btree_node_full);
+
+ return 0;
+ }
+@@ -391,9 +398,10 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
+
+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
+ if (!new_k) {
+- bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
++ struct bch_fs *c = trans->c;
++ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch2_btree_id_str(path->btree_id), new_u64s);
+- return -BCH_ERR_ENOMEM_btree_key_cache_insert;
++ return bch_err_throw(c, ENOMEM_btree_key_cache_insert);
+ }
+
+ ret = bch2_trans_relock(trans) ?:
+@@ -429,7 +437,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
+ if (watermark < BCH_WATERMARK_reclaim &&
+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+ bch2_btree_key_cache_must_wait(c))
+- return -BCH_ERR_btree_insert_need_journal_reclaim;
++ return bch_err_throw(c, btree_insert_need_journal_reclaim);
+
+ /*
+ * bch2_varint_decode can read past the end of the buffer by at most 7
+@@ -581,7 +589,8 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
+ }
+
+ static inline int
+-bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
++bch2_trans_commit_write_locked(struct btree_trans *trans,
++ enum bch_trans_commit_flags flags,
+ struct btree_insert_entry **stopped_at,
+ unsigned long trace_ip)
+ {
+@@ -591,12 +600,13 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
+ int ret = 0;
+
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+-
++#if 0
++ /* todo: bring back dynamic fault injection */
+ if (race_fault()) {
+ trace_and_count(c, trans_restart_fault_inject, trans, trace_ip);
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject);
+ }
+-
++#endif
+ /*
+ * Check if the insert will fit in the leaf node with the write lock
+ * held, otherwise another thread could write the node changing the
+@@ -644,10 +654,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
+
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
+ !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
+- if (bch2_journal_seq_verify)
++ if (static_branch_unlikely(&bch2_journal_seq_verify))
+ trans_for_each_update(trans, i)
+ i->k->k.bversion.lo = trans->journal_res.seq;
+- else if (bch2_inject_invalid_keys)
++ else if (static_branch_unlikely(&bch2_inject_invalid_keys))
+ trans_for_each_update(trans, i)
+ i->k->k.bversion = MAX_VERSION;
+ }
+@@ -660,19 +670,22 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
+ h = h->next;
+ }
+
+- struct jset_entry *entry = trans->journal_entries;
+-
+- percpu_down_read(&c->mark_lock);
+- for (entry = trans->journal_entries;
+- entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+- entry = vstruct_next(entry))
+- if (entry->type == BCH_JSET_ENTRY_write_buffer_keys &&
+- entry->start->k.type == KEY_TYPE_accounting) {
+- ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags);
+- if (ret)
+- goto revert_fs_usage;
++ struct bkey_i *accounting;
++
++ scoped_guard(percpu_read, &c->mark_lock)
++ for (accounting = btree_trans_subbuf_base(trans, &trans->accounting);
++ accounting != btree_trans_subbuf_top(trans, &trans->accounting);
++ accounting = bkey_next(accounting)) {
++ ret = bch2_accounting_trans_commit_hook(trans,
++ bkey_i_to_accounting(accounting), flags);
++ if (unlikely(ret)) {
++ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
++ i != accounting;
++ i = bkey_next(i))
++ bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags);
++ return ret;
++ }
+ }
+- percpu_up_read(&c->mark_lock);
+
+ /* XXX: we only want to run this if deltas are nonzero */
+ bch2_trans_account_disk_usage_change(trans);
+@@ -695,8 +708,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit;
+
+- for (struct jset_entry *i = trans->journal_entries;
+- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
++ for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
++ i != btree_trans_journal_entries_top(trans);
+ i = vstruct_next(i)) {
+ ret = bch2_journal_entry_validate(c, NULL, i,
+ bcachefs_metadata_version_current,
+@@ -751,11 +764,21 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
+ }
+
+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
+- trans->journal_entries,
+- trans->journal_entries_u64s);
++ btree_trans_journal_entries_start(trans),
++ trans->journal_entries.u64s);
++
++ EBUG_ON(trans->journal_res.u64s < trans->journal_entries.u64s);
+
+- trans->journal_res.offset += trans->journal_entries_u64s;
+- trans->journal_res.u64s -= trans->journal_entries_u64s;
++ trans->journal_res.offset += trans->journal_entries.u64s;
++ trans->journal_res.u64s -= trans->journal_entries.u64s;
++
++ if (trans->accounting.u64s)
++ memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res,
++ BCH_JSET_ENTRY_write_buffer_keys,
++ BTREE_ID_accounting, 0,
++ trans->accounting.u64s)->_data,
++ btree_trans_subbuf_base(trans, &trans->accounting),
++ trans->accounting.u64s);
+
+ if (trans->journal_seq)
+ *trans->journal_seq = trans->journal_res.seq;
+@@ -775,16 +798,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
+ return 0;
+ fatal_err:
+ bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
+- percpu_down_read(&c->mark_lock);
+-revert_fs_usage:
+- for (struct jset_entry *entry2 = trans->journal_entries;
+- entry2 != entry;
+- entry2 = vstruct_next(entry2))
+- if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys &&
+- entry2->start->k.type == KEY_TYPE_accounting)
+- bch2_accounting_trans_commit_revert(trans,
+- bkey_i_to_accounting(entry2->start), flags);
+- percpu_up_read(&c->mark_lock);
+ return ret;
+ }
+
+@@ -810,7 +823,8 @@ static int bch2_trans_commit_journal_pin_flush(struct journal *j,
+ /*
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
+ */
+-static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags,
++static inline int do_bch2_trans_commit(struct btree_trans *trans,
++ enum bch_trans_commit_flags flags,
+ struct btree_insert_entry **stopped_at,
+ unsigned long trace_ip)
+ {
+@@ -888,7 +902,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
+ */
+ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
+ watermark < BCH_WATERMARK_reclaim) {
+- ret = -BCH_ERR_journal_reclaim_would_deadlock;
++ ret = bch_err_throw(c, journal_reclaim_would_deadlock);
+ goto out;
+ }
+
+@@ -946,35 +960,90 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
+ * do.
+ */
+ static noinline int
+-do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
++do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans,
++ enum bch_trans_commit_flags flags)
+ {
+ struct bch_fs *c = trans->c;
++ int ret = 0;
+
+ BUG_ON(current != c->recovery_task);
+
++ struct bkey_i *accounting;
++retry:
++ percpu_down_read(&c->mark_lock);
++ for (accounting = btree_trans_subbuf_base(trans, &trans->accounting);
++ accounting != btree_trans_subbuf_top(trans, &trans->accounting);
++ accounting = bkey_next(accounting)) {
++ ret = likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))
++ ? bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(accounting),
++ BCH_ACCOUNTING_normal, false)
++ : 0;
++ if (ret)
++ goto revert_fs_usage;
++ }
++ percpu_up_read(&c->mark_lock);
++
+ trans_for_each_update(trans, i) {
+- int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
++ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k);
+ if (ret)
+- return ret;
++ goto fatal_err;
+ }
+
+- for (struct jset_entry *i = trans->journal_entries;
+- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+- i = vstruct_next(i))
++ for (struct jset_entry *i = btree_trans_journal_entries_start(trans);
++ i != btree_trans_journal_entries_top(trans);
++ i = vstruct_next(i)) {
+ if (i->type == BCH_JSET_ENTRY_btree_keys ||
+ i->type == BCH_JSET_ENTRY_write_buffer_keys) {
+- int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start);
+- if (ret)
+- return ret;
++ jset_entry_for_each_key(i, k) {
++ ret = bch2_journal_key_insert(c, i->btree_id, i->level, k);
++ if (ret)
++ goto fatal_err;
++ }
+ }
+
++ if (i->type == BCH_JSET_ENTRY_btree_root) {
++ guard(mutex)(&c->btree_root_lock);
++
++ struct btree_root *r = bch2_btree_id_root(c, i->btree_id);
++
++ bkey_copy(&r->key, i->start);
++ r->level = i->level;
++ r->alive = true;
++ }
++ }
++
++ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
++ i != btree_trans_subbuf_top(trans, &trans->accounting);
++ i = bkey_next(i)) {
++ ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i);
++ if (ret)
++ goto fatal_err;
++ }
++
+ return 0;
++fatal_err:
++ bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret));
++ percpu_down_read(&c->mark_lock);
++revert_fs_usage:
++ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting);
++ i != accounting;
++ i = bkey_next(i))
++ bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags);
++ percpu_up_read(&c->mark_lock);
++
++ if (bch2_err_matches(ret, BCH_ERR_btree_insert_need_mark_replicas)) {
++ ret = drop_locks_do(trans, bch2_accounting_update_sb(trans));
++ if (!ret)
++ goto retry;
++ }
++ return ret;
+ }
+
+-int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
++int __bch2_trans_commit(struct btree_trans *trans, enum bch_trans_commit_flags flags)
+ {
+ struct btree_insert_entry *errored_at = NULL;
+ struct bch_fs *c = trans->c;
++ unsigned journal_u64s = 0;
+ int ret = 0;
+
+ bch2_trans_verify_not_unlocked_or_in_restart(trans);
+@@ -983,8 +1052,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
+ if (unlikely(ret))
+ goto out_reset;
+
+- if (!trans->nr_updates &&
+- !trans->journal_entries_u64s)
++ if (!bch2_trans_has_updates(trans))
+ goto out_reset;
+
+ ret = bch2_trans_commit_run_triggers(trans);
+@@ -992,20 +1060,24 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
+ goto out_reset;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
+- unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
++ unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) {
+ if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags)))
+- ret = do_bch2_trans_commit_to_journal_replay(trans);
++ ret = do_bch2_trans_commit_to_journal_replay(trans, flags);
+ else
+- ret = -BCH_ERR_erofs_trans_commit;
++ ret = bch_err_throw(c, erofs_trans_commit);
+ goto out_reset;
+ }
+
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
+
+- trans->journal_u64s = trans->journal_entries_u64s;
++ journal_u64s = 0;
++
+ trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
+ if (trans->journal_transaction_names)
+- trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
++ journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
++
++ if (trans->accounting.u64s)
++ journal_u64s += jset_u64s(trans->accounting.u64s);
+
+ trans_for_each_update(trans, i) {
+ struct btree_path *path = trans->paths + i->path;
+@@ -1025,11 +1097,11 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
+ continue;
+
+ /* we're going to journal the key being updated: */
+- trans->journal_u64s += jset_u64s(i->k->k.u64s);
++ journal_u64s += jset_u64s(i->k->k.u64s);
+
+ /* and we're also going to log the overwrite: */
+ if (trans->journal_transaction_names)
+- trans->journal_u64s += jset_u64s(i->old_k.u64s);
++ journal_u64s += jset_u64s(i->old_k.u64s);
+ }
+
+ if (trans->extra_disk_res) {
+@@ -1047,6 +1119,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
+
++ trans->journal_u64s = journal_u64s + trans->journal_entries.u64s;
++
+ ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
+
+ /* make sure we didn't drop or screw up locks: */
+@@ -1058,7 +1132,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
+ trace_and_count(c, transaction_commit, trans, _RET_IP_);
+ out:
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
+- bch2_write_ref_put(c, BCH_WRITE_REF_trans);
++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans);
+ out_reset:
+ if (!ret)
+ bch2_trans_downgrade(trans);
+@@ -1078,7 +1152,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
+ * restart:
+ */
+ if (flags & BCH_TRANS_COMMIT_no_journal_res) {
+- ret = -BCH_ERR_transaction_restart_nested;
++ ret = bch_err_throw(c, transaction_restart_nested);
+ goto out;
+ }
+
+diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
+index 023c472dc9ee..e893eb938bb3 100644
+--- a/fs/bcachefs/btree_types.h
++++ b/fs/bcachefs/btree_types.h
+@@ -139,6 +139,7 @@ struct btree {
+ };
+
+ #define BCH_BTREE_CACHE_NOT_FREED_REASONS() \
++ x(cache_reserve) \
+ x(lock_intent) \
+ x(lock_write) \
+ x(dirty) \
+@@ -228,6 +229,7 @@ struct btree_node_iter {
+ x(snapshot_field) \
+ x(all_snapshots) \
+ x(filter_snapshots) \
++ x(nofilter_whiteouts) \
+ x(nopreserve) \
+ x(cached_nofill) \
+ x(key_cache_fill) \
+@@ -257,9 +259,6 @@ struct btree_node_iter {
+ *
+ * BTREE_TRIGGER_insert - @new is entering the btree
+ * BTREE_TRIGGER_overwrite - @old is leaving the btree
+- *
+- * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc
+- * trigger
+ */
+ #define BTREE_TRIGGER_FLAGS() \
+ x(norun) \
+@@ -269,8 +268,7 @@ struct btree_node_iter {
+ x(gc) \
+ x(insert) \
+ x(overwrite) \
+- x(is_root) \
+- x(bucket_invalidate)
++ x(is_root)
+
+ enum {
+ #define x(n) BTREE_ITER_FLAG_BIT_##n,
+@@ -367,6 +365,7 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
+ * @nodes_intent_locked - bitmask indicating which locks are intent locks
+ */
+ struct btree_iter {
++ struct btree_trans *trans;
+ btree_path_idx_t path;
+ btree_path_idx_t update_path;
+ btree_path_idx_t key_cache_path;
+@@ -425,14 +424,16 @@ struct btree_insert_entry {
+ u8 sort_order;
+ u8 bkey_type;
+ enum btree_id btree_id:8;
+- u8 level:4;
++ u8 level:3;
+ bool cached:1;
+ bool insert_trigger_run:1;
+ bool overwrite_trigger_run:1;
+ bool key_cache_already_flushed:1;
++ bool key_cache_flushing:1;
+ /*
+- * @old_k may be a key from the journal; @old_btree_u64s always refers
+- * to the size of the key being overwritten in the btree:
++ * @old_k may be a key from the journal or the key cache;
++ * @old_btree_u64s always refers to the size of the key being
++ * overwritten in the btree:
+ */
+ u8 old_btree_u64s;
+ btree_path_idx_t path;
+@@ -477,6 +478,18 @@ struct btree_trans_paths {
+ struct btree_path paths[];
+ };
+
++struct trans_kmalloc_trace {
++ unsigned long ip;
++ size_t bytes;
++};
++typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace;
++
++struct btree_trans_subbuf {
++ u16 base;
++ u16 u64s;
++ u16 size;
++};
++
+ struct btree_trans {
+ struct bch_fs *c;
+
+@@ -488,6 +501,10 @@ struct btree_trans {
+ void *mem;
+ unsigned mem_top;
+ unsigned mem_bytes;
++ unsigned realloc_bytes_required;
++#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE
++ darray_trans_kmalloc_trace trans_kmalloc_trace;
++#endif
+
+ btree_path_idx_t nr_sorted;
+ btree_path_idx_t nr_paths;
+@@ -528,9 +545,8 @@ struct btree_trans {
+ int srcu_idx;
+
+ /* update path: */
+- u16 journal_entries_u64s;
+- u16 journal_entries_size;
+- struct jset_entry *journal_entries;
++ struct btree_trans_subbuf journal_entries;
++ struct btree_trans_subbuf accounting;
+
+ struct btree_trans_commit_hook *hooks;
+ struct journal_entry_pin *journal_pin;
+@@ -544,6 +560,8 @@ struct btree_trans {
+ unsigned journal_u64s;
+ unsigned extra_disk_res; /* XXX kill */
+
++ __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX);
++
+ #ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+ #endif
+@@ -604,6 +622,9 @@ enum btree_write_type {
+ x(dying) \
+ x(fake) \
+ x(need_rewrite) \
++ x(need_rewrite_error) \
++ x(need_rewrite_degraded) \
++ x(need_rewrite_ptr_written_zero) \
+ x(never_write) \
+ x(pinned)
+
+@@ -628,6 +649,32 @@ static inline void clear_btree_node_ ## flag(struct btree *b) \
+ BTREE_FLAGS()
+ #undef x
+
++#define BTREE_NODE_REWRITE_REASON() \
++ x(none) \
++ x(unknown) \
++ x(error) \
++ x(degraded) \
++ x(ptr_written_zero)
++
++enum btree_node_rewrite_reason {
++#define x(n) BTREE_NODE_REWRITE_##n,
++ BTREE_NODE_REWRITE_REASON()
++#undef x
++};
++
++static inline enum btree_node_rewrite_reason btree_node_rewrite_reason(struct btree *b)
++{
++ if (btree_node_need_rewrite_ptr_written_zero(b))
++ return BTREE_NODE_REWRITE_ptr_written_zero;
++ if (btree_node_need_rewrite_degraded(b))
++ return BTREE_NODE_REWRITE_degraded;
++ if (btree_node_need_rewrite_error(b))
++ return BTREE_NODE_REWRITE_error;
++ if (btree_node_need_rewrite(b))
++ return BTREE_NODE_REWRITE_unknown;
++ return BTREE_NODE_REWRITE_none;
++}
++
+ static inline struct btree_write *btree_current_write(struct btree *b)
+ {
+ return b->writes + btree_node_write_idx(b);
+@@ -647,13 +694,13 @@ static inline struct bset_tree *bset_tree_last(struct btree *b)
+ static inline void *
+ __btree_node_offset_to_ptr(const struct btree *b, u16 offset)
+ {
+- return (void *) ((u64 *) b->data + 1 + offset);
++ return (void *) ((u64 *) b->data + offset);
+ }
+
+ static inline u16
+ __btree_node_ptr_to_offset(const struct btree *b, const void *p)
+ {
+- u16 ret = (u64 *) p - 1 - (u64 *) b->data;
++ u16 ret = (u64 *) p - (u64 *) b->data;
+
+ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p);
+ return ret;
+@@ -793,15 +840,15 @@ static inline bool btree_node_type_has_triggers(enum btree_node_type type)
+ return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS;
+ }
+
+-static inline bool btree_id_is_extents(enum btree_id btree)
+-{
+- const u64 mask = 0
++static const u64 btree_is_extents_mask = 0
+ #define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr)
+- BCH_BTREE_IDS()
++BCH_BTREE_IDS()
+ #undef x
+- ;
++;
+
+- return BIT_ULL(btree) & mask;
++static inline bool btree_id_is_extents(enum btree_id btree)
++{
++ return BIT_ULL(btree) & btree_is_extents_mask;
+ }
+
+ static inline bool btree_node_type_is_extents(enum btree_node_type type)
+@@ -809,15 +856,20 @@ static inline bool btree_node_type_is_extents(enum btree_node_type type)
+ return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1);
+ }
+
+-static inline bool btree_type_has_snapshots(enum btree_id btree)
+-{
+- const u64 mask = 0
++static const u64 btree_has_snapshots_mask = 0
+ #define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr)
+- BCH_BTREE_IDS()
++BCH_BTREE_IDS()
+ #undef x
+- ;
++;
+
+- return BIT_ULL(btree) & mask;
++static inline bool btree_type_has_snapshots(enum btree_id btree)
++{
++ return BIT_ULL(btree) & btree_has_snapshots_mask;
++}
++
++static inline bool btree_id_is_extents_snapshots(enum btree_id btree)
++{
++ return BIT_ULL(btree) & btree_has_snapshots_mask & btree_is_extents_mask;
+ }
+
+ static inline bool btree_type_has_snapshot_field(enum btree_id btree)
+diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
+index 1e6b7836cc01..6f3b57573cba 100644
+--- a/fs/bcachefs/btree_update.c
++++ b/fs/bcachefs/btree_update.c
+@@ -14,6 +14,8 @@
+ #include "snapshot.h"
+ #include "trace.h"
+
++#include
++
+ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
+ const struct btree_insert_entry *r)
+ {
+@@ -93,7 +95,6 @@ static noinline int extent_back_merge(struct btree_trans *trans,
+ static int need_whiteout_for_snapshot(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos)
+ {
+- struct btree_iter iter;
+ struct bkey_s_c k;
+ u32 snapshot = pos.snapshot;
+ int ret;
+@@ -115,71 +116,45 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
+ break;
+ }
+ }
+- bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+ }
+
+ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
+- enum btree_id id,
+- struct bpos old_pos,
+- struct bpos new_pos)
++ enum btree_id btree, struct bpos pos,
++ snapshot_id_list *s)
+ {
+- struct bch_fs *c = trans->c;
+- struct btree_iter old_iter, new_iter = {};
+- struct bkey_s_c old_k, new_k;
+- snapshot_id_list s;
+- struct bkey_i *update;
+ int ret = 0;
+
+- if (!bch2_snapshot_has_children(c, old_pos.snapshot))
+- return 0;
++ darray_for_each(*s, id) {
++ pos.snapshot = *id;
+
+- darray_init(&s);
+-
+- bch2_trans_iter_init(trans, &old_iter, id, old_pos,
+- BTREE_ITER_not_extents|
+- BTREE_ITER_all_snapshots);
+- while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k &&
+- !(ret = bkey_err(old_k)) &&
+- bkey_eq(old_pos, old_k.k->p)) {
+- struct bpos whiteout_pos =
+- SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
+-
+- if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
+- snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
+- continue;
+-
+- new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos,
+- BTREE_ITER_not_extents|
+- BTREE_ITER_intent);
+- ret = bkey_err(new_k);
++ CLASS(btree_iter, iter)(trans, btree, pos, BTREE_ITER_not_extents|BTREE_ITER_intent);
++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
++ ret = bkey_err(k);
+ if (ret)
+ break;
+
+- if (new_k.k->type == KEY_TYPE_deleted) {
+- update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
++ if (k.k->type == KEY_TYPE_deleted) {
++ struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i));
+ ret = PTR_ERR_OR_ZERO(update);
+- if (ret)
++ if (ret) {
+ break;
++ }
+
+ bkey_init(&update->k);
+- update->k.p = whiteout_pos;
++ update->k.p = pos;
+ update->k.type = KEY_TYPE_whiteout;
+
+- ret = bch2_trans_update(trans, &new_iter, update,
++ ret = bch2_trans_update(trans, &iter, update,
+ BTREE_UPDATE_internal_snapshot_node);
+ }
+- bch2_trans_iter_exit(trans, &new_iter);
+
+- ret = snapshot_list_add(c, &s, old_k.k->p.snapshot);
+ if (ret)
+ break;
+ }
+- bch2_trans_iter_exit(trans, &new_iter);
+- bch2_trans_iter_exit(trans, &old_iter);
+- darray_exit(&s);
+
++ darray_exit(s);
+ return ret;
+ }
+
+@@ -240,7 +215,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
+ return ret;
+ }
+
+- if (bkey_le(old.k->p, new.k->p)) {
++ if (!back_split) {
+ update = bch2_trans_kmalloc(trans, sizeof(*update));
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+@@ -263,9 +238,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
+ BTREE_UPDATE_internal_snapshot_node|flags);
+ if (ret)
+ return ret;
+- }
+-
+- if (back_split) {
++ } else {
+ update = bch2_bkey_make_mut_noupdate(trans, old);
+ if ((ret = PTR_ERR_OR_ZERO(update)))
+ return ret;
+@@ -287,18 +260,16 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
+ struct bkey_i *insert,
+ enum btree_iter_update_trigger_flags flags)
+ {
+- struct btree_iter iter;
+- struct bkey_s_c k;
+ enum btree_id btree_id = orig_iter->btree_id;
+- int ret = 0;
+
+- bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k),
+- BTREE_ITER_intent|
+- BTREE_ITER_with_updates|
+- BTREE_ITER_not_extents);
+- k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
+- if ((ret = bkey_err(k)))
+- goto err;
++ CLASS(btree_iter, iter)(trans, btree_id, bkey_start_pos(&insert->k),
++ BTREE_ITER_intent|
++ BTREE_ITER_with_updates|
++ BTREE_ITER_not_extents);
++ struct bkey_s_c k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
++ int ret = bkey_err(k);
++ if (ret)
++ return ret;
+ if (!k.k)
+ goto out;
+
+@@ -306,7 +277,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
+ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) {
+ ret = extent_front_merge(trans, &iter, k, &insert, flags);
+ if (ret)
+- goto err;
++ return ret;
+ }
+
+ goto next;
+@@ -317,15 +288,15 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
+
+ ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert));
+ if (ret)
+- goto err;
++ return ret;
+
+ if (done)
+ goto out;
+ next:
+- bch2_btree_iter_advance(trans, &iter);
+- k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX));
++ bch2_btree_iter_advance(&iter);
++ k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX));
+ if ((ret = bkey_err(k)))
+- goto err;
++ return ret;
+ if (!k.k)
+ goto out;
+ }
+@@ -333,58 +304,19 @@ static int bch2_trans_update_extent(struct btree_trans *trans,
+ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) {
+ ret = extent_back_merge(trans, &iter, insert, k);
+ if (ret)
+- goto err;
++ return ret;
+ }
+ out:
+- if (!bkey_deleted(&insert->k))
+- ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags);
+-err:
+- bch2_trans_iter_exit(trans, &iter);
+-
+- return ret;
+-}
+-
+-static noinline int flush_new_cached_update(struct btree_trans *trans,
+- struct btree_insert_entry *i,
+- enum btree_iter_update_trigger_flags flags,
+- unsigned long ip)
+-{
+- struct bkey k;
+- int ret;
+-
+- btree_path_idx_t path_idx =
+- bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
+- BTREE_ITER_intent, _THIS_IP_);
+- ret = bch2_btree_path_traverse(trans, path_idx, 0);
+- if (ret)
+- goto out;
+-
+- struct btree_path *btree_path = trans->paths + path_idx;
+-
+- /*
+- * The old key in the insert entry might actually refer to an existing
+- * key in the btree that has been deleted from cache and not yet
+- * flushed. Check for this and skip the flush so we don't run triggers
+- * against a stale key.
+- */
+- bch2_btree_path_peek_slot_exact(btree_path, &k);
+- if (!bkey_deleted(&k))
+- goto out;
+-
+- i->key_cache_already_flushed = true;
+- i->flags |= BTREE_TRIGGER_norun;
+-
+- btree_path_set_should_be_locked(trans, btree_path);
+- ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
+-out:
+- bch2_path_put(trans, path_idx, true);
+- return ret;
++ return !bkey_deleted(&insert->k)
++ ? bch2_btree_insert_nonextent(trans, btree_id, insert, flags)
++ : 0;
+ }
+
+-static int __must_check
+-bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
+- struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
+- unsigned long ip)
++static inline struct btree_insert_entry *
++__btree_trans_update_by_path(struct btree_trans *trans,
++ btree_path_idx_t path_idx,
++ struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
++ unsigned long ip)
+ {
+ struct bch_fs *c = trans->c;
+ struct btree_insert_entry *i, n;
+@@ -455,6 +387,58 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
+ __btree_path_get(trans, trans->paths + i->path, true);
+
+ trace_update_by_path(trans, path, i, overwrite);
++ return i;
++}
++
++static noinline int flush_new_cached_update(struct btree_trans *trans,
++ struct btree_insert_entry *i,
++ enum btree_iter_update_trigger_flags flags,
++ unsigned long ip)
++{
++ btree_path_idx_t path_idx =
++ bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
++ BTREE_ITER_intent, _THIS_IP_);
++ int ret = bch2_btree_path_traverse(trans, path_idx, 0);
++ if (ret)
++ goto out;
++
++ struct btree_path *btree_path = trans->paths + path_idx;
++
++ btree_path_set_should_be_locked(trans, btree_path);
++#if 0
++ /*
++ * The old key in the insert entry might actually refer to an existing
++ * key in the btree that has been deleted from cache and not yet
++ * flushed. Check for this and skip the flush so we don't run triggers
++ * against a stale key.
++ */
++ struct bkey k;
++ bch2_btree_path_peek_slot_exact(btree_path, &k);
++ if (!bkey_deleted(&k))
++ goto out;
++#endif
++ i->key_cache_already_flushed = true;
++ i->flags |= BTREE_TRIGGER_norun;
++
++ struct bkey old_k = i->old_k;
++ const struct bch_val *old_v = i->old_v;
++
++ i = __btree_trans_update_by_path(trans, path_idx, i->k, flags, _THIS_IP_);
++
++ i->old_k = old_k;
++ i->old_v = old_v;
++ i->key_cache_flushing = true;
++out:
++ bch2_path_put(trans, path_idx, true);
++ return ret;
++}
++
++static int __must_check
++bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
++ struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
++ unsigned long ip)
++{
++ struct btree_insert_entry *i = __btree_trans_update_by_path(trans, path_idx, k, flags, ip);
+
+ /*
+ * If a key is present in the key cache, it must also exist in the
+@@ -463,10 +447,9 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
+ * the key cache - but the key has to exist in the btree for that to
+ * work:
+ */
+- if (path->cached && !i->old_btree_u64s)
+- return flush_new_cached_update(trans, i, flags, ip);
+-
+- return 0;
++ return i->cached && (!i->old_btree_u64s || bkey_deleted(&k->k))
++ ? flush_new_cached_update(trans, i, flags, ip)
++ : 0;
+ }
+
+ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
+@@ -509,8 +492,9 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
+ return 0;
+ }
+
+-int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
+- struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
++int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter,
++ struct bkey_i *k, enum btree_iter_update_trigger_flags flags,
++ unsigned long ip)
+ {
+ kmsan_check_memory(k, bkey_bytes(&k->k));
+
+@@ -546,7 +530,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
+ path_idx = iter->key_cache_path;
+ }
+
+- return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_);
++ return bch2_trans_update_by_path(trans, path_idx, k, flags, ip);
+ }
+
+ int bch2_btree_insert_clone_trans(struct btree_trans *trans,
+@@ -562,43 +546,48 @@ int bch2_btree_insert_clone_trans(struct btree_trans *trans,
+ return bch2_btree_insert_trans(trans, btree, n, 0);
+ }
+
+-struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
++void *__bch2_trans_subbuf_alloc(struct btree_trans *trans,
++ struct btree_trans_subbuf *buf,
++ unsigned u64s, ulong ip)
+ {
+- unsigned new_top = trans->journal_entries_u64s + u64s;
+- unsigned old_size = trans->journal_entries_size;
++ unsigned new_top = buf->u64s + u64s;
++ unsigned new_size = buf->size;
+
+- if (new_top > trans->journal_entries_size) {
+- trans->journal_entries_size = roundup_pow_of_two(new_top);
++ BUG_ON(roundup_pow_of_two(new_top) > U16_MAX);
+
+- btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size;
+- }
++ if (new_top > new_size)
++ new_size = roundup_pow_of_two(new_top);
+
+- struct jset_entry *n =
+- bch2_trans_kmalloc_nomemzero(trans,
+- trans->journal_entries_size * sizeof(u64));
++ void *n = bch2_trans_kmalloc_nomemzero_ip(trans, new_size * sizeof(u64), ip);
+ if (IS_ERR(n))
+- return ERR_CAST(n);
++ return n;
++
++ unsigned offset = (u64 *) n - (u64 *) trans->mem;
++ BUG_ON(offset > U16_MAX);
+
+- if (trans->journal_entries)
+- memcpy(n, trans->journal_entries, old_size * sizeof(u64));
+- trans->journal_entries = n;
++ if (buf->u64s)
++ memcpy(n,
++ btree_trans_subbuf_base(trans, buf),
++ buf->u64s * sizeof(u64));
++ buf->base = (u64 *) n - (u64 *) trans->mem;
++ buf->size = new_size;
+
+- struct jset_entry *e = btree_trans_journal_entries_top(trans);
+- trans->journal_entries_u64s = new_top;
+- return e;
++ void *p = btree_trans_subbuf_top(trans, buf);
++ buf->u64s = new_top;
++ return p;
+ }
+
+ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
+ enum btree_id btree, struct bpos end)
+ {
+ bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
+- struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter);
++ struct bkey_s_c k = bch2_btree_iter_peek_prev(iter);
+ int ret = bkey_err(k);
+ if (ret)
+ goto err;
+
+- bch2_btree_iter_advance(trans, iter);
+- k = bch2_btree_iter_peek_slot(trans, iter);
++ bch2_btree_iter_advance(iter);
++ k = bch2_btree_iter_peek_slot(iter);
+ ret = bkey_err(k);
+ if (ret)
+ goto err;
+@@ -606,13 +595,13 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
+ BUG_ON(k.k->type != KEY_TYPE_deleted);
+
+ if (bkey_gt(k.k->p, end)) {
+- ret = -BCH_ERR_ENOSPC_btree_slot;
++ ret = bch_err_throw(trans->c, ENOSPC_btree_slot);
+ goto err;
+ }
+
+ return 0;
+ err:
+- bch2_trans_iter_exit(trans, iter);
++ bch2_trans_iter_exit(iter);
+ return ret;
+ }
+
+@@ -627,29 +616,21 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans,
+ enum btree_id btree, struct bkey_i *k,
+ enum btree_iter_update_trigger_flags flags)
+ {
+- struct btree_iter iter;
+- int ret;
+-
+- bch2_trans_iter_init(trans, &iter, btree, k->k.p,
+- BTREE_ITER_cached|
+- BTREE_ITER_not_extents|
+- BTREE_ITER_intent);
+- ret = bch2_btree_iter_traverse(trans, &iter) ?:
++ CLASS(btree_iter, iter)(trans, btree, k->k.p,
++ BTREE_ITER_cached|
++ BTREE_ITER_not_extents|
++ BTREE_ITER_intent);
++ return bch2_btree_iter_traverse(&iter) ?:
+ bch2_trans_update(trans, &iter, k, flags);
+- bch2_trans_iter_exit(trans, &iter);
+- return ret;
+ }
+
+-int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
++int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id btree,
+ struct bkey_i *k, enum btree_iter_update_trigger_flags flags)
+ {
+- struct btree_iter iter;
+- bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k),
+- BTREE_ITER_intent|flags);
+- int ret = bch2_btree_iter_traverse(trans, &iter) ?:
+- bch2_trans_update(trans, &iter, k, flags);
+- bch2_trans_iter_exit(trans, &iter);
+- return ret;
++ CLASS(btree_iter, iter)(trans, btree, bkey_start_pos(&k->k),
++ BTREE_ITER_intent|flags);
++ return bch2_btree_iter_traverse(&iter) ?:
++ bch2_trans_update(trans, &iter, k, flags);
+ }
+
+ /**
+@@ -659,21 +640,23 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id,
+ * @k: key to insert
+ * @disk_res: must be non-NULL whenever inserting or potentially
+ * splitting data extents
+- * @flags: transaction commit flags
++ * @commit_flags: transaction commit flags
+ * @iter_flags: btree iter update trigger flags
+ *
+ * Returns: 0 on success, error code on failure
+ */
+ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
+- struct disk_reservation *disk_res, int flags,
++ struct disk_reservation *disk_res,
++ enum bch_trans_commit_flags commit_flags,
+ enum btree_iter_update_trigger_flags iter_flags)
+ {
+- return bch2_trans_commit_do(c, disk_res, NULL, flags,
+- bch2_btree_insert_trans(trans, id, k, iter_flags));
++ CLASS(btree_trans, trans)(c);
++ return
commit_do(trans, disk_res, NULL, commit_flags, ++ bch2_btree_insert_trans(trans, id, k, iter_flags)); + } + +-int bch2_btree_delete_at(struct btree_trans *trans, +- struct btree_iter *iter, unsigned update_flags) ++int bch2_btree_delete_at(struct btree_trans *trans, struct btree_iter *iter, ++ enum btree_iter_update_trigger_flags flags) + { + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + int ret = PTR_ERR_OR_ZERO(k); +@@ -682,38 +665,32 @@ int bch2_btree_delete_at(struct btree_trans *trans, + + bkey_init(&k->k); + k->k.p = iter->pos; +- return bch2_trans_update(trans, iter, k, update_flags); ++ return bch2_trans_update(trans, iter, k, flags); + } + + int bch2_btree_delete(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, +- unsigned update_flags) ++ enum btree_iter_update_trigger_flags flags) + { +- struct btree_iter iter; +- int ret; +- +- bch2_trans_iter_init(trans, &iter, btree, pos, +- BTREE_ITER_cached| +- BTREE_ITER_intent); +- ret = bch2_btree_iter_traverse(trans, &iter) ?: +- bch2_btree_delete_at(trans, &iter, update_flags); +- bch2_trans_iter_exit(trans, &iter); +- +- return ret; ++ CLASS(btree_iter, iter)(trans, btree, pos, ++ BTREE_ITER_cached| ++ BTREE_ITER_intent); ++ return bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_delete_at(trans, &iter, flags); + } + +-int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, ++int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id btree, + struct bpos start, struct bpos end, +- unsigned update_flags, ++ enum btree_iter_update_trigger_flags flags, + u64 *journal_seq) + { + u32 restart_count = trans->restart_count; +- struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + +- bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); +- while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) { ++ CLASS(btree_iter, iter)(trans, btree, start, BTREE_ITER_intent|flags); ++ ++ while ((k = bch2_btree_iter_peek_max(&iter, end)).k) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(trans->c, 0); + struct bkey_i delete; +@@ -745,7 +722,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + bpos_min(end, k.k->p).offset - + iter.pos.offset); + +- ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: ++ ret = bch2_trans_update(trans, &iter, &delete, flags) ?: + bch2_trans_commit(trans, &disk_res, journal_seq, + BCH_TRANS_COMMIT_no_enospc); + bch2_disk_reservation_put(trans->c, &disk_res); +@@ -763,7 +740,6 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + if (ret) + break; + } +- bch2_trans_iter_exit(trans, &iter); + + return ret ?: trans_was_restarted(trans, restart_count); + } +@@ -775,12 +751,11 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + */ + int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, +- unsigned update_flags, ++ enum btree_iter_update_trigger_flags flags, + u64 *journal_seq) + { +- int ret = bch2_trans_run(c, +- bch2_btree_delete_range_trans(trans, id, start, end, +- update_flags, journal_seq)); ++ CLASS(btree_trans, trans)(c); ++ int ret = bch2_btree_delete_range_trans(trans, id, start, end, flags, journal_seq); + if (ret == -BCH_ERR_transaction_restart_nested) + ret = 0; + return ret; +@@ -805,13 +780,10 @@ int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, + int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + 
struct bpos pos, bool set) + { +- struct btree_iter iter; +- bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); ++ CLASS(btree_iter, iter)(trans, btree, pos, BTREE_ITER_intent); + +- int ret = bch2_btree_iter_traverse(trans, &iter) ?: +- bch2_btree_bit_mod_iter(trans, &iter, set); +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_bit_mod_iter(trans, &iter, set); + } + + int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, +@@ -826,30 +798,40 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, + return bch2_trans_update_buffered(trans, btree, &k); + } + +-int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) ++static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len, ulong ip) + { +- unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); +- prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); ++ unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); + +- int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; +- if (ret) +- return ret; +- +- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); +- ret = PTR_ERR_OR_ZERO(e); ++ struct jset_entry *e = bch2_trans_jset_entry_alloc_ip(trans, jset_u64s(u64s), ip); ++ int ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; + + struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); + journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); +- memcpy(l->d, buf->buf, buf->pos); ++ memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0); + return 0; + } + ++int bch2_trans_log_str(struct btree_trans *trans, const char *str) ++{ ++ return __bch2_trans_log_str(trans, str, strlen(str), _RET_IP_); ++} ++ ++int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) ++{ ++ int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; ++ if (ret) ++ return ret; ++ ++ return __bch2_trans_log_str(trans, buf->buf, buf->pos, _RET_IP_); ++} ++ + int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree, + unsigned level, struct bkey_i *k) + { +- struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); ++ struct jset_entry *e = bch2_trans_jset_entry_alloc_ip(trans, ++ jset_u64s(k->k.u64s), _RET_IP_); + int ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; +@@ -864,32 +846,31 @@ static int + __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, + va_list args) + { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + prt_vprintf(&buf, fmt, args); + + unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); +- prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos); + + int ret = buf.allocation_failure ? 
-BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) +- goto err; ++ return ret; + + if (!test_bit(JOURNAL_running, &c->journal.flags)) { + ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s)); + if (ret) +- goto err; ++ return ret; + + struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); + journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); +- memcpy(l->d, buf.buf, buf.pos); ++ memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0); + c->journal.early_journal_entries.nr += jset_u64s(u64s); + } else { +- ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, +- bch2_trans_log_msg(trans, &buf)); ++ CLASS(btree_trans, trans)(c); ++ ret = commit_do(trans, NULL, NULL, commit_flags, ++ bch2_trans_log_msg(trans, &buf)); + } +-err: +- printbuf_exit(&buf); +- return ret; ++ ++ return 0; + } + + __printf(2, 3) +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 568e56c91190..663739db82b1 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -4,6 +4,7 @@ + + #include "btree_iter.h" + #include "journal.h" ++#include "snapshot.h" + + struct bch_fs; + struct btree; +@@ -46,22 +47,27 @@ enum bch_trans_commit_flags { + + void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); + +-int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); +-int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); ++int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, ++ enum btree_iter_update_trigger_flags); ++int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, ++ enum btree_iter_update_trigger_flags); + + int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, + struct bkey_i *, enum btree_iter_update_trigger_flags); + + int bch2_btree_insert_trans(struct btree_trans *, enum btree_id, struct bkey_i *, + enum btree_iter_update_trigger_flags); +-int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct +- disk_reservation *, int flags, enum +- btree_iter_update_trigger_flags iter_flags); ++int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, ++ struct disk_reservation *, ++ enum bch_trans_commit_flags, ++ enum btree_iter_update_trigger_flags); + + int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, +- struct bpos, struct bpos, unsigned, u64 *); ++ struct bpos, struct bpos, ++ enum btree_iter_update_trigger_flags, u64 *); + int bch2_btree_delete_range(struct bch_fs *, enum btree_id, +- struct bpos, struct bpos, unsigned, u64 *); ++ struct bpos, struct bpos, ++ enum btree_iter_update_trigger_flags, u64 *); + + int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool); + int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); +@@ -74,7 +80,7 @@ static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, + } + + int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, +- struct bpos, struct bpos); ++ struct bpos, snapshot_id_list *); + + /* + * For use when splitting extents in existing snapshots: +@@ -88,11 +94,20 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + struct bpos old_pos, + struct bpos new_pos) + { ++ BUG_ON(old_pos.snapshot != new_pos.snapshot); ++ + if (!btree_type_has_snapshots(btree) || + bkey_eq(old_pos, new_pos)) + return 0; + +- return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); ++ 
snapshot_id_list s; ++ int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s); ++ if (ret) ++ return ret; ++ ++ return s.nr ++ ? __bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s) ++ : 0; + } + + int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, +@@ -102,32 +117,92 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter * + int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos); + +-int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, +- struct bkey_i *, enum btree_iter_update_trigger_flags); ++int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_iter_update_trigger_flags, ++ unsigned long); ++ ++static inline int __must_check ++bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_iter_update_trigger_flags flags) ++{ ++ return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_); ++} ++ ++static inline void *btree_trans_subbuf_base(struct btree_trans *trans, ++ struct btree_trans_subbuf *buf) ++{ ++ return (u64 *) trans->mem + buf->base; ++} + +-struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); ++static inline void *btree_trans_subbuf_top(struct btree_trans *trans, ++ struct btree_trans_subbuf *buf) ++{ ++ return (u64 *) trans->mem + buf->base + buf->u64s; ++} ++ ++void *__bch2_trans_subbuf_alloc(struct btree_trans *, ++ struct btree_trans_subbuf *, ++ unsigned, ulong); ++ ++static inline void * ++bch2_trans_subbuf_alloc_ip(struct btree_trans *trans, ++ struct btree_trans_subbuf *buf, ++ unsigned u64s, ulong ip) ++{ ++ if (buf->u64s + u64s > buf->size) ++ return __bch2_trans_subbuf_alloc(trans, buf, u64s, ip); ++ ++ void *p = btree_trans_subbuf_top(trans, buf); ++ buf->u64s += u64s; ++ return p; ++} ++ ++static inline void * ++bch2_trans_subbuf_alloc(struct btree_trans *trans, ++ struct btree_trans_subbuf *buf, ++ unsigned u64s) ++{ ++ return bch2_trans_subbuf_alloc_ip(trans, buf, u64s, _THIS_IP_); ++} ++ ++static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans) ++{ ++ return btree_trans_subbuf_base(trans, &trans->journal_entries); ++} + + static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) + { +- return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); ++ return btree_trans_subbuf_top(trans, &trans->journal_entries); + } + + static inline struct jset_entry * +-bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) ++bch2_trans_jset_entry_alloc_ip(struct btree_trans *trans, unsigned u64s, ulong ip) + { +- if (!trans->journal_entries || +- trans->journal_entries_u64s + u64s > trans->journal_entries_size) +- return __bch2_trans_jset_entry_alloc(trans, u64s); ++ return bch2_trans_subbuf_alloc_ip(trans, &trans->journal_entries, u64s, ip); ++} + +- struct jset_entry *e = btree_trans_journal_entries_top(trans); +- trans->journal_entries_u64s += u64s; +- return e; ++static inline struct jset_entry * ++bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) ++{ ++ return bch2_trans_jset_entry_alloc_ip(trans, u64s, _THIS_IP_); + } + + int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); + +-int bch2_btree_write_buffer_insert_err(struct btree_trans *, +- enum btree_id, struct bkey_i *); ++int bch2_btree_write_buffer_insert_err(struct bch_fs *, enum 
btree_id, struct bkey_i *); ++ ++static inline int bch2_btree_write_buffer_insert_checks(struct bch_fs *c, enum btree_id btree, ++ struct bkey_i *k) ++{ ++ if (unlikely(!btree_type_uses_write_buffer(btree) || ++ k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX)) { ++ int ret = bch2_btree_write_buffer_insert_err(c, btree, k); ++ dump_stack(); ++ return ret; ++ } ++ ++ return 0; ++} + + static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, +@@ -135,11 +210,10 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr + { + kmsan_check_memory(k, bkey_bytes(&k->k)); + +- if (unlikely(!btree_type_uses_write_buffer(btree))) { +- int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); +- dump_stack(); ++ int ret = bch2_btree_write_buffer_insert_checks(trans->c, btree, k); ++ if (unlikely(ret)) + return ret; +- } ++ + /* + * Most updates skip the btree write buffer until journal replay is + * finished because synchronization with journal replay relies on having +@@ -156,7 +230,7 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr + return bch2_btree_insert_clone_trans(trans, btree, k); + + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); +- int ret = PTR_ERR_OR_ZERO(e); ++ ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; + +@@ -167,8 +241,9 @@ static inline int __must_check bch2_trans_update_buffered(struct btree_trans *tr + + void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); +-int __bch2_trans_commit(struct btree_trans *, unsigned); ++int __bch2_trans_commit(struct btree_trans *, enum bch_trans_commit_flags); + ++int bch2_trans_log_str(struct btree_trans *, const char *); + int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); + int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *); + +@@ -203,6 +278,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_flags))) + ++/* deprecated, prefer CLASS(btree_trans) */ + #define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \ + bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do)) + +@@ -211,18 +287,28 @@ static inline int bch2_trans_commit(struct btree_trans *trans, + (_i) < (_trans)->updates + (_trans)->nr_updates; \ + (_i)++) + ++static inline bool bch2_trans_has_updates(struct btree_trans *trans) ++{ ++ return trans->nr_updates || ++ trans->journal_entries.u64s || ++ trans->accounting.u64s; ++} ++ + static inline void bch2_trans_reset_updates(struct btree_trans *trans) + { + trans_for_each_update(trans, i) + bch2_path_put(trans, i->path, true); + + trans->nr_updates = 0; +- trans->journal_entries_u64s = 0; ++ trans->journal_entries.u64s = 0; ++ trans->journal_entries.size = 0; ++ trans->accounting.u64s = 0; ++ trans->accounting.size = 0; + trans->hooks = NULL; + trans->extra_disk_res = 0; + } + +-static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, ++static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, + unsigned type, unsigned min_bytes) + { + unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); +@@ -245,7 +331,7 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t + return mut; + } + +-static inline struct bkey_i 
*bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) ++static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) + { + return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); + } +@@ -284,72 +370,52 @@ static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, + bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +-static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, +- struct btree_iter *iter, +- unsigned btree_id, struct bpos pos, +- enum btree_iter_update_trigger_flags flags, ++static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_iter *iter, + unsigned type, unsigned min_bytes) + { +- struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, +- btree_id, pos, flags|BTREE_ITER_intent, type); +- struct bkey_i *ret = IS_ERR(k.k) ++ struct bkey_s_c k = __bch2_bkey_get_typed(iter, type); ++ return IS_ERR(k.k) + ? ERR_CAST(k.k) +- : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); +- if (IS_ERR(ret)) +- bch2_trans_iter_exit(trans, iter); +- return ret; ++ : __bch2_bkey_make_mut_noupdate(iter->trans, k, 0, min_bytes); + } + +-static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, +- struct btree_iter *iter, +- unsigned btree_id, struct bpos pos, +- enum btree_iter_update_trigger_flags flags) ++static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_iter *iter) + { +- return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); ++ return __bch2_bkey_get_mut_noupdate(iter, 0, 0); + } + + static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, +- struct btree_iter *iter, +- unsigned btree_id, struct bpos pos, ++ enum btree_id btree, struct bpos pos, + enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned min_bytes) + { +- struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, +- btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); +- int ret; +- ++ CLASS(btree_iter, iter)(trans, btree, pos, flags|BTREE_ITER_intent); ++ struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(&iter, type, min_bytes); + if (IS_ERR(mut)) + return mut; +- +- ret = bch2_trans_update(trans, iter, mut, flags); +- if (ret) { +- bch2_trans_iter_exit(trans, iter); ++ int ret = bch2_trans_update(trans, &iter, mut, flags); ++ if (ret) + return ERR_PTR(ret); +- } +- + return mut; + } + + static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, +- struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + enum btree_iter_update_trigger_flags flags, + unsigned min_bytes) + { +- return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); ++ return __bch2_bkey_get_mut(trans, btree_id, pos, flags, 0, min_bytes); + } + + static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, +- struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + enum btree_iter_update_trigger_flags flags) + { +- return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); ++ return __bch2_bkey_get_mut(trans, btree_id, pos, flags, 0, 0); + } + +-#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ +- bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \ +- _btree_id, _pos, _flags, \ ++#define bch2_bkey_get_mut_typed(_trans, _btree_id, _pos, _flags, _type) \ ++ bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _btree_id, _pos, _flags, \ + 
KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + + static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index 00307356d7c8..76897cf15946 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -14,6 +14,8 @@ + #include "btree_locking.h" + #include "buckets.h" + #include "clock.h" ++#include "disk_groups.h" ++#include "enumerated_ref.h" + #include "error.h" + #include "extents.h" + #include "io_write.h" +@@ -52,12 +54,10 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) + : b->data->min_key; + struct btree_and_journal_iter iter; + struct bkey_s_c k; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + struct bkey_buf prev; + int ret = 0; + +- printbuf_indent_add_nextline(&buf, 2); +- + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); +@@ -66,22 +66,29 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) + bkey_init(&prev.k->k); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + ++ /* ++ * Don't use btree_node_is_root(): we're called by btree split, after ++ * creating a new root but before setting it ++ */ + if (b == btree_node_root(c, b)) { + if (!bpos_eq(b->data->min_key, POS_MIN)) { +- ret = __bch2_topology_error(c, &buf); +- ++ bch2_log_msg_start(c, &buf); ++ prt_printf(&buf, "btree root with incorrect min_key: "); + bch2_bpos_to_text(&buf, b->data->min_key); +- log_fsck_err(trans, btree_root_bad_min_key, +- "btree root with incorrect min_key: %s", buf.buf); +- goto out; ++ prt_newline(&buf); ++ ++ bch2_count_fsck_err(c, btree_root_bad_min_key, &buf); ++ goto err; + } + + if (!bpos_eq(b->data->max_key, SPOS_MAX)) { +- ret = __bch2_topology_error(c, &buf); ++ bch2_log_msg_start(c, &buf); ++ prt_printf(&buf, "btree root with incorrect max_key: "); + bch2_bpos_to_text(&buf, b->data->max_key); +- log_fsck_err(trans, btree_root_bad_max_key, +- "btree root with incorrect max_key: %s", buf.buf); +- goto out; ++ prt_newline(&buf); ++ ++ bch2_count_fsck_err(c, btree_root_bad_max_key, &buf); ++ goto err; + } + } + +@@ -99,19 +106,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) + : bpos_successor(prev.k->k.p); + + if (!bpos_eq(expected_min, bp.v->min_key)) { +- ret = __bch2_topology_error(c, &buf); +- +- prt_str(&buf, "end of prev node doesn't match start of next node\nin "); +- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); +- prt_str(&buf, " node "); +- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ prt_str(&buf, "end of prev node doesn't match start of next node"); + prt_str(&buf, "\nprev "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_str(&buf, "\nnext "); + bch2_bkey_val_to_text(&buf, c, k); ++ prt_newline(&buf); + +- log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); +- goto out; ++ bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf); ++ goto err; + } + + bch2_bkey_buf_reassemble(&prev, c, k); +@@ -119,32 +122,33 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) + } + + if (bkey_deleted(&prev.k->k)) { +- ret = __bch2_topology_error(c, &buf); +- +- prt_str(&buf, "empty interior node\nin "); +- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); +- prt_str(&buf, " node "); +- bch2_bkey_val_to_text(&buf, c, 
bkey_i_to_s_c(&b->key)); +- +- log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); +- } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { +- ret = __bch2_topology_error(c, &buf); ++ prt_printf(&buf, "empty interior node\n"); ++ bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf); ++ goto err; ++ } + +- prt_str(&buf, "last child node doesn't end at end of parent node\nin "); +- bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); +- prt_str(&buf, " node "); +- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); +- prt_str(&buf, "\nlast key "); ++ if (!bpos_eq(prev.k->k.p, b->key.k.p)) { ++ prt_str(&buf, "last child node doesn't end at end of parent node\nchild: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); ++ prt_newline(&buf); + +- log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); ++ bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf); ++ goto err; + } + out: +-fsck_err: + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev, c); +- printbuf_exit(&buf); + return ret; ++err: ++ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); ++ prt_char(&buf, ' '); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ prt_newline(&buf); ++ ++ ret = __bch2_topology_error(c, &buf); ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ BUG_ON(!ret); ++ goto out; + } + + /* Calculate ideal packed bkey format for new btree nodes: */ +@@ -217,7 +221,7 @@ static void __btree_node_free(struct btree_trans *trans, struct btree *b) + { + struct bch_fs *c = trans->c; + +- trace_and_count(c, btree_node_free, trans, b); ++ trace_btree_node(c, b, btree_node_free); + + BUG_ON(btree_node_write_blocked(b)); + BUG_ON(btree_node_dirty(b)); +@@ -240,9 +244,8 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, + + __btree_node_free(trans, b); + +- mutex_lock(&c->btree_cache.lock); +- bch2_btree_node_hash_remove(&c->btree_cache, b); +- mutex_unlock(&c->btree_cache.lock); ++ scoped_guard(mutex, &c->btree_cache.lock) ++ bch2_btree_node_hash_remove(&c->btree_cache, b); + + six_unlock_write(&b->c.lock); + mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); +@@ -268,9 +271,8 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, + clear_btree_node_dirty_acct(c, b); + clear_btree_node_need_write(b); + +- mutex_lock(&c->btree_cache.lock); +- __bch2_btree_node_hash_remove(&c->btree_cache, b); +- mutex_unlock(&c->btree_cache.lock); ++ scoped_guard(mutex, &c->btree_cache.lock) ++ __bch2_btree_node_hash_remove(&c->btree_cache, b); + + BUG_ON(p->nr >= ARRAY_SIZE(p->b)); + p->b[p->nr++] = b; +@@ -280,17 +282,46 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, + bch2_trans_node_drop(trans, b); + } + ++static bool can_use_btree_node(struct bch_fs *c, ++ struct disk_reservation *res, ++ unsigned target, ++ struct bkey_s_c k) ++{ ++ if (!bch2_bkey_devs_rw(c, k)) ++ return false; ++ ++ if (target && !bch2_bkey_in_target(c, k, target)) ++ return false; ++ ++ unsigned durability = bch2_bkey_durability(c, k); ++ ++ if (durability >= res->nr_replicas) ++ return true; ++ ++ struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_btree, target); ++ ++ guard(rcu)(); ++ ++ unsigned durability_available = 0, i; ++ for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); ++ if (ca) ++ durability_available += ca->mi.durability; ++ } ++ ++ return durability >= durability_available; ++} ++ + static struct btree 
*__bch2_btree_node_alloc(struct btree_trans *trans, + struct disk_reservation *res, + struct closure *cl, + bool interior_node, +- unsigned flags) ++ unsigned target, ++ enum bch_trans_commit_flags flags) + { + struct bch_fs *c = trans->c; + struct write_point *wp; + struct btree *b; +- BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; +- struct open_buckets obs = { .nr = 0 }; + struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim +@@ -306,17 +337,27 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, + + mutex_lock(&c->btree_reserve_cache_lock); + if (c->btree_reserve_cache_nr > nr_reserve) { +- struct btree_alloc *a = +- &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ for (struct btree_alloc *a = c->btree_reserve_cache; ++ a < c->btree_reserve_cache + c->btree_reserve_cache_nr;) { ++ /* check if it has sufficient durability */ ++ ++ if (!can_use_btree_node(c, res, target, bkey_i_to_s_c(&a->k))) { ++ bch2_open_buckets_put(c, &a->ob); ++ *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ continue; ++ } + +- obs = a->ob; +- bkey_copy(&tmp.k, &a->k); +- mutex_unlock(&c->btree_reserve_cache_lock); +- goto out; ++ bkey_copy(&b->key, &a->k); ++ b->ob = a->ob; ++ *a = c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ goto out; ++ } + } + mutex_unlock(&c->btree_reserve_cache_lock); + retry: + ret = bch2_alloc_sectors_start_trans(trans, ++ target ?: + c->opts.metadata_target ?: + c->opts.foreground_target, + 0, +@@ -325,7 +366,9 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, + res->nr_replicas, + min(res->nr_replicas, + c->opts.metadata_replicas_required), +- watermark, 0, cl, &wp); ++ watermark, ++ target ? 
BCH_WRITE_only_specified_devs : 0, ++ cl, &wp); + if (unlikely(ret)) + goto err; + +@@ -341,14 +384,12 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, + goto retry; + } + +- bkey_btree_ptr_v2_init(&tmp.k); +- bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); ++ bkey_btree_ptr_v2_init(&b->key); ++ bch2_alloc_sectors_append_ptrs(c, wp, &b->key, btree_sectors(c), false); + +- bch2_open_bucket_get(c, wp, &obs); ++ bch2_open_bucket_get(c, wp, &b->ob); + bch2_alloc_sectors_done(c, wp); + out: +- bkey_copy(&b->key, &tmp.k); +- b->ob = obs; + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + +@@ -406,7 +447,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); + BUG_ON(ret); + +- trace_and_count(c, btree_node_alloc, trans, b); ++ trace_btree_node(c, b, btree_node_alloc); + bch2_increment_clock(c, btree_sectors(c), WRITE); + return b; + } +@@ -505,33 +546,29 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * + static int bch2_btree_reserve_get(struct btree_trans *trans, + struct btree_update *as, + unsigned nr_nodes[2], ++ unsigned target, + unsigned flags, + struct closure *cl) + { +- struct btree *b; +- unsigned interior; +- int ret = 0; +- + BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); + + /* + * Protects reaping from the btree node cache and using the btree node + * open bucket reserve: + */ +- ret = bch2_btree_cache_cannibalize_lock(trans, cl); ++ int ret = bch2_btree_cache_cannibalize_lock(trans, cl); + if (ret) + return ret; + +- for (interior = 0; interior < 2; interior++) { ++ for (unsigned interior = 0; interior < 2; interior++) { + struct prealloc_nodes *p = as->prealloc_nodes + interior; + + while (p->nr < nr_nodes[interior]) { +- b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, +- interior, flags); +- if (IS_ERR(b)) { +- ret = PTR_ERR(b); ++ struct btree *b = __bch2_btree_node_alloc(trans, &as->disk_res, ++ cl, interior, target, flags); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (ret) + goto err; +- } + + p->b[p->nr++] = b; + } +@@ -559,7 +596,8 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * + bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], + as->start_time); + +- mutex_lock(&c->btree_interior_update_lock); ++ guard(mutex)(&c->btree_interior_update_lock); ++ + list_del(&as->unwritten_list); + list_del(&as->list); + +@@ -571,8 +609,6 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans * + * since being on btree_interior_update_list is our ref on @c: + */ + closure_wake_up(&c->btree_interior_update_wait); +- +- mutex_unlock(&c->btree_interior_update_lock); + } + + static void btree_update_add_key(struct btree_update *as, +@@ -601,12 +637,11 @@ static void btree_update_new_nodes_mark_sb(struct btree_update *as) + { + struct bch_fs *c = as->c; + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + for_each_keylist_key(&as->new_keys, k) + bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k)); + + bch2_write_super(c); +- mutex_unlock(&c->sb_lock); + } + + /* +@@ -658,7 +693,7 @@ static void btree_update_nodes_written(struct btree_update *as) + { + struct bch_fs *c = as->c; + struct btree *b; +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + u64 journal_seq = 0; + unsigned i; + int ret; +@@ -679,12 +714,31 @@ static void btree_update_nodes_written(struct btree_update 
*as) + + /* + * Wait for any in flight writes to finish before we free the old nodes +- * on disk: ++ * on disk. But we haven't pinned those old nodes in the btree cache, ++ * they might have already been evicted. ++ * ++ * The update we're completing deleted references to those nodes from the ++ * btree, so we know if they've been evicted they can't be pulled back in. ++ * We just have to check if the nodes we have pointers to are still those ++ * old nodes, and haven't been reused. ++ * ++ * This can't be done locklessly because the data buffer might have been ++ * vmalloc allocated, and they're not RCU freed. We also need the ++ * __no_kmsan_checks annotation because even with the btree node read ++ * lock, nothing tells us that the data buffer has been initialized (if ++ * the btree node has been reused for a different node, and the data ++ * buffer swapped for a new data buffer). + */ + for (i = 0; i < as->nr_old_nodes; i++) { + b = as->old_nodes[i]; + +- if (btree_node_seq_matches(b, as->old_nodes_seq[i])) ++ bch2_trans_begin(trans); ++ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); ++ bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]); ++ six_unlock_read(&b->c.lock); ++ bch2_trans_unlock_long(trans); ++ ++ if (seq_matches) + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, + TASK_UNINTERRUPTIBLE); + } +@@ -798,15 +852,15 @@ static void btree_update_nodes_written(struct btree_update *as) + + bch2_journal_pin_drop(&c->journal, &as->journal); + +- mutex_lock(&c->btree_interior_update_lock); +- for (i = 0; i < as->nr_new_nodes; i++) { +- b = as->new_nodes[i]; ++ scoped_guard(mutex, &c->btree_interior_update_lock) { ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; + +- BUG_ON(b->will_make_reachable != (unsigned long) as); +- b->will_make_reachable = 0; +- clear_btree_node_will_make_reachable(b); ++ BUG_ON(b->will_make_reachable != (unsigned long) as); ++ b->will_make_reachable = 0; ++ clear_btree_node_will_make_reachable(b); ++ } + } +- mutex_unlock(&c->btree_interior_update_lock); + + for (i = 0; i < as->nr_new_nodes; i++) { + b = as->new_nodes[i]; +@@ -820,7 +874,6 @@ static void btree_update_nodes_written(struct btree_update *as) + bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); + + bch2_btree_update_free(as, trans); +- bch2_trans_put(trans); + } + + static void btree_interior_update_work(struct work_struct *work) +@@ -830,12 +883,12 @@ static void btree_interior_update_work(struct work_struct *work) + struct btree_update *as; + + while (1) { +- mutex_lock(&c->btree_interior_update_lock); +- as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, +- struct btree_update, unwritten_list); +- if (as && !as->nodes_written) +- as = NULL; +- mutex_unlock(&c->btree_interior_update_lock); ++ scoped_guard(mutex, &c->btree_interior_update_lock) { ++ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, ++ struct btree_update, unwritten_list); ++ if (as && !as->nodes_written) ++ as = NULL; ++ } + + if (!as) + break; +@@ -849,9 +902,8 @@ static CLOSURE_CALLBACK(btree_update_set_nodes_written) + closure_type(as, struct btree_update, cl); + struct bch_fs *c = as->c; + +- mutex_lock(&c->btree_interior_update_lock); +- as->nodes_written = true; +- mutex_unlock(&c->btree_interior_update_lock); ++ scoped_guard(mutex, &c->btree_interior_update_lock) ++ as->nodes_written = true; + + queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); + } +@@ -869,7 +921,7 @@ static void 
btree_update_updated_node(struct btree_update *as, struct btree *b) + BUG_ON(!btree_node_dirty(b)); + BUG_ON(!b->c.level); + +- mutex_lock(&c->btree_interior_update_lock); ++ guard(mutex)(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + + as->mode = BTREE_UPDATE_node; +@@ -878,8 +930,6 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b) + + set_btree_node_write_blocked(b); + list_add(&as->write_blocked_list, &b->write_blocked); +- +- mutex_unlock(&c->btree_interior_update_lock); + } + + static int bch2_update_reparent_journal_pin_flush(struct journal *j, +@@ -918,11 +968,11 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b) + b->c.btree_id, b->c.level, + insert, insert->k.u64s); + +- mutex_lock(&c->btree_interior_update_lock); +- list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); ++ scoped_guard(mutex, &c->btree_interior_update_lock) { ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + +- as->mode = BTREE_UPDATE_root; +- mutex_unlock(&c->btree_interior_update_lock); ++ as->mode = BTREE_UPDATE_root; ++ } + } + + /* +@@ -943,7 +993,8 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree + + closure_get(&as->cl); + +- mutex_lock(&c->btree_interior_update_lock); ++ guard(mutex)(&c->btree_interior_update_lock); ++ + BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); + BUG_ON(b->will_make_reachable); + +@@ -951,8 +1002,6 @@ static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree + b->will_make_reachable = 1UL|(unsigned long) as; + set_btree_node_will_make_reachable(b); + +- mutex_unlock(&c->btree_interior_update_lock); +- + btree_update_add_key(as, &as->new_keys, b); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { +@@ -971,31 +1020,29 @@ static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) + { + struct btree_update *as; + unsigned long v; +- unsigned i; + +- mutex_lock(&c->btree_interior_update_lock); +- /* +- * When b->will_make_reachable != 0, it owns a ref on as->cl that's +- * dropped when it gets written by bch2_btree_complete_write - the +- * xchg() is for synchronization with bch2_btree_complete_write: +- */ +- v = xchg(&b->will_make_reachable, 0); +- clear_btree_node_will_make_reachable(b); +- as = (struct btree_update *) (v & ~1UL); ++ scoped_guard(mutex, &c->btree_interior_update_lock) { ++ /* ++ * When b->will_make_reachable != 0, it owns a ref on as->cl that's ++ * dropped when it gets written by bch2_btree_complete_write - the ++ * xchg() is for synchronization with bch2_btree_complete_write: ++ */ ++ v = xchg(&b->will_make_reachable, 0); ++ clear_btree_node_will_make_reachable(b); ++ as = (struct btree_update *) (v & ~1UL); + +- if (!as) { +- mutex_unlock(&c->btree_interior_update_lock); +- return; +- } ++ if (!as) ++ return; + +- for (i = 0; i < as->nr_new_nodes; i++) +- if (as->new_nodes[i] == b) +- goto found; ++ unsigned i; ++ for (i = 0; i < as->nr_new_nodes; i++) ++ if (as->new_nodes[i] == b) ++ goto found; + +- BUG(); +-found: +- array_remove_item(as->new_nodes, as->nr_new_nodes, i); +- mutex_unlock(&c->btree_interior_update_lock); ++ BUG(); ++ found: ++ array_remove_item(as->new_nodes, as->nr_new_nodes, i); ++ } + + if (v & 1) + closure_put(&as->cl); +@@ -1114,9 +1161,18 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * + start_time); + } + ++static const char * const 
btree_node_reawrite_reason_strs[] = { ++#define x(n) #n, ++ BTREE_NODE_REWRITE_REASON() ++#undef x ++ NULL, ++}; ++ + static struct btree_update * + bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, +- unsigned level_start, bool split, unsigned flags) ++ unsigned level_start, bool split, ++ unsigned target, ++ enum bch_trans_commit_flags flags) + { + struct bch_fs *c = trans->c; + struct btree_update *as; +@@ -1203,9 +1259,17 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + bch2_keylist_init(&as->new_keys, as->_new_keys); + bch2_keylist_init(&as->parent_keys, as->inline_keys); + +- mutex_lock(&c->btree_interior_update_lock); +- list_add_tail(&as->list, &c->btree_interior_update_list); +- mutex_unlock(&c->btree_interior_update_lock); ++ scoped_guard(mutex, &c->btree_interior_update_lock) ++ list_add_tail(&as->list, &c->btree_interior_update_list); ++ ++ struct btree *b = btree_path_node(path, path->level); ++ as->node_start = b->data->min_key; ++ as->node_end = b->data->max_key; ++ as->node_needed_rewrite = btree_node_rewrite_reason(b); ++ as->node_written = b->written; ++ as->node_sectors = btree_buf_bytes(b) >> 9; ++ as->node_remaining = __bch2_btree_u64s_remaining(b, ++ btree_bkey_last(b, bset_tree_last(b))); + + /* + * We don't want to allocate if we're in an error state, that can cause +@@ -1226,7 +1290,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + if (ret) + goto err; + +- ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); ++ ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL); + if (bch2_err_matches(ret, ENOSPC) || + bch2_err_matches(ret, ENOMEM)) { + struct closure cl; +@@ -1238,18 +1302,19 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + if (bch2_err_matches(ret, ENOSPC) && + (flags & BCH_TRANS_COMMIT_journal_reclaim) && + watermark < BCH_WATERMARK_reclaim) { +- ret = -BCH_ERR_journal_reclaim_would_deadlock; ++ ret = bch_err_throw(c, journal_reclaim_would_deadlock); + goto err; + } + + closure_init_stack(&cl); + + do { +- ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); +- ++ ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); ++ if (!bch2_err_matches(ret, BCH_ERR_operation_blocked)) ++ break; + bch2_trans_unlock(trans); + bch2_wait_on_allocator(c, &cl); +- } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); ++ } while (1); + } + + if (ret) { +@@ -1279,13 +1344,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) + { + /* Root nodes cannot be reaped */ +- mutex_lock(&c->btree_cache.lock); +- list_del_init(&b->list); +- mutex_unlock(&c->btree_cache.lock); ++ scoped_guard(mutex, &c->btree_cache.lock) ++ list_del_init(&b->list); + +- mutex_lock(&c->btree_root_lock); +- bch2_btree_id_root(c, b->c.btree_id)->b = b; +- mutex_unlock(&c->btree_root_lock); ++ scoped_guard(mutex, &c->btree_root_lock) ++ bch2_btree_id_root(c, b->c.btree_id)->b = b; + + bch2_recalc_btree_reserve(c); + } +@@ -1298,7 +1361,7 @@ static int bch2_btree_set_root(struct btree_update *as, + { + struct bch_fs *c = as->c; + +- trace_and_count(c, btree_node_set_root, trans, b); ++ trace_btree_node(c, b, btree_node_set_root); + + struct btree *old = btree_node_root(c, b); + +@@ -1340,7 +1403,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + { + struct bch_fs *c = as->c; + struct bkey_packed *k; +- 
struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + unsigned long old, new; + + BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && +@@ -1385,8 +1448,6 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + new |= BTREE_WRITE_interior; + new |= 1 << BTREE_NODE_need_write; + } while (!try_cmpxchg(&b->flags, &old, new)); +- +- printbuf_exit(&buf); + } + + static int +@@ -1413,7 +1474,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, + + int ret = bch2_btree_node_check_topology(trans, b); + if (ret) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + for (struct bkey_i *k = keys->keys; + k != insert; +@@ -1598,7 +1659,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, + int ret = 0; + + bch2_verify_btree_nr_keys(b); +- BUG_ON(!parent && (b != btree_node_root(c, b))); ++ BUG_ON(!parent && !btree_node_is_root(c, b)); + BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1)); + + ret = bch2_btree_node_check_topology(trans, b); +@@ -1608,7 +1669,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, + if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { + struct btree *n[2]; + +- trace_and_count(c, btree_node_split, trans, b); ++ trace_btree_node(c, b, btree_node_split); + + n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); + n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); +@@ -1670,7 +1731,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, + goto err; + } + } else { +- trace_and_count(c, btree_node_compact, trans, b); ++ trace_btree_node(c, b, btree_node_compact); + + n1 = bch2_btree_node_alloc_replacement(as, trans, b); + +@@ -1800,16 +1861,15 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t + bch2_verify_keylist_sorted(keys); + + if (!btree_node_intent_locked(path, b->c.level)) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "%s(): node not locked at level %u\n", + __func__, b->c.level); + bch2_btree_update_to_text(&buf, as); + bch2_btree_path_to_text(&buf, trans, path_idx); ++ bch2_fs_emergency_read_only2(c, &buf); + +- bch2_print_string_as_lines(KERN_ERR, buf.buf); +- printbuf_exit(&buf); +- bch2_fs_emergency_read_only(c); ++ bch2_print_str(c, KERN_ERR, buf.buf); + return -EIO; + } + +@@ -1878,7 +1938,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, + + as = bch2_btree_update_start(trans, trans->paths + path, + trans->paths[path].level, +- true, flags); ++ true, 0, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + +@@ -1932,9 +1992,8 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * + bch2_trans_node_add(trans, path, n); + six_unlock_intent(&n->c.lock); + +- mutex_lock(&c->btree_cache.lock); +- list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list); +- mutex_unlock(&c->btree_cache.lock); ++ scoped_guard(mutex, &c->btree_cache.lock) ++ list_add_tail(&b->list, &c->btree_cache.live[btree_node_pinned(b)].list); + + bch2_trans_verify_locks(trans); + } +@@ -1948,7 +2007,8 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, + return bch2_btree_split_leaf(trans, path, flags); + + struct btree_update *as = +- bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); ++ bch2_btree_update_start(trans, trans->paths + path, b->c.level, ++ true, 0, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + +@@ -2010,7 +2070,7 @@ int 
__bch2_foreground_maybe_merge(struct btree_trans *trans, + + sib_path = bch2_path_get(trans, btree, sib_pos, + U8_MAX, level, BTREE_ITER_intent, _THIS_IP_); +- ret = bch2_btree_path_traverse(trans, sib_path, false); ++ ret = bch2_btree_path_traverse(trans, sib_path, 0); + if (ret) + goto err; + +@@ -2033,7 +2093,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + } + + if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + printbuf_indent_add_nextline(&buf, 2); + prt_printf(&buf, "%s(): ", __func__); +@@ -2048,7 +2108,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + bch2_bpos_to_text(&buf, next->data->min_key); + + bch_err(c, "%s", buf.buf); +- printbuf_exit(&buf); + goto err; + } + +@@ -2077,12 +2136,15 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + + parent = btree_node_parent(trans->paths + path, b); + as = bch2_btree_update_start(trans, trans->paths + path, level, false, +- BCH_TRANS_COMMIT_no_enospc|flags); ++ 0, BCH_TRANS_COMMIT_no_enospc|flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret) + goto err; + +- trace_and_count(c, btree_node_merge, trans, b); ++ as->node_start = prev->data->min_key; ++ as->node_end = next->data->max_key; ++ ++ trace_btree_node(c, b, btree_node_merge); + + n = bch2_btree_node_alloc(as, trans, b->c.level); + +@@ -2162,7 +2224,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, + bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, + BTREE_ITER_intent); +- int ret = bch2_btree_iter_traverse(trans, iter); ++ int ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + +@@ -2170,21 +2232,22 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, + if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); +- ret = -BCH_ERR_btree_node_dying; ++ ret = bch_err_throw(trans->c, btree_node_dying); + goto err; + } + + BUG_ON(!btree_node_hashed(b)); + return 0; + err: +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + return ret; + } + + int bch2_btree_node_rewrite(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b, +- unsigned flags) ++ unsigned target, ++ enum bch_trans_commit_flags flags) + { + struct bch_fs *c = trans->c; + struct btree *n, *parent; +@@ -2196,7 +2259,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, + + struct btree_path *path = btree_iter_path(trans, iter); + parent = btree_node_parent(path, b); +- as = bch2_btree_update_start(trans, path, b->c.level, false, flags); ++ as = bch2_btree_update_start(trans, path, b->c.level, ++ false, target, flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret) + goto out; +@@ -2212,8 +2276,6 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, + mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, trans->paths + new_path, n); + +- trace_and_count(c, btree_node_rewrite, trans, b); +- + if (parent) { + bch2_keylist_add(&as->parent_keys, &n->key); + ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); +@@ -2224,6 +2286,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, + if (ret) + goto err; + ++ trace_btree_node(c, b, btree_node_rewrite); ++ + bch2_btree_interior_update_will_free_node(as, b); + + bch2_btree_update_get_open_buckets(as, n); +@@ -2246,58 
+2310,62 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, + goto out; + } + +-static int bch2_btree_node_rewrite_key(struct btree_trans *trans, +- enum btree_id btree, unsigned level, +- struct bkey_i *k, unsigned flags) ++int bch2_btree_node_rewrite_key(struct btree_trans *trans, ++ enum btree_id btree, unsigned level, ++ struct bkey_i *k, ++ enum bch_trans_commit_flags flags) + { + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, + btree, k->k.p, + BTREE_MAX_DEPTH, level, 0); +- struct btree *b = bch2_btree_iter_peek_node(trans, &iter); ++ struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto out; + + bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); + ret = found +- ? bch2_btree_node_rewrite(trans, &iter, b, flags) ++ ? bch2_btree_node_rewrite(trans, &iter, b, 0, flags) + : -ENOENT; + out: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + + int bch2_btree_node_rewrite_pos(struct btree_trans *trans, + enum btree_id btree, unsigned level, +- struct bpos pos, unsigned flags) ++ struct bpos pos, ++ unsigned target, ++ enum bch_trans_commit_flags flags) + { + BUG_ON(!level); + + /* Traverse one depth lower to get a pointer to the node itself: */ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); +- struct btree *b = bch2_btree_iter_peek_node(trans, &iter); ++ struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + +- ret = bch2_btree_node_rewrite(trans, &iter, b, flags); ++ ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags); + err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + + int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, +- struct btree *b, unsigned flags) ++ struct btree *b, ++ enum bch_trans_commit_flags flags) + { + struct btree_iter iter; + int ret = get_iter_to_node(trans, &iter, b); + if (ret) + return ret == -BCH_ERR_btree_node_dying ? 
0 : ret; + +- ret = bch2_btree_node_rewrite(trans, &iter, b, flags); +- bch2_trans_iter_exit(trans, &iter); ++ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -2318,19 +2386,17 @@ static void async_btree_node_rewrite_work(struct work_struct *work) + + int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, + a->btree_id, a->level, a->key.k, 0)); +- if (ret != -ENOENT && +- !bch2_err_matches(ret, EROFS) && +- ret != -BCH_ERR_journal_shutdown) ++ if (!bch2_err_matches(ret, ENOENT) && ++ !bch2_err_matches(ret, EROFS)) + bch_err_fn_ratelimited(c, ret); + +- spin_lock(&c->btree_node_rewrites_lock); +- list_del(&a->list); +- spin_unlock(&c->btree_node_rewrites_lock); ++ scoped_guard(spinlock, &c->btree_node_rewrites_lock) ++ list_del(&a->list); + + closure_wake_up(&c->btree_node_rewrites_wait); + + bch2_bkey_buf_exit(&a->key, c); +- bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite); + kfree(a); + } + +@@ -2350,16 +2416,16 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) + + bool now = false, pending = false; + +- spin_lock(&c->btree_node_rewrites_lock); +- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && +- bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { +- list_add(&a->list, &c->btree_node_rewrites); +- now = true; +- } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { +- list_add(&a->list, &c->btree_node_rewrites_pending); +- pending = true; ++ scoped_guard(spinlock, &c->btree_node_rewrites_lock) { ++ if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) && ++ enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { ++ list_add(&a->list, &c->btree_node_rewrites); ++ now = true; ++ } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { ++ list_add(&a->list, &c->btree_node_rewrites_pending); ++ pending = true; ++ } + } +- spin_unlock(&c->btree_node_rewrites_lock); + + if (now) { + queue_work(c->btree_node_rewrite_worker, &a->work); +@@ -2380,18 +2446,19 @@ void bch2_async_btree_node_rewrites_flush(struct bch_fs *c) + void bch2_do_pending_node_rewrites(struct bch_fs *c) + { + while (1) { +- spin_lock(&c->btree_node_rewrites_lock); +- struct async_btree_rewrite *a = +- list_pop_entry(&c->btree_node_rewrites_pending, +- struct async_btree_rewrite, list); +- if (a) +- list_add(&a->list, &c->btree_node_rewrites); +- spin_unlock(&c->btree_node_rewrites_lock); ++ struct async_btree_rewrite *a; ++ ++ scoped_guard(spinlock, &c->btree_node_rewrites_lock) { ++ a = list_pop_entry(&c->btree_node_rewrites_pending, ++ struct async_btree_rewrite, list); ++ if (a) ++ list_add(&a->list, &c->btree_node_rewrites); ++ } + + if (!a) + break; + +- bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); ++ enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite); + queue_work(c->btree_node_rewrite_worker, &a->work); + } + } +@@ -2399,11 +2466,11 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c) + void bch2_free_pending_node_rewrites(struct bch_fs *c) + { + while (1) { +- spin_lock(&c->btree_node_rewrites_lock); +- struct async_btree_rewrite *a = +- list_pop_entry(&c->btree_node_rewrites_pending, +- struct async_btree_rewrite, list); +- spin_unlock(&c->btree_node_rewrites_lock); ++ struct async_btree_rewrite *a; ++ ++ scoped_guard(spinlock, &c->btree_node_rewrites_lock) ++ a = list_pop_entry(&c->btree_node_rewrites_pending, ++ struct async_btree_rewrite, list); + + if (!a) + break; +@@ -2421,7 +2488,7 @@ 
static int __bch2_btree_node_update_key(struct btree_trans *trans, + bool skip_triggers) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter2 = {}; ++ struct btree_iter iter2 = { NULL }; + struct btree *parent; + int ret; + +@@ -2445,7 +2512,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + + parent = btree_node_parent(btree_iter_path(trans, iter), b); + if (parent) { +- bch2_trans_copy_iter(trans, &iter2, iter); ++ bch2_trans_copy_iter(&iter2, iter); + + iter2.path = bch2_btree_path_make_mut(trans, iter2.path, + iter2.flags & BTREE_ITER_intent, +@@ -2459,12 +2526,12 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + + trans->paths_sorted = false; + +- ret = bch2_btree_iter_traverse(trans, &iter2) ?: ++ ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun); + if (ret) + goto err; + } else { +- BUG_ON(btree_node_root(c, b) != b); ++ BUG_ON(!btree_node_is_root(c, b)); + + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, + jset_u64s(new_key->k.u64s)); +@@ -2485,7 +2552,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c); + + if (new_hash) { +- mutex_lock(&c->btree_cache.lock); ++ guard(mutex)(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, new_hash); + + __bch2_btree_node_hash_remove(&c->btree_cache, b); +@@ -2493,20 +2560,18 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, + bkey_copy(&b->key, new_key); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); +- mutex_unlock(&c->btree_cache.lock); + } else { + bkey_copy(&b->key, new_key); + } + + bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b); + out: +- bch2_trans_iter_exit(trans, &iter2); ++ bch2_trans_iter_exit(&iter2); + return ret; + err: + if (new_hash) { +- mutex_lock(&c->btree_cache.lock); ++ guard(mutex)(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); +- mutex_unlock(&c->btree_cache.lock); + } + goto out; + } +@@ -2572,7 +2637,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, + + ret = bch2_btree_node_update_key(trans, &iter, b, new_key, + commit_flags, skip_triggers); +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -2641,7 +2706,8 @@ int bch2_btree_root_alloc_fake_trans(struct btree_trans *trans, enum btree_id id + + void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level) + { +- bch2_trans_run(c, lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level))); ++ CLASS(btree_trans, trans)(c); ++ lockrestart_do(trans, bch2_btree_root_alloc_fake_trans(trans, id, level)); + } + + static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as) +@@ -2651,9 +2717,19 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update + + prt_str(out, " "); + bch2_btree_id_to_text(out, as->btree_id); +- prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", ++ prt_printf(out, " l=%u-%u ", + as->update_level_start, +- as->update_level_end, ++ as->update_level_end); ++ bch2_bpos_to_text(out, as->node_start); ++ prt_char(out, ' '); ++ bch2_bpos_to_text(out, as->node_end); ++ prt_printf(out, "\nwritten %u/%u u64s_remaining %u need_rewrite %s", ++ as->node_written, ++ as->node_sectors, ++ as->node_remaining, ++ 
btree_node_reawrite_reason_strs[as->node_needed_rewrite]); ++ ++ prt_printf(out, "\nmode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + bch2_btree_update_modes[as->mode], + as->nodes_written, + closure_nr_remaining(&as->cl), +@@ -2664,21 +2740,15 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) + { + struct btree_update *as; + +- mutex_lock(&c->btree_interior_update_lock); ++ guard(mutex)(&c->btree_interior_update_lock); + list_for_each_entry(as, &c->btree_interior_update_list, list) + bch2_btree_update_to_text(out, as); +- mutex_unlock(&c->btree_interior_update_lock); + } + + static bool bch2_btree_interior_updates_pending(struct bch_fs *c) + { +- bool ret; +- +- mutex_lock(&c->btree_interior_update_lock); +- ret = !list_empty(&c->btree_interior_update_list); +- mutex_unlock(&c->btree_interior_update_lock); +- +- return ret; ++ guard(mutex)(&c->btree_interior_update_lock); ++ return !list_empty(&c->btree_interior_update_list); + } + + bool bch2_btree_interior_updates_flush(struct bch_fs *c) +@@ -2695,13 +2765,11 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry + { + struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); + +- mutex_lock(&c->btree_root_lock); ++ guard(mutex)(&c->btree_interior_update_lock); + + r->level = entry->level; + r->alive = true; + bkey_copy(&r->key, (struct bkey_i *) entry->start); +- +- mutex_unlock(&c->btree_root_lock); + } + + struct jset_entry * +@@ -2709,11 +2777,9 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c, + struct jset_entry *end, + unsigned long skip) + { +- unsigned i; +- +- mutex_lock(&c->btree_root_lock); ++ guard(mutex)(&c->btree_interior_update_lock); + +- for (i = 0; i < btree_id_nr_alive(c); i++) { ++ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (r->alive && !test_bit(i, &skip)) { +@@ -2723,8 +2789,6 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c, + } + } + +- mutex_unlock(&c->btree_root_lock); +- + return end; + } + +@@ -2780,16 +2844,16 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) + c->btree_interior_update_worker = + alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); + if (!c->btree_interior_update_worker) +- return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; ++ return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); + + c->btree_node_rewrite_worker = + alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND); + if (!c->btree_node_rewrite_worker) +- return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; ++ return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); + + if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, + sizeof(struct btree_update))) +- return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; ++ return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init); + + return 0; + } +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index be71cd73b864..6ed049f19a9a 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -57,6 +57,13 @@ struct btree_update { + unsigned took_gc_lock:1; + + enum btree_id btree_id; ++ struct bpos node_start; ++ struct bpos node_end; ++ enum btree_node_rewrite_reason node_needed_rewrite; ++ u16 node_written; ++ u16 node_sectors; ++ u16 node_remaining; ++ + unsigned update_level_start; + unsigned update_level_end; + +@@ -144,7 +151,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct 
btree_trans *trans, + + EBUG_ON(!btree_node_locked(path, level)); + +- if (bch2_btree_node_merging_disabled) ++ if (static_branch_unlikely(&bch2_btree_node_merging_disabled)) + return 0; + + b = path->l[level].b; +@@ -168,12 +175,19 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, + } + + int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, +- struct btree *, unsigned); ++ struct btree *, unsigned, ++ enum bch_trans_commit_flags); ++int bch2_btree_node_rewrite_key(struct btree_trans *, ++ enum btree_id, unsigned, ++ struct bkey_i *, ++ enum bch_trans_commit_flags); + int bch2_btree_node_rewrite_pos(struct btree_trans *, + enum btree_id, unsigned, +- struct bpos, unsigned); ++ struct bpos, unsigned, ++ enum bch_trans_commit_flags); + int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *, +- struct btree *, unsigned); ++ struct btree *, ++ enum bch_trans_commit_flags); + + void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); + +diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c +index 0941fb2c026d..afad11831e1d 100644 +--- a/fs/bcachefs/btree_write_buffer.c ++++ b/fs/bcachefs/btree_write_buffer.c +@@ -7,6 +7,7 @@ + #include "btree_update_interior.h" + #include "btree_write_buffer.h" + #include "disk_accounting.h" ++#include "enumerated_ref.h" + #include "error.h" + #include "extents.h" + #include "journal.h" +@@ -144,7 +145,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite + EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); + EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); + +- ret = bch2_btree_iter_traverse(trans, iter); ++ ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + +@@ -181,6 +182,8 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite + return wb_flush_one_slowpath(trans, iter, wb); + } + ++ EBUG_ON(!bpos_eq(wb->k.k.p, path->pos)); ++ + bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); + (*fast)++; + return 0; +@@ -200,19 +203,14 @@ static int + btree_write_buffered_insert(struct btree_trans *trans, + struct btree_write_buffered_key *wb) + { +- struct btree_iter iter; +- int ret; +- +- bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), +- BTREE_ITER_cached|BTREE_ITER_intent); ++ CLASS(btree_iter, iter)(trans, wb->btree, bkey_start_pos(&wb->k.k), ++ BTREE_ITER_cached|BTREE_ITER_intent); + + trans->journal_res.seq = wb->journal_seq; + +- ret = bch2_btree_iter_traverse(trans, &iter) ?: ++ return bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, &wb->k, + BTREE_UPDATE_internal_snapshot_node); +- bch2_trans_iter_exit(trans, &iter); +- return ret; + } + + static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) +@@ -256,19 +254,17 @@ static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) + bch2_btree_write_buffer_journal_flush); + + if (j->watermark) { +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); + bch2_journal_set_watermark(j); +- spin_unlock(&j->lock); + } + + BUG_ON(wb->sorted.size < wb->flushing.keys.nr); + } + +-int bch2_btree_write_buffer_insert_err(struct btree_trans *trans, ++int bch2_btree_write_buffer_insert_err(struct bch_fs *c, + enum btree_id btree, struct bkey_i *k) + { +- struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + prt_printf(&buf, "attempting to do write buffer update on non wb btree="); + 
bch2_btree_id_to_text(&buf, btree); +@@ -276,7 +272,6 @@ int bch2_btree_write_buffer_insert_err(struct btree_trans *trans, + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + bch2_fs_inconsistent(c, "%s", buf.buf); +- printbuf_exit(&buf); + return -EROFS; + } + +@@ -285,7 +280,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree_write_buffer *wb = &c->btree_write_buffer; +- struct btree_iter iter = {}; ++ struct btree_iter iter = { NULL }; + size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; + bool write_locked = false; + bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); +@@ -298,9 +293,8 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) + bch2_trans_unlock(trans); + bch2_trans_begin(trans); + +- mutex_lock(&wb->inc.lock); +- move_keys_from_inc_to_flushing(wb); +- mutex_unlock(&wb->inc.lock); ++ scoped_guard(mutex, &wb->inc.lock) ++ move_keys_from_inc_to_flushing(wb); + + for (size_t i = 0; i < wb->flushing.keys.nr; i++) { + wb->sorted.data[i].idx = i; +@@ -328,10 +322,9 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) + darray_for_each(wb->sorted, i) { + struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; + +- if (unlikely(!btree_type_uses_write_buffer(k->btree))) { +- ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k); ++ ret = bch2_btree_write_buffer_insert_checks(c, k->btree, &k->k); ++ if (unlikely(ret)) + goto err; +- } + + for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) + prefetch(&wb->flushing.keys.data[n->idx]); +@@ -368,7 +361,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) + write_locked = false; + + ret = lockrestart_do(trans, +- bch2_btree_iter_traverse(trans, &iter) ?: ++ bch2_btree_iter_traverse(&iter) ?: + bch2_foreground_maybe_merge(trans, iter.path, 0, + BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_journal_reclaim| +@@ -380,18 +373,18 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) + } + + if (!iter.path || iter.btree_id != k->btree) { +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, + BTREE_ITER_intent|BTREE_ITER_all_snapshots); + } + +- bch2_btree_iter_set_pos(trans, &iter, k->k.k.p); ++ bch2_btree_iter_set_pos(&iter, k->k.k.p); + btree_iter_path(trans, &iter)->preserve = false; + + bool accounting_accumulated = false; + do { + if (race_fault()) { +- ret = -BCH_ERR_journal_reclaim_would_deadlock; ++ ret = bch_err_throw(c, journal_reclaim_would_deadlock); + break; + } + +@@ -414,7 +407,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) + struct btree_path *path = btree_iter_path(trans, &iter); + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + } +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + + if (ret) + goto err; +@@ -532,9 +525,8 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) + ret = bch2_journal_keys_to_write_buffer(c, buf); + + if (!blocked && !ret) { +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); + buf->need_flush_to_write_buffer = false; +- spin_unlock(&j->lock); + } + + mutex_unlock(&j->buf_lock); +@@ -566,9 +558,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq, + * On memory allocation failure, 
bch2_btree_write_buffer_flush_locked() + * is not guaranteed to empty wb->inc: + */ +- mutex_lock(&wb->flushing.lock); +- ret = bch2_btree_write_buffer_flush_locked(trans); +- mutex_unlock(&wb->flushing.lock); ++ scoped_guard(mutex, &wb->flushing.lock) ++ ret = bch2_btree_write_buffer_flush_locked(trans); + } while (!ret && + (fetch_from_journal_err || + (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) || +@@ -581,9 +572,10 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ CLASS(btree_trans, trans)(c); + bool did_work = false; + +- return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work)); ++ return btree_write_buffer_flush_seq(trans, seq, &did_work); + } + + int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) +@@ -605,9 +597,9 @@ bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c) + if (bch2_journal_error(&c->journal)) + return false; + ++ CLASS(btree_trans, trans)(c); + bool did_work = false; +- bch2_trans_run(c, btree_write_buffer_flush_seq(trans, +- journal_cur_seq(&c->journal), &did_work)); ++ btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work); + return did_work; + } + +@@ -629,11 +621,11 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) + { + struct bch_fs *c = trans->c; + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) +- return -BCH_ERR_erofs_no_writes; ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer)) ++ return bch_err_throw(c, erofs_no_writes); + + int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); +- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); + return ret; + } + +@@ -654,11 +646,10 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, + + if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { + if (trace_write_buffer_maybe_flush_enabled()) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_bkey_val_to_text(&buf, c, referring_k); + trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf); +- printbuf_exit(&buf); + } + + bch2_bkey_buf_reassemble(&tmp, c, referring_k); +@@ -673,7 +664,10 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, + goto err; + + bch2_bkey_buf_copy(last_flushed, c, tmp.k); +- ret = -BCH_ERR_transaction_restart_write_buffer_flush; ++ ++ /* can we avoid the unconditional restart? 
*/ ++ trace_and_count(c, trans_restart_write_buffer_flush, trans, _RET_IP_); ++ ret = bch_err_throw(c, transaction_restart_write_buffer_flush); + } + err: + bch2_bkey_buf_exit(&tmp, c); +@@ -686,13 +680,14 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work) + struct btree_write_buffer *wb = &c->btree_write_buffer; + int ret; + +- mutex_lock(&wb->flushing.lock); +- do { +- ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); +- } while (!ret && bch2_btree_write_buffer_should_flush(c)); +- mutex_unlock(&wb->flushing.lock); ++ scoped_guard(mutex, &wb->flushing.lock) { ++ CLASS(btree_trans, trans)(c); ++ do { ++ ret = bch2_btree_write_buffer_flush_locked(trans); ++ } while (!ret && bch2_btree_write_buffer_should_flush(c)); ++ } + +- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); + } + + static void wb_accounting_sort(struct btree_write_buffer *wb) +@@ -821,9 +816,9 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ + bch2_journal_pin_drop(&c->journal, &dst->wb->pin); + + if (bch2_btree_write_buffer_should_flush(c) && +- __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && ++ __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) && + !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) +- bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); + + if (dst->wb == &wb->flushing) + mutex_unlock(&wb->flushing.lock); +@@ -866,13 +861,18 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) + darray_exit(&wb->inc.keys); + } + +-int bch2_fs_btree_write_buffer_init(struct bch_fs *c) ++void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c) + { + struct btree_write_buffer *wb = &c->btree_write_buffer; + + mutex_init(&wb->inc.lock); + mutex_init(&wb->flushing.lock); + INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); ++} ++ ++int bch2_fs_btree_write_buffer_init(struct bch_fs *c) ++{ ++ struct btree_write_buffer *wb = &c->btree_write_buffer; + + /* Will be resized by journal as needed: */ + unsigned initial_size = 1 << 16; +diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h +index d535cea28bde..e484cd6b90b0 100644 +--- a/fs/bcachefs/btree_write_buffer.h ++++ b/fs/bcachefs/btree_write_buffer.h +@@ -89,6 +89,10 @@ static inline int bch2_journal_key_to_wb(struct bch_fs *c, + struct journal_keys_to_wb *dst, + enum btree_id btree, struct bkey_i *k) + { ++ int ret = bch2_btree_write_buffer_insert_checks(c, btree, k); ++ if (unlikely(ret)) ++ return ret; ++ + EBUG_ON(!dst->seq); + + return k->k.type == KEY_TYPE_accounting +@@ -101,6 +105,7 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_t + + int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); + void bch2_fs_btree_write_buffer_exit(struct bch_fs *); ++void bch2_fs_btree_write_buffer_init_early(struct bch_fs *); + int bch2_fs_btree_write_buffer_init(struct bch_fs *); + + #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index 31fbc2716d8b..87a6f4dce296 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -71,13 +71,8 @@ __bch2_fs_usage_read_short(struct bch_fs *c) + struct bch_fs_usage_short + bch2_fs_usage_read_short(struct bch_fs *c) + { +- struct bch_fs_usage_short ret; +- +- percpu_down_read(&c->mark_lock); +- ret = 
__bch2_fs_usage_read_short(c); +- percpu_up_read(&c->mark_lock); +- +- return ret; ++ guard(percpu_read)(&c->mark_lock); ++ return __bch2_fs_usage_read_short(c); + } + + void bch2_dev_usage_to_text(struct printbuf *out, +@@ -113,10 +108,10 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, + bool *do_update) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + +- struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); ++ CLASS(bch2_dev_tryget, ca)(c, p.ptr.dev); + if (!ca) { + if (fsck_err_on(p.ptr.dev != BCH_SB_MEMBER_INVALID, + trans, ptr_to_invalid_device, +@@ -138,7 +133,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; +- goto out; ++ return 0; + } + + enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry); +@@ -156,10 +151,14 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, + g->gen_valid = true; + g->gen = p.ptr.gen; + } else { ++ /* this pointer will be dropped */ + *do_update = true; ++ return 0; + } + } + ++ /* g->gen_valid == true */ ++ + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, + trans, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" +@@ -172,15 +171,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, + if (!p.ptr.cached && + (g->data_type != BCH_DATA_btree || + data_type == BCH_DATA_btree)) { +- g->gen_valid = true; +- g->gen = p.ptr.gen; +- g->data_type = 0; ++ g->data_type = data_type; + g->stripe_sectors = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; +- } else { +- *do_update = true; + } ++ ++ *do_update = true; + } + + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, +@@ -206,7 +203,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, + *do_update = true; + + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) +- goto out; ++ return 0; + + if (fsck_err_on(bucket_data_type_mismatch(g->data_type, data_type), + trans, ptr_bucket_data_type_mismatch, +@@ -217,9 +214,21 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, + bch2_data_type_str(data_type), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { +- if (data_type == BCH_DATA_btree) { +- g->gen_valid = true; +- g->gen = p.ptr.gen; ++ if (!p.ptr.cached && ++ data_type == BCH_DATA_btree) { ++ switch (g->data_type) { ++ case BCH_DATA_sb: ++ bch_err(c, "btree and superblock in the same bucket - cannot repair"); ++ return bch_err_throw(c, fsck_repair_unimplemented); ++ case BCH_DATA_journal: ++ ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr)); ++ bch_err_msg(c, ret, "error deleting journal bucket %zu", ++ PTR_BUCKET_NR(ca, &p.ptr)); ++ if (ret) ++ return ret; ++ break; ++ } ++ + g->data_type = data_type; + g->stripe_sectors = 0; + g->dirty_sectors = 0; +@@ -250,10 +259,7 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + *do_update = true; + } +-out: + fsck_err: +- bch2_dev_put(ca); +- printbuf_exit(&buf); + return ret; + } + +@@ -266,30 +272,26 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, + const union bch_extent_entry *entry_c; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + ++ /* We don't yet do btree key updates correctly for when we're RW */ ++ BUG_ON(test_bit(BCH_FS_rw, &c->flags)); ++ + bkey_for_each_ptr_decode(k.k, 
ptrs_c, p, entry_c) {
+ ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update);
+ if (ret)
+- goto err;
++ return ret;
+ }
+ 
+ if (do_update) {
+- if (flags & BTREE_TRIGGER_is_root) {
+- bch_err(c, "cannot update btree roots yet");
+- ret = -EINVAL;
+- goto err;
+- }
+-
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+- goto err;
++ return ret;
+ 
+- rcu_read_lock();
+- bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
+- rcu_read_unlock();
++ scoped_guard(rcu)
++ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev));
+ 
+ if (level) {
+ /*
+@@ -298,14 +300,11 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
+ * sort it out:
+ */
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+- rcu_read_lock();
+- bkey_for_each_ptr(ptrs, ptr) {
+- struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
+- struct bucket *g = PTR_GC_BUCKET(ca, ptr);
+-
+- ptr->gen = g->gen;
+- }
+- rcu_read_unlock();
++ scoped_guard(rcu)
++ bkey_for_each_ptr(ptrs, ptr) {
++ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev);
++ ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen;
++ }
+ } else {
+ struct bkey_ptrs ptrs;
+ union bch_extent_entry *entry;
+@@ -369,52 +368,76 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
+ bch_info(c, "new key %s", buf.buf);
+ }
+ 
+- struct btree_iter iter;
+- bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
+- BTREE_ITER_intent|BTREE_ITER_all_snapshots);
+- ret = bch2_btree_iter_traverse(trans, &iter) ?:
+- bch2_trans_update(trans, &iter, new,
+- BTREE_UPDATE_internal_snapshot_node|
+- BTREE_TRIGGER_norun);
+- bch2_trans_iter_exit(trans, &iter);
+- if (ret)
+- goto err;
++ if (!(flags & BTREE_TRIGGER_is_root)) {
++ struct btree_iter iter;
++ bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level,
++ BTREE_ITER_intent|BTREE_ITER_all_snapshots);
++ ret = bch2_btree_iter_traverse(&iter) ?:
++ bch2_trans_update(trans, &iter, new,
++ BTREE_UPDATE_internal_snapshot_node|
++ BTREE_TRIGGER_norun);
++ bch2_trans_iter_exit(&iter);
++ if (ret)
++ return ret;
++
++ if (level)
++ bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
++ } else {
++ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
++ jset_u64s(new->k.u64s));
++ ret = PTR_ERR_OR_ZERO(e);
++ if (ret)
++ return ret;
+ 
+- if (level)
+- bch2_btree_node_update_key_early(trans, btree, level - 1, k, new);
++ journal_entry_set(e,
++ BCH_JSET_ENTRY_btree_root,
++ btree, level - 1,
++ new, new->k.u64s);
++
++ /*
++ * no locking, we're single threaded and not rw yet, see
++ * the big assertion above that we repeat here:
++ */
++ BUG_ON(test_bit(BCH_FS_rw, &c->flags));
++
++ struct btree *b = bch2_btree_id_root(c, btree)->b;
++ bkey_copy(&b->key, new);
++ }
+ }
+-err:
+- printbuf_exit(&buf);
+- return ret;
++
++ return 0;
+ }
+ 
+ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf,
+ struct bkey_s_c k, bool insert, enum bch_sb_error_id id)
+ {
+ struct bch_fs *c = trans->c;
+- bool repeat = false, print = true, suppress = false;
+ 
+ prt_printf(buf, "\nwhile marking ");
+ bch2_bkey_val_to_text(buf, c, k);
+ prt_newline(buf);
+ 
+- __bch2_count_fsck_err(c, id, buf->buf, &repeat, &print, &suppress);
++ bool print = __bch2_count_fsck_err(c, id, buf);
+ 
+- int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations);
++ int ret = bch2_run_explicit_recovery_pass(c, buf,
++ BCH_RECOVERY_PASS_check_allocations, 0);
+ 
+ if (insert) {
+- print = true;
+- suppress = 
false; +- + bch2_trans_updates_to_text(buf, trans); + __bch2_inconsistent_error(c, buf); +- ret = -BCH_ERR_bucket_ref_update; ++ /* ++ * If we're in recovery, run_explicit_recovery_pass might give ++ * us an error code for rewinding recovery ++ */ ++ if (!ret) ++ ret = bch_err_throw(c, bucket_ref_update); ++ } else { ++ /* Always ignore overwrite errors, so that deletion works */ ++ ret = 0; + } + +- if (suppress) +- prt_printf(buf, "Ratelimiting new instances of previous error\n"); +- if (print) +- bch2_print_string_as_lines(KERN_ERR, buf->buf); ++ if (print || insert) ++ bch2_print_str(c, KERN_ERR, buf->buf); + return ret; + } + +@@ -427,9 +450,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + { + struct bch_fs *c = trans->c; + size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bool inserting = sectors > 0; +- int ret = 0; + + BUG_ON(!sectors); + +@@ -441,9 +463,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + bch2_data_type_str(bucket_data_type ?: ptr_data_type), + ptr->gen); + +- ret = bucket_ref_update_err(trans, &buf, k, inserting, +- BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen); +- goto out; ++ return bucket_ref_update_err(trans, &buf, k, inserting, ++ BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen); + } + + if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) { +@@ -454,15 +475,12 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + bch2_data_type_str(bucket_data_type ?: ptr_data_type), + ptr->gen); + +- ret = bucket_ref_update_err(trans, &buf, k, inserting, +- BCH_FSCK_ERR_ptr_too_stale); +- goto out; ++ return bucket_ref_update_err(trans, &buf, k, inserting, ++ BCH_FSCK_ERR_ptr_too_stale); + } + +- if (b_gen != ptr->gen && ptr->cached) { +- ret = 1; +- goto out; +- } ++ if (b_gen != ptr->gen && ptr->cached) ++ return 1; + + if (unlikely(b_gen != ptr->gen)) { + bch2_log_msg_start(c, &buf); +@@ -473,9 +491,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + bch2_data_type_str(bucket_data_type ?: ptr_data_type), + ptr->gen); + +- ret = bucket_ref_update_err(trans, &buf, k, inserting, +- BCH_FSCK_ERR_stale_dirty_ptr); +- goto out; ++ return bucket_ref_update_err(trans, &buf, k, inserting, ++ BCH_FSCK_ERR_stale_dirty_ptr); + } + + if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) { +@@ -485,9 +502,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + bch2_data_type_str(bucket_data_type), + bch2_data_type_str(ptr_data_type)); + +- ret = bucket_ref_update_err(trans, &buf, k, inserting, ++ return bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_ptr_bucket_data_type_mismatch); +- goto out; + } + + if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) { +@@ -498,16 +514,13 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + bch2_data_type_str(bucket_data_type ?: ptr_data_type), + *bucket_sectors, sectors); + +- ret = bucket_ref_update_err(trans, &buf, k, inserting, +- BCH_FSCK_ERR_bucket_sector_count_overflow); + sectors = -*bucket_sectors; +- goto out; ++ return bucket_ref_update_err(trans, &buf, k, inserting, ++ BCH_FSCK_ERR_bucket_sector_count_overflow); + } + + *bucket_sectors += sectors; +-out: +- printbuf_exit(&buf); +- return ret; ++ return 0; + } + + void bch2_trans_account_disk_usage_change(struct btree_trans *trans) +@@ -517,7 +530,7 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) + static 
int warned_disk_usage = 0; + bool warn = false; + +- percpu_down_read(&c->mark_lock); ++ guard(percpu_read)(&c->mark_lock); + struct bch_fs_usage_base *src = &trans->fs_usage_delta; + + s64 added = src->btree + src->data + src->reserved; +@@ -545,11 +558,10 @@ void bch2_trans_account_disk_usage_change(struct btree_trans *trans) + this_cpu_sub(*c->online_reserved, added); + } + +- preempt_disable(); +- struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); +- acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); +- preempt_enable(); +- percpu_up_read(&c->mark_lock); ++ scoped_guard(preempt) { ++ struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); ++ acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); ++ } + + if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) + bch2_trans_inconsistent(trans, +@@ -588,40 +600,34 @@ static int bch2_trigger_pointer(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + bool insert = !(flags & BTREE_TRIGGER_overwrite); +- struct printbuf buf = PRINTBUF; +- int ret = 0; ++ CLASS(printbuf, buf)(); + + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp); + + *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len; + +- struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); ++ CLASS(bch2_dev_tryget, ca)(c, p.ptr.dev); + if (unlikely(!ca)) { + if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) +- ret = -BCH_ERR_trigger_pointer; +- goto err; ++ return bch_err_throw(c, trigger_pointer); ++ return 0; + } + + struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); + if (!bucket_valid(ca, bucket.offset)) { + if (insert) { + bch2_dev_bucket_missing(ca, bucket.offset); +- ret = -BCH_ERR_trigger_pointer; ++ return bch_err_throw(c, trigger_pointer); + } +- goto err; ++ return 0; + } + + if (flags & BTREE_TRIGGER_transactional) { + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); +- ret = PTR_ERR_OR_ZERO(a) ?: +- __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); +- if (ret) +- goto err; +- +- ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); +- if (ret) +- goto err; ++ return PTR_ERR_OR_ZERO(a) ?: ++ __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert) ?: ++ bch2_bucket_backpointer_mod(trans, k, &bp, insert); + } + + if (flags & BTREE_TRIGGER_gc) { +@@ -629,23 +635,22 @@ static int bch2_trigger_pointer(struct btree_trans *trans, + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + p.ptr.dev, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { +- ret = -BCH_ERR_trigger_pointer; +- goto err; ++ return bch_err_throw(c, trigger_pointer); + } + + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; +- ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); ++ int ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); + alloc_to_bucket(g, new); + bucket_unlock(g); + +- if (!ret) +- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); ++ if (ret) ++ return ret; ++ ++ return bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); + } +-err: +- bch2_dev_put(ca); +- printbuf_exit(&buf); +- return ret; ++ ++ return 0; + } + + static int bch2_trigger_stripe_ptr(struct btree_trans *trans, +@@ -655,25 +660,26 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, + s64 sectors, + enum btree_iter_update_trigger_flags flags) + { ++ struct bch_fs *c = trans->c; ++ + if (flags & BTREE_TRIGGER_transactional) 
{ +- struct btree_iter iter; +- struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, +- BTREE_ID_stripes, POS(0, p.ec.idx), +- BTREE_ITER_with_updates, stripe); ++ struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, ++ BTREE_ID_stripes, POS(0, p.ec.idx), ++ BTREE_ITER_with_updates, ++ stripe); + int ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx); +- goto err; ++ return ret; + } + + if (!bch2_ptr_matches_stripe(&s->v, p)) { + bch2_trans_inconsistent(trans, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); +- ret = -BCH_ERR_trigger_stripe_pointer; +- goto err; ++ return bch_err_throw(c, trigger_stripe_pointer); + } + + stripe_blockcount_set(&s->v, p.ec.block, +@@ -685,35 +691,29 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, + acc.type = BCH_DISK_ACCOUNTING_replicas; + bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = data_type; +- ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); + } + + if (flags & BTREE_TRIGGER_gc) { +- struct bch_fs *c = trans->c; +- + struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + (u64) p.ec.idx); +- return -BCH_ERR_ENOMEM_mark_stripe_ptr; ++ return bch_err_throw(c, ENOMEM_mark_stripe_ptr); + } + + gc_stripe_lock(m); + + if (!m || !m->alive) { + gc_stripe_unlock(m); +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ", + (u64) p.ec.idx); + bch2_bkey_val_to_text(&buf, c, k); + __bch2_inconsistent_error(c, &buf); +- bch2_print_string_as_lines(KERN_ERR, buf.buf); +- printbuf_exit(&buf); +- return -BCH_ERR_trigger_stripe_pointer; ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ return bch_err_throw(c, trigger_stripe_pointer); + } + + m->block_sectors[p.ec.block] += sectors; +@@ -736,8 +736,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, + static int __trigger_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, +- enum btree_iter_update_trigger_flags flags, +- s64 *replicas_sectors) ++ enum btree_iter_update_trigger_flags flags) + { + bool gc = flags & BTREE_TRIGGER_gc; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +@@ -748,6 +747,8 @@ static int __trigger_extent(struct btree_trans *trans, + : BCH_DATA_user; + int ret = 0; + ++ s64 replicas_sectors = 0; ++ + struct disk_accounting_pos acc_replicas_key; + memset(&acc_replicas_key, 0, sizeof(acc_replicas_key)); + acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas; +@@ -774,7 +775,7 @@ static int __trigger_extent(struct btree_trans *trans, + if (ret) + return ret; + } else if (!p.has_ec) { +- *replicas_sectors += disk_sectors; ++ replicas_sectors += disk_sectors; + replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); + } else { + ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); +@@ -812,13 +813,13 @@ static int __trigger_extent(struct btree_trans *trans, + } + + if (acc_replicas_key.replicas.nr_devs) { +- ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc); ++ ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, 
&replicas_sectors, 1, gc); + if (ret) + return ret; + } + + if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { +- ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot); ++ ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot); + if (ret) + return ret; + } +@@ -834,7 +835,7 @@ static int __trigger_extent(struct btree_trans *trans, + } + + if (level) { +- ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); ++ ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id); + if (ret) + return ret; + } else { +@@ -843,7 +844,7 @@ static int __trigger_extent(struct btree_trans *trans, + s64 v[3] = { + insert ? 1 : -1, + insert ? k.k->size : -((s64) k.k->size), +- *replicas_sectors, ++ replicas_sectors, + }; + ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); + if (ret) +@@ -875,20 +876,16 @@ int bch2_trigger_extent(struct btree_trans *trans, + return 0; + + if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { +- s64 old_replicas_sectors = 0, new_replicas_sectors = 0; +- + if (old.k->type) { + int ret = __trigger_extent(trans, btree, level, old, +- flags & ~BTREE_TRIGGER_insert, +- &old_replicas_sectors); ++ flags & ~BTREE_TRIGGER_insert); + if (ret) + return ret; + } + + if (new.k->type) { + int ret = __trigger_extent(trans, btree, level, new.s_c, +- flags & ~BTREE_TRIGGER_overwrite, +- &new_replicas_sectors); ++ flags & ~BTREE_TRIGGER_overwrite); + if (ret) + return ret; + } +@@ -966,15 +963,24 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + return PTR_ERR(a); + + if (a->v.data_type && type && a->v.data_type != type) { +- bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); +- log_fsck_err(trans, bucket_metadata_type_mismatch, +- "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" +- "while marking %s", +- iter.pos.inode, iter.pos.offset, a->v.gen, +- bch2_data_type_str(a->v.data_type), +- bch2_data_type_str(type), +- bch2_data_type_str(type)); +- ret = -BCH_ERR_metadata_bucket_inconsistency; ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(c, &buf); ++ prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" ++ "while marking %s\n", ++ iter.pos.inode, iter.pos.offset, a->v.gen, ++ bch2_data_type_str(a->v.data_type), ++ bch2_data_type_str(type), ++ bch2_data_type_str(type)); ++ ++ bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); ++ ++ ret = bch2_run_explicit_recovery_pass(c, &buf, ++ BCH_RECOVERY_PASS_check_allocations, 0); ++ ++ /* Always print, this is always fatal */ ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ if (!ret) ++ ret = bch_err_throw(c, metadata_bucket_inconsistency); + goto err; + } + +@@ -985,8 +991,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + } + err: +-fsck_err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -995,7 +1000,6 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * + enum btree_iter_update_trigger_flags flags) + { + struct bch_fs *c = trans->c; +- int ret = 0; + + struct bucket *g = gc_bucket(ca, b); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", +@@ -1023,12 +1027,11 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev 
* + g->dirty_sectors += sectors; + struct bch_alloc_v4 new = bucket_m_to_alloc(*g); + bucket_unlock(g); +- ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); +- return ret; ++ return bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); + err_unlock: + bucket_unlock(g); + err: +- return -BCH_ERR_metadata_bucket_inconsistency; ++ return bch_err_throw(c, metadata_bucket_inconsistency); + } + + int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, +@@ -1086,10 +1089,10 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *c + enum btree_iter_update_trigger_flags flags) + { + struct bch_fs *c = trans->c; ++ struct bch_sb_layout layout; + +- mutex_lock(&c->sb_lock); +- struct bch_sb_layout layout = ca->disk_sb.sb->layout; +- mutex_unlock(&c->sb_lock); ++ scoped_guard(mutex, &c->sb_lock) ++ layout = ca->disk_sb.sb->layout; + + u64 bucket = 0; + unsigned i, bucket_sectors = 0; +@@ -1134,8 +1137,8 @@ static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, struct bch_dev *c + int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, + enum btree_iter_update_trigger_flags flags) + { +- int ret = bch2_trans_run(c, +- __bch2_trans_mark_dev_sb(trans, ca, flags)); ++ CLASS(btree_trans, trans)(c); ++ int ret = __bch2_trans_mark_dev_sb(trans, ca, flags); + bch_err_fn(c, ret); + return ret; + } +@@ -1143,10 +1146,10 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, + int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, + enum btree_iter_update_trigger_flags flags) + { +- for_each_online_member(c, ca) { ++ for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) { + int ret = bch2_trans_mark_dev_sb(c, ca, flags); + if (ret) { +- percpu_ref_put(&ca->io_ref[READ]); ++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs); + return ret; + } + } +@@ -1188,15 +1191,38 @@ bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) + + #define SECTORS_CACHE 1024 + ++static int disk_reservation_recalc_sectors_available(struct bch_fs *c, ++ struct disk_reservation *res, ++ u64 sectors, enum bch_reservation_flags flags) ++{ ++ guard(mutex)(&c->sectors_available_lock); ++ ++ percpu_u64_set(&c->pcpu->sectors_available, 0); ++ u64 sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); ++ ++ if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL)) ++ sectors = min(sectors, sectors_available); ++ ++ if (sectors <= sectors_available || ++ (flags & BCH_DISK_RESERVATION_NOFAIL)) { ++ atomic64_set(&c->sectors_available, ++ max_t(s64, 0, sectors_available - sectors)); ++ this_cpu_add(*c->online_reserved, sectors); ++ res->sectors += sectors; ++ return 0; ++ } else { ++ atomic64_set(&c->sectors_available, sectors_available); ++ return bch_err_throw(c, ENOSPC_disk_reservation); ++ } ++} ++ + int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + u64 sectors, enum bch_reservation_flags flags) + { + struct bch_fs_pcpu *pcpu; + u64 old, get; +- u64 sectors_available; +- int ret; + +- percpu_down_read(&c->mark_lock); ++ guard(percpu_read)(&c->mark_lock); + preempt_disable(); + pcpu = this_cpu_ptr(c->pcpu); + +@@ -1207,9 +1233,10 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + do { + get = min((u64) sectors + SECTORS_CACHE, old); + +- if (get < sectors) { ++ if (unlikely(get < sectors)) { + preempt_enable(); +- goto recalculate; ++ return disk_reservation_recalc_sectors_available(c, ++ res, sectors, flags); + } + } while 
(!atomic64_try_cmpxchg(&c->sectors_available, + &old, old - get)); +@@ -1220,36 +1247,8 @@ int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + pcpu->sectors_available -= sectors; + this_cpu_add(*c->online_reserved, sectors); + res->sectors += sectors; +- + preempt_enable(); +- percpu_up_read(&c->mark_lock); + return 0; +- +-recalculate: +- mutex_lock(&c->sectors_available_lock); +- +- percpu_u64_set(&c->pcpu->sectors_available, 0); +- sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); +- +- if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL)) +- sectors = min(sectors, sectors_available); +- +- if (sectors <= sectors_available || +- (flags & BCH_DISK_RESERVATION_NOFAIL)) { +- atomic64_set(&c->sectors_available, +- max_t(s64, 0, sectors_available - sectors)); +- this_cpu_add(*c->online_reserved, sectors); +- res->sectors += sectors; +- ret = 0; +- } else { +- atomic64_set(&c->sectors_available, sectors_available); +- ret = -BCH_ERR_ENOSPC_disk_reservation; +- } +- +- mutex_unlock(&c->sectors_available_lock); +- percpu_up_read(&c->mark_lock); +- +- return ret; + } + + /* Startup/shutdown: */ +@@ -1272,7 +1271,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c) + GFP_KERNEL|__GFP_ZERO); + if (!ca->buckets_nouse) { + bch2_dev_put(ca); +- return -BCH_ERR_ENOMEM_buckets_nouse; ++ return bch_err_throw(c, ENOMEM_buckets_nouse); + } + } + +@@ -1297,12 +1296,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + lockdep_assert_held(&c->state_lock); + + if (resize && ca->buckets_nouse) +- return -BCH_ERR_no_resize_with_buckets_nouse; ++ return bch_err_throw(c, no_resize_with_buckets_nouse); + + bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), + GFP_KERNEL|__GFP_ZERO); + if (!bucket_gens) { +- ret = -BCH_ERR_ENOMEM_bucket_gens; ++ ret = bch_err_throw(c, ENOMEM_bucket_gens); + goto err; + } + +@@ -1321,6 +1320,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + sizeof(bucket_gens->b[0]) * copy); + } + ++ ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch, ++ ca->mi.nbuckets, nbuckets) ?: ++ bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty, ++ ca->mi.nbuckets, nbuckets); ++ + rcu_assign_pointer(ca->bucket_gens, bucket_gens); + bucket_gens = old_bucket_gens; + +@@ -1345,7 +1349,7 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) + { + ca->usage = alloc_percpu(struct bch_dev_usage_full); + if (!ca->usage) +- return -BCH_ERR_ENOMEM_usage_init; ++ return bch_err_throw(c, ENOMEM_usage_init); + + return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); + } +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index af1532de4a37..49a3807a5eab 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -84,10 +84,8 @@ static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b) + + static inline int bucket_gen_get(struct bch_dev *ca, size_t b) + { +- rcu_read_lock(); +- int ret = bucket_gen_get_rcu(ca, b); +- rcu_read_unlock(); +- return ret; ++ guard(rcu)(); ++ return bucket_gen_get_rcu(ca, b); + } + + static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, +@@ -156,10 +154,8 @@ static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ + */ + static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) + { +- rcu_read_lock(); +- int ret = dev_ptr_stale_rcu(ca, ptr); +- rcu_read_unlock(); +- return ret; ++ guard(rcu)(); ++ return 
dev_ptr_stale_rcu(ca, ptr); + } + + /* Device usage: */ +diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c +index c8a488e6b7b8..ca341586920b 100644 +--- a/fs/bcachefs/buckets_waiting_for_journal.c ++++ b/fs/bcachefs/buckets_waiting_for_journal.c +@@ -25,25 +25,20 @@ static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_ + u64 bch2_bucket_journal_seq_ready(struct buckets_waiting_for_journal *b, + unsigned dev, u64 bucket) + { +- struct buckets_waiting_for_journal_table *t; + u64 dev_bucket = (u64) dev << 56 | bucket; +- u64 ret = 0; + +- mutex_lock(&b->lock); +- t = b->t; ++ guard(mutex)(&b->lock); ++ ++ struct buckets_waiting_for_journal_table *t = b->t; + + for (unsigned i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); + +- if (h->dev_bucket == dev_bucket) { +- ret = h->journal_seq; +- break; +- } ++ if (h->dev_bucket == dev_bucket) ++ return h->journal_seq; + } + +- mutex_unlock(&b->lock); +- +- return ret; ++ return 0; + } + + static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, +@@ -92,12 +87,11 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, + .journal_seq = journal_seq, + }; + size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0, nr_rehashes_this_size = 0; +- int ret = 0; + +- mutex_lock(&b->lock); ++ guard(mutex)(&b->lock); + + if (likely(bucket_table_insert(b->t, &new, flushed_seq))) +- goto out; ++ return 0; + + t = b->t; + size = 1UL << t->bits; +@@ -108,8 +102,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, + realloc: + n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); + if (!n) { +- ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; +- goto out; ++ struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal); ++ return bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set); + } + + retry_rehash: +@@ -142,10 +136,7 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, + + pr_debug("took %zu rehashes, table at %zu/%lu elements", + nr_rehashes, nr_elements, 1UL << b->t->bits); +-out: +- mutex_unlock(&b->lock); +- +- return ret; ++ return 0; + } + + void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index 5891b3a1e61c..467fc45e84fe 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -52,6 +52,11 @@ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, + return ca; + } + ++DEFINE_CLASS(bch2_device_lookup, struct bch_dev *, ++ bch2_dev_put(_T), ++ bch2_device_lookup(c, dev, flags), ++ struct bch_fs *c, u64 dev, unsigned flags); ++ + #if 0 + static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) + { +@@ -207,8 +212,6 @@ static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) + + static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) + { +- struct bch_dev *ca; +- + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + +@@ -219,7 +222,7 @@ static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) + arg.pad) + return -EINVAL; + +- ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ struct bch_dev *ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + +@@ -249,9 +252,6 @@ static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) + + static long 
bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) + { +- struct bch_dev *ca; +- int ret; +- + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + +@@ -262,21 +262,16 @@ static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) + arg.pad) + return -EINVAL; + +- ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + +- ret = bch2_dev_offline(c, ca, arg.flags); +- bch2_dev_put(ca); +- return ret; ++ return bch2_dev_offline(c, ca, arg.flags); + } + + static long bch2_ioctl_disk_set_state(struct bch_fs *c, + struct bch_ioctl_disk_set_state arg) + { +- struct bch_dev *ca; +- int ret; +- + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + +@@ -288,15 +283,12 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c, + arg.new_state >= BCH_MEMBER_STATE_NR) + return -EINVAL; + +- ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + +- ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); +- if (ret) +- bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); +- +- bch2_dev_put(ca); ++ int ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); ++ bch_err_msg(ca, ret, "setting device state"); + return ret; + } + +@@ -312,13 +304,14 @@ static int bch2_data_thread(void *arg) + { + struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); + +- ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); ++ ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, &ctx->arg); + if (ctx->thr.ret == -BCH_ERR_device_offline) + ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; + else { + ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; + ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; + } ++ enumerated_ref_put(&ctx->c->writes, BCH_WRITE_REF_ioctl_data); + return 0; + } + +@@ -348,14 +341,13 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, + }; + + if (ctx->arg.op == BCH_DATA_OP_scrub) { +- struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); ++ CLASS(bch2_dev_tryget_noerror, ca)(c, ctx->arg.scrub.dev); + if (ca) { + struct bch_dev_usage_full u; + bch2_dev_usage_full_read_fast(ca, &u); + for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) + if (ctx->arg.scrub.data_types & BIT(i)) + e.p.sectors_total += u.d[i].sectors; +- bch2_dev_put(ca); + } + } else { + e.p.sectors_total = bch2_fs_usage_read_short(c).used; +@@ -378,15 +370,24 @@ static long bch2_ioctl_data(struct bch_fs *c, + struct bch_data_ctx *ctx; + int ret; + +- if (!capable(CAP_SYS_ADMIN)) +- return -EPERM; ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_ioctl_data)) ++ return -EROFS; + +- if (arg.op >= BCH_DATA_OP_NR || arg.flags) +- return -EINVAL; ++ if (!capable(CAP_SYS_ADMIN)) { ++ ret = -EPERM; ++ goto put_ref; ++ } ++ ++ if (arg.op >= BCH_DATA_OP_NR || arg.flags) { ++ ret = -EINVAL; ++ goto put_ref; ++ } + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); +- if (!ctx) +- return -ENOMEM; ++ if (!ctx) { ++ ret = -ENOMEM; ++ goto put_ref; ++ } + + ctx->c = c; + ctx->arg = arg; +@@ -395,17 +396,21 @@ static long bch2_ioctl_data(struct bch_fs *c, + &bcachefs_data_ops, + bch2_data_thread); + if (ret < 0) +- kfree(ctx); ++ goto cleanup; ++ return ret; ++cleanup: ++ kfree(ctx); ++put_ref: ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_ioctl_data); + return ret; + } + +-static long bch2_ioctl_fs_usage(struct bch_fs *c, ++static noinline_for_stack long 
bch2_ioctl_fs_usage(struct bch_fs *c, + struct bch_ioctl_fs_usage __user *user_arg) + { + struct bch_ioctl_fs_usage arg = {}; +- darray_char replicas = {}; ++ CLASS(darray_char, replicas)(); + u32 replica_entries_bytes; +- int ret = 0; + + if (!test_bit(BCH_FS_started, &c->flags)) + return -EINVAL; +@@ -413,11 +418,11 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, + if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) + return -EFAULT; + +- ret = bch2_fs_replicas_usage_read(c, &replicas) ?: ++ int ret = bch2_fs_replicas_usage_read(c, &replicas) ?: + (replica_entries_bytes < replicas.nr ? -ERANGE : 0) ?: + copy_to_user_errcode(&user_arg->replicas, replicas.data, replicas.nr); + if (ret) +- goto err; ++ return ret; + + struct bch_fs_usage_short u = bch2_fs_usage_read_short(c); + arg.capacity = c->capacity; +@@ -434,52 +439,41 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, + &arg.persistent_reserved[i], 1); + } + +- ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); +-err: +- darray_exit(&replicas); +- return ret; ++ return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); + } + + static long bch2_ioctl_query_accounting(struct bch_fs *c, + struct bch_ioctl_query_accounting __user *user_arg) + { + struct bch_ioctl_query_accounting arg; +- darray_char accounting = {}; +- int ret = 0; ++ CLASS(darray_char, accounting)(); + + if (!test_bit(BCH_FS_started, &c->flags)) + return -EINVAL; + +- ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?: ++ int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)) ?: + bch2_fs_accounting_read(c, &accounting, arg.accounting_types_mask) ?: + (arg.accounting_u64s * sizeof(u64) < accounting.nr ? -ERANGE : 0) ?: + copy_to_user_errcode(&user_arg->accounting, accounting.data, accounting.nr); + if (ret) +- goto err; ++ return ret; + + arg.capacity = c->capacity; + arg.used = bch2_fs_usage_read_short(c).used; + arg.online_reserved = percpu_u64_get(c->online_reserved); + arg.accounting_u64s = accounting.nr / sizeof(u64); + +- ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); +-err: +- darray_exit(&accounting); +- return ret; ++ return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); + } + + /* obsolete, didn't allow for new data types: */ +-static long bch2_ioctl_dev_usage(struct bch_fs *c, ++static noinline_for_stack long bch2_ioctl_dev_usage(struct bch_fs *c, + struct bch_ioctl_dev_usage __user *user_arg) + { +- struct bch_ioctl_dev_usage arg; +- struct bch_dev_usage_full src; +- struct bch_dev *ca; +- unsigned i; +- + if (!test_bit(BCH_FS_started, &c->flags)) + return -EINVAL; + ++ struct bch_ioctl_dev_usage arg; + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + +@@ -489,38 +483,32 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, + arg.pad[2]) + return -EINVAL; + +- ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + +- src = bch2_dev_usage_full_read(ca); ++ struct bch_dev_usage_full src = bch2_dev_usage_full_read(ca); + + arg.state = ca->mi.state; + arg.bucket_size = ca->mi.bucket_size; + arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; + +- for (i = 0; i < ARRAY_SIZE(arg.d); i++) { ++ for (unsigned i = 0; i < ARRAY_SIZE(arg.d); i++) { + arg.d[i].buckets = src.d[i].buckets; + arg.d[i].sectors = src.d[i].sectors; + arg.d[i].fragmented = src.d[i].fragmented; + } + +- bch2_dev_put(ca); +- + return copy_to_user_errcode(user_arg, &arg, sizeof(arg)); + } + + static long 
bch2_ioctl_dev_usage_v2(struct bch_fs *c, + struct bch_ioctl_dev_usage_v2 __user *user_arg) + { +- struct bch_ioctl_dev_usage_v2 arg; +- struct bch_dev_usage_full src; +- struct bch_dev *ca; +- int ret = 0; +- + if (!test_bit(BCH_FS_started, &c->flags)) + return -EINVAL; + ++ struct bch_ioctl_dev_usage_v2 arg; + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + +@@ -530,20 +518,20 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, + arg.pad[2]) + return -EINVAL; + +- ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + +- src = bch2_dev_usage_full_read(ca); ++ struct bch_dev_usage_full src = bch2_dev_usage_full_read(ca); + + arg.state = ca->mi.state; + arg.bucket_size = ca->mi.bucket_size; + arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR); + arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; + +- ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); ++ int ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg)); + if (ret) +- goto err; ++ return ret; + + for (unsigned i = 0; i < arg.nr_data_types; i++) { + struct bch_ioctl_dev_usage_type t = { +@@ -554,11 +542,10 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, + + ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t)); + if (ret) +- goto err; ++ return ret; + } +-err: +- bch2_dev_put(ca); +- return ret; ++ ++ return 0; + } + + static long bch2_ioctl_read_super(struct bch_fs *c, +@@ -575,13 +562,13 @@ static long bch2_ioctl_read_super(struct bch_fs *c, + arg.pad) + return -EINVAL; + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + + if (arg.flags & BCH_READ_DEV) { + ca = bch2_device_lookup(c, arg.dev, arg.flags); + ret = PTR_ERR_OR_ZERO(ca); + if (ret) +- goto err_unlock; ++ return ret; + + sb = ca->disk_sb.sb; + } else { +@@ -597,8 +584,6 @@ static long bch2_ioctl_read_super(struct bch_fs *c, + vstruct_bytes(sb)); + err: + bch2_dev_put(ca); +-err_unlock: +- mutex_unlock(&c->sb_lock); + return ret; + } + +@@ -613,21 +598,17 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, + if (!dev) + return -EINVAL; + +- for_each_online_member(c, ca) +- if (ca->dev == dev) { +- percpu_ref_put(&ca->io_ref[READ]); ++ guard(rcu)(); ++ for_each_online_member_rcu(c, ca) ++ if (ca->dev == dev) + return ca->dev_idx; +- } + +- return -BCH_ERR_ENOENT_dev_idx_not_found; ++ return bch_err_throw(c, ENOENT_dev_idx_not_found); + } + + static long bch2_ioctl_disk_resize(struct bch_fs *c, + struct bch_ioctl_disk_resize arg) + { +- struct bch_dev *ca; +- int ret; +- + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + +@@ -635,22 +616,16 @@ static long bch2_ioctl_disk_resize(struct bch_fs *c, + arg.pad) + return -EINVAL; + +- ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + +- ret = bch2_dev_resize(c, ca, arg.nbuckets); +- +- bch2_dev_put(ca); +- return ret; ++ return bch2_dev_resize(c, ca, arg.nbuckets); + } + + static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, + struct bch_ioctl_disk_resize_journal arg) + { +- struct bch_dev *ca; +- int ret; +- + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + +@@ -661,14 +636,11 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, + if (arg.nbuckets > U32_MAX) + return -EINVAL; + +- ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ CLASS(bch2_device_lookup, ca)(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + +- ret = 
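/*
 * bch2_ioctl_read_super() above loses its err_unlock label because
 * guard(mutex) releases the lock on every return path. A minimal sketch
 * of the conversion, assuming <linux/cleanup.h> (stub names):
 */
#include <linux/cleanup.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(sb_lock_stub);
static int sb_bytes_stub;

static int read_super_sketch(bool want_dev, int dev)
{
	guard(mutex)(&sb_lock_stub);	/* unlocked at every return */

	if (want_dev && dev < 0)
		return -ENOENT;		/* was: goto err_unlock */

	return sb_bytes_stub;
}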
bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); +- +- bch2_dev_put(ca); +- return ret; ++ return bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); + } + + #define BCH_IOCTL(_name, _argtype) \ +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +index d0a34a097b80..b1b78643d1d0 100644 +--- a/fs/bcachefs/checksum.c ++++ b/fs/bcachefs/checksum.c +@@ -106,8 +106,8 @@ static void bch2_chacha20_init(u32 state[CHACHA_STATE_WORDS], + memzero_explicit(key_words, sizeof(key_words)); + } + +-static void bch2_chacha20(const struct bch_key *key, struct nonce nonce, +- void *data, size_t len) ++void bch2_chacha20(const struct bch_key *key, struct nonce nonce, ++ void *data, size_t len) + { + u32 state[CHACHA_STATE_WORDS]; + +@@ -173,7 +173,7 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, + + if (bch2_fs_inconsistent_on(!c->chacha20_key_set, + c, "attempting to encrypt without encryption key")) +- return -BCH_ERR_no_encryption_key; ++ return bch_err_throw(c, no_encryption_key); + + bch2_chacha20(&c->chacha20_key, nonce, data, len); + return 0; +@@ -262,7 +262,7 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, + + if (bch2_fs_inconsistent_on(!c->chacha20_key_set, + c, "attempting to encrypt without encryption key")) +- return -BCH_ERR_no_encryption_key; ++ return bch_err_throw(c, no_encryption_key); + + bch2_chacha20_init(chacha_state, &c->chacha20_key, nonce); + +@@ -361,7 +361,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, + extent_nonce(version, crc_old), bio); + + if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n" + " expected %0llx:%0llx got %0llx:%0llx (old type ", + __func__, +@@ -374,8 +374,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, + bch2_prt_csum_type(&buf, new_csum_type); + prt_str(&buf, ")"); + WARN_RATELIMIT(1, "%s", buf.buf); +- printbuf_exit(&buf); +- return -BCH_ERR_recompute_checksum; ++ return bch_err_throw(c, recompute_checksum); + } + + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { +@@ -438,23 +437,21 @@ const struct bch_sb_field_ops bch_sb_field_ops_crypt = { + #ifdef __KERNEL__ + static int __bch2_request_key(char *key_description, struct bch_key *key) + { +- struct key *keyring_key; +- const struct user_key_payload *ukp; + int ret; + +- keyring_key = request_key(&key_type_user, key_description, NULL); ++ struct key *keyring_key = request_key(&key_type_user, key_description, NULL); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + +- down_read(&keyring_key->sem); +- ukp = dereference_key_locked(keyring_key); +- if (ukp->datalen == sizeof(*key)) { +- memcpy(key, ukp->data, ukp->datalen); +- ret = 0; +- } else { +- ret = -EINVAL; ++ scoped_guard(rwsem_read, &keyring_key->sem) { ++ const struct user_key_payload *ukp = dereference_key_locked(keyring_key); ++ if (ukp->datalen == sizeof(*key)) { ++ memcpy(key, ukp->data, ukp->datalen); ++ ret = 0; ++ } else { ++ ret = -EINVAL; ++ } + } +- up_read(&keyring_key->sem); + key_put(keyring_key); + + return ret; +@@ -495,14 +492,13 @@ static int __bch2_request_key(char *key_description, struct bch_key *key) + + int bch2_request_key(struct bch_sb *sb, struct bch_key *key) + { +- struct printbuf key_description = PRINTBUF; ++ CLASS(printbuf, key_description)(); + int ret; + + prt_printf(&key_description, "bcachefs:"); + pr_uuid(&key_description, sb->user_uuid.b); + + ret = __bch2_request_key(key_description.buf, 
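/*
 * __bch2_request_key() above now holds the key semaphore only for a
 * scoped_guard(rwsem_read, ...) block instead of pairing down_read() and
 * up_read() by hand. The same shape with a stub payload in place of the
 * keyring API:
 */
#include <linux/cleanup.h>
#include <linux/rwsem.h>
#include <linux/string.h>

static DECLARE_RWSEM(payload_sem);
static char payload[32];
static size_t payload_len;

static int copy_payload_sketch(void *out, size_t outlen)
{
	int ret;

	scoped_guard(rwsem_read, &payload_sem) {
		if (payload_len == outlen) {
			memcpy(out, payload, outlen);
			ret = 0;
		} else {
			ret = -EINVAL;
		}
	}	/* up_read() happens here */
	return ret;
}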
key);
+- printbuf_exit(&key_description);
+
+ #ifndef __KERNEL__
+ if (ret) {
+@@ -524,13 +520,12 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
+ int bch2_revoke_key(struct bch_sb *sb)
+ {
+ key_serial_t key_id;
+- struct printbuf key_description = PRINTBUF;
++ CLASS(printbuf, key_description)();
+
+ prt_printf(&key_description, "bcachefs:");
+ pr_uuid(&key_description, sb->user_uuid.b);
+
+ key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING);
+- printbuf_exit(&key_description);
+ if (key_id < 0)
+ return errno;
+
+@@ -584,34 +579,27 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
+ */
+ int bch2_disable_encryption(struct bch_fs *c)
+ {
+- struct bch_sb_field_crypt *crypt;
+- struct bch_key key;
+- int ret = -EINVAL;
+-
+- mutex_lock(&c->sb_lock);
++ guard(mutex)(&c->sb_lock);
+
+- crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
++ struct bch_sb_field_crypt *crypt = bch2_sb_field_get(c->disk_sb.sb, crypt);
+ if (!crypt)
+- goto out;
++ return -EINVAL;
+
+ /* is key encrypted? */
+- ret = 0;
+ if (bch2_key_is_encrypted(&crypt->key))
+- goto out;
++ return 0;
+
+- ret = bch2_decrypt_sb_key(c, crypt, &key);
++ struct bch_key key;
++ int ret = bch2_decrypt_sb_key(c, crypt, &key);
+ if (ret)
+- goto out;
++ return ret;
+
+ crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC);
+ crypt->key.key = key;
+
+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
+ bch2_write_super(c);
+-out:
+- mutex_unlock(&c->sb_lock);
+-
+- return ret;
++ return 0;
+ }
+
+ /*
+@@ -625,7 +614,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
+ struct bch_sb_field_crypt *crypt;
+ int ret = -EINVAL;
+
+- mutex_lock(&c->sb_lock);
++ guard(mutex)(&c->sb_lock);
+
+ /* Do we already have an encryption key? */
+ if (bch2_sb_field_get(c->disk_sb.sb, crypt))
+@@ -659,7 +648,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
+ crypt = bch2_sb_field_resize(&c->disk_sb, crypt,
+ sizeof(*crypt) / sizeof(u64));
+ if (!crypt) {
+- ret = -BCH_ERR_ENOSPC_sb_crypt;
++ ret = bch_err_throw(c, ENOSPC_sb_crypt);
+ goto err;
+ }
+
+@@ -669,7 +658,6 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
+ bch2_write_super(c);
+ err:
+- mutex_unlock(&c->sb_lock);
+ memzero_explicit(&user_key, sizeof(user_key));
+ memzero_explicit(&key, sizeof(key));
+ return ret;
+diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
+index 1310782d3ae9..7bd9cf6104ca 100644
+--- a/fs/bcachefs/checksum.h
++++ b/fs/bcachefs/checksum.h
+@@ -69,6 +69,8 @@ static inline void bch2_csum_err_msg(struct printbuf *out,
+ bch2_csum_to_text(out, type, expected);
+ }
+
++void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t);
++
+ int bch2_request_key(struct bch_sb *, struct bch_key *);
+ #ifndef __KERNEL__
+ int bch2_revoke_key(struct bch_sb *);
+diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
+index d6dd12d74d4f..1c6d0cdca3c5 100644
+--- a/fs/bcachefs/clock.c
++++ b/fs/bcachefs/clock.c
+@@ -40,20 +40,17 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
+
+ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer)
+ {
+- spin_lock(&clock->timer_lock);
++ guard(spinlock)(&clock->timer_lock);
+
+ for (size_t i = 0; i < clock->timers.nr; i++)
+ if (clock->timers.data[i] == timer) {
+ min_heap_del(&clock->timers, i, &callbacks, NULL);
+- break;
++ return;
+ }
+-
+- spin_unlock(&clock->timer_lock);
+ }
+
+ struct io_clock_wait {
+ struct io_timer io_timer;
+- struct timer_list cpu_timer;
+ struct
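/*
 * In bch2_io_timer_del() above, break-then-unlock becomes a plain return:
 * guard(spinlock) drops the lock whenever the scope is left. Sketch with
 * a stub timer table, assuming <linux/cleanup.h>:
 */
#include <linux/cleanup.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(timer_lock_stub);
static void *timers_stub[8];
static size_t nr_timers_stub;

static void timer_del_sketch(void *timer)
{
	guard(spinlock)(&timer_lock_stub);

	for (size_t i = 0; i < nr_timers_stub; i++)
		if (timers_stub[i] == timer) {
			timers_stub[i] = timers_stub[--nr_timers_stub];
			return;		/* lock released here */
		}
}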
task_struct *task; + int expired; + }; +@@ -67,15 +64,6 @@ static void io_clock_wait_fn(struct io_timer *timer) + wake_up_process(wait->task); + } + +-static void io_clock_cpu_timeout(struct timer_list *timer) +-{ +- struct io_clock_wait *wait = container_of(timer, +- struct io_clock_wait, cpu_timer); +- +- wait->expired = 1; +- wake_up_process(wait->task); +-} +- + void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) + { + struct io_clock_wait wait = { +@@ -90,8 +78,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) + bch2_io_timer_del(clock, &wait.io_timer); + } + +-void bch2_kthread_io_clock_wait(struct io_clock *clock, +- u64 io_until, unsigned long cpu_timeout) ++unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock, ++ u64 io_until, unsigned long cpu_timeout) + { + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct io_clock_wait wait = { +@@ -103,27 +91,26 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, + + bch2_io_timer_add(clock, &wait.io_timer); + +- timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); +- +- if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) +- mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); +- +- do { +- set_current_state(TASK_INTERRUPTIBLE); +- if (kthread && kthread_should_stop()) +- break; +- +- if (wait.expired) +- break; +- +- schedule(); ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (!(kthread && kthread_should_stop())) { ++ cpu_timeout = schedule_timeout(cpu_timeout); + try_to_freeze(); +- } while (0); ++ } + + __set_current_state(TASK_RUNNING); +- timer_delete_sync(&wait.cpu_timer); +- destroy_timer_on_stack(&wait.cpu_timer); + bch2_io_timer_del(clock, &wait.io_timer); ++ return cpu_timeout; ++} ++ ++void bch2_kthread_io_clock_wait(struct io_clock *clock, ++ u64 io_until, unsigned long cpu_timeout) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ ++ while (!(kthread && kthread_should_stop()) && ++ cpu_timeout && ++ atomic64_read(&clock->now) < io_until) ++ cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout); + } + + static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) +@@ -144,28 +131,27 @@ void __bch2_increment_clock(struct io_clock *clock, u64 sectors) + struct io_timer *timer; + u64 now = atomic64_add_return(sectors, &clock->now); + +- spin_lock(&clock->timer_lock); ++ guard(spinlock)(&clock->timer_lock); ++ + while ((timer = get_expired_timer(clock, now))) + timer->fn(timer); +- spin_unlock(&clock->timer_lock); + } + + void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) + { +- out->atomic++; +- spin_lock(&clock->timer_lock); + u64 now = atomic64_read(&clock->now); + + printbuf_tabstop_push(out, 40); + prt_printf(out, "current time:\t%llu\n", now); + ++ guard(printbuf_atomic)(out); ++ guard(spinlock)(&clock->timer_lock); ++ + for (unsigned i = 0; i < clock->timers.nr; i++) + prt_printf(out, "%ps %ps:\t%llu\n", + clock->timers.data[i]->fn, + clock->timers.data[i]->fn2, + clock->timers.data[i]->expire); +- spin_unlock(&clock->timer_lock); +- --out->atomic; + } + + void bch2_io_clock_exit(struct io_clock *clock) +diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h +index 82c79c8baf92..8769be2aa21e 100644 +--- a/fs/bcachefs/clock.h ++++ b/fs/bcachefs/clock.h +@@ -4,6 +4,7 @@ + + void bch2_io_timer_add(struct io_clock *, struct io_timer *); + void bch2_io_timer_del(struct io_clock *, struct io_timer *); ++unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long); + 
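/*
 * The clock.c rework above deletes the on-stack cpu_timer entirely:
 * schedule_timeout() already returns the jiffies left, so the caller can
 * loop on a single wait-once helper until either the io-clock target or
 * the cpu timeout is reached. Simplified sketch; the io-timer wakeup,
 * kthread_should_stop() and try_to_freeze() handling are elided:
 */
#include <linux/sched.h>
#include <linux/atomic.h>
#include <linux/types.h>

static unsigned long clock_wait_once_sketch(unsigned long timeout)
{
	set_current_state(TASK_INTERRUPTIBLE);
	timeout = schedule_timeout(timeout);	/* returns jiffies remaining */
	__set_current_state(TASK_RUNNING);
	return timeout;
}

static void clock_wait_sketch(atomic64_t *now, u64 until, unsigned long timeout)
{
	while (timeout && atomic64_read(now) < until)
		timeout = clock_wait_once_sketch(timeout);
}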
void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); + + void __bch2_increment_clock(struct io_clock *, u64); +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index 28ed32449913..aeb9b9bd7d33 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -187,7 +187,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + __bch2_compression_types[crc.compression_type])) + ret = bch2_check_set_has_compressed_data(c, opt); + else +- ret = -BCH_ERR_compression_workspace_not_initialized; ++ ret = bch_err_throw(c, compression_workspace_not_initialized); + if (ret) + goto err; + } +@@ -200,7 +200,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, + src_len, dst_len, dst_len); + if (ret2 != dst_len) +- ret = -BCH_ERR_decompress_lz4; ++ ret = bch_err_throw(c, decompress_lz4); + break; + case BCH_COMPRESSION_TYPE_gzip: { + z_stream strm = { +@@ -219,7 +219,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + mempool_free(workspace, workspace_pool); + + if (ret2 != Z_STREAM_END) +- ret = -BCH_ERR_decompress_gzip; ++ ret = bch_err_throw(c, decompress_gzip); + break; + } + case BCH_COMPRESSION_TYPE_zstd: { +@@ -227,7 +227,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + size_t real_src_len = le32_to_cpup(src_data.b); + + if (real_src_len > src_len - 4) { +- ret = -BCH_ERR_decompress_zstd_src_len_bad; ++ ret = bch_err_throw(c, decompress_zstd_src_len_bad); + goto err; + } + +@@ -241,7 +241,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + mempool_free(workspace, workspace_pool); + + if (ret2 != dst_len) +- ret = -BCH_ERR_decompress_zstd; ++ ret = bch_err_throw(c, decompress_zstd); + break; + } + default: +@@ -270,7 +270,7 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, + bch2_write_op_error(op, op->pos.offset, + "extent too big to decompress (%u > %u)", + crc->uncompressed_size << 9, c->opts.encoded_extent_max); +- return -BCH_ERR_decompress_exceeded_max_encoded_extent; ++ return bch_err_throw(c, decompress_exceeded_max_encoded_extent); + } + + data = __bounce_alloc(c, dst_len, WRITE); +@@ -314,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, + + if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || + crc.compressed_size << 9 > c->opts.encoded_extent_max) +- return -BCH_ERR_decompress_exceeded_max_encoded_extent; ++ return bch_err_throw(c, decompress_exceeded_max_encoded_extent); + + dst_data = dst_len == dst_iter.bi_size + ? 
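/*
 * The compress.c hunks above (and most hunks that follow) replace bare
 * "return -BCH_ERR_foo" with bch_err_throw(c, foo). The real macro is not
 * shown in this patch; a purely hypothetical analogue that illustrates
 * why the bch_fs pointer is now threaded through every error site - the
 * error can be accounted against the filesystem instance as it is raised:
 */
struct err_counters_stub { unsigned long err[16]; };

#define ERRNUM_decompress_lz4 1	/* hypothetical numbering */

#define stub_err_throw(_c, _err) ({		\
	(_c)->err[ERRNUM_##_err]++;		\
	-ERRNUM_##_err;				\
})

static int throw_demo(struct err_counters_stub *c)
{
	return stub_err_throw(c, decompress_lz4);	/* counts, then returns <0 */
}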
__bio_map_or_bounce(c, dst, dst_iter, WRITE) +@@ -336,7 +336,7 @@ static int attempt_compress(struct bch_fs *c, + void *workspace, + void *dst, size_t dst_len, + void *src, size_t src_len, +- struct bch_compression_opt compression) ++ union bch_compression_opt compression) + { + enum bch_compression_type compression_type = + __bch2_compression_opt_to_type[compression.type]; +@@ -426,7 +426,7 @@ static int attempt_compress(struct bch_fs *c, + static unsigned __bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, +- struct bch_compression_opt compression) ++ union bch_compression_opt compression) + { + struct bbuf src_data = { NULL }, dst_data = { NULL }; + void *workspace; +@@ -553,7 +553,7 @@ unsigned bch2_bio_compress(struct bch_fs *c, + + compression_type = + __bio_compress(c, dst, dst_len, src, src_len, +- bch2_compression_decode(compression_opt)); ++ (union bch_compression_opt){ .value = compression_opt }); + + dst->bi_iter.bi_size = orig_dst; + src->bi_iter.bi_size = orig_src; +@@ -579,30 +579,25 @@ static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) + if ((c->sb.features & f) == f) + return 0; + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + +- if ((c->sb.features & f) == f) { +- mutex_unlock(&c->sb_lock); ++ if ((c->sb.features & f) == f) + return 0; +- } + + ret = __bch2_fs_compress_init(c, c->sb.features|f); +- if (ret) { +- mutex_unlock(&c->sb_lock); ++ if (ret) + return ret; +- } + + c->disk_sb.sb->features[0] |= cpu_to_le64(f); + bch2_write_super(c); +- mutex_unlock(&c->sb_lock); +- + return 0; + } + + int bch2_check_set_has_compressed_data(struct bch_fs *c, + unsigned compression_opt) + { +- unsigned compression_type = bch2_compression_decode(compression_opt).type; ++ unsigned int compression_type = ((union bch_compression_opt){ .value = compression_opt }) ++ .type; + + BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); + +@@ -656,12 +651,12 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) + if (!mempool_initialized(&c->compression_bounce[READ]) && + mempool_init_kvmalloc_pool(&c->compression_bounce[READ], + 1, c->opts.encoded_extent_max)) +- return -BCH_ERR_ENOMEM_compression_bounce_read_init; ++ return bch_err_throw(c, ENOMEM_compression_bounce_read_init); + + if (!mempool_initialized(&c->compression_bounce[WRITE]) && + mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], + 1, c->opts.encoded_extent_max)) +- return -BCH_ERR_ENOMEM_compression_bounce_write_init; ++ return bch_err_throw(c, ENOMEM_compression_bounce_write_init); + + for (i = compression_types; + i < compression_types + ARRAY_SIZE(compression_types); +@@ -675,7 +670,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) + if (mempool_init_kvmalloc_pool( + &c->compress_workspace[i->type], + 1, i->compress_workspace)) +- return -BCH_ERR_ENOMEM_compression_workspace_init; ++ return bch_err_throw(c, ENOMEM_compression_workspace_init); + } + + return 0; +@@ -683,7 +678,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) + + static u64 compression_opt_to_feature(unsigned v) + { +- unsigned type = bch2_compression_decode(v).type; ++ unsigned int type = ((union bch_compression_opt){ .value = v }).type; + + return BIT_ULL(bch2_compression_opt_to_feature[type]); + } +@@ -703,7 +698,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, + { + char *val = kstrdup(_val, GFP_KERNEL); + char *p = val, *type_str, *level_str; +- struct 
bch_compression_opt opt = { 0 }; ++ union bch_compression_opt opt = { 0 }; + int ret; + + if (!val) +@@ -714,7 +709,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, + + ret = match_string(bch2_compression_opts, -1, type_str); + if (ret < 0 && err) +- prt_str(err, "invalid compression type"); ++ prt_printf(err, "invalid compression type\n"); + if (ret < 0) + goto err; + +@@ -729,14 +724,14 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, + if (!ret && level > 15) + ret = -EINVAL; + if (ret < 0 && err) +- prt_str(err, "invalid compression level"); ++ prt_printf(err, "invalid compression level\n"); + if (ret < 0) + goto err; + + opt.level = level; + } + +- *res = bch2_compression_encode(opt); ++ *res = opt.value; + err: + kfree(val); + return ret; +@@ -744,7 +739,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, + + void bch2_compression_opt_to_text(struct printbuf *out, u64 v) + { +- struct bch_compression_opt opt = bch2_compression_decode(v); ++ union bch_compression_opt opt = { .value = v }; + + if (opt.type < BCH_COMPRESSION_OPT_NR) + prt_str(out, bch2_compression_opts[opt.type]); +diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h +index bec2f05bfd52..667ddb91d47a 100644 +--- a/fs/bcachefs/compress.h ++++ b/fs/bcachefs/compress.h +@@ -10,41 +10,27 @@ static const unsigned __bch2_compression_opt_to_type[] = { + #undef x + }; + +-struct bch_compression_opt { +- u8 type:4, +- level:4; +-}; +- +-static inline struct bch_compression_opt __bch2_compression_decode(unsigned v) +-{ +- return (struct bch_compression_opt) { +- .type = v & 15, +- .level = v >> 4, ++union bch_compression_opt { ++ u8 value; ++ struct { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ u8 type:4, level:4; ++#elif defined(__BIG_ENDIAN_BITFIELD) ++ u8 level:4, type:4; ++#endif + }; +-} ++}; + + static inline bool bch2_compression_opt_valid(unsigned v) + { +- struct bch_compression_opt opt = __bch2_compression_decode(v); ++ union bch_compression_opt opt = { .value = v }; + + return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level); + } + +-static inline struct bch_compression_opt bch2_compression_decode(unsigned v) +-{ +- return bch2_compression_opt_valid(v) +- ? 
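/*
 * The compress.h hunk above replaces the explicit encode/decode helpers
 * with a union that aliases the packed option byte with named bitfields.
 * Bitfield order is ABI-dependent, hence the __LITTLE_ENDIAN_BITFIELD /
 * __BIG_ENDIAN_BITFIELD split. Round-trip sketch:
 */
#include <linux/types.h>
#include <asm/byteorder.h>

union copt_sketch {
	u8 value;
	struct {
#if defined(__LITTLE_ENDIAN_BITFIELD)
		u8 type:4, level:4;
#elif defined(__BIG_ENDIAN_BITFIELD)
		u8 level:4, type:4;
#endif
	};
};

/* (union copt_sketch){ .value = v }.type replaces decode(v).type, and
 * opt.value replaces encode(opt): */
static u8 set_level_sketch(u8 v, u8 level)
{
	union copt_sketch opt = { .value = v };

	opt.level = level;
	return opt.value;
}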
__bch2_compression_decode(v) +- : (struct bch_compression_opt) { 0 }; +-} +- +-static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) +-{ +- return opt.type|(opt.level << 4); +-} +- + static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) + { +- return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; ++ return __bch2_compression_opt_to_type[((union bch_compression_opt){ .value = v }).type]; + } + + struct bch_write_op; +diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h +index c6151495985f..4080ee99aadd 100644 +--- a/fs/bcachefs/darray.h ++++ b/fs/bcachefs/darray.h +@@ -8,6 +8,7 @@ + * Inspired by CCAN's darray + */ + ++#include + #include + + #define DARRAY_PREALLOCATED(_type, _nr) \ +@@ -20,7 +21,18 @@ struct { \ + #define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) + + typedef DARRAY(char) darray_char; +-typedef DARRAY(char *) darray_str; ++typedef DARRAY(char *) darray_str; ++typedef DARRAY(const char *) darray_const_str; ++ ++typedef DARRAY(u8) darray_u8; ++typedef DARRAY(u16) darray_u16; ++typedef DARRAY(u32) darray_u32; ++typedef DARRAY(u64) darray_u64; ++ ++typedef DARRAY(s8) darray_s8; ++typedef DARRAY(s16) darray_s16; ++typedef DARRAY(s32) darray_s32; ++typedef DARRAY(s64) darray_s64; + + int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); + +@@ -76,7 +88,23 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); + #define darray_remove_item(_d, _pos) \ + array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) + +-#define __darray_for_each(_d, _i) \ ++#define darray_find_p(_d, _i, cond) \ ++({ \ ++ typeof((_d).data) _ret = NULL; \ ++ \ ++ darray_for_each(_d, _i) \ ++ if (cond) { \ ++ _ret = _i; \ ++ break; \ ++ } \ ++ _ret; \ ++}) ++ ++#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item) ++ ++/* Iteration: */ ++ ++#define __darray_for_each(_d, _i) \ + for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) + + #define darray_for_each(_d, _i) \ +@@ -85,6 +113,8 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); + #define darray_for_each_reverse(_d, _i) \ + for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) + ++/* Init/exit */ ++ + #define darray_init(_d) \ + do { \ + (_d)->nr = 0; \ +@@ -100,4 +130,29 @@ do { \ + darray_init(_d); \ + } while (0) + ++#define DEFINE_DARRAY_CLASS(_type) \ ++DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void) ++ ++#define DEFINE_DARRAY(_type) \ ++typedef DARRAY(_type) darray_##_type; \ ++DEFINE_DARRAY_CLASS(darray_##_type) ++ ++#define DEFINE_DARRAY_NAMED(_name, _type) \ ++typedef DARRAY(_type) _name; \ ++DEFINE_DARRAY_CLASS(_name) ++ ++DEFINE_DARRAY_CLASS(darray_char); ++DEFINE_DARRAY_CLASS(darray_str) ++DEFINE_DARRAY_CLASS(darray_const_str) ++ ++DEFINE_DARRAY_CLASS(darray_u8) ++DEFINE_DARRAY_CLASS(darray_u16) ++DEFINE_DARRAY_CLASS(darray_u32) ++DEFINE_DARRAY_CLASS(darray_u64) ++ ++DEFINE_DARRAY_CLASS(darray_s8) ++DEFINE_DARRAY_CLASS(darray_s16) ++DEFINE_DARRAY_CLASS(darray_s32) ++DEFINE_DARRAY_CLASS(darray_s64) ++ + #endif /* _BCACHEFS_DARRAY_H */ +diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c +index b211c97238ab..01838a3a189d 100644 +--- a/fs/bcachefs/data_update.c ++++ b/fs/bcachefs/data_update.c +@@ -66,46 +66,56 @@ static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) + } + } + +-static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k) ++static noinline_for_stack ++bool 
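/*
 * Usage sketch for the darray_find_p()/darray_find() macros added above:
 * the iterator is a pointer into the array, and the expression yields the
 * first matching element pointer, or NULL (darray_int is a hypothetical
 * instantiation):
 */
typedef DARRAY(int) darray_int;

static bool contains_even_sketch(darray_int *d)
{
	int *hit = darray_find_p(*d, i, (*i & 1) == 0);

	return hit != NULL;
}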
__bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs, ++ const struct bch_extent_ptr *start) + { +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ if (!ctxt) { ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (ptr == start) ++ break; ++ ++ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); ++ struct bpos bucket = PTR_BUCKET_POS(ca, ptr); ++ bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); ++ } ++ return false; ++ } + +- bkey_for_each_ptr(ptrs, ptr) { ++ __bkey_for_each_ptr(start, ptrs.end, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + +- if (ctxt) { +- bool locked; +- +- move_ctxt_wait_event(ctxt, +- (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || +- list_empty(&ctxt->ios)); ++ bool locked; ++ move_ctxt_wait_event(ctxt, ++ (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || ++ list_empty(&ctxt->ios)); ++ if (!locked) ++ bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); ++ } ++ return true; ++} + +- if (!locked) +- bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); +- } else { +- if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { +- bkey_for_each_ptr(ptrs, ptr2) { +- if (ptr2 == ptr) +- break; ++static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs) ++{ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); ++ struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + +- ca = bch2_dev_have_ref(c, ptr2->dev); +- bucket = PTR_BUCKET_POS(ca, ptr2); +- bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); +- } +- return false; +- } +- } ++ if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) ++ return __bkey_nocow_lock(c, ctxt, ptrs, ptr); + } ++ + return true; + } + +-static noinline void trace_io_move_finish2(struct data_update *u, +- struct bkey_i *new, +- struct bkey_i *insert) ++noinline_for_stack ++static void trace_io_move_finish2(struct data_update *u, ++ struct bkey_i *new, ++ struct bkey_i *insert) + { + struct bch_fs *c = u->op.c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + prt_newline(&buf); + +@@ -121,9 +131,9 @@ static noinline void trace_io_move_finish2(struct data_update *u, + prt_newline(&buf); + + trace_io_move_finish(c, buf.buf); +- printbuf_exit(&buf); + } + ++noinline_for_stack + static void trace_io_move_fail2(struct data_update *m, + struct bkey_s_c new, + struct bkey_s_c wrote, +@@ -132,7 +142,7 @@ static void trace_io_move_fail2(struct data_update *m, + { + struct bch_fs *c = m->op.c; + struct bkey_s_c old = bkey_i_to_s_c(m->k.k); +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + unsigned rewrites_found = 0; + + if (!trace_io_move_fail_enabled()) +@@ -176,27 +186,83 @@ static void trace_io_move_fail2(struct data_update *m, + } + + trace_io_move_fail(c, buf.buf); +- printbuf_exit(&buf); ++} ++ ++noinline_for_stack ++static void trace_data_update2(struct data_update *m, ++ struct bkey_s_c old, struct bkey_s_c k, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = m->op.c; ++ CLASS(printbuf, buf)(); ++ ++ prt_str(&buf, "\nold: "); ++ bch2_bkey_val_to_text(&buf, c, old); ++ prt_str(&buf, "\nk: "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_str(&buf, "\nnew: "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); ++ ++ trace_data_update(c, buf.buf); ++} ++ ++noinline_for_stack ++static void trace_io_move_created_rebalance2(struct data_update *m, ++ struct bkey_s_c old, struct bkey_s_c k, ++ struct bkey_i 
*insert) ++{ ++ struct bch_fs *c = m->op.c; ++ CLASS(printbuf, buf)(); ++ ++ bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); ++ ++ prt_str(&buf, "\nold: "); ++ bch2_bkey_val_to_text(&buf, c, old); ++ prt_str(&buf, "\nk: "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_str(&buf, "\nnew: "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); ++ ++ trace_io_move_created_rebalance(c, buf.buf); ++ ++ this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]); ++} ++ ++noinline_for_stack ++static int data_update_invalid_bkey(struct data_update *m, ++ struct bkey_s_c old, struct bkey_s_c k, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = m->op.c; ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(c, &buf); ++ ++ prt_str(&buf, "about to insert invalid key in data update path"); ++ prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); ++ prt_str(&buf, "\nold: "); ++ bch2_bkey_val_to_text(&buf, c, old); ++ prt_str(&buf, "\nk: "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_str(&buf, "\nnew: "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); ++ prt_newline(&buf); ++ ++ bch2_fs_emergency_read_only2(c, &buf); ++ ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ ++ return bch_err_throw(c, invalid_bkey); + } + + static int __bch2_data_update_index_update(struct btree_trans *trans, + struct bch_write_op *op) + { + struct bch_fs *c = op->c; +- struct btree_iter iter; +- struct data_update *m = +- container_of(op, struct data_update, op); +- struct keylist *keys = &op->insert_keys; +- struct bkey_buf _new, _insert; +- struct printbuf journal_msg = PRINTBUF; ++ struct data_update *m = container_of(op, struct data_update, op); + int ret = 0; + +- bch2_bkey_buf_init(&_new); +- bch2_bkey_buf_init(&_insert); +- bch2_bkey_buf_realloc(&_insert, c, U8_MAX); +- +- bch2_trans_iter_init(trans, &iter, m->btree_id, +- bkey_start_pos(&bch2_keylist_front(keys)->k), ++ CLASS(btree_iter, iter)(trans, m->btree_id, ++ bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k), + BTREE_ITER_slots|BTREE_ITER_intent); + + while (1) { +@@ -216,24 +282,35 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + + bch2_trans_begin(trans); + +- k = bch2_btree_iter_peek_slot(trans, &iter); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + +- new = bkey_i_to_extent(bch2_keylist_front(keys)); ++ new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys)); + + if (!bch2_extents_match(k, old)) { + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), +- NULL, "no match:"); ++ NULL, "no match:"); + goto nowork; + } + +- bkey_reassemble(_insert.k, k); +- insert = _insert.k; ++ insert = bch2_trans_kmalloc(trans, ++ bkey_bytes(k.k) + ++ bkey_val_bytes(&new->k) + ++ sizeof(struct bch_extent_rebalance)); ++ ret = PTR_ERR_OR_ZERO(insert); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(insert, k); ++ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k)); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ goto err; + +- bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); +- new = bkey_i_to_extent(_new.k); ++ bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys)); + bch2_cut_front(iter.pos, &new->k_i); + + bch2_cut_front(iter.pos, insert); +@@ -294,21 +371,21 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); + + /* Now, drop excess replicas: */ +- rcu_read_lock(); ++ scoped_guard(rcu) { + restart_drop_extra_replicas: +- bkey_for_each_ptr_decode(old.k, 
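/*
 * __bch2_data_update_index_update() above drops its bkey_buf init/exit
 * pairs in favour of bch2_trans_kmalloc(): allocations now live exactly
 * as long as the transaction. A hedged sketch of that ownership model
 * with stand-in names (not the bcachefs allocator):
 */
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/overflow.h>
#include <linux/err.h>

struct txn_stub { struct list_head allocs; };
struct txn_alloc_stub { struct list_head list; unsigned char mem[]; };

static void *txn_kmalloc_sketch(struct txn_stub *t, size_t bytes)
{
	struct txn_alloc_stub *a = kmalloc(struct_size(a, mem, bytes), GFP_KERNEL);

	if (!a)
		return ERR_PTR(-ENOMEM);
	list_add(&a->list, &t->allocs);	/* owned by the transaction */
	return a->mem;
}

static void txn_exit_sketch(struct txn_stub *t)
{
	struct txn_alloc_stub *a, *n;

	/* one teardown frees every allocation - no per-buffer exit calls */
	list_for_each_entry_safe(a, n, &t->allocs, list)
		kfree(a);
}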
bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { +- unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); ++ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { ++ unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); + +- if (!p.ptr.cached && +- durability - ptr_durability >= m->op.opts.data_replicas) { +- durability -= ptr_durability; ++ if (!p.ptr.cached && ++ durability - ptr_durability >= m->op.opts.data_replicas) { ++ durability -= ptr_durability; + +- bch2_extent_ptr_set_cached(c, &m->op.opts, +- bkey_i_to_s(insert), &entry->ptr); +- goto restart_drop_extra_replicas; ++ bch2_extent_ptr_set_cached(c, &m->op.opts, ++ bkey_i_to_s(insert), &entry->ptr); ++ goto restart_drop_extra_replicas; ++ } + } + } +- rcu_read_unlock(); + + /* Finally, add the pointers we just wrote: */ + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) +@@ -346,44 +423,12 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + .btree = m->btree_id, + .flags = BCH_VALIDATE_commit, + }); +- if (invalid) { +- struct printbuf buf = PRINTBUF; +- +- prt_str(&buf, "about to insert invalid key in data update path"); +- prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); +- prt_str(&buf, "\nold: "); +- bch2_bkey_val_to_text(&buf, c, old); +- prt_str(&buf, "\nk: "); +- bch2_bkey_val_to_text(&buf, c, k); +- prt_str(&buf, "\nnew: "); +- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); +- +- bch2_print_string_as_lines(KERN_ERR, buf.buf); +- printbuf_exit(&buf); +- +- bch2_fatal_error(c); +- ret = -BCH_ERR_invalid_bkey; ++ if (unlikely(invalid)) { ++ ret = data_update_invalid_bkey(m, old, k, insert); + goto out; + } + +- if (trace_data_update_enabled()) { +- struct printbuf buf = PRINTBUF; +- +- prt_str(&buf, "\nold: "); +- bch2_bkey_val_to_text(&buf, c, old); +- prt_str(&buf, "\nk: "); +- bch2_bkey_val_to_text(&buf, c, k); +- prt_str(&buf, "\nnew: "); +- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); +- +- trace_data_update(c, buf.buf); +- printbuf_exit(&buf); +- } +- +- printbuf_reset(&journal_msg); +- prt_str(&journal_msg, bch2_data_update_type_strs[m->type]); +- +- ret = bch2_trans_log_msg(trans, &journal_msg) ?: ++ ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?: + bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: + bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, bkey_start_pos(&insert->k)) ?: +@@ -391,28 +436,39 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + k.k->p, insert->k.p) ?: + bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: + bch2_trans_update(trans, &iter, insert, +- BTREE_UPDATE_internal_snapshot_node) ?: +- bch2_trans_commit(trans, &op->res, ++ BTREE_UPDATE_internal_snapshot_node); ++ if (ret) ++ goto err; ++ ++ if (trace_data_update_enabled()) ++ trace_data_update2(m, old, k, insert); ++ ++ if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > ++ bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) ++ trace_io_move_created_rebalance2(m, old, k, insert); ++ ++ ret = bch2_trans_commit(trans, &op->res, + NULL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| + m->data_opts.btree_insert_flags); +- if (!ret) { +- bch2_btree_iter_set_pos(trans, &iter, next_pos); ++ if (ret) ++ goto err; + +- this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); +- if (trace_io_move_finish_enabled()) +- trace_io_move_finish2(m, &new->k_i, insert); +- } ++ bch2_btree_iter_set_pos(&iter, next_pos); ++ ++ 
this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); ++ if (trace_io_move_finish_enabled()) ++ trace_io_move_finish2(m, &new->k_i, insert); + err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + if (ret) + break; + next: +- while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) { +- bch2_keylist_pop_front(keys); +- if (bch2_keylist_empty(keys)) ++ while (bkey_ge(iter.pos, bch2_keylist_front(&op->insert_keys)->k.p)) { ++ bch2_keylist_pop_front(&op->insert_keys); ++ if (bch2_keylist_empty(&op->insert_keys)) + goto out; + } + continue; +@@ -426,21 +482,18 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + + count_event(c, io_move_fail); + +- bch2_btree_iter_advance(trans, &iter); ++ bch2_btree_iter_advance(&iter); + goto next; + } + out: +- printbuf_exit(&journal_msg); +- bch2_trans_iter_exit(trans, &iter); +- bch2_bkey_buf_exit(&_insert, c); +- bch2_bkey_buf_exit(&_new, c); + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + return ret; + } + + int bch2_data_update_index_update(struct bch_write_op *op) + { +- return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); ++ CLASS(btree_trans, trans)(op->c); ++ return __bch2_data_update_index_update(trans, op); + } + + void bch2_data_update_read_done(struct data_update *m) +@@ -474,8 +527,9 @@ void bch2_data_update_exit(struct data_update *update) + bch2_bkey_buf_exit(&update->k, c); + } + +-static int bch2_update_unwritten_extent(struct btree_trans *trans, +- struct data_update *update) ++static noinline_for_stack ++int bch2_update_unwritten_extent(struct btree_trans *trans, ++ struct data_update *update) + { + struct bch_fs *c = update->op.c; + struct bkey_i_extent *e; +@@ -497,10 +551,10 @@ static int bch2_update_unwritten_extent(struct btree_trans *trans, + bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, + BTREE_ITER_slots); + ret = lockrestart_do(trans, ({ +- k = bch2_btree_iter_peek_slot(trans, &iter); ++ k = bch2_btree_iter_peek_slot(&iter); + bkey_err(k); + })); +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + + if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k))) + break; +@@ -587,6 +641,10 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + + prt_str_indented(out, "extra replicas:\t"); + prt_u64(out, data_opts->extra_replicas); ++ prt_newline(out); ++ ++ prt_str_indented(out, "scrub:\t"); ++ prt_u64(out, data_opts->scrub); + } + + void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) +@@ -607,9 +665,17 @@ void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update + prt_newline(out); + printbuf_indent_add(out, 2); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); +- prt_printf(out, "read_done:\t%u\n", m->read_done); +- bch2_write_op_to_text(out, &m->op); +- printbuf_indent_sub(out, 2); ++ ++ if (!m->read_done) { ++ prt_printf(out, "read:\n"); ++ printbuf_indent_add(out, 2); ++ bch2_read_bio_to_text(out, m->op.c, &m->rbio); ++ } else { ++ prt_printf(out, "write:\n"); ++ printbuf_indent_add(out, 2); ++ bch2_write_op_to_text(out, &m->op); ++ } ++ printbuf_indent_sub(out, 4); + } + + int bch2_extent_drop_ptrs(struct btree_trans *trans, +@@ -655,18 +721,10 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + } + +-int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, +- struct bch_io_opts *io_opts) ++static int 
__bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, ++ struct bch_io_opts *io_opts, ++ unsigned buf_bytes) + { +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- +- /* write path might have to decompress data: */ +- unsigned buf_bytes = 0; +- bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) +- buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); +- + unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); + + m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); +@@ -690,11 +748,26 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + return 0; + } + ++int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, ++ struct bch_io_opts *io_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ /* write path might have to decompress data: */ ++ unsigned buf_bytes = 0; ++ bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) ++ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); ++ ++ return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); ++} ++ + static int can_write_extent(struct bch_fs *c, struct data_update *m) + { + if ((m->op.flags & BCH_WRITE_alloc_nowait) && + unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) +- return -BCH_ERR_data_update_done_would_block; ++ return bch_err_throw(c, data_update_done_would_block); + + unsigned target = m->op.flags & BCH_WRITE_only_specified_devs + ? m->op.target +@@ -704,27 +777,38 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) + darray_for_each(m->op.devs_have, i) + __clear_bit(*i, devs.d); + +- rcu_read_lock(); ++ CLASS(printbuf, buf)(); ++ ++ guard(printbuf_atomic)(&buf); ++ guard(rcu)(); ++ + unsigned nr_replicas = 0, i; + for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { +- struct bch_dev *ca = bch2_dev_rcu(c, i); ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); ++ if (!ca) ++ continue; + + struct bch_dev_usage usage; + bch2_dev_usage_read_fast(ca, &usage); + +- if (!dev_buckets_free(ca, usage, m->op.watermark)) ++ u64 nr_free = dev_buckets_free(ca, usage, m->op.watermark); ++ ++ prt_printf(&buf, "%s=%llu ", ca->name, nr_free); ++ ++ if (!nr_free) + continue; + + nr_replicas += ca->mi.durability; + if (nr_replicas >= m->op.nr_replicas) + break; + } +- rcu_read_unlock(); + +- if (!nr_replicas) +- return -BCH_ERR_data_update_done_no_rw_devs; ++ if (!nr_replicas) { ++ trace_data_update_done_no_rw_devs(c, buf.buf); ++ return bch_err_throw(c, data_update_done_no_rw_devs); ++ } + if (nr_replicas < m->op.nr_replicas) +- return -BCH_ERR_insufficient_devices; ++ return bch_err_throw(c, insufficient_devices); + return 0; + } + +@@ -739,19 +823,21 @@ int bch2_data_update_init(struct btree_trans *trans, + struct bkey_s_c k) + { + struct bch_fs *c = trans->c; +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; + int ret = 0; + +- /* +- * fs is corrupt we have a key for a snapshot node that doesn't exist, +- * and we have to check for this because we go rw before repairing the +- * snapshots table - just skip it, we can move it later. 
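/*
 * can_write_extent() above walks candidate devices under guard(rcu),
 * summing durability only for devices that still have free buckets, and
 * distinguishes "no writable device at all" from "not enough replicas".
 * Simplified sketch of that accounting, with -EROFS/-ENOSPC standing in
 * for the two bcachefs error codes (stub device type):
 */
#include <linux/cleanup.h>
#include <linux/rcupdate.h>
#include <linux/bitops.h>
#include <linux/errno.h>

struct dev_stub2 { unsigned int durability; u64 free_buckets; };

static int can_write_sketch(struct dev_stub2 *(*lookup)(unsigned int),
			    unsigned long *mask, unsigned int nr_bits,
			    unsigned int want_replicas)
{
	unsigned int have = 0, i;

	guard(rcu)();			/* devices may go away under us */
	for_each_set_bit(i, mask, nr_bits) {
		struct dev_stub2 *ca = lookup(i);

		if (!ca || !ca->free_buckets)
			continue;
		have += ca->durability;
		if (have >= want_replicas)
			return 0;
	}
	return have ? -ENOSPC : -EROFS;
}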
+- */ +- if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) +- return -BCH_ERR_data_update_done_no_snapshot; ++ if (k.k->p.snapshot) { ++ ret = bch2_check_key_has_snapshot(trans, iter, k); ++ if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) { ++ /* Can't repair yet, waiting on other recovery passes */ ++ return bch_err_throw(c, data_update_done_no_snapshot); ++ } ++ if (ret < 0) ++ return ret; ++ if (ret) /* key was deleted */ ++ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: ++ bch_err_throw(c, data_update_done_no_snapshot); ++ ret = 0; ++ } + + bch2_bkey_buf_init(&m->k); + bch2_bkey_buf_reassemble(&m->k, c, k); +@@ -779,10 +865,17 @@ int bch2_data_update_init(struct btree_trans *trans, + + unsigned durability_have = 0, durability_removing = 0; + ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; ++ unsigned buf_bytes = 0; ++ bool unwritten = false; ++ + unsigned ptr_bit = 1; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (!p.ptr.cached) { +- rcu_read_lock(); ++ guard(rcu)(); + if (ptr_bit & m->data_opts.rewrite_ptrs) { + if (crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; +@@ -793,7 +886,6 @@ int bch2_data_update_init(struct btree_trans *trans, + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + durability_have += bch2_extent_ptr_durability(c, &p); + } +- rcu_read_unlock(); + } + + /* +@@ -809,6 +901,9 @@ int bch2_data_update_init(struct btree_trans *trans, + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) + m->op.incompressible = true; + ++ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); ++ unwritten |= p.ptr.unwritten; ++ + ptr_bit <<= 1; + } + +@@ -847,7 +942,7 @@ int bch2_data_update_init(struct btree_trans *trans, + if (iter) + ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); + if (!ret) +- ret = -BCH_ERR_data_update_done_no_writes_needed; ++ ret = bch_err_throw(c, data_update_done_no_writes_needed); + goto out_bkey_buf_exit; + } + +@@ -878,23 +973,25 @@ int bch2_data_update_init(struct btree_trans *trans, + } + + if (!bkey_get_dev_refs(c, k)) { +- ret = -BCH_ERR_data_update_done_no_dev_refs; ++ ret = bch_err_throw(c, data_update_done_no_dev_refs); + goto out_put_disk_res; + } + + if (c->opts.nocow_enabled && +- !bkey_nocow_lock(c, ctxt, k)) { +- ret = -BCH_ERR_nocow_lock_blocked; ++ !bkey_nocow_lock(c, ctxt, ptrs)) { ++ ret = bch_err_throw(c, nocow_lock_blocked); + goto out_put_dev_refs; + } + +- if (bkey_extent_is_unwritten(k)) { ++ if (unwritten) { + ret = bch2_update_unwritten_extent(trans, m) ?: +- -BCH_ERR_data_update_done_unwritten; ++ bch_err_throw(c, data_update_done_unwritten); + goto out_nocow_unlock; + } + +- ret = bch2_data_update_bios_init(m, c, io_opts); ++ bch2_trans_unlock(trans); ++ ++ ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); + if (ret) + goto out_nocow_unlock; + +diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h +index ed05125867da..5e14d13568de 100644 +--- a/fs/bcachefs/data_update.h ++++ b/fs/bcachefs/data_update.h +@@ -50,6 +50,21 @@ struct data_update { + struct bio_vec *bvecs; + }; + ++struct promote_op { ++ struct rcu_head rcu; ++ u64 start_time; ++#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS ++ unsigned list_idx; ++#endif ++ ++ struct rhash_head hash; ++ struct bpos pos; ++ ++ struct work_struct work; ++ struct data_update 
write; ++ struct bio_vec bi_inline_vecs[]; /* must be last */ ++}; ++ + void bch2_data_update_to_text(struct printbuf *, struct data_update *); + void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); + +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index 5a8bc7013512..33cb94f70b19 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -8,6 +8,7 @@ + + #include "bcachefs.h" + #include "alloc_foreground.h" ++#include "async_objs.h" + #include "bkey_methods.h" + #include "btree_cache.h" + #include "btree_io.h" +@@ -16,6 +17,7 @@ + #include "btree_update.h" + #include "btree_update_interior.h" + #include "buckets.h" ++#include "data_update.h" + #include "debug.h" + #include "error.h" + #include "extents.h" +@@ -40,9 +42,10 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, + struct btree_node *n_sorted = c->verify_data->data; + struct bset *sorted, *inmemory = &b->data->keys; + struct bio *bio; +- bool failed = false, saw_error = false; ++ bool failed = false; + +- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); ++ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, ++ BCH_DEV_READ_REF_btree_verify_replicas); + if (!ca) + return false; + +@@ -57,12 +60,13 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, + submit_bio_wait(bio); + + bio_put(bio); +- percpu_ref_put(&ca->io_ref[READ]); ++ enumerated_ref_put(&ca->io_ref[READ], ++ BCH_DEV_READ_REF_btree_verify_replicas); + + memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); + + v->written = 0; +- if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) ++ if (bch2_btree_node_read_done(c, ca, v, NULL, NULL)) + return false; + + n_sorted = c->verify_data->data; +@@ -137,7 +141,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + return; + + bch2_btree_node_io_lock(b); +- mutex_lock(&c->verify_lock); ++ guard(mutex)(&c->verify_lock); + + if (!c->verify_ondisk) { + c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); +@@ -149,8 +153,6 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + c->verify_data = __bch2_btree_node_mem_alloc(c); + if (!c->verify_data) + goto out; +- +- list_del_init(&c->verify_data->list); + } + + BUG_ON(b->nsets != 1); +@@ -170,14 +172,11 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) + failed |= bch2_btree_verify_replica(c, b, p); + + if (failed) { +- struct printbuf buf = PRINTBUF; +- ++ CLASS(printbuf, buf)(); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf); +- printbuf_exit(&buf); + } + out: +- mutex_unlock(&c->verify_lock); + bch2_btree_node_io_unlock(b); + } + +@@ -196,7 +195,8 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, + return; + } + +- ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); ++ ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, ++ BCH_DEV_READ_REF_btree_node_ondisk_to_text); + if (!ca) { + prt_printf(out, "error getting device to read from: not online\n"); + return; +@@ -297,28 +297,13 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, + if (bio) + bio_put(bio); + kvfree(n_ondisk); +- percpu_ref_put(&ca->io_ref[READ]); ++ enumerated_ref_put(&ca->io_ref[READ], ++ BCH_DEV_READ_REF_btree_node_ondisk_to_text); + } + + #ifdef CONFIG_DEBUG_FS + +-/* XXX: bch_fs refcounting */ +- +-struct dump_iter { +- struct bch_fs *c; +- enum btree_id id; +- struct bpos from; +- struct bpos prev_node; +- 
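/*
 * struct promote_op above ends in a flexible array member, so the op and
 * its inline bio vecs come from a single allocation. Hedged sketch of how
 * such an object is typically sized with struct_size():
 */
#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/bvec.h>

struct promote_like_stub {
	unsigned int nr_vecs;
	struct bio_vec bi_inline_vecs[];	/* must be last */
};

static struct promote_like_stub *promote_alloc_sketch(unsigned int nr_vecs)
{
	struct promote_like_stub *op =
		kzalloc(struct_size(op, bi_inline_vecs, nr_vecs), GFP_KERNEL);

	if (op)
		op->nr_vecs = nr_vecs;
	return op;
}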
u64 iter; +- +- struct printbuf buf; +- +- char __user *ubuf; /* destination user buffer */ +- size_t size; /* size of requested read */ +- ssize_t ret; /* bytes read so far */ +-}; +- +-static ssize_t flush_buf(struct dump_iter *i) ++ssize_t bch2_debugfs_flush_buf(struct dump_iter *i) + { + if (i->buf.pos) { + size_t bytes = min_t(size_t, i->buf.pos, i->size); +@@ -330,6 +315,11 @@ static ssize_t flush_buf(struct dump_iter *i) + i->buf.pos -= copied; + memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); + ++ if (i->buf.last_newline >= copied) ++ i->buf.last_newline -= copied; ++ if (i->buf.last_field >= copied) ++ i->buf.last_field -= copied; ++ + if (copied != bytes) + return -EFAULT; + } +@@ -356,7 +346,7 @@ static int bch2_dump_open(struct inode *inode, struct file *file) + return 0; + } + +-static int bch2_dump_release(struct inode *inode, struct file *file) ++int bch2_dump_release(struct inode *inode, struct file *file) + { + struct dump_iter *i = file->private_data; + +@@ -374,17 +364,17 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, + i->size = size; + i->ret = 0; + +- return flush_buf(i) ?: +- bch2_trans_run(i->c, +- for_each_btree_key(trans, iter, i->id, i->from, +- BTREE_ITER_prefetch| +- BTREE_ITER_all_snapshots, k, ({ +- bch2_bkey_val_to_text(&i->buf, i->c, k); +- prt_newline(&i->buf); +- bch2_trans_unlock(trans); +- i->from = bpos_successor(iter.pos); +- flush_buf(i); +- }))) ?: ++ CLASS(btree_trans, trans)(i->c); ++ return bch2_debugfs_flush_buf(i) ?: ++ for_each_btree_key(trans, iter, i->id, i->from, ++ BTREE_ITER_prefetch| ++ BTREE_ITER_all_snapshots, k, ({ ++ bch2_bkey_val_to_text(&i->buf, i->c, k); ++ prt_newline(&i->buf); ++ bch2_trans_unlock(trans); ++ i->from = bpos_successor(iter.pos); ++ bch2_debugfs_flush_buf(i); ++ })) ?: + i->ret; + } + +@@ -404,22 +394,22 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + i->size = size; + i->ret = 0; + +- ssize_t ret = flush_buf(i); ++ ssize_t ret = bch2_debugfs_flush_buf(i); + if (ret) + return ret; + + if (bpos_eq(SPOS_MAX, i->from)) + return i->ret; + +- return bch2_trans_run(i->c, +- for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ +- bch2_btree_node_to_text(&i->buf, i->c, b); +- i->from = !bpos_eq(SPOS_MAX, b->key.k.p) +- ? bpos_successor(b->key.k.p) +- : b->key.k.p; ++ CLASS(btree_trans, trans)(i->c); ++ return for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ ++ bch2_btree_node_to_text(&i->buf, i->c, b); ++ i->from = !bpos_eq(SPOS_MAX, b->key.k.p) ++ ? 
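/*
 * bch2_debugfs_flush_buf() above is the chunked-read workhorse behind all
 * of these debugfs files: copy as much buffered output as the reader
 * asked for, then shift the tail down (the new hunk also rebases the
 * printbuf's last_newline/last_field bookmarks). Simplified sketch of the
 * copy/shift step with stub types:
 */
#include <linux/uaccess.h>
#include <linux/string.h>
#include <linux/minmax.h>

struct dump_stub {
	char buf[256]; size_t pos;	/* accumulated output */
	char __user *ubuf;		/* reader's buffer */
	size_t size;			/* space left in ubuf */
	ssize_t ret;			/* bytes handed out so far */
};

static ssize_t flush_sketch(struct dump_stub *i)
{
	size_t bytes = min_t(size_t, i->pos, i->size);
	size_t copied = bytes - copy_to_user(i->ubuf, i->buf, bytes);

	i->ret  += copied;
	i->ubuf += copied;
	i->size -= copied;
	i->pos  -= copied;
	memmove(i->buf, i->buf + copied, i->pos);

	return copied != bytes ? -EFAULT : 0;
}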
bpos_successor(b->key.k.p) ++ : b->key.k.p; + +- drop_locks_do(trans, flush_buf(i)); +- }))) ?: i->ret; ++ drop_locks_do(trans, bch2_debugfs_flush_buf(i)); ++ })) ?: i->ret; + } + + static const struct file_operations btree_format_debug_ops = { +@@ -438,27 +428,27 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, + i->size = size; + i->ret = 0; + +- return flush_buf(i) ?: +- bch2_trans_run(i->c, +- for_each_btree_key(trans, iter, i->id, i->from, +- BTREE_ITER_prefetch| +- BTREE_ITER_all_snapshots, k, ({ +- struct btree_path_level *l = +- &btree_iter_path(trans, &iter)->l[0]; +- struct bkey_packed *_k = +- bch2_btree_node_iter_peek(&l->iter, l->b); +- +- if (bpos_gt(l->b->key.k.p, i->prev_node)) { +- bch2_btree_node_to_text(&i->buf, i->c, l->b); +- i->prev_node = l->b->key.k.p; +- } +- +- bch2_bfloat_to_text(&i->buf, l->b, _k); +- bch2_trans_unlock(trans); +- i->from = bpos_successor(iter.pos); +- flush_buf(i); +- }))) ?: +- i->ret; ++ CLASS(btree_trans, trans)(i->c); ++ return bch2_debugfs_flush_buf(i) ?: ++ for_each_btree_key(trans, iter, i->id, i->from, ++ BTREE_ITER_prefetch| ++ BTREE_ITER_all_snapshots, k, ({ ++ struct btree_path_level *l = ++ &btree_iter_path(trans, &iter)->l[0]; ++ struct bkey_packed *_k = ++ bch2_btree_node_iter_peek(&l->iter, l->b); ++ ++ if (bpos_gt(l->b->key.k.p, i->prev_node)) { ++ bch2_btree_node_to_text(&i->buf, i->c, l->b); ++ i->prev_node = l->b->key.k.p; ++ } ++ ++ bch2_bfloat_to_text(&i->buf, l->b, _k); ++ bch2_trans_unlock(trans); ++ i->from = bpos_successor(iter.pos); ++ bch2_debugfs_flush_buf(i); ++ })) ?: ++ i->ret; + } + + static const struct file_operations bfloat_failed_debug_ops = { +@@ -472,7 +462,7 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * + struct btree *b) + { + if (!out->nr_tabstops) +- printbuf_tabstop_push(out, 32); ++ printbuf_tabstop_push(out, 36); + + prt_printf(out, "%px ", b); + bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); +@@ -497,6 +487,8 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * + prt_printf(out, "journal pin %px:\t%llu\n", + &b->writes[1].journal, b->writes[1].journal.seq); + ++ prt_printf(out, "ob:\t%u\n", b->ob.nr); ++ + printbuf_indent_sub(out, 2); + } + +@@ -513,34 +505,33 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, + i->ret = 0; + + do { +- struct bucket_table *tbl; +- struct rhash_head *pos; +- struct btree *b; +- +- ret = flush_buf(i); ++ ret = bch2_debugfs_flush_buf(i); + if (ret) + return ret; + +- rcu_read_lock(); +- i->buf.atomic++; +- tbl = rht_dereference_rcu(c->btree_cache.table.tbl, +- &c->btree_cache.table); +- if (i->iter < tbl->size) { +- rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) +- bch2_cached_btree_node_to_text(&i->buf, c, b); +- i->iter++; +- } else { +- done = true; ++ scoped_guard(rcu) { ++ guard(printbuf_atomic)(&i->buf); ++ struct bucket_table *tbl = ++ rht_dereference_rcu(c->btree_cache.table.tbl, ++ &c->btree_cache.table); ++ if (i->iter < tbl->size) { ++ struct rhash_head *pos; ++ struct btree *b; ++ ++ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) ++ bch2_cached_btree_node_to_text(&i->buf, c, b); ++ i->iter++; ++ } else { ++ done = true; ++ } + } +- --i->buf.atomic; +- rcu_read_unlock(); + } while (!done); + + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) +- ret = flush_buf(i); ++ ret = bch2_debugfs_flush_buf(i); + + return ret ?: i->ret; + } +@@ -589,6 +580,8 @@ static ssize_t 
bch2_btree_transactions_read(struct file *file, char __user *buf, + i->ubuf = buf; + i->size = size; + i->ret = 0; ++ ++ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + restart: + seqmutex_lock(&c->btree_trans_lock); + list_sort(&c->btree_trans_list, list_ptr_order_cmp); +@@ -602,6 +595,11 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, + if (!closure_get_not_zero(&trans->ref)) + continue; + ++ if (!trans->srcu_held) { ++ closure_put(&trans->ref); ++ continue; ++ } ++ + u32 seq = seqmutex_unlock(&c->btree_trans_lock); + + bch2_btree_trans_to_text(&i->buf, trans); +@@ -614,7 +612,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, + + closure_put(&trans->ref); + +- ret = flush_buf(i); ++ ret = bch2_debugfs_flush_buf(i); + if (ret) + goto unlocked; + +@@ -623,11 +621,13 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, + } + seqmutex_unlock(&c->btree_trans_lock); + unlocked: ++ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); ++ + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) +- ret = flush_buf(i); ++ ret = bch2_debugfs_flush_buf(i); + + return ret ?: i->ret; + } +@@ -652,7 +652,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, + i->ret = 0; + + while (1) { +- err = flush_buf(i); ++ err = bch2_debugfs_flush_buf(i); + if (err) + return err; + +@@ -695,7 +695,7 @@ static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, + i->iter++; + } + +- err = flush_buf(i); ++ err = bch2_debugfs_flush_buf(i); + if (err) + return err; + +@@ -753,7 +753,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, + while (1) { + struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; + +- err = flush_buf(i); ++ err = bch2_debugfs_flush_buf(i); + if (err) + return err; + +@@ -767,9 +767,15 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, + prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]); + printbuf_indent_add(&i->buf, 2); + +- mutex_lock(&s->lock); ++ guard(mutex)(&s->lock); + + prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); ++#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE ++ printbuf_indent_add(&i->buf, 2); ++ bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); ++ printbuf_indent_sub(&i->buf, 2); ++#endif ++ + prt_printf(&i->buf, "Transaction duration:\n"); + + printbuf_indent_add(&i->buf, 2); +@@ -792,8 +798,6 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, + printbuf_indent_sub(&i->buf, 2); + } + +- mutex_unlock(&s->lock); +- + printbuf_indent_sub(&i->buf, 2); + prt_newline(&i->buf); + i->iter++; +@@ -868,7 +872,7 @@ static ssize_t bch2_simple_print(struct file *file, char __user *buf, + ret = -ENOMEM; + + if (!ret) +- ret = flush_buf(i); ++ ret = bch2_debugfs_flush_buf(i); + + return ret ?: i->ret; + } +@@ -927,7 +931,11 @@ void bch2_fs_debug_init(struct bch_fs *c) + if (IS_ERR_OR_NULL(bch_debug)) + return; + +- snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); ++ if (c->sb.multi_device) ++ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); ++ else ++ strscpy(name, c->name, sizeof(name)); ++ + c->fs_debug_dir = debugfs_create_dir(name, bch_debug); + if (IS_ERR_OR_NULL(c->fs_debug_dir)) + return; +@@ -953,6 +961,8 @@ void bch2_fs_debug_init(struct bch_fs *c) + debugfs_create_file("write_points", 0400, c->fs_debug_dir, + c->btree_debug, &write_points_ops); + ++ 
bch2_fs_async_obj_debugfs_init(c); ++ + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); + if (IS_ERR_OR_NULL(c->btree_debug_dir)) + return; +diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h +index 2c37143b5fd1..d88b1194b8ac 100644 +--- a/fs/bcachefs/debug.h ++++ b/fs/bcachefs/debug.h +@@ -14,11 +14,29 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, + + static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) + { +- if (bch2_verify_btree_ondisk) ++ if (static_branch_unlikely(&bch2_verify_btree_ondisk)) + __bch2_btree_verify(c, b); + } + + #ifdef CONFIG_DEBUG_FS ++struct dump_iter { ++ struct bch_fs *c; ++ struct async_obj_list *list; ++ enum btree_id id; ++ struct bpos from; ++ struct bpos prev_node; ++ u64 iter; ++ ++ struct printbuf buf; ++ ++ char __user *ubuf; /* destination user buffer */ ++ size_t size; /* size of requested read */ ++ ssize_t ret; /* bytes read so far */ ++}; ++ ++ssize_t bch2_debugfs_flush_buf(struct dump_iter *); ++int bch2_dump_release(struct inode *, struct file *); ++ + void bch2_fs_debug_exit(struct bch_fs *); + void bch2_fs_debug_init(struct bch_fs *); + #else +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index a51195088227..cb44b35e0f1d 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -13,14 +13,18 @@ + + #include + ++#if IS_ENABLED(CONFIG_UNICODE) + int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) + { + *out_cf = (struct qstr) QSTR_INIT(NULL, 0); + +-#ifdef CONFIG_UNICODE ++ int ret = bch2_fs_casefold_enabled(trans->c); ++ if (ret) ++ return ret; ++ + unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); +- int ret = PTR_ERR_OR_ZERO(buf); ++ ret = PTR_ERR_OR_ZERO(buf); + if (ret) + return ret; + +@@ -30,10 +34,8 @@ int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, + + *out_cf = (struct qstr) QSTR_INIT(buf, ret); + return 0; +-#else +- return -EOPNOTSUPP; +-#endif + } ++#endif + + static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) + { +@@ -212,82 +214,87 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + struct qstr d_name = bch2_dirent_get_name(d); + +- prt_printf(out, "%.*s -> ", d_name.len, d_name.name); ++ prt_bytes(out, d_name.name, d_name.len); ++ ++ if (d.v->d_casefold) { ++ prt_str(out, " (casefold "); ++ struct qstr d_name = bch2_dirent_get_lookup_name(d); ++ prt_bytes(out, d_name.name, d_name.len); ++ prt_char(out, ')'); ++ } ++ ++ prt_str(out, " ->"); + + if (d.v->d_type != DT_SUBVOL) +- prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum)); ++ prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum)); + else +- prt_printf(out, "%u -> %u", ++ prt_printf(out, " %u -> %u", + le32_to_cpu(d.v->d_parent_subvol), + le32_to_cpu(d.v->d_child_subvol)); + + prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); + } + +-static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, +- subvol_inum dir, +- u8 type, +- int name_len, int cf_name_len, +- u64 dst) ++int bch2_dirent_init_name(struct bch_fs *c, ++ struct bkey_i_dirent *dirent, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name, ++ const struct qstr *cf_name) + { +- struct bkey_i_dirent *dirent; +- unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len); +- +- BUG_ON(u64s > U8_MAX); ++ EBUG_ON(hash_info->cf_encoding == NULL && cf_name); ++ int cf_len = 0; + 
+- dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); +- if (IS_ERR(dirent)) +- return dirent; ++ if (name->len > BCH_NAME_MAX) ++ return -ENAMETOOLONG; + +- bkey_dirent_init(&dirent->k_i); +- dirent->k.u64s = u64s; ++ dirent->v.d_casefold = hash_info->cf_encoding != NULL; + +- if (type != DT_SUBVOL) { +- dirent->v.d_inum = cpu_to_le64(dst); ++ if (!dirent->v.d_casefold) { ++ memcpy(&dirent->v.d_name[0], name->name, name->len); ++ memset(&dirent->v.d_name[name->len], 0, ++ bkey_val_bytes(&dirent->k) - ++ offsetof(struct bch_dirent, d_name) - ++ name->len); + } else { +- dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); +- dirent->v.d_child_subvol = cpu_to_le32(dst); +- } ++ int ret = bch2_fs_casefold_enabled(c); ++ if (ret) ++ return ret; + +- dirent->v.d_type = type; +- dirent->v.d_unused = 0; +- dirent->v.d_casefold = cf_name_len ? 1 : 0; ++#if IS_ENABLED(CONFIG_UNICODE) ++ memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); + +- return dirent; +-} ++ char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len]; ++ void *val_end = bkey_val_end(bkey_i_to_s(&dirent->k_i)); + +-static void dirent_init_regular_name(struct bkey_i_dirent *dirent, +- const struct qstr *name) +-{ +- EBUG_ON(dirent->v.d_casefold); ++ if (cf_name) { ++ cf_len = cf_name->len; + +- memcpy(&dirent->v.d_name[0], name->name, name->len); +- memset(&dirent->v.d_name[name->len], 0, +- bkey_val_bytes(&dirent->k) - +- offsetof(struct bch_dirent, d_name) - +- name->len); +-} ++ memcpy(cf_out, cf_name->name, cf_name->len); ++ } else { ++ cf_len = utf8_casefold(hash_info->cf_encoding, name, ++ cf_out, val_end - (void *) cf_out); ++ if (cf_len <= 0) ++ return cf_len; ++ } + +-static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, +- const struct qstr *name, +- const struct qstr *cf_name) +-{ +- EBUG_ON(!dirent->v.d_casefold); +- EBUG_ON(!cf_name->len); +- +- dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); +- dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len); +- memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); +- memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len); +- memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, +- bkey_val_bytes(&dirent->k) - +- offsetof(struct bch_dirent, d_cf_name_block.d_names) - +- name->len + cf_name->len); +- +- EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len); ++ void *name_end = &dirent->v.d_cf_name_block.d_names[name->len + cf_len]; ++ BUG_ON(name_end > val_end); ++ memset(name_end, 0, val_end - name_end); ++ ++ dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); ++ dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len); ++ ++ EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len); ++#endif ++ } ++ ++ unsigned u64s = dirent_val_u64s(name->len, cf_len); ++ BUG_ON(u64s > bkey_val_u64s(&dirent->k)); ++ set_bkey_val_u64s(&dirent->k, u64s); ++ return 0; + } + +-static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, ++struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans, + const struct bch_hash_info *hash_info, + subvol_inum dir, + u8 type, +@@ -295,31 +302,28 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + const struct qstr *cf_name, + u64 dst) + { +- struct bkey_i_dirent *dirent; +- struct qstr _cf_name; +- +- if (name->len > BCH_NAME_MAX) +- return ERR_PTR(-ENAMETOOLONG); ++ struct bkey_i_dirent *dirent = 
bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); ++ if (IS_ERR(dirent)) ++ return dirent; + +- if (hash_info->cf_encoding && !cf_name) { +- int ret = bch2_casefold(trans, hash_info, name, &_cf_name); +- if (ret) +- return ERR_PTR(ret); ++ bkey_dirent_init(&dirent->k_i); ++ dirent->k.u64s = BKEY_U64s_MAX; + +- cf_name = &_cf_name; ++ if (type != DT_SUBVOL) { ++ dirent->v.d_inum = cpu_to_le64(dst); ++ } else { ++ dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); ++ dirent->v.d_child_subvol = cpu_to_le32(dst); + } + +- dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst); +- if (IS_ERR(dirent)) +- return dirent; ++ dirent->v.d_type = type; ++ dirent->v.d_unused = 0; + +- if (cf_name) +- dirent_init_casefolded_name(dirent, name, cf_name); +- else +- dirent_init_regular_name(dirent, name); ++ int ret = bch2_dirent_init_name(trans->c, dirent, hash_info, name, cf_name); ++ if (ret) ++ return ERR_PTR(ret); + + EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); +- + return dirent; + } + +@@ -334,7 +338,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, + struct bkey_i_dirent *dirent; + int ret; + +- dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); ++ dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); + ret = PTR_ERR_OR_ZERO(dirent); + if (ret) + return ret; +@@ -358,7 +362,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, + struct bkey_i_dirent *dirent; + int ret; + +- dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); ++ dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); + ret = PTR_ERR_OR_ZERO(dirent); + if (ret) + return ret; +@@ -395,15 +399,15 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, + } + + int bch2_dirent_rename(struct btree_trans *trans, +- subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size, +- subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size, ++ subvol_inum src_dir, struct bch_hash_info *src_hash, ++ subvol_inum dst_dir, struct bch_hash_info *dst_hash, + const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, + const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, + enum bch_rename_mode mode) + { + struct qstr src_name_lookup, dst_name_lookup; +- struct btree_iter src_iter = {}; +- struct btree_iter dst_iter = {}; ++ struct btree_iter src_iter = { NULL }; ++ struct btree_iter dst_iter = { NULL }; + struct bkey_s_c old_src, old_dst = bkey_s_c_null; + struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; + struct bpos dst_pos = +@@ -463,8 +467,8 @@ int bch2_dirent_rename(struct btree_trans *trans, + *src_offset = dst_iter.pos.offset; + + /* Create new dst key: */ +- new_dst = dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, +- dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); ++ new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, ++ dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); + ret = PTR_ERR_OR_ZERO(new_dst); + if (ret) + goto out; +@@ -474,8 +478,8 @@ int bch2_dirent_rename(struct btree_trans *trans, + + /* Create new src key: */ + if (mode == BCH_RENAME_EXCHANGE) { +- new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name, +- src_hash->cf_encoding ? &src_name_lookup : NULL, 0); ++ new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name, ++ src_hash->cf_encoding ? 
&src_name_lookup : NULL, 0); + ret = PTR_ERR_OR_ZERO(new_src); + if (ret) + goto out; +@@ -535,14 +539,6 @@ int bch2_dirent_rename(struct btree_trans *trans, + new_src->v.d_type == DT_SUBVOL) + new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); + +- if (old_dst.k) +- *dst_dir_i_size -= bkey_bytes(old_dst.k); +- *src_dir_i_size -= bkey_bytes(old_src.k); +- +- if (mode == BCH_RENAME_EXCHANGE) +- *src_dir_i_size += bkey_bytes(&new_src->k); +- *dst_dir_i_size += bkey_bytes(&new_dst->k); +- + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); + if (ret) + goto out; +@@ -571,16 +567,16 @@ int bch2_dirent_rename(struct btree_trans *trans, + } + + if (delete_src) { +- bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot); +- ret = bch2_btree_iter_traverse(trans, &src_iter) ?: ++ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); ++ ret = bch2_btree_iter_traverse(&src_iter) ?: + bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node); + if (ret) + goto out; + } + + if (delete_dst) { +- bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot); +- ret = bch2_btree_iter_traverse(trans, &dst_iter) ?: ++ bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); ++ ret = bch2_btree_iter_traverse(&dst_iter) ?: + bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node); + if (ret) + goto out; +@@ -590,8 +586,8 @@ int bch2_dirent_rename(struct btree_trans *trans, + *src_offset = new_src->k.p.offset; + *dst_offset = new_dst->k.p.offset; + out: +- bch2_trans_iter_exit(trans, &src_iter); +- bch2_trans_iter_exit(trans, &dst_iter); ++ bch2_trans_iter_exit(&src_iter); ++ bch2_trans_iter_exit(&dst_iter); + return ret; + } + +@@ -618,7 +614,7 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans, + ret = -ENOENT; + err: + if (ret) +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + return ret; + } + +@@ -626,19 +622,17 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum) + { +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + struct btree_iter iter = {}; + + int ret = lockrestart_do(trans, + bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); +- bch2_trans_iter_exit(trans, &iter); +- bch2_trans_put(trans); ++ bch2_trans_iter_exit(&iter); + return ret; + } + + int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot) + { +- struct btree_iter iter; + struct bkey_s_c k; + int ret; + +@@ -649,10 +643,9 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) + continue; +- ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; ++ ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty); + break; + } +- bch2_trans_iter_exit(trans, &iter); + + return ret; + } +@@ -685,13 +678,15 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv + return !ret; + } + +-int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) ++int bch2_readdir(struct bch_fs *c, subvol_inum inum, ++ struct bch_hash_info *hash_info, ++ struct dir_context *ctx) + { + struct bkey_buf sk; + bch2_bkey_buf_init(&sk); + +- int ret = bch2_trans_run(c, +- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, ++ CLASS(btree_trans, trans)(c); ++ int ret = 
for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, + POS(inum.inum, ctx->pos), + POS(inum.inum, U64_MAX), + inum.subvol, 0, k, ({ +@@ -703,12 +698,16 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) + struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); + + subvol_inum target; +- int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target); ++ ++ bool need_second_pass = false; ++ int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc, ++ hash_info, &iter, k, &need_second_pass) ?: ++ bch2_dirent_read_target(trans, inum, dirent, &target); + if (ret2 > 0) + continue; + + ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target)); +- }))); ++ })); + + bch2_bkey_buf_exit(&sk, c); + +@@ -720,7 +719,6 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) + static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) + { +- struct btree_iter iter; + struct bkey_s_c k; + int ret; + +@@ -733,34 +731,31 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + ret = bch2_inode_unpack(k, inode); + goto found; + } +- ret = -BCH_ERR_ENOENT_inode; ++ ret = bch_err_throw(trans->c, ENOENT_inode); + found: + bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); +- bch2_trans_iter_exit(trans, &iter); + return ret; + } + + int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; +- struct bch_inode_unpacked dir_inode; +- struct bch_hash_info dir_hash_info; +- int ret; + +- ret = lookup_first_inode(trans, pos.inode, &dir_inode); ++ struct bch_inode_unpacked dir_inode; ++ int ret = lookup_first_inode(trans, pos.inode, &dir_inode); + if (ret) + goto err; + +- dir_hash_info = bch2_hash_info_init(c, &dir_inode); ++ { ++ struct bch_hash_info dir_hash_info = bch2_hash_info_init(c, &dir_inode); + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_dirents, pos, BTREE_ITER_intent); + +- ret = bch2_btree_iter_traverse(trans, &iter) ?: +- bch2_hash_delete_at(trans, bch2_dirent_hash_desc, +- &dir_hash_info, &iter, +- BTREE_UPDATE_internal_snapshot_node); +- bch2_trans_iter_exit(trans, &iter); ++ ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_hash_delete_at(trans, bch2_dirent_hash_desc, ++ &dir_hash_info, &iter, ++ BTREE_UPDATE_internal_snapshot_node); ++ } + err: + bch_err_fn(c, ret); + return ret; +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +index d3e7ae669575..efb58d2dcf68 100644 +--- a/fs/bcachefs/dirent.h ++++ b/fs/bcachefs/dirent.h +@@ -23,8 +23,16 @@ struct bch_fs; + struct bch_hash_info; + struct bch_inode_info; + ++#if IS_ENABLED(CONFIG_UNICODE) + int bch2_casefold(struct btree_trans *, const struct bch_hash_info *, + const struct qstr *, struct qstr *); ++#else ++static inline int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, ++ const struct qstr *str, struct qstr *out_cf) ++{ ++ return bch_err_throw(trans->c, no_casefolding_without_utf8); ++} ++#endif + + static inline int bch2_maybe_casefold(struct btree_trans *trans, + const struct bch_hash_info *info, +@@ -38,7 +46,7 @@ static inline int bch2_maybe_casefold(struct btree_trans *trans, + } + } + +-struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); ++struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent); + + static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) 
+ { +@@ -59,6 +67,15 @@ static inline void dirent_copy_target(struct bkey_i_dirent *dst, + dst->v.d_type = src.v->d_type; + } + ++int bch2_dirent_init_name(struct bch_fs *, ++ struct bkey_i_dirent *, ++ const struct bch_hash_info *, ++ const struct qstr *, ++ const struct qstr *); ++struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *, ++ const struct bch_hash_info *, subvol_inum, u8, ++ const struct qstr *, const struct qstr *, u64); ++ + int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, + const struct bch_hash_info *, u8, + const struct qstr *, u64, u64 *, +@@ -80,8 +97,8 @@ enum bch_rename_mode { + }; + + int bch2_dirent_rename(struct btree_trans *, +- subvol_inum, struct bch_hash_info *, u64 *, +- subvol_inum, struct bch_hash_info *, u64 *, ++ subvol_inum, struct bch_hash_info *, ++ subvol_inum, struct bch_hash_info *, + const struct qstr *, subvol_inum *, u64 *, + const struct qstr *, subvol_inum *, u64 *, + enum bch_rename_mode); +@@ -95,7 +112,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, + + int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); + int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); +-int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); ++int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *); + + int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); + +diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c +index 1f0422bfae35..f96530c70262 100644 +--- a/fs/bcachefs/disk_accounting.c ++++ b/fs/bcachefs/disk_accounting.c +@@ -68,23 +68,31 @@ static const char * const disk_accounting_type_strs[] = { + NULL + }; + +-static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, +- s64 *d, unsigned nr) ++static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos, ++ s64 *d, unsigned nr) + { + struct bkey_i_accounting *acc = bkey_accounting_init(k); + +- acc->k.p = disk_accounting_pos_to_bpos(pos); ++ acc->k.p = pos; + set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); + + memcpy_u64s_small(acc->v.d, d, nr); + } + ++static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, ++ s64 *d, unsigned nr) ++{ ++ return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr); ++} ++ + static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); + + int bch2_disk_accounting_mod(struct btree_trans *trans, + struct disk_accounting_pos *k, + s64 *d, unsigned nr, bool gc) + { ++ BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); ++ + /* Normalize: */ + switch (k->type) { + case BCH_DISK_ACCOUNTING_replicas: +@@ -92,21 +100,49 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, + break; + } + +- BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); ++ struct bpos pos = disk_accounting_pos_to_bpos(k); ++ ++ if (likely(!gc)) { ++ struct bkey_i_accounting *a; ++#if 0 ++ for (a = btree_trans_subbuf_base(trans, &trans->accounting); ++ a != btree_trans_subbuf_top(trans, &trans->accounting); ++ a = (void *) bkey_next(&a->k_i)) ++ if (bpos_eq(a->k.p, pos)) { ++ BUG_ON(nr != bch2_accounting_counters(&a->k)); ++ acc_u64s(a->v.d, d, nr); ++ ++ if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) { ++ unsigned offset = (u64 *) a - ++ (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); ++ ++ trans->accounting.u64s -= a->k.u64s; ++ memmove_u64s_down(a, ++ bkey_next(&a->k_i), ++ trans->accounting.u64s - offset); ++ } ++ return 0; ++ } 
++#endif ++ unsigned u64s = sizeof(*a) / sizeof(u64) + nr; ++ a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s); ++ int ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ return ret; + +- struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; ++ __accounting_key_init(&a->k_i, pos, d, nr); ++ return 0; ++ } else { ++ struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + +- accounting_key_init(&k_i.k, k, d, nr); ++ __accounting_key_init(&k_i.k, pos, d, nr); + +- if (unlikely(gc)) { + int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + if (ret == -BCH_ERR_btree_insert_need_mark_replicas) + ret = drop_locks_do(trans, + bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: + bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + return ret; +- } else { +- return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); + } + } + +@@ -287,7 +323,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc + + static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) + { +- struct bch_replicas_padded r; ++ union bch_replicas_padded r; + return accounting_to_replicas(&r.e, p) + ? bch2_mark_replicas(c, &r.e) + : 0; +@@ -299,14 +335,13 @@ static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) + */ + int bch2_accounting_update_sb(struct btree_trans *trans) + { +- for (struct jset_entry *i = trans->journal_entries; +- i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); +- i = vstruct_next(i)) +- if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) { +- int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p); +- if (ret) +- return ret; +- } ++ for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); ++ i != btree_trans_subbuf_top(trans, &trans->accounting); ++ i = bkey_next(i)) { ++ int ret = bch2_accounting_update_sb_one(trans->c, i->k.p); ++ if (ret) ++ return ret; ++ } + + return 0; + } +@@ -345,33 +380,32 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun + accounting_pos_cmp, NULL); + + if (trace_accounting_mem_insert_enabled()) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_accounting_to_text(&buf, c, a.s_c); + trace_accounting_mem_insert(c, buf.buf); +- printbuf_exit(&buf); + } + return 0; + err: + free_percpu(n.v[1]); + free_percpu(n.v[0]); +- return -BCH_ERR_ENOMEM_disk_accounting; ++ return bch_err_throw(c, ENOMEM_disk_accounting); + } + + int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, + enum bch_accounting_mode mode) + { +- struct bch_replicas_padded r; ++ union bch_replicas_padded r; + + if (mode != BCH_ACCOUNTING_read && + accounting_to_replicas(&r.e, a.k->p) && + !bch2_replicas_marked_locked(c, &r.e)) +- return -BCH_ERR_btree_insert_need_mark_replicas; ++ return bch_err_throw(c, btree_insert_need_mark_replicas); + + percpu_up_read(&c->mark_lock); +- percpu_down_write(&c->mark_lock); +- int ret = __bch2_accounting_mem_insert(c, a); +- percpu_up_write(&c->mark_lock); ++ int ret; ++ scoped_guard(percpu_write, &c->mark_lock) ++ ret = __bch2_accounting_mem_insert(c, a); + percpu_down_read(&c->mark_lock); + return ret; + } +@@ -379,12 +413,12 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, + int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a, + enum bch_accounting_mode mode) + { +- struct bch_replicas_padded r; ++ 
union bch_replicas_padded r; + + if (mode != BCH_ACCOUNTING_read && + accounting_to_replicas(&r.e, a.k->p) && + !bch2_replicas_marked_locked(c, &r.e)) +- return -BCH_ERR_btree_insert_need_mark_replicas; ++ return bch_err_throw(c, btree_insert_need_mark_replicas); + + return __bch2_accounting_mem_insert(c, a); + } +@@ -403,7 +437,7 @@ void bch2_accounting_mem_gc(struct bch_fs *c) + { + struct bch_accounting_mem *acc = &c->accounting; + +- percpu_down_write(&c->mark_lock); ++ guard(percpu_write)(&c->mark_lock); + struct accounting_mem_entry *dst = acc->k.data; + + darray_for_each(acc->k, src) { +@@ -418,7 +452,6 @@ void bch2_accounting_mem_gc(struct bch_fs *c) + acc->k.nr = dst - acc->k.data; + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); +- percpu_up_write(&c->mark_lock); + } + + /* +@@ -436,12 +469,14 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) + + darray_init(usage); + +- percpu_down_read(&c->mark_lock); ++ guard(percpu_read)(&c->mark_lock); + darray_for_each(acc->k, i) { +- struct { ++ union { ++ u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs, ++ BCH_BKEY_PTRS_MAX)]; + struct bch_replicas_usage r; +- u8 pad[BCH_BKEY_PTRS_MAX]; + } u; ++ u.r.r.nr_devs = BCH_BKEY_PTRS_MAX; + + if (!accounting_to_replicas(&u.r.r, i->pos)) + continue; +@@ -457,7 +492,6 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) + memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r)); + usage->nr += replicas_usage_bytes(&u.r); + } +- percpu_up_read(&c->mark_lock); + + if (ret) + darray_exit(usage); +@@ -472,7 +506,7 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc + + darray_init(out_buf); + +- percpu_down_read(&c->mark_lock); ++ guard(percpu_read)(&c->mark_lock); + darray_for_each(acc->k, i) { + struct disk_accounting_pos a_p; + bpos_to_disk_accounting_pos(&a_p, i->pos); +@@ -496,8 +530,6 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc + out_buf->nr += bkey_bytes(&a_out->k); + } + +- percpu_up_read(&c->mark_lock); +- + if (ret) + darray_exit(out_buf); + return ret; +@@ -516,32 +548,30 @@ int bch2_gc_accounting_start(struct bch_fs *c) + struct bch_accounting_mem *acc = &c->accounting; + int ret = 0; + +- percpu_down_write(&c->mark_lock); ++ guard(percpu_write)(&c->mark_lock); + darray_for_each(acc->k, e) { + e->v[1] = __alloc_percpu_gfp(e->nr_counters * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!e->v[1]) { + bch2_accounting_free_counters(acc, true); +- ret = -BCH_ERR_ENOMEM_disk_accounting; ++ ret = bch_err_throw(c, ENOMEM_disk_accounting); + break; + } + } + + acc->gc_running = !ret; +- percpu_up_write(&c->mark_lock); +- + return ret; + } + + int bch2_gc_accounting_done(struct bch_fs *c) + { + struct bch_accounting_mem *acc = &c->accounting; +- struct btree_trans *trans = bch2_trans_get(c); +- struct printbuf buf = PRINTBUF; ++ CLASS(btree_trans, trans)(c); ++ CLASS(printbuf, buf)(); + struct bpos pos = POS_MIN; + int ret = 0; + +- percpu_down_write(&c->mark_lock); ++ guard(percpu_write)(&c->mark_lock); + while (1) { + unsigned idx = eytzinger0_find_ge(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &pos); +@@ -570,20 +600,23 @@ int bch2_gc_accounting_done(struct bch_fs *c) + prt_str(&buf, "accounting mismatch for "); + bch2_accounting_key_to_text(&buf, &acc_k); + +- prt_str(&buf, ": got"); ++ prt_str(&buf, ":\n got"); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", dst_v[j]); + +- 
prt_str(&buf, " should be"); ++ prt_str(&buf, "\nshould be"); + for (unsigned j = 0; j < nr; j++) + prt_printf(&buf, " %llu", src_v[j]); + + for (unsigned j = 0; j < nr; j++) + src_v[j] -= dst_v[j]; + +- if (fsck_err(trans, accounting_mismatch, "%s", buf.buf)) { ++ bch2_trans_unlock_long(trans); ++ ++ if (fsck_err(c, accounting_mismatch, "%s", buf.buf)) { + percpu_up_write(&c->mark_lock); +- ret = commit_do(trans, NULL, NULL, 0, ++ ret = commit_do(trans, NULL, NULL, ++ BCH_TRANS_COMMIT_skip_accounting_apply, + bch2_disk_accounting_mod(trans, &acc_k, src_v, nr, false)); + percpu_down_write(&c->mark_lock); + if (ret) +@@ -598,20 +631,16 @@ int bch2_gc_accounting_done(struct bch_fs *c) + bkey_i_to_s_c_accounting(&k_i.k), + BCH_ACCOUNTING_normal, true); + +- preempt_disable(); ++ guard(preempt)(); + struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); + struct bch_fs_usage_base *src = &trans->fs_usage_delta; + acc_u64s((u64 *) dst, (u64 *) src, sizeof(*src) / sizeof(u64)); +- preempt_enable(); + } + } + } + } + err: + fsck_err: +- percpu_up_write(&c->mark_lock); +- printbuf_exit(&buf); +- bch2_trans_put(trans); + bch_err_fn(c, ret); + return ret; + } +@@ -623,25 +652,23 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) + if (k.k->type != KEY_TYPE_accounting) + return 0; + +- percpu_down_read(&c->mark_lock); +- int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), +- BCH_ACCOUNTING_read, false); +- percpu_up_read(&c->mark_lock); +- return ret; ++ guard(percpu_read)(&c->mark_lock); ++ return bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), ++ BCH_ACCOUNTING_read, false); + } + + static int bch2_disk_accounting_validate_late(struct btree_trans *trans, +- struct disk_accounting_pos acc, ++ struct disk_accounting_pos *acc, + u64 *v, unsigned nr) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0, invalid_dev = -1; + +- switch (acc.type) { ++ switch (acc->type) { + case BCH_DISK_ACCOUNTING_replicas: { +- struct bch_replicas_padded r; +- __accounting_to_replicas(&r.e, &acc); ++ union bch_replicas_padded r; ++ __accounting_to_replicas(&r.e, acc); + + for (unsigned i = 0; i < r.e.nr_devs; i++) + if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && +@@ -660,7 +687,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, + trans, accounting_replicas_not_marked, + "accounting not marked in superblock replicas\n%s", + (printbuf_reset(&buf), +- bch2_accounting_key_to_text(&buf, &acc), ++ bch2_accounting_key_to_text(&buf, acc), + buf.buf))) { + /* + * We're not RW yet and still single threaded, dropping +@@ -676,31 +703,30 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, + } + + case BCH_DISK_ACCOUNTING_dev_data_type: +- if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { +- invalid_dev = acc.dev_data_type.dev; ++ if (!bch2_dev_exists(c, acc->dev_data_type.dev)) { ++ invalid_dev = acc->dev_data_type.dev; + goto invalid_device; + } + break; + } + + fsck_err: +- printbuf_exit(&buf); + return ret; + invalid_device: + if (fsck_err(trans, accounting_to_invalid_device, + "accounting entry points to invalid device %i\n%s", + invalid_dev, + (printbuf_reset(&buf), +- bch2_accounting_key_to_text(&buf, &acc), ++ bch2_accounting_key_to_text(&buf, acc), + buf.buf))) { + for (unsigned i = 0; i < nr; i++) + v[i] = -v[i]; + + ret = commit_do(trans, NULL, NULL, 0, +- bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: ++ bch2_disk_accounting_mod(trans, acc, 
v, nr, false)) ?: + -BCH_ERR_remove_disk_accounting_entry; + } else { +- ret = -BCH_ERR_remove_disk_accounting_entry; ++ ret = bch_err_throw(c, remove_disk_accounting_entry); + } + goto fsck_err; + } +@@ -712,8 +738,8 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, + int bch2_accounting_read(struct bch_fs *c) + { + struct bch_accounting_mem *acc = &c->accounting; +- struct btree_trans *trans = bch2_trans_get(c); +- struct printbuf buf = PRINTBUF; ++ CLASS(btree_trans, trans)(c); ++ CLASS(printbuf, buf)(); + + /* + * We might run more than once if we rewind to start topology repair or +@@ -722,13 +748,13 @@ int bch2_accounting_read(struct bch_fs *c) + * + * Instead, zero out any accounting we have: + */ +- percpu_down_write(&c->mark_lock); +- darray_for_each(acc->k, e) +- percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); +- for_each_member_device(c, ca) +- percpu_memset(ca->usage, 0, sizeof(*ca->usage)); +- percpu_memset(c->usage, 0, sizeof(*c->usage)); +- percpu_up_write(&c->mark_lock); ++ scoped_guard(percpu_write, &c->mark_lock) { ++ darray_for_each(acc->k, e) ++ percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); ++ for_each_member_device(c, ca) ++ percpu_memset(ca->usage, 0, sizeof(*ca->usage)); ++ percpu_memset(c->usage, 0, sizeof(*c->usage)); ++ } + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, +@@ -748,18 +774,19 @@ int bch2_accounting_read(struct bch_fs *c) + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + break; + +- if (!bch2_accounting_is_mem(acc_k)) { ++ if (!bch2_accounting_is_mem(&acc_k)) { + struct disk_accounting_pos next; + memset(&next, 0, sizeof(next)); + next.type = acc_k.type + 1; +- bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); ++ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + continue; + } + + accounting_read_key(trans, k); + })); ++ bch2_trans_iter_exit(&iter); + if (ret) +- goto err; ++ return ret; + + struct journal_keys *keys = &c->journal_keys; + struct journal_key *dst = keys->data; +@@ -770,7 +797,7 @@ int bch2_accounting_read(struct bch_fs *c) + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); + +- if (!bch2_accounting_is_mem(acc_k)) ++ if (!bch2_accounting_is_mem(&acc_k)) + continue; + + struct bkey_s_c k = bkey_i_to_s_c(i->k); +@@ -798,14 +825,14 @@ int bch2_accounting_read(struct bch_fs *c) + + ret = accounting_read_key(trans, k); + if (ret) +- goto err; ++ return ret; + } + + *dst++ = *i; + } + keys->gap = keys->nr = dst - keys->data; + +- percpu_down_write(&c->mark_lock); ++ guard(percpu_write)(&c->mark_lock); + + darray_for_each_reverse(acc->k, i) { + struct disk_accounting_pos acc_k; +@@ -826,7 +853,7 @@ int bch2_accounting_read(struct bch_fs *c) + */ + ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) + ? 
-BCH_ERR_remove_disk_accounting_entry +- : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); ++ : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); + + if (ret == -BCH_ERR_remove_disk_accounting_entry) { + free_percpu(i->v[0]); +@@ -837,60 +864,55 @@ int bch2_accounting_read(struct bch_fs *c) + } + + if (ret) +- goto fsck_err; ++ return ret; + } + + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + +- preempt_disable(); +- struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); ++ scoped_guard(preempt) { ++ struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); + +- for (unsigned i = 0; i < acc->k.nr; i++) { +- struct disk_accounting_pos k; +- bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); ++ for (unsigned i = 0; i < acc->k.nr; i++) { ++ struct disk_accounting_pos k; ++ bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos); + +- u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; +- bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); ++ u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; ++ bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); + +- switch (k.type) { +- case BCH_DISK_ACCOUNTING_persistent_reserved: +- usage->reserved += v[0] * k.persistent_reserved.nr_replicas; +- break; +- case BCH_DISK_ACCOUNTING_replicas: +- fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); +- break; +- case BCH_DISK_ACCOUNTING_dev_data_type: +- rcu_read_lock(); +- struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); +- if (ca) { +- struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; +- percpu_u64_set(&d->buckets, v[0]); +- percpu_u64_set(&d->sectors, v[1]); +- percpu_u64_set(&d->fragmented, v[2]); +- +- if (k.dev_data_type.data_type == BCH_DATA_sb || +- k.dev_data_type.data_type == BCH_DATA_journal) +- usage->hidden += v[0] * ca->mi.bucket_size; ++ switch (k.type) { ++ case BCH_DISK_ACCOUNTING_persistent_reserved: ++ usage->reserved += v[0] * k.persistent_reserved.nr_replicas; ++ break; ++ case BCH_DISK_ACCOUNTING_replicas: ++ fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); ++ break; ++ case BCH_DISK_ACCOUNTING_dev_data_type: { ++ guard(rcu)(); ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); ++ if (ca) { ++ struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; ++ percpu_u64_set(&d->buckets, v[0]); ++ percpu_u64_set(&d->sectors, v[1]); ++ percpu_u64_set(&d->fragmented, v[2]); ++ ++ if (k.dev_data_type.data_type == BCH_DATA_sb || ++ k.dev_data_type.data_type == BCH_DATA_journal) ++ usage->hidden += v[0] * ca->mi.bucket_size; ++ } ++ break; ++ } + } +- rcu_read_unlock(); +- break; + } + } +- preempt_enable(); +-fsck_err: +- percpu_up_write(&c->mark_lock); +-err: +- printbuf_exit(&buf); +- bch2_trans_put(trans); +- bch_err_fn(c, ret); ++ + return ret; + } + + int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) + { +- return bch2_trans_run(c, +- bch2_btree_write_buffer_flush_sync(trans) ?: ++ CLASS(btree_trans, trans)(c); ++ return bch2_btree_write_buffer_flush_sync(trans) ?: + for_each_btree_key_commit(trans, iter, BTREE_ID_accounting, POS_MIN, + BTREE_ITER_all_snapshots, k, NULL, NULL, 0, ({ + struct disk_accounting_pos acc; +@@ -901,15 +923,16 @@ int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) + ? 
bch2_btree_bit_mod_buffered(trans, BTREE_ID_accounting, k.k->p, 0) + : 0; + })) ?: +- bch2_btree_write_buffer_flush_sync(trans)); ++ bch2_btree_write_buffer_flush_sync(trans); + } + + int bch2_dev_usage_init(struct bch_dev *ca, bool gc) + { + struct bch_fs *c = ca->fs; ++ CLASS(btree_trans, trans)(c); + u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; + +- int ret = bch2_trans_do(c, ({ ++ int ret = lockrestart_do(trans, ({ + bch2_disk_accounting_mod2(trans, gc, + v, dev_data_type, + .dev = ca->dev_idx, +@@ -925,80 +948,77 @@ void bch2_verify_accounting_clean(struct bch_fs *c) + bool mismatch = false; + struct bch_fs_usage_base base = {}, base_inmem = {}; + +- bch2_trans_run(c, +- for_each_btree_key(trans, iter, +- BTREE_ID_accounting, POS_MIN, +- BTREE_ITER_all_snapshots, k, ({ +- u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; +- struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); +- unsigned nr = bch2_accounting_counters(k.k); ++ CLASS(btree_trans, trans)(c); ++ for_each_btree_key(trans, iter, ++ BTREE_ID_accounting, POS_MIN, ++ BTREE_ITER_all_snapshots, k, ({ ++ u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; ++ struct bkey_s_c_accounting a = bkey_s_c_to_accounting(k); ++ unsigned nr = bch2_accounting_counters(k.k); + +- struct disk_accounting_pos acc_k; +- bpos_to_disk_accounting_pos(&acc_k, k.k->p); ++ struct disk_accounting_pos acc_k; ++ bpos_to_disk_accounting_pos(&acc_k, k.k->p); + +- if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) +- break; ++ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) ++ break; + +- if (!bch2_accounting_is_mem(acc_k)) { +- struct disk_accounting_pos next; +- memset(&next, 0, sizeof(next)); +- next.type = acc_k.type + 1; +- bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); +- continue; +- } ++ if (!bch2_accounting_is_mem(&acc_k)) { ++ struct disk_accounting_pos next; ++ memset(&next, 0, sizeof(next)); ++ next.type = acc_k.type + 1; ++ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); ++ continue; ++ } + +- bch2_accounting_mem_read(c, k.k->p, v, nr); ++ bch2_accounting_mem_read(c, k.k->p, v, nr); + +- if (memcmp(a.v->d, v, nr * sizeof(u64))) { +- struct printbuf buf = PRINTBUF; ++ if (memcmp(a.v->d, v, nr * sizeof(u64))) { ++ CLASS(printbuf, buf)(); + +- bch2_bkey_val_to_text(&buf, c, k); +- prt_str(&buf, " !="); +- for (unsigned j = 0; j < nr; j++) +- prt_printf(&buf, " %llu", v[j]); ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_str(&buf, " !="); ++ for (unsigned j = 0; j < nr; j++) ++ prt_printf(&buf, " %llu", v[j]); + +- pr_err("%s", buf.buf); +- printbuf_exit(&buf); +- mismatch = true; +- } ++ pr_err("%s", buf.buf); ++ mismatch = true; ++ } + +- switch (acc_k.type) { +- case BCH_DISK_ACCOUNTING_persistent_reserved: +- base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; +- break; +- case BCH_DISK_ACCOUNTING_replicas: +- fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); +- break; +- case BCH_DISK_ACCOUNTING_dev_data_type: { +- rcu_read_lock(); ++ switch (acc_k.type) { ++ case BCH_DISK_ACCOUNTING_persistent_reserved: ++ base.reserved += acc_k.persistent_reserved.nr_replicas * a.v->d[0]; ++ break; ++ case BCH_DISK_ACCOUNTING_replicas: ++ fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); ++ break; ++ case BCH_DISK_ACCOUNTING_dev_data_type: { ++ { ++ guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */ + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); +- if (!ca) { +- rcu_read_unlock(); ++ if (!ca) + continue; +- } + + 
v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); + v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); + v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); +- rcu_read_unlock(); ++ } + +- if (memcmp(a.v->d, v, 3 * sizeof(u64))) { +- struct printbuf buf = PRINTBUF; ++ if (memcmp(a.v->d, v, 3 * sizeof(u64))) { ++ CLASS(printbuf, buf)(); + +- bch2_bkey_val_to_text(&buf, c, k); +- prt_str(&buf, " in mem"); +- for (unsigned j = 0; j < nr; j++) +- prt_printf(&buf, " %llu", v[j]); ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_str(&buf, " in mem"); ++ for (unsigned j = 0; j < nr; j++) ++ prt_printf(&buf, " %llu", v[j]); + +- pr_err("dev accounting mismatch: %s", buf.buf); +- printbuf_exit(&buf); +- mismatch = true; +- } +- } ++ pr_err("dev accounting mismatch: %s", buf.buf); ++ mismatch = true; + } ++ } ++ } + +- 0; +- }))); ++ 0; ++ })); + + acc_u64s_percpu(&base_inmem.hidden, &c->usage->hidden, sizeof(base_inmem) / sizeof(u64)); + +diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h +index d557b99b3c0a..43f4b21d0aab 100644 +--- a/fs/bcachefs/disk_accounting.h ++++ b/fs/bcachefs/disk_accounting.h +@@ -139,10 +139,10 @@ int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum + int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); + void bch2_accounting_mem_gc(struct bch_fs *); + +-static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) ++static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc) + { +- return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && +- acc.type != BCH_DISK_ACCOUNTING_inum; ++ return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR && ++ acc->type != BCH_DISK_ACCOUNTING_inum; + } + + /* +@@ -163,7 +163,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, + if (gc && !acc->gc_running) + return 0; + +- if (!bch2_accounting_is_mem(acc_k)) ++ if (!bch2_accounting_is_mem(&acc_k)) + return 0; + + if (mode == BCH_ACCOUNTING_normal) { +@@ -174,17 +174,17 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, + case BCH_DISK_ACCOUNTING_replicas: + fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); + break; +- case BCH_DISK_ACCOUNTING_dev_data_type: +- rcu_read_lock(); ++ case BCH_DISK_ACCOUNTING_dev_data_type: { ++ guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); + if (ca) { + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); + } +- rcu_read_unlock(); + break; + } ++ } + } + + unsigned idx; +@@ -211,10 +211,8 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, + + static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) + { +- percpu_down_read(&trans->c->mark_lock); +- int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false); +- percpu_up_read(&trans->c->mark_lock); +- return ret; ++ guard(percpu_read)(&trans->c->mark_lock); ++ return bch2_accounting_mem_mod_locked(trans, a, gc ? 
BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false); + } + + static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *acc, +@@ -236,13 +234,12 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem * + static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, + u64 *v, unsigned nr) + { +- percpu_down_read(&c->mark_lock); ++ guard(percpu_read)(&c->mark_lock); + struct bch_accounting_mem *acc = &c->accounting; + unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, &p); + + bch2_accounting_mem_read_counters(acc, idx, v, nr, false); +- percpu_up_read(&c->mark_lock); + } + + static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) +@@ -259,8 +256,8 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, + struct bkey_i_accounting *a, + unsigned commit_flags) + { +- a->k.bversion = journal_pos_to_bversion(&trans->journal_res, +- (u64 *) a - (u64 *) trans->journal_entries); ++ u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); ++ a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base); + + EBUG_ON(bversion_zero(a->k.bversion)); + +diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c +index 2ca3cbf12b71..293e47268508 100644 +--- a/fs/bcachefs/disk_groups.c ++++ b/fs/bcachefs/disk_groups.c +@@ -86,35 +86,6 @@ static int bch2_sb_disk_groups_validate(struct bch_sb *sb, struct bch_sb_field * + return ret; + } + +-void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) +-{ +- out->atomic++; +- rcu_read_lock(); +- +- struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); +- if (!g) +- goto out; +- +- for (unsigned i = 0; i < g->nr; i++) { +- if (i) +- prt_printf(out, " "); +- +- if (g->entries[i].deleted) { +- prt_printf(out, "[deleted]"); +- continue; +- } +- +- prt_printf(out, "[parent %d devs", g->entries[i].parent); +- for_each_member_device_rcu(c, ca, &g->entries[i].devs) +- prt_printf(out, " %s", ca->name); +- prt_printf(out, "]"); +- } +- +-out: +- rcu_read_unlock(); +- out->atomic--; +-} +- + static void bch2_sb_disk_groups_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +@@ -159,7 +130,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) + + cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL); + if (!cpu_g) +- return -BCH_ERR_ENOMEM_disk_groups_to_cpu; ++ return bch_err_throw(c, ENOMEM_disk_groups_to_cpu); + + cpu_g->nr = nr_groups; + +@@ -199,36 +170,28 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) + const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) + { + struct target t = target_decode(target); +- struct bch_devs_mask *devs; + +- rcu_read_lock(); ++ guard(rcu)(); + + switch (t.type) { + case TARGET_NULL: +- devs = NULL; +- break; ++ return NULL; + case TARGET_DEV: { + struct bch_dev *ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; +- devs = ca ? &ca->self : NULL; +- break; ++ return ca ? &ca->self : NULL; + } + case TARGET_GROUP: { + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + +- devs = g && t.group < g->nr && !g->entries[t.group].deleted ++ return g && t.group < g->nr && !g->entries[t.group].deleted + ? 
&g->entries[t.group].devs + : NULL; +- break; + } + default: + BUG(); + } +- +- rcu_read_unlock(); +- +- return devs; + } + + bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) +@@ -241,20 +204,13 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) + case TARGET_DEV: + return dev == t.dev; + case TARGET_GROUP: { +- struct bch_disk_groups_cpu *g; +- const struct bch_devs_mask *m; +- bool ret; +- +- rcu_read_lock(); +- g = rcu_dereference(c->disk_groups); +- m = g && t.group < g->nr && !g->entries[t.group].deleted ++ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); ++ const struct bch_devs_mask *m = ++ g && t.group < g->nr && !g->entries[t.group].deleted + ? &g->entries[t.group].devs + : NULL; + +- ret = m ? test_bit(dev, m->d) : false; +- rcu_read_unlock(); +- +- return ret; ++ return m ? test_bit(dev, m->d) : false; + } + default: + BUG(); +@@ -377,54 +333,76 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) + return v; + } + +-void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) ++static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g, ++ unsigned v) + { +- struct bch_disk_groups_cpu *groups; +- struct bch_disk_group_cpu *g; +- unsigned nr = 0; + u16 path[32]; +- +- out->atomic++; +- rcu_read_lock(); +- groups = rcu_dereference(c->disk_groups); +- if (!groups) +- goto invalid; ++ unsigned nr = 0; + + while (1) { + if (nr == ARRAY_SIZE(path)) + goto invalid; + +- if (v >= groups->nr) ++ if (v >= (g ? g->nr : 0)) + goto invalid; + +- g = groups->entries + v; ++ struct bch_disk_group_cpu *e = g->entries + v; + +- if (g->deleted) ++ if (e->deleted) + goto invalid; + + path[nr++] = v; + +- if (!g->parent) ++ if (!e->parent) + break; + +- v = g->parent - 1; ++ v = e->parent - 1; + } + + while (nr) { +- v = path[--nr]; +- g = groups->entries + v; ++ struct bch_disk_group_cpu *e = g->entries + path[--nr]; + +- prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); ++ prt_printf(out, "%.*s", (int) sizeof(e->label), e->label); + if (nr) + prt_printf(out, "."); + } +-out: +- rcu_read_unlock(); +- out->atomic--; + return; + invalid: + prt_printf(out, "invalid label %u", v); +- goto out; ++} ++ ++void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ bch2_printbuf_make_room(out, 4096); ++ ++ guard(printbuf_atomic)(out); ++ guard(rcu)(); ++ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); ++ ++ for (unsigned i = 0; i < (g ? 
g->nr : 0); i++) { ++ prt_printf(out, "%2u: ", i); ++ ++ if (g->entries[i].deleted) { ++ prt_printf(out, "[deleted]"); ++ goto next; ++ } ++ ++ __bch2_disk_path_to_text(out, g, i); ++ ++ prt_printf(out, " devs"); ++ ++ for_each_member_device_rcu(c, ca, &g->entries[i].devs) ++ prt_printf(out, " %s", ca->name); ++next: ++ prt_newline(out); ++ } ++} ++ ++void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) ++{ ++ guard(printbuf_atomic)(out); ++ guard(rcu)(); ++ __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v); + } + + void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) +@@ -490,14 +468,9 @@ int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) + + int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) + { +- int ret; +- +- mutex_lock(&c->sb_lock); +- ret = __bch2_dev_group_set(c, ca, name) ?: ++ guard(mutex)(&c->sb_lock); ++ return __bch2_dev_group_set(c, ca, name) ?: + bch2_write_super(c); +- mutex_unlock(&c->sb_lock); +- +- return ret; + } + + int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, +@@ -525,9 +498,8 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, + return 0; + } + +- mutex_lock(&c->sb_lock); +- g = bch2_disk_path_find(&c->disk_sb, val); +- mutex_unlock(&c->sb_lock); ++ scoped_guard(mutex, &c->sb_lock) ++ g = bch2_disk_path_find(&c->disk_sb, val); + + if (g >= 0) { + *res = group_to_target(g); +@@ -544,32 +516,25 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) + switch (t.type) { + case TARGET_NULL: + prt_printf(out, "none"); +- break; ++ return; + case TARGET_DEV: { +- struct bch_dev *ca; +- +- out->atomic++; +- rcu_read_lock(); +- ca = t.dev < c->sb.nr_devices ++ guard(printbuf_atomic)(out); ++ guard(rcu)(); ++ struct bch_dev *ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + +- if (ca && percpu_ref_tryget(&ca->io_ref[READ])) { ++ if (ca && ca->disk_sb.bdev) + prt_printf(out, "/dev/%s", ca->name); +- percpu_ref_put(&ca->io_ref[READ]); +- } else if (ca) { ++ else if (ca) + prt_printf(out, "offline device %u", t.dev); +- } else { ++ else + prt_printf(out, "invalid device %u", t.dev); +- } +- +- rcu_read_unlock(); +- out->atomic--; +- break; ++ return; + } + case TARGET_GROUP: + bch2_disk_path_to_text(out, c, t.group); +- break; ++ return; + default: + BUG(); + } +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index c6cb26981923..c2840cb674b2 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -16,6 +16,7 @@ + #include "disk_accounting.h" + #include "disk_groups.h" + #include "ec.h" ++#include "enumerated_ref.h" + #include "error.h" + #include "io_read.h" + #include "io_write.h" +@@ -196,8 +197,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, + bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; + s64 sectors = parity ? 
le16_to_cpu(s.v->sectors) : 0; +- struct printbuf buf = PRINTBUF; +- int ret = 0; ++ CLASS(printbuf, buf)(); + + struct bch_fs *c = trans->c; + if (deleting) +@@ -211,10 +211,8 @@ static int __mark_stripe_bucket(struct btree_trans *trans, + bch2_data_type_str(a->data_type), + a->dirty_sectors, + a->stripe, s.k->p.offset, +- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { +- ret = -BCH_ERR_mark_stripe; +- goto err; +- } ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) ++ return bch_err_throw(c, mark_stripe); + + if (bch2_trans_inconsistent_on(parity && bch2_bucket_sectors_total(*a), trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u cached_sectors %u: data already in parity bucket\n%s", +@@ -222,30 +220,24 @@ static int __mark_stripe_bucket(struct btree_trans *trans, + bch2_data_type_str(a->data_type), + a->dirty_sectors, + a->cached_sectors, +- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { +- ret = -BCH_ERR_mark_stripe; +- goto err; +- } ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) ++ return bch_err_throw(c, mark_stripe); + } else { + if (bch2_trans_inconsistent_on(a->stripe != s.k->p.offset || + a->stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe (got %u)\n%s", + bucket.inode, bucket.offset, a->gen, + a->stripe, +- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { +- ret = -BCH_ERR_mark_stripe; +- goto err; +- } ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) ++ return bch_err_throw(c, mark_stripe); + + if (bch2_trans_inconsistent_on(a->data_type != data_type, trans, + "bucket %llu:%llu gen %u data type %s: wrong data type when stripe, should be %s\n%s", + bucket.inode, bucket.offset, a->gen, + bch2_data_type_str(a->data_type), + bch2_data_type_str(data_type), +- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { +- ret = -BCH_ERR_mark_stripe; +- goto err; +- } ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) ++ return bch_err_throw(c, mark_stripe); + + if (bch2_trans_inconsistent_on(parity && + (a->dirty_sectors != -sectors || +@@ -254,17 +246,15 @@ static int __mark_stripe_bucket(struct btree_trans *trans, + bucket.inode, bucket.offset, a->gen, + a->dirty_sectors, + a->cached_sectors, +- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { +- ret = -BCH_ERR_mark_stripe; +- goto err; +- } ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) ++ return bch_err_throw(c, mark_stripe); + } + + if (sectors) { +- ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, +- a->gen, a->data_type, &a->dirty_sectors); ++ int ret = bch2_bucket_ref_update(trans, ca, s.s_c, ptr, sectors, data_type, ++ a->gen, a->data_type, &a->dirty_sectors); + if (ret) +- goto err; ++ return ret; + } + + if (!deleting) { +@@ -276,9 +266,8 @@ static int __mark_stripe_bucket(struct btree_trans *trans, + a->stripe_redundancy = 0; + alloc_data_type_set(a, BCH_DATA_user); + } +-err: +- printbuf_exit(&buf); +- return ret; ++ ++ return 0; + } + + static int mark_stripe_bucket(struct btree_trans *trans, +@@ -288,14 +277,13 @@ static int mark_stripe_bucket(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = s.v->ptrs + ptr_idx; +- struct printbuf buf = PRINTBUF; +- int ret = 0; ++ CLASS(printbuf, buf)(); + +- struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); ++ CLASS(bch2_dev_tryget, ca)(c, ptr->dev); + if (unlikely(!ca)) { + if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) +- ret = -BCH_ERR_mark_stripe; +- goto 
err; ++ return bch_err_throw(c, mark_stripe); ++ return 0; + } + + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); +@@ -311,36 +299,32 @@ static int mark_stripe_bucket(struct btree_trans *trans, + + struct bkey_i_alloc_v4 *a = + bch2_trans_start_alloc_update(trans, bucket, 0); +- ret = PTR_ERR_OR_ZERO(a) ?: ++ int ret = PTR_ERR_OR_ZERO(a) ?: + __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?: + bch2_bucket_backpointer_mod(trans, s.s_c, &bp, + !(flags & BTREE_TRIGGER_overwrite)); + if (ret) +- goto err; ++ return ret; + } + + if (flags & BTREE_TRIGGER_gc) { + struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s", + ptr->dev, +- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { +- ret = -BCH_ERR_mark_stripe; +- goto err; +- } ++ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) ++ return bch_err_throw(c, mark_stripe); + + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; +- ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); ++ int ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); + alloc_to_bucket(g, new); + bucket_unlock(g); + + if (!ret) + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); + } +-err: +- bch2_dev_put(ca); +- printbuf_exit(&buf); +- return ret; ++ ++ return 0; + } + + static int mark_stripe_buckets(struct btree_trans *trans, +@@ -427,7 +411,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, + gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + if (!gc) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); +- return -BCH_ERR_ENOMEM_mark_stripe; ++ return bch_err_throw(c, ENOMEM_mark_stripe); + } + + /* +@@ -535,7 +519,8 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) + } + + /* XXX: this is a non-mempoolified memory allocation: */ +-static int ec_stripe_buf_init(struct ec_stripe_buf *buf, ++static int ec_stripe_buf_init(struct bch_fs *c, ++ struct ec_stripe_buf *buf, + unsigned offset, unsigned size) + { + struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; +@@ -563,7 +548,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, + return 0; + err: + ec_stripe_buf_exit(buf); +- return -BCH_ERR_ENOMEM_stripe_buf; ++ return bch_err_throw(c, ENOMEM_stripe_buf); + } + + /* Checksumming: */ +@@ -628,16 +613,15 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) + struct bch_csum got = ec_block_checksum(buf, i, offset); + + if (bch2_crc_cmp(want, got)) { +- struct bch_dev *ca = bch2_dev_tryget(c, v->ptrs[i].dev); ++ CLASS(bch2_dev_tryget, ca)(c, v->ptrs[i].dev); + if (ca) { +- struct printbuf err = PRINTBUF; ++ CLASS(printbuf, err)(); + + prt_str(&err, "stripe "); + bch2_csum_err_msg(&err, v->csum_type, want, got); + prt_printf(&err, " for %ps at %u of\n ", (void *) _RET_IP_, i); + bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key)); + bch_err_ratelimited(ca, "%s", err.buf); +- printbuf_exit(&err); + + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + } +@@ -700,6 +684,9 @@ static void ec_block_endio(struct bio *bio) + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; + int rw = ec_bio->rw; ++ unsigned ref = rw == READ ++ ? 
(unsigned) BCH_DEV_READ_REF_ec_block ++ : (unsigned) BCH_DEV_WRITE_REF_ec_block; + + bch2_account_io_completion(ca, bio_data_dir(bio), + ec_bio->submit_time, !bio->bi_status); +@@ -721,7 +708,7 @@ static void ec_block_endio(struct bio *bio) + } + + bio_put(&ec_bio->bio); +- percpu_ref_put(&ca->io_ref[rw]); ++ enumerated_ref_put(&ca->io_ref[rw], ref); + closure_put(cl); + } + +@@ -735,8 +722,11 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + ? BCH_DATA_user + : BCH_DATA_parity; + int rw = op_is_write(opf); ++ unsigned ref = rw == READ ++ ? (unsigned) BCH_DEV_READ_REF_ec_block ++ : (unsigned) BCH_DEV_WRITE_REF_ec_block; + +- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw); ++ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref); + if (!ca) { + clear_bit(idx, buf->valid); + return; +@@ -782,36 +772,28 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); + + closure_get(cl); +- percpu_ref_get(&ca->io_ref[rw]); ++ enumerated_ref_get(&ca->io_ref[rw], ref); + + submit_bio(&ec_bio->bio); + + offset += b; + } + +- percpu_ref_put(&ca->io_ref[rw]); ++ enumerated_ref_put(&ca->io_ref[rw], ref); + } + + static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, + struct ec_stripe_buf *stripe) + { +- struct btree_iter iter; +- struct bkey_s_c k; +- int ret; +- +- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, +- POS(0, idx), BTREE_ITER_slots); +- ret = bkey_err(k); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_slots); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); ++ int ret = bkey_err(k); + if (ret) +- goto err; +- if (k.k->type != KEY_TYPE_stripe) { +- ret = -ENOENT; +- goto err; +- } ++ return ret; ++ if (k.k->type != KEY_TYPE_stripe) ++ return -ENOENT; + bkey_reassemble(&stripe->key, k); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return 0; + } + + /* recovery read path: */ +@@ -824,7 +806,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + struct bch_stripe *v; + unsigned i, offset; + const char *msg = NULL; +- struct printbuf msgbuf = PRINTBUF; ++ CLASS(printbuf, msgbuf)(); + int ret = 0; + + closure_init_stack(&cl); +@@ -833,7 +815,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + + buf = kzalloc(sizeof(*buf), GFP_NOFS); + if (!buf) +- return -BCH_ERR_ENOMEM_ec_read_extent; ++ return bch_err_throw(c, ENOMEM_ec_read_extent); + + ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); + if (ret) { +@@ -854,7 +836,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + goto err; + } + +- ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); ++ ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio)); + if (ret) { + msg = "-ENOMEM"; + goto err; +@@ -886,8 +868,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + bch2_bkey_val_to_text(&msgbuf, c, orig_k); + bch_err_ratelimited(c, + "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); +- printbuf_exit(&msgbuf); +- ret = -BCH_ERR_stripe_reconstruct; ++ ret = bch_err_throw(c, stripe_reconstruct); + goto out; + } + +@@ -897,7 +878,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) + { + if (c->gc_pos.phase != GC_PHASE_not_running && + !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) +- return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; ++ return bch_err_throw(c, 
ENOMEM_ec_stripe_mem_alloc); + + return 0; + } +@@ -928,31 +909,22 @@ static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) + + static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) + { +- bool ret = false; +- +- spin_lock(&c->ec_stripes_new_lock); +- ret = __bch2_stripe_is_open(c, idx); +- spin_unlock(&c->ec_stripes_new_lock); +- +- return ret; ++ guard(spinlock)(&c->ec_stripes_new_lock); ++ return __bch2_stripe_is_open(c, idx); + } + + static bool bch2_try_open_stripe(struct bch_fs *c, + struct ec_stripe_new *s, + u64 idx) + { +- bool ret; +- +- spin_lock(&c->ec_stripes_new_lock); +- ret = !__bch2_stripe_is_open(c, idx); ++ guard(spinlock)(&c->ec_stripes_new_lock); ++ bool ret = !__bch2_stripe_is_open(c, idx); + if (ret) { + unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); + + s->idx = idx; + hlist_add_head(&s->hash, &c->ec_stripes_new[hash]); + } +- spin_unlock(&c->ec_stripes_new_lock); +- + return ret; + } + +@@ -960,9 +932,8 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) + { + BUG_ON(!s->idx); + +- spin_lock(&c->ec_stripes_new_lock); ++ guard(spinlock)(&c->ec_stripes_new_lock); + hlist_del_init(&s->hash); +- spin_unlock(&c->ec_stripes_new_lock); + + s->idx = 0; + } +@@ -971,13 +942,11 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) + + static int ec_stripe_delete(struct btree_trans *trans, u64 idx) + { +- struct btree_iter iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, +- BTREE_ID_stripes, POS(0, idx), +- BTREE_ITER_intent); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + /* + * We expect write buffer races here +@@ -986,10 +955,9 @@ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) + if (k.k->type == KEY_TYPE_stripe && + !bch2_stripe_is_open(trans->c, idx) && + stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1) +- ret = bch2_btree_delete_at(trans, &iter, 0); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return bch2_btree_delete_at(trans, &iter, 0); ++ ++ return 0; + } + + /* +@@ -1011,14 +979,14 @@ static void ec_stripe_delete_work(struct work_struct *work) + BCH_TRANS_COMMIT_no_enospc, ({ + ec_stripe_delete(trans, lru_k.k->p.offset); + }))); +- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); + } + + void bch2_do_stripe_deletes(struct bch_fs *c) + { +- if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && ++ if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) && + !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) +- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); + } + + /* stripe creation: */ +@@ -1030,20 +998,17 @@ static int ec_stripe_key_update(struct btree_trans *trans, + struct bch_fs *c = trans->c; + bool create = !old; + +- struct btree_iter iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, +- new->k.p, BTREE_ITER_intent); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_stripes, new->k.p, BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe), + c, "error %s stripe: got existing key type %s", + create ? 
"creating" : "updating", +- bch2_bkey_types[k.k->type])) { +- ret = -EINVAL; +- goto err; +- } ++ bch2_bkey_types[k.k->type])) ++ return -EINVAL; + + if (k.k->type == KEY_TYPE_stripe) { + const struct bch_stripe *v = bkey_s_c_to_stripe(k).v; +@@ -1055,7 +1020,7 @@ static int ec_stripe_key_update(struct btree_trans *trans, + unsigned sectors = stripe_blockcount_get(v, i); + + if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + prt_printf(&buf, "stripe changed nonempty block %u", i); + prt_str(&buf, "\nold: "); +@@ -1063,9 +1028,7 @@ static int ec_stripe_key_update(struct btree_trans *trans, + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i)); + bch2_fs_inconsistent(c, "%s", buf.buf); +- printbuf_exit(&buf); +- ret = -EINVAL; +- goto err; ++ return -EINVAL; + } + + /* +@@ -1083,10 +1046,7 @@ static int ec_stripe_key_update(struct btree_trans *trans, + } + } + +- ret = bch2_trans_update(trans, &iter, &new->k_i, 0); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return bch2_trans_update(trans, &iter, &new->k_i, 0); + } + + static int ec_stripe_update_extent(struct btree_trans *trans, +@@ -1107,22 +1067,19 @@ static int ec_stripe_update_extent(struct btree_trans *trans, + int ret, dev, block; + + if (bp.v->level) { +- struct printbuf buf = PRINTBUF; + struct btree_iter node_iter; +- struct btree *b; +- +- b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); +- bch2_trans_iter_exit(trans, &node_iter); ++ struct btree *b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); ++ bch2_trans_iter_exit(&node_iter); + + if (!b) + return 0; + ++ CLASS(printbuf, buf)(); + prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); + bch2_bkey_val_to_text(&buf, c, bp.s_c); + + bch2_fs_inconsistent(c, "%s", buf.buf); +- printbuf_exit(&buf); +- return -BCH_ERR_erasure_coding_found_btree_node; ++ return bch_err_throw(c, erasure_coding_found_btree_node); + } + + k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); +@@ -1174,7 +1131,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, + + ret = bch2_trans_update(trans, &iter, n, 0); + out: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -1186,9 +1143,9 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b + struct bch_extent_ptr ptr = v->ptrs[block]; + int ret = 0; + +- struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); ++ CLASS(bch2_dev_tryget, ca)(c, ptr.dev); + if (!ca) +- return -BCH_ERR_ENOENT_dev_not_found; ++ return bch_err_throw(c, ENOENT_dev_not_found); + + struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); + +@@ -1217,28 +1174,26 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b + })); + + bch2_bkey_buf_exit(&last_flushed, c); +- bch2_dev_put(ca); + return ret; + } + + static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) + { +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + + int ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) +- goto err; ++ return ret; + + for (unsigned i = 0; i < nr_data; i++) { + ret = ec_stripe_update_bucket(trans, s, i); + if (ret) +- break; ++ return ret; + } +-err: +- bch2_trans_put(trans); +- return ret; ++ ++ return 0; + } 
+ + static void zero_out_rest_of_ec_bucket(struct bch_fs *c, +@@ -1246,9 +1201,10 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, + unsigned block, + struct open_bucket *ob) + { +- struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE); ++ struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE, ++ BCH_DEV_WRITE_REF_ec_bucket_zero); + if (!ca) { +- s->err = -BCH_ERR_erofs_no_writes; ++ s->err = bch_err_throw(c, erofs_no_writes); + return; + } + +@@ -1262,7 +1218,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, + ob->sectors_free, + GFP_KERNEL, 0); + +- percpu_ref_put(&ca->io_ref[WRITE]); ++ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero); + + if (ret) + s->err = ret; +@@ -1312,7 +1268,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) + + if (ec_do_recov(c, &s->existing_stripe)) { + bch_err(c, "error creating stripe: error reading existing stripe"); +- ret = -BCH_ERR_ec_block_read; ++ ret = bch_err_throw(c, ec_block_read); + goto err; + } + +@@ -1338,7 +1294,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) + + if (ec_nr_failed(&s->new_stripe)) { + bch_err(c, "error creating stripe: error writing redundancy buckets"); +- ret = -BCH_ERR_ec_block_write; ++ ret = bch_err_throw(c, ec_block_write); + goto err; + } + +@@ -1376,9 +1332,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) + } + } + +- mutex_lock(&c->ec_stripe_new_lock); +- list_del(&s->list); +- mutex_unlock(&c->ec_stripe_new_lock); ++ scoped_guard(mutex, &c->ec_stripe_new_lock) ++ list_del(&s->list); + wake_up(&c->ec_stripe_new_wait); + + ec_stripe_buf_exit(&s->existing_stripe); +@@ -1392,15 +1347,11 @@ static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) + { + struct ec_stripe_new *s; + +- mutex_lock(&c->ec_stripe_new_lock); ++ guard(mutex)(&c->ec_stripe_new_lock); + list_for_each_entry(s, &c->ec_stripe_new_list, list) + if (!atomic_read(&s->ref[STRIPE_REF_io])) +- goto out; +- s = NULL; +-out: +- mutex_unlock(&c->ec_stripe_new_lock); +- +- return s; ++ return s; ++ return NULL; + } + + static void ec_stripe_create_work(struct work_struct *work) +@@ -1412,15 +1363,15 @@ static void ec_stripe_create_work(struct work_struct *work) + while ((s = get_pending_stripe(c))) + ec_stripe_create(s); + +- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); + } + + void bch2_ec_do_stripe_creates(struct bch_fs *c) + { +- bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); ++ enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create); + + if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) +- bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); + } + + static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) +@@ -1434,9 +1385,8 @@ static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h + h->s = NULL; + s->pending = true; + +- mutex_lock(&c->ec_stripe_new_lock); +- list_add(&s->list, &c->ec_stripe_new_list); +- mutex_unlock(&c->ec_stripe_new_lock); ++ scoped_guard(mutex, &c->ec_stripe_new_lock) ++ list_add(&s->list, &c->ec_stripe_new_list); + + ec_stripe_new_put(c, s, STRIPE_REF_io); + } +@@ -1570,26 +1520,26 @@ static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_str + static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) + { + struct bch_devs_mask devs = h->devs; ++ unsigned nr_devs, nr_devs_with_durability; + +- 
rcu_read_lock(); +- h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label +- ? group_to_target(h->disk_label - 1) +- : 0); +- unsigned nr_devs = dev_mask_nr(&h->devs); +- +- for_each_member_device_rcu(c, ca, &h->devs) +- if (!ca->mi.durability) +- __clear_bit(ca->dev_idx, h->devs.d); +- unsigned nr_devs_with_durability = dev_mask_nr(&h->devs); ++ scoped_guard(rcu) { ++ h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label ++ ? group_to_target(h->disk_label - 1) ++ : 0); ++ nr_devs = dev_mask_nr(&h->devs); + +- h->blocksize = pick_blocksize(c, &h->devs); ++ for_each_member_device_rcu(c, ca, &h->devs) ++ if (!ca->mi.durability) ++ __clear_bit(ca->dev_idx, h->devs.d); ++ nr_devs_with_durability = dev_mask_nr(&h->devs); + +- h->nr_active_devs = 0; +- for_each_member_device_rcu(c, ca, &h->devs) +- if (ca->mi.bucket_size == h->blocksize) +- h->nr_active_devs++; ++ h->blocksize = pick_blocksize(c, &h->devs); + +- rcu_read_unlock(); ++ h->nr_active_devs = 0; ++ for_each_member_device_rcu(c, ca, &h->devs) ++ if (ca->mi.bucket_size == h->blocksize) ++ h->nr_active_devs++; ++ } + + /* + * If we only have redundancy + 1 devices, we're better off with just +@@ -1674,7 +1624,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, + return ERR_PTR(ret); + + if (test_bit(BCH_FS_going_ro, &c->flags)) { +- h = ERR_PTR(-BCH_ERR_erofs_no_writes); ++ h = ERR_PTR(bch_err_throw(c, erofs_no_writes)); + goto err; + } + +@@ -1693,7 +1643,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, + + h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark); + if (!h) { +- h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc); ++ h = ERR_PTR(bch_err_throw(c, ENOMEM_stripe_head_alloc)); + goto err; + } + found: +@@ -1710,23 +1660,32 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, + } + + static int new_stripe_alloc_buckets(struct btree_trans *trans, ++ struct alloc_request *req, + struct ec_stripe_head *h, struct ec_stripe_new *s, +- enum bch_watermark watermark, struct closure *cl) ++ struct closure *cl) + { + struct bch_fs *c = trans->c; +- struct bch_devs_mask devs = h->devs; + struct open_bucket *ob; +- struct open_buckets buckets; + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; +- bool have_cache = true; + int ret = 0; + ++ req->scratch_data_type = req->data_type; ++ req->scratch_ptrs = req->ptrs; ++ req->scratch_nr_replicas = req->nr_replicas; ++ req->scratch_nr_effective = req->nr_effective; ++ req->scratch_have_cache = req->have_cache; ++ req->scratch_devs_may_alloc = req->devs_may_alloc; ++ ++ req->devs_may_alloc = h->devs; ++ req->have_cache = true; ++ + BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); + BUG_ON(v->nr_redundant != s->nr_parity); + + /* * We bypass the sector allocator which normally does this: */ +- bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); ++ bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d, ++ c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + + for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { + /* +@@ -1736,7 +1695,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, + * block when updating the stripe + */ + if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) +- __clear_bit(v->ptrs[i].dev, devs.d); ++ __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d); + + if (i < s->nr_data) + nr_have_data++; +@@ -1747,60 +1706,58 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, + BUG_ON(nr_have_data > s->nr_data); + 
BUG_ON(nr_have_parity > s->nr_parity); + +- buckets.nr = 0; ++ req->ptrs.nr = 0; + if (nr_have_parity < s->nr_parity) { +- ret = bch2_bucket_alloc_set_trans(trans, &buckets, +- &h->parity_stripe, +- &devs, +- s->nr_parity, +- &nr_have_parity, +- &have_cache, 0, +- BCH_DATA_parity, +- watermark, +- cl); +- +- open_bucket_for_each(c, &buckets, ob, i) { ++ req->nr_replicas = s->nr_parity; ++ req->nr_effective = nr_have_parity; ++ req->data_type = BCH_DATA_parity; ++ ++ ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl); ++ ++ open_bucket_for_each(c, &req->ptrs, ob, i) { + j = find_next_zero_bit(s->blocks_gotten, + s->nr_data + s->nr_parity, + s->nr_data); + BUG_ON(j >= s->nr_data + s->nr_parity); + +- s->blocks[j] = buckets.v[i]; ++ s->blocks[j] = req->ptrs.v[i]; + v->ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, s->blocks_gotten); + } + + if (ret) +- return ret; ++ goto err; + } + +- buckets.nr = 0; ++ req->ptrs.nr = 0; + if (nr_have_data < s->nr_data) { +- ret = bch2_bucket_alloc_set_trans(trans, &buckets, +- &h->block_stripe, +- &devs, +- s->nr_data, +- &nr_have_data, +- &have_cache, 0, +- BCH_DATA_user, +- watermark, +- cl); +- +- open_bucket_for_each(c, &buckets, ob, i) { ++ req->nr_replicas = s->nr_data; ++ req->nr_effective = nr_have_data; ++ req->data_type = BCH_DATA_user; ++ ++ ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl); ++ ++ open_bucket_for_each(c, &req->ptrs, ob, i) { + j = find_next_zero_bit(s->blocks_gotten, + s->nr_data, 0); + BUG_ON(j >= s->nr_data); + +- s->blocks[j] = buckets.v[i]; ++ s->blocks[j] = req->ptrs.v[i]; + v->ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, s->blocks_gotten); + } + + if (ret) +- return ret; ++ goto err; + } +- +- return 0; ++err: ++ req->data_type = req->scratch_data_type; ++ req->ptrs = req->scratch_ptrs; ++ req->nr_replicas = req->scratch_nr_replicas; ++ req->nr_effective = req->scratch_nr_effective; ++ req->have_cache = req->scratch_have_cache; ++ req->devs_may_alloc = req->scratch_devs_may_alloc; ++ return ret; + } + + static int __get_existing_stripe(struct btree_trans *trans, +@@ -1810,20 +1767,19 @@ static int __get_existing_stripe(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + +- struct btree_iter iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, +- BTREE_ID_stripes, POS(0, idx), 0); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_stripes, POS(0, idx), BTREE_ITER_nopreserve); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + /* We expect write buffer races here */ + if (k.k->type != KEY_TYPE_stripe) +- goto out; ++ return 0; + + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + if (stripe_lru_pos(s.v) <= 1) +- goto out; ++ return 0; + + if (s.v->disk_label == head->disk_label && + s.v->algorithm == head->algo && +@@ -1831,13 +1787,10 @@ static int __get_existing_stripe(struct btree_trans *trans, + le16_to_cpu(s.v->sectors) == head->blocksize && + bch2_try_open_stripe(c, head->s, idx)) { + bkey_reassemble(&stripe->key, k); +- ret = 1; ++ return 1; + } +-out: +- bch2_set_btree_iter_dontneed(trans, &iter); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ ++ return 0; + } + + static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s) +@@ -1850,7 +1803,7 @@ static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new + s->nr_data = existing_v->nr_blocks - + existing_v->nr_redundant; + +- int ret = ec_stripe_buf_init(&s->existing_stripe, 0, 
le16_to_cpu(existing_v->sectors)); ++ int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); + if (ret) { + bch2_stripe_close(c, s); + return ret; +@@ -1896,7 +1849,6 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri + if (may_create_new_stripe(c)) + return -1; + +- struct btree_iter lru_iter; + struct bkey_s_c lru_k; + int ret = 0; + +@@ -1908,9 +1860,8 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri + if (ret) + break; + } +- bch2_trans_iter_exit(trans, &lru_iter); + if (!ret) +- ret = -BCH_ERR_stripe_alloc_blocked; ++ ret = bch_err_throw(c, stripe_alloc_blocked); + if (ret == 1) + ret = 0; + if (ret) +@@ -1923,7 +1874,6 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st + struct ec_stripe_new *s) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; + struct bkey_s_c k; + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); +@@ -1944,54 +1894,44 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st + */ + for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { ++ c->ec_stripe_hint = iter.pos.offset; ++ + if (bkey_gt(k.k->p, POS(0, U32_MAX))) { + if (start_pos.offset) { + start_pos = min_pos; +- bch2_btree_iter_set_pos(trans, &iter, start_pos); ++ bch2_btree_iter_set_pos(&iter, start_pos); + continue; + } + +- ret = -BCH_ERR_ENOSPC_stripe_create; ++ ret = bch_err_throw(c, ENOSPC_stripe_create); + break; + } + + if (bkey_deleted(k.k) && +- bch2_try_open_stripe(c, s, k.k->p.offset)) ++ bch2_try_open_stripe(c, s, k.k->p.offset)) { ++ ret = ec_stripe_mem_alloc(trans, &iter); ++ if (ret) ++ bch2_stripe_close(c, s); ++ s->new_stripe.key.k.p = iter.pos; + break; ++ } + } + +- c->ec_stripe_hint = iter.pos.offset; +- + if (ret) +- goto err; +- +- ret = ec_stripe_mem_alloc(trans, &iter); +- if (ret) { +- bch2_stripe_close(c, s); +- goto err; +- } +- +- s->new_stripe.key.k.p = iter.pos; +-out: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_disk_reservation_put(c, &s->res); + return ret; +-err: +- bch2_disk_reservation_put(c, &s->res); +- goto out; + } + + struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, +- unsigned target, ++ struct alloc_request *req, + unsigned algo, +- unsigned redundancy, +- enum bch_watermark watermark, + struct closure *cl) + { + struct bch_fs *c = trans->c; +- struct ec_stripe_head *h; +- bool waiting = false; ++ unsigned redundancy = req->nr_replicas - 1; + unsigned disk_label = 0; +- struct target t = target_decode(target); ++ struct target t = target_decode(req->target); ++ bool waiting = false; + int ret; + + if (t.type == TARGET_GROUP) { +@@ -2002,14 +1942,16 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, + disk_label = t.group + 1; /* 0 == no label */ + } + +- h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark); ++ struct ec_stripe_head *h = ++ __bch2_ec_stripe_head_get(trans, disk_label, algo, ++ redundancy, req->watermark); + if (IS_ERR_OR_NULL(h)) + return h; + + if (!h->s) { + h->s = ec_new_stripe_alloc(c, h); + if (!h->s) { +- ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc; ++ ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc); + bch_err(c, "failed to allocate new stripe"); + goto err; + } +@@ -2026,8 +1968,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, + goto 
alloc_existing; + + /* First, try to allocate a full stripe: */ +- ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: ++ enum bch_watermark saved_watermark = BCH_WATERMARK_stripe; ++ swap(req->watermark, saved_watermark); ++ ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h, s); ++ swap(req->watermark, saved_watermark); ++ + if (!ret) + goto allocate_buf; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || +@@ -2045,8 +1991,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, + if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) + goto err; + +- if (watermark == BCH_WATERMARK_copygc) { +- ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: ++ if (req->watermark == BCH_WATERMARK_copygc) { ++ ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h, s); + if (ret) + goto err; +@@ -2065,12 +2011,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, + * Retry allocating buckets, with the watermark for this + * particular write: + */ +- ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); ++ ret = new_stripe_alloc_buckets(trans, req, h, s, cl); + if (ret) + goto err; + + allocate_buf: +- ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize); ++ ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize); + if (ret) + goto err; + +@@ -2081,29 +2027,27 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, + BUG_ON(trans->restarted); + return h; + err: ++ if (waiting && ++ !bch2_err_matches(ret, BCH_ERR_operation_blocked)) ++ closure_wake_up(&c->freelist_wait); + bch2_ec_stripe_head_put(c, h); + return ERR_PTR(ret); + } + + /* device removal */ + +-static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a) ++int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ unsigned dev_idx, ++ unsigned flags) + { +- struct bch_alloc_v4 a_convert; +- const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); +- +- if (!a->stripe) ++ if (k.k->type != KEY_TYPE_stripe) + return 0; + +- if (a->stripe_sectors) { +- bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); +- return -BCH_ERR_invalidate_stripe_to_dev; +- } +- +- struct btree_iter iter; ++ struct bch_fs *c = trans->c; + struct bkey_i_stripe *s = +- bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), +- BTREE_ITER_slots, stripe); ++ bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe); + int ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; +@@ -2120,12 +2064,30 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ + acc.replicas.data_type = BCH_DATA_user; + ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); + if (ret) +- goto err; ++ return ret; + + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); +- bkey_for_each_ptr(ptrs, ptr) +- if (ptr->dev == k_a.k->p.inode) +- ptr->dev = BCH_SB_MEMBER_INVALID; ++ ++ /* XXX: how much redundancy do we still have? 
check degraded flags */ ++ ++ unsigned nr_good = 0; ++ ++ scoped_guard(rcu) ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (ptr->dev == dev_idx) ++ ptr->dev = BCH_SB_MEMBER_INVALID; ++ ++ struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); ++ nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; ++ } ++ ++ if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) ++ return bch_err_throw(c, remove_would_lose_data); ++ ++ unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; ++ ++ if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) ++ return bch_err_throw(c, remove_would_lose_data); + + sectors = -sectors; + +@@ -2133,23 +2095,44 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ + acc.type = BCH_DISK_ACCOUNTING_replicas; + bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); + acc.replicas.data_type = BCH_DATA_user; +- ret = bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); ++ return bch2_disk_accounting_mod(trans, &acc, §ors, 1, false); ++} ++ ++static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a, ++ unsigned flags) ++{ ++ struct bch_alloc_v4 a_convert; ++ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); ++ ++ if (!a->stripe) ++ return 0; ++ ++ if (a->stripe_sectors) { ++ struct bch_fs *c = trans->c; ++ bch_err(c, "trying to invalidate device in stripe when bucket has stripe data"); ++ return bch_err_throw(c, invalidate_stripe_to_dev); ++ } ++ ++ CLASS(btree_iter, iter)(trans, BTREE_ID_stripes, POS(0, a->stripe), 0); ++ struct bkey_s_c_stripe s = bch2_bkey_get_typed(&iter, stripe); ++ int ret = bkey_err(s); + if (ret) +- goto err; +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return ret; ++ ++ return bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags); + } + +-int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) ++int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) + { +- return bch2_trans_run(c, +- for_each_btree_key_max_commit(trans, iter, ++ CLASS(btree_trans, trans)(c); ++ int ret = for_each_btree_key_max_commit(trans, iter, + BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), + BTREE_ITER_intent, k, + NULL, NULL, 0, ({ +- bch2_invalidate_stripe_to_dev(trans, k); +- }))); ++ bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); ++ })); ++ bch_err_fn(c, ret); ++ return ret; + } + + /* startup/shutdown */ +@@ -2157,33 +2140,28 @@ int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) + static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) + { + struct ec_stripe_head *h; +- struct open_bucket *ob; +- unsigned i; + +- mutex_lock(&c->ec_stripe_head_lock); ++ guard(mutex)(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) { +- mutex_lock(&h->lock); ++ guard(mutex)(&h->lock); + if (!h->s) +- goto unlock; ++ continue; + + if (!ca) + goto found; + +- for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { ++ for (unsigned i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { + if (!h->s->blocks[i]) + continue; + +- ob = c->open_buckets + h->s->blocks[i]; ++ struct open_bucket *ob = c->open_buckets + h->s->blocks[i]; + if (ob->dev == ca->dev_idx) + goto found; + } +- goto unlock; ++ continue; + found: + ec_stripe_new_cancel(c, h, -BCH_ERR_erofs_no_writes); +-unlock: +- mutex_unlock(&h->lock); + } +- mutex_unlock(&c->ec_stripe_head_lock); + } + + void bch2_ec_stop_dev(struct bch_fs *c, struct 
bch_dev *ca) +@@ -2200,11 +2178,8 @@ static bool bch2_fs_ec_flush_done(struct bch_fs *c) + { + sched_annotate_sleep(); + +- mutex_lock(&c->ec_stripe_new_lock); +- bool ret = list_empty(&c->ec_stripe_new_list); +- mutex_unlock(&c->ec_stripe_new_lock); +- +- return ret; ++ guard(mutex)(&c->ec_stripe_new_lock); ++ return list_empty(&c->ec_stripe_new_list); + } + + void bch2_fs_ec_flush(struct bch_fs *c) +@@ -2241,41 +2216,40 @@ void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) + struct ec_stripe_head *h; + struct ec_stripe_new *s; + +- mutex_lock(&c->ec_stripe_head_lock); +- list_for_each_entry(h, &c->ec_stripe_head_list, list) { +- prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n", +- h->disk_label, h->algo, h->redundancy, +- bch2_watermarks[h->watermark], +- h->nr_created); ++ scoped_guard(mutex, &c->ec_stripe_head_lock) ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) { ++ prt_printf(out, "disk label %u algo %u redundancy %u %s nr created %llu:\n", ++ h->disk_label, h->algo, h->redundancy, ++ bch2_watermarks[h->watermark], ++ h->nr_created); + +- if (h->s) +- bch2_new_stripe_to_text(out, c, h->s); +- } +- mutex_unlock(&c->ec_stripe_head_lock); ++ if (h->s) ++ bch2_new_stripe_to_text(out, c, h->s); ++ } + + prt_printf(out, "in flight:\n"); + +- mutex_lock(&c->ec_stripe_new_lock); +- list_for_each_entry(s, &c->ec_stripe_new_list, list) +- bch2_new_stripe_to_text(out, c, s); +- mutex_unlock(&c->ec_stripe_new_lock); ++ scoped_guard(mutex, &c->ec_stripe_new_lock) ++ list_for_each_entry(s, &c->ec_stripe_new_list, list) ++ bch2_new_stripe_to_text(out, c, s); + } + + void bch2_fs_ec_exit(struct bch_fs *c) + { +- struct ec_stripe_head *h; +- unsigned i; + + while (1) { +- mutex_lock(&c->ec_stripe_head_lock); +- h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); +- mutex_unlock(&c->ec_stripe_head_lock); ++ struct ec_stripe_head *h; ++ ++ scoped_guard(mutex, &c->ec_stripe_head_lock) ++ h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); + + if (!h) + break; + + if (h->s) { +- for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) ++ for (unsigned i = 0; ++ i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; ++ i++) + BUG_ON(h->s->blocks[i]); + + kfree(h->s); +@@ -2328,20 +2302,18 @@ static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans, + return 0; + } + +-int bch2_check_stripe_to_lru_refs(struct bch_fs *c) ++int bch2_check_stripe_to_lru_refs(struct btree_trans *trans) + { + struct bkey_buf last_flushed; +- + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, ++ int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, + POS_MIN, BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- bch2_check_stripe_to_lru_ref(trans, k, &last_flushed))); ++ bch2_check_stripe_to_lru_ref(trans, k, &last_flushed)); + +- bch2_bkey_buf_exit(&last_flushed, c); +- bch_err_fn(c, ret); ++ bch2_bkey_buf_exit(&last_flushed, trans->c); ++ bch_err_fn(trans->c, ret); + return ret; + } +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 51893e1ee874..e807e7027d7a 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -255,9 +255,10 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int); + int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); + + void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); ++ 
++struct alloc_request;
+ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *,
+- unsigned, unsigned, unsigned,
+- enum bch_watermark, struct closure *);
++ struct alloc_request *, unsigned, struct closure *);
+
+ void bch2_do_stripe_deletes(struct bch_fs *);
+ void bch2_ec_do_stripe_creates(struct bch_fs *);
+@@ -287,7 +288,9 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s,
+ }
+ }
+
+-int bch2_dev_remove_stripes(struct bch_fs *, unsigned);
++int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *,
++ struct bkey_s_c, unsigned, unsigned);
++int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned);
+
+ void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
+ void bch2_fs_ec_stop(struct bch_fs *);
+@@ -301,6 +304,6 @@ void bch2_fs_ec_exit(struct bch_fs *);
+ void bch2_fs_ec_init_early(struct bch_fs *);
+ int bch2_fs_ec_init(struct bch_fs *);
+
+-int bch2_check_stripe_to_lru_refs(struct bch_fs *);
++int bch2_check_stripe_to_lru_refs(struct btree_trans *);
+
+ #endif /* _BCACHEFS_EC_H */
+diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
+index 06144bfd9c19..809446c78951 100644
+--- a/fs/bcachefs/ec_types.h
++++ b/fs/bcachefs/ec_types.h
+@@ -4,9 +4,10 @@
+
+ #include "bcachefs_format.h"
+
+-struct bch_replicas_padded {
++union bch_replicas_padded {
++ u8 bytes[struct_size_t(struct bch_replicas_entry_v1,
++ devs, BCH_BKEY_PTRS_MAX)];
+ struct bch_replicas_entry_v1 e;
+- u8 pad[BCH_BKEY_PTRS_MAX];
+ };
+
+ struct stripe {
+@@ -28,7 +29,7 @@ struct gc_stripe {
+ u16 block_sectors[BCH_BKEY_PTRS_MAX];
+ struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX];
+
+- struct bch_replicas_padded r;
++ union bch_replicas_padded r;
+ };
+
+ #endif /* _BCACHEFS_EC_TYPES_H */
+diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c
+new file mode 100644
+index 000000000000..2ded74135977
+--- /dev/null
++++ b/fs/bcachefs/enumerated_ref.c
+@@ -0,0 +1,142 @@
++// SPDX-License-Identifier: GPL-2.0
++
++#include "bcachefs.h"
++#include "enumerated_ref.h"
++#include "util.h"
++
++#include <linux/completion.h>
++
++#ifdef ENUMERATED_REF_DEBUG
++void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
++{
++ BUG_ON(idx >= ref->nr);
++ atomic_long_inc(&ref->refs[idx]);
++}
++
++bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
++{
++ BUG_ON(idx >= ref->nr);
++ return atomic_long_inc_not_zero(&ref->refs[idx]);
++}
++
++bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
++{
++ BUG_ON(idx >= ref->nr);
++ return !ref->dying &&
++ atomic_long_inc_not_zero(&ref->refs[idx]);
++}
++
++void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
++{
++ BUG_ON(idx >= ref->nr);
++ long v = atomic_long_dec_return(&ref->refs[idx]);
++
++ BUG_ON(v < 0);
++ if (v)
++ return;
++
++ for (unsigned i = 0; i < ref->nr; i++)
++ if (atomic_long_read(&ref->refs[i]))
++ return;
++
++ if (ref->stop_fn)
++ ref->stop_fn(ref);
++ complete(&ref->stop_complete);
++}
++#endif
++
++#ifndef ENUMERATED_REF_DEBUG
++static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref)
++{
++ struct enumerated_ref *ref =
++ container_of(percpu_ref, struct enumerated_ref, ref);
++
++ if (ref->stop_fn)
++ ref->stop_fn(ref);
++ complete(&ref->stop_complete);
++}
++#endif
++
++void enumerated_ref_stop_async(struct enumerated_ref *ref)
++{
++ reinit_completion(&ref->stop_complete);
++
++#ifndef ENUMERATED_REF_DEBUG
++ percpu_ref_kill(&ref->ref);
++#else
++ ref->dying = true;
++ for (unsigned i = 0; i < ref->nr; i++)
++ enumerated_ref_put(ref, i);
++#endif
++}
++
++void enumerated_ref_stop(struct enumerated_ref *ref,
++ const char * const names[])
++{
++ enumerated_ref_stop_async(ref);
++ while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) {
++ CLASS(printbuf, buf)();
++ prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n");
++ prt_str(&buf, "Outstanding refs:\n");
++ enumerated_ref_to_text(&buf, ref, names);
++ printk(KERN_ERR "%s", buf.buf);
++ }
++}
++
++void enumerated_ref_start(struct enumerated_ref *ref)
++{
++#ifndef ENUMERATED_REF_DEBUG
++ percpu_ref_reinit(&ref->ref);
++#else
++ ref->dying = false;
++ for (unsigned i = 0; i < ref->nr; i++) {
++ BUG_ON(atomic_long_read(&ref->refs[i]));
++ atomic_long_inc(&ref->refs[i]);
++ }
++#endif
++}
++
++void enumerated_ref_exit(struct enumerated_ref *ref)
++{
++#ifndef ENUMERATED_REF_DEBUG
++ percpu_ref_exit(&ref->ref);
++#else
++ kfree(ref->refs);
++ ref->refs = NULL;
++ ref->nr = 0;
++#endif
++}
++
++int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr,
++ void (*stop_fn)(struct enumerated_ref *))
++{
++ init_completion(&ref->stop_complete);
++ ref->stop_fn = stop_fn;
++
++#ifndef ENUMERATED_REF_DEBUG
++ return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb,
++ PERCPU_REF_INIT_DEAD, GFP_KERNEL);
++#else
++ ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL);
++ if (!ref->refs)
++ return -ENOMEM;
++
++ ref->nr = nr;
++ return 0;
++#endif
++}
++
++void enumerated_ref_to_text(struct printbuf *out,
++ struct enumerated_ref *ref,
++ const char * const names[])
++{
++#ifdef ENUMERATED_REF_DEBUG
++ bch2_printbuf_tabstop_push(out, 32);
++
++ for (unsigned i = 0; i < ref->nr; i++)
++ prt_printf(out, "%s\t%li\n", names[i],
++ atomic_long_read(&ref->refs[i]));
++#else
++ prt_str(out, "(not in debug mode)\n");
++#endif
++}
+diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h
+new file mode 100644
+index 000000000000..ec01cf59ef80
+--- /dev/null
++++ b/fs/bcachefs/enumerated_ref.h
+@@ -0,0 +1,66 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++#ifndef _BCACHEFS_ENUMERATED_REF_H
++#define _BCACHEFS_ENUMERATED_REF_H
++
++#include "enumerated_ref_types.h"
++
++/*
++ * A refcount where the users are enumerated: in debug mode, we create separate
++ * refcounts for each user, to make leaks and refcount errors easy to track
++ * down:
++ */
++
++#ifdef ENUMERATED_REF_DEBUG
++void enumerated_ref_get(struct enumerated_ref *, unsigned);
++bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned);
++bool enumerated_ref_tryget(struct enumerated_ref *, unsigned);
++void enumerated_ref_put(struct enumerated_ref *, unsigned);
++#else
++
++static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx)
++{
++ percpu_ref_get(&ref->ref);
++}
++
++static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
++{
++ return percpu_ref_tryget(&ref->ref);
++}
++
++static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx)
++{
++ return percpu_ref_tryget_live(&ref->ref);
++}
++
++static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx)
++{
++ percpu_ref_put(&ref->ref);
++}
++#endif
++
++static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref)
++{
++#ifndef ENUMERATED_REF_DEBUG
++ return percpu_ref_is_zero(&ref->ref);
++#else
++ for (unsigned i = 0; i < ref->nr; i++)
++ if (atomic_long_read(&ref->refs[i]))
++ return false;
++ return true;
++#endif
++}
++
++void enumerated_ref_stop_async(struct enumerated_ref *);
++void enumerated_ref_stop(struct enumerated_ref *, const char * const[]);
++void enumerated_ref_start(struct enumerated_ref *);
++
++void enumerated_ref_exit(struct enumerated_ref *);
++int enumerated_ref_init(struct enumerated_ref *, unsigned,
++ void (*stop_fn)(struct enumerated_ref *));
++
++struct printbuf;
++void enumerated_ref_to_text(struct printbuf *,
++ struct enumerated_ref *,
++ const char * const[]);
++
++#endif /* _BCACHEFS_ENUMERATED_REF_H */
+diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h
+new file mode 100644
+index 000000000000..0e6076f466d3
+--- /dev/null
++++ b/fs/bcachefs/enumerated_ref_types.h
+@@ -0,0 +1,19 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H
++#define _BCACHEFS_ENUMERATED_REF_TYPES_H
++
++#include <linux/percpu-refcount.h>
++
++struct enumerated_ref {
++#ifdef ENUMERATED_REF_DEBUG
++ unsigned nr;
++ bool dying;
++ atomic_long_t *refs;
++#else
++ struct percpu_ref ref;
++#endif
++ void (*stop_fn)(struct enumerated_ref *);
++ struct completion stop_complete;
++};
++
++#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */
+diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
+index 43557bebd0f8..86264b8c343c 100644
+--- a/fs/bcachefs/errcode.c
++++ b/fs/bcachefs/errcode.c
+@@ -13,19 +13,21 @@ static const char * const bch2_errcode_strs[] = {
+ NULL
+ };
+
+-static unsigned bch2_errcode_parents[] = {
++static const unsigned bch2_errcode_parents[] = {
+ #define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class,
+ BCH_ERRCODES()
+ #undef x
+ };
+
++__attribute__((const))
+ const char *bch2_err_str(int err)
+ {
+ const char *errstr;
+
+ err = abs(err);
+
+- BUG_ON(err >= BCH_ERR_MAX);
++ if (err >= BCH_ERR_MAX)
++ return "(Invalid error)";
+
+ if (err >= BCH_ERR_START)
+ errstr = bch2_errcode_strs[err - BCH_ERR_START];
+@@ -36,6 +38,7 @@ const char *bch2_err_str(int err)
+ return errstr ?: "(Invalid error)";
+ }
+
++__attribute__((const))
+ bool __bch2_err_matches(int err, int class)
+ {
+ err = abs(err);
+diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
+index d9ebffa5b3a2..adc1f9315eab 100644
+--- a/fs/bcachefs/errcode.h
++++ b/fs/bcachefs/errcode.h
+@@ -5,6 +5,7 @@
+ #define BCH_ERRCODES() \
+ x(ERANGE, ERANGE_option_too_small) \
+ x(ERANGE, ERANGE_option_too_big) \
++ x(ERANGE, projid_too_big) \
+ x(EINVAL, injected) \
+ x(BCH_ERR_injected, injected_fs_start) \
+ x(EINVAL, mount_option) \
+@@ -53,6 +54,7 @@
+ x(ENOMEM, ENOMEM_dio_write_bioset_init) \
+ x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \
+ x(ENOMEM, ENOMEM_promote_table_init) \
++ x(ENOMEM, ENOMEM_async_obj_init) \
+ x(ENOMEM, ENOMEM_compression_bounce_read_init) \
+ x(ENOMEM, ENOMEM_compression_bounce_write_init) \
+ x(ENOMEM, ENOMEM_compression_workspace_init) \
+@@ -88,6 +90,8 @@
+ x(ENOMEM, ENOMEM_disk_accounting) \
+ x(ENOMEM, ENOMEM_stripe_head_alloc) \
+ x(ENOMEM, ENOMEM_journal_read_bucket) \
++ x(ENOMEM, ENOMEM_acl) \
++ x(ENOMEM, ENOMEM_move_extent) \
+ x(ENOSPC, ENOSPC_disk_reservation) \
+ x(ENOSPC, ENOSPC_bucket_alloc) \
+ x(ENOSPC, ENOSPC_disk_label_add) \
+@@ -115,6 +119,7 @@
+ x(ENOENT, ENOENT_not_directory) \
+ x(ENOENT, ENOENT_directory_dead) \
+ x(ENOENT, ENOENT_subvolume) \
++ x(ENOENT, ENOENT_snapshot) \
+ x(ENOENT, ENOENT_snapshot_tree) \
+ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \
+ x(ENOENT, ENOENT_dev_not_found) \
+@@ -136,7 +141,6 @@
+ x(BCH_ERR_transaction_restart, transaction_restart_relock) \
+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \
+ x(BCH_ERR_transaction_restart, 
transaction_restart_relock_path_intent) \ +- x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \ + x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ + x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ + x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ +@@ -147,11 +151,8 @@ + x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ + x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ + x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ +- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ +- x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ +- x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ + x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ + x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ + x(BCH_ERR_transaction_restart, transaction_restart_nested) \ +@@ -174,16 +175,19 @@ + x(0, backpointer_to_overwritten_btree_node) \ + x(0, journal_reclaim_would_deadlock) \ + x(EINVAL, fsck) \ ++ x(BCH_ERR_fsck, fsck_ask) \ + x(BCH_ERR_fsck, fsck_fix) \ + x(BCH_ERR_fsck, fsck_delete_bkey) \ + x(BCH_ERR_fsck, fsck_ignore) \ + x(BCH_ERR_fsck, fsck_errors_not_fixed) \ + x(BCH_ERR_fsck, fsck_repair_unimplemented) \ + x(BCH_ERR_fsck, fsck_repair_impossible) \ +- x(EINVAL, restart_recovery) \ +- x(EINVAL, not_in_recovery) \ +- x(EINVAL, cannot_rewind_recovery) \ ++ x(EINVAL, recovery_will_run) \ ++ x(BCH_ERR_recovery_will_run, restart_recovery) \ ++ x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \ ++ x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \ + x(0, data_update_done) \ ++ x(0, bkey_was_deleted) \ + x(BCH_ERR_data_update_done, data_update_done_would_block) \ + x(BCH_ERR_data_update_done, data_update_done_unwritten) \ + x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ +@@ -201,6 +205,7 @@ + x(EINVAL, device_has_been_removed) \ + x(EINVAL, device_splitbrain) \ + x(EINVAL, device_already_online) \ ++ x(EINVAL, filesystem_uuid_already_open) \ + x(EINVAL, insufficient_devices_to_start) \ + x(EINVAL, invalid) \ + x(EINVAL, internal_fsck_err) \ +@@ -209,9 +214,22 @@ + x(EINVAL, remove_would_lose_data) \ + x(EINVAL, no_resize_with_buckets_nouse) \ + x(EINVAL, inode_unpack_error) \ ++ x(EINVAL, inode_not_unlinked) \ ++ x(EINVAL, inode_has_child_snapshot) \ + x(EINVAL, varint_decode_error) \ + x(EINVAL, erasure_coding_found_btree_node) \ ++ x(EINVAL, option_negative) \ ++ x(EINVAL, topology_repair) \ ++ x(BCH_ERR_topology_repair, topology_repair_drop_this_node) \ ++ x(BCH_ERR_topology_repair, topology_repair_drop_prev_node) \ ++ x(BCH_ERR_topology_repair, topology_repair_did_fill_from_scan) \ + x(EOPNOTSUPP, may_not_use_incompat_feature) \ ++ x(EOPNOTSUPP, no_casefolding_without_utf8) \ ++ x(EOPNOTSUPP, casefolding_disabled) \ ++ x(EOPNOTSUPP, casefold_opt_is_dir_only) \ ++ x(EOPNOTSUPP, unsupported_fsx_flag) \ ++ x(EOPNOTSUPP, unsupported_fa_flag) \ ++ x(EOPNOTSUPP, unsupported_fallocate_mode) \ + x(EROFS, erofs_trans_commit) \ + x(EROFS, erofs_no_writes) \ + x(EROFS, erofs_journal_err) \ +@@ -219,6 +237,8 @@ + x(EROFS, erofs_unfixed_errors) \ + x(EROFS, erofs_norecovery) \ + x(EROFS, erofs_nochanges) \ ++ x(EROFS, erofs_no_alloc_info) \ ++ x(EROFS, erofs_filesystem_full) \ + x(EROFS, insufficient_devices) \ 
+ x(0, operation_blocked) \ + x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ +@@ -231,7 +251,6 @@ + x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ + x(BCH_ERR_journal_res_blocked, journal_stuck) \ + x(BCH_ERR_journal_res_blocked, journal_retry_open) \ +- x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \ + x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ + x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ + x(BCH_ERR_invalid, invalid_sb) \ +@@ -277,7 +296,6 @@ + x(EIO, sb_not_downgraded) \ + x(EIO, btree_node_write_all_failed) \ + x(EIO, btree_node_read_error) \ +- x(EIO, btree_node_read_validate_error) \ + x(EIO, btree_need_topology_repair) \ + x(EIO, bucket_ref_update) \ + x(EIO, trigger_alloc) \ +@@ -352,9 +370,11 @@ enum bch_errcode { + BCH_ERR_MAX + }; + +-const char *bch2_err_str(int); +-bool __bch2_err_matches(int, int); ++__attribute__((const)) const char *bch2_err_str(int); + ++__attribute__((const)) bool __bch2_err_matches(int, int); ++ ++__attribute__((const)) + static inline bool _bch2_err_matches(int err, int class) + { + return err < 0 && __bch2_err_matches(err, class); +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +index 6b8695b1349c..32a286b3a74e 100644 +--- a/fs/bcachefs/error.c ++++ b/fs/bcachefs/error.c +@@ -11,12 +11,12 @@ + + #define FSCK_ERR_RATELIMIT_NR 10 + +-void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) ++void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out) + { + printbuf_indent_add_nextline(out, 2); + + #ifdef BCACHEFS_LOG_PREFIX +- prt_printf(out, bch2_log_msg(c, "")); ++ prt_printf(out, "bcachefs (%s): ", fs_or_dev_name); + #endif + } + +@@ -29,12 +29,10 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) + return false; + case BCH_ON_ERROR_fix_safe: + case BCH_ON_ERROR_ro: +- if (bch2_fs_emergency_read_only(c)) +- prt_printf(out, "inconsistency detected - emergency read only at journal seq %llu\n", +- journal_cur_seq(&c->journal)); ++ bch2_fs_emergency_read_only2(c, out); + return true; + case BCH_ON_ERROR_panic: +- bch2_print_string_as_lines_nonblocking(KERN_ERR, out->buf); ++ bch2_print_str(c, KERN_ERR, out->buf); + panic(bch2_fmt(c, "panic after error")); + return true; + default: +@@ -44,15 +42,14 @@ bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) + + bool bch2_inconsistent_error(struct bch_fs *c) + { +- struct printbuf buf = PRINTBUF; +- buf.atomic++; ++ CLASS(printbuf, buf)(); ++ guard(printbuf_atomic)(&buf); + + printbuf_indent_add_nextline(&buf, 2); + + bool ret = __bch2_inconsistent_error(c, &buf); + if (ret) + bch_err(c, "%s", buf.buf); +- printbuf_exit(&buf); + return ret; + } + +@@ -60,8 +57,8 @@ __printf(3, 0) + static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans, + const char *fmt, va_list args) + { +- struct printbuf buf = PRINTBUF; +- buf.atomic++; ++ CLASS(printbuf, buf)(); ++ guard(printbuf_atomic)(&buf); + + bch2_log_msg_start(c, &buf); + +@@ -71,9 +68,7 @@ static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *tra + if (trans) + bch2_trans_updates_to_text(&buf, trans); + bool ret = __bch2_inconsistent_error(c, &buf); +- bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); +- +- printbuf_exit(&buf); ++ bch2_print_str(c, KERN_ERR, buf.buf); + return ret; + } + +@@ -100,19 +95,18 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) + prt_printf(out, "btree topology error: "); + + set_bit(BCH_FS_topology_error, &c->flags); +- if 
(!test_bit(BCH_FS_recovery_running, &c->flags)) { ++ if (!test_bit(BCH_FS_in_recovery, &c->flags)) { + __bch2_inconsistent_error(c, out); +- return -BCH_ERR_btree_need_topology_repair; ++ return bch_err_throw(c, btree_need_topology_repair); + } else { +- return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: +- -BCH_ERR_btree_node_read_validate_error; ++ return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?: ++ bch_err_throw(c, btree_need_topology_repair); + } + } + + int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...) + { +- struct printbuf buf = PRINTBUF; +- ++ CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + + va_list args; +@@ -121,9 +115,7 @@ int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...) + va_end(args); + + int ret = __bch2_topology_error(c, &buf); +- bch2_print_string_as_lines(KERN_ERR, buf.buf); +- +- printbuf_exit(&buf); ++ bch2_print_str(c, KERN_ERR, buf.buf); + return ret; + } + +@@ -140,28 +132,28 @@ void bch2_io_error_work(struct work_struct *work) + + /* XXX: if it's reads or checksums that are failing, set it to failed */ + +- down_write(&c->state_lock); ++ guard(rwsem_write)(&c->state_lock); + unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); + + if (write_errors_start && + time_after(jiffies, + write_errors_start + c->opts.write_error_timeout * HZ)) { + if (ca->mi.state >= BCH_MEMBER_STATE_ro) +- goto out; ++ return; + + bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED); ++ CLASS(printbuf, buf)(); ++ __bch2_log_msg_start(ca->name, &buf); + +- bch_err(ca, +- "writes erroring for %u seconds, setting %s ro", ++ prt_printf(&buf, "writes erroring for %u seconds, setting %s ro", + c->opts.write_error_timeout, + dev ? 
"device" : "filesystem"); + if (!dev) +- bch2_fs_emergency_read_only(c); ++ bch2_fs_emergency_read_only2(c, &buf); + ++ bch2_print_str(c, KERN_ERR, buf.buf); + } +-out: +- up_write(&c->state_lock); + } + + void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) +@@ -328,7 +320,7 @@ static int do_fsck_ask_yn(struct bch_fs *c, + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s", question->buf); + else +- bch2_print_string_as_lines(KERN_ERR, question->buf); ++ bch2_print_str(c, KERN_ERR, question->buf); + + int ask = bch2_fsck_ask_yn(c, trans); + +@@ -376,15 +368,63 @@ static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c, + return s; + } + +-void __bch2_count_fsck_err(struct bch_fs *c, +- enum bch_sb_error_id id, const char *msg, +- bool *repeat, bool *print, bool *suppress) ++bool __bch2_count_fsck_err(struct bch_fs *c, ++ enum bch_sb_error_id id, struct printbuf *msg) + { + bch2_sb_error_count(c, id); + +- mutex_lock(&c->fsck_error_msgs_lock); +- count_fsck_err_locked(c, id, msg, repeat, print, suppress); +- mutex_unlock(&c->fsck_error_msgs_lock); ++ bool print = true, repeat = false, suppress = false; ++ ++ scoped_guard(mutex, &c->fsck_error_msgs_lock) ++ count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress); ++ ++ if (suppress) ++ prt_printf(msg, "Ratelimiting new instances of previous error\n"); ++ ++ return print && !repeat; ++} ++ ++int bch2_fsck_err_opt(struct bch_fs *c, ++ enum bch_fsck_flags flags, ++ enum bch_sb_error_id err) ++{ ++ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) ++ flags |= fsck_flags_extra[err]; ++ ++ if (test_bit(BCH_FS_in_fsck, &c->flags) || ++ test_bit(BCH_FS_in_recovery, &c->flags)) { ++ if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) ++ return bch_err_throw(c, fsck_repair_unimplemented); ++ ++ switch (c->opts.fix_errors) { ++ case FSCK_FIX_exit: ++ return bch_err_throw(c, fsck_errors_not_fixed); ++ case FSCK_FIX_yes: ++ if (flags & FSCK_CAN_FIX) ++ return bch_err_throw(c, fsck_fix); ++ fallthrough; ++ case FSCK_FIX_no: ++ if (flags & FSCK_CAN_IGNORE) ++ return bch_err_throw(c, fsck_ignore); ++ return bch_err_throw(c, fsck_errors_not_fixed); ++ case FSCK_FIX_ask: ++ if (flags & FSCK_AUTOFIX) ++ return bch_err_throw(c, fsck_fix); ++ return bch_err_throw(c, fsck_ask); ++ default: ++ BUG(); ++ } ++ } else { ++ if ((flags & FSCK_AUTOFIX) && ++ (c->opts.errors == BCH_ON_ERROR_continue || ++ c->opts.errors == BCH_ON_ERROR_fix_safe)) ++ return bch_err_throw(c, fsck_fix); ++ ++ if (c->opts.errors == BCH_ON_ERROR_continue && ++ (flags & FSCK_CAN_IGNORE)) ++ return bch_err_throw(c, fsck_ignore); ++ return bch_err_throw(c, fsck_errors_not_fixed); ++ } + } + + int __bch2_fsck_err(struct bch_fs *c, +@@ -394,8 +434,9 @@ int __bch2_fsck_err(struct bch_fs *c, + const char *fmt, ...) + { + va_list args; +- struct printbuf buf = PRINTBUF, *out = &buf; +- int ret = -BCH_ERR_fsck_ignore; ++ CLASS(printbuf, buf)(); ++ struct printbuf *out = &buf; ++ int ret = 0; + const char *action_orig = "fix?", *action = action_orig; + + might_sleep(); +@@ -423,10 +464,13 @@ int __bch2_fsck_err(struct bch_fs *c, + !trans && + bch2_current_has_btree_trans(c)); + +- if (test_bit(err, c->sb.errors_silent)) +- return flags & FSCK_CAN_FIX +- ? -BCH_ERR_fsck_fix +- : -BCH_ERR_fsck_ignore; ++ if ((flags & FSCK_ERR_SILENT) || ++ test_bit(err, c->sb.errors_silent)) { ++ ret = flags & FSCK_CAN_FIX ++ ? 
bch_err_throw(c, fsck_fix) ++ : bch_err_throw(c, fsck_ignore); ++ goto err; ++ } + + printbuf_indent_add_nextline(out, 2); + +@@ -468,14 +512,14 @@ int __bch2_fsck_err(struct bch_fs *c, + prt_str(out, ", "); + if (flags & FSCK_CAN_FIX) { + prt_actioning(out, action); +- ret = -BCH_ERR_fsck_fix; ++ ret = bch_err_throw(c, fsck_fix); + } else { + prt_str(out, ", continuing"); +- ret = -BCH_ERR_fsck_ignore; ++ ret = bch_err_throw(c, fsck_ignore); + } + + goto print; +- } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { ++ } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) { + if (c->opts.errors != BCH_ON_ERROR_continue || + !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { + prt_str_indented(out, ", shutting down\n" +@@ -483,18 +527,18 @@ int __bch2_fsck_err(struct bch_fs *c, + "run fsck, and forward to devs so error can be marked for self-healing"); + inconsistent = true; + print = true; +- ret = -BCH_ERR_fsck_errors_not_fixed; ++ ret = bch_err_throw(c, fsck_errors_not_fixed); + } else if (flags & FSCK_CAN_FIX) { + prt_str(out, ", "); + prt_actioning(out, action); +- ret = -BCH_ERR_fsck_fix; ++ ret = bch_err_throw(c, fsck_fix); + } else { + prt_str(out, ", continuing"); +- ret = -BCH_ERR_fsck_ignore; ++ ret = bch_err_throw(c, fsck_ignore); + } + } else if (c->opts.fix_errors == FSCK_FIX_exit) { + prt_str(out, ", exiting"); +- ret = -BCH_ERR_fsck_errors_not_fixed; ++ ret = bch_err_throw(c, fsck_errors_not_fixed); + } else if (flags & FSCK_CAN_FIX) { + int fix = s && s->fix + ? s->fix +@@ -513,30 +557,37 @@ int __bch2_fsck_err(struct bch_fs *c, + : FSCK_FIX_yes; + + ret = ret & 1 +- ? -BCH_ERR_fsck_fix +- : -BCH_ERR_fsck_ignore; ++ ? bch_err_throw(c, fsck_fix) ++ : bch_err_throw(c, fsck_ignore); + } else if (fix == FSCK_FIX_yes || + (c->opts.nochanges && + !(flags & FSCK_CAN_IGNORE))) { + prt_str(out, ", "); + prt_actioning(out, action); +- ret = -BCH_ERR_fsck_fix; ++ ret = bch_err_throw(c, fsck_fix); + } else { + prt_str(out, ", not "); + prt_actioning(out, action); ++ ret = bch_err_throw(c, fsck_ignore); ++ } ++ } else { ++ if (flags & FSCK_CAN_IGNORE) { ++ prt_str(out, ", continuing"); ++ ret = bch_err_throw(c, fsck_ignore); ++ } else { ++ prt_str(out, " (repair unimplemented)"); ++ ret = bch_err_throw(c, fsck_repair_unimplemented); + } +- } else if (!(flags & FSCK_CAN_IGNORE)) { +- prt_str(out, " (repair unimplemented)"); + } + +- if (ret == -BCH_ERR_fsck_ignore && ++ if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) && + (c->opts.fix_errors == FSCK_FIX_exit || + !(flags & FSCK_CAN_IGNORE))) +- ret = -BCH_ERR_fsck_errors_not_fixed; ++ ret = bch_err_throw(c, fsck_errors_not_fixed); + +- if (test_bit(BCH_FS_fsck_running, &c->flags) && +- (ret != -BCH_ERR_fsck_fix && +- ret != -BCH_ERR_fsck_ignore)) { ++ if (test_bit(BCH_FS_in_fsck, &c->flags) && ++ (!bch2_err_matches(ret, BCH_ERR_fsck_fix) && ++ !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) { + exiting = true; + print = true; + } +@@ -559,31 +610,37 @@ int __bch2_fsck_err(struct bch_fs *c, + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s", out->buf); + else +- bch2_print_string_as_lines(KERN_ERR, out->buf); ++ bch2_print_str(c, KERN_ERR, out->buf); + } + + if (s) + s->ret = ret; ++err_unlock: ++ mutex_unlock(&c->fsck_error_msgs_lock); ++err: ++ if (trans && ++ !(flags & FSCK_ERR_NO_LOG) && ++ ret == -BCH_ERR_fsck_fix) ++ ret = bch2_trans_log_str(trans, bch2_sb_error_strs[err]) ?: ret; + + /* + * We don't yet track whether the filesystem currently has errors, for + * log_fsck_err()s: that would require us to track for every error type + * which 
recovery pass corrects it, to get the fsck exit status correct: + */ +- if (flags & FSCK_CAN_FIX) { +- if (ret == -BCH_ERR_fsck_fix) { +- set_bit(BCH_FS_errors_fixed, &c->flags); +- } else { +- set_bit(BCH_FS_errors_not_fixed, &c->flags); +- set_bit(BCH_FS_error, &c->flags); +- } ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ /* nothing */ ++ } else if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) { ++ set_bit(BCH_FS_errors_fixed, &c->flags); ++ } else { ++ set_bit(BCH_FS_errors_not_fixed, &c->flags); ++ set_bit(BCH_FS_error, &c->flags); + } +-err_unlock: +- mutex_unlock(&c->fsck_error_msgs_lock); +-err: ++ + if (action != action_orig) + kfree(action); +- printbuf_exit(&buf); ++ ++ BUG_ON(!ret); + return ret; + } + +@@ -601,19 +658,19 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, + const char *fmt, ...) + { + if (from.flags & BCH_VALIDATE_silent) +- return -BCH_ERR_fsck_delete_bkey; ++ return bch_err_throw(c, fsck_delete_bkey); + + unsigned fsck_flags = 0; + if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { + if (test_bit(err, c->sb.errors_silent)) +- return -BCH_ERR_fsck_delete_bkey; ++ return bch_err_throw(c, fsck_delete_bkey); + + fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; + } + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + fsck_flags |= fsck_flags_extra[err]; + +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + prt_printf(&buf, "invalid bkey in %s", + bch2_bkey_validate_contexts[from.from]); + +@@ -634,7 +691,6 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, + va_end(args); + + int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf); +- printbuf_exit(&buf); + return ret; + } + +@@ -642,7 +698,7 @@ static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print) + { + struct fsck_err_state *s, *n; + +- mutex_lock(&c->fsck_error_msgs_lock); ++ guard(mutex)(&c->fsck_error_msgs_lock); + + list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { + if (print && s->ratelimited && s->last_msg) +@@ -652,8 +708,6 @@ static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print) + kfree(s->last_msg); + kfree(s); + } +- +- mutex_unlock(&c->fsck_error_msgs_lock); + } + + void bch2_flush_fsck_errs(struct bch_fs *c) +@@ -687,31 +741,16 @@ int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *o + void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, + subvol_inum inum, u64 offset) + { +- bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); ++ CLASS(btree_trans, trans)(c); ++ lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); + } + + int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bpos pos) + { +- struct bch_fs *c = trans->c; +- int ret = 0; +- +- if (!bch2_snapshot_is_leaf(c, pos.snapshot)) +- prt_str(out, "(multiple snapshots) "); +- +- subvol_inum inum = { +- .subvol = bch2_snapshot_tree_oldest_subvol(c, pos.snapshot), +- .inum = pos.inode, +- }; +- +- if (inum.subvol) { +- ret = bch2_inum_to_path(trans, inum, out); +- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +- return ret; +- } +- +- if (!inum.subvol || ret) +- prt_printf(out, "inum %llu:%u", pos.inode, pos.snapshot); ++ int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out); ++ if (ret) ++ return ret; + + prt_printf(out, " offset %llu: ", pos.offset << 8); + return 0; +@@ -720,5 +759,6 @@ int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printb + void 
bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out, + struct bpos pos) + { +- bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); ++ CLASS(btree_trans, trans)(c); ++ lockrestart_do(trans, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); + } +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +index 4a364fd44abe..0c3c3a24fc6f 100644 +--- a/fs/bcachefs/error.h ++++ b/fs/bcachefs/error.h +@@ -18,7 +18,12 @@ struct work_struct; + + /* Error messages: */ + +-void bch2_log_msg_start(struct bch_fs *, struct printbuf *); ++void __bch2_log_msg_start(const char *, struct printbuf *); ++ ++static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) ++{ ++ __bch2_log_msg_start(c->name, out); ++} + + /* + * Inconsistency errors: The on disk data is inconsistent. If these occur during +@@ -76,12 +81,14 @@ struct fsck_err_state { + + #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) + +-void __bch2_count_fsck_err(struct bch_fs *, +- enum bch_sb_error_id, const char *, +- bool *, bool *, bool *); ++bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *); + #define bch2_count_fsck_err(_c, _err, ...) \ + __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__) + ++int bch2_fsck_err_opt(struct bch_fs *, ++ enum bch_fsck_flags, ++ enum bch_sb_error_id); ++ + __printf(5, 6) __cold + int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, + enum bch_fsck_flags, +@@ -98,13 +105,13 @@ void bch2_free_fsck_errs(struct bch_fs *); + #define fsck_err_wrap(_do) \ + ({ \ + int _ret = _do; \ +- if (_ret != -BCH_ERR_fsck_fix && \ +- _ret != -BCH_ERR_fsck_ignore) { \ ++ if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ ++ !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \ + ret = _ret; \ + goto fsck_err; \ + } \ + \ +- _ret == -BCH_ERR_fsck_fix; \ ++ bch2_err_matches(_ret, BCH_ERR_fsck_fix); \ + }) + + #define __fsck_err(...) 
fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) +@@ -163,10 +170,10 @@ do { \ + int _ret = __bch2_bkey_fsck_err(c, k, from, \ + BCH_FSCK_ERR_##_err_type, \ + _err_msg, ##__VA_ARGS__); \ +- if (_ret != -BCH_ERR_fsck_fix && \ +- _ret != -BCH_ERR_fsck_ignore) \ ++ if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ ++ !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \ + ret = _ret; \ +- ret = -BCH_ERR_fsck_delete_bkey; \ ++ ret = bch_err_throw(c, fsck_delete_bkey); \ + goto fsck_err; \ + } while (0) + +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 6bb42985306e..c4b0ea1adaa8 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -37,16 +37,17 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) + return lru + ret * 2; + } + ++#define EXTENT_ITERS_MAX 64 ++ + static int count_iters_for_insert(struct btree_trans *trans, + struct bkey_s_c k, + unsigned offset, + struct bpos *end, +- unsigned *nr_iters, +- unsigned max_iters) ++ unsigned *nr_iters) + { + int ret = 0, ret2 = 0; + +- if (*nr_iters >= max_iters) { ++ if (*nr_iters >= EXTENT_ITERS_MAX) { + *end = bpos_min(*end, k.k->p); + ret = 1; + } +@@ -56,7 +57,7 @@ static int count_iters_for_insert(struct btree_trans *trans, + case KEY_TYPE_reflink_v: + *nr_iters += bch2_bkey_nr_alloc_ptrs(k); + +- if (*nr_iters >= max_iters) { ++ if (*nr_iters >= EXTENT_ITERS_MAX) { + *end = bpos_min(*end, k.k->p); + ret = 1; + } +@@ -67,7 +68,6 @@ static int count_iters_for_insert(struct btree_trans *trans, + u64 idx = REFLINK_P_IDX(p.v); + unsigned sectors = bpos_min(*end, p.k->p).offset - + bkey_start_offset(p.k); +- struct btree_iter iter; + struct bkey_s_c r_k; + + for_each_btree_key_norestart(trans, iter, +@@ -81,17 +81,15 @@ static int count_iters_for_insert(struct btree_trans *trans, + + *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); + +- if (*nr_iters >= max_iters) { ++ if (*nr_iters >= EXTENT_ITERS_MAX) { + struct bpos pos = bkey_start_pos(k.k); + pos.offset += min_t(u64, k.k->size, + r_k.k->p.offset - idx); + + *end = bpos_min(*end, pos); +- ret = 1; +- break; ++ return 1; + } + } +- bch2_trans_iter_exit(trans, &iter); + + break; + } +@@ -100,60 +98,32 @@ static int count_iters_for_insert(struct btree_trans *trans, + return ret2 ?: ret; + } + +-#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3) +- + int bch2_extent_atomic_end(struct btree_trans *trans, + struct btree_iter *iter, +- struct bkey_i *insert, + struct bpos *end) + { +- struct btree_iter copy; +- struct bkey_s_c k; + unsigned nr_iters = 0; +- int ret; +- +- ret = bch2_btree_iter_traverse(trans, iter); +- if (ret) +- return ret; +- +- *end = insert->k.p; + +- /* extent_update_to_keys(): */ +- nr_iters += 1; +- +- ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, +- &nr_iters, EXTENT_ITERS_MAX / 2); +- if (ret < 0) +- return ret; ++ struct btree_iter copy; ++ bch2_trans_copy_iter(©, iter); + +- bch2_trans_copy_iter(trans, ©, iter); ++ int ret = bch2_btree_iter_traverse(©); ++ if (ret) ++ goto err; + +- for_each_btree_key_max_continue_norestart(trans, copy, insert->k.p, 0, k, ret) { ++ struct bkey_s_c k; ++ for_each_btree_key_max_continue_norestart(copy, *end, 0, k, ret) { + unsigned offset = 0; + +- if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) +- offset = bkey_start_offset(&insert->k) - +- bkey_start_offset(k.k); +- +- /* extent_handle_overwrites(): */ +- switch (bch2_extent_overlap(&insert->k, k.k)) { +- case BCH_EXTENT_OVERLAP_ALL: +- case BCH_EXTENT_OVERLAP_FRONT: +- nr_iters += 1; +- break; +- case 
BCH_EXTENT_OVERLAP_BACK: +- case BCH_EXTENT_OVERLAP_MIDDLE: +- nr_iters += 2; +- break; +- } ++ if (bkey_gt(iter->pos, bkey_start_pos(k.k))) ++ offset = iter->pos.offset - bkey_start_offset(k.k); + +- ret = count_iters_for_insert(trans, k, offset, end, +- &nr_iters, EXTENT_ITERS_MAX); ++ ret = count_iters_for_insert(trans, k, offset, end, &nr_iters); + if (ret) + break; + } +- +- bch2_trans_iter_exit(trans, ©); ++err: ++ bch2_trans_iter_exit(©); + return ret < 0 ? ret : 0; + } + +@@ -161,13 +131,22 @@ int bch2_extent_trim_atomic(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k) + { +- struct bpos end; +- int ret; +- +- ret = bch2_extent_atomic_end(trans, iter, k, &end); ++ struct bpos end = k->k.p; ++ int ret = bch2_extent_atomic_end(trans, iter, &end); + if (ret) + return ret; + +- bch2_cut_back(end, k); ++ /* tracepoint */ ++ ++ if (bpos_lt(end, k->k.p)) { ++ if (trace_extent_trim_atomic_enabled()) { ++ CLASS(printbuf, buf)(); ++ bch2_bpos_to_text(&buf, end); ++ prt_newline(&buf); ++ bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); ++ trace_extent_trim_atomic(trans->c, buf.buf); ++ } ++ bch2_cut_back(end, k); ++ } + return 0; + } +diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h +index 6f5cf449361a..34467db53f45 100644 +--- a/fs/bcachefs/extent_update.h ++++ b/fs/bcachefs/extent_update.h +@@ -5,7 +5,7 @@ + #include "bcachefs.h" + + int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, +- struct bkey_i *, struct bpos *); ++ struct bpos *); + int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, + struct bkey_i *); + +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index e597fb9c9823..b879a586b7f6 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -45,6 +45,48 @@ static void bch2_extent_crc_pack(union bch_extent_crc *, + struct bch_extent_crc_unpacked, + enum bch_extent_entry_type); + ++void bch2_io_failures_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_io_failures *failed) ++{ ++ static const char * const error_types[] = { ++ "btree validate", "io", "checksum", "ec reconstruct", NULL ++ }; ++ ++ for (struct bch_dev_io_failures *f = failed->devs; ++ f < failed->devs + failed->nr; ++ f++) { ++ unsigned errflags = ++ ((!!f->failed_btree_validate) << 0) | ++ ((!!f->failed_io) << 1) | ++ ((!!f->failed_csum_nr) << 2) | ++ ((!!f->failed_ec) << 3); ++ ++ bch2_printbuf_make_room(out, 1024); ++ scoped_guard(rcu) { ++ guard(printbuf_atomic)(out); ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); ++ if (ca) ++ prt_str(out, ca->name); ++ else ++ prt_printf(out, "(invalid device %u)", f->dev); ++ } ++ ++ prt_char(out, ' '); ++ ++ if (!errflags) { ++ prt_str(out, "no error - confused"); ++ } else if (is_power_of_2(errflags)) { ++ prt_bitflags(out, error_types, errflags); ++ prt_str(out, " error"); ++ } else { ++ prt_str(out, "errors: "); ++ prt_bitflags(out, error_types, errflags); ++ } ++ prt_newline(out); ++ } ++} ++ + struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, + unsigned dev) + { +@@ -79,6 +121,22 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, + f->failed_csum_nr++; + } + ++void bch2_mark_btree_validate_failure(struct bch_io_failures *failed, ++ unsigned dev) ++{ ++ struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev); ++ ++ if (!f) { ++ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); ++ ++ f = &failed->devs[failed->nr++]; ++ memset(f, 0, sizeof(*f)); ++ f->dev = dev; ++ } ++ ++ 
f->failed_btree_validate = true; ++} ++ + static inline u64 dev_latency(struct bch_dev *ca) + { + return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; +@@ -105,7 +163,7 @@ static inline bool ptr_better(struct bch_fs *c, + if (unlikely(failed_delta)) + return failed_delta < 0; + +- if (unlikely(bch2_force_reconstruct_read)) ++ if (static_branch_unlikely(&bch2_force_reconstruct_read)) + return p1.do_ec_reconstruct > p2.do_ec_reconstruct; + + if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) +@@ -134,14 +192,10 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + bool have_dirty_ptrs = false, have_pick = false; + + if (k.k->type == KEY_TYPE_error) +- return -BCH_ERR_key_type_error; +- +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- +- if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) +- return -BCH_ERR_extent_poisoned; ++ return bch_err_throw(c, key_type_error); + + rcu_read_lock(); ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 pick_latency; +@@ -162,7 +216,15 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + if (dev >= 0 && p.ptr.dev != dev) + continue; + +- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); ++ ++ if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) { ++ rcu_read_unlock(); ++ int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev); ++ if (ret) ++ return ret; ++ rcu_read_lock(); ++ } + + if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) + continue; +@@ -175,6 +237,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + + if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { + have_io_errors |= f->failed_io; ++ have_io_errors |= f->failed_btree_validate; + have_io_errors |= f->failed_ec; + } + have_csum_errors |= !!f->failed_csum_nr; +@@ -182,6 +245,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + if (p.has_ec && (f->failed_io || f->failed_csum_nr)) + p.do_ec_reconstruct = true; + else if (f->failed_io || ++ f->failed_btree_validate || + f->failed_csum_nr > c->opts.checksum_err_retry_nr) + continue; + } +@@ -194,7 +258,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + p.do_ec_reconstruct = true; + } + +- if (bch2_force_reconstruct_read && p.has_ec) ++ if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec) + p.do_ec_reconstruct = true; + + u64 p_latency = dev_latency(ca); +@@ -218,20 +282,20 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + + if (have_pick) + return 1; +- if (!have_dirty_ptrs) ++ if (!have_dirty_ptrs && !bkey_is_btree_ptr(k.k)) + return 0; +- if (have_missing_devs) +- return -BCH_ERR_no_device_to_read_from; ++ if (have_missing_devs || !have_dirty_ptrs) ++ return bch_err_throw(c, no_device_to_read_from); + if (have_csum_errors) +- return -BCH_ERR_data_read_csum_err; ++ return bch_err_throw(c, data_read_csum_err); + if (have_io_errors) +- return -BCH_ERR_data_read_io_err; ++ return bch_err_throw(c, data_read_io_err); + + /* + * If we get here, we have pointers (bkey_ptrs_validate() ensures that), + * but they don't point to valid devices: + */ +- return -BCH_ERR_no_devices_valid; ++ return bch_err_throw(c, no_devices_valid); + } + + /* KEY_TYPE_btree_ptr: */ +@@ -342,6 +406,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) + lp.crc = bch2_extent_crc_unpack(l.k, NULL); + rp.crc = 
bch2_extent_crc_unpack(r.k, NULL); + ++ guard(rcu)(); ++ + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && + __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { + if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != +@@ -353,10 +419,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) + return false; + + /* Extents may not straddle buckets: */ +- rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); + bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); +- rcu_read_unlock(); + + if (!same_bucket) + return false; +@@ -773,11 +837,9 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) + struct extent_ptr_decoded p; + unsigned durability = 0; + +- rcu_read_lock(); ++ guard(rcu)(); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + durability += bch2_extent_ptr_durability(c, &p); +- rcu_read_unlock(); +- + return durability; + } + +@@ -788,12 +850,10 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) + struct extent_ptr_decoded p; + unsigned durability = 0; + +- rcu_read_lock(); ++ guard(rcu)(); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) + durability += bch2_extent_ptr_durability(c, &p); +- rcu_read_unlock(); +- + return durability; + } + +@@ -946,24 +1006,46 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned + return NULL; + } + ++bool bch2_bkey_devs_rw(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ ++ guard(rcu)(); ++ bkey_for_each_ptr(ptrs, ptr) { ++ CLASS(bch2_dev_tryget, ca)(c, ptr->dev); ++ if (!ca || ca->mi.state != BCH_MEMBER_STATE_rw) ++ return false; ++ } ++ ++ return true; ++} ++ + bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_dev *ca; +- bool ret = false; + +- rcu_read_lock(); ++ guard(rcu)(); + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (ca = bch2_dev_rcu(c, ptr->dev)) && + (!ptr->cached || +- !dev_ptr_stale_rcu(ca, ptr))) { +- ret = true; +- break; +- } +- rcu_read_unlock(); ++ !dev_ptr_stale_rcu(ca, ptr))) ++ return true; + +- return ret; ++ return false; ++} ++ ++bool bch2_bkey_in_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ ++ guard(rcu)(); ++ bkey_for_each_ptr(ptrs, ptr) ++ if (!bch2_dev_in_target(c, ptr->dev, target)) ++ return false; ++ ++ return true; + } + + bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, +@@ -1071,33 +1153,48 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c, + struct bkey_s k, + struct bch_extent_ptr *ptr) + { +- struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ struct bkey_ptrs ptrs; + union bch_extent_entry *entry; + struct extent_ptr_decoded p; ++ bool have_cached_ptr; ++ unsigned drop_dev = ptr->dev; + +- rcu_read_lock(); +- if (!want_cached_ptr(c, opts, ptr)) { +- bch2_bkey_drop_ptr_noerror(k, ptr); +- goto out; +- } ++ guard(rcu)(); ++restart_drop_ptrs: ++ ptrs = bch2_bkey_ptrs(k); ++ have_cached_ptr = false; + +- /* +- * Stripes can't contain cached data, for - reasons. +- * +- * Possibly something we can fix in the future? 
+- */ +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- if (&entry->ptr == ptr) { +- if (p.has_ec) +- bch2_bkey_drop_ptr_noerror(k, ptr); +- else +- ptr->cached = true; +- goto out; ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ /* ++ * Check if it's erasure coded - stripes can't contain cached ++ * data. Possibly something we can fix in the future? ++ */ ++ if (&entry->ptr == ptr && p.has_ec) ++ goto drop; ++ ++ if (p.ptr.cached) { ++ if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) { ++ bch2_bkey_drop_ptr_noerror(k, &entry->ptr); ++ ptr = NULL; ++ goto restart_drop_ptrs; ++ } ++ ++ have_cached_ptr = true; + } ++ } + +- BUG(); +-out: +- rcu_read_unlock(); ++ if (!ptr) ++ bkey_for_each_ptr(ptrs, ptr2) ++ if (ptr2->dev == drop_dev) ++ ptr = ptr2; ++ ++ if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) ++ goto drop; ++ ++ ptr->cached = true; ++ return; ++drop: ++ bch2_bkey_drop_ptr_noerror(k, ptr); + } + + /* +@@ -1112,12 +1209,11 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) + { + struct bch_dev *ca; + +- rcu_read_lock(); ++ guard(rcu)(); + bch2_bkey_drop_ptrs(k, ptr, + ptr->cached && + (!(ca = bch2_dev_rcu(c, ptr->dev)) || + dev_ptr_stale_rcu(ca, ptr) > 0)); +- rcu_read_unlock(); + + return bkey_deleted(k.k); + } +@@ -1135,7 +1231,7 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c, + struct bkey_ptrs ptrs; + bool have_cached_ptr; + +- rcu_read_lock(); ++ guard(rcu)(); + restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(k); + have_cached_ptr = false; +@@ -1148,15 +1244,14 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c, + } + have_cached_ptr = true; + } +- rcu_read_unlock(); + + return bkey_deleted(k.k); + } + + void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) + { +- out->atomic++; +- rcu_read_lock(); ++ guard(printbuf_atomic)(out); ++ guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, +@@ -1180,8 +1275,6 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc + else if (stale) + prt_printf(out, " invalid"); + } +- rcu_read_unlock(); +- --out->atomic; + } + + void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_crc_unpacked *crc) +@@ -1443,10 +1536,10 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, + const struct bch_extent_rebalance *r = &entry->rebalance; + + if (!bch2_compression_opt_valid(r->compression)) { +- struct bch_compression_opt opt = __bch2_compression_decode(r->compression); ++ union bch_compression_opt opt = { .value = r->compression }; + prt_printf(err, "invalid compression opt %u:%u", + opt.type, opt.level); +- return -BCH_ERR_invalid_bkey; ++ return bch_err_throw(c, invalid_bkey); + } + #endif + break; +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index 9fe153183b36..35ee03cd5065 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -392,10 +392,13 @@ out: \ + + /* utility code common to all keys with pointers: */ + ++void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *, ++ struct bch_io_failures *); + struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, + unsigned); + void bch2_mark_io_failure(struct bch_io_failures *, + struct extent_ptr_decoded *, bool); ++void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned); + int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, + struct bch_io_failures *, + struct extent_ptr_decoded 
*, int); +@@ -611,7 +614,10 @@ static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsig + return (void *) bch2_bkey_has_device_c(k.s_c, dev); + } + ++bool bch2_bkey_devs_rw(struct bch_fs *, struct bkey_s_c); ++ + bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); ++bool bch2_bkey_in_target(struct bch_fs *, struct bkey_s_c, unsigned); + + void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); + +diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h +index e51529dca4c2..b23ce4a373c0 100644 +--- a/fs/bcachefs/extents_types.h ++++ b/fs/bcachefs/extents_types.h +@@ -34,6 +34,7 @@ struct bch_io_failures { + u8 dev; + unsigned failed_csum_nr:6, + failed_io:1, ++ failed_btree_validate:1, + failed_ec:1; + } devs[BCH_REPLICAS_MAX + 1]; + }; +diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c +new file mode 100644 +index 000000000000..6be2a45be1dd +--- /dev/null ++++ b/fs/bcachefs/fast_list.c +@@ -0,0 +1,168 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++/* ++ * Fast, unordered lists ++ * ++ * Supports add, remove, and iterate ++ * ++ * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot ++ * allocation and freeing. ++ * ++ * This means that adding, removing, and iterating over items is lockless, ++ * except when refilling/emptying the percpu slot buffers. ++ */ ++ ++#include "fast_list.h" ++ ++struct fast_list_pcpu { ++ u32 nr; ++ u32 entries[31]; ++}; ++ ++static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp) ++{ ++ int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp); ++ if (unlikely(idx < 0)) ++ return 0; ++ ++ if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) { ++ ida_free(&l->slots_allocated, idx); ++ return 0; ++ } ++ ++ return idx; ++} ++ ++/** ++ * fast_list_get_idx - get a slot in a fast_list ++ * @l: list to get slot in ++ * ++ * This allocates a slot in the radix tree without storing to it, so that we can ++ * take the potential memory allocation failure early and do the list add later ++ * when we can't take an allocation failure. ++ * ++ * Returns: positive integer on success, -ENOMEM on failure ++ */ ++int fast_list_get_idx(struct fast_list *l) ++{ ++ unsigned long flags; ++ int idx; ++retry: ++ local_irq_save(flags); ++ struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer); ++ ++ if (unlikely(!lp->nr)) { ++ u32 entries[16], nr = 0; ++ ++ local_irq_restore(flags); ++ while (nr < ARRAY_SIZE(entries) && ++ (idx = fast_list_alloc_idx(l, GFP_KERNEL))) ++ entries[nr++] = idx; ++ local_irq_save(flags); ++ ++ lp = this_cpu_ptr(l->buffer); ++ ++ while (nr && lp->nr < ARRAY_SIZE(lp->entries)) ++ lp->entries[lp->nr++] = entries[--nr]; ++ ++ if (unlikely(nr)) { ++ local_irq_restore(flags); ++ while (nr) ++ ida_free(&l->slots_allocated, entries[--nr]); ++ goto retry; ++ } ++ ++ if (unlikely(!lp->nr)) { ++ local_irq_restore(flags); ++ return -ENOMEM; ++ } ++ } ++ ++ idx = lp->entries[--lp->nr]; ++ local_irq_restore(flags); ++ ++ return idx; ++} ++ ++/** ++ * fast_list_add - add an item to a fast_list ++ * @l: list ++ * @item: item to add ++ * ++ * Allocates a slot in the radix tree and stores to it and then returns the ++ * slot index, which must be passed to fast_list_remove(). 
++ *
++ * Returns: positive integer on success, -ENOMEM on failure
++ */
++int fast_list_add(struct fast_list *l, void *item)
++{
++	int idx = fast_list_get_idx(l);
++	if (idx < 0)
++		return idx;
++
++	*genradix_ptr_inlined(&l->items, idx) = item;
++	return idx;
++}
++
++/**
++ * fast_list_remove - remove an item from a fast_list
++ * @l: list
++ * @idx: item's slot index
++ *
++ * Zeroes out the slot in the radix tree and frees the slot for future
++ * fast_list_add() operations.
++ */
++void fast_list_remove(struct fast_list *l, unsigned idx)
++{
++	u32 entries[16], nr = 0;
++
++	if (!idx)
++		return;
++
++	*genradix_ptr_inlined(&l->items, idx) = NULL;
++
++	scoped_guard(irqsave) {
++		struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
++
++		if (unlikely(lp->nr == ARRAY_SIZE(lp->entries)))
++			while (nr < ARRAY_SIZE(entries))
++				entries[nr++] = lp->entries[--lp->nr];
++
++		lp->entries[lp->nr++] = idx;
++	}
++
++	if (unlikely(nr))
++		while (nr)
++			ida_free(&l->slots_allocated, entries[--nr]);
++}
++
++void fast_list_exit(struct fast_list *l)
++{
++	if (l->buffer) {
++		int cpu;
++		for_each_possible_cpu(cpu) {
++			struct fast_list_pcpu *lp = per_cpu_ptr(l->buffer, cpu);
++
++			while (lp->nr)
++				ida_free(&l->slots_allocated, lp->entries[--lp->nr]);
++		}
++
++		free_percpu(l->buffer);
++	}
++
++	WARN(ida_find_first(&l->slots_allocated) >= 0,
++	     "fast_list still has objects on exit\n");
++
++	ida_destroy(&l->slots_allocated);
++	genradix_free(&l->items);
++}
++
++int fast_list_init(struct fast_list *l)
++{
++	genradix_init(&l->items);
++	ida_init(&l->slots_allocated);
++	l->buffer = alloc_percpu(*l->buffer);
++	if (!l->buffer)
++		return -ENOMEM;
++	return 0;
++}
+diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h
+new file mode 100644
+index 000000000000..f67df3f72ee2
+--- /dev/null
++++ b/fs/bcachefs/fast_list.h
+@@ -0,0 +1,41 @@
++#ifndef _LINUX_FAST_LIST_H
++#define _LINUX_FAST_LIST_H
++
++#include <linux/generic-radix-tree.h>
++#include <linux/idr.h>
++#include <linux/percpu.h>
++
++struct fast_list_pcpu;
++
++struct fast_list {
++	GENRADIX(void *) items;
++	struct ida slots_allocated;
++	struct fast_list_pcpu __percpu
++		*buffer;
++};
++
++static inline void *fast_list_iter_peek(struct genradix_iter *iter,
++					struct fast_list *list)
++{
++	void **p;
++	while ((p = genradix_iter_peek(iter, &list->items)) && !*p)
++		genradix_iter_advance(iter, &list->items);
++
++	return p ?
*p : NULL; ++} ++ ++#define fast_list_for_each_from(_list, _iter, _i, _start) \ ++ for (_iter = genradix_iter_init(&(_list)->items, _start); \ ++ (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \ ++ genradix_iter_advance(&(_iter), &(_list)->items)) ++ ++#define fast_list_for_each(_list, _iter, _i) \ ++ fast_list_for_each_from(_list, _iter, _i, 0) ++ ++int fast_list_get_idx(struct fast_list *l); ++int fast_list_add(struct fast_list *l, void *item); ++void fast_list_remove(struct fast_list *l, unsigned idx); ++void fast_list_exit(struct fast_list *l); ++int fast_list_init(struct fast_list *l); ++ ++#endif /* _LINUX_FAST_LIST_H */ +diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c +index e3a75dcca60c..0005569ecace 100644 +--- a/fs/bcachefs/fs-io-buffered.c ++++ b/fs/bcachefs/fs-io-buffered.c +@@ -145,7 +145,7 @@ static int readpage_bio_extend(struct btree_trans *trans, + + BUG_ON(folio_sector(folio) != bio_end_sector(bio)); + +- BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); ++ bio_add_folio_nofail(bio, folio, folio_size(folio), 0); + } + + return bch2_trans_relock(trans); +@@ -157,7 +157,6 @@ static void bchfs_read(struct btree_trans *trans, + struct readpages_iter *readpages_iter) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; + struct bkey_buf sk; + int flags = BCH_READ_retry_if_stale| + BCH_READ_may_promote; +@@ -167,7 +166,7 @@ static void bchfs_read(struct btree_trans *trans, + + bch2_bkey_buf_init(&sk); + bch2_trans_begin(trans); +- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, ++ CLASS(btree_iter, iter)(trans, BTREE_ID_extents, + POS(inum.inum, rbio->bio.bi_iter.bi_sector), + BTREE_ITER_slots); + while (1) { +@@ -183,12 +182,12 @@ static void bchfs_read(struct btree_trans *trans, + if (ret) + goto err; + +- bch2_btree_iter_set_snapshot(trans, &iter, snapshot); ++ bch2_btree_iter_set_snapshot(&iter, snapshot); + +- bch2_btree_iter_set_pos(trans, &iter, ++ bch2_btree_iter_set_pos(&iter, + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); + +- k = bch2_btree_iter_peek_slot(trans, &iter); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -251,15 +250,13 @@ static void bchfs_read(struct btree_trans *trans, + !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + break; + } +- bch2_trans_iter_exit(trans, &iter); + + if (ret) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); +- prt_printf(&buf, "read error %i from btree lookup", ret); ++ prt_printf(&buf, "read error %s from btree lookup", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); +- printbuf_exit(&buf); + + rbio->bio.bi_status = BLK_STS_IOERR; + bio_endio(&rbio->bio); +@@ -311,7 +308,7 @@ void bch2_readahead(struct readahead_control *ractl) + readpage_iter_advance(&readpages_iter); + + rbio->bio.bi_iter.bi_sector = folio_sector(folio); +- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); ++ bio_add_folio_nofail(&rbio->bio, folio, folio_size(folio), 0); + + bchfs_read(trans, rbio, inode_inum(inode), + &readpages_iter); +@@ -354,7 +351,7 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) + rbio->bio.bi_private = &done; + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); +- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); ++ bio_add_folio_nofail(&rbio->bio, folio, folio_size(folio), 0); + + blk_start_plug(&plug); + 
bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); +@@ -394,17 +391,9 @@ struct bch_writepage_state { + struct bch_io_opts opts; + struct bch_folio_sector *tmp; + unsigned tmp_sectors; ++ struct blk_plug plug; + }; + +-static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, +- struct bch_inode_info *inode) +-{ +- struct bch_writepage_state ret = { 0 }; +- +- bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); +- return ret; +-} +- + /* + * Determine when a writepage io is full. We have to limit writepage bios to a + * single page per bvec (i.e. 1MB with 4k pages) because that is the limit to +@@ -433,27 +422,23 @@ static void bch2_writepage_io_done(struct bch_write_op *op) + set_bit(EI_INODE_ERROR, &io->inode->ei_flags); + + bio_for_each_folio_all(fi, bio) { +- struct bch_folio *s; +- + mapping_set_error(fi.folio->mapping, -EIO); + +- s = __bch2_folio(fi.folio); +- spin_lock(&s->lock); ++ struct bch_folio *s = __bch2_folio(fi.folio); ++ guard(spinlock)(&s->lock); ++ + for (i = 0; i < folio_sectors(fi.folio); i++) + s->s[i].nr_replicas = 0; +- spin_unlock(&s->lock); + } + } + + if (io->op.flags & BCH_WRITE_wrote_data_inline) { + bio_for_each_folio_all(fi, bio) { +- struct bch_folio *s; ++ struct bch_folio *s = __bch2_folio(fi.folio); ++ guard(spinlock)(&s->lock); + +- s = __bch2_folio(fi.folio); +- spin_lock(&s->lock); + for (i = 0; i < folio_sectors(fi.folio); i++) + s->s[i].nr_replicas = 0; +- spin_unlock(&s->lock); + } + } + +@@ -579,30 +564,30 @@ static int __bch2_writepage(struct folio *folio, + BUG_ON(ret); + + /* Before unlocking the page, get copy of reservations: */ +- spin_lock(&s->lock); +- memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); ++ scoped_guard(spinlock, &s->lock) { ++ memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); + +- for (i = 0; i < f_sectors; i++) { +- if (s->s[i].state < SECTOR_dirty) +- continue; ++ for (i = 0; i < f_sectors; i++) { ++ if (s->s[i].state < SECTOR_dirty) ++ continue; + +- nr_replicas_this_write = +- min_t(unsigned, nr_replicas_this_write, +- s->s[i].nr_replicas + +- s->s[i].replicas_reserved); +- } ++ nr_replicas_this_write = ++ min_t(unsigned, nr_replicas_this_write, ++ s->s[i].nr_replicas + ++ s->s[i].replicas_reserved); ++ } + +- for (i = 0; i < f_sectors; i++) { +- if (s->s[i].state < SECTOR_dirty) +- continue; ++ for (i = 0; i < f_sectors; i++) { ++ if (s->s[i].state < SECTOR_dirty) ++ continue; + +- s->s[i].nr_replicas = w->opts.compression +- ? 0 : nr_replicas_this_write; ++ s->s[i].nr_replicas = w->opts.compression ++ ? 
0 : nr_replicas_this_write; + +- s->s[i].replicas_reserved = 0; +- bch2_folio_sector_set(folio, s, i, SECTOR_allocated); ++ s->s[i].replicas_reserved = 0; ++ bch2_folio_sector_set(folio, s, i, SECTOR_allocated); ++ } + } +- spin_unlock(&s->lock); + + BUG_ON(atomic_read(&s->write_count)); + atomic_set(&s->write_count, 1); +@@ -647,8 +632,8 @@ static int __bch2_writepage(struct folio *folio, + atomic_inc(&s->write_count); + + BUG_ON(inode != w->io->inode); +- BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, +- sectors << 9, offset << 9)); ++ bio_add_folio_nofail(&w->io->op.wbio.bio, folio, ++ sectors << 9, offset << 9); + + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; +@@ -666,17 +651,17 @@ static int __bch2_writepage(struct folio *folio, + int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) + { + struct bch_fs *c = mapping->host->i_sb->s_fs_info; +- struct bch_writepage_state w = +- bch_writepage_state_init(c, to_bch_ei(mapping->host)); +- struct blk_plug plug; +- int ret; ++ struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL); + +- blk_start_plug(&plug); +- ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); +- if (w.io) +- bch2_writepage_do_io(&w); +- blk_finish_plug(&plug); +- kfree(w.tmp); ++ bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode); ++ ++ blk_start_plug(&w->plug); ++ int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w); ++ if (w->io) ++ bch2_writepage_do_io(w); ++ blk_finish_plug(&w->plug); ++ kfree(w->tmp); ++ kfree(w); + return bch2_err_class(ret); + } + +@@ -788,10 +773,9 @@ int bch2_write_end(struct file *file, struct address_space *mapping, + copied = 0; + } + +- spin_lock(&inode->v.i_lock); +- if (pos + copied > inode->v.i_size) +- i_size_write(&inode->v, pos + copied); +- spin_unlock(&inode->v.i_lock); ++ scoped_guard(spinlock, &inode->v.i_lock) ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); + + if (copied) { + if (!folio_test_uptodate(folio)) +@@ -950,10 +934,9 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, + + end = pos + copied; + +- spin_lock(&inode->v.i_lock); +- if (end > inode->v.i_size) +- i_size_write(&inode->v, end); +- spin_unlock(&inode->v.i_lock); ++ scoped_guard(spinlock, &inode->v.i_lock) ++ if (end > inode->v.i_size) ++ i_size_write(&inode->v, end); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(fs)); +diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c +index 535bc5fcbcc0..79823234160f 100644 +--- a/fs/bcachefs/fs-io-direct.c ++++ b/fs/bcachefs/fs-io-direct.c +@@ -3,6 +3,7 @@ + + #include "bcachefs.h" + #include "alloc_foreground.h" ++#include "enumerated_ref.h" + #include "fs.h" + #include "fs-io.h" + #include "fs-io-direct.h" +@@ -126,7 +127,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + * the dirtying of requests that are internal from the kernel (i.e. from + * loopback), because we'll deadlock on page_lock. 
+ */ +- dio->should_dirty = iter_is_iovec(iter); ++ dio->should_dirty = user_backed_iter(iter); + + blk_start_plug(&plug); + +@@ -251,12 +252,10 @@ static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) + { +- struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; ++ CLASS(btree_trans, trans)(c); + struct bkey_s_c k; + u64 end = offset + size; + u32 snapshot; +- bool ret = true; + int err; + retry: + bch2_trans_begin(trans); +@@ -268,25 +267,21 @@ static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + for_each_btree_key_norestart(trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_slots, k, err) { ++ offset = iter.pos.offset; ++ + if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) + break; + + if (k.k->p.snapshot != snapshot || + nr_replicas > bch2_bkey_replicas(c, k) || +- (!compressed && bch2_bkey_sectors_compressed(k))) { +- ret = false; +- break; +- } ++ (!compressed && bch2_bkey_sectors_compressed(k))) ++ return false; + } +- +- offset = iter.pos.offset; +- bch2_trans_iter_exit(trans, &iter); + err: + if (bch2_err_matches(err, BCH_ERR_transaction_restart)) + goto retry; +- bch2_trans_put(trans); + +- return err ? false : ret; ++ return !err; + } + + static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) +@@ -401,7 +396,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) + ret = dio->op.error ?: ((long) dio->written << 9); + bio_put(&dio->op.wbio.bio); + +- bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); + + /* inode->i_dio_count is our ref on inode and thus bch_fs */ + inode_dio_end(&inode->v); +@@ -427,17 +422,15 @@ static __always_inline void bch2_dio_write_end(struct dio_write *dio) + dio->written += dio->op.written; + + if (dio->extending) { +- spin_lock(&inode->v.i_lock); ++ guard(spinlock)(&inode->v.i_lock); + if (req->ki_pos > inode->v.i_size) + i_size_write(&inode->v, req->ki_pos); +- spin_unlock(&inode->v.i_lock); + } + + if (dio->op.i_sectors_delta || dio->quota_res.sectors) { +- mutex_lock(&inode->ei_quota_lock); ++ guard(mutex)(&inode->ei_quota_lock); + __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); + __bch2_quota_reservation_put(c, inode, &dio->quota_res); +- mutex_unlock(&inode->ei_quota_lock); + } + + bio_release_pages(bio, false); +@@ -606,7 +599,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) + prefetch(&inode->ei_inode); + prefetch((void *) &inode->ei_inode + 64); + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write)) ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write)) + return -EROFS; + + inode_lock(&inode->v); +@@ -675,7 +668,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) + bio_put(bio); + inode_dio_end(&inode->v); + err_put_write_ref: +- bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); + goto out; + } + +diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c +index fbae9c1de746..469492f6264a 100644 +--- a/fs/bcachefs/fs-io-pagecache.c ++++ b/fs/bcachefs/fs-io-pagecache.c +@@ -125,11 +125,9 @@ folio_sector_reserve(enum bch_folio_sector_state state) + /* for newly allocated folios: */ + struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) + { +- struct bch_folio *s; +- +- s = kzalloc(sizeof(*s) + +- sizeof(struct 
bch_folio_sector) * +- folio_sectors(folio), gfp); ++ struct bch_folio *s = kzalloc(sizeof(*s) + ++ sizeof(struct bch_folio_sector) * ++ folio_sectors(folio), gfp); + if (!s) + return NULL; + +@@ -162,7 +160,7 @@ static void __bch2_folio_set(struct folio *folio, + BUG_ON(pg_offset >= sectors); + BUG_ON(pg_offset + pg_len > sectors); + +- spin_lock(&s->lock); ++ guard(spinlock)(&s->lock); + + for (i = pg_offset; i < pg_offset + pg_len; i++) { + s->s[i].nr_replicas = nr_ptrs; +@@ -171,8 +169,6 @@ static void __bch2_folio_set(struct folio *folio, + + if (i == sectors) + s->uptodate = true; +- +- spin_unlock(&s->lock); + } + + /* +@@ -276,10 +272,9 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, + s = bch2_folio(folio); + + if (s) { +- spin_lock(&s->lock); ++ guard(spinlock)(&s->lock); + for (j = folio_offset; j < folio_offset + folio_len; j++) + s->s[j].nr_replicas = 0; +- spin_unlock(&s->lock); + } + + folio_unlock(folio); +@@ -330,13 +325,12 @@ int bch2_mark_pagecache_reserved(struct bch_inode_info *inode, + unsigned folio_offset = max(*start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + +- spin_lock(&s->lock); ++ guard(spinlock)(&s->lock); + for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) { + i_sectors_delta -= s->s[j].state == SECTOR_dirty; + bch2_folio_sector_set(folio, s, j, + folio_sector_reserve(s->s[j].state)); + } +- spin_unlock(&s->lock); + } + + folio_unlock(folio); +@@ -447,7 +441,7 @@ static int __bch2_folio_reservation_get(struct bch_fs *c, + + if (!reserved) { + bch2_disk_reservation_put(c, &disk_res); +- return -BCH_ERR_ENOSPC_disk_reservation; ++ return bch_err_throw(c, ENOSPC_disk_reservation); + } + break; + } +@@ -529,29 +523,26 @@ void bch2_set_folio_dirty(struct bch_fs *c, + + BUG_ON(!s->uptodate); + +- spin_lock(&s->lock); ++ scoped_guard(spinlock, &s->lock) ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ unsigned sectors = sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); + +- for (i = round_down(offset, block_bytes(c)) >> 9; +- i < round_up(offset + len, block_bytes(c)) >> 9; +- i++) { +- unsigned sectors = sectors_to_reserve(&s->s[i], +- res->disk.nr_replicas); ++ /* ++ * This can happen if we race with the error path in ++ * bch2_writepage_io_done(): ++ */ ++ sectors = min_t(unsigned, sectors, res->disk.sectors); + +- /* +- * This can happen if we race with the error path in +- * bch2_writepage_io_done(): +- */ +- sectors = min_t(unsigned, sectors, res->disk.sectors); ++ s->s[i].replicas_reserved += sectors; ++ res->disk.sectors -= sectors; + +- s->s[i].replicas_reserved += sectors; +- res->disk.sectors -= sectors; ++ dirty_sectors += s->s[i].state == SECTOR_unallocated; + +- dirty_sectors += s->s[i].state == SECTOR_unallocated; +- +- bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); +- } +- +- spin_unlock(&s->lock); ++ bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); ++ } + + bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors); + +@@ -644,6 +635,8 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) + goto out; + } + ++ inode->ei_last_dirtied = (unsigned long) current; ++ + bch2_set_folio_dirty(c, inode, folio, &res, offset, len); + bch2_folio_reservation_put(c, inode, &res); + +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 9657144666b8..de0d965f3fde 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -7,6 +7,7 @@ 
+ #include "btree_update.h" + #include "buckets.h" + #include "clock.h" ++#include "enumerated_ref.h" + #include "error.h" + #include "extents.h" + #include "extent_update.h" +@@ -48,7 +49,8 @@ static void nocow_flush_endio(struct bio *_bio) + struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); + + closure_put(bio->cl); +- percpu_ref_put(&bio->ca->io_ref[WRITE]); ++ enumerated_ref_put(&bio->ca->io_ref[WRITE], ++ BCH_DEV_WRITE_REF_nocow_flush); + bio_put(&bio->bio); + } + +@@ -69,11 +71,12 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, + memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); + + for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { +- rcu_read_lock(); +- ca = rcu_dereference(c->devs[dev]); +- if (ca && !percpu_ref_tryget(&ca->io_ref[WRITE])) +- ca = NULL; +- rcu_read_unlock(); ++ scoped_guard(rcu) { ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE], ++ BCH_DEV_WRITE_REF_nocow_flush)) ++ ca = NULL; ++ } + + if (!ca) + continue; +@@ -145,17 +148,15 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, s64 sectors) + { + if (unlikely((s64) inode->v.i_blocks + sectors < 0)) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, sectors, + inode->ei_inode.bi_sectors); + +- bool repeat = false, print = false, suppress = false; +- bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, buf.buf, &repeat, &print, &suppress); ++ bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf); + if (print) +- bch2_print_str(c, buf.buf); +- printbuf_exit(&buf); ++ bch2_print_str(c, KERN_ERR, buf.buf); + + if (sectors < 0) + sectors = -inode->v.i_blocks; +@@ -185,7 +186,6 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, + u64 *seq) + { +- struct printbuf buf = PRINTBUF; + struct bch_inode_unpacked u; + struct btree_iter iter; + int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); +@@ -195,6 +195,7 @@ static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_in + u64 cur_seq = journal_cur_seq(&trans->c->journal); + *seq = min(cur_seq, u.bi_journal_seq); + ++ CLASS(printbuf, buf)(); + if (fsck_err_on(u.bi_journal_seq > cur_seq, + trans, inode_journal_seq_in_future, + "inode journal seq in future (currently at %llu)\n%s", +@@ -205,8 +206,7 @@ static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_in + ret = bch2_inode_write(trans, &iter, &u); + } + fsck_err: +- bch2_trans_iter_exit(trans, &iter); +- printbuf_exit(&buf); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -220,15 +220,15 @@ static int bch2_flush_inode(struct bch_fs *c, + if (c->opts.journal_flush_disabled) + return 0; + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync)) + return -EROFS; + + u64 seq; + int ret = bch2_trans_commit_do(c, NULL, NULL, 0, +- bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: ++ bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: + bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: + bch2_inode_flush_nocow_writes(c, inode); +- bch2_write_ref_put(c, BCH_WRITE_REF_fsync); ++ enumerated_ref_put(&c->writes, 
BCH_WRITE_REF_fsync); + return ret; + } + +@@ -265,11 +265,11 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, + struct bpos start, + struct bpos end) + { +- return bch2_trans_run(c, +- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, ++ CLASS(btree_trans, trans)(c); ++ return for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, + subvol, 0, k, ({ +- bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); +- }))); ++ bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); ++ })); + } + + static int __bch2_truncate_folio(struct bch_inode_info *inode, +@@ -519,19 +519,16 @@ int bchfs_truncate(struct mnt_idmap *idmap, + + if (unlikely(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal))) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + prt_printf(&buf, + "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, + inode->ei_inode.bi_sectors); + +- bool repeat = false, print = false, suppress = false; +- bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, buf.buf, +- &repeat, &print, &suppress); ++ bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf); + if (print) +- bch2_print_str(c, buf.buf); +- printbuf_exit(&buf); ++ bch2_print_str(c, KERN_ERR, buf.buf); + } + + ret = bch2_setattr_nonsize(idmap, inode, iattr); +@@ -559,11 +556,10 @@ static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, l + u64 block_start = round_up(offset, block_bytes(c)); + u64 block_end = round_down(end, block_bytes(c)); + bool truncated_last_page; +- int ret = 0; + +- ret = bch2_truncate_folios(inode, offset, end); ++ int ret = bch2_truncate_folios(inode, offset, end); + if (unlikely(ret < 0)) +- goto err; ++ return ret; + + truncated_last_page = ret; + +@@ -576,19 +572,18 @@ static noinline long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, l + block_start >> 9, block_end >> 9, + &i_sectors_delta); + bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ if (ret) ++ return ret; + } + +- mutex_lock(&inode->ei_update_lock); +- if (end >= inode->v.i_size && !truncated_last_page) { +- ret = bch2_write_inode_size(c, inode, inode->v.i_size, +- ATTR_MTIME|ATTR_CTIME); +- } else { +- ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ guard(mutex)(&inode->ei_update_lock); ++ if (end >= inode->v.i_size && !truncated_last_page) ++ return bch2_write_inode_size(c, inode, inode->v.i_size, ++ ATTR_MTIME|ATTR_CTIME); ++ else ++ return bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME); +- } +- mutex_unlock(&inode->ei_update_lock); +-err: +- return ret; + } + + static noinline long bchfs_fcollapse_finsert(struct bch_inode_info *inode, +@@ -631,15 +626,14 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + u64 start_sector, u64 end_sector) + { + struct bch_fs *c = inode->v.i_sb->s_fs_info; +- struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; + struct bpos end_pos = POS(inode->v.i_ino, end_sector); + struct bch_io_opts opts; + int ret = 0; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, ++ CLASS(btree_trans, trans)(c); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_extents, + POS(inode->v.i_ino, start_sector), + BTREE_ITER_slots|BTREE_ITER_intent); + +@@ -662,9 +656,9 @@ static noinline int 
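The fs-io.c hunks above also switch reference counting from percpu_ref and the old bch2_write_ref helpers to the enumerated refs pulled in via "enumerated_ref.h": every tryget/put now passes an enum constant naming the acquirer (BCH_DEV_WRITE_REF_nocow_flush, BCH_WRITE_REF_fsync), presumably so a leaked reference can be attributed to its call site. A sketch of the caller-side shape, built only from the two calls visible above; the surrounding function is illustrative:

static int flush_one_dev(struct bch_fs *c, struct bch_dev *ca)
{
	/* tryget fails once the device's write ref is shutting down: */
	if (!enumerated_ref_tryget(&ca->io_ref[WRITE],
				   BCH_DEV_WRITE_REF_nocow_flush))
		return 0;	/* device going away, nothing to flush */

	/* ... submit the flush bio here ... */

	enumerated_ref_put(&ca->io_ref[WRITE],
			   BCH_DEV_WRITE_REF_nocow_flush);
	return 0;
}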
__bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ if (ret)
+ goto bkey_err;
+
+- bch2_btree_iter_set_snapshot(trans, &iter, snapshot);
++ bch2_btree_iter_set_snapshot(&iter, snapshot);
+
+- k = bch2_btree_iter_peek_slot(trans, &iter);
++ k = bch2_btree_iter_peek_slot(&iter);
+ if ((ret = bkey_err(k)))
+ goto bkey_err;
+
+@@ -675,13 +669,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ /* already reserved */
+ if (bkey_extent_is_reservation(k) &&
+ bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) {
+- bch2_btree_iter_advance(trans, &iter);
++ bch2_btree_iter_advance(&iter);
+ continue;
+ }
+
+ if (bkey_extent_is_data(k.k) &&
+ !(mode & FALLOC_FL_ZERO_RANGE)) {
+- bch2_btree_iter_advance(trans, &iter);
++ bch2_btree_iter_advance(&iter);
+ continue;
+ }
+
+@@ -702,7 +696,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ if (ret)
+ goto bkey_err;
+ }
+- bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start));
++ bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
+
+ if (ret)
+ goto bkey_err;
+
+@@ -752,8 +746,6 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ bch2_quota_reservation_put(c, inode, &quota_res);
+ }
+
+- bch2_trans_iter_exit(trans, &iter);
+- bch2_trans_put(trans);
+ return ret;
+ }
+
+@@ -802,13 +794,11 @@ static noinline long bchfs_fallocate(struct bch_inode_info *inode, int mode,
+ if (end >= inode->v.i_size &&
+ (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) ||
+ !(mode & FALLOC_FL_KEEP_SIZE))) {
+- spin_lock(&inode->v.i_lock);
+- i_size_write(&inode->v, end);
+- spin_unlock(&inode->v.i_lock);
++ scoped_guard(spinlock, &inode->v.i_lock)
++ i_size_write(&inode->v, end);
+
+- mutex_lock(&inode->ei_update_lock);
+- ret2 = bch2_write_inode_size(c, inode, end, 0);
+- mutex_unlock(&inode->ei_update_lock);
++ scoped_guard(mutex, &inode->ei_update_lock)
++ ret2 = bch2_write_inode_size(c, inode, end, 0);
+ }
+
+ return ret ?: ret2;
+@@ -821,7 +811,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ long ret;
+
+- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate))
++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate))
+ return -EROFS;
+
+ inode_lock(&inode->v);
+@@ -841,11 +831,11 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
+ else if (mode == FALLOC_FL_COLLAPSE_RANGE)
+ ret = bchfs_fcollapse_finsert(inode, offset, len, false);
+ else
+- ret = -EOPNOTSUPP;
++ ret = bch_err_throw(c, unsupported_fallocate_mode);
+ err:
+ bch2_pagecache_block_put(inode);
+ inode_unlock(&inode->v);
+- bch2_write_ref_put(c, BCH_WRITE_REF_fallocate);
++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate);
+
+ return bch2_err_class(ret);
+ }
+@@ -861,8 +851,8 @@ static int quota_reserve_range(struct bch_inode_info *inode,
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+ u64 sectors = end - start;
+
+- int ret = bch2_trans_run(c,
+- for_each_btree_key_in_subvolume_max(trans, iter,
++ CLASS(btree_trans, trans)(c);
++ int ret = for_each_btree_key_in_subvolume_max(trans, iter,
+ BTREE_ID_extents,
+ POS(inode->v.i_ino, start),
+ POS(inode->v.i_ino, end - 1),
+@@ -875,7 +865,7 @@ static int quota_reserve_range(struct bch_inode_info *inode,
+ }
+
+ 0;
+- })));
++ }));
+
+ return ret ?: bch2_quota_reservation_add(c, inode, res, sectors, true);
+ }
+@@ -955,10 +945,9 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
+
+ bch2_i_sectors_acct(c, 
dst, "a_res, i_sectors_delta); + +- spin_lock(&dst->v.i_lock); +- if (pos_dst + ret > dst->v.i_size) +- i_size_write(&dst->v, pos_dst + ret); +- spin_unlock(&dst->v.i_lock); ++ scoped_guard(spinlock, &dst->v.i_lock) ++ if (pos_dst + ret > dst->v.i_size) ++ i_size_write(&dst->v, pos_dst + ret); + + if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || + IS_SYNC(file_inode(file_dst))) +@@ -1020,38 +1009,38 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) + if (offset >= isize) + return -ENXIO; + +- int ret = bch2_trans_run(c, +- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, ++ CLASS(btree_trans, trans)(c); ++ int ret = for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), + POS(inode->v.i_ino, U64_MAX), + inum.subvol, BTREE_ITER_slots, k, ({ +- if (k.k->p.inode != inode->v.i_ino || +- !bkey_extent_is_data(k.k)) { +- loff_t start_offset = k.k->p.inode == inode->v.i_ino +- ? max(offset, bkey_start_offset(k.k) << 9) +- : offset; +- loff_t end_offset = k.k->p.inode == inode->v.i_ino +- ? MAX_LFS_FILESIZE +- : k.k->p.offset << 9; +- +- /* +- * Found a hole in the btree, now make sure it's +- * a hole in the pagecache. We might have to +- * keep searching if this hole is entirely dirty +- * in the page cache: +- */ +- bch2_trans_unlock(trans); +- loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v, +- start_offset, end_offset, 0, false); +- if (pagecache_hole < end_offset) { +- next_hole = pagecache_hole; +- break; +- } +- } else { +- offset = max(offset, bkey_start_offset(k.k) << 9); ++ if (k.k->p.inode != inode->v.i_ino || ++ !bkey_extent_is_data(k.k)) { ++ loff_t start_offset = k.k->p.inode == inode->v.i_ino ++ ? max(offset, bkey_start_offset(k.k) << 9) ++ : offset; ++ loff_t end_offset = k.k->p.inode == inode->v.i_ino ++ ? MAX_LFS_FILESIZE ++ : k.k->p.offset << 9; ++ ++ /* ++ * Found a hole in the btree, now make sure it's ++ * a hole in the pagecache. We might have to ++ * keep searching if this hole is entirely dirty ++ * in the page cache: ++ */ ++ bch2_trans_unlock(trans); ++ loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v, ++ start_offset, end_offset, 0, false); ++ if (pagecache_hole < end_offset) { ++ next_hole = pagecache_hole; ++ break; + } +- 0; +- }))); ++ } else { ++ offset = max(offset, bkey_start_offset(k.k) << 9); ++ } ++ 0; ++ })); + if (ret) + return ret; + +diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h +index ca70346e68dc..d229f7225da1 100644 +--- a/fs/bcachefs/fs-io.h ++++ b/fs/bcachefs/fs-io.h +@@ -77,9 +77,8 @@ static inline void bch2_quota_reservation_put(struct bch_fs *c, + struct quota_res *res) + { + if (res->sectors) { +- mutex_lock(&inode->ei_quota_lock); ++ guard(mutex)(&inode->ei_quota_lock); + __bch2_quota_reservation_put(c, inode, res); +- mutex_unlock(&inode->ei_quota_lock); + } + } + +@@ -94,16 +93,15 @@ static inline int bch2_quota_reservation_add(struct bch_fs *c, + if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) + return 0; + +- mutex_lock(&inode->ei_quota_lock); ++ guard(mutex)(&inode->ei_quota_lock); + ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, + check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); +- if (likely(!ret)) { +- inode->ei_quota_reserved += sectors; +- res->sectors += sectors; +- } +- mutex_unlock(&inode->ei_quota_lock); ++ if (ret) ++ return ret; + +- return ret; ++ inode->ei_quota_reserved += sectors; ++ res->sectors += sectors; ++ return 0; + } + + #else +@@ -134,9 +132,8 @@ static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info * + struct quota_res *quota_res, s64 sectors) + { + if (sectors) { +- mutex_lock(&inode->ei_quota_lock); ++ guard(mutex)(&inode->ei_quota_lock); + __bch2_i_sectors_acct(c, inode, quota_res, sectors); +- mutex_unlock(&inode->ei_quota_lock); + } + } + +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +index a82dfce9e4ad..8b9d3c7d1f57 100644 +--- a/fs/bcachefs/fs-ioctl.c ++++ b/fs/bcachefs/fs-ioctl.c +@@ -111,9 +111,8 @@ static int bch2_ioc_getlabel(struct bch_fs *c, char __user *user_label) + + BUILD_BUG_ON(BCH_SB_LABEL_SIZE >= FSLABEL_MAX); + +- mutex_lock(&c->sb_lock); +- memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE); +- mutex_unlock(&c->sb_lock); ++ scoped_guard(mutex, &c->sb_lock) ++ memcpy(label, c->disk_sb.sb->label, BCH_SB_LABEL_SIZE); + + len = strnlen(label, BCH_SB_LABEL_SIZE); + if (len == BCH_SB_LABEL_SIZE) { +@@ -152,10 +151,10 @@ static int bch2_ioc_setlabel(struct bch_fs *c, + if (ret) + return ret; + +- mutex_lock(&c->sb_lock); +- strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); +- ret = bch2_write_super(c); +- mutex_unlock(&c->sb_lock); ++ scoped_guard(mutex, &c->sb_lock) { ++ strscpy(c->disk_sb.sb->label, label, BCH_SB_LABEL_SIZE); ++ ret = bch2_write_super(c); ++ } + + mnt_drop_write_file(file); + return ret; +@@ -172,7 +171,10 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) + if (get_user(flags, arg)) + return -EFAULT; + +- bch_notice(c, "shutdown by ioctl type %u", flags); ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(c, &buf); ++ ++ prt_printf(&buf, "shutdown by ioctl type %u", flags); + + switch (flags) { + case FSOP_GOING_FLAGS_DEFAULT: +@@ -180,20 +182,20 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) + if (ret) + break; + bch2_journal_flush(&c->journal); +- bch2_fs_emergency_read_only(c); ++ bch2_fs_emergency_read_only2(c, &buf); + bdev_thaw(c->vfs_sb->s_bdev); + break; + case FSOP_GOING_FLAGS_LOGFLUSH: + bch2_journal_flush(&c->journal); + fallthrough; + case FSOP_GOING_FLAGS_NOLOGFLUSH: +- bch2_fs_emergency_read_only(c); ++ bch2_fs_emergency_read_only2(c, &buf); + break; + default: +- ret = -EINVAL; +- break; ++ return -EINVAL; + } + ++ bch2_print_str(c, KERN_ERR, buf.buf); + return ret; + } + +@@ -228,9 +230,8 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) { + /* sync_inodes_sb enforce s_umount is locked */ +- down_read(&c->vfs_sb->s_umount); ++ guard(rwsem_read)(&c->vfs_sb->s_umount); + sync_inodes_sb(c->vfs_sb); +- up_read(&c->vfs_sb->s_umount); + } + + if (arg.src_ptr) { +@@ -262,13 +263,13 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + } + + if (dst_dentry->d_inode) { +- error = -BCH_ERR_EEXIST_subvolume_create; ++ error = bch_err_throw(c, EEXIST_subvolume_create); + goto err3; + } + + dir = dst_path.dentry->d_inode; + if (IS_DEADDIR(dir)) { +- error = -BCH_ERR_ENOENT_directory_dead; ++ error = bch_err_throw(c, ENOENT_directory_dead); + goto err3; + } + +@@ -295,12 +296,10 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + 
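Another recurring conversion in the fs-ioctl.c hunks above: bare returns of private error codes (-BCH_ERR_EEXIST_subvolume_create, -BCH_ERR_ENOENT_directory_dead) become bch_err_throw(c, ...). The macro takes the filesystem as well, presumably so the throw site can be counted or traced per fs before the usual -BCH_ERR_* value is returned; callers keep matching on the outer error class with bch2_err_matches(), exactly as before. Sketched usage, with an illustrative helper:

static int check_dir_alive(struct bch_fs *c, struct inode *dir)
{
	if (IS_DEADDIR(dir))
		return bch_err_throw(c, ENOENT_directory_dead);
	return 0;
}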
!arg.src_ptr) + snapshot_src.subvol = inode_inum(to_bch_ei(dir)).subvol; + +- down_write(&c->snapshot_create_lock); +- inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), +- dst_dentry, arg.mode|S_IFDIR, +- 0, snapshot_src, create_flags); +- up_write(&c->snapshot_create_lock); +- ++ scoped_guard(rwsem_write, &c->snapshot_create_lock) ++ inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), ++ dst_dentry, arg.mode|S_IFDIR, ++ 0, snapshot_src, create_flags); + error = PTR_ERR_OR_ZERO(inode); + if (error) + goto err3; +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index 47f1a64c5c8d..b5e3090f1cb8 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -106,14 +106,13 @@ int __must_check bch2_write_inode(struct bch_fs *c, + inode_set_fn set, + void *p, unsigned fields) + { +- struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter = {}; +- struct bch_inode_unpacked inode_u; +- int ret; ++ CLASS(btree_trans, trans)(c); + retry: + bch2_trans_begin(trans); + +- ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); ++ struct btree_iter iter = {}; ++ struct bch_inode_unpacked inode_u; ++ int ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); + if (ret) + goto err; + +@@ -124,8 +123,9 @@ int __must_check bch2_write_inode(struct bch_fs *c, + goto err; + + struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); ++ bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r)); + +- if (memcmp(&old_r, &new_r, sizeof(new_r))) { ++ if (rebalance_changed) { + ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); + if (ret) + goto err; +@@ -141,18 +141,20 @@ int __must_check bch2_write_inode(struct bch_fs *c, + if (!ret) + bch2_inode_update_after_write(trans, inode, &inode_u, fields); + err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + ++ if (rebalance_changed) ++ bch2_rebalance_wakeup(c); ++ + bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, + "%s: inode %llu:%llu not found when updating", + bch2_err_str(ret), + inode_inum(inode).subvol, + inode_inum(inode).inum); + +- bch2_trans_put(trans); + return ret < 0 ? 
ret : 0; + } + +@@ -162,40 +164,30 @@ int bch2_fs_quota_transfer(struct bch_fs *c, + unsigned qtypes, + enum quota_acct_mode mode) + { +- unsigned i; +- int ret; +- + qtypes &= enabled_qtypes(c); + +- for (i = 0; i < QTYP_NR; i++) ++ for (unsigned i = 0; i < QTYP_NR; i++) + if (new_qid.q[i] == inode->ei_qid.q[i]) + qtypes &= ~(1U << i); + + if (!qtypes) + return 0; + +- mutex_lock(&inode->ei_quota_lock); ++ guard(mutex)(&inode->ei_quota_lock); + +- ret = bch2_quota_transfer(c, qtypes, new_qid, ++ int ret = bch2_quota_transfer(c, qtypes, new_qid, + inode->ei_qid, + inode->v.i_blocks + + inode->ei_quota_reserved, + mode); + if (!ret) +- for (i = 0; i < QTYP_NR; i++) ++ for (unsigned i = 0; i < QTYP_NR; i++) + if (qtypes & (1 << i)) + inode->ei_qid.q[i] = new_qid.q[i]; + +- mutex_unlock(&inode->ei_quota_lock); +- + return ret; + } + +-static bool subvol_inum_eq(subvol_inum a, subvol_inum b) +-{ +- return a.subvol == b.subvol && a.inum == b.inum; +-} +- + static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) + { + const subvol_inum *inum = data; +@@ -242,7 +234,7 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) + struct bch_fs *c = trans->c; + struct rhltable *ht = &c->vfs_inodes_by_inum_table; + u64 inum = p.offset; +- DARRAY(u32) subvols; ++ CLASS(darray_u32, subvols)(); + int ret = 0; + + if (!test_bit(BCH_FS_started, &c->flags)) +@@ -281,7 +273,7 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) + rcu_read_unlock(); + ret = darray_make_room(&subvols, 1); + if (ret) +- goto err; ++ return ret; + subvols.nr = 0; + goto restart_from_top; + } +@@ -304,14 +296,13 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) + u32 snap; + ret = bch2_subvolume_get_snapshot(trans, *i, &snap); + if (ret) +- goto err; ++ return ret; + + ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot); + if (ret) + break; + } +-err: +- darray_exit(&subvols); ++ + return ret; + } + +@@ -352,9 +343,8 @@ static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btre + if (!trans) { + __wait_on_freeing_inode(c, inode, inum); + } else { +- bch2_trans_unlock(trans); +- __wait_on_freeing_inode(c, inode, inum); +- int ret = bch2_trans_relock(trans); ++ int ret = drop_locks_do(trans, ++ (__wait_on_freeing_inode(c, inode, inum), 0)); + if (ret) + return ERR_PTR(ret); + } +@@ -369,9 +359,9 @@ static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btre + + static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inode) + { +- spin_lock(&inode->v.i_lock); +- bool remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); +- spin_unlock(&inode->v.i_lock); ++ bool remove; ++ scoped_guard(spinlock, &inode->v.i_lock) ++ remove = test_and_clear_bit(EI_INODE_HASHED, &inode->ei_flags); + + if (remove) { + int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, +@@ -432,9 +422,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, + + inode_sb_list_add(&inode->v); + +- mutex_lock(&c->vfs_inodes_lock); +- list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); +- mutex_unlock(&c->vfs_inodes_lock); ++ scoped_guard(mutex, &c->vfs_inodes_lock) ++ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + return inode; + } + } +@@ -516,15 +505,14 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) + if (inode) + return &inode->v; + +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + + struct 
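The bch2_inode_hash_find() hunk above folds a manual unlock/relock sequence into drop_locks_do(): judging by the before/after, the macro drops the transaction's btree locks, evaluates the expression, and relocks, propagating whichever fails first — hence the (expr, 0) comma expression for a void call. A sketch of the same shape; the function and its completion argument are illustrative:

#include <linux/completion.h>

/* Sleep with btree locks dropped, then relock before returning: */
static int wait_unlocked(struct btree_trans *trans, struct completion *done)
{
	return drop_locks_do(trans, (wait_for_completion(done), 0));
}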
bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + int ret = lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: +- bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: +- PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); +- bch2_trans_put(trans); ++ bch2_inode_find_by_inum_trans(trans, inum, &inode_u) ?: ++ PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol))); + + return ret ? ERR_PTR(ret) : &inode->v; + } +@@ -536,7 +524,6 @@ __bch2_create(struct mnt_idmap *idmap, + unsigned flags) + { + struct bch_fs *c = dir->v.i_sb->s_fs_info; +- struct btree_trans *trans; + struct bch_inode_unpacked dir_u; + struct bch_inode_info *inode; + struct bch_inode_unpacked inode_u; +@@ -557,18 +544,23 @@ __bch2_create(struct mnt_idmap *idmap, + if (ret) + return ERR_PTR(ret); + #endif ++ + inode = __bch2_new_inode(c, GFP_NOFS); + if (unlikely(!inode)) { +- inode = ERR_PTR(-ENOMEM); +- goto err; ++ posix_acl_release(default_acl); ++ posix_acl_release(acl); ++ return ERR_PTR(-ENOMEM); + } + + bch2_inode_init_early(c, &inode_u); + + if (!(flags & BCH_CREATE_TMPFILE)) + mutex_lock(&dir->ei_update_lock); +- +- trans = bch2_trans_get(c); ++ /* ++ * posix_acl_create() calls get_acl -> btree transaction, don't start ++ * ours until after, ei->update_lock must also be taken first: ++ */ ++ CLASS(btree_trans, trans)(c); + retry: + bch2_trans_begin(trans); + +@@ -627,7 +619,6 @@ __bch2_create(struct mnt_idmap *idmap, + * restart here. + */ + inode = bch2_inode_hash_insert(c, NULL, inode); +- bch2_trans_put(trans); + err: + posix_acl_release(default_acl); + posix_acl_release(acl); +@@ -636,7 +627,6 @@ __bch2_create(struct mnt_idmap *idmap, + if (!(flags & BCH_CREATE_TMPFILE)) + mutex_unlock(&dir->ei_update_lock); + +- bch2_trans_put(trans); + make_bad_inode(&inode->v); + iput(&inode->v); + inode = ERR_PTR(ret); +@@ -651,7 +641,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + subvol_inum inum = {}; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + struct qstr lookup_name; + int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name); +@@ -702,8 +692,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, + if (ret) + goto err; + out: +- bch2_trans_iter_exit(trans, &dirent_iter); +- printbuf_exit(&buf); ++ bch2_trans_iter_exit(&dirent_iter); + return inode; + err: + inode = ERR_PTR(ret); +@@ -724,7 +713,6 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, + if (IS_ERR(inode)) + inode = NULL; + +-#ifdef CONFIG_UNICODE + if (!inode && IS_CASEFOLDED(vdir)) { + /* + * Do not cache a negative dentry in casefolded directories +@@ -739,7 +727,6 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, + */ + return NULL; + } +-#endif + + return d_splice_alias(&inode->v, dentry); + } +@@ -774,8 +761,8 @@ static int __bch2_link(struct bch_fs *c, + struct bch_inode_unpacked dir_u, inode_u; + int ret; + +- mutex_lock(&inode->ei_update_lock); +- struct btree_trans *trans = bch2_trans_get(c); ++ guard(mutex)(&inode->ei_update_lock); ++ CLASS(btree_trans, trans)(c); + + ret = commit_do(trans, NULL, NULL, 0, + bch2_link_trans(trans, +@@ -789,8 +776,6 @@ static int __bch2_link(struct bch_fs *c, + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); + } + +- bch2_trans_put(trans); +- mutex_unlock(&inode->ei_update_lock); + return ret; + } + +@@ 
-825,8 +810,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + int ret; + + bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); +- +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + + ret = commit_do(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, +@@ -853,7 +837,6 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + if (IS_CASEFOLDED(vdir)) + d_invalidate(dentry); + err: +- bch2_trans_put(trans); + bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); + + return ret; +@@ -922,7 +905,6 @@ static int bch2_rename2(struct mnt_idmap *idmap, + struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); + struct bch_inode_unpacked dst_dir_u, src_dir_u; + struct bch_inode_unpacked src_inode_u, dst_inode_u, *whiteout_inode_u; +- struct btree_trans *trans; + enum bch_rename_mode mode = flags & RENAME_EXCHANGE + ? BCH_RENAME_EXCHANGE + : dst_dentry->d_inode +@@ -946,7 +928,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, + src_inode, + dst_inode); + +- trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + + ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?: + bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol); +@@ -1032,8 +1014,6 @@ static int bch2_rename2(struct mnt_idmap *idmap, + bch2_inode_update_after_write(trans, dst_inode, &dst_inode_u, + ATTR_CTIME); + err: +- bch2_trans_put(trans); +- + bch2_fs_quota_transfer(c, src_inode, + bch_qid(&src_inode->ei_inode), + 1 << QTYP_PRJ, +@@ -1101,7 +1081,6 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, + { + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_qid qid; +- struct btree_trans *trans; + struct btree_iter inode_iter = {}; + struct bch_inode_unpacked inode_u; + struct posix_acl *acl = NULL; +@@ -1109,7 +1088,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, + kgid_t kgid; + int ret; + +- mutex_lock(&inode->ei_update_lock); ++ guard(mutex)(&inode->ei_update_lock); + + qid = inode->ei_qid; + +@@ -1126,9 +1105,9 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, + ret = bch2_fs_quota_transfer(c, inode, qid, ~0, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) +- goto err; ++ return ret; + +- trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + retry: + bch2_trans_begin(trans); + kfree(acl); +@@ -1152,23 +1131,18 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc); + btree_err: +- bch2_trans_iter_exit(trans, &inode_iter); ++ bch2_trans_iter_exit(&inode_iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (unlikely(ret)) +- goto err_trans; ++ return ret; + + bch2_inode_update_after_write(trans, inode, &inode_u, attr->ia_valid); + + if (acl) + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); +-err_trans: +- bch2_trans_put(trans); +-err: +- mutex_unlock(&inode->ei_update_lock); +- +- return bch2_err_class(ret); ++ return 0; + } + + static int bch2_getattr(struct mnt_idmap *idmap, +@@ -1232,18 +1206,16 @@ static int bch2_setattr(struct mnt_idmap *idmap, + { + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; +- int ret; + + lockdep_assert_held(&inode->v.i_rwsem); + +- ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: +- setattr_prepare(idmap, dentry, iattr); +- if (ret) +- return ret; ++ int ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: ++ setattr_prepare(idmap, dentry, iattr) ?: ++ (iattr->ia_valid & ATTR_SIZE ++ ? 
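Most of the fs.c churn above is one mechanical transform: bch2_trans_get()/bch2_trans_put() pairs become CLASS(btree_trans, trans)(c), and PRINTBUF/printbuf_exit() pairs become CLASS(printbuf, buf)() — declarations whose destructor runs automatically when the variable leaves scope, which is what lets several error paths shed their goto labels and return directly. A compressed sketch; the function itself is illustrative:

static void log_inode_number(struct bch_fs *c, u64 inum)
{
	CLASS(printbuf, buf)();		/* printbuf_exit() runs at '}' */
	prt_printf(&buf, "inode %llu", inum);
	bch2_print_str(c, KERN_ERR, buf.buf);
}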
bchfs_truncate(idmap, inode, iattr) ++ : bch2_setattr_nonsize(idmap, inode, iattr)); + +- return iattr->ia_valid & ATTR_SIZE +- ? bchfs_truncate(idmap, inode, iattr) +- : bch2_setattr_nonsize(idmap, inode, iattr); ++ return bch2_err_class(ret); + } + + static int bch2_tmpfile(struct mnt_idmap *idmap, +@@ -1323,8 +1295,14 @@ static int bch2_fill_extent(struct bch_fs *c, + flags| + FIEMAP_EXTENT_DELALLOC| + FIEMAP_EXTENT_UNWRITTEN); ++ } else if (k.k->type == KEY_TYPE_error) { ++ return 0; + } else { +- BUG(); ++ WARN_ONCE(1, "unhandled key type %s", ++ k.k->type < KEY_TYPE_MAX ++ ? bch2_bkey_types[k.k->type] ++ : "(unknown)"); ++ return 0; + } + } + +@@ -1419,21 +1397,20 @@ static int bch2_next_fiemap_extent(struct btree_trans *trans, + if (ret) + return ret; + +- struct btree_iter iter; +- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, +- SPOS(inode->ei_inum.inum, start, snapshot), 0); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_extents, ++ SPOS(inode->ei_inum.inum, start, snapshot), 0); + + struct bkey_s_c k = +- bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end)); ++ bch2_btree_iter_peek_max(&iter, POS(inode->ei_inum.inum, end)); + ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end; + + ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur); + if (ret) +- goto err; ++ return ret; + + struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k); + +@@ -1469,7 +1446,7 @@ static int bch2_next_fiemap_extent(struct btree_trans *trans, + ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, + &cur->kbuf); + if (ret) +- goto err; ++ return ret; + + struct bkey_i *k = cur->kbuf.k; + sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent); +@@ -1481,9 +1458,8 @@ static int bch2_next_fiemap_extent(struct btree_trans *trans, + k->k.p = iter.pos; + k->k.p.offset += k->k.size; + } +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ ++ return 0; + } + + static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, +@@ -1491,7 +1467,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + { + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(vinode); +- struct btree_trans *trans; + struct bch_fiemap_extent cur, prev; + int ret = 0; + +@@ -1509,7 +1484,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + bch2_bkey_buf_init(&prev.kbuf); + bkey_init(&prev.kbuf.k->k); + +- trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + + while (start < end) { + ret = lockrestart_do(trans, +@@ -1542,7 +1517,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + ret = bch2_fill_extent(c, info, &prev); + } + err: +- bch2_trans_put(trans); + bch2_bkey_buf_exit(&cur.kbuf, c); + bch2_bkey_buf_exit(&prev.kbuf, c); + +@@ -1575,11 +1549,12 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) + { + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + + if (!dir_emit_dots(file, ctx)) + return 0; + +- int ret = bch2_readdir(c, inode_inum(inode), ctx); ++ int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx); + + bch_err_fn(c, ret); + return bch2_err_class(ret); +@@ -1695,11 +1670,15 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap, + + s.mask = 
map_defined(bch_flags_to_xflags); + s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags); +- if (fa->fsx_xflags) +- return -EOPNOTSUPP; ++ if (fa->fsx_xflags) { ++ ret = bch_err_throw(c, unsupported_fsx_flag); ++ goto err; ++ } + +- if (fa->fsx_projid >= U32_MAX) +- return -EINVAL; ++ if (fa->fsx_projid >= U32_MAX) { ++ ret = bch_err_throw(c, projid_too_big); ++ goto err; ++ } + + /* + * inode fields accessible via the xattr interface are stored with a +1 +@@ -1721,8 +1700,10 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap, + fa->flags &= ~FS_CASEFOLD_FL; + + s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags); +- if (fa->flags) +- return -EOPNOTSUPP; ++ if (fa->flags) { ++ ret = bch_err_throw(c, unsupported_fa_flag); ++ goto err; ++ } + } + + mutex_lock(&inode->ei_update_lock); +@@ -1733,7 +1714,8 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap, + bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, + ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); +- return ret; ++err: ++ return bch2_err_class(ret); + } + + static const struct file_operations bch_file_operations = { +@@ -1964,9 +1946,6 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_inode_info *dir = to_bch_ei(parent->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; +- struct btree_trans *trans; +- struct btree_iter iter1; +- struct btree_iter iter2; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct bch_inode_unpacked inode_u; +@@ -1979,12 +1958,11 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child + if (!S_ISDIR(dir->v.i_mode)) + return -EINVAL; + +- trans = bch2_trans_get(c); +- +- bch2_trans_iter_init(trans, &iter1, BTREE_ID_dirents, +- POS(dir->ei_inode.bi_inum, 0), 0); +- bch2_trans_iter_init(trans, &iter2, BTREE_ID_dirents, +- POS(dir->ei_inode.bi_inum, 0), 0); ++ CLASS(btree_trans, trans)(c); ++ CLASS(btree_iter, iter1)(trans, BTREE_ID_dirents, ++ POS(dir->ei_inode.bi_inum, 0), 0); ++ CLASS(btree_iter, iter2)(trans, BTREE_ID_dirents, ++ POS(dir->ei_inode.bi_inum, 0), 0); + retry: + bch2_trans_begin(trans); + +@@ -1992,30 +1970,30 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child + if (ret) + goto err; + +- bch2_btree_iter_set_snapshot(trans, &iter1, snapshot); +- bch2_btree_iter_set_snapshot(trans, &iter2, snapshot); ++ bch2_btree_iter_set_snapshot(&iter1, snapshot); ++ bch2_btree_iter_set_snapshot(&iter2, snapshot); + + ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); + if (ret) + goto err; + + if (inode_u.bi_dir == dir->ei_inode.bi_inum) { +- bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); ++ bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); + +- k = bch2_btree_iter_peek_slot(trans, &iter1); ++ k = bch2_btree_iter_peek_slot(&iter1); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_dirent) { +- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; ++ ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); + goto err; + } + + d = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); + if (ret > 0) +- ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; ++ ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); + if (ret) + goto err; + +@@ -2026,7 +2004,7 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child + * File with multiple 
hardlinks and our backref is to the wrong + * directory - linear search: + */ +- for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) { ++ for_each_btree_key_continue_norestart(iter2, 0, k, ret) { + if (k.k->p.inode > dir->ei_inode.bi_inum) + break; + +@@ -2057,10 +2035,6 @@ static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + +- bch2_trans_iter_exit(trans, &iter1); +- bch2_trans_iter_exit(trans, &iter2); +- bch2_trans_put(trans); +- + return ret; + } + +@@ -2144,12 +2118,11 @@ static int bch2_vfs_write_inode(struct inode *vinode, + { + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *inode = to_bch_ei(vinode); +- int ret; + +- mutex_lock(&inode->ei_update_lock); +- ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, +- ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); +- mutex_unlock(&inode->ei_update_lock); ++ guard(mutex)(&inode->ei_update_lock); ++ ++ int ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); + + return bch2_err_class(ret); + } +@@ -2181,7 +2154,13 @@ static void bch2_evict_inode(struct inode *vinode) + KEY_TYPE_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +- bch2_inode_rm(c, inode_inum(inode)); ++ int ret = bch2_inode_rm(c, inode_inum(inode)); ++ if (ret && !bch2_err_matches(ret, EROFS)) { ++ bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu", ++ inode->ei_inum.subvol, ++ inode->ei_inum.inum); ++ bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm); ++ } + + /* + * If we are deleting, we need it present in the vfs hash table +@@ -2190,9 +2169,8 @@ static void bch2_evict_inode(struct inode *vinode) + bch2_inode_hash_remove(c, inode); + } + +- mutex_lock(&c->vfs_inodes_lock); +- list_del_init(&inode->ei_vfs_inode_list); +- mutex_unlock(&c->vfs_inodes_lock); ++ scoped_guard(mutex, &c->vfs_inodes_lock) ++ list_del_init(&inode->ei_vfs_inode_list); + } + + void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) +@@ -2328,7 +2306,8 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) + struct bch_fs *c = root->d_sb->s_fs_info; + bool first = true; + +- for_each_online_member(c, ca) { ++ guard(rcu)(); ++ for_each_online_member_rcu(c, ca) { + if (!first) + seq_putc(seq, ':'); + first = false; +@@ -2341,16 +2320,14 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) + static int bch2_show_options(struct seq_file *seq, struct dentry *root) + { + struct bch_fs *c = root->d_sb->s_fs_info; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb, + OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE); + printbuf_nul_terminate(&buf); + seq_printf(seq, ",%s", buf.buf); + +- int ret = buf.allocation_failure ? -ENOMEM : 0; +- printbuf_exit(&buf); +- return ret; ++ return buf.allocation_failure ? 
-ENOMEM : 0; + } + + static void bch2_put_super(struct super_block *sb) +@@ -2372,24 +2349,20 @@ static int bch2_freeze(struct super_block *sb) + { + struct bch_fs *c = sb->s_fs_info; + +- down_write(&c->state_lock); ++ guard(rwsem_write)(&c->state_lock); + bch2_fs_read_only(c); +- up_write(&c->state_lock); + return 0; + } + + static int bch2_unfreeze(struct super_block *sb) + { + struct bch_fs *c = sb->s_fs_info; +- int ret; + + if (test_bit(BCH_FS_emergency_ro, &c->flags)) + return 0; + +- down_write(&c->state_lock); +- ret = bch2_fs_read_write(c); +- up_write(&c->state_lock); +- return ret; ++ guard(rwsem_write)(&c->state_lock); ++ return bch2_fs_read_write(c); + } + + static const struct super_operations bch_super_operations = { +@@ -2440,7 +2413,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) + struct inode *vinode; + struct bch2_opts_parse *opts_parse = fc->fs_private; + struct bch_opts opts = opts_parse->opts; +- darray_str devs; ++ darray_const_str devs; + darray_fs devs_to_fs = {}; + int ret; + +@@ -2464,7 +2437,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) + if (!IS_ERR(sb)) + goto got_sb; + +- c = bch2_fs_open(devs.data, devs.nr, opts); ++ c = bch2_fs_open(&devs, &opts); + ret = PTR_ERR_OR_ZERO(c); + if (ret) + goto err; +@@ -2484,6 +2457,14 @@ static int bch2_fs_get_tree(struct fs_context *fc) + if (ret) + goto err_stop_fs; + ++ /* ++ * We might be doing a RO mount because other options required it, or we ++ * have no alloc info and it's a small image with no room to regenerate ++ * it ++ */ ++ if (c->opts.read_only) ++ fc->sb_flags |= SB_RDONLY; ++ + sb = sget(fc->fs_type, NULL, bch2_set_super, fc->sb_flags|SB_NOSEC, c); + ret = PTR_ERR_OR_ZERO(sb); + if (ret) +@@ -2514,7 +2495,12 @@ static int bch2_fs_get_tree(struct fs_context *fc) + sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; + sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); +- super_set_sysfs_name_uuid(sb); ++ ++ if (c->sb.multi_device) ++ super_set_sysfs_name_uuid(sb); ++ else ++ strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name)); ++ + sb->s_shrink->seeks = 0; + c->vfs_sb = sb; + strscpy(sb->s_id, c->name, sizeof(sb->s_id)); +@@ -2525,14 +2511,15 @@ static int bch2_fs_get_tree(struct fs_context *fc) + + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; + +- for_each_online_member(c, ca) { +- struct block_device *bdev = ca->disk_sb.bdev; ++ scoped_guard(rcu) { ++ for_each_online_member_rcu(c, ca) { ++ struct block_device *bdev = ca->disk_sb.bdev; + +- /* XXX: create an anonymous device for multi device filesystems */ +- sb->s_bdev = bdev; +- sb->s_dev = bdev->bd_dev; +- percpu_ref_put(&ca->io_ref[READ]); +- break; ++ /* XXX: create an anonymous device for multi device filesystems */ ++ sb->s_bdev = bdev; ++ sb->s_dev = bdev->bd_dev; ++ break; ++ } + } + + c->dev = sb->s_dev; +@@ -2544,10 +2531,11 @@ static int bch2_fs_get_tree(struct fs_context *fc) + + sb->s_shrink->seeks = 0; + +-#ifdef CONFIG_UNICODE +- sb->s_encoding = c->cf_encoding; +-#endif ++#if IS_ENABLED(CONFIG_UNICODE) ++ if (!bch2_fs_casefold_enabled(c)) ++ sb->s_encoding = c->cf_encoding; + generic_set_sb_d_ops(sb); ++#endif + + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); + ret = PTR_ERR_OR_ZERO(vinode); +@@ -2645,7 +2633,7 @@ static int bch2_fs_reconfigure(struct fs_context *fc) + opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); + + if (opts->opts.read_only != c->opts.read_only) { +- down_write(&c->state_lock); 
++ guard(rwsem_write)(&c->state_lock);
+
+ if (opts->opts.read_only) {
+ bch2_fs_read_only(c);
+@@ -2655,22 +2643,18 @@ static int bch2_fs_reconfigure(struct fs_context *fc)
+ ret = bch2_fs_read_write(c);
+ if (ret) {
+ bch_err(c, "error going rw: %i", ret);
+- up_write(&c->state_lock);
+- ret = -EINVAL;
+- goto err;
++ return -EINVAL;
+ }
+
+ sb->s_flags &= ~SB_RDONLY;
+ }
+
+ c->opts.read_only = opts->opts.read_only;
+-
+- up_write(&c->state_lock);
+ }
+
+ if (opt_defined(opts->opts, errors))
+ c->opts.errors = opts->opts.errors;
+-err:
++
+ return bch2_err_class(ret);
+ }
+
+diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
+index aaf187085276..6ccea09243ab 100644
+--- a/fs/bcachefs/fsck.c
++++ b/fs/bcachefs/fsck.c
+@@ -12,8 +12,10 @@
+ #include "fs.h"
+ #include "fsck.h"
+ #include "inode.h"
++ #include "io_misc.h"
+ #include "keylist.h"
+ #include "namei.h"
++ #include "progress.h"
+ #include "recovery_passes.h"
+ #include "snapshot.h"
+ #include "super.h"
+@@ -23,14 +25,15 @@
+ #include <linux/bsearch.h>
+ #include <linux/dcache.h> /* struct qstr */
+
+-static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
++static int dirent_points_to_inode_nowarn(struct bch_fs *c,
++ struct bkey_s_c_dirent d,
+ struct bch_inode_unpacked *inode)
+ {
+ if (d.v->d_type == DT_SUBVOL
+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol
+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum)
+ return 0;
+- return -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
++ return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode);
+ }
+
+ static void dirent_inode_mismatch_msg(struct printbuf *out,
+@@ -49,12 +52,11 @@ static int dirent_points_to_inode(struct bch_fs *c,
+ struct bkey_s_c_dirent dirent,
+ struct bch_inode_unpacked *inode)
+ {
+- int ret = dirent_points_to_inode_nowarn(dirent, inode);
++ int ret = dirent_points_to_inode_nowarn(c, dirent, inode);
+ if (ret) {
+- struct printbuf buf = PRINTBUF;
++ CLASS(printbuf, buf)();
+ dirent_inode_mismatch_msg(&buf, c, dirent, inode);
+ bch_warn(c, "%s", buf.buf);
+- printbuf_exit(&buf);
+ }
+ return ret;
+ }
+@@ -109,27 +111,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+ return ret;
+ }
+
+-static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
+- struct bch_inode_unpacked *inode)
+-{
+- struct btree_iter iter;
+- struct bkey_s_c k;
+- int ret;
+-
+- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+- SPOS(0, inode_nr, snapshot), 0);
+- ret = bkey_err(k);
+- if (ret)
+- goto err;
+-
+- ret = bkey_is_inode(k.k)
+- ? 
bch2_inode_unpack(k, inode) +- : -BCH_ERR_ENOENT_inode; +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; +-} +- + static int lookup_dirent_in_snapshot(struct btree_trans *trans, + struct bch_hash_info hash_info, + subvol_inum dir, struct qstr *name, +@@ -145,7 +126,7 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + *target = le64_to_cpu(d.v->d_inum); + *type = d.v->d_type; +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return 0; + } + +@@ -156,7 +137,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, + static int find_snapshot_tree_subvol(struct btree_trans *trans, + u32 tree_id, u32 *subvol) + { +- struct btree_iter iter; + struct bkey_s_c k; + int ret; + +@@ -170,13 +150,11 @@ static int find_snapshot_tree_subvol(struct btree_trans *trans, + + if (s.v->subvol) { + *subvol = le32_to_cpu(s.v->subvol); +- goto found; ++ return 0; + } + } +- ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol; +-found: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ ++ return ret ?: bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol); + } + + /* Get lost+found, create if it doesn't exist: */ +@@ -186,7 +164,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + { + struct bch_fs *c = trans->c; + struct qstr lostfound_str = QSTR("lost+found"); +- struct btree_iter lostfound_iter = {}; ++ struct btree_iter lostfound_iter = { NULL }; + u64 inum = 0; + unsigned d_type = 0; + int ret; +@@ -212,8 +190,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + return ret; + + if (!subvol.inode) { +- struct btree_iter iter; +- struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter, ++ struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, + BTREE_ID_subvolumes, POS(0, subvolid), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(subvol); +@@ -221,7 +198,6 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + return ret; + + subvol->v.inode = cpu_to_le64(reattaching_inum); +- bch2_trans_iter_exit(trans, &iter); + } + + subvol_inum root_inum = { +@@ -231,7 +207,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + + struct bch_inode_unpacked root_inode; + struct bch_hash_info root_hash_info; +- ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); ++ ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0); + bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", + root_inum.inum, subvolid); + if (ret) +@@ -250,14 +226,14 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + + if (d_type != DT_DIR) { + bch_err(c, "error looking up lost+found: not a directory"); +- return -BCH_ERR_ENOENT_not_directory; ++ return bch_err_throw(c, ENOENT_not_directory); + } + + /* + * The bch2_check_dirents pass has already run, dangling dirents + * shouldn't exist here: + */ +- ret = lookup_inode(trans, inum, snapshot, lostfound); ++ ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0); + bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", + inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); + return ret; +@@ -272,20 +248,19 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + * XXX: we could have a nicer log message here if we had a nice way to + * walk backpointers to print a path + */ +- struct printbuf path = PRINTBUF; ++ CLASS(printbuf, 
path)(); + ret = bch2_inum_to_path(trans, root_inum, &path); + if (ret) + goto err; + + bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u", + path.buf, root_inum.subvol, snapshot); +- printbuf_exit(&path); + + u64 now = bch2_current_time(c); + u64 cpu = raw_smp_processor_id(); + + bch2_inode_init_early(c, lostfound); +- bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); ++ bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); + lostfound->bi_dir = root_inode.bi_inum; + lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot); + +@@ -295,8 +270,8 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + if (ret) + goto err; + +- bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot); +- ret = bch2_btree_iter_traverse(trans, &lostfound_iter); ++ bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); ++ ret = bch2_btree_iter_traverse(&lostfound_iter); + if (ret) + goto err; + +@@ -312,7 +287,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + BTREE_UPDATE_internal_snapshot_node); + err: + bch_err_msg(c, ret, "creating lost+found"); +- bch2_trans_iter_exit(trans, &lostfound_iter); ++ bch2_trans_iter_exit(&lostfound_iter); + return ret; + } + +@@ -347,16 +322,17 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) + (inode->bi_flags & BCH_INODE_has_child_snapshot)) + return false; + +- return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); ++ return !bch2_inode_has_backpointer(inode) && ++ !(inode->bi_flags & BCH_INODE_unlinked); + } + + static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot) + { +- struct btree_iter iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents, +- SPOS(d_pos.inode, d_pos.offset, snapshot), +- BTREE_ITER_intent| +- BTREE_ITER_with_updates); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_dirents, ++ SPOS(d_pos.inode, d_pos.offset, snapshot), ++ BTREE_ITER_intent| ++ BTREE_ITER_with_updates); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + return ret; +@@ -369,16 +345,15 @@ static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (ret) +- goto err; ++ return ret; + + bkey_init(&k->k); + k->k.type = KEY_TYPE_whiteout; + k->k.p = iter.pos; +- ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); ++ return bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node); + } +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ ++ return 0; + } + + static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) +@@ -392,6 +367,16 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * + if (inode->bi_subvol) { + inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; + ++ struct bkey_i_subvolume *subvol = ++ bch2_bkey_get_mut_typed(trans, ++ BTREE_ID_subvolumes, POS(0, inode->bi_subvol), ++ 0, subvolume); ++ ret = PTR_ERR_OR_ZERO(subvol); ++ if (ret) ++ return ret; ++ ++ subvol->v.fs_path_parent = BCACHEFS_ROOT_SUBVOL; ++ + u64 root_inum; + ret = subvol_lookup(trans, inode->bi_parent_subvol, + &dirent_snapshot, &root_inum); +@@ -407,6 +392,8 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * + if (ret) + return ret; + ++ bch_verbose(c, "got lostfound inum %llu", lostfound.bi_inum); ++ + 
lostfound.bi_nlink += S_ISDIR(inode->bi_mode); + + /* ensure lost+found inode is also present in inode snapshot */ +@@ -443,14 +430,23 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * + if (ret) + return ret; + ++ { ++ CLASS(printbuf, buf)(); ++ ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, ++ inode->bi_snapshot, NULL, &buf); ++ if (ret) ++ return ret; ++ ++ bch_info(c, "reattached at %s", buf.buf); ++ } ++ + /* + * Fix up inodes in child snapshots: if they should also be reattached + * update the backpointer field, if they should not be we need to emit + * whiteouts for the dirent we just created. + */ + if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) { +- snapshot_id_list whiteouts_done; +- struct btree_iter iter; ++ CLASS(snapshot_id_list, whiteouts_done)(); + struct bkey_s_c k; + + darray_init(&whiteouts_done); +@@ -469,19 +465,16 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * + struct bch_inode_unpacked child_inode; + ret = bch2_inode_unpack(k, &child_inode); + if (ret) +- break; ++ return ret; + + if (!inode_should_reattach(&child_inode)) { +- ret = maybe_delete_dirent(trans, +- SPOS(lostfound.bi_inum, inode->bi_dir_offset, +- dirent_snapshot), +- k.k->p.snapshot); +- if (ret) +- break; +- +- ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); ++ ret = maybe_delete_dirent(trans, ++ SPOS(lostfound.bi_inum, inode->bi_dir_offset, ++ dirent_snapshot), ++ k.k->p.snapshot) ?: ++ snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot); + if (ret) +- break; ++ return ret; + } else { + iter.snapshot = k.k->p.snapshot; + child_inode.bi_dir = inode->bi_dir; +@@ -490,11 +483,9 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * + ret = bch2_inode_write_flags(trans, &iter, &child_inode, + BTREE_UPDATE_internal_snapshot_node); + if (ret) +- break; ++ return ret; + } + } +- darray_exit(&whiteouts_done); +- bch2_trans_iter_exit(trans, &iter); + } + + return ret; +@@ -504,23 +495,35 @@ static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) + { +- return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); ++ bch2_trans_iter_init(trans, iter, BTREE_ID_dirents, pos, 0); ++ struct bkey_s_c_dirent d = bch2_bkey_get_typed(iter, dirent); ++ if (bkey_err(d.s_c)) ++ bch2_trans_iter_exit(iter); ++ return d; + } + + static int remove_backpointer(struct btree_trans *trans, + struct bch_inode_unpacked *inode) + { +- if (!inode->bi_dir) ++ if (!bch2_inode_has_backpointer(inode)) + return 0; + ++ u32 snapshot = inode->bi_snapshot; ++ ++ if (inode->bi_parent_subvol) { ++ int ret = bch2_subvolume_get_snapshot(trans, inode->bi_parent_subvol, &snapshot); ++ if (ret) ++ return ret; ++ } ++ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, +- SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); ++ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); + int ret = bkey_err(d) ?: + dirent_points_to_inode(c, d, inode) ?: + bch2_fsck_remove_dirent(trans, d.k->p); +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -552,7 +555,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub + + if (!bch2_snapshot_is_leaf(c, snapshotid)) { + bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); +- return 
-BCH_ERR_fsck_repair_unimplemented; ++ return bch_err_throw(c, fsck_repair_unimplemented); + } + + /* +@@ -566,14 +569,14 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub + u64 cpu = raw_smp_processor_id(); + + bch2_inode_init_early(c, &new_inode); +- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); ++ bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); + + new_inode.bi_subvol = subvolid; + + int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: +- bch2_btree_iter_traverse(trans, &inode_iter) ?: ++ bch2_btree_iter_traverse(&inode_iter) ?: + bch2_inode_write(trans, &inode_iter, &new_inode); +- bch2_trans_iter_exit(trans, &inode_iter); ++ bch2_trans_iter_exit(&inode_iter); + if (ret) + return ret; + +@@ -595,8 +598,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub + if (ret) + return ret; + +- struct btree_iter iter; +- struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter, ++ struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, + BTREE_ID_snapshots, POS(0, snapshotid), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(s); +@@ -608,9 +610,8 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub + + s->v.subvol = cpu_to_le32(subvolid); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, true); +- bch2_trans_iter_exit(trans, &iter); + +- struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter, ++ struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, + BTREE_ID_snapshot_trees, POS(0, snapshot_tree), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(st); +@@ -620,8 +621,6 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub + + if (!st->v.master_subvol) + st->v.master_subvol = cpu_to_le32(subvolid); +- +- bch2_trans_iter_exit(trans, &iter); + return 0; + } + +@@ -633,11 +632,8 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 + + switch (btree) { + case BTREE_ID_extents: { +- struct btree_iter iter = {}; +- +- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); +- struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0)); +- bch2_trans_iter_exit(trans, &iter); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); ++ struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0)); + int ret = bkey_err(k); + if (ret) + return ret; +@@ -656,7 +652,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 + + struct bch_inode_unpacked new_inode; + bch2_inode_init_early(c, &new_inode); +- bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); ++ bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); + new_inode.bi_size = i_size; + new_inode.bi_inum = inum; + new_inode.bi_snapshot = snapshot; +@@ -664,21 +660,20 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 + return __bch2_fsck_write_inode(trans, &new_inode); + } + +-struct snapshots_seen { +- struct bpos pos; +- snapshot_id_list ids; +-}; +- + static inline void snapshots_seen_exit(struct snapshots_seen *s) + { + darray_exit(&s->ids); + } + +-static inline void snapshots_seen_init(struct snapshots_seen *s) ++static inline struct snapshots_seen snapshots_seen_init(void) + { +- memset(s, 0, sizeof(*s)); ++ return (struct snapshots_seen) {}; + } + 
++DEFINE_CLASS(snapshots_seen, struct snapshots_seen, ++ snapshots_seen_exit(&_T), ++ snapshots_seen_init(), void) ++ + static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) + { + u32 *i; +@@ -720,14 +715,8 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, + static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, + u32 id, u32 ancestor) + { +- ssize_t i; +- + EBUG_ON(id > ancestor); + +- /* @ancestor should be the snapshot most recently added to @seen */ +- EBUG_ON(ancestor != seen->pos.snapshot); +- EBUG_ON(ancestor != darray_last(seen->ids)); +- + if (id == ancestor) + return true; + +@@ -743,11 +732,8 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see + * numerically, since snapshot ID lists are kept sorted, so if we find + * an id that's an ancestor of @id we're done: + */ +- +- for (i = seen->ids.nr - 2; +- i >= 0 && seen->ids.data[i] >= id; +- --i) +- if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i])) ++ darray_for_each_reverse(seen->ids, i) ++ if (*i != ancestor && bch2_snapshot_is_ancestor(c, id, *i)) + return false; + + return true; +@@ -787,12 +773,12 @@ static int ref_visible2(struct bch_fs *c, + + #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ + for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ +- (_i)->snapshot <= (_snapshot); _i++) \ +- if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) ++ (_i)->inode.bi_snapshot <= (_snapshot); _i++) \ ++ if (key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot)) + + struct inode_walker_entry { + struct bch_inode_unpacked inode; +- u32 snapshot; ++ bool whiteout; + u64 count; + u64 i_size; + }; +@@ -815,26 +801,36 @@ static void inode_walker_exit(struct inode_walker *w) + + static struct inode_walker inode_walker_init(void) + { +- return (struct inode_walker) { 0, }; ++ return (struct inode_walker) {}; + } + ++DEFINE_CLASS(inode_walker, struct inode_walker, ++ inode_walker_exit(&_T), ++ inode_walker_init(), void) ++ + static int add_inode(struct bch_fs *c, struct inode_walker *w, + struct bkey_s_c inode) + { +- struct bch_inode_unpacked u; +- +- return bch2_inode_unpack(inode, &u) ?: +- darray_push(&w->inodes, ((struct inode_walker_entry) { +- .inode = u, +- .snapshot = inode.k->p.snapshot, ++ int ret = darray_push(&w->inodes, ((struct inode_walker_entry) { ++ .whiteout = !bkey_is_inode(inode.k), + })); ++ if (ret) ++ return ret; ++ ++ struct inode_walker_entry *n = &darray_last(w->inodes); ++ if (!n->whiteout) { ++ return bch2_inode_unpack(inode, &n->inode); ++ } else { ++ n->inode.bi_inum = inode.k->p.offset; ++ n->inode.bi_snapshot = inode.k->p.snapshot; ++ return 0; ++ } + } + + static int get_inodes_all_snapshots(struct btree_trans *trans, + struct inode_walker *w, u64 inum) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; + struct bkey_s_c k; + int ret; + +@@ -847,15 +843,13 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, + w->recalculate_sums = false; + w->inodes.nr = 0; + +- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), +- BTREE_ITER_all_snapshots, k, ret) { +- if (k.k->p.offset != inum) ++ for_each_btree_key_max_norestart(trans, iter, ++ BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX), ++ BTREE_ITER_all_snapshots, k, ret) { ++ ret = add_inode(c, w, k); ++ if (ret) + break; +- +- if (bkey_is_inode(k.k)) +- add_inode(c, w, k); + } +- bch2_trans_iter_exit(trans, &iter); + + if (ret) 
+ return ret; +@@ -865,48 +859,104 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, + return 0; + } + +-static struct inode_walker_entry * +-lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) ++static int get_visible_inodes(struct btree_trans *trans, ++ struct inode_walker *w, ++ struct snapshots_seen *s, ++ u64 inum) + { +- bool is_whiteout = k.k->type == KEY_TYPE_whiteout; ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ int ret; + +- struct inode_walker_entry *i; +- __darray_for_each(w->inodes, i) +- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) +- goto found; ++ w->inodes.nr = 0; ++ w->deletes.nr = 0; + +- return NULL; +-found: +- BUG_ON(k.k->p.snapshot > i->snapshot); ++ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), ++ BTREE_ITER_all_snapshots, k, ret) { ++ if (k.k->p.offset != inum) ++ break; ++ ++ if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) ++ continue; ++ ++ if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) ++ continue; ++ ++ ret = bkey_is_inode(k.k) ++ ? add_inode(c, w, k) ++ : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); ++ if (ret) ++ break; ++ } + +- if (k.k->p.snapshot != i->snapshot && !is_whiteout) { +- struct inode_walker_entry new = *i; ++ return ret; ++} + +- new.snapshot = k.k->p.snapshot; +- new.count = 0; +- new.i_size = 0; ++static struct inode_walker_entry * ++lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ ++ struct inode_walker_entry *i = darray_find_p(w->inodes, i, ++ bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)); ++ ++ if (!i) ++ return NULL; + +- struct printbuf buf = PRINTBUF; +- bch2_bkey_val_to_text(&buf, c, k); ++ CLASS(printbuf, buf)(); ++ int ret = 0; + +- bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" ++ if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot, ++ trans, snapshot_key_missing_inode_snapshot, ++ "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" + "unexpected because we should always update the inode when we update a key in that inode\n" + "%s", +- w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); +- printbuf_exit(&buf); ++ w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, ++ (bch2_bkey_val_to_text(&buf, c, k), ++ buf.buf))) { ++ if (!i->whiteout) { ++ struct bch_inode_unpacked new = i->inode; ++ new.bi_snapshot = k.k->p.snapshot; ++ ret = __bch2_fsck_write_inode(trans, &new); ++ } else { ++ struct bkey_i whiteout; ++ bkey_init(&whiteout.k); ++ whiteout.k.type = KEY_TYPE_whiteout; ++ whiteout.k.p = SPOS(0, i->inode.bi_inum, k.k->p.snapshot); ++ ret = bch2_btree_insert_trans(trans, BTREE_ID_inodes, ++ &whiteout, ++ BTREE_ITER_cached| ++ BTREE_UPDATE_internal_snapshot_node); ++ } ++ ++ if (ret) ++ goto fsck_err; ++ ++ ret = bch2_trans_commit(trans, NULL, NULL, 0); ++ if (ret) ++ goto fsck_err; ++ ++ struct inode_walker_entry new_entry = *i; ++ ++ new_entry.inode.bi_snapshot = k.k->p.snapshot; ++ new_entry.count = 0; ++ new_entry.i_size = 0; + +- while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) ++ while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot) + --i; + + size_t pos = i - w->inodes.data; +- int ret = darray_insert_item(&w->inodes, pos, new); ++ ret = darray_insert_item(&w->inodes, pos, new_entry); + if (ret) +- return ERR_PTR(ret); ++ goto fsck_err; + +- i = 
w->inodes.data + pos; ++ ret = bch_err_throw(c, transaction_restart_nested); ++ goto fsck_err; + } + + return i; ++fsck_err: ++ return ERR_PTR(ret); + } + + static struct inode_walker_entry *walk_inode(struct btree_trans *trans, +@@ -921,42 +971,7 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, + + w->last_pos = k.k->p; + +- return lookup_inode_for_snapshot(trans->c, w, k); +-} +- +-static int get_visible_inodes(struct btree_trans *trans, +- struct inode_walker *w, +- struct snapshots_seen *s, +- u64 inum) +-{ +- struct bch_fs *c = trans->c; +- struct btree_iter iter; +- struct bkey_s_c k; +- int ret; +- +- w->inodes.nr = 0; +- w->deletes.nr = 0; +- +- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), +- BTREE_ITER_all_snapshots, k, ret) { +- if (k.k->p.offset != inum) +- break; +- +- if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) +- continue; +- +- if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) +- continue; +- +- ret = bkey_is_inode(k.k) +- ? add_inode(c, w, k) +- : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); +- if (ret) +- break; +- } +- bch2_trans_iter_exit(trans, &iter); +- +- return ret; ++ return lookup_inode_for_snapshot(trans, w, k); + } + + /* +@@ -974,26 +989,25 @@ int bch2_fsck_update_backpointers(struct btree_trans *trans, + return 0; + + struct bkey_i_dirent *d = bkey_i_to_dirent(new); +- struct inode_walker target = inode_walker_init(); +- int ret = 0; ++ CLASS(inode_walker, target)(); + + if (d->v.d_type == DT_SUBVOL) { +- BUG(); ++ bch_err(trans->c, "%s does not support DT_SUBVOL", __func__); ++ return bch_err_throw(trans->c, fsck_repair_unimplemented); + } else { +- ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); ++ int ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); + if (ret) +- goto err; ++ return ret; + + darray_for_each(target.inodes, i) { + i->inode.bi_dir_offset = d->k.p.offset; + ret = __bch2_fsck_write_inode(trans, &i->inode); + if (ret) +- goto err; ++ return ret; + } ++ ++ return 0; + } +-err: +- inode_walker_exit(&target); +- return ret; + } + + static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, +@@ -1013,11 +1027,9 @@ static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, + + static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) + { +- struct btree_iter iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); +- int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ CLASS(btree_iter, iter)(trans, BTREE_ID_deleted_inodes, p, 0); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); ++ return bkey_err(k) ?: k.k->type == KEY_TYPE_set; + } + + static int check_inode_dirent_inode(struct btree_trans *trans, +@@ -1025,7 +1037,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, + bool *write_inode) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + u32 inode_snapshot = inode->bi_snapshot; + struct btree_iter dirent_iter = {}; +@@ -1034,7 +1046,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + +- if ((ret || dirent_points_to_inode_nowarn(d, inode)) && ++ if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) && + inode->bi_subvol && + (inode->bi_flags & BCH_INODE_has_child_snapshot)) { + /* Older version of a 
renamed subvolume root: we won't have a +@@ -1055,7 +1067,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, + trans, inode_points_to_missing_dirent, + "inode points to missing dirent\n%s", + (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || +- fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode), ++ fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode), + trans, inode_points_to_wrong_dirent, + "%s", + (printbuf_reset(&buf), +@@ -1074,38 +1086,11 @@ static int check_inode_dirent_inode(struct btree_trans *trans, + out: + ret = 0; + fsck_err: +- bch2_trans_iter_exit(trans, &dirent_iter); +- printbuf_exit(&buf); ++ bch2_trans_iter_exit(&dirent_iter); + bch_err_fn(c, ret); + return ret; + } + +-static int get_snapshot_root_inode(struct btree_trans *trans, +- struct bch_inode_unpacked *root, +- u64 inum) +-{ +- struct btree_iter iter; +- struct bkey_s_c k; +- int ret = 0; +- +- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, +- SPOS(0, inum, U32_MAX), +- BTREE_ITER_all_snapshots, k, ret) { +- if (k.k->p.offset != inum) +- break; +- if (bkey_is_inode(k.k)) +- goto found_root; +- } +- if (ret) +- goto err; +- BUG(); +-found_root: +- ret = bch2_inode_unpack(k, root); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; +-} +- + static int check_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, +@@ -1113,7 +1098,7 @@ static int check_inode(struct btree_trans *trans, + struct snapshots_seen *s) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + struct bch_inode_unpacked u; + bool do_update = false; + int ret; +@@ -1136,27 +1121,31 @@ static int check_inode(struct btree_trans *trans, + goto err; + + if (snapshot_root->bi_inum != u.bi_inum) { +- ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); ++ ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root); + if (ret) + goto err; + } + +- if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || +- INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), +- trans, inode_snapshot_mismatch, +- "inode hash info in different snapshots don't match")) { +- u.bi_hash_seed = snapshot_root->bi_hash_seed; +- SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); +- do_update = true; ++ if (u.bi_hash_seed != snapshot_root->bi_hash_seed || ++ INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) { ++ ret = bch2_repair_inode_hash_info(trans, snapshot_root); ++ BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented); ++ if (ret) ++ goto err; + } + +- if (u.bi_dir || u.bi_dir_offset) { ++ ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update); ++ if (ret) ++ goto err; ++ ++ if (bch2_inode_has_backpointer(&u)) { + ret = check_inode_dirent_inode(trans, &u, &do_update); + if (ret) + goto err; + } + +- if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked), ++ if (fsck_err_on(bch2_inode_has_backpointer(&u) && ++ (u.bi_flags & BCH_INODE_unlinked), + trans, inode_unlinked_but_has_dirent, + "inode unlinked but has dirent\n%s", + (printbuf_reset(&buf), +@@ -1183,6 +1172,14 @@ static int check_inode(struct btree_trans *trans, + ret = 0; + } + ++ if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size, ++ trans, inode_dir_has_nonzero_i_size, ++ "directory %llu:%u with nonzero i_size %lli", ++ u.bi_inum, u.bi_snapshot, u.bi_size)) { ++ u.bi_size = 0; ++ do_update = true; ++ } ++ + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) + goto err; +@@ -1217,7 +1214,7 @@ static int 
check_inode(struct btree_trans *trans, + */ + ret = check_inode_deleted_list(trans, k.k->p); + if (ret < 0) +- goto err_noprint; ++ return ret; + + fsck_err_on(!ret, + trans, unlinked_inode_not_on_deleted_list, +@@ -1238,7 +1235,7 @@ static int check_inode(struct btree_trans *trans, + u.bi_inum, u.bi_snapshot)) { + ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck deleting inode"); +- goto err_noprint; ++ return ret; + } + ret = 0; + } +@@ -1299,40 +1296,37 @@ static int check_inode(struct btree_trans *trans, + ret = __bch2_fsck_write_inode(trans, &u); + bch_err_msg(c, ret, "in fsck updating inode"); + if (ret) +- goto err_noprint; ++ return ret; + } + err: + fsck_err: + bch_err_fn(c, ret); +-err_noprint: +- printbuf_exit(&buf); + return ret; + } + + int bch2_check_inodes(struct bch_fs *c) + { + struct bch_inode_unpacked snapshot_root = {}; +- struct snapshots_seen s; + +- snapshots_seen_init(&s); ++ CLASS(btree_trans, trans)(c); ++ CLASS(snapshots_seen, s)(); ++ ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_inodes)); + +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, ++ return for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, +- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- check_inode(trans, &iter, k, &snapshot_root, &s))); +- +- snapshots_seen_exit(&s); +- bch_err_fn(c, ret); +- return ret; ++ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ progress_update_iter(trans, &progress, &iter); ++ check_inode(trans, &iter, k, &snapshot_root, &s); ++ })); + } + + static int find_oldest_inode_needs_reattach(struct btree_trans *trans, + struct bch_inode_unpacked *inode) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + +@@ -1364,7 +1358,6 @@ static int find_oldest_inode_needs_reattach(struct btree_trans *trans, + + *inode = parent_inode; + } +- bch2_trans_iter_exit(trans, &iter); + + return ret; + } +@@ -1373,7 +1366,7 @@ static int check_unreachable_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) + { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + if (!bkey_is_inode(k.k)) +@@ -1397,7 +1390,6 @@ static int check_unreachable_inode(struct btree_trans *trans, + buf.buf))) + ret = reattach_inode(trans, &inode); + fsck_err: +- printbuf_exit(&buf); + return ret; + } + +@@ -1413,14 +1405,17 @@ static int check_unreachable_inode(struct btree_trans *trans, + */ + int bch2_check_unreachable_inodes(struct bch_fs *c) + { +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_inodes)); ++ ++ CLASS(btree_trans, trans)(c); ++ return for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, +- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- check_unreachable_inode(trans, &iter, k))); +- bch_err_fn(c, ret); +- return ret; ++ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ progress_update_iter(trans, &progress, &iter); ++ check_unreachable_inode(trans, &iter, k); ++ })); + } + + static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode) +@@ -1444,48 +1439,155 @@ static int check_key_has_inode(struct btree_trans *trans, + struct bkey_s_c k) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = 
PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = PTR_ERR_OR_ZERO(i); + if (ret) + return ret; + + if (k.k->type == KEY_TYPE_whiteout) +- goto out; ++ return 0; + +- if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { +- ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: +- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); +- if (ret) +- goto err; ++ bool have_inode = i && !i->whiteout; + +- inode->last_pos.inode--; +- ret = -BCH_ERR_transaction_restart_nested; +- goto err; ++ if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) ++ goto reconstruct; ++ ++ if (have_inode && btree_matches_i_mode(iter->btree_id, i->inode.bi_mode)) ++ return 0; ++ ++ prt_printf(&buf, ", "); ++ ++ bool have_old_inode = false; ++ darray_for_each(inode->inodes, i2) ++ if (!i2->whiteout && ++ bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i2->inode.bi_snapshot) && ++ btree_matches_i_mode(iter->btree_id, i2->inode.bi_mode)) { ++ prt_printf(&buf, "but found good inode in older snapshot\n"); ++ bch2_inode_unpacked_to_text(&buf, &i2->inode); ++ prt_newline(&buf); ++ have_old_inode = true; ++ break; ++ } ++ ++ struct bkey_s_c k2; ++ unsigned nr_keys = 0; ++ ++ prt_printf(&buf, "found keys:\n"); ++ ++ for_each_btree_key_max_norestart(trans, iter2, iter->btree_id, ++ SPOS(k.k->p.inode, 0, k.k->p.snapshot), ++ POS(k.k->p.inode, U64_MAX), ++ 0, k2, ret) { ++ if (k.k->type == KEY_TYPE_error || ++ k.k->type == KEY_TYPE_hash_whiteout) ++ continue; ++ ++ nr_keys++; ++ if (nr_keys <= 10) { ++ bch2_bkey_val_to_text(&buf, c, k2); ++ prt_newline(&buf); ++ } ++ if (nr_keys >= 100) ++ break; + } + +- if (fsck_err_on(!i, +- trans, key_in_missing_inode, +- "key in missing inode:\n%s", +- (printbuf_reset(&buf), +- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) +- goto delete; ++ if (ret) ++ goto err; + +- if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), +- trans, key_in_wrong_inode_type, +- "key for wrong inode mode %o:\n%s", +- i->inode.bi_mode, +- (printbuf_reset(&buf), +- bch2_bkey_val_to_text(&buf, c, k), buf.buf))) +- goto delete; ++ unsigned reconstruct_limit = iter->btree_id == BTREE_ID_extents ? 3 : 0; ++ ++ if (nr_keys > 100) ++ prt_printf(&buf, "found > %u keys for this missing inode\n", nr_keys); ++ else if (nr_keys > reconstruct_limit) ++ prt_printf(&buf, "found %u keys for this missing inode\n", nr_keys); ++ ++ if (!have_inode) { ++ if (fsck_err_on(!have_inode, ++ trans, key_in_missing_inode, ++ "key in missing inode%s", buf.buf)) { ++ /* ++ * Maybe a deletion that raced with data move, or something ++ * weird like that? But if we know the inode was deleted, or ++ * it's just a few keys, we can safely delete them. ++ * ++ * If it's many keys, we should probably recreate the inode ++ */ ++ if (have_old_inode || nr_keys <= 2) ++ goto delete; ++ else ++ goto reconstruct; ++ } ++ } else { ++ /* ++ * not autofix, this one would be a giant wtf - bit error in the ++ * inode corrupting i_mode? 
++ * ++ * may want to try repairing inode instead of deleting ++ */ ++ if (fsck_err_on(!btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), ++ trans, key_in_wrong_inode_type, ++ "key for wrong inode mode %o%s", ++ i->inode.bi_mode, buf.buf)) ++ goto delete; ++ } + out: + err: + fsck_err: +- printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; + delete: ++ /* ++ * XXX: print out more info ++ * count up extents for this inode, check if we have different inode in ++ * an older snapshot version, perhaps decide if we want to reconstitute ++ */ + ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); + goto out; ++reconstruct: ++ ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: ++ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); ++ if (ret) ++ goto err; ++ ++ inode->last_pos.inode--; ++ ret = bch_err_throw(c, transaction_restart_nested); ++ goto out; ++} ++ ++static int maybe_reconstruct_inum_btree(struct btree_trans *trans, ++ u64 inum, u32 snapshot, ++ enum btree_id btree) ++{ ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ for_each_btree_key_max_norestart(trans, iter, btree, ++ SPOS(inum, 0, snapshot), ++ POS(inum, U64_MAX), ++ 0, k, ret) { ++ ret = 1; ++ break; ++ } ++ ++ if (ret <= 0) ++ return ret; ++ ++ if (fsck_err(trans, missing_inode_with_contents, ++ "inode %llu:%u type %s missing, but contents found: reconstruct?", ++ inum, snapshot, ++ btree == BTREE_ID_extents ? "reg" : "dir")) ++ return reconstruct_inode(trans, btree, snapshot, inum) ?: ++ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: ++ bch_err_throw(trans->c, transaction_restart_commit); ++fsck_err: ++ return ret; ++} ++ ++static int maybe_reconstruct_inum(struct btree_trans *trans, ++ u64 inum, u32 snapshot) ++{ ++ return maybe_reconstruct_inum_btree(trans, inum, snapshot, BTREE_ID_extents) ?: ++ maybe_reconstruct_inum_btree(trans, inum, snapshot, BTREE_ID_dirents); + } + + static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w) +@@ -1498,22 +1600,28 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal + if (i->inode.bi_sectors == i->count) + continue; + +- count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); ++ CLASS(printbuf, buf)(); ++ lockrestart_do(trans, ++ bch2_inum_snapshot_to_path(trans, ++ i->inode.bi_inum, ++ i->inode.bi_snapshot, NULL, &buf)); ++ ++ count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); + + if (w->recalculate_sums) + i->count = count2; + + if (i->count != count2) { +- bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", +- w->last_pos.inode, i->snapshot, i->count, count2); ++ bch_err_ratelimited(c, "fsck counted i_sectors wrong: got %llu should be %llu\n%s", ++ i->count, count2, buf.buf); + i->count = count2; + } + +- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), ++ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty) && ++ i->inode.bi_sectors != i->count, + trans, inode_i_sectors_wrong, +- "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", +- w->last_pos.inode, i->snapshot, +- i->inode.bi_sectors, i->count)) { ++ "incorrect i_sectors: got %llu, should be %llu\n%s", ++ i->inode.bi_sectors, i->count, buf.buf)) { + i->inode.bi_sectors = i->count; + ret = bch2_fsck_write_inode(trans, &i->inode); + if (ret) +@@ -1556,11 +1664,15 @@ static void extent_ends_exit(struct extent_ends *extent_ends) + 
darray_exit(&extent_ends->e); + } + +-static void extent_ends_init(struct extent_ends *extent_ends) ++static struct extent_ends extent_ends_init(void) + { +- memset(extent_ends, 0, sizeof(*extent_ends)); ++ return (struct extent_ends) {}; + } + ++DEFINE_CLASS(extent_ends, struct extent_ends, ++ extent_ends_exit(&_T), ++ extent_ends_init(), void) ++ + static int extent_ends_at(struct bch_fs *c, + struct extent_ends *extent_ends, + struct snapshots_seen *seen, +@@ -1576,7 +1688,7 @@ static int extent_ends_at(struct bch_fs *c, + sizeof(seen->ids.data[0]) * seen->ids.size, + GFP_KERNEL); + if (!n.seen.ids.data) +- return -BCH_ERR_ENOMEM_fsck_extent_ends_at; ++ return bch_err_throw(c, ENOMEM_fsck_extent_ends_at); + + __darray_for_each(extent_ends->e, i) { + if (i->snapshot == k.k->p.snapshot) { +@@ -1600,17 +1712,17 @@ static int overlapping_extents_found(struct btree_trans *trans, + struct extent_end *extent_end) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; +- struct btree_iter iter1, iter2 = {}; ++ CLASS(printbuf, buf)(); ++ struct btree_iter iter2 = {}; + struct bkey_s_c k1, k2; + int ret; + + BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); + +- bch2_trans_iter_init(trans, &iter1, btree, pos1, +- BTREE_ITER_all_snapshots| +- BTREE_ITER_not_extents); +- k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX)); ++ CLASS(btree_iter, iter1)(trans, btree, pos1, ++ BTREE_ITER_all_snapshots| ++ BTREE_ITER_not_extents); ++ k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); + ret = bkey_err(k1); + if (ret) + goto err; +@@ -1626,16 +1738,16 @@ static int overlapping_extents_found(struct btree_trans *trans, + + bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", + __func__, buf.buf); +- ret = -BCH_ERR_internal_fsck_err; ++ ret = bch_err_throw(c, internal_fsck_err); + goto err; + } + +- bch2_trans_copy_iter(trans, &iter2, &iter1); ++ bch2_trans_copy_iter(&iter2, &iter1); + + while (1) { +- bch2_btree_iter_advance(trans, &iter2); ++ bch2_btree_iter_advance(&iter2); + +- k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX)); ++ k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); + ret = bkey_err(k2); + if (ret) + goto err; +@@ -1651,7 +1763,7 @@ static int overlapping_extents_found(struct btree_trans *trans, + pos2.size != k2.k->size) { + bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", + __func__, buf.buf); +- ret = -BCH_ERR_internal_fsck_err; ++ ret = bch_err_throw(c, internal_fsck_err); + goto err; + } + +@@ -1699,14 +1811,12 @@ static int overlapping_extents_found(struct btree_trans *trans, + * We overwrote the second extent - restart + * check_extent() from the top: + */ +- ret = -BCH_ERR_transaction_restart_nested; ++ ret = bch_err_throw(c, transaction_restart_nested); + } + } + fsck_err: + err: +- bch2_trans_iter_exit(trans, &iter2); +- bch2_trans_iter_exit(trans, &iter1); +- printbuf_exit(&buf); ++ bch2_trans_iter_exit(&iter2); + return ret; + } + +@@ -1763,16 +1873,16 @@ static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *it + bkey_for_each_crc(k.k, ptrs, crc, i) + if (crc_is_encoded(crc) && + crc.uncompressed_size > encoded_extent_max_sectors) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "overbig encoded extent, please report this:\n %s", buf.buf); +- printbuf_exit(&buf); + } + + return 0; + } + ++noinline_for_stack + static int check_extent(struct btree_trans 
*trans, struct btree_iter *iter, + struct bkey_s_c k, + struct inode_walker *inode, +@@ -1781,7 +1891,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct disk_reservation *res) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + ret = bch2_check_key_has_snapshot(trans, iter, k); +@@ -1823,24 +1933,24 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); + inode->inodes.data && i >= inode->inodes.data; + --i) { +- if (i->snapshot > k.k->p.snapshot || +- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) ++ if (i->inode.bi_snapshot > k.k->p.snapshot || ++ !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) + continue; + +- if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && ++ u64 last_block = round_up(i->inode.bi_size, block_bytes(c)) >> 9; ++ ++ if (fsck_err_on(k.k->p.offset > last_block && + !bkey_extent_is_reservation(k), + trans, extent_past_end_of_inode, + "extent type past end of inode %llu:%u, i_size %llu\n%s", +- i->inode.bi_inum, i->snapshot, i->inode.bi_size, ++ i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { +- struct btree_iter iter2; +- +- bch2_trans_copy_iter(trans, &iter2, iter); +- bch2_btree_iter_set_snapshot(trans, &iter2, i->snapshot); +- ret = bch2_btree_iter_traverse(trans, &iter2) ?: +- bch2_btree_delete_at(trans, &iter2, +- BTREE_UPDATE_internal_snapshot_node); +- bch2_trans_iter_exit(trans, &iter2); ++ ret = snapshots_seen_add_inorder(c, s, i->inode.bi_snapshot) ?: ++ bch2_fpunch_snapshot(trans, ++ SPOS(i->inode.bi_inum, ++ last_block, ++ i->inode.bi_snapshot), ++ POS(i->inode.bi_inum, U64_MAX)); + if (ret) + goto err; + +@@ -1850,6 +1960,10 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + } + } + ++ ret = check_extent_overbig(trans, iter, k); ++ if (ret) ++ goto err; ++ + ret = bch2_trans_commit(trans, res, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; +@@ -1858,8 +1972,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); + inode->inodes.data && i >= inode->inodes.data; + --i) { +- if (i->snapshot > k.k->p.snapshot || +- !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) ++ if (i->whiteout || ++ i->inode.bi_snapshot > k.k->p.snapshot || ++ !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) + continue; + + i->count += k.k->size; +@@ -1874,7 +1989,6 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + out: + err: + fsck_err: +- printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; + } +@@ -1885,49 +1999,48 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + */ + int bch2_check_extents(struct bch_fs *c) + { +- struct inode_walker w = inode_walker_init(); +- struct snapshots_seen s; +- struct extent_ends extent_ends; + struct disk_reservation res = { 0 }; + +- snapshots_seen_init(&s); +- extent_ends_init(&extent_ends); ++ CLASS(btree_trans, trans)(c); ++ CLASS(snapshots_seen, s)(); ++ CLASS(inode_walker, w)(); ++ CLASS(extent_ends, extent_ends)(); ++ ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents)); + +- int ret = bch2_trans_run(c, +- 
for_each_btree_key(trans, iter, BTREE_ID_extents, ++ int ret = for_each_btree_key(trans, iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ ++ progress_update_iter(trans, &progress, &iter); + bch2_disk_reservation_put(c, &res); +- check_extent(trans, &iter, k, &w, &s, &extent_ends, &res) ?: +- check_extent_overbig(trans, &iter, k); ++ check_extent(trans, &iter, k, &w, &s, &extent_ends, &res); + })) ?: +- check_i_sectors_notnested(trans, &w)); ++ check_i_sectors_notnested(trans, &w); + + bch2_disk_reservation_put(c, &res); +- extent_ends_exit(&extent_ends); +- inode_walker_exit(&w); +- snapshots_seen_exit(&s); +- +- bch_err_fn(c, ret); + return ret; + } + + int bch2_check_indirect_extents(struct bch_fs *c) + { ++ CLASS(btree_trans, trans)(c); + struct disk_reservation res = { 0 }; + +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_reflink)); ++ ++ int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink, + POS_MIN, + BTREE_ITER_prefetch, k, + &res, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ ++ progress_update_iter(trans, &progress, &iter); + bch2_disk_reservation_put(c, &res); + check_extent_overbig(trans, &iter, k); +- }))); ++ })); + + bch2_disk_reservation_put(c, &res); +- bch_err_fn(c, ret); + return ret; + } + +@@ -1941,26 +2054,34 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ + if (i->inode.bi_nlink == i->count) + continue; + +- count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); ++ count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot); + if (count2 < 0) + return count2; + + if (i->count != count2) { + bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", +- w->last_pos.inode, i->snapshot, i->count, count2); ++ w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); + i->count = count2; + if (i->inode.bi_nlink == i->count) + continue; + } + +- if (fsck_err_on(i->inode.bi_nlink != i->count, +- trans, inode_dir_wrong_nlink, +- "directory %llu:%u with wrong i_nlink: got %u, should be %llu", +- w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { +- i->inode.bi_nlink = i->count; +- ret = bch2_fsck_write_inode(trans, &i->inode); +- if (ret) +- break; ++ if (i->inode.bi_nlink != i->count) { ++ CLASS(printbuf, buf)(); ++ ++ lockrestart_do(trans, ++ bch2_inum_snapshot_to_path(trans, w->last_pos.inode, ++ i->inode.bi_snapshot, NULL, &buf)); ++ ++ if (fsck_err_on(i->inode.bi_nlink != i->count, ++ trans, inode_dir_wrong_nlink, ++ "directory with wrong i_nlink: got %u, should be %llu\n%s", ++ i->inode.bi_nlink, i->count, buf.buf)) { ++ i->inode.bi_nlink = i->count; ++ ret = bch2_fsck_write_inode(trans, &i->inode); ++ if (ret) ++ break; ++ } + } + } + fsck_err: +@@ -1978,7 +2099,6 @@ static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_wa + /* find a subvolume that's a descendent of @snapshot: */ + static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) + { +- struct btree_iter iter; + struct bkey_s_c k; + int ret; + +@@ -1988,16 +2108,13 @@ static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *su + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { +- bch2_trans_iter_exit(trans, &iter); ++ 
bch2_trans_iter_exit(&iter); + *subvolid = k.k->p.offset; +- goto found; ++ return 0; + } + } +- if (!ret) +- ret = -ENOENT; +-found: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ ++ return ret ?: -ENOENT; + } + + noinline_for_stack +@@ -2012,7 +2129,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * + u32 parent_snapshot; + u32 new_parent_subvol = 0; + u64 parent_inum; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); +@@ -2051,24 +2168,22 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + if (!new_parent_subvol) { + bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); +- return -BCH_ERR_fsck_repair_unimplemented; ++ return bch_err_throw(c, fsck_repair_unimplemented); + } + + struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); + ret = PTR_ERR_OR_ZERO(new_dirent); + if (ret) +- goto err; ++ return ret; + + new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); + } + +- struct bkey_s_c_subvolume s = +- bch2_bkey_get_iter_typed(trans, &subvol_iter, +- BTREE_ID_subvolumes, POS(0, target_subvol), +- 0, subvolume); ++ bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes, POS(0, target_subvol), 0); ++ struct bkey_s_c_subvolume s = bch2_bkey_get_typed(&subvol_iter, subvolume); + ret = bkey_err(s.s_c); + if (ret && !bch2_err_matches(ret, ENOENT)) +- return ret; ++ goto err; + + if (ret) { + if (fsck_err(trans, dirent_to_missing_subvol, +@@ -2079,30 +2194,41 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * + goto out; + } + +- if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, +- trans, subvol_fs_path_parent_wrong, +- "subvol with wrong fs_path_parent, should be be %u\n%s", +- parent_subvol, +- (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { +- struct bkey_i_subvolume *n = +- bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); +- ret = PTR_ERR_OR_ZERO(n); ++ if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) { ++ printbuf_reset(&buf); ++ ++ prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n", ++ parent_subvol); ++ ++ ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset, ++ le64_to_cpu(s.v->inode) }, &buf); + if (ret) + goto err; ++ prt_newline(&buf); ++ bch2_bkey_val_to_text(&buf, c, s.s_c); + +- n->v.fs_path_parent = cpu_to_le32(parent_subvol); ++ if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) { ++ struct bkey_i_subvolume *n = ++ bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto err; ++ ++ n->v.fs_path_parent = cpu_to_le32(parent_subvol); ++ } + } + + u64 target_inum = le64_to_cpu(s.v->inode); + u32 target_snapshot = le32_to_cpu(s.v->snapshot); + +- ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root); ++ ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot, ++ &subvol_root, 0); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (ret) { + bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); +- ret = -BCH_ERR_fsck_repair_unimplemented; ++ ret = bch_err_throw(c, fsck_repair_unimplemented); + goto err; + } + +@@ -2124,8 +2250,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * + 
out: + err: + fsck_err: +- bch2_trans_iter_exit(trans, &subvol_iter); +- printbuf_exit(&buf); ++ bch2_trans_iter_exit(&subvol_iter); + return ret; + } + +@@ -2134,59 +2259,57 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bch_hash_info *hash_info, + struct inode_walker *dir, + struct inode_walker *target, +- struct snapshots_seen *s) ++ struct snapshots_seen *s, ++ bool *need_second_pass) + { + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + ret = bch2_check_key_has_snapshot(trans, iter, k); +- if (ret) { +- ret = ret < 0 ? ret : 0; +- goto out; +- } ++ if (ret) ++ return ret < 0 ? ret : 0; + + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) +- goto err; ++ return ret; + + if (k.k->type == KEY_TYPE_whiteout) +- goto out; ++ return 0; + + if (dir->last_pos.inode != k.k->p.inode && dir->have_inodes) { + ret = check_subdir_dirents_count(trans, dir); + if (ret) +- goto err; ++ return ret; + } + + i = walk_inode(trans, dir, k); + ret = PTR_ERR_OR_ZERO(i); +- if (ret < 0) +- goto err; ++ if (ret) ++ return ret; + + ret = check_key_has_inode(trans, iter, dir, i, k); + if (ret) +- goto err; ++ return ret; + +- if (!i) +- goto out; ++ if (!i || i->whiteout) ++ return 0; + + if (dir->first_this_inode) + *hash_info = bch2_hash_info_init(c, &i->inode); + dir->first_this_inode = false; + +- ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); +- if (ret < 0) +- goto err; +- if (ret) { +- /* dirent has been deleted */ +- ret = 0; +- goto out; +- } ++ hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? c->cf_encoding : NULL; + ++ ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, ++ iter, k, need_second_pass); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ return 0; /* dirent has been deleted */ + if (k.k->type != KEY_TYPE_dirent) +- goto out; ++ return 0; + + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + +@@ -2197,42 +2320,51 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { +- struct qstr name = bch2_dirent_get_name(d); +- u32 subvol = d.v->d_type == DT_SUBVOL +- ? d.v->d_parent_subvol +- : 0; ++ subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL ++ ? le32_to_cpu(d.v->d_parent_subvol) ++ : 0, ++ }; + u64 target = d.v->d_type == DT_SUBVOL +- ? d.v->d_child_subvol +- : d.v->d_inum; +- u64 dir_offset; ++ ? 
le32_to_cpu(d.v->d_child_subvol) ++ : le64_to_cpu(d.v->d_inum); ++ struct qstr name = bch2_dirent_get_name(d); + +- ret = bch2_hash_delete_at(trans, ++ struct bkey_i_dirent *new_d = ++ bch2_dirent_create_key(trans, hash_info, dir_inum, ++ d.v->d_type, &name, NULL, target); ++ ret = PTR_ERR_OR_ZERO(new_d); ++ if (ret) ++ return ret; ++ ++ new_d->k.p.inode = d.k->p.inode; ++ new_d->k.p.snapshot = d.k->p.snapshot; ++ ++ struct btree_iter dup_iter = {}; ++ return bch2_hash_delete_at(trans, + bch2_dirent_hash_desc, hash_info, iter, + BTREE_UPDATE_internal_snapshot_node) ?: +- bch2_dirent_create_snapshot(trans, subvol, +- d.k->p.inode, d.k->p.snapshot, +- hash_info, +- d.v->d_type, +- &name, +- target, +- &dir_offset, +- BTREE_ITER_with_updates| +- BTREE_UPDATE_internal_snapshot_node| +- STR_HASH_must_create) ?: +- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); +- +- /* might need another check_dirents pass */ +- goto out; ++ bch2_str_hash_repair_key(trans, s, ++ &bch2_dirent_hash_desc, hash_info, ++ iter, bkey_i_to_s_c(&new_d->k_i), ++ &dup_iter, bkey_s_c_null, ++ need_second_pass); + } + + if (d.v->d_type == DT_SUBVOL) { + ret = check_dirent_to_subvol(trans, iter, d); + if (ret) +- goto err; ++ return ret; + } else { + ret = get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + if (ret) +- goto err; ++ return ret; ++ ++ if (!target->inodes.nr) { ++ ret = maybe_reconstruct_inum(trans, le64_to_cpu(d.v->d_inum), ++ d.k->p.snapshot); ++ if (ret) ++ return ret; ++ } + + if (fsck_err_on(!target->inodes.nr, + trans, dirent_to_missing_inode, +@@ -2242,13 +2374,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + buf.buf))) { + ret = bch2_fsck_remove_dirent(trans, d.k->p); + if (ret) +- goto err; ++ return ret; + } + + darray_for_each(target->inodes, i) { + ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); + if (ret) +- goto err; ++ return ret; + } + + darray_for_each(target->deletes, i) +@@ -2259,37 +2391,37 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { +- struct btree_iter delete_iter; +- bch2_trans_iter_init(trans, &delete_iter, ++ CLASS(btree_iter, delete_iter)(trans, + BTREE_ID_dirents, + SPOS(k.k->p.inode, k.k->p.offset, *i), + BTREE_ITER_intent); +- ret = bch2_btree_iter_traverse(trans, &delete_iter) ?: ++ ret = bch2_btree_iter_traverse(&delete_iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + hash_info, + &delete_iter, + BTREE_UPDATE_internal_snapshot_node); +- bch2_trans_iter_exit(trans, &delete_iter); + if (ret) +- goto err; ++ return ret; + + } + } + ++ /* ++ * Cannot access key values after doing a transaction commit without ++ * revalidating: ++ */ ++ bool have_dir = d.v->d_type == DT_DIR; ++ + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) +- goto err; ++ return ret; + + for_each_visible_inode(c, s, dir, d.k->p.snapshot, i) { +- if (d.v->d_type == DT_DIR) ++ if (have_dir) + i->count++; + i->i_size += bkey_bytes(d.k); + } +-out: +-err: + fsck_err: +- printbuf_exit(&buf); +- bch_err_fn(c, ret); + return ret; + } + +@@ -2299,24 +2431,38 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + */ + int bch2_check_dirents(struct bch_fs *c) + { +- struct inode_walker dir = inode_walker_init(); +- struct inode_walker target = inode_walker_init(); +- struct snapshots_seen s; + struct bch_hash_info hash_info; ++ CLASS(btree_trans, trans)(c); 
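/*
 * bch2_check_dirents() is now a two-pass driver: bch2_str_hash_repair_key()
 * may re-insert a repaired dirent at its correct hash position, possibly
 * behind the scan iterator, so a single forward pass is not guaranteed to
 * see the tree in its final state. When any repair sets need_second_pass,
 * the function rewinds and scans once more; if the second pass still asks
 * for another, it gives up with -EINVAL instead of looping. The control
 * flow, reduced to a sketch with illustrative names:
 *
 *	bool need_second_pass = false, did_second_pass = false;
 *	int ret;
 * again:
 *	ret = scan_and_repair_dirents(&need_second_pass);
 *	if (!ret && need_second_pass && !did_second_pass) {
 *		swap(did_second_pass, need_second_pass);
 *		goto again;
 *	}
 *	if (!ret && need_second_pass)
 *		ret = -EINVAL;		(repairs failed to converge)
 */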
++ CLASS(snapshots_seen, s)(); ++ CLASS(inode_walker, dir)(); ++ CLASS(inode_walker, target)(); ++ struct progress_indicator_state progress; ++ bool need_second_pass = false, did_second_pass = false; ++ int ret; ++again: ++ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_dirents)); + +- snapshots_seen_init(&s); +- +- int ret = bch2_trans_run(c, +- for_each_btree_key(trans, iter, BTREE_ID_dirents, ++ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, +- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: +- check_subdir_count_notnested(trans, &dir)); ++ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ progress_update_iter(trans, &progress, &iter); ++ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s, ++ &need_second_pass); ++ })) ?: ++ check_subdir_count_notnested(trans, &dir); ++ ++ if (!ret && need_second_pass && !did_second_pass) { ++ bch_info(c, "check_dirents requires second pass"); ++ swap(did_second_pass, need_second_pass); ++ goto again; ++ } ++ ++ if (!ret && need_second_pass) { ++ bch_err(c, "dirents not repairing"); ++ ret = -EINVAL; ++ } + +- snapshots_seen_exit(&s); +- inode_walker_exit(&dir); +- inode_walker_exit(&target); +- bch_err_fn(c, ret); + return ret; + } + +@@ -2326,16 +2472,14 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, + struct inode_walker *inode) + { + struct bch_fs *c = trans->c; +- struct inode_walker_entry *i; +- int ret; + +- ret = bch2_check_key_has_snapshot(trans, iter, k); ++ int ret = bch2_check_key_has_snapshot(trans, iter, k); + if (ret < 0) + return ret; + if (ret) + return 0; + +- i = walk_inode(trans, inode, k); ++ struct inode_walker_entry *i = walk_inode(trans, inode, k); + ret = PTR_ERR_OR_ZERO(i); + if (ret) + return ret; +@@ -2344,16 +2488,16 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, + if (ret) + return ret; + +- if (!i) ++ if (!i || i->whiteout) + return 0; + + if (inode->first_this_inode) + *hash_info = bch2_hash_info_init(c, &i->inode); + inode->first_this_inode = false; + +- ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); +- bch_err_fn(c, ret); +- return ret; ++ bool need_second_pass = false; ++ return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, ++ iter, k, &need_second_pass); + } + + /* +@@ -2361,21 +2505,22 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, + */ + int bch2_check_xattrs(struct bch_fs *c) + { +- struct inode_walker inode = inode_walker_init(); + struct bch_hash_info hash_info; +- int ret = 0; ++ CLASS(btree_trans, trans)(c); ++ CLASS(inode_walker, inode)(); + +- ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_xattrs)); ++ ++ int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, + k, + NULL, NULL, +- BCH_TRANS_COMMIT_no_enospc, +- check_xattr(trans, &iter, k, &hash_info, &inode))); +- +- inode_walker_exit(&inode); +- bch_err_fn(c, ret); ++ BCH_TRANS_COMMIT_no_enospc, ({ ++ progress_update_iter(trans, &progress, &iter); ++ check_xattr(trans, &iter, k, &hash_info, &inode); ++ })); + return ret; + } + +@@ -2413,7 +2558,8 @@ static int check_root_trans(struct btree_trans *trans) + goto err; + } + +- ret = lookup_inode(trans, 
BCACHEFS_ROOT_INO, snapshot, &root_inode); ++ ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot, ++ &root_inode, 0); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + +@@ -2439,37 +2585,32 @@ static int check_root_trans(struct btree_trans *trans) + /* Get root directory, create if it doesn't exist: */ + int bch2_check_root(struct bch_fs *c) + { +- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- check_root_trans(trans)); +- bch_err_fn(c, ret); +- return ret; +-} +- +-typedef DARRAY(u32) darray_u32; +- +-static bool darray_u32_has(darray_u32 *d, u32 v) +-{ +- darray_for_each(*d, i) +- if (*i == v) +- return true; +- return false; ++ CLASS(btree_trans, trans)(c); ++ return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ++ check_root_trans(trans)); + } + + static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) + { + struct bch_fs *c = trans->c; +- struct btree_iter parent_iter = {}; +- darray_u32 subvol_path = {}; +- struct printbuf buf = PRINTBUF; ++ CLASS(darray_u32, subvol_path)(); ++ CLASS(printbuf, buf)(); + int ret = 0; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + ++ CLASS(btree_iter, parent_iter)(trans, BTREE_ID_subvolumes, POS_MIN, 0); ++ ++ subvol_inum start = { ++ .subvol = k.k->p.offset, ++ .inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode), ++ }; ++ + while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { + ret = darray_push(&subvol_path, k.k->p.offset); + if (ret) +- goto err; ++ return ret; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + +@@ -2482,87 +2623,85 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, + + u32 parent = le32_to_cpu(s.v->fs_path_parent); + +- if (darray_u32_has(&subvol_path, parent)) { +- if (fsck_err(trans, subvol_loop, "subvolume loop")) ++ if (darray_find(subvol_path, parent)) { ++ printbuf_reset(&buf); ++ prt_printf(&buf, "subvolume loop: "); ++ ++ ret = bch2_inum_to_path(trans, start, &buf); ++ if (ret) ++ return ret; ++ ++ if (fsck_err(trans, subvol_loop, "%s", buf.buf)) + ret = reattach_subvol(trans, s); + break; + } + +- bch2_trans_iter_exit(trans, &parent_iter); +- bch2_trans_iter_init(trans, &parent_iter, +- BTREE_ID_subvolumes, POS(0, parent), 0); +- k = bch2_btree_iter_peek_slot(trans, &parent_iter); ++ bch2_btree_iter_set_pos(&parent_iter, POS(0, parent)); ++ k = bch2_btree_iter_peek_slot(&parent_iter); + ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, + trans, subvol_unreachable, + "unreachable subvolume %s", +- (bch2_bkey_val_to_text(&buf, c, s.s_c), ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, s.s_c), + buf.buf))) { +- ret = reattach_subvol(trans, s); +- break; ++ return reattach_subvol(trans, s); + } + } + fsck_err: +-err: +- printbuf_exit(&buf); +- darray_exit(&subvol_path); +- bch2_trans_iter_exit(trans, &parent_iter); + return ret; + } + + int bch2_check_subvolume_structure(struct bch_fs *c) + { +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, +- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, +- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- check_subvol_path(trans, &iter, k))); +- bch_err_fn(c, ret); +- return ret; +-} ++ CLASS(btree_trans, trans)(c); + +-struct pathbuf_entry { +- u64 inum; +- u32 snapshot; +-}; ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_subvolumes)); + +-typedef DARRAY(struct pathbuf_entry) 
pathbuf; ++ return for_each_btree_key_commit(trans, iter, ++ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, ++ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ progress_update_iter(trans, &progress, &iter); ++ check_subvol_path(trans, &iter, k); ++ })); ++} + +-static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p, ++static int bch2_bi_depth_renumber_one(struct btree_trans *trans, ++ u64 inum, u32 snapshot, + u32 new_depth) + { +- struct btree_iter iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, +- SPOS(0, p->inum, p->snapshot), 0); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, SPOS(0, inum, snapshot), 0); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + + struct bch_inode_unpacked inode; + int ret = bkey_err(k) ?: + !bkey_is_inode(k.k) ? -BCH_ERR_ENOENT_inode + : bch2_inode_unpack(k, &inode); + if (ret) +- goto err; ++ return ret; + + if (inode.bi_depth != new_depth) { + inode.bi_depth = new_depth; +- ret = __bch2_fsck_write_inode(trans, &inode) ?: +- bch2_trans_commit(trans, NULL, NULL, 0); ++ return __bch2_fsck_write_inode(trans, &inode) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); + } +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ ++ return 0; + } + +-static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth) ++static int bch2_bi_depth_renumber(struct btree_trans *trans, darray_u64 *path, ++ u32 snapshot, u32 new_bi_depth) + { + u32 restart_count = trans->restart_count; + int ret = 0; + + darray_for_each_reverse(*path, i) { + ret = nested_lockrestart_do(trans, +- bch2_bi_depth_renumber_one(trans, i, new_bi_depth)); ++ bch2_bi_depth_renumber_one(trans, *i, snapshot, new_bi_depth)); + bch_err_fn(trans->c, ret); + if (ret) + break; +@@ -2573,43 +2712,43 @@ static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 + return ret ?: trans_was_restarted(trans, restart_count); + } + +-static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) +-{ +- darray_for_each(*p, i) +- if (i->inum == inum && +- i->snapshot == snapshot) +- return true; +- return false; +-} +- + static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) + { + struct bch_fs *c = trans->c; +- struct btree_iter inode_iter = {}; +- pathbuf path = {}; +- struct printbuf buf = PRINTBUF; ++ CLASS(darray_u64, path)(); ++ CLASS(printbuf, buf)(); + u32 snapshot = inode_k.k->p.snapshot; + bool redo_bi_depth = false; + u32 min_bi_depth = U32_MAX; + int ret = 0; + ++ struct bpos start = inode_k.k->p; ++ + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(inode_k, &inode); + if (ret) + return ret; + +- while (!inode.bi_subvol) { ++ CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes, POS_MIN, 0); ++ ++ /* ++ * If we're running full fsck, check_dirents() will have already ran, ++ * and we shouldn't see any missing backpointers here - otherwise that's ++ * handled separately, by check_unreachable_inodes ++ */ ++ while (!inode.bi_subvol && ++ bch2_inode_has_backpointer(&inode)) { + struct btree_iter dirent_iter; + struct bkey_s_c_dirent d; +- u32 parent_snapshot = snapshot; + +- d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); ++ d = dirent_get_by_pos(trans, &dirent_iter, ++ SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot)); + ret = bkey_err(d.s_c); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto out; + + if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) +- bch2_trans_iter_exit(trans, &dirent_iter); ++ 
bch2_trans_iter_exit(&dirent_iter); + + if (bch2_err_matches(ret, ENOENT)) { + printbuf_reset(&buf); +@@ -2619,20 +2758,14 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) + goto out; + } + +- bch2_trans_iter_exit(trans, &dirent_iter); ++ bch2_trans_iter_exit(&dirent_iter); + +- ret = darray_push(&path, ((struct pathbuf_entry) { +- .inum = inode.bi_inum, +- .snapshot = snapshot, +- })); ++ ret = darray_push(&path, inode.bi_inum); + if (ret) + return ret; + +- snapshot = parent_snapshot; +- +- bch2_trans_iter_exit(trans, &inode_iter); +- inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, +- SPOS(0, inode.bi_dir, snapshot), 0); ++ bch2_btree_iter_set_pos(&inode_iter, SPOS(0, inode.bi_dir, snapshot)); ++ inode_k = bch2_btree_iter_peek_slot(&inode_iter); + + struct bch_inode_unpacked parent_inode; + ret = bkey_err(inode_k) ?: +@@ -2651,22 +2784,28 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) + break; + + inode = parent_inode; +- snapshot = inode_k.k->p.snapshot; + redo_bi_depth = true; + +- if (path_is_dup(&path, inode.bi_inum, snapshot)) { +- /* XXX print path */ +- bch_err(c, "directory structure loop"); ++ if (darray_find(path, inode.bi_inum)) { ++ printbuf_reset(&buf); ++ prt_printf(&buf, "directory structure loop in snapshot %u: ", ++ snapshot); ++ ++ ret = bch2_inum_snapshot_to_path(trans, start.offset, start.snapshot, NULL, &buf); ++ if (ret) ++ goto out; + +- darray_for_each(path, i) +- pr_err("%llu:%u", i->inum, i->snapshot); +- pr_err("%llu:%u", inode.bi_inum, snapshot); ++ if (c->opts.verbose) { ++ prt_newline(&buf); ++ darray_for_each(path, i) ++ prt_printf(&buf, "%llu ", *i); ++ } + +- if (fsck_err(trans, dir_loop, "directory structure loop")) { ++ if (fsck_err(trans, dir_loop, "%s", buf.buf)) { + ret = remove_backpointer(trans, &inode); + bch_err_msg(c, ret, "removing dirent"); + if (ret) +- break; ++ goto out; + + ret = reattach_inode(trans, &inode); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); +@@ -2680,12 +2819,9 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) + min_bi_depth = 0; + + if (redo_bi_depth) +- ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth); ++ ret = bch2_bi_depth_renumber(trans, &path, snapshot, min_bi_depth); + out: + fsck_err: +- bch2_trans_iter_exit(trans, &inode_iter); +- darray_exit(&path); +- printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; + } +@@ -2696,8 +2832,8 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) + */ + int bch2_check_directory_structure(struct bch_fs *c) + { +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, ++ CLASS(btree_trans, trans)(c); ++ return for_each_btree_key_reverse_commit(trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, +@@ -2709,10 +2845,7 @@ int bch2_check_directory_structure(struct bch_fs *c) + continue; + + check_path_loop(trans, k); +- }))); +- +- bch_err_fn(c, ret); +- return ret; ++ })); + } + + struct nlink_table { +@@ -2736,7 +2869,7 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t, + if (!d) { + bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", + new_size); +- return -BCH_ERR_ENOMEM_fsck_add_nlink; ++ return bch_err_throw(c, ENOMEM_fsck_add_nlink); + } + + if (t->d) +@@ -2796,8 +2929,8 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, + struct nlink_table *t, + u64 start, u64 
*end) + { +- int ret = bch2_trans_run(c, +- for_each_btree_key(trans, iter, BTREE_ID_inodes, ++ CLASS(btree_trans, trans)(c); ++ int ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, + POS(0, start), + BTREE_ITER_intent| + BTREE_ITER_prefetch| +@@ -2832,7 +2965,7 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, + break; + } + 0; +- }))); ++ })); + + bch_err_fn(c, ret); + return ret; +@@ -2842,12 +2975,10 @@ noinline_for_stack + static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, + u64 range_start, u64 range_end) + { +- struct snapshots_seen s; +- +- snapshots_seen_init(&s); ++ CLASS(btree_trans, trans)(c); ++ CLASS(snapshots_seen, s)(); + +- int ret = bch2_trans_run(c, +- for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, ++ int ret = for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN, + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, ({ +@@ -2864,9 +2995,7 @@ static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links + le64_to_cpu(d.v->d_inum), d.k->p.snapshot); + } + 0; +- }))); +- +- snapshots_seen_exit(&s); ++ })); + + bch_err_fn(c, ret); + return ret; +@@ -2920,14 +3049,14 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c, + struct nlink_table *links, + u64 range_start, u64 range_end) + { ++ CLASS(btree_trans, trans)(c); + size_t idx = 0; + +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, ++ int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, + POS(0, range_start), + BTREE_ITER_intent|BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end))); ++ check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)); + if (ret < 0) { + bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret)); + return ret; +@@ -2966,7 +3095,6 @@ int bch2_check_nlinks(struct bch_fs *c) + } while (next_iter_range_start != U64_MAX); + + kvfree(links.d); +- bch_err_fn(c, ret); + return ret; + } + +@@ -3001,15 +3129,13 @@ int bch2_fix_reflink_p(struct bch_fs *c) + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) + return 0; + +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, ++ CLASS(btree_trans, trans)(c); ++ return for_each_btree_key_commit(trans, iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_intent|BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- fix_reflink_p_key(trans, &iter, k))); +- bch_err_fn(c, ret); +- return ret; ++ fix_reflink_p_key(trans, &iter, k)); + } + + #ifndef NO_BCACHEFS_CHARDEV +@@ -3035,6 +3161,8 @@ static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) + if (ret) + return ret; + ++ thr->c->recovery_task = current; ++ + ret = bch2_fs_start(thr->c); + if (ret) + goto err; +@@ -3061,7 +3189,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) + { + struct bch_ioctl_fsck_offline arg; + struct fsck_thread *thr = NULL; +- darray_str(devs) = {}; ++ darray_const_str devs = {}; + long ret = 0; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) +@@ -3119,7 +3247,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) + + bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); + +- thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); ++ thr->c = bch2_fs_open(&devs, &thr->opts); + + if (!IS_ERR(thr->c) && + thr->c->opts.errors == 
BCH_ON_ERROR_panic)
+@@ -3156,19 +3284,18 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
+ 	c->opts.fix_errors = FSCK_FIX_ask;
+ 
+ 	c->opts.fsck = true;
+-	set_bit(BCH_FS_fsck_running, &c->flags);
++	set_bit(BCH_FS_in_fsck, &c->flags);
+ 
+-	c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
+-	int ret = bch2_run_online_recovery_passes(c);
++	int ret = bch2_run_online_recovery_passes(c, ~0ULL);
+ 
+-	clear_bit(BCH_FS_fsck_running, &c->flags);
++	clear_bit(BCH_FS_in_fsck, &c->flags);
+ 	bch_err_fn(c, ret);
+ 
+ 	c->stdio = NULL;
+ 	c->stdio_filter = NULL;
+ 	c->opts.fix_errors = old_fix_errors;
+ 
+-	up(&c->online_fsck_mutex);
++	up(&c->recovery.run_lock);
+ 	bch2_ro_ref_put(c);
+ 	return ret;
+ }
+@@ -3192,7 +3319,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
+ 	if (!bch2_ro_ref_tryget(c))
+ 		return -EROFS;
+ 
+-	if (down_trylock(&c->online_fsck_mutex)) {
++	if (down_trylock(&c->recovery.run_lock)) {
+ 		bch2_ro_ref_put(c);
+ 		return -EAGAIN;
+ 	}
+@@ -3224,7 +3351,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg)
+ 		bch_err_fn(c, ret);
+ 		if (thr)
+ 			bch2_fsck_thread_exit(&thr->thr);
+-		up(&c->online_fsck_mutex);
++		up(&c->recovery.run_lock);
+ 		bch2_ro_ref_put(c);
+ 	}
+ 	return ret;
+diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
+index 574948278cd4..e5fe7cf7b251 100644
+--- a/fs/bcachefs/fsck.h
++++ b/fs/bcachefs/fsck.h
+@@ -4,6 +4,12 @@
+ 
+ #include "str_hash.h"
+ 
++/* records snapshot IDs of overwrites at @pos */
++struct snapshots_seen {
++	struct bpos			pos;
++	snapshot_id_list		ids;
++};
++
+ int bch2_fsck_update_backpointers(struct btree_trans *,
+ 				  struct snapshots_seen *,
+ 				  const struct bch_hash_desc,
+diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
+index 490b85841de9..d5e5190f0663 100644
+--- a/fs/bcachefs/inode.c
++++ b/fs/bcachefs/inode.c
+@@ -38,6 +38,7 @@ static const char * const bch2_inode_flag_strs[] = {
+ #undef x
+ 
+ static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
++static int may_delete_deleted_inum(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *);
+ 
+ static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
+ 
+@@ -241,6 +242,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k,
+ 	u64 v[2];
+ 
+ 	unpacked->bi_inum	= inode.k->p.offset;
++	unpacked->bi_snapshot	= inode.k->p.snapshot;
+ 	unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+ 	unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
+ 	unpacked->bi_flags	= le64_to_cpu(inode.v->bi_flags);
+@@ -285,13 +287,12 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
+ {
+ 	memset(unpacked, 0, sizeof(*unpacked));
+ 
+-	unpacked->bi_snapshot = k.k->p.snapshot;
+-
+ 	switch (k.k->type) {
+ 	case KEY_TYPE_inode: {
+ 		struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
+ 
+ 		unpacked->bi_inum	= inode.k->p.offset;
++		unpacked->bi_snapshot	= inode.k->p.snapshot;
+ 		unpacked->bi_journal_seq= 0;
+ 		unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
+ 		unpacked->bi_flags	= le32_to_cpu(inode.v->bi_flags);
+@@ -310,6 +311,7 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
+ 		struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
+ 
+ 		unpacked->bi_inum	= inode.k->p.offset;
++		unpacked->bi_snapshot	= inode.k->p.snapshot;
+ 		unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
+ 		unpacked->bi_hash_seed	= inode.v->bi_hash_seed;
+ 		unpacked->bi_flags	= le64_to_cpu(inode.v->bi_flags);
+@@ -327,8 +329,6 @@ static noinline int 
bch2_inode_unpack_slowpath(struct bkey_s_c k, + int bch2_inode_unpack(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) + { +- unpacked->bi_snapshot = k.k->p.snapshot; +- + return likely(k.k->type == KEY_TYPE_inode_v3) + ? bch2_inode_unpack_v3(k, unpacked) + : bch2_inode_unpack_slowpath(k, unpacked); +@@ -345,12 +345,12 @@ int __bch2_inode_peek(struct btree_trans *trans, + if (ret) + return ret; + +- struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, +- SPOS(0, inum.inum, snapshot), +- flags|BTREE_ITER_cached); ++ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, SPOS(0, inum.inum, snapshot), ++ flags|BTREE_ITER_cached); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) +- return ret; ++ goto err; + + ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; + if (ret) +@@ -364,7 +364,75 @@ int __bch2_inode_peek(struct btree_trans *trans, + err: + if (warn) + bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum); +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans, ++ u64 inode_nr, u32 snapshot, ++ struct bch_inode_unpacked *inode, ++ unsigned flags) ++{ ++ CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, SPOS(0, inode_nr, snapshot), flags); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); ++ int ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ return bkey_is_inode(k.k) ++ ? bch2_inode_unpack(k, inode) ++ : -BCH_ERR_ENOENT_inode; ++} ++ ++int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, ++ subvol_inum inum, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); ++ if (!ret) ++ bch2_trans_iter_exit(&iter); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *trans, ++ subvol_inum inum, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter iter; ++ int ret; ++ ++ ret = bch2_inode_peek(trans, &iter, inode, inum, 0); ++ if (!ret) ++ bch2_trans_iter_exit(&iter); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, ++ struct bch_inode_unpacked *inode) ++{ ++ CLASS(btree_trans, trans)(c); ++ return lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, inode)); ++} ++ ++int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, ++ struct bch_inode_unpacked *root) ++{ ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, ++ SPOS(0, inum, U32_MAX), ++ BTREE_ITER_all_snapshots, k, ret) { ++ if (k.k->p.offset != inum) ++ break; ++ if (bkey_is_inode(k.k)) ++ return bch2_inode_unpack(k, root); ++ } ++ /* We're only called when we know we have an inode for @inum */ ++ BUG_ON(!ret); + return ret; + } + +@@ -395,9 +463,10 @@ int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked + bch2_inode_pack(inode_p, inode); + inode_p->inode.k.p.snapshot = inode->bi_snapshot; + +- return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, +- &inode_p->inode.k_i, +- BTREE_UPDATE_internal_snapshot_node); ++ return bch2_btree_insert_trans(trans, BTREE_ID_inodes, ++ &inode_p->inode.k_i, ++ BTREE_ITER_cached| ++ BTREE_UPDATE_internal_snapshot_node); + } + + int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode) +@@ -619,14 +688,15 @@ bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter + 
struct bkey_s_c k; + int ret = 0; + +- for_each_btree_key_max_norestart(trans, *iter, btree, +- bpos_successor(pos), +- SPOS(pos.inode, pos.offset, U32_MAX), +- flags|BTREE_ITER_all_snapshots, k, ret) ++ bch2_trans_iter_init(trans, iter, btree, bpos_successor(pos), ++ flags|BTREE_ITER_all_snapshots); ++ ++ for_each_btree_key_max_continue_norestart(*iter, SPOS(pos.inode, pos.offset, U32_MAX), ++ flags|BTREE_ITER_all_snapshots, k, ret) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot)) + return k; + +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; + } + +@@ -642,7 +712,7 @@ bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter + bkey_is_inode(k.k)) + return k; + +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + pos = k.k->p; + goto again; + } +@@ -650,7 +720,6 @@ bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter + int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + +@@ -663,7 +732,6 @@ int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) + ret = 1; + break; + } +- bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -715,7 +783,7 @@ static int update_parent_inode_has_children(struct btree_trans *trans, struct bp + bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot); + } + err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -833,7 +901,8 @@ void bch2_inode_init_early(struct bch_fs *c, + get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); + } + +-void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, ++void bch2_inode_init_late(struct bch_fs *c, ++ struct bch_inode_unpacked *inode_u, u64 now, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) + { +@@ -857,6 +926,12 @@ void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, + BCH_INODE_OPTS() + #undef x + } ++ ++ if (!S_ISDIR(mode)) ++ inode_u->bi_casefold = 0; ++ ++ if (bch2_inode_casefold(c, inode_u)) ++ inode_u->bi_flags |= BCH_INODE_has_case_insensitive; + } + + void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, +@@ -864,7 +939,7 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, + struct bch_inode_unpacked *parent) + { + bch2_inode_init_early(c, inode_u); +- bch2_inode_init_late(inode_u, bch2_current_time(c), ++ bch2_inode_init_late(c, inode_u, bch2_current_time(c), + uid, gid, mode, rdev, parent); + } + +@@ -877,11 +952,10 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m + + cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); + +- struct btree_iter iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, +- BTREE_ID_logged_ops, +- POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), +- BTREE_ITER_cached); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_logged_ops, ++ POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), ++ BTREE_ITER_cached); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + return ERR_PTR(ret); +@@ -890,9 +964,8 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m + k.k->type == KEY_TYPE_inode_alloc_cursor + ? 
bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) + : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); +- ret = PTR_ERR_OR_ZERO(cursor); +- if (ret) +- goto err; ++ if (IS_ERR(cursor)) ++ return cursor; + + if (c->opts.inodes_32bit) { + *min = BLOCKDEV_INODE_MAX; +@@ -913,9 +986,8 @@ bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *m + cursor->v.idx = cpu_to_le64(*min); + le32_add_cpu(&cursor->v.gen, 1); + } +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret ? ERR_PTR(ret) : cursor; ++ ++ return cursor; + } + + /* +@@ -935,53 +1007,60 @@ int bch2_inode_create(struct btree_trans *trans, + + u64 start = le64_to_cpu(cursor->v.idx); + u64 pos = start; ++ u64 gen = 0; + + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), + BTREE_ITER_all_snapshots| + BTREE_ITER_intent); + struct bkey_s_c k; + again: +- while ((k = bch2_btree_iter_peek(trans, iter)).k && ++ while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_lt(k.k->p, POS(0, max))) { + if (pos < iter->pos.offset) + goto found_slot; + ++ if (bch2_snapshot_is_ancestor(trans->c, snapshot, k.k->p.snapshot) && ++ k.k->type == KEY_TYPE_inode_generation) { ++ gen = le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); ++ goto found_slot; ++ } ++ + /* + * We don't need to iterate over keys in every snapshot once + * we've found just one: + */ + pos = iter->pos.offset + 1; +- bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); ++ bch2_btree_iter_set_pos(iter, POS(0, pos)); + } + + if (!ret && pos < max) + goto found_slot; + + if (!ret && start == min) +- ret = -BCH_ERR_ENOSPC_inode_create; ++ ret = bch_err_throw(trans->c, ENOSPC_inode_create); + + if (ret) { +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + return ret; + } + + /* Retry from start */ + pos = start = min; +- bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); ++ bch2_btree_iter_set_pos(iter, POS(0, pos)); + le32_add_cpu(&cursor->v.gen, 1); + goto again; + found_slot: +- bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot)); +- k = bch2_btree_iter_peek_slot(trans, iter); ++ bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); ++ k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + return ret; + } + + inode_u->bi_inum = k.k->p.offset; +- inode_u->bi_generation = le64_to_cpu(cursor->v.gen); ++ inode_u->bi_generation = max(gen, le64_to_cpu(cursor->v.gen)); + cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); + return 0; + } +@@ -989,7 +1068,6 @@ int bch2_inode_create(struct btree_trans *trans, + static int bch2_inode_delete_keys(struct btree_trans *trans, + subvol_inum inum, enum btree_id id) + { +- struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i delete; + struct bpos end = POS(inum.inum, U64_MAX); +@@ -1000,8 +1078,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, + * We're never going to be deleting partial extents, no need to use an + * extent iterator: + */ +- bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), +- BTREE_ITER_intent); ++ CLASS(btree_iter, iter)(trans, id, POS(inum.inum, 0), BTREE_ITER_intent); + + while (1) { + bch2_trans_begin(trans); +@@ -1010,9 +1087,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, + if (ret) + goto err; + +- bch2_btree_iter_set_snapshot(trans, &iter, snapshot); ++ bch2_btree_iter_set_snapshot(&iter, snapshot); + +- k = bch2_btree_iter_peek_max(trans, &iter, end); ++ k = 
bch2_btree_iter_peek_max(&iter, end); + ret = bkey_err(k); + if (ret) + goto err; +@@ -1036,31 +1113,36 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, + break; + } + +- bch2_trans_iter_exit(trans, &iter); + return ret; + } + + int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) + { +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + struct btree_iter iter = {}; + struct bkey_s_c k; ++ struct bch_inode_unpacked inode; + u32 snapshot; + int ret; + ++ ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum, &inode)); ++ if (ret) ++ return ret; ++ + /* + * If this was a directory, there shouldn't be any real dirents left - + * but there could be whiteouts (from hash collisions) that we should + * delete: + * +- * XXX: the dirent could ideally would delete whiteouts when they're no ++ * XXX: the dirent code ideally would delete whiteouts when they're no + * longer needed + */ +- ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: +- bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: +- bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); ++ ret = (!S_ISDIR(inode.bi_mode) ++ ? bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ++ : bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents)) ?: ++ bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs); + if (ret) +- goto err; ++ return ret; + retry: + bch2_trans_begin(trans); + +@@ -1079,7 +1161,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) + bch2_fs_inconsistent(c, + "inode %llu:%u not found when deleting", + inum.inum, snapshot); +- ret = -BCH_ERR_ENOENT_inode; ++ ret = bch_err_throw(c, ENOENT_inode); + goto err; + } + +@@ -1087,49 +1169,14 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc); + err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) +- goto err2; +- +- ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); +-err2: +- bch2_trans_put(trans); +- return ret; +-} +- +-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, +- subvol_inum inum, +- struct bch_inode_unpacked *inode) +-{ +- struct btree_iter iter; +- int ret; +- +- ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); +- if (!ret) +- bch2_trans_iter_exit(trans, &iter); +- return ret; +-} +- +-int bch2_inode_find_by_inum_trans(struct btree_trans *trans, +- subvol_inum inum, +- struct bch_inode_unpacked *inode) +-{ +- struct btree_iter iter; +- int ret; +- +- ret = bch2_inode_peek(trans, &iter, inode, inum, 0); +- if (!ret) +- bch2_trans_iter_exit(trans, &iter); +- return ret; +-} ++ return ret; + +-int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, +- struct bch_inode_unpacked *inode) +-{ +- return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); ++ return delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot)); + } + + int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) +@@ -1210,11 +1257,15 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, + { + struct bch_fs *c = trans->c; + +-#ifdef CONFIG_UNICODE +- int ret = 0; ++ int ret = bch2_fs_casefold_enabled(c); ++ if (ret) { ++ bch_err_ratelimited(c, "Cannot enable casefolding: %s", bch2_err_str(ret)); ++ return ret; ++ } ++ + /* Not supported on individual files. 
*/ + if (!S_ISDIR(bi->bi_mode)) +- return -EOPNOTSUPP; ++ return bch_err_throw(c, casefold_opt_is_dir_only); + + /* + * Make sure the dir is empty, as otherwise we'd need to +@@ -1233,20 +1284,13 @@ int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, + bi->bi_casefold = v + 1; + bi->bi_fields_set |= BIT(Inode_opt_casefold); + +- return 0; +-#else +- bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); +- return -EOPNOTSUPP; +-#endif ++ return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi); + } + + static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter = {}; +- struct bkey_i_inode_generation delete; +- struct bch_inode_unpacked inode_u; +- struct bkey_s_c k; ++ struct btree_iter iter = { NULL }; + int ret; + + do { +@@ -1262,14 +1306,14 @@ static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL); +- } while (ret == -BCH_ERR_transaction_restart_nested); ++ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); + if (ret) + goto err; + retry: + bch2_trans_begin(trans); + +- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, +- SPOS(0, inum, snapshot), BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, inum, snapshot), BTREE_ITER_intent); + ret = bkey_err(k); + if (ret) + goto err; +@@ -1278,16 +1322,18 @@ static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum + bch2_fs_inconsistent(c, + "inode %llu:%u not found when deleting", + inum, snapshot); +- ret = -BCH_ERR_ENOENT_inode; ++ ret = bch_err_throw(c, ENOENT_inode); + goto err; + } + ++ struct bch_inode_unpacked inode_u; + bch2_inode_unpack(k, &inode_u); + + /* Subvolume root? 
*/ + if (inode_u.bi_subvol) + bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); + ++ struct bkey_i_inode_generation delete; + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); +@@ -1296,11 +1342,11 @@ static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc); + err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + +- return ret ?: -BCH_ERR_transaction_restart_nested; ++ return ret ?: bch_err_throw(c, transaction_restart_nested); + } + + /* +@@ -1321,7 +1367,7 @@ static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpo + + bool unlinked = bkey_is_unlinked_inode(k); + pos = k.k->p; +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + + if (!unlinked) + return 0; +@@ -1342,122 +1388,133 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) + delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); + } + +-static int may_delete_deleted_inode(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bpos pos, +- bool *need_another_pass) ++static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, ++ struct bch_inode_unpacked *inode, ++ bool from_deleted_inodes) + { + struct bch_fs *c = trans->c; +- struct btree_iter inode_iter; +- struct bkey_s_c k; +- struct bch_inode_unpacked inode; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret; + +- k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached); ++ CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes, pos, BTREE_ITER_cached); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&inode_iter); + ret = bkey_err(k); + if (ret) + return ret; + +- ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; +- if (fsck_err_on(!bkey_is_inode(k.k), ++ ret = bkey_is_inode(k.k) ? 0 : bch_err_throw(c, ENOENT_inode); ++ if (fsck_err_on(from_deleted_inodes && ret, + trans, deleted_inode_missing, + "nonexistent inode %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; ++ if (ret) ++ return ret; + +- ret = bch2_inode_unpack(k, &inode); ++ ret = bch2_inode_unpack(k, inode); + if (ret) +- goto out; ++ return ret; + +- if (S_ISDIR(inode.bi_mode)) { ++ if (S_ISDIR(inode->bi_mode)) { + ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); +- if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), ++ if (fsck_err_on(from_deleted_inodes && ++ bch2_err_matches(ret, ENOTEMPTY), + trans, deleted_inode_is_dir, + "non empty directory %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; + if (ret) +- goto out; ++ return ret; + } + +- if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), ++ ret = inode->bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked); ++ if (fsck_err_on(from_deleted_inodes && ret, + trans, deleted_inode_not_unlinked, + "non-deleted inode %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; ++ if (ret) ++ return ret; + +- if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot, ++ ret = !(inode->bi_flags & BCH_INODE_has_child_snapshot) ++ ? 
0 : bch_err_throw(c, inode_has_child_snapshot); ++ ++ if (fsck_err_on(from_deleted_inodes && ret, + trans, deleted_inode_has_child_snapshots, + "inode with child snapshots %llu:%u in deleted_inodes btree", + pos.offset, pos.snapshot)) + goto delete; ++ if (ret) ++ return ret; + + ret = bch2_inode_has_child_snapshots(trans, k.k->p); + if (ret < 0) +- goto out; ++ return ret; + + if (ret) { + if (fsck_err(trans, inode_has_child_snapshots_wrong, + "inode has_child_snapshots flag wrong (should be set)\n%s", + (printbuf_reset(&buf), +- bch2_inode_unpacked_to_text(&buf, &inode), ++ bch2_inode_unpacked_to_text(&buf, inode), + buf.buf))) { +- inode.bi_flags |= BCH_INODE_has_child_snapshot; +- ret = __bch2_fsck_write_inode(trans, &inode); ++ inode->bi_flags |= BCH_INODE_has_child_snapshot; ++ ret = __bch2_fsck_write_inode(trans, inode); + if (ret) +- goto out; ++ return ret; + } ++ ++ if (!from_deleted_inodes) { ++ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: ++ bch_err_throw(c, inode_has_child_snapshot); ++ } ++ + goto delete; + + } + +- if (test_bit(BCH_FS_clean_recovery, &c->flags) && +- !fsck_err(trans, deleted_inode_but_clean, +- "filesystem marked as clean but have deleted inode %llu:%u", +- pos.offset, pos.snapshot)) { +- ret = 0; +- goto out; +- } ++ if (from_deleted_inodes) { ++ if (test_bit(BCH_FS_clean_recovery, &c->flags) && ++ !fsck_err(trans, deleted_inode_but_clean, ++ "filesystem marked as clean but have deleted inode %llu:%u", ++ pos.offset, pos.snapshot)) ++ return 0; + +- ret = 1; +-out: ++ ret = 1; ++ } + fsck_err: +- bch2_trans_iter_exit(trans, &inode_iter); +- printbuf_exit(&buf); + return ret; + delete: +- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); +- goto out; ++ return bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); ++} ++ ++static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_unpacked *inode) ++{ ++ u32 snapshot; ++ ++ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: ++ may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), inode, false); + } + + int bch2_delete_dead_inodes(struct bch_fs *c) + { +- struct btree_trans *trans = bch2_trans_get(c); +- bool need_another_pass; +- int ret; +-again: ++ CLASS(btree_trans, trans)(c); + /* + * if we ran check_inodes() unlinked inodes will have already been + * cleaned up but the write buffer will be out of sync; therefore we + * alway need a write buffer flush +- */ +- ret = bch2_btree_write_buffer_flush_sync(trans); +- if (ret) +- goto err; +- +- need_another_pass = false; +- +- /* ++ * + * Weird transaction restart handling here because on successful delete, + * bch2_inode_rm_snapshot() will return a nested transaction restart, + * but we can't retry because the btree write buffer won't have been + * flushed and we'd spin: + */ +- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, ++ return bch2_btree_write_buffer_flush_sync(trans) ?: ++ for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ +- ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); ++ struct bch_inode_unpacked inode; ++ int ret = may_delete_deleted_inode(trans, k.k->p, &inode, true); + if (ret > 0) { + bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", + k.k->p.offset, k.k->p.snapshot); +@@ -1478,10 +1535,4 @@ int 
bch2_delete_dead_inodes(struct bch_fs *c) + + ret; + })); +- +- if (!ret && need_another_pass) +- goto again; +-err: +- bch2_trans_put(trans); +- return ret; + } +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index 5cfba9e98966..b8ec3e628d90 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -134,10 +134,21 @@ static inline int bch2_inode_peek(struct btree_trans *trans, + subvol_inum inum, unsigned flags) + { + return __bch2_inode_peek(trans, iter, inode, inum, flags, true); +- int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); +- return ret; + } + ++int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32, ++ struct bch_inode_unpacked *, unsigned); ++int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, ++ subvol_inum, ++ struct bch_inode_unpacked *); ++int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *); ++int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, ++ struct bch_inode_unpacked *); ++ ++int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, ++ struct bch_inode_unpacked *root); ++ + int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); + +@@ -153,7 +164,7 @@ int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); + + void bch2_inode_init_early(struct bch_fs *, + struct bch_inode_unpacked *); +-void bch2_inode_init_late(struct bch_inode_unpacked *, u64, ++void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64, + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); + void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, +@@ -165,14 +176,6 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, + + int bch2_inode_rm(struct bch_fs *, subvol_inum); + +-int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, +- subvol_inum, +- struct bch_inode_unpacked *); +-int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, +- struct bch_inode_unpacked *); +-int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, +- struct bch_inode_unpacked *); +- + #define inode_opt_get(_c, _inode, _name) \ + ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) + +@@ -245,12 +248,17 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k) + + static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi) + { +- /* inode apts are stored with a +1 bias: 0 means "unset, use fs opt" */ ++ /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */ + return bi->bi_casefold + ? 
bi->bi_casefold - 1 + : c->opts.casefold; + } + ++static inline bool bch2_inode_has_backpointer(const struct bch_inode_unpacked *bi) ++{ ++ return bi->bi_dir || bi->bi_dir_offset; ++} ++ + /* i_nlink: */ + + static inline unsigned nlink_bias(umode_t mode) +@@ -280,15 +288,6 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, + int bch2_inode_nlink_inc(struct bch_inode_unpacked *); + void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); + +-static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode) +-{ +- bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; +- +- return S_ISDIR(inode->bi_mode) || +- inode->bi_subvol || +- (!inode->bi_nlink && inode_has_bp); +-} +- + struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); + void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, + struct bch_inode_unpacked *); +@@ -306,6 +305,14 @@ bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode + return io_opts_to_rebalance_opts(c, &io_opts); + } + ++#define BCACHEFS_ROOT_SUBVOL_INUM \ ++ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) ++ ++static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b) ++{ ++ return a.subvol == b.subvol && a.inum == b.inum; ++} ++ + int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); + int bch2_delete_dead_inodes(struct bch_fs *); + +diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h +index 87e193e8ed25..1f00938b1bdc 100644 +--- a/fs/bcachefs/inode_format.h ++++ b/fs/bcachefs/inode_format.h +@@ -129,6 +129,10 @@ enum inode_opt_id { + Inode_opt_nr, + }; + ++/* ++ * BCH_INODE_has_case_insensitive is set if any descendent is case insensitive - ++ * for overlayfs ++ */ + #define BCH_INODE_FLAGS() \ + x(sync, 0) \ + x(immutable, 1) \ +@@ -139,7 +143,8 @@ enum inode_opt_id { + x(i_sectors_dirty, 6) \ + x(unlinked, 7) \ + x(backptr_untrusted, 8) \ +- x(has_child_snapshot, 9) ++ x(has_child_snapshot, 9) \ ++ x(has_case_insensitive, 10) + + /* bits 20+ reserved for packed fields below: */ + +diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c +index cc07729a4b62..fa0b06e17d17 100644 +--- a/fs/bcachefs/io_misc.c ++++ b/fs/bcachefs/io_misc.c +@@ -43,7 +43,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, + bch2_bkey_buf_init(&new); + closure_init_stack(&cl); + +- k = bch2_btree_iter_peek_slot(trans, iter); ++ k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; +@@ -91,7 +91,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, + opts.data_replicas, + BCH_WATERMARK_normal, 0, &cl, &wp); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) +- ret = -BCH_ERR_transaction_restart_nested; ++ ret = bch_err_throw(c, transaction_restart_nested); + if (ret) + goto err; + +@@ -114,12 +114,11 @@ int bch2_extent_fallocate(struct btree_trans *trans, + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + if (should_print_err(ret)) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9)); + prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); +- printbuf_exit(&buf); + } + err_noprint: + bch2_open_buckets_put(c, &open_buckets); +@@ -135,6 +134,33 @@ int bch2_extent_fallocate(struct btree_trans *trans, + return ret; + } + ++/* For fsck */ ++int bch2_fpunch_snapshot(struct btree_trans *trans, 
struct bpos start, struct bpos end) ++{ ++ u32 restart_count = trans->restart_count; ++ struct bch_fs *c = trans->c; ++ struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); ++ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); ++ struct bkey_i delete; ++ ++ int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, ++ start, end, 0, k, ++ &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ bkey_init(&delete.k); ++ delete.k.p = iter.pos; ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end, &delete); ++ ++ bch2_extent_trim_atomic(trans, &iter, &delete) ?: ++ bch2_trans_update(trans, &iter, &delete, 0); ++ })); ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ return ret ?: trans_was_restarted(trans, restart_count); ++} ++ + /* + * Returns -BCH_ERR_transacton_restart if we had to drop locks: + */ +@@ -164,12 +190,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + if (ret) + continue; + +- bch2_btree_iter_set_snapshot(trans, iter, snapshot); ++ bch2_btree_iter_set_snapshot(iter, snapshot); + + /* + * peek_max() doesn't have ideal semantics for extents: + */ +- k = bch2_btree_iter_peek_max(trans, iter, end_pos); ++ k = bch2_btree_iter_peek_max(iter, end_pos); + if (!k.k) + break; + +@@ -195,23 +221,13 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, + s64 *i_sectors_delta) + { +- struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; +- int ret; +- +- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, +- POS(inum.inum, start), +- BTREE_ITER_intent); ++ CLASS(btree_trans, trans)(c); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_extents, POS(inum.inum, start), ++ BTREE_ITER_intent); + +- ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); ++ int ret = bch2_fpunch_at(trans, &iter, inum, end, i_sectors_delta); + +- bch2_trans_iter_exit(trans, &iter); +- bch2_trans_put(trans); +- +- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +- ret = 0; +- +- return ret; ++ return bch2_err_matches(ret, BCH_ERR_transaction_restart) ? 
0 : ret; + } + + /* truncate: */ +@@ -230,7 +246,7 @@ static int truncate_set_isize(struct btree_trans *trans, + u64 new_i_size, + bool warn) + { +- struct btree_iter iter = {}; ++ struct btree_iter iter = { NULL }; + struct bch_inode_unpacked inode_u; + int ret; + +@@ -238,7 +254,7 @@ static int truncate_set_isize(struct btree_trans *trans, + (inode_u.bi_size = new_i_size, 0) ?: + bch2_inode_write(trans, &iter, &inode_u); + +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -247,7 +263,6 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, + u64 *i_sectors_delta) + { + struct bch_fs *c = trans->c; +- struct btree_iter fpunch_iter; + struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k); + subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) }; + u64 new_i_size = le64_to_cpu(op->v.new_i_size); +@@ -259,14 +274,15 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans, + if (ret) + goto err; + +- bch2_trans_iter_init(trans, &fpunch_iter, BTREE_ID_extents, +- POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), +- BTREE_ITER_intent); +- ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); +- bch2_trans_iter_exit(trans, &fpunch_iter); ++ { ++ CLASS(btree_iter, fpunch_iter)(trans, BTREE_ID_extents, ++ POS(inum.inum, round_up(new_i_size, block_bytes(c)) >> 9), ++ BTREE_ITER_intent); ++ ret = bch2_fpunch_at(trans, &fpunch_iter, inum, U64_MAX, i_sectors_delta); + +- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +- ret = 0; ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ ret = 0; ++ } + err: + if (warn_errors) + bch_err_fn(c, ret); +@@ -292,17 +308,13 @@ int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sec + * snapshot while they're in progress, then crashing, will result in the + * resume only proceeding in one of the snapshots + */ +- down_read(&c->snapshot_create_lock); +- struct btree_trans *trans = bch2_trans_get(c); ++ guard(rwsem_read)(&c->snapshot_create_lock); ++ CLASS(btree_trans, trans)(c); + int ret = bch2_logged_op_start(trans, &op.k_i); + if (ret) +- goto out; ++ return ret; + ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta); + ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret; +-out: +- bch2_trans_put(trans); +- up_read(&c->snapshot_create_lock); +- + return ret; + } + +@@ -349,7 +361,7 @@ static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, + + ret = bch2_inode_write(trans, &iter, &inode_u); + err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -399,7 +411,7 @@ case LOGGED_OP_FINSERT_start: + if (ret) + goto err; + } else { +- bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset)); ++ bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset)); + + ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) +@@ -425,12 +437,12 @@ case LOGGED_OP_FINSERT_shift_extents: + if (ret) + goto btree_err; + +- bch2_btree_iter_set_snapshot(trans, &iter, snapshot); +- bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot)); ++ bch2_btree_iter_set_snapshot(&iter, snapshot); ++ bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); + + k = insert +- ? bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0)) +- : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX)); ++ ? 
bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0)) ++ : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); + if ((ret = bkey_err(k))) + goto btree_err; + +@@ -498,7 +510,7 @@ case LOGGED_OP_FINSERT_finish: + break; + } + err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + if (warn_errors) + bch_err_fn(c, ret); + return ret; +@@ -528,16 +540,12 @@ int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum, + * snapshot while they're in progress, then crashing, will result in the + * resume only proceeding in one of the snapshots + */ +- down_read(&c->snapshot_create_lock); +- struct btree_trans *trans = bch2_trans_get(c); ++ guard(rwsem_read)(&c->snapshot_create_lock); ++ CLASS(btree_trans, trans)(c); + int ret = bch2_logged_op_start(trans, &op.k_i); + if (ret) +- goto out; ++ return ret; + ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta); + ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret; +-out: +- bch2_trans_put(trans); +- up_read(&c->snapshot_create_lock); +- + return ret; + } +diff --git a/fs/bcachefs/io_misc.h b/fs/bcachefs/io_misc.h +index 9cb44a7c43c1..b93e4d4b3c0c 100644 +--- a/fs/bcachefs/io_misc.h ++++ b/fs/bcachefs/io_misc.h +@@ -5,6 +5,8 @@ + int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, + u64, struct bch_io_opts, s64 *, + struct write_point_specifier); ++ ++int bch2_fpunch_snapshot(struct btree_trans *, struct bpos, struct bpos); + int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + subvol_inum, u64, s64 *); + int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); +diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c +index def4a26a3b45..c4f0f9d8f959 100644 +--- a/fs/bcachefs/io_read.c ++++ b/fs/bcachefs/io_read.c +@@ -9,6 +9,7 @@ + #include "bcachefs.h" + #include "alloc_background.h" + #include "alloc_foreground.h" ++#include "async_objs.h" + #include "btree_update.h" + #include "buckets.h" + #include "checksum.h" +@@ -17,6 +18,7 @@ + #include "data_update.h" + #include "disk_groups.h" + #include "ec.h" ++#include "enumerated_ref.h" + #include "error.h" + #include "io_read.h" + #include "io_misc.h" +@@ -25,6 +27,7 @@ + #include "subvolume.h" + #include "trace.h" + ++#include + #include + #include + +@@ -34,41 +37,81 @@ module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); + MODULE_PARM_DESC(read_corrupt_ratio, ""); + #endif + ++static bool bch2_poison_extents_on_checksum_error; ++module_param_named(poison_extents_on_checksum_error, ++ bch2_poison_extents_on_checksum_error, bool, 0644); ++MODULE_PARM_DESC(poison_extents_on_checksum_error, ++ "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); ++ + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + ++static inline u32 bch2_dev_congested_read(struct bch_dev *ca, u64 now) ++{ ++ s64 congested = atomic_read(&ca->congested); ++ u64 last = READ_ONCE(ca->congested_last); ++ if (time_after64(now, last)) ++ congested -= (now - last) >> 12; ++ ++ return clamp(congested, 0LL, CONGESTED_MAX); ++} ++ + static bool bch2_target_congested(struct bch_fs *c, u16 target) + { + const struct bch_devs_mask *devs; + unsigned d, nr = 0, total = 0; +- u64 now = local_clock(), last; +- s64 congested; +- struct bch_dev *ca; +- +- if (!target) +- return false; ++ u64 now = local_clock(); + +- rcu_read_lock(); ++ guard(rcu)(); + devs = bch2_target_to_mask(c, target) ?: + &c->rw_devs[BCH_DATA_user]; + + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { +- ca = 
rcu_dereference(c->devs[d]); ++ struct bch_dev *ca = rcu_dereference(c->devs[d]); + if (!ca) + continue; + +- congested = atomic_read(&ca->congested); +- last = READ_ONCE(ca->congested_last); +- if (time_after64(now, last)) +- congested -= (now - last) >> 12; +- +- total += max(congested, 0LL); ++ total += bch2_dev_congested_read(ca, now); + nr++; + } +- rcu_read_unlock(); + + return get_random_u32_below(nr * CONGESTED_MAX) < total; + } + ++void bch2_dev_congested_to_text(struct printbuf *out, struct bch_dev *ca) ++{ ++ printbuf_tabstop_push(out, 32); ++ ++ prt_printf(out, "current:\t%u%%\n", ++ bch2_dev_congested_read(ca, local_clock()) * ++ 100 / CONGESTED_MAX); ++ ++ prt_printf(out, "raw:\t%i/%u\n", atomic_read(&ca->congested), CONGESTED_MAX); ++ ++ prt_printf(out, "last io over threshold:\t"); ++ bch2_pr_time_units(out, local_clock() - ca->congested_last); ++ prt_newline(out); ++ ++ prt_printf(out, "read latency threshold:\t"); ++ bch2_pr_time_units(out, ++ ca->io_latency[READ].quantiles.entries[QUANTILE_IDX(1)].m << 2); ++ prt_newline(out); ++ ++ prt_printf(out, "median read latency:\t"); ++ bch2_pr_time_units(out, ++ ca->io_latency[READ].quantiles.entries[QUANTILE_IDX(7)].m); ++ prt_newline(out); ++ ++ prt_printf(out, "write latency threshold:\t"); ++ bch2_pr_time_units(out, ++ ca->io_latency[WRITE].quantiles.entries[QUANTILE_IDX(1)].m << 3); ++ prt_newline(out); ++ ++ prt_printf(out, "median write latency:\t"); ++ bch2_pr_time_units(out, ++ ca->io_latency[WRITE].quantiles.entries[QUANTILE_IDX(7)].m); ++ prt_newline(out); ++} ++ + #else + + static bool bch2_target_congested(struct bch_fs *c, u16 target) +@@ -80,18 +123,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) + + /* Cache promotion on read */ + +-struct promote_op { +- struct rcu_head rcu; +- u64 start_time; +- +- struct rhash_head hash; +- struct bpos pos; +- +- struct work_struct work; +- struct data_update write; +- struct bio_vec bi_inline_vecs[]; /* must be last */ +-}; +- + static const struct rhashtable_params bch_promote_params = { + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), +@@ -140,22 +171,32 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, + if (!have_io_error(failed)) { + BUG_ON(!opts.promote_target); + +- if (!(flags & BCH_READ_may_promote)) +- return -BCH_ERR_nopromote_may_not; ++ if (!(flags & BCH_READ_may_promote)) { ++ count_event(c, io_read_nopromote_may_not); ++ return bch_err_throw(c, nopromote_may_not); ++ } + +- if (bch2_bkey_has_target(c, k, opts.promote_target)) +- return -BCH_ERR_nopromote_already_promoted; ++ if (bch2_bkey_has_target(c, k, opts.promote_target)) { ++ count_event(c, io_read_nopromote_already_promoted); ++ return bch_err_throw(c, nopromote_already_promoted); ++ } + +- if (bkey_extent_is_unwritten(k)) +- return -BCH_ERR_nopromote_unwritten; ++ if (bkey_extent_is_unwritten(k)) { ++ count_event(c, io_read_nopromote_unwritten); ++ return bch_err_throw(c, nopromote_unwritten); ++ } + +- if (bch2_target_congested(c, opts.promote_target)) +- return -BCH_ERR_nopromote_congested; ++ if (bch2_target_congested(c, opts.promote_target)) { ++ count_event(c, io_read_nopromote_congested); ++ return bch_err_throw(c, nopromote_congested); ++ } + } + + if (rhashtable_lookup_fast(&c->promote_table, &pos, +- bch_promote_params)) +- return -BCH_ERR_nopromote_in_flight; ++ bch_promote_params)) { ++ count_event(c, io_read_nopromote_in_flight); ++ return bch_err_throw(c, nopromote_in_flight); ++ } + + return 0; 
+ } +@@ -169,9 +210,12 @@ static noinline void promote_free(struct bch_read_bio *rbio) + bch_promote_params); + BUG_ON(ret); + ++ async_object_list_del(c, promote, op->list_idx); ++ async_object_list_del(c, rbio, rbio->list_idx); ++ + bch2_data_update_exit(&op->write); + +- bch2_write_ref_put(c, BCH_WRITE_REF_promote); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); + kfree_rcu(op, rcu); + } + +@@ -236,12 +280,12 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, + return NULL; + } + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote)) + return ERR_PTR(-BCH_ERR_nopromote_no_writes); + + struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) { +- ret = -BCH_ERR_nopromote_enomem; ++ ret = bch_err_throw(c, nopromote_enomem); + goto err_put; + } + +@@ -250,10 +294,14 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) { +- ret = -BCH_ERR_nopromote_in_flight; ++ ret = bch_err_throw(c, nopromote_in_flight); + goto err; + } + ++ ret = async_object_list_add(c, promote, op, &op->list_idx); ++ if (ret < 0) ++ goto err_remove_hash; ++ + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, + writepoint_hashed((unsigned long) current), + &orig->opts, +@@ -265,7 +313,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, + * -BCH_ERR_ENOSPC_disk_reservation: + */ + if (ret) +- goto err_remove_hash; ++ goto err_remove_list; + + rbio_init_fragment(&op->write.rbio.bio, orig); + op->write.rbio.bounce = true; +@@ -273,6 +321,8 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, + op->write.op.end_io = promote_done; + + return &op->write.rbio; ++err_remove_list: ++ async_object_list_del(c, promote, op->list_idx); + err_remove_hash: + BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params)); +@@ -281,7 +331,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, + /* We may have added to the rhashtable and thus need rcu freeing: */ + kfree_rcu(op, rcu); + err_put: +- bch2_write_ref_put(c, BCH_WRITE_REF_promote); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); + return ERR_PTR(ret); + } + +@@ -296,6 +346,13 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, + bool *read_full, + struct bch_io_failures *failed) + { ++ /* ++ * We're in the retry path, but we don't know what to repair yet, and we ++ * don't want to do a promote here: ++ */ ++ if (failed && !failed->nr) ++ return NULL; ++ + struct bch_fs *c = trans->c; + /* + * if failed != NULL we're not actually doing a promote, we're +@@ -332,12 +389,39 @@ static struct bch_read_bio *promote_alloc(struct btree_trans *trans, + + *bounce = true; + *read_full = promote_full; ++ ++ if (have_io_error(failed)) ++ orig->self_healing = true; ++ + return promote; + nopromote: +- trace_io_read_nopromote(c, ret); ++ if (trace_io_read_nopromote_enabled()) { ++ CLASS(printbuf, buf)(); ++ printbuf_indent_add_nextline(&buf, 2); ++ prt_printf(&buf, "%s\n", bch2_err_str(ret)); ++ bch2_bkey_val_to_text(&buf, c, k); ++ ++ trace_io_read_nopromote(c, buf.buf); ++ } ++ count_event(c, io_read_nopromote); ++ + return NULL; + } + ++void bch2_promote_op_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct promote_op *op) ++{ ++ if (!op->write.read_done) { ++ prt_printf(out, "parent read: %px\n", op->write.rbio.parent); ++ 
printbuf_indent_add(out, 2); ++ bch2_read_bio_to_text(out, c, op->write.rbio.parent); ++ printbuf_indent_sub(out, 2); ++ } ++ ++ bch2_data_update_to_text(out, &op->write); ++} ++ + /* Read */ + + static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, +@@ -359,7 +443,8 @@ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *o + static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, + struct bch_read_bio *rbio, struct bpos read_pos) + { +- bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); ++ CLASS(btree_trans, trans)(c); ++ bch2_read_err_msg_trans(trans, out, rbio, read_pos); + } + + enum rbio_context { +@@ -394,7 +479,7 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) + + if (rbio->have_ioref) { + struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); +- percpu_ref_put(&ca->io_ref[READ]); ++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); + } + + if (rbio->split) { +@@ -406,6 +491,8 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) + else + promote_free(rbio); + } else { ++ async_object_list_del(rbio->c, rbio, rbio->list_idx); ++ + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + +@@ -427,9 +514,80 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) + if (rbio->start_time) + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + rbio->start_time); ++#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS ++ if (rbio->list_idx) ++ async_object_list_del(rbio->c, rbio, rbio->list_idx); ++#endif + bio_endio(&rbio->bio); + } + ++static void get_rbio_extent(struct btree_trans *trans, ++ struct bch_read_bio *rbio, ++ struct bkey_buf *sk) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = lockrestart_do(trans, ++ bkey_err(k = bch2_bkey_get_iter(trans, &iter, ++ rbio->data_btree, rbio->data_pos, 0))); ++ if (ret) ++ return; ++ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ bkey_for_each_ptr(ptrs, ptr) ++ if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) { ++ bch2_bkey_buf_reassemble(sk, trans->c, k); ++ break; ++ } ++ ++ bch2_trans_iter_exit(&iter); ++} ++ ++static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, ++ enum btree_id btree, struct bkey_s_c read_k) ++{ ++ if (!bch2_poison_extents_on_checksum_error) ++ return 0; ++ ++ struct bch_fs *c = trans->c; ++ ++ struct data_update *u = rbio_data_update(rbio); ++ if (u) ++ read_k = bkey_i_to_s_c(u->k.k); ++ ++ u64 flags = bch2_bkey_extent_flags(read_k); ++ if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ++ return 0; ++ ++ CLASS(btree_iter, iter)(trans, btree, bkey_start_pos(read_k.k), BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); ++ int ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ if (!bkey_and_val_eq(k, read_k)) ++ return 0; ++ ++ struct bkey_i *new = bch2_trans_kmalloc(trans, ++ bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); ++ ret = PTR_ERR_OR_ZERO(new) ?: ++ (bkey_reassemble(new, k), 0) ?: ++ bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: ++ bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ if (ret) ++ return ret; ++ ++ /* ++ * Propagate key change back to data update path, in particular so it ++ * knows the extent has been poisoned and it's safe to change the ++ * checksum ++ */ ++ if (u) ++ bch2_bkey_buf_copy(&u->k, c, new); ++ return 0; ++} ++ + 
static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, +@@ -451,7 +609,7 @@ static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, + + if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { + /* extent we wanted to read no longer exists: */ +- rbio->ret = -BCH_ERR_data_read_key_overwritten; ++ rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten); + goto err; + } + +@@ -461,9 +619,10 @@ static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, + bkey_i_to_s_c(u->k.k), + 0, failed, flags, -1); + err: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + +- if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || ++ bch2_err_matches(ret, BCH_ERR_data_read_retry)) + goto retry; + + if (ret) { +@@ -487,15 +646,21 @@ static void bch2_rbio_retry(struct work_struct *work) + .inum = rbio->read_pos.inode, + }; + struct bch_io_failures failed = { .nr = 0 }; +- int orig_error = rbio->ret; + +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); ++ ++ struct bkey_buf sk; ++ bch2_bkey_buf_init(&sk); ++ bkey_init(&sk.k->k); + + trace_io_read_retry(&rbio->bio); + this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], + bvec_iter_sectors(rbio->bvec_iter)); + +- if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) ++ get_rbio_extent(trans, rbio, &sk); ++ ++ if (!bkey_deleted(&sk.k->k) && ++ bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) + bch2_mark_io_failure(&failed, &rbio->pick, + rbio->ret == -BCH_ERR_data_read_retry_csum_err); + +@@ -516,15 +681,16 @@ static void bch2_rbio_retry(struct work_struct *work) + + int ret = rbio->data_update + ? 
bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) +- : __bch2_read(trans, rbio, iter, inum, &failed, flags); ++ : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags); + + if (ret) { + rbio->ret = ret; + rbio->bio.bi_status = BLK_STS_IOERR; +- } else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace && +- orig_error != -BCH_ERR_data_read_ptr_stale_race && +- !failed.nr) { +- struct printbuf buf = PRINTBUF; ++ } ++ ++ if (failed.nr || ret) { ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(c, &buf); + + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, +@@ -532,14 +698,29 @@ static void bch2_rbio_retry(struct work_struct *work) + read_pos.offset << 9)); + if (rbio->data_update) + prt_str(&buf, "(internal move) "); +- prt_str(&buf, "successful retry"); + +- bch_err_ratelimited(c, "%s", buf.buf); +- printbuf_exit(&buf); ++ prt_str(&buf, "data read error, "); ++ if (!ret) { ++ prt_str(&buf, "successful retry"); ++ if (rbio->self_healing) ++ prt_str(&buf, ", self healing"); ++ } else ++ prt_str(&buf, bch2_err_str(ret)); ++ prt_newline(&buf); ++ ++ ++ if (!bkey_deleted(&sk.k->k)) { ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k)); ++ prt_newline(&buf); ++ } ++ ++ bch2_io_failures_to_text(&buf, c, &failed); ++ ++ bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); + } + + bch2_rbio_done(rbio); +- bch2_trans_put(trans); ++ bch2_bkey_buf_exit(&sk, c); + } + + static void bch2_rbio_error(struct bch_read_bio *rbio, +@@ -568,113 +749,60 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, + } + } + +-static void bch2_read_io_err(struct work_struct *work) +-{ +- struct bch_read_bio *rbio = +- container_of(work, struct bch_read_bio, work); +- struct bio *bio = &rbio->bio; +- struct bch_fs *c = rbio->c; +- struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; +- struct printbuf buf = PRINTBUF; +- +- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); +- prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); +- +- if (ca) +- bch_err_ratelimited(ca, "%s", buf.buf); +- else +- bch_err_ratelimited(c, "%s", buf.buf); +- +- printbuf_exit(&buf); +- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); +-} +- + static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) + { + struct bch_fs *c = rbio->c; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; +- struct bch_extent_crc_unpacked new_crc; +- struct btree_iter iter; +- struct bkey_i *new; +- struct bkey_s_c k; + int ret = 0; + + if (crc_is_compressed(rbio->pick.crc)) + return 0; + +- k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, +- BTREE_ITER_slots|BTREE_ITER_intent); ++ CLASS(btree_iter, iter)(trans, rbio->data_btree, rbio->data_pos, BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + if ((ret = bkey_err(k))) +- goto out; ++ return ret; + + if (bversion_cmp(k.k->bversion, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) +- goto out; ++ return 0; + + /* Extent was merged? 
*/ + if (bkey_start_offset(k.k) < data_offset || + k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) +- goto out; ++ return 0; + ++ struct bch_extent_crc_unpacked new_crc; + if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, + rbio->pick.crc, NULL, &new_crc, + bkey_start_offset(k.k) - data_offset, k.k->size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); +- ret = 0; +- goto out; ++ return 0; + } + + /* + * going to be temporarily appending another checksum entry: + */ +- new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + +- sizeof(struct bch_extent_crc128)); ++ struct bkey_i *new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + ++ sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) +- goto out; ++ return ret; + + bkey_reassemble(new, k); + + if (!bch2_bkey_narrow_crcs(new, new_crc)) +- goto out; ++ return 0; + +- ret = bch2_trans_update(trans, &iter, new, +- BTREE_UPDATE_internal_snapshot_node); +-out: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node); + } + + static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) + { +- bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- __bch2_rbio_narrow_crcs(trans, rbio)); +-} +- +-static void bch2_read_csum_err(struct work_struct *work) +-{ +- struct bch_read_bio *rbio = +- container_of(work, struct bch_read_bio, work); +- struct bch_fs *c = rbio->c; +- struct bio *src = &rbio->bio; +- struct bch_extent_crc_unpacked crc = rbio->pick.crc; +- struct nonce nonce = extent_nonce(rbio->version, crc); +- struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); +- struct printbuf buf = PRINTBUF; +- +- bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); +- prt_str(&buf, "data "); +- bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); +- +- struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; +- if (ca) +- bch_err_ratelimited(ca, "%s", buf.buf); +- else +- bch_err_ratelimited(c, "%s", buf.buf); +- +- bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); +- printbuf_exit(&buf); ++ CLASS(btree_trans, trans)(rbio->c); ++ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ++ __bch2_rbio_narrow_crcs(trans, rbio)); + } + + static void bch2_read_decompress_err(struct work_struct *work) +@@ -682,7 +810,7 @@ static void bch2_read_decompress_err(struct work_struct *work) + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "decompression error"); +@@ -694,7 +822,6 @@ static void bch2_read_decompress_err(struct work_struct *work) + bch_err_ratelimited(c, "%s", buf.buf); + + bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); +- printbuf_exit(&buf); + } + + static void bch2_read_decrypt_err(struct work_struct *work) +@@ -702,7 +829,7 @@ static void bch2_read_decrypt_err(struct work_struct *work) + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "decrypt error"); +@@ -714,7 +841,6 @@ static void bch2_read_decrypt_err(struct work_struct *work) + bch_err_ratelimited(c, "%s", buf.buf); + + bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); +- printbuf_exit(&buf); + } + + /* Inner part that may run in process context */ +@@ -837,7 +963,7 @@ static void __bch2_read_endio(struct work_struct *work) + memalloc_nofs_restore(nofs_flags); + return; + csum_err: +- bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); ++ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); + goto out; + decompression_err: + bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); +@@ -863,7 +989,7 @@ static void bch2_read_endio(struct bio *bio) + rbio->bio.bi_end_io = rbio->end_io; + + if (unlikely(bio->bi_status)) { +- bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); ++ bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); + return; + } + +@@ -895,13 +1021,10 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bch_extent_ptr ptr) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; +- struct printbuf buf = PRINTBUF; +- int ret; +- +- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, +- PTR_BUCKET_POS(ca, &ptr), +- BTREE_ITER_cached); ++ CLASS(printbuf, buf)(); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, ++ PTR_BUCKET_POS(ca, &ptr), ++ BTREE_ITER_cached); + + int gen = bucket_gen_get(ca, iter.pos.offset); + if (gen >= 0) { +@@ -913,7 +1036,7 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + + prt_printf(&buf, "memory gen: %u", gen); + +- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter))); ++ int ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); +@@ -931,9 +1054,6 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + } + + bch2_fs_inconsistent(c, "%s", buf.buf); +- 
+- bch2_trans_iter_exit(trans, &iter); +- printbuf_exit(&buf); + } + + int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, +@@ -963,6 +1083,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + bvec_iter_sectors(iter)); + goto out_read_done; + } ++ ++ if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && ++ !orig->data_update) ++ return bch_err_throw(c, extent_poisoned); + retry_pick: + ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); + +@@ -971,30 +1095,38 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + goto hole; + + if (unlikely(ret < 0)) { +- struct printbuf buf = PRINTBUF; ++ if (ret == -BCH_ERR_data_read_csum_err) { ++ int ret2 = maybe_poison_extent(trans, orig, data_btree, k); ++ if (ret2) { ++ ret = ret2; ++ goto err; ++ } ++ ++ trace_and_count(c, io_read_fail_and_poison, &orig->bio); ++ } ++ ++ CLASS(printbuf, buf)(); + bch2_read_err_msg_trans(trans, &buf, orig, read_pos); + prt_printf(&buf, "%s\n ", bch2_err_str(ret)); + bch2_bkey_val_to_text(&buf, c, k); +- + bch_err_ratelimited(c, "%s", buf.buf); +- printbuf_exit(&buf); + goto err; + } + + if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && + !c->chacha20_key_set) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_read_err_msg_trans(trans, &buf, orig, read_pos); + prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_ratelimited(c, "%s", buf.buf); +- printbuf_exit(&buf); +- ret = -BCH_ERR_data_read_no_encryption_key; ++ ret = bch_err_throw(c, data_read_no_encryption_key); + goto err; + } + +- struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); ++ struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, ++ BCH_DEV_READ_REF_io_read); + + /* + * Stale dirty pointers are treated as IO errors, but @failed isn't +@@ -1008,7 +1140,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + unlikely(dev_ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); + bch2_mark_io_failure(failed, &pick, false); +- percpu_ref_put(&ca->io_ref[READ]); ++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); + goto retry_pick; + } + +@@ -1041,8 +1173,9 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + */ + if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { + if (ca) +- percpu_ref_put(&ca->io_ref[READ]); +- rbio->ret = -BCH_ERR_data_read_buffer_too_small; ++ enumerated_ref_put(&ca->io_ref[READ], ++ BCH_DEV_READ_REF_io_read); ++ rbio->ret = bch_err_throw(c, data_read_buffer_too_small); + goto out_read_done; + } + +@@ -1138,6 +1271,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; + rbio->bio.bi_end_io = bch2_read_endio; + ++ async_object_list_add(c, rbio, rbio, &rbio->list_idx); ++ + if (rbio->bounce) + trace_and_count(c, io_read_bounce, &rbio->bio); + +@@ -1171,14 +1306,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + + if (likely(!rbio->pick.do_ec_reconstruct)) { + if (unlikely(!rbio->have_ioref)) { +- struct printbuf buf = PRINTBUF; +- bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); +- prt_printf(&buf, "no device to read from:\n "); +- bch2_bkey_val_to_text(&buf, c, k); +- +- bch_err_ratelimited(c, "%s", buf.buf); +- printbuf_exit(&buf); +- + bch2_rbio_error(rbio, + 
-BCH_ERR_data_read_retry_device_offline, + BLK_STS_IOERR); +@@ -1253,7 +1380,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + * have to signal that: + */ + if (u) +- orig->ret = -BCH_ERR_data_read_key_overwritten; ++ orig->ret = bch_err_throw(c, data_read_key_overwritten); + + zero_fill_bio_iter(&orig->bio, iter); + out_read_done: +@@ -1265,23 +1392,25 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + + int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, subvol_inum inum, +- struct bch_io_failures *failed, unsigned flags) ++ struct bch_io_failures *failed, ++ struct bkey_buf *prev_read, ++ unsigned flags) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; ++ enum btree_id data_btree; + int ret; + + EBUG_ON(rbio->data_update); + + bch2_bkey_buf_init(&sk); +- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, +- POS(inum.inum, bvec_iter.bi_sector), +- BTREE_ITER_slots); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_extents, ++ POS(inum.inum, bvec_iter.bi_sector), ++ BTREE_ITER_slots); + + while (1) { +- enum btree_id data_btree = BTREE_ID_extents; ++ data_btree = BTREE_ID_extents; + + bch2_trans_begin(trans); + +@@ -1290,12 +1419,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, + if (ret) + goto err; + +- bch2_btree_iter_set_snapshot(trans, &iter, snapshot); ++ bch2_btree_iter_set_snapshot(&iter, snapshot); + +- bch2_btree_iter_set_pos(trans, &iter, ++ bch2_btree_iter_set_pos(&iter, + POS(inum.inum, bvec_iter.bi_sector)); + +- k = bch2_btree_iter_peek_slot(trans, &iter); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -1313,6 +1442,12 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, + + k = bkey_i_to_s_c(sk.k); + ++ if (unlikely(flags & BCH_READ_in_retry)) { ++ if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k))) ++ failed->nr = 0; ++ bch2_bkey_buf_copy(prev_read, c, sk.k); ++ } ++ + /* + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: +@@ -1347,17 +1482,14 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, + break; + } + +- bch2_trans_iter_exit(trans, &iter); +- + if (unlikely(ret)) { + if (ret != -BCH_ERR_extent_poisoned) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, + bvec_iter.bi_sector << 9)); + prt_printf(&buf, "data read error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); +- printbuf_exit(&buf); + } + + rbio->bio.bi_status = BLK_STS_IOERR; +@@ -1371,26 +1503,90 @@ int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, + return ret; + } + ++static const char * const bch2_read_bio_flags[] = { ++#define x(n) #n, ++ BCH_READ_FLAGS() ++#undef x ++ NULL ++}; ++ ++void bch2_read_bio_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_read_bio *rbio) ++{ ++ if (!out->nr_tabstops) ++ printbuf_tabstop_push(out, 20); ++ ++ bch2_read_err_msg(c, out, rbio, rbio->read_pos); ++ prt_newline(out); ++ ++ /* Are we in a retry? 
*/ ++ ++ printbuf_indent_add(out, 2); ++ ++ u64 now = local_clock(); ++ prt_printf(out, "start_time:\t"); ++ bch2_pr_time_units(out, max_t(s64, 0, now - rbio->start_time)); ++ prt_newline(out); ++ ++ prt_printf(out, "submit_time:\t"); ++ bch2_pr_time_units(out, max_t(s64, 0, now - rbio->submit_time)); ++ prt_newline(out); ++ ++ if (!rbio->split) ++ prt_printf(out, "end_io:\t%ps\n", rbio->end_io); ++ else ++ prt_printf(out, "parent:\t%px\n", rbio->parent); ++ ++ prt_printf(out, "promote:\t%u\n", rbio->promote); ++ prt_printf(out, "bounce:\t%u\n", rbio->bounce); ++ prt_printf(out, "split:\t%u\n", rbio->split); ++ prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref); ++ prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs); ++ prt_printf(out, "context:\t%u\n", rbio->context); ++ ++ int ret = READ_ONCE(rbio->ret); ++ if (ret < 0) ++ prt_printf(out, "ret:\t%s\n", bch2_err_str(ret)); ++ else ++ prt_printf(out, "ret:\t%i\n", ret); ++ ++ prt_printf(out, "flags:\t"); ++ bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags); ++ prt_newline(out); ++ ++ bch2_bio_to_text(out, &rbio->bio); ++ printbuf_indent_sub(out, 2); ++} ++ + void bch2_fs_io_read_exit(struct bch_fs *c) + { + if (c->promote_table.tbl) + rhashtable_destroy(&c->promote_table); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); ++ mempool_exit(&c->bio_bounce_pages); + } + + int bch2_fs_io_read_init(struct bch_fs *c) + { ++ if (mempool_init_page_pool(&c->bio_bounce_pages, ++ max_t(unsigned, ++ c->opts.btree_node_size, ++ c->opts.encoded_extent_max) / ++ PAGE_SIZE, 0)) ++ return bch_err_throw(c, ENOMEM_bio_bounce_pages_init); ++ + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) +- return -BCH_ERR_ENOMEM_bio_read_init; ++ return bch_err_throw(c, ENOMEM_bio_read_init); + + if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) +- return -BCH_ERR_ENOMEM_bio_read_split_init; ++ return bch_err_throw(c, ENOMEM_bio_read_split_init); + + if (rhashtable_init(&c->promote_table, &bch_promote_params)) +- return -BCH_ERR_ENOMEM_promote_table_init; ++ return bch_err_throw(c, ENOMEM_promote_table_init); + + return 0; + } +diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h +index c78025d863e0..1e1c0476bd03 100644 +--- a/fs/bcachefs/io_read.h ++++ b/fs/bcachefs/io_read.h +@@ -4,8 +4,13 @@ + + #include "bkey_buf.h" + #include "btree_iter.h" ++#include "extents_types.h" + #include "reflink.h" + ++#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT ++void bch2_dev_congested_to_text(struct printbuf *, struct bch_dev *); ++#endif ++ + struct bch_read_bio { + struct bch_fs *c; + u64 start_time; +@@ -43,11 +48,15 @@ struct bch_read_bio { + have_ioref:1, + narrow_crcs:1, + saw_error:1, ++ self_healing:1, + context:2; + }; + u16 _state; + }; + s16 ret; ++#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS ++ unsigned list_idx; ++#endif + + struct extent_ptr_decoded pick; + +@@ -87,6 +96,8 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, + return 0; + + *data_btree = BTREE_ID_reflink; ++ ++ struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, + offset_into_extent, +@@ -97,12 +108,12 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, + return ret; + + if (bkey_deleted(k.k)) { +- bch2_trans_iter_exit(trans, &iter); +- return -BCH_ERR_missing_indirect_extent; ++ bch2_trans_iter_exit(&iter); ++ return bch_err_throw(c, missing_indirect_extent); + } + +- 
bch2_bkey_buf_reassemble(extent, trans->c, k); +- bch2_trans_iter_exit(trans, &iter); ++ bch2_bkey_buf_reassemble(extent, c, k); ++ bch2_trans_iter_exit(&iter); + return 0; + } + +@@ -140,11 +151,12 @@ static inline void bch2_read_extent(struct btree_trans *trans, + int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags, -1); + /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */ +- WARN(ret, "unhandled error from __bch2_read_extent()"); ++ WARN(ret, "unhandled error from __bch2_read_extent(): %s", bch2_err_str(ret)); + } + + int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, +- subvol_inum, struct bch_io_failures *, unsigned flags); ++ subvol_inum, ++ struct bch_io_failures *, struct bkey_buf *, unsigned flags); + + static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum) +@@ -153,11 +165,11 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + + rbio->subvol = inum.subvol; + +- bch2_trans_run(c, +- __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, +- BCH_READ_retry_if_stale| +- BCH_READ_may_promote| +- BCH_READ_user_mapped)); ++ CLASS(btree_trans, trans)(c); ++ __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL, ++ BCH_READ_retry_if_stale| ++ BCH_READ_may_promote| ++ BCH_READ_user_mapped); + } + + static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, +@@ -172,6 +184,9 @@ static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, + rbio->split = true; + rbio->parent = orig; + rbio->opts = orig->opts; ++#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS ++ rbio->list_idx = 0; ++#endif + return rbio; + } + +@@ -189,9 +204,16 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, + rbio->ret = 0; + rbio->opts = opts; + rbio->bio.bi_end_io = end_io; ++#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS ++ rbio->list_idx = 0; ++#endif + return rbio; + } + ++struct promote_op; ++void bch2_promote_op_to_text(struct printbuf *, struct bch_fs *, struct promote_op *); ++void bch2_read_bio_to_text(struct printbuf *, struct bch_fs *, struct bch_read_bio *); ++ + void bch2_fs_io_read_exit(struct bch_fs *); + int bch2_fs_io_read_init(struct bch_fs *); + +diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c +index c1237da079ed..1d83dcc9731e 100644 +--- a/fs/bcachefs/io_write.c ++++ b/fs/bcachefs/io_write.c +@@ -6,6 +6,7 @@ + + #include "bcachefs.h" + #include "alloc_foreground.h" ++#include "async_objs.h" + #include "bkey_buf.h" + #include "bset.h" + #include "btree_update.h" +@@ -15,6 +16,7 @@ + #include "compress.h" + #include "debug.h" + #include "ec.h" ++#include "enumerated_ref.h" + #include "error.h" + #include "extent_update.h" + #include "inode.h" +@@ -30,6 +32,7 @@ + #include "trace.h" + + #include ++#include + #include + #include + #include +@@ -52,14 +55,9 @@ static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, + s64 latency_over = io_latency - latency_threshold; + + if (latency_threshold && latency_over > 0) { +- /* +- * bump up congested by approximately latency_over * 4 / +- * latency_threshold - we don't need much accuracy here so don't +- * bother with the divide: +- */ + if (atomic_read(&ca->congested) < CONGESTED_MAX) +- atomic_add(latency_over >> +- max_t(int, ilog2(latency_threshold) - 2, 0), ++ atomic_add((u32) min(U32_MAX, io_latency * 2) / ++ (u32) min(U32_MAX, latency_threshold), + &ca->congested); + + ca->congested_last = now; +@@ -91,7 
+89,12 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) + new = ewma_add(old, io_latency, 5); + } while (!atomic64_try_cmpxchg(latency, &old, new)); + +- bch2_congested_acct(ca, io_latency, now, rw); ++ /* ++ * Only track read latency for congestion accounting: writes are subject ++ * to heavy queuing delays from page cache writeback: ++ */ ++ if (rw == READ) ++ bch2_congested_acct(ca, io_latency, now, rw); + + __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); + } +@@ -168,9 +171,9 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + *i_sectors_delta = 0; + *disk_sectors_delta = 0; + +- bch2_trans_copy_iter(trans, &iter, extent_iter); ++ bch2_trans_copy_iter(&iter, extent_iter); + +- for_each_btree_key_max_continue_norestart(trans, iter, ++ for_each_btree_key_max_continue_norestart(iter, + new->k.p, BTREE_ITER_slots, old, ret) { + s64 sectors = min(new->k.p.offset, old.k->p.offset) - + max(bkey_start_offset(&new->k), +@@ -195,7 +198,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + break; + } + +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -217,13 +220,13 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + */ + unsigned inode_update_flags = BTREE_UPDATE_nojournal; + +- struct btree_iter iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, +- SPOS(0, +- extent_iter->pos.inode, +- extent_iter->snapshot), +- BTREE_ITER_intent| +- BTREE_ITER_cached); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, ++ SPOS(0, ++ extent_iter->pos.inode, ++ extent_iter->snapshot), ++ BTREE_ITER_intent| ++ BTREE_ITER_cached); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (unlikely(ret)) + return ret; +@@ -235,7 +238,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + struct bkey_i *k_mut = bch2_trans_kmalloc_nomemzero(trans, bkey_bytes(k.k) + 8); + ret = PTR_ERR_OR_ZERO(k_mut); + if (unlikely(ret)) +- goto err; ++ return ret; + + bkey_reassemble(k_mut, k); + +@@ -243,7 +246,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + k_mut = bch2_inode_to_v3(trans, k_mut); + ret = PTR_ERR_OR_ZERO(k_mut); + if (unlikely(ret)) +- goto err; ++ return ret; + } + + struct bkey_i_inode_v3 *inode = bkey_i_to_inode_v3(k_mut); +@@ -258,17 +261,14 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors); + if (unlikely(bi_sectors + i_sectors_delta < 0)) { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0", + extent_iter->pos.inode, bi_sectors, i_sectors_delta); + +- bool repeat = false, print = false, suppress = false; +- bch2_count_fsck_err(c, inode_i_sectors_underflow, buf.buf, +- &repeat, &print, &suppress); ++ bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf); + if (print) +- bch2_print_str(c, buf.buf); +- printbuf_exit(&buf); ++ bch2_print_str(c, KERN_ERR, buf.buf); + + if (i_sectors_delta < 0) + i_sectors_delta = -bi_sectors; +@@ -280,17 +280,20 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + inode_update_flags = 0; + } + ++ /* ++ * extents, dirents and xattrs updates require that an inode update also ++ * happens - to ensure that if a key exists in one of those btrees with ++ 
* a given snapshot ID an inode is also present - so we may have to skip ++ * the nojournal optimization: ++ */ + if (inode->k.p.snapshot != iter.snapshot) { + inode->k.p.snapshot = iter.snapshot; + inode_update_flags = 0; + } + +- ret = bch2_trans_update(trans, &iter, &inode->k_i, +- BTREE_UPDATE_internal_snapshot_node| +- inode_update_flags); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return bch2_trans_update(trans, &iter, &inode->k_i, ++ BTREE_UPDATE_internal_snapshot_node| ++ inode_update_flags); + } + + int bch2_extent_update(struct btree_trans *trans, +@@ -313,7 +316,7 @@ int bch2_extent_update(struct btree_trans *trans, + * path already traversed at iter->pos because + * bch2_trans_extent_update() will use it to attempt extent merging + */ +- ret = __bch2_btree_iter_traverse(trans, iter); ++ ret = __bch2_btree_iter_traverse(iter); + if (ret) + return ret; + +@@ -358,7 +361,7 @@ int bch2_extent_update(struct btree_trans *trans, + + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; +- bch2_btree_iter_set_pos(trans, iter, next_pos); ++ bch2_btree_iter_set_pos(iter, next_pos); + return 0; + } + +@@ -368,8 +371,6 @@ static int bch2_write_index_default(struct bch_write_op *op) + struct bkey_buf sk; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); +- struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; + subvol_inum inum = { + .subvol = op->subvol, + .inum = k->k.p.inode, +@@ -378,6 +379,7 @@ static int bch2_write_index_default(struct bch_write_op *op) + + BUG_ON(!inum.subvol); + ++ CLASS(btree_trans, trans)(c); + bch2_bkey_buf_init(&sk); + + do { +@@ -393,16 +395,14 @@ static int bch2_write_index_default(struct bch_write_op *op) + if (ret) + break; + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, +- bkey_start_pos(&sk.k->k), +- BTREE_ITER_slots|BTREE_ITER_intent); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_extents, ++ bkey_start_pos(&sk.k->k), ++ BTREE_ITER_slots|BTREE_ITER_intent); + +- ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?: +- bch2_extent_update(trans, inum, &iter, sk.k, ++ ret = bch2_extent_update(trans, inum, &iter, sk.k, + &op->res, + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_check_enospc); +- bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; +@@ -415,7 +415,6 @@ static int bch2_write_index_default(struct bch_write_op *op) + bch2_cut_front(iter.pos, k); + } while (!bch2_keylist_empty(keys)); + +- bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; +@@ -425,7 +424,7 @@ static int bch2_write_index_default(struct bch_write_op *op) + + void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) + { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + if (op->subvol) { + bch2_inum_offset_err_msg(op->c, &buf, +@@ -452,7 +451,6 @@ void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, . + } + + bch_err_ratelimited(op->c, "%s", buf.buf); +- printbuf_exit(&buf); + } + + void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, +@@ -462,9 +460,17 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); + struct bch_write_bio *n; ++ unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE; ++ unsigned ref_idx = type == BCH_DATA_btree ++ ? 
(unsigned) BCH_DEV_READ_REF_btree_node_write ++ : (unsigned) BCH_DEV_WRITE_REF_io_write; + + BUG_ON(c->opts.nochanges); + ++ const struct bch_extent_ptr *last = NULL; ++ bkey_for_each_ptr(ptrs, ptr) ++ last = ptr; ++ + bkey_for_each_ptr(ptrs, ptr) { + /* + * XXX: btree writes should be using io_ref[WRITE], but we +@@ -473,9 +479,9 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, + */ + struct bch_dev *ca = nocow + ? bch2_dev_have_ref(c, ptr->dev) +- : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? READ : WRITE); ++ : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx); + +- if (to_entry(ptr + 1) < ptrs.end) { ++ if (ptr != last) { + n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); + + n->bio.bi_end_io = wbio->bio.bi_end_io; +@@ -533,17 +539,19 @@ static void bch2_write_done(struct closure *cl) + bch2_disk_reservation_put(c, &op->res); + + if (!(op->flags & BCH_WRITE_move)) +- bch2_write_ref_put(c, BCH_WRITE_REF_write); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_write); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + + EBUG_ON(cl->parent); + closure_debug_destroy(cl); ++ async_object_list_del(c, write_op, op->list_idx); + if (op->end_io) + op->end_io(op); + } + + static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) + { ++ struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + struct bkey_i *src, *dst = keys->keys, *n; + +@@ -555,7 +563,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) + test_bit(ptr->dev, op->failed.d)); + + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) +- return -BCH_ERR_data_write_io; ++ return bch_err_throw(c, data_write_io); + } + + if (dst != src) +@@ -748,7 +756,8 @@ static void bch2_write_endio(struct bio *bio) + } + + if (wbio->have_ioref) +- percpu_ref_put(&ca->io_ref[WRITE]); ++ enumerated_ref_put(&ca->io_ref[WRITE], ++ BCH_DEV_WRITE_REF_io_write); + + if (wbio->bounce) + bch2_bio_free_pages_pool(c, bio); +@@ -784,6 +793,9 @@ static void init_append_extent(struct bch_write_op *op, + bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, + op->flags & BCH_WRITE_cached); + ++ if (!(op->flags & BCH_WRITE_move)) ++ bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i); ++ + bch2_keylist_push(&op->insert_keys); + } + +@@ -958,7 +970,7 @@ static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct + op->crc.csum_type < BCH_CSUM_NR + ? 
__bch2_csum_types[op->crc.csum_type] + : "(unknown)"); +- return -BCH_ERR_data_write_csum; ++ return bch_err_throw(c, data_write_csum); + } + + static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, +@@ -1190,22 +1202,20 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, + + e = bkey_s_c_to_extent(k); + +- rcu_read_lock(); ++ guard(rcu)(); + extent_for_each_ptr_decode(e, p, entry) { +- if (crc_is_encoded(p.crc) || p.has_ec) { +- rcu_read_unlock(); ++ if (crc_is_encoded(p.crc) || p.has_ec) + return false; +- } + + replicas += bch2_extent_ptr_durability(c, &p); + } +- rcu_read_unlock(); + + return replicas >= op->opts.data_replicas; + } + + static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, + struct btree_iter *iter, ++ struct bch_write_op *op, + struct bkey_i *orig, + struct bkey_s_c k, + u64 new_i_size) +@@ -1215,11 +1225,13 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, + return 0; + } + +- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ++ struct bkey_i *new = bch2_trans_kmalloc_nomemzero(trans, ++ bkey_bytes(k.k) + sizeof(struct bch_extent_rebalance)); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + ++ bkey_reassemble(new, k); + bch2_cut_front(bkey_start_pos(&orig->k), new); + bch2_cut_back(orig->k.p, new); + +@@ -1227,6 +1239,8 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, + bkey_for_each_ptr(ptrs, ptr) + ptr->unwritten = 0; + ++ bch2_bkey_set_needs_rebalance(op->c, &op->opts, new); ++ + /* + * Note that we're not calling bch2_subvol_get_snapshot() in this path - + * that was done when we kicked off the write, and here it's important +@@ -1251,7 +1265,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) + bkey_start_pos(&orig->k), orig->k.p, + BTREE_ITER_intent, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ +- bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); ++ bch2_nocow_write_convert_one_unwritten(trans, &iter, op, orig, k, op->new_i_size); + })); + if (ret) + break; +@@ -1272,7 +1286,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) + static void __bch2_nocow_write_done(struct bch_write_op *op) + { + if (unlikely(op->flags & BCH_WRITE_io_error)) { +- op->error = -BCH_ERR_data_write_io; ++ op->error = bch_err_throw(op->c, data_write_io); + } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) + bch2_nocow_write_convert_unwritten(op); + } +@@ -1326,7 +1340,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + if (ret) + break; + +- k = bch2_btree_iter_peek_slot(trans, &iter); ++ k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; +@@ -1345,7 +1359,8 @@ static void bch2_nocow_write(struct bch_write_op *op) + /* Get iorefs before dropping btree locks: */ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) { +- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); ++ struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, ++ BCH_DEV_WRITE_REF_io_write); + if (unlikely(!ca)) + goto err_get_ioref; + +@@ -1410,10 +1425,10 @@ static void bch2_nocow_write(struct bch_write_op *op) + bch2_keylist_push(&op->insert_keys); + if (op->flags & BCH_WRITE_submitted) + break; +- bch2_btree_iter_advance(trans, &iter); ++ bch2_btree_iter_advance(&iter); + } + out: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + err: + if (bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) + goto retry; +@@ -1447,7 +1462,8 @@ static void bch2_nocow_write(struct bch_write_op *op) + return; + err_get_ioref: + darray_for_each(buckets, i) +- percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE]); ++ enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE], ++ BCH_DEV_WRITE_REF_io_write); + + /* Fall back to COW path: */ + goto out; +@@ -1458,17 +1474,16 @@ static void bch2_nocow_write(struct bch_write_op *op) + break; + } + +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + if (bch2_fs_inconsistent_on(stale < 0, c, + "pointer to invalid bucket in nocow path on device %llu\n %s", + stale_at->b.inode, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { +- ret = -BCH_ERR_data_write_invalid_ptr; ++ ret = bch_err_throw(c, data_write_invalid_ptr); + } else { + /* We can retry this: */ +- ret = -BCH_ERR_transaction_restart; ++ ret = bch_err_throw(c, transaction_restart); + } +- printbuf_exit(&buf); + + goto err_get_ioref; + } +@@ -1512,7 +1527,7 @@ static void __bch2_write(struct bch_write_op *op) + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ +- ret = bch2_trans_run(c, lockrestart_do(trans, ++ ret = bch2_trans_do(c, + bch2_alloc_sectors_start_trans(trans, + op->target, + op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), +@@ -1522,7 +1537,7 @@ static void __bch2_write(struct bch_write_op *op) + op->nr_replicas_required, + op->watermark, + op->flags, +- &op->cl, &wp))); ++ &op->cl, &wp)); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + break; +@@ -1661,6 +1676,8 @@ CLOSURE_CALLBACK(bch2_write) + BUG_ON(!op->write_point.v); + BUG_ON(bkey_eq(op->pos, POS_MAX)); + ++ async_object_list_add(c, write_op, op, &op->list_idx); ++ + if (op->flags & BCH_WRITE_only_specified_devs) + op->flags |= BCH_WRITE_alloc_nowait; + +@@ -1671,18 +1688,18 @@ CLOSURE_CALLBACK(bch2_write) + + if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { + bch2_write_op_error(op, op->pos.offset, "misaligned write"); +- op->error = -BCH_ERR_data_write_misaligned; ++ op->error = bch_err_throw(c, data_write_misaligned); + goto err; + } + + if (c->opts.nochanges) { +- op->error = -BCH_ERR_erofs_no_writes; ++ op->error = bch_err_throw(c, erofs_no_writes); + goto err; + } + + if (!(op->flags & BCH_WRITE_move) && +- !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { +- op->error = -BCH_ERR_erofs_no_writes; ++ !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) { ++ op->error = bch_err_throw(c, erofs_no_writes); + goto err; + } + +@@ -1705,6 +1722,7 @@ CLOSURE_CALLBACK(bch2_write) + bch2_disk_reservation_put(c, &op->res); + + closure_debug_destroy(&op->cl); ++ async_object_list_del(c, write_op, op->list_idx); + if (op->end_io) + op->end_io(op); + } +@@ -1738,13 +1756,13 @@ void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) + prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); + + prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); ++ prt_printf(out, "ret\t%s\n", bch2_err_str(op->error)); + + printbuf_indent_sub(out, 2); + } + + void bch2_fs_io_write_exit(struct bch_fs *c) + { +- mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->replica_set); + bioset_exit(&c->bio_write); + } +@@ -1753,14 +1771,7 @@ int bch2_fs_io_write_init(struct bch_fs *c) + { + if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || + bioset_init(&c->replica_set, 4, 
offsetof(struct bch_write_bio, bio), 0)) +- return -BCH_ERR_ENOMEM_bio_write_init; +- +- if (mempool_init_page_pool(&c->bio_bounce_pages, +- max_t(unsigned, +- c->opts.btree_node_size, +- c->opts.encoded_extent_max) / +- PAGE_SIZE, 0)) +- return -BCH_ERR_ENOMEM_bio_bounce_pages_init; ++ return bch_err_throw(c, ENOMEM_bio_write_init); + + return 0; + } +diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h +index b8ab19a1e1da..2c0a8f35ee1f 100644 +--- a/fs/bcachefs/io_write.h ++++ b/fs/bcachefs/io_write.h +@@ -17,34 +17,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + __printf(3, 4) + void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); + +-#define BCH_WRITE_FLAGS() \ +- x(alloc_nowait) \ +- x(cached) \ +- x(data_encoded) \ +- x(pages_stable) \ +- x(pages_owned) \ +- x(only_specified_devs) \ +- x(wrote_data_inline) \ +- x(check_enospc) \ +- x(sync) \ +- x(move) \ +- x(in_worker) \ +- x(submitted) \ +- x(io_error) \ +- x(convert_unwritten) +- +-enum __bch_write_flags { +-#define x(f) __BCH_WRITE_##f, +- BCH_WRITE_FLAGS() +-#undef x +-}; +- +-enum bch_write_flags { +-#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), +- BCH_WRITE_FLAGS() +-#undef x +-}; +- + static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) + { + return op->watermark == BCH_WATERMARK_copygc +diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h +index 3ef6df9145ef..5da4eb8bb6f6 100644 +--- a/fs/bcachefs/io_write_types.h ++++ b/fs/bcachefs/io_write_types.h +@@ -13,6 +13,34 @@ + #include + #include + ++#define BCH_WRITE_FLAGS() \ ++ x(alloc_nowait) \ ++ x(cached) \ ++ x(data_encoded) \ ++ x(pages_stable) \ ++ x(pages_owned) \ ++ x(only_specified_devs) \ ++ x(wrote_data_inline) \ ++ x(check_enospc) \ ++ x(sync) \ ++ x(move) \ ++ x(in_worker) \ ++ x(submitted) \ ++ x(io_error) \ ++ x(convert_unwritten) ++ ++enum __bch_write_flags { ++#define x(f) __BCH_WRITE_##f, ++ BCH_WRITE_FLAGS() ++#undef x ++}; ++ ++enum bch_write_flags { ++#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), ++ BCH_WRITE_FLAGS() ++#undef x ++}; ++ + struct bch_write_bio { + struct_group(wbio, + struct bch_fs *c; +@@ -43,6 +71,10 @@ struct bch_write_op { + void (*end_io)(struct bch_write_op *); + u64 start_time; + ++#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS ++ unsigned list_idx; ++#endif ++ + unsigned written; /* sectors */ + u16 flags; + s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index bb45d3634194..07869436a964 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -12,6 +12,7 @@ + #include "btree_update.h" + #include "btree_write_buffer.h" + #include "buckets.h" ++#include "enumerated_ref.h" + #include "error.h" + #include "journal.h" + #include "journal_io.h" +@@ -87,7 +88,7 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 + static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) + { + lockdep_assert_held(&j->lock); +- out->atomic++; ++ guard(printbuf_atomic)(out); + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 24); +@@ -97,8 +98,6 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) + seq++) + bch2_journal_buf_to_text(out, j, seq); + prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? 
"open" : "closed"); +- +- --out->atomic; + } + + static inline struct journal_buf * +@@ -139,9 +138,9 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool stuck = false; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + +- buf.atomic++; ++ guard(printbuf_atomic)(&buf); + + if (!(error == -BCH_ERR_journal_full || + error == -BCH_ERR_journal_pin_full) || +@@ -149,36 +148,31 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) + return stuck; + +- spin_lock(&j->lock); ++ scoped_guard(spinlock, &j->lock) { ++ if (j->can_discard) ++ return stuck; + +- if (j->can_discard) { +- spin_unlock(&j->lock); +- return stuck; +- } ++ stuck = true; + +- stuck = true; ++ /* ++ * The journal shutdown path will set ->err_seq, but do it here first to ++ * serialize against concurrent failures and avoid duplicate error ++ * reports. ++ */ ++ if (j->err_seq) ++ return stuck; + +- /* +- * The journal shutdown path will set ->err_seq, but do it here first to +- * serialize against concurrent failures and avoid duplicate error +- * reports. +- */ +- if (j->err_seq) { +- spin_unlock(&j->lock); +- return stuck; +- } +- j->err_seq = journal_cur_seq(j); ++ j->err_seq = journal_cur_seq(j); + +- __bch2_journal_debug_to_text(&buf, j); +- spin_unlock(&j->lock); ++ __bch2_journal_debug_to_text(&buf, j); ++ } + prt_printf(&buf, bch2_fmt(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)"), + bch2_err_str(error)); +- bch2_print_string_as_lines(KERN_ERR, buf.buf); ++ bch2_print_str(c, KERN_ERR, buf.buf); + + printbuf_reset(&buf); + bch2_journal_pins_to_text(&buf, j); + bch_err(c, "Journal pins:\n%s", buf.buf); +- printbuf_exit(&buf); + + bch2_fatal_error(c); + dump_stack(); +@@ -188,6 +182,8 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) + + void bch2_journal_do_writes(struct journal *j) + { ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { +@@ -202,6 +198,7 @@ void bch2_journal_do_writes(struct journal *j) + if (!journal_state_seq_count(j, j->reservations, seq)) { + j->seq_write_started = seq; + w->write_started = true; ++ closure_get(&c->cl); + closure_call(&w->io, bch2_journal_write, j->wq, NULL); + } + +@@ -268,22 +265,21 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t + buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + + if (trace_journal_entry_close_enabled() && trace) { +- struct printbuf pbuf = PRINTBUF; +- pbuf.atomic++; +- +- prt_str(&pbuf, "entry size: "); +- prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data)); +- prt_newline(&pbuf); +- bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT); +- trace_journal_entry_close(c, pbuf.buf); +- printbuf_exit(&pbuf); ++ CLASS(printbuf, err)(); ++ guard(printbuf_atomic)(&err); ++ ++ prt_str(&err, "entry size: "); ++ prt_human_readable_u64(&err, vstruct_bytes(buf->data)); ++ prt_newline(&err); ++ bch2_prt_task_backtrace(&err, current, 1, GFP_NOWAIT); ++ trace_journal_entry_close(c, err.buf); + } + + sectors = vstruct_blocks_plus(buf->data, c->block_bits, + buf->u64s_reserved) << c->block_bits; + if (unlikely(sectors > buf->sectors)) { +- struct printbuf err = PRINTBUF; +- err.atomic++; ++ CLASS(printbuf, err)(); ++ guard(printbuf_atomic)(&err); + + prt_printf(&err, "journal entry overran reserved 
space: %u > %u\n", + sectors, buf->sectors); +@@ -295,7 +291,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t + bch2_journal_halt_locked(j); + + bch_err(c, "%s", err.buf); +- printbuf_exit(&err); + return; + } + +@@ -331,16 +326,6 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t + __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); + } + +-void bch2_journal_halt(struct journal *j) +-{ +- spin_lock(&j->lock); +- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); +- if (!j->err_seq) +- j->err_seq = journal_cur_seq(j); +- journal_wake(j); +- spin_unlock(&j->lock); +-} +- + void bch2_journal_halt_locked(struct journal *j) + { + lockdep_assert_held(&j->lock); +@@ -351,6 +336,12 @@ void bch2_journal_halt_locked(struct journal *j) + journal_wake(j); + } + ++void bch2_journal_halt(struct journal *j) ++{ ++ guard(spinlock)(&j->lock); ++ bch2_journal_halt_locked(j); ++} ++ + static bool journal_entry_want_write(struct journal *j) + { + bool ret = !journal_entry_is_open(j) || +@@ -373,13 +364,8 @@ static bool journal_entry_want_write(struct journal *j) + + bool bch2_journal_entry_close(struct journal *j) + { +- bool ret; +- +- spin_lock(&j->lock); +- ret = journal_entry_want_write(j); +- spin_unlock(&j->lock); +- +- return ret; ++ guard(spinlock)(&j->lock); ++ return journal_entry_want_write(j); + } + + /* +@@ -396,10 +382,10 @@ static int journal_entry_open(struct journal *j) + + lockdep_assert_held(&j->lock); + BUG_ON(journal_entry_is_open(j)); +- BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); ++ BUG_ON(c->sb.clean); + + if (j->blocked) +- return -BCH_ERR_journal_blocked; ++ return bch_err_throw(c, journal_blocked); + + if (j->cur_entry_error) + return j->cur_entry_error; +@@ -409,23 +395,23 @@ static int journal_entry_open(struct journal *j) + return ret; + + if (!fifo_free(&j->pin)) +- return -BCH_ERR_journal_pin_full; ++ return bch_err_throw(c, journal_pin_full); + + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) +- return -BCH_ERR_journal_max_in_flight; ++ return bch_err_throw(c, journal_max_in_flight); + + if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) +- return -BCH_ERR_journal_max_open; ++ return bch_err_throw(c, journal_max_open); + +- if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { ++ if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) { + bch_err(c, "cannot start: journal seq overflow"); + if (bch2_fs_emergency_read_only_locked(c)) + bch_err(c, "fatal error - emergency read only"); +- return -BCH_ERR_journal_shutdown; ++ return bch_err_throw(c, journal_shutdown); + } + + if (!j->free_buf && !buf->data) +- return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */ ++ return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */ + + BUG_ON(!j->cur_entry_sectors); + +@@ -449,7 +435,7 @@ static int journal_entry_open(struct journal *j) + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + + if (u64s <= (ssize_t) j->early_journal_entries.nr) +- return -BCH_ERR_journal_full; ++ return bch_err_throw(c, journal_full); + + if (fifo_empty(&j->pin) && j->reclaim_thread) + wake_up_process(j->reclaim_thread); +@@ -461,6 +447,14 @@ static int journal_entry_open(struct journal *j) + atomic64_inc(&j->seq); + journal_pin_list_init(fifo_push_ref(&j->pin), 1); + ++ if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) { ++ bch_err(c, "attempting to open blacklisted journal seq %llu", ++ 
journal_cur_seq(j)); ++ if (bch2_fs_emergency_read_only_locked(c)) ++ bch_err(c, "fatal error - emergency read only"); ++ return bch_err_throw(c, journal_shutdown); ++ } ++ + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + + BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); +@@ -536,7 +530,7 @@ static void journal_write_work(struct work_struct *work) + { + struct journal *j = container_of(work, struct journal, write_work.work); + +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); + if (__journal_entry_is_open(j->reservations)) { + long delta = journal_cur_buf(j)->expires - jiffies; + +@@ -545,7 +539,6 @@ static void journal_write_work(struct work_struct *work) + else + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); + } +- spin_unlock(&j->lock); + } + + static void journal_buf_prealloc(struct journal *j) +@@ -591,16 +584,16 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, + return ret; + + if (j->blocked) +- return -BCH_ERR_journal_blocked; ++ return bch_err_throw(c, journal_blocked); + + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { +- ret = -BCH_ERR_journal_full; ++ ret = bch_err_throw(c, journal_full); + can_discard = j->can_discard; + goto out; + } + + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { +- ret = -BCH_ERR_journal_max_in_flight; ++ ret = bch_err_throw(c, journal_max_in_flight); + goto out; + } + +@@ -641,39 +634,37 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, + goto retry; + + if (journal_error_check_stuck(j, ret, flags)) +- ret = -BCH_ERR_journal_stuck; ++ ret = bch_err_throw(c, journal_stuck); + + if (ret == -BCH_ERR_journal_max_in_flight && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && + trace_journal_entry_full_enabled()) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_printbuf_make_room(&buf, 4096); + +- spin_lock(&j->lock); +- prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); +- bch2_journal_bufs_to_text(&buf, j); +- spin_unlock(&j->lock); ++ scoped_guard(spinlock, &j->lock) { ++ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); ++ bch2_journal_bufs_to_text(&buf, j); ++ } + + trace_journal_entry_full(c, buf.buf); +- printbuf_exit(&buf); + count_event(c, journal_entry_full); + } + + if (ret == -BCH_ERR_journal_max_open && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && + trace_journal_entry_full_enabled()) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_printbuf_make_room(&buf, 4096); + +- spin_lock(&j->lock); +- prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); +- bch2_journal_bufs_to_text(&buf, j); +- spin_unlock(&j->lock); ++ scoped_guard(spinlock, &j->lock) { ++ prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); ++ bch2_journal_bufs_to_text(&buf, j); ++ } + + trace_journal_entry_full(c, buf.buf); +- printbuf_exit(&buf); + count_event(c, journal_entry_full); + } + +@@ -702,7 +693,8 @@ static unsigned max_dev_latency(struct bch_fs *c) + { + u64 nsecs = 0; + +- for_each_rw_member(c, ca) ++ guard(rcu)(); ++ for_each_rw_member_rcu(c, ca) + nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); + + return nsecs_to_jiffies(nsecs); +@@ -744,11 +736,10 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, + remaining_wait)) + return ret; + +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_journal_debug_to_text(&buf, j); +- bch2_print_string_as_lines(KERN_ERR, buf.buf); 
++ bch2_print_str(c, KERN_ERR, buf.buf); + prt_printf(&buf, bch2_fmt(c, "Journal stuck? Waited for 10 seconds, err %s"), bch2_err_str(ret)); +- printbuf_exit(&buf); + + closure_wait_event(&j->async_wait, + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || +@@ -765,11 +756,13 @@ void bch2_journal_entry_res_resize(struct journal *j, + union journal_res_state state; + int d = new_u64s - res->u64s; + +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); ++ ++ j->entry_u64s_reserved += d; ++ res->u64s += d; + +- j->entry_u64s_reserved += d; + if (d <= 0) +- goto out; ++ return; + + j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); + state = READ_ONCE(j->reservations); +@@ -784,9 +777,6 @@ void bch2_journal_entry_res_resize(struct journal *j, + } else { + journal_cur_buf(j)->u64s_reserved += d; + } +-out: +- spin_unlock(&j->lock); +- res->u64s += d; + } + + /* journal flushing: */ +@@ -805,6 +795,7 @@ void bch2_journal_entry_res_resize(struct journal *j, + int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + struct closure *parent) + { ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf; + int ret = 0; + +@@ -820,7 +811,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + + /* Recheck under lock: */ + if (j->err_seq && seq >= j->err_seq) { +- ret = -BCH_ERR_journal_flush_err; ++ ret = bch_err_throw(c, journal_flush_err); + goto out; + } + +@@ -936,7 +927,6 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + u64 unwritten_seq; +- bool ret = false; + + if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) + return false; +@@ -944,9 +934,10 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) + if (c->journal.flushed_seq_ondisk >= start) + return false; + +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); ++ + if (c->journal.flushed_seq_ondisk >= start) +- goto out; ++ return false; + + for (unwritten_seq = journal_last_unwritten_seq(j); + unwritten_seq < end; +@@ -955,15 +946,12 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) + + /* journal flush already in flight, or flush requseted */ + if (buf->must_flush) +- goto out; ++ return false; + + buf->noflush = true; + } + +- ret = true; +-out: +- spin_unlock(&j->lock); +- return ret; ++ return true; + } + + static int __bch2_journal_meta(struct journal *j) +@@ -990,11 +978,11 @@ int bch2_journal_meta(struct journal *j) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) +- return -BCH_ERR_erofs_no_writes; ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal)) ++ return bch_err_throw(c, erofs_no_writes); + + int ret = __bch2_journal_meta(j); +- bch2_write_ref_put(c, BCH_WRITE_REF_journal); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal); + return ret; + } + +@@ -1002,19 +990,18 @@ int bch2_journal_meta(struct journal *j) + + void bch2_journal_unblock(struct journal *j) + { +- spin_lock(&j->lock); +- if (!--j->blocked && +- j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && +- j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { +- union journal_res_state old, new; +- +- old.v = atomic64_read(&j->reservations.counter); +- do { +- new.v = old.v; +- new.cur_entry_offset = j->cur_entry_offset_if_blocked; +- } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); +- } +- 
spin_unlock(&j->lock); ++ scoped_guard(spinlock, &j->lock) ++ if (!--j->blocked && ++ j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && ++ j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { ++ union journal_res_state old, new; ++ ++ old.v = atomic64_read(&j->reservations.counter); ++ do { ++ new.v = old.v; ++ new.cur_entry_offset = j->cur_entry_offset_if_blocked; ++ } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); ++ } + + journal_wake(j); + } +@@ -1042,9 +1029,8 @@ static void __bch2_journal_block(struct journal *j) + + void bch2_journal_block(struct journal *j) + { +- spin_lock(&j->lock); +- __bch2_journal_block(j); +- spin_unlock(&j->lock); ++ scoped_guard(spinlock, &j->lock) ++ __bch2_journal_block(j); + + journal_quiesce(j); + } +@@ -1057,7 +1043,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou + /* We're inside wait_event(), but using mutex_lock(: */ + sched_annotate_sleep(); + mutex_lock(&j->buf_lock); +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); + max_seq = min(max_seq, journal_cur_seq(j)); + + for (u64 seq = journal_last_unwritten_seq(j); +@@ -1074,6 +1060,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou + + if (open && !*blocked) { + __bch2_journal_block(j); ++ s.v = atomic64_read_acquire(&j->reservations.counter); + *blocked = true; + } + +@@ -1084,7 +1071,6 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou + } + } + +- spin_unlock(&j->lock); + if (IS_ERR_OR_NULL(ret)) + mutex_unlock(&j->buf_lock); + return ret; +@@ -1124,7 +1110,7 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, + new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); + new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); + if (!bu || !ob || !new_buckets || !new_bucket_seq) { +- ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; ++ ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); + goto err_free; + } + +@@ -1139,16 +1125,14 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, + if (ret) + break; + +- if (!new_fs) { +- ret = bch2_trans_run(c, +- bch2_trans_mark_metadata_bucket(trans, ca, +- ob[nr_got]->bucket, BCH_DATA_journal, +- ca->mi.bucket_size, BTREE_TRIGGER_transactional)); +- if (ret) { +- bch2_open_bucket_put(c, ob[nr_got]); +- bch_err_msg(c, ret, "marking new journal buckets"); +- break; +- } ++ CLASS(btree_trans, trans)(c); ++ ret = bch2_trans_mark_metadata_bucket(trans, ca, ++ ob[nr_got]->bucket, BCH_DATA_journal, ++ ca->mi.bucket_size, BTREE_TRIGGER_transactional); ++ if (ret) { ++ bch2_open_bucket_put(c, ob[nr_got]); ++ bch_err_msg(c, ret, "marking new journal buckets"); ++ break; + } + + bu[nr_got] = ob[nr_got]->bucket; +@@ -1218,12 +1202,13 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, + mutex_unlock(&c->sb_lock); + } + +- if (ret && !new_fs) ++ if (ret) { ++ CLASS(btree_trans, trans)(c); + for (i = 0; i < nr_got; i++) +- bch2_trans_run(c, +- bch2_trans_mark_metadata_bucket(trans, ca, ++ bch2_trans_mark_metadata_bucket(trans, ca, + bu[i], BCH_DATA_free, 0, +- BTREE_TRIGGER_transactional)); ++ BTREE_TRIGGER_transactional); ++ } + err_free: + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); +@@ -1275,7 +1260,7 @@ static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca + ret = 0; /* wait and retry */ + + bch2_disk_reservation_put(c, &disk_res); +- closure_sync(&cl); ++ bch2_wait_on_allocator(c, &cl); + } + 
+ return ret; +@@ -1288,21 +1273,89 @@ static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca + int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + unsigned nr) + { +- down_write(&c->state_lock); ++ guard(rwsem_write)(&c->state_lock); + int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false); +- up_write(&c->state_lock); +- + bch_err_fn(c, ret); + return ret; + } + ++int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal *j = &c->journal; ++ struct journal_device *ja = &ca->journal; ++ ++ guard(mutex)(&c->sb_lock); ++ unsigned pos; ++ for (pos = 0; pos < ja->nr; pos++) ++ if (ja->buckets[pos] == b) ++ break; ++ ++ if (pos == ja->nr) { ++ bch_err(ca, "journal bucket %llu not found when deleting", b); ++ return -EINVAL; ++ } ++ ++ u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!new_buckets) ++ return bch_err_throw(c, ENOMEM_set_nr_journal_buckets); ++ ++ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); ++ memmove(&new_buckets[pos], ++ &new_buckets[pos + 1], ++ (ja->nr - 1 - pos) * sizeof(new_buckets[0])); ++ ++ int ret = bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr - 1) ?: ++ bch2_write_super(c); ++ if (ret) { ++ kfree(new_buckets); ++ return ret; ++ } ++ ++ scoped_guard(spinlock, &j->lock) { ++ if (pos < ja->discard_idx) ++ --ja->discard_idx; ++ if (pos < ja->dirty_idx_ondisk) ++ --ja->dirty_idx_ondisk; ++ if (pos < ja->dirty_idx) ++ --ja->dirty_idx; ++ if (pos < ja->cur_idx) ++ --ja->cur_idx; ++ ++ ja->nr--; ++ ++ memmove(&ja->buckets[pos], ++ &ja->buckets[pos + 1], ++ (ja->nr - pos) * sizeof(ja->buckets[0])); ++ ++ memmove(&ja->bucket_seq[pos], ++ &ja->bucket_seq[pos + 1], ++ (ja->nr - pos) * sizeof(ja->bucket_seq[0])); ++ ++ bch2_journal_space_available(j); ++ } ++ ++ kfree(new_buckets); ++ return 0; ++} ++ + int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) + { ++ struct bch_fs *c = ca->fs; ++ ++ if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal))) ++ return 0; ++ ++ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { ++ bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); ++ return bch_err_throw(c, erofs_filesystem_full); ++ } ++ + unsigned nr; + int ret; + + if (dynamic_fault("bcachefs:add:journal_alloc")) { +- ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; ++ ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); + goto err; + } + +@@ -1318,7 +1371,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) + min(1 << 13, + (1 << 24) / ca->mi.bucket_size)); + +- ret = bch2_set_nr_journal_buckets_loop(ca->fs, ca, nr, new_fs); ++ ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs); + err: + bch_err_fn(ca, ret); + return ret; +@@ -1326,13 +1379,14 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) + + int bch2_fs_journal_alloc(struct bch_fs *c) + { +- for_each_online_member(c, ca) { ++ for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) { + if (ca->journal.nr) + continue; + + int ret = bch2_dev_journal_alloc(ca, true); + if (ret) { +- percpu_ref_put(&ca->io_ref[READ]); ++ enumerated_ref_put(&ca->io_ref[READ], ++ BCH_DEV_READ_REF_fs_journal_alloc); + return ret; + } + } +@@ -1344,21 +1398,18 @@ int bch2_fs_journal_alloc(struct bch_fs *c) + + static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) + { +- bool ret = false; +- u64 seq; ++ guard(spinlock)(&j->lock); + +- spin_lock(&j->lock); +- for (seq = journal_last_unwritten_seq(j); +- seq <= 
journal_cur_seq(j) && !ret; ++ for (u64 seq = journal_last_unwritten_seq(j); ++ seq <= journal_cur_seq(j); + seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, seq); + + if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx)) +- ret = true; ++ return true; + } +- spin_unlock(&j->lock); + +- return ret; ++ return false; + } + + void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) +@@ -1395,32 +1446,36 @@ void bch2_fs_journal_stop(struct journal *j) + clear_bit(JOURNAL_running, &j->flags); + } + +-int bch2_fs_journal_start(struct journal *j, u64 cur_seq) ++int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin_list *p; + struct journal_replay *i, **_i; + struct genradix_iter iter; + bool had_entries = false; +- u64 last_seq = cur_seq, nr, seq; ++ ++ /* ++ * ++ * XXX pick most recent non blacklisted sequence number ++ */ ++ ++ cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c)); + + if (cur_seq >= JOURNAL_SEQ_MAX) { + bch_err(c, "cannot start: journal seq overflow"); + return -EINVAL; + } + +- genradix_for_each_reverse(&c->journal_entries, iter, _i) { +- i = *_i; ++ /* Clean filesystem? */ ++ if (!last_seq) ++ last_seq = cur_seq; + +- if (journal_replay_ignore(i)) +- continue; +- +- last_seq = le64_to_cpu(i->j.last_seq); +- break; ++ u64 nr = cur_seq - last_seq; ++ if (nr * sizeof(struct journal_entry_pin_list) > 1U << 30) { ++ bch_err(c, "too many ntjournal fifo (%llu open entries)", nr); ++ return bch_err_throw(c, ENOMEM_journal_pin_fifo); + } + +- nr = cur_seq - last_seq; +- + /* + * Extra fudge factor, in case we crashed when the journal pin fifo was + * nearly or completely full. We'll need to be able to open additional +@@ -1429,13 +1484,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) + */ + nr += nr / 4; + +- if (nr + 1 > j->pin.size) { +- free_fifo(&j->pin); +- init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); +- if (!j->pin.data) { +- bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); +- return -BCH_ERR_ENOMEM_journal_pin_fifo; +- } ++ nr = max(nr, JOURNAL_PIN); ++ init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); ++ if (!j->pin.data) { ++ bch_err(c, "error allocating journal fifo (%llu open entries)", nr); ++ return bch_err_throw(c, ENOMEM_journal_pin_fifo); + } + + j->replay_journal_seq = last_seq; +@@ -1448,6 +1501,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) + j->pin.back = cur_seq; + atomic64_set(&j->seq, cur_seq - 1); + ++ u64 seq; + fifo_for_each_entry_ptr(p, &j->pin, seq) + journal_pin_list_init(p, 1); + +@@ -1478,13 +1532,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) + if (!had_entries) + j->last_empty_seq = cur_seq - 1; /* to match j->seq */ + +- spin_lock(&j->lock); +- j->last_flush_write = jiffies; +- +- j->reservations.idx = journal_cur_seq(j); +- +- c->last_bucket_seq_cleanup = journal_cur_seq(j); +- spin_unlock(&j->lock); ++ scoped_guard(spinlock, &j->lock) { ++ j->last_flush_write = jiffies; ++ j->reservations.idx = journal_cur_seq(j); ++ c->last_bucket_seq_cleanup = journal_cur_seq(j); ++ } + + return 0; + } +@@ -1495,13 +1547,12 @@ void bch2_journal_set_replay_done(struct journal *j) + * journal_space_available must happen before setting JOURNAL_running + * JOURNAL_running must happen before JOURNAL_replay_done + */ +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); + bch2_journal_space_available(j); + + 
set_bit(JOURNAL_need_flush_write, &j->flags); + set_bit(JOURNAL_running, &j->flags); + set_bit(JOURNAL_replay_done, &j->flags); +- spin_unlock(&j->lock); + } + + /* init/exit: */ +@@ -1511,7 +1562,7 @@ void bch2_dev_journal_exit(struct bch_dev *ca) + struct journal_device *ja = &ca->journal; + + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { +- kfree(ja->bio[i]); ++ kvfree(ja->bio[i]); + ja->bio[i] = NULL; + } + +@@ -1523,6 +1574,7 @@ void bch2_dev_journal_exit(struct bch_dev *ca) + + int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) + { ++ struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets = + bch2_sb_field_get(sb, journal); +@@ -1542,15 +1594,24 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) + + ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->bucket_seq) +- return -BCH_ERR_ENOMEM_dev_journal_init; ++ return bch_err_throw(c, ENOMEM_dev_journal_init); + + unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); + + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { +- ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, ++ /* ++ * kvzalloc() is not what we want to be using here: ++ * JOURNAL_ENTRY_SIZE_MAX is probably quite a bit bigger than it ++ * needs to be. ++ * ++ * But changing that will require performance testing - ++ * performance can be sensitive to anything that affects journal ++ * pipelining. ++ */ ++ ja->bio[i] = kvzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + nr_bvecs), GFP_KERNEL); + if (!ja->bio[i]) +- return -BCH_ERR_ENOMEM_dev_journal_init; ++ return bch_err_throw(c, ENOMEM_dev_journal_init); + + ja->bio[i]->ca = ca; + ja->bio[i]->buf_idx = i; +@@ -1559,7 +1620,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) + + ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->buckets) +- return -BCH_ERR_ENOMEM_dev_journal_init; ++ return bch_err_throw(c, ENOMEM_dev_journal_init); + + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); +@@ -1590,7 +1651,7 @@ void bch2_fs_journal_exit(struct journal *j) + free_fifo(&j->pin); + } + +-int bch2_fs_journal_init(struct journal *j) ++void bch2_fs_journal_init_early(struct journal *j) + { + static struct lock_class_key res_key; + +@@ -1609,24 +1670,24 @@ int bch2_fs_journal_init(struct journal *j) + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); ++} + +- if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) +- return -BCH_ERR_ENOMEM_journal_pin_fifo; ++int bch2_fs_journal_init(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); + + j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; + j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); + if (!j->free_buf) +- return -BCH_ERR_ENOMEM_journal_buf; ++ return bch_err_throw(c, ENOMEM_journal_buf); + + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) + j->buf[i].idx = i; + +- j->pin.front = j->pin.back = 1; +- + j->wq = alloc_workqueue("bcachefs_journal", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); + if (!j->wq) +- return -BCH_ERR_ENOMEM_fs_other_alloc; ++ return bch_err_throw(c, ENOMEM_fs_other_alloc); + return 0; + } + +@@ -1648,9 +1709,10 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 28); +- out->atomic++; + +- 
rcu_read_lock(); ++ guard(printbuf_atomic)(out); ++ guard(rcu)(); ++ + s = READ_ONCE(j->reservations); + + prt_printf(out, "flags:\t"); +@@ -1740,15 +1802,10 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + } + + prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); +- +- rcu_read_unlock(); +- +- --out->atomic; + } + + void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + { +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); + __bch2_journal_debug_to_text(out, j); +- spin_unlock(&j->lock); + } +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 641e20c05a14..b46b9718d841 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -297,9 +297,8 @@ static inline void bch2_journal_buf_put(struct journal *j, u64 seq) + + s = journal_state_buf_put(j, idx); + if (!journal_state_count(s, idx)) { +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); + bch2_journal_buf_put_final(j, seq); +- spin_unlock(&j->lock); + } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) + wake_up(&j->wait); + } +@@ -426,8 +425,8 @@ int bch2_journal_flush(struct journal *); + bool bch2_journal_noflush_seq(struct journal *, u64, u64); + int bch2_journal_meta(struct journal *); + +-void bch2_journal_halt(struct journal *); + void bch2_journal_halt_locked(struct journal *); ++void bch2_journal_halt(struct journal *); + + static inline int bch2_journal_error(struct journal *j) + { +@@ -444,20 +443,22 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u + void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); + void bch2_journal_debug_to_text(struct printbuf *, struct journal *); + +-int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, +- unsigned nr); ++int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned); ++int bch2_dev_journal_bucket_delete(struct bch_dev *, u64); ++ + int bch2_dev_journal_alloc(struct bch_dev *, bool); + int bch2_fs_journal_alloc(struct bch_fs *); + + void bch2_dev_journal_stop(struct journal *, struct bch_dev *); + + void bch2_fs_journal_stop(struct journal *); +-int bch2_fs_journal_start(struct journal *, u64); ++int bch2_fs_journal_start(struct journal *, u64, u64); + void bch2_journal_set_replay_done(struct journal *); + + void bch2_dev_journal_exit(struct bch_dev *); + int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); + void bch2_fs_journal_exit(struct journal *); ++void bch2_fs_journal_init_early(struct journal *); + int bch2_fs_journal_init(struct journal *); + + #endif /* _BCACHEFS_JOURNAL_H */ +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index ded18a94ed02..093e4acad085 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -35,7 +35,8 @@ void bch2_journal_pos_from_member_info_set(struct bch_fs *c) + + void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) + { +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); ++ + for_each_member_device(c, ca) { + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); + +@@ -46,28 +47,28 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) + if (offset <= ca->mi.bucket_size) + ca->journal.sectors_free = ca->mi.bucket_size - offset; + } +- mutex_unlock(&c->sb_lock); + } + +-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, +- struct journal_replay *j) ++static void bch2_journal_ptr_to_text(struct printbuf *out, 
struct bch_fs *c, struct journal_ptr *p) ++{ ++ CLASS(bch2_dev_tryget_noerror, ca)(c, p->dev); ++ prt_printf(out, "%s %u:%u:%u (sector %llu)", ++ ca ? ca->name : "(invalid dev)", ++ p->dev, p->bucket, p->bucket_offset, p->sector); ++} ++ ++void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) + { + darray_for_each(j->ptrs, i) { + if (i != j->ptrs.data) + prt_printf(out, " "); +- prt_printf(out, "%u:%u:%u (sector %llu)", +- i->dev, i->bucket, i->bucket_offset, i->sector); ++ bch2_journal_ptr_to_text(out, c, i); + } + } + +-static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, +- struct journal_replay *j) ++static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j) + { +- prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); +- +- bch2_journal_ptrs_to_text(out, c, j); +- +- for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { ++ for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) { + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); +@@ -75,6 +76,15 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, + } + } + ++static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, ++ struct journal_replay *j) ++{ ++ prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); ++ bch2_journal_datetime_to_text(out, &j->j); ++ prt_char(out, ' '); ++ bch2_journal_ptrs_to_text(out, c, j); ++} ++ + static struct nonce journal_nonce(const struct jset *jset) + { + return (struct nonce) {{ +@@ -146,9 +156,12 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct journal_replay **_i, *i, *dup; + size_t bytes = vstruct_bytes(j); + u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = JOURNAL_ENTRY_ADD_OK; + ++ if (last_seq && c->opts.journal_rewind) ++ last_seq = min(last_seq, c->opts.journal_rewind); ++ + if (!c->journal.oldest_seq_found_ondisk || + le64_to_cpu(j->seq) < c->journal.oldest_seq_found_ondisk) + c->journal.oldest_seq_found_ondisk = le64_to_cpu(j->seq); +@@ -188,7 +201,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + journal_entry_radix_idx(c, le64_to_cpu(j->seq)), + GFP_KERNEL); + if (!_i) +- return -BCH_ERR_ENOMEM_journal_entry_add; ++ return bch_err_throw(c, ENOMEM_journal_entry_add); + + /* + * Duplicate journal entries? 
If so we want the one that didn't have a +@@ -209,7 +222,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + + ret = darray_push(&dup->ptrs, entry_ptr); + if (ret) +- goto out; ++ return ret; + + bch2_journal_replay_to_text(&buf, c, dup); + +@@ -226,12 +239,12 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + if (entry_ptr.csum_good && !identical) + goto replace; + +- goto out; ++ return ret; + } + replace: + i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + if (!i) +- return -BCH_ERR_ENOMEM_journal_entry_add; ++ return bch_err_throw(c, ENOMEM_journal_entry_add); + + darray_init(&i->ptrs); + i->csum_good = entry_ptr.csum_good; +@@ -249,9 +262,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + } + + *_i = i; +-out: + fsck_err: +- printbuf_exit(&buf); + return ret; + } + +@@ -298,7 +309,7 @@ static void journal_entry_err_msg(struct printbuf *out, + + #define journal_entry_err(c, version, jset, entry, _err, msg, ...) \ + ({ \ +- struct printbuf _buf = PRINTBUF; \ ++ CLASS(printbuf, _buf)(); \ + \ + journal_entry_err_msg(&_buf, version, jset, entry); \ + prt_printf(&_buf, msg, ##__VA_ARGS__); \ +@@ -311,13 +322,12 @@ static void journal_entry_err_msg(struct printbuf *out, + bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ + if (bch2_fs_inconsistent(c, \ + "corrupt metadata before write: %s\n", _buf.buf)) {\ +- ret = -BCH_ERR_fsck_errors_not_fixed; \ ++ ret = bch_err_throw(c, fsck_errors_not_fixed); \ + goto fsck_err; \ + } \ + break; \ + } \ + \ +- printbuf_exit(&_buf); \ + true; \ + }) + +@@ -423,6 +433,17 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs + bch2_prt_jset_entry_type(out, entry->type); + prt_str(out, ": "); + } ++ /* We may be called on entries that haven't been validated: */ ++ if (!k->k.u64s) { ++ prt_str(out, "(invalid, k->u64s 0)"); ++ break; ++ } ++ ++ if (bkey_next(k) > vstruct_last(entry)) { ++ prt_str(out, "(invalid, bkey overruns jset_entry)"); ++ break; ++ } ++ + bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); + prt_char(out, ' '); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); +@@ -599,7 +620,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); +- struct printbuf err = PRINTBUF; ++ CLASS(printbuf, err)(); + int ret = 0; + + if (journal_entry_err_on(bytes < sizeof(*u) || +@@ -608,7 +629,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, + journal_entry_data_usage_bad_size, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); +- goto out; ++ return 0; + } + + if (journal_entry_err_on(bch2_replicas_entry_validate(&u->r, c, &err), +@@ -616,11 +637,9 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, + journal_entry_data_usage_bad_size, + "invalid journal entry usage: %s", err.buf)) { + journal_entry_null_range(entry, vstruct_next(entry)); +- goto out; ++ return 0; + } +-out: + fsck_err: +- printbuf_exit(&err); + return ret; + } + +@@ -1005,19 +1024,19 @@ struct journal_read_buf { + size_t size; + }; + +-static int journal_read_buf_realloc(struct journal_read_buf *b, ++static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b, + size_t new_size) + { + void *n; + + /* the bios are sized for this many pages, max: */ + if (new_size > 
JOURNAL_ENTRY_SIZE_MAX) +- return -BCH_ERR_ENOMEM_journal_read_buf_realloc; ++ return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); + + new_size = roundup_pow_of_two(new_size); + n = kvmalloc(new_size, GFP_KERNEL); + if (!n) +- return -BCH_ERR_ENOMEM_journal_read_buf_realloc; ++ return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); + + kvfree(b->data); + b->data = n; +@@ -1037,7 +1056,6 @@ static int journal_read_bucket(struct bch_dev *ca, + u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), + end = offset + ca->mi.bucket_size; + bool saw_bad = false, csum_good; +- struct printbuf err = PRINTBUF; + int ret = 0; + + pr_debug("reading %u", bucket); +@@ -1053,7 +1071,7 @@ static int journal_read_bucket(struct bch_dev *ca, + + bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!bio) +- return -BCH_ERR_ENOMEM_journal_read_bucket; ++ return bch_err_throw(c, ENOMEM_journal_read_bucket); + bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); + + bio->bi_iter.bi_sector = offset; +@@ -1064,7 +1082,7 @@ static int journal_read_bucket(struct bch_dev *ca, + kfree(bio); + + if (!ret && bch2_meta_read_fault("journal")) +- ret = -BCH_ERR_EIO_fault_injected; ++ ret = bch_err_throw(c, EIO_fault_injected); + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + submit_time, !ret); +@@ -1078,7 +1096,7 @@ static int journal_read_bucket(struct bch_dev *ca, + * found on a different device, and missing or + * no journal entries will be handled later + */ +- goto out; ++ return 0; + } + + j = buf->data; +@@ -1092,15 +1110,15 @@ static int journal_read_bucket(struct bch_dev *ca, + break; + case JOURNAL_ENTRY_REREAD: + if (vstruct_bytes(j) > buf->size) { +- ret = journal_read_buf_realloc(buf, ++ ret = journal_read_buf_realloc(c, buf, + vstruct_bytes(j)); + if (ret) +- goto err; ++ return ret; + } + goto reread; + case JOURNAL_ENTRY_NONE: + if (!saw_bad) +- goto out; ++ return 0; + /* + * On checksum error we don't really trust the size + * field of the journal entry we read, so try reading +@@ -1109,7 +1127,7 @@ static int journal_read_bucket(struct bch_dev *ca, + sectors = block_sectors(c); + goto next_block; + default: +- goto err; ++ return ret; + } + + if (le64_to_cpu(j->seq) > ja->highest_seq_found) { +@@ -1126,22 +1144,20 @@ static int journal_read_bucket(struct bch_dev *ca, + * bucket: + */ + if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) +- goto out; ++ return 0; + + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + +- enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); + struct bch_csum csum; + csum_good = jset_csum_good(c, j, &csum); + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); + + if (!csum_good) { +- bch_err_dev_ratelimited(ca, "%s", +- (printbuf_reset(&err), +- prt_str(&err, "journal "), +- bch2_csum_err_msg(&err, csum_type, j->csum, csum), +- err.buf)); ++ /* ++ * Don't print an error here, we'll print the error ++ * later if we need this journal entry ++ */ + saw_bad = true; + } + +@@ -1150,16 +1166,16 @@ static int journal_read_bucket(struct bch_dev *ca, + vstruct_end(j) - (void *) j->encrypted_start); + bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)); + +- mutex_lock(&jlist->lock); +- ret = journal_entry_add(c, ca, (struct journal_ptr) { +- .csum_good = csum_good, +- .dev = ca->dev_idx, +- .bucket = bucket, +- .bucket_offset = offset - +- bucket_to_sector(ca, ja->buckets[bucket]), +- .sector = offset, +- }, jlist, j); +- mutex_unlock(&jlist->lock); ++ scoped_guard(mutex, &jlist->lock) ++ ret = 
journal_entry_add(c, ca, (struct journal_ptr) { ++ .csum_good = csum_good, ++ .csum = csum, ++ .dev = ca->dev_idx, ++ .bucket = bucket, ++ .bucket_offset = offset - ++ bucket_to_sector(ca, ja->buckets[bucket]), ++ .sector = offset, ++ }, jlist, j); + + switch (ret) { + case JOURNAL_ENTRY_ADD_OK: +@@ -1167,7 +1183,7 @@ static int journal_read_bucket(struct bch_dev *ca, + case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: + break; + default: +- goto err; ++ return ret; + } + next_block: + pr_debug("next"); +@@ -1176,11 +1192,7 @@ static int journal_read_bucket(struct bch_dev *ca, + j = ((void *) j) + (sectors << 9); + } + +-out: +- ret = 0; +-err: +- printbuf_exit(&err); +- return ret; ++ return 0; + } + + static CLOSURE_CALLBACK(bch2_journal_read_device) +@@ -1197,7 +1209,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) + if (!ja->nr) + goto out; + +- ret = journal_read_buf_realloc(&buf, PAGE_SIZE); ++ ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE); + if (ret) + goto err; + +@@ -1219,25 +1231,125 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) + out: + bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); + kvfree(buf.data); +- percpu_ref_put(&ca->io_ref[READ]); ++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read); + closure_return(cl); + return; + err: +- mutex_lock(&jlist->lock); +- jlist->ret = ret; +- mutex_unlock(&jlist->lock); ++ scoped_guard(mutex, &jlist->lock) ++ jlist->ret = ret; + goto out; + } + ++noinline_for_stack ++static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j) ++{ ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(c, &buf); ++ ++ enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j); ++ bool have_good = false; ++ ++ prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq)); ++ bch2_journal_datetime_to_text(&buf, &j->j); ++ prt_newline(&buf); ++ ++ darray_for_each(j->ptrs, ptr) ++ if (!ptr->csum_good) { ++ bch2_journal_ptr_to_text(&buf, c, ptr); ++ prt_char(&buf, ' '); ++ bch2_csum_to_text(&buf, csum_type, ptr->csum); ++ prt_newline(&buf); ++ } else { ++ have_good = true; ++ } ++ ++ prt_printf(&buf, "should be "); ++ bch2_csum_to_text(&buf, csum_type, j->j.csum); ++ ++ if (have_good) ++ prt_printf(&buf, "\n(had good copy on another device)"); ++ ++ bch2_print_str(c, KERN_ERR, buf.buf); ++} ++ ++struct u64_range bch2_journal_entry_missing_range(struct bch_fs *c, u64 start, u64 end) ++{ ++ BUG_ON(start > end); ++ ++ if (start == end) ++ return (struct u64_range) {}; ++ ++ start = bch2_journal_seq_next_nonblacklisted(c, start); ++ if (start >= end) ++ return (struct u64_range) {}; ++ ++ struct u64_range missing = { ++ .start = start, ++ .end = min(end, bch2_journal_seq_next_blacklisted(c, start)), ++ }; ++ ++ if (missing.start == missing.end) ++ return (struct u64_range) {}; ++ ++ return missing; ++} ++ ++noinline_for_stack ++static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq) ++{ ++ int ret = 0; ++ ++ struct genradix_iter radix_iter; ++ struct journal_replay *i, **_i, *prev = NULL; ++ /* Sequence number we expect to find next, to check for missing entries */ ++ u64 seq = start_seq; ++ ++ genradix_for_each(&c->journal_entries, radix_iter, _i) { ++ i = *_i; ++ ++ if (journal_replay_ignore(i)) ++ continue; ++ ++ BUG_ON(seq > le64_to_cpu(i->j.seq)); ++ ++ struct u64_range missing; ++ ++ while ((missing = bch2_journal_entry_missing_range(c, seq, le64_to_cpu(i->j.seq))).start) { ++ CLASS(printbuf, buf)(); ++ prt_printf(&buf, "journal entries 
%llu-%llu missing! (replaying %llu-%llu)", ++ missing.start, missing.end - 1, ++ start_seq, end_seq); ++ ++ if (prev) { ++ prt_printf(&buf, "\n%llu at ", le64_to_cpu(prev->j.seq)); ++ bch2_journal_ptrs_to_text(&buf, c, prev); ++ prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); ++ } ++ ++ prt_printf(&buf, "\n%llu at ", le64_to_cpu(i->j.seq)); ++ bch2_journal_ptrs_to_text(&buf, c, i); ++ prt_printf(&buf, ", continue?"); ++ ++ fsck_err(c, journal_entries_missing, "%s", buf.buf); ++ ++ seq = missing.end; ++ } ++ ++ prev = i; ++ seq = le64_to_cpu(i->j.seq) + 1; ++ } ++fsck_err: ++ return ret; ++} ++ + int bch2_journal_read(struct bch_fs *c, + u64 *last_seq, + u64 *blacklist_seq, + u64 *start_seq) + { + struct journal_list jlist; +- struct journal_replay *i, **_i, *prev = NULL; ++ struct journal_replay *i, **_i; + struct genradix_iter radix_iter; +- struct printbuf buf = PRINTBUF; + bool degraded = false, last_write_torn = false; + u64 seq; + int ret = 0; +@@ -1254,7 +1366,8 @@ int bch2_journal_read(struct bch_fs *c, + + if ((ca->mi.state == BCH_MEMBER_STATE_rw || + ca->mi.state == BCH_MEMBER_STATE_ro) && +- percpu_ref_tryget(&ca->io_ref[READ])) ++ enumerated_ref_tryget(&ca->io_ref[READ], ++ BCH_DEV_READ_REF_journal_read)) + closure_call(&ca->journal.read, + bch2_journal_read_device, + system_unbound_wq, +@@ -1325,14 +1438,27 @@ int bch2_journal_read(struct bch_fs *c, + return 0; + } + +- bch_info(c, "journal read done, replaying entries %llu-%llu", +- *last_seq, *blacklist_seq - 1); ++ u64 drop_before = *last_seq; ++ { ++ CLASS(printbuf, buf)(); ++ prt_printf(&buf, "journal read done, replaying entries %llu-%llu", ++ *last_seq, *blacklist_seq - 1); ++ ++ /* ++ * Drop blacklisted entries and entries older than last_seq (or start of ++ * journal rewind: ++ */ ++ if (c->opts.journal_rewind) { ++ drop_before = min(drop_before, c->opts.journal_rewind); ++ prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind); ++ } + +- if (*start_seq != *blacklist_seq) +- bch_info(c, "dropped unflushed entries %llu-%llu", +- *blacklist_seq, *start_seq - 1); ++ *last_seq = drop_before; ++ if (*start_seq != *blacklist_seq) ++ prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); ++ bch_info(c, "%s", buf.buf); ++ } + +- /* Drop blacklisted entries and entries older than last_seq: */ + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + +@@ -1340,7 +1466,7 @@ int bch2_journal_read(struct bch_fs *c, + continue; + + seq = le64_to_cpu(i->j.seq); +- if (seq < *last_seq) { ++ if (seq < drop_before) { + journal_replay_free(c, i, false); + continue; + } +@@ -1353,59 +1479,12 @@ int bch2_journal_read(struct bch_fs *c, + } + } + +- /* Check for missing entries: */ +- seq = *last_seq; +- genradix_for_each(&c->journal_entries, radix_iter, _i) { +- i = *_i; +- +- if (journal_replay_ignore(i)) +- continue; +- +- BUG_ON(seq > le64_to_cpu(i->j.seq)); +- +- while (seq < le64_to_cpu(i->j.seq)) { +- u64 missing_start, missing_end; +- struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; +- +- while (seq < le64_to_cpu(i->j.seq) && +- bch2_journal_seq_is_blacklisted(c, seq, false)) +- seq++; +- +- if (seq == le64_to_cpu(i->j.seq)) +- break; +- +- missing_start = seq; +- +- while (seq < le64_to_cpu(i->j.seq) && +- !bch2_journal_seq_is_blacklisted(c, seq, false)) +- seq++; +- +- if (prev) { +- bch2_journal_ptrs_to_text(&buf1, c, prev); +- prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); +- } else +- prt_printf(&buf1, "(none)"); +- 
bch2_journal_ptrs_to_text(&buf2, c, i); +- +- missing_end = seq - 1; +- fsck_err(c, journal_entries_missing, +- "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" +- "prev at %s\n" +- "next at %s, continue?", +- missing_start, missing_end, +- *last_seq, *blacklist_seq - 1, +- buf1.buf, buf2.buf); +- +- printbuf_exit(&buf1); +- printbuf_exit(&buf2); +- } +- +- prev = i; +- seq++; +- } ++ ret = bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1); ++ if (ret) ++ return ret; + + genradix_for_each(&c->journal_entries, radix_iter, _i) { +- struct bch_replicas_padded replicas = { ++ union bch_replicas_padded replicas = { + .e.data_type = BCH_DATA_journal, + .e.nr_devs = 0, + .e.nr_required = 1, +@@ -1415,15 +1494,15 @@ int bch2_journal_read(struct bch_fs *c, + if (journal_replay_ignore(i)) + continue; + +- darray_for_each(i->ptrs, ptr) { +- struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); +- +- if (!ptr->csum_good) +- bch_err_dev_offset(ca, ptr->sector, +- "invalid journal checksum, seq %llu%s", +- le64_to_cpu(i->j.seq), +- i->csum_good ? " (had good copy on another device)" : ""); +- } ++ /* ++ * Don't print checksum errors until we know we're going to use ++ * a given journal entry: ++ */ ++ darray_for_each(i->ptrs, ptr) ++ if (!ptr->csum_good) { ++ bch2_journal_print_checksum_error(c, i); ++ break; ++ } + + ret = jset_validate(c, + bch2_dev_have_ref(c, i->ptrs.data[0].dev), +@@ -1431,14 +1510,14 @@ int bch2_journal_read(struct bch_fs *c, + i->ptrs.data[0].sector, + READ); + if (ret) +- goto err; ++ return ret; + + darray_for_each(i->ptrs, ptr) + replicas_entry_add_dev(&replicas.e, ptr->dev); + + bch2_replicas_entry_sort(&replicas.e); + +- printbuf_reset(&buf); ++ CLASS(printbuf, buf)(); + bch2_replicas_entry_to_text(&buf, &replicas.e); + + if (!degraded && +@@ -1449,12 +1528,10 @@ int bch2_journal_read(struct bch_fs *c, + le64_to_cpu(i->j.seq), buf.buf))) { + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) +- goto err; ++ return ret; + } + } +-err: + fsck_err: +- printbuf_exit(&buf); + return ret; + } + +@@ -1466,6 +1543,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + ++ guard(rcu)(); + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); + if (!ca) +@@ -1499,7 +1577,8 @@ static void __journal_write_alloc(struct journal *j, + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + darray_for_each(*devs, i) { +- struct bch_dev *ca = rcu_dereference(c->devs[*i]); ++ struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE, ++ BCH_DEV_WRITE_REF_journal_write); + if (!ca) + continue; + +@@ -1513,8 +1592,10 @@ static void __journal_write_alloc(struct journal *j, + ca->mi.state != BCH_MEMBER_STATE_rw || + !ja->nr || + bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || +- sectors > ja->sectors_free) ++ sectors > ja->sectors_free) { ++ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); + continue; ++ } + + bch2_dev_stripe_increment(ca, &j->wp.stripe); + +@@ -1537,15 +1618,8 @@ static void __journal_write_alloc(struct journal *j, + } + } + +-/** +- * journal_write_alloc - decide where to write next journal entry +- * +- * @j: journal object +- * @w: journal buf (entry to be written) +- * +- * Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure +- */ +-static int journal_write_alloc(struct journal *j, struct journal_buf *w) ++static int journal_write_alloc(struct journal *j, struct journal_buf *w, ++ 
unsigned *replicas) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_devs_mask devs; +@@ -1553,29 +1627,18 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) + unsigned sectors = vstruct_sectors(w->data, c->block_bits); + unsigned target = c->opts.metadata_target ?: + c->opts.foreground_target; +- unsigned replicas = 0, replicas_want = +- READ_ONCE(c->opts.metadata_replicas); ++ unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas); + unsigned replicas_need = min_t(unsigned, replicas_want, + READ_ONCE(c->opts.metadata_replicas_required)); + bool advance_done = false; + +- rcu_read_lock(); +- +- /* We might run more than once if we have to stop and do discards: */ +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); +- bkey_for_each_ptr(ptrs, p) { +- struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); +- if (ca) +- replicas += ca->mi.durability; +- } +- + retry_target: + devs = target_rw_devs(c, BCH_DATA_journal, target); +- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); ++ bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted); + retry_alloc: +- __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); ++ __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); + +- if (likely(replicas >= replicas_want)) ++ if (likely(*replicas >= replicas_want)) + goto done; + + if (!advance_done) { +@@ -1584,18 +1647,26 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) + goto retry_alloc; + } + +- if (replicas < replicas_want && target) { ++ if (*replicas < replicas_want && target) { + /* Retry from all devices: */ + target = 0; + advance_done = false; + goto retry_target; + } + done: +- rcu_read_unlock(); +- + BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); + +- return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; ++#if 0 ++ /* ++ * XXX: we need a way to alert the user when we go degraded for any ++ * reason ++ */ ++ if (*replicas < min(replicas_want, ++ dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) { ++ } ++#endif ++ ++ return *replicas >= replicas_need ? 
0 : -BCH_ERR_insufficient_journal_devices; + } + + static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) +@@ -1620,10 +1691,10 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) + + memcpy(new_buf, buf->data, buf->buf_size); + +- spin_lock(&j->lock); +- swap(buf->data, new_buf); +- swap(buf->buf_size, new_size); +- spin_unlock(&j->lock); ++ scoped_guard(spinlock, &j->lock) { ++ swap(buf->data, new_buf); ++ swap(buf->buf_size, new_size); ++ } + + kvfree(new_buf); + } +@@ -1633,7 +1704,7 @@ static CLOSURE_CALLBACK(journal_write_done) + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); + struct bch_fs *c = container_of(j, struct bch_fs, journal); +- struct bch_replicas_padded replicas; ++ union bch_replicas_padded replicas; + u64 seq = le64_to_cpu(w->data->seq); + int err = 0; + +@@ -1642,17 +1713,27 @@ static CLOSURE_CALLBACK(journal_write_done) + : j->noflush_write_time, j->write_start_time); + + if (!w->devs_written.nr) { +- if (!bch2_journal_error(j)) +- bch_err(c, "unable to write journal to sufficient devices"); +- err = -BCH_ERR_journal_write_err; ++ err = bch_err_throw(c, journal_write_err); + } else { + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + w->devs_written); + err = bch2_mark_replicas(c, &replicas.e); + } + +- if (err) +- bch2_fatal_error(c); ++ if (err && !bch2_journal_error(j)) { ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(c, &buf); ++ ++ if (err == -BCH_ERR_journal_write_err) ++ prt_printf(&buf, "unable to write journal to sufficient devices\n"); ++ else ++ prt_printf(&buf, "journal write error marking replicas: %s\n", ++ bch2_err_str(err)); ++ ++ bch2_fs_emergency_read_only2(c, &buf); ++ ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ } + + closure_debug_destroy(cl); + +@@ -1694,6 +1775,7 @@ static CLOSURE_CALLBACK(journal_write_done) + + closure_wake_up(&c->freelist_wait); + bch2_reset_alloc_cursors(c); ++ do_discards = true; + } + + j->seq_ondisk = seq; +@@ -1745,6 +1827,8 @@ static CLOSURE_CALLBACK(journal_write_done) + + if (do_discards) + bch2_do_discards(c); ++ ++ closure_put(&c->cl); + } + + static void journal_write_endio(struct bio *bio) +@@ -1770,7 +1854,7 @@ static void journal_write_endio(struct bio *bio) + } + + closure_put(&w->io); +- percpu_ref_put(&ca->io_ref[WRITE]); ++ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); + } + + static CLOSURE_CALLBACK(journal_write_submit) +@@ -1781,12 +1865,7 @@ static CLOSURE_CALLBACK(journal_write_submit) + unsigned sectors = vstruct_sectors(w->data, c->block_bits); + + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { +- struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); +- if (!ca) { +- /* XXX: fix this */ +- bch_err(c, "missing device %u for journal write", ptr->dev); +- continue; +- } ++ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], + sectors); +@@ -1797,7 +1876,11 @@ static CLOSURE_CALLBACK(journal_write_submit) + + jbio->submit_time = local_clock(); + +- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); ++ /* ++ * blk-wbt.c throttles all writes except those that have both ++ * REQ_SYNC and REQ_IDLE set... 
++ */ ++ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_IDLE|REQ_META); + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; +@@ -1844,8 +1927,9 @@ static CLOSURE_CALLBACK(journal_write_preflush) + } + + if (w->separate_flush) { +- for_each_rw_member(c, ca) { +- percpu_ref_get(&ca->io_ref[WRITE]); ++ for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) { ++ enumerated_ref_get(&ca->io_ref[WRITE], ++ BCH_DEV_WRITE_REF_journal_write); + + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->bio[w->idx]->bio; +@@ -1872,9 +1956,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) + struct jset_entry *start, *end; + struct jset *jset = w->data; + struct journal_keys_to_wb wb = { NULL }; +- unsigned sectors, bytes, u64s; ++ unsigned u64s; + unsigned long btree_roots_have = 0; +- bool validate_before_checksum = false; + u64 seq = le64_to_cpu(jset->seq); + int ret; + +@@ -1937,9 +2020,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) + } + } + +- spin_lock(&c->journal.lock); +- w->need_flush_to_write_buffer = false; +- spin_unlock(&c->journal.lock); ++ scoped_guard(spinlock, &c->journal.lock) ++ w->need_flush_to_write_buffer = false; + + start = end = vstruct_last(jset); + +@@ -1957,8 +2039,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) + + le32_add_cpu(&jset->u64s, u64s); + +- sectors = vstruct_sectors(jset, c->block_bits); +- bytes = vstruct_bytes(jset); ++ unsigned sectors = vstruct_sectors(jset, c->block_bits); + + if (sectors > w->sectors) { + bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", +@@ -1967,6 +2048,17 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) + return -EINVAL; + } + ++ return 0; ++} ++ ++static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct jset *jset = w->data; ++ u64 seq = le64_to_cpu(jset->seq); ++ bool validate_before_checksum = false; ++ int ret = 0; ++ + jset->magic = cpu_to_le64(jset_magic(c)); + jset->version = cpu_to_le32(c->sb.version); + +@@ -1989,7 +2081,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) + ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); +- if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) ++ if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret))) + return ret; + + jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), +@@ -1999,6 +2091,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) + (ret = jset_validate(c, NULL, jset, 0, WRITE))) + return ret; + ++ unsigned sectors = vstruct_sectors(jset, c->block_bits); ++ unsigned bytes = vstruct_bytes(jset); + memset((void *) jset + bytes, 0, (sectors << 9) - bytes); + return 0; + } +@@ -2054,13 +2148,10 @@ CLOSURE_CALLBACK(bch2_journal_write) + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); + struct bch_fs *c = container_of(j, struct bch_fs, journal); +- struct bch_replicas_padded replicas; +- unsigned nr_rw_members = 0; ++ union bch_replicas_padded replicas; ++ unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]); + int ret; + +- for_each_rw_member(c, ca) 
+- nr_rw_members++; +- + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + BUG_ON(!w->write_started); + BUG_ON(w->write_allocated); +@@ -2068,71 +2159,60 @@ CLOSURE_CALLBACK(bch2_journal_write) + + j->write_start_time = local_clock(); + +- spin_lock(&j->lock); +- if (nr_rw_members > 1) +- w->separate_flush = true; ++ scoped_guard(spinlock, &j->lock) { ++ if (nr_rw_members > 1) ++ w->separate_flush = true; + +- ret = bch2_journal_write_pick_flush(j, w); +- spin_unlock(&j->lock); +- if (ret) ++ ret = bch2_journal_write_pick_flush(j, w); ++ } ++ ++ if (unlikely(ret)) + goto err; + +- mutex_lock(&j->buf_lock); +- journal_buf_realloc(j, w); ++ scoped_guard(mutex, &j->buf_lock) { ++ journal_buf_realloc(j, w); + +- ret = bch2_journal_write_prep(j, w); +- mutex_unlock(&j->buf_lock); +- if (ret) +- goto err; ++ ret = bch2_journal_write_prep(j, w); ++ } + +- j->entry_bytes_written += vstruct_bytes(w->data); ++ if (unlikely(ret)) ++ goto err; + ++ unsigned replicas_allocated = 0; + while (1) { +- spin_lock(&j->lock); +- ret = journal_write_alloc(j, w); ++ ret = journal_write_alloc(j, w, &replicas_allocated); + if (!ret || !j->can_discard) + break; + +- spin_unlock(&j->lock); + bch2_journal_do_discards(j); + } + +- if (ret && !bch2_journal_error(j)) { +- struct printbuf buf = PRINTBUF; +- buf.atomic++; ++ if (unlikely(ret)) ++ goto err_allocate_write; + +- __bch2_journal_debug_to_text(&buf, j); +- spin_unlock(&j->lock); +- prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), +- le64_to_cpu(w->data->seq), +- vstruct_sectors(w->data, c->block_bits), +- bch2_err_str(ret)); +- bch2_print_string_as_lines(KERN_ERR, buf.buf); +- printbuf_exit(&buf); +- } +- if (ret) ++ ret = bch2_journal_write_checksum(j, w); ++ if (unlikely(ret)) + goto err; + +- /* +- * write is allocated, no longer need to account for it in +- * bch2_journal_space_available(): +- */ +- w->sectors = 0; +- w->write_allocated = true; ++ scoped_guard(spinlock, &j->lock) { ++ /* ++ * write is allocated, no longer need to account for it in ++ * bch2_journal_space_available(): ++ */ ++ w->sectors = 0; ++ w->write_allocated = true; ++ j->entry_bytes_written += vstruct_bytes(w->data); + +- /* +- * journal entry has been compacted and allocated, recalculate space +- * available: +- */ +- bch2_journal_space_available(j); +- bch2_journal_do_writes(j); +- spin_unlock(&j->lock); ++ /* ++ * journal entry has been compacted and allocated, recalculate space ++ * available: ++ */ ++ bch2_journal_space_available(j); ++ bch2_journal_do_writes(j); ++ } + + w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + +- if (c->opts.nochanges) +- goto no_io; +- + /* + * Mark journal replicas before we submit the write to guarantee + * recovery will find the journal entries after a crash. 
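[Editor's note - illustrative sketch, not part of the patch or the upstream series.] In the hunk above, the loop that counted writable members one at a time (for_each_rw_member(c, ca) nr_rw_members++;) is replaced by dev_mask_nr(&c->rw_devs[BCH_DATA_free]): the write path now reads an already-maintained device bitmask instead of iterating devices and taking per-device references on every journal write. A userspace model of that idea; struct devs_mask and this dev_mask_nr() are stand-ins for, not copies of, the bcachefs definitions:

    #include <stdio.h>

    /* Two 64-bit words: room for 128 device indices. */
    struct devs_mask {
            unsigned long long bits[2];
    };

    /* Count members by summing one popcount per word of the mask. */
    static unsigned dev_mask_nr(const struct devs_mask *m)
    {
            unsigned nr = 0;

            for (unsigned i = 0; i < 2; i++)
                    nr += (unsigned)__builtin_popcountll(m->bits[i]);
            return nr;
    }

    int main(void)
    {
            /* devices 0 and 2 set in word 0; device 64 in word 1 */
            struct devs_mask rw = { .bits = { 0x5ULL, 0x1ULL } };

            printf("%u rw members\n", dev_mask_nr(&rw)); /* prints: 3 rw members */
            return 0;
    }

Because the mask is kept current as devices change state, the member count costs one popcount per word here, with no refcount traffic in the hot write path.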
+@@ -2143,15 +2223,32 @@ CLOSURE_CALLBACK(bch2_journal_write) + if (ret) + goto err; + ++ if (c->opts.nochanges) ++ goto no_io; ++ + if (!JSET_NO_FLUSH(w->data)) + continue_at(cl, journal_write_preflush, j->wq); + else + continue_at(cl, journal_write_submit, j->wq); + return; +-no_io: +- continue_at(cl, journal_write_done, j->wq); +- return; ++err_allocate_write: ++ if (!bch2_journal_error(j)) { ++ CLASS(printbuf, buf)(); ++ ++ bch2_journal_debug_to_text(&buf, j); ++ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), ++ le64_to_cpu(w->data->seq), ++ vstruct_sectors(w->data, c->block_bits), ++ bch2_err_str(ret)); ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ } + err: + bch2_fatal_error(c); ++no_io: ++ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { ++ struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); ++ enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); ++ } ++ + continue_at(cl, journal_write_done, j->wq); + } +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +index 12b39fcb4424..f53c5c81d137 100644 +--- a/fs/bcachefs/journal_io.h ++++ b/fs/bcachefs/journal_io.h +@@ -9,6 +9,7 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *); + + struct journal_ptr { + bool csum_good; ++ struct bch_csum csum; + u8 dev; + u32 bucket; + u32 bucket_offset; +@@ -70,6 +71,13 @@ void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, + void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct journal_replay *); + ++struct u64_range { ++ u64 start; ++ u64 end; ++}; ++ ++struct u64_range bch2_journal_entry_missing_range(struct bch_fs *, u64, u64); ++ + int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); + + CLOSURE_CALLBACK(bch2_journal_write); +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index cc00b0fc40d8..f23e5ee9ad75 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -83,18 +83,20 @@ static struct journal_space + journal_dev_space_available(struct journal *j, struct bch_dev *ca, + enum journal_space_from from) + { ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_device *ja = &ca->journal; + unsigned sectors, buckets, unwritten; ++ unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c)); + u64 seq; + + if (from == journal_space_total) + return (struct journal_space) { +- .next_entry = ca->mi.bucket_size, +- .total = ca->mi.bucket_size * ja->nr, ++ .next_entry = bucket_size_aligned, ++ .total = bucket_size_aligned * ja->nr, + }; + + buckets = bch2_journal_dev_buckets_available(j, ja, from); +- sectors = ja->sectors_free; ++ sectors = round_down(ja->sectors_free, block_sectors(c)); + + /* + * We that we don't allocate the space for a journal entry +@@ -109,7 +111,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, + continue; + + /* entry won't fit on this device, skip: */ +- if (unwritten > ca->mi.bucket_size) ++ if (unwritten > bucket_size_aligned) + continue; + + if (unwritten >= sectors) { +@@ -119,7 +121,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, + } + + buckets--; +- sectors = ca->mi.bucket_size; ++ sectors = bucket_size_aligned; + } + + sectors -= unwritten; +@@ -127,12 +129,12 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, + + if (sectors < ca->mi.bucket_size && buckets) { + buckets--; +- sectors = ca->mi.bucket_size; ++ sectors = bucket_size_aligned; + } + + return 
(struct journal_space) { + .next_entry = sectors, +- .total = sectors + buckets * ca->mi.bucket_size, ++ .total = sectors + buckets * bucket_size_aligned, + }; + } + +@@ -146,7 +148,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne + + BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); + +- rcu_read_lock(); + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { + if (!ca->journal.nr || + !ca->mi.durability) +@@ -164,11 +165,16 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne + + array_insert_item(dev_space, nr_devs, pos, space); + } +- rcu_read_unlock(); + + if (nr_devs < nr_devs_want) + return (struct journal_space) { 0, 0 }; + ++ /* ++ * It's possible for bucket size to be misaligned w.r.t. the filesystem ++ * block size: ++ */ ++ min_bucket_size = round_down(min_bucket_size, block_sectors(c)); ++ + /* + * We sorted largest to smallest, and we want the smallest out of the + * @nr_devs_want largest devices: +@@ -189,8 +195,8 @@ void bch2_journal_space_available(struct journal *j) + int ret = 0; + + lockdep_assert_held(&j->lock); ++ guard(rcu)(); + +- rcu_read_lock(); + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { + struct journal_device *ja = &ca->journal; + +@@ -210,24 +216,22 @@ void bch2_journal_space_available(struct journal *j) + max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); + nr_online++; + } +- rcu_read_unlock(); + + j->can_discard = can_discard; + + if (nr_online < metadata_replicas_required(c)) { +- struct printbuf buf = PRINTBUF; +- buf.atomic++; +- prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" +- "rw journal devs:", nr_online, metadata_replicas_required(c)); +- +- rcu_read_lock(); +- for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) +- prt_printf(&buf, " %s", ca->name); +- rcu_read_unlock(); +- +- bch_err(c, "%s", buf.buf); +- printbuf_exit(&buf); +- ret = -BCH_ERR_insufficient_journal_devices; ++ if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) { ++ CLASS(printbuf, buf)(); ++ guard(printbuf_atomic)(&buf); ++ prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" ++ "rw journal devs:", nr_online, metadata_replicas_required(c)); ++ ++ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) ++ prt_printf(&buf, " %s", ca->name); ++ ++ bch_err(c, "%s", buf.buf); ++ } ++ ret = bch_err_throw(c, insufficient_journal_devices); + goto out; + } + +@@ -241,7 +245,7 @@ void bch2_journal_space_available(struct journal *j) + total = j->space[journal_space_total].total; + + if (!j->space[journal_space_discarded].next_entry) +- ret = -BCH_ERR_journal_full; ++ ret = bch_err_throw(c, journal_full); + + if ((j->space[journal_space_clean_ondisk].next_entry < + j->space[journal_space_clean_ondisk].total) && +@@ -254,8 +258,7 @@ void bch2_journal_space_available(struct journal *j) + bch2_journal_set_watermark(j); + out: + j->cur_entry_sectors = !ret +- ? round_down(j->space[journal_space_discarded].next_entry, +- block_sectors(c)) ++ ? 
j->space[journal_space_discarded].next_entry + : 0; + j->cur_entry_error = ret; + +@@ -276,11 +279,8 @@ static bool __should_discard_bucket(struct journal *j, struct journal_device *ja + + static bool should_discard_bucket(struct journal *j, struct journal_device *ja) + { +- spin_lock(&j->lock); +- bool ret = __should_discard_bucket(j, ja); +- spin_unlock(&j->lock); +- +- return ret; ++ guard(spinlock)(&j->lock); ++ return __should_discard_bucket(j, ja); + } + + /* +@@ -291,29 +291,26 @@ void bch2_journal_do_discards(struct journal *j) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + +- mutex_lock(&j->discard_lock); ++ guard(mutex)(&j->discard_lock); + +- for_each_rw_member(c, ca) { ++ for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) { + struct journal_device *ja = &ca->journal; + + while (should_discard_bucket(j, ja)) { + if (!c->opts.nochanges && +- ca->mi.discard && ++ bch2_discard_opt_enabled(c, ca) && + bdev_max_discard_sectors(ca->disk_sb.bdev)) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, + ja->buckets[ja->discard_idx]), + ca->mi.bucket_size, GFP_NOFS); + +- spin_lock(&j->lock); +- ja->discard_idx = (ja->discard_idx + 1) % ja->nr; +- +- bch2_journal_space_available(j); +- spin_unlock(&j->lock); ++ scoped_guard(spinlock, &j->lock) { ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ bch2_journal_space_available(j); ++ } + } + } +- +- mutex_unlock(&j->discard_lock); + } + + /* +@@ -354,9 +351,8 @@ bool __bch2_journal_pin_put(struct journal *j, u64 seq) + void bch2_journal_pin_put(struct journal *j, u64 seq) + { + if (__bch2_journal_pin_put(j, seq)) { +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); + bch2_journal_reclaim_fast(j); +- spin_unlock(&j->lock); + } + } + +@@ -389,10 +385,9 @@ static inline bool __journal_pin_drop(struct journal *j, + void bch2_journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) + { +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); + if (__journal_pin_drop(j, pin)) + bch2_journal_reclaim_fast(j); +- spin_unlock(&j->lock); + } + + static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, +@@ -439,7 +434,7 @@ void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) + { +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); + + u64 seq = READ_ONCE(src->seq); + +@@ -450,7 +445,6 @@ void bch2_journal_pin_copy(struct journal *j, + * longer to exist, but that means there's no longer anything to + * copy and we can bail out here: + */ +- spin_unlock(&j->lock); + return; + } + +@@ -467,31 +461,32 @@ void bch2_journal_pin_copy(struct journal *j, + */ + if (seq == journal_last_seq(j)) + journal_wake(j); +- spin_unlock(&j->lock); + } + + void bch2_journal_pin_set(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) + { +- spin_lock(&j->lock); ++ bool wake; + +- BUG_ON(seq < journal_last_seq(j)); ++ scoped_guard(spinlock, &j->lock) { ++ BUG_ON(seq < journal_last_seq(j)); + +- bool reclaim = __journal_pin_drop(j, pin); ++ bool reclaim = __journal_pin_drop(j, pin); + +- bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); ++ bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); + +- if (reclaim) +- bch2_journal_reclaim_fast(j); +- /* +- * If the journal is currently full, we might want to call flush_fn +- * immediately: +- */ +- if (seq == journal_last_seq(j)) +- journal_wake(j); ++ if (reclaim) ++ 
bch2_journal_reclaim_fast(j); ++ /* ++ * If the journal is currently full, we might want to call flush_fn ++ * immediately: ++ */ ++ wake = seq == journal_last_seq(j); ++ } + +- spin_unlock(&j->lock); ++ if (wake) ++ journal_wake(j); + } + + /** +@@ -576,17 +571,17 @@ static size_t journal_flush_pins(struct journal *j, + + j->last_flushed = jiffies; + +- spin_lock(&j->lock); +- pin = journal_get_next_pin(j, seq_to_flush, +- allowed_below, +- allowed_above, &seq); +- if (pin) { +- BUG_ON(j->flush_in_progress); +- j->flush_in_progress = pin; +- j->flush_in_progress_dropped = false; +- flush_fn = pin->flush; ++ scoped_guard(spinlock, &j->lock) { ++ pin = journal_get_next_pin(j, seq_to_flush, ++ allowed_below, ++ allowed_above, &seq); ++ if (pin) { ++ BUG_ON(j->flush_in_progress); ++ j->flush_in_progress = pin; ++ j->flush_in_progress_dropped = false; ++ flush_fn = pin->flush; ++ } + } +- spin_unlock(&j->lock); + + if (!pin) + break; +@@ -599,13 +594,13 @@ static size_t journal_flush_pins(struct journal *j, + + err = flush_fn(j, pin, seq); + +- spin_lock(&j->lock); +- /* Pin might have been dropped or rearmed: */ +- if (likely(!err && !j->flush_in_progress_dropped)) +- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); +- j->flush_in_progress = NULL; +- j->flush_in_progress_dropped = false; +- spin_unlock(&j->lock); ++ scoped_guard(spinlock, &j->lock) { ++ /* Pin might have been dropped or rearmed: */ ++ if (likely(!err && !j->flush_in_progress_dropped)) ++ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); ++ j->flush_in_progress = NULL; ++ j->flush_in_progress_dropped = false; ++ } + + wake_up(&j->pin_flush_wait); + +@@ -623,9 +618,10 @@ static u64 journal_seq_to_flush(struct journal *j) + struct bch_fs *c = container_of(j, struct bch_fs, journal); + u64 seq_to_flush = 0; + +- spin_lock(&j->lock); ++ guard(spinlock)(&j->lock); ++ guard(rcu)(); + +- for_each_rw_member(c, ca) { ++ for_each_rw_member_rcu(c, ca) { + struct journal_device *ja = &ca->journal; + unsigned nr_buckets, bucket_to_flush; + +@@ -635,20 +631,15 @@ static u64 journal_seq_to_flush(struct journal *j) + /* Try to keep the journal at most half full: */ + nr_buckets = ja->nr / 2; + +- nr_buckets = min(nr_buckets, ja->nr); +- + bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; + seq_to_flush = max(seq_to_flush, + ja->bucket_seq[bucket_to_flush]); + } + + /* Also flush if the pin fifo is more than half full */ +- seq_to_flush = max_t(s64, seq_to_flush, +- (s64) journal_cur_seq(j) - +- (j->pin.size >> 1)); +- spin_unlock(&j->lock); +- +- return seq_to_flush; ++ return max_t(s64, seq_to_flush, ++ (s64) journal_cur_seq(j) - ++ (j->pin.size >> 1)); + } + + /** +@@ -699,6 +690,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) + if (ret) + break; + ++ /* XXX shove journal discards off to another thread */ + bch2_journal_do_discards(j); + + seq_to_flush = journal_seq_to_flush(j); +@@ -769,9 +761,8 @@ static int bch2_journal_reclaim_thread(void *arg) + + j->reclaim_kicked = false; + +- mutex_lock(&j->reclaim_lock); +- ret = __bch2_journal_reclaim(j, false, kicked); +- mutex_unlock(&j->reclaim_lock); ++ scoped_guard(mutex, &j->reclaim_lock) ++ ret = __bch2_journal_reclaim(j, false, kicked); + + now = jiffies; + delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); +@@ -787,9 +778,8 @@ static int bch2_journal_reclaim_thread(void *arg) + if (j->reclaim_kicked) + break; + +- spin_lock(&j->lock); +- journal_empty = 
fifo_empty(&j->pin);
+- spin_unlock(&j->lock);
++ scoped_guard(spinlock, &j->lock)
++ journal_empty = fifo_empty(&j->pin);
+ 
+ long timeout = j->next_reclaim - jiffies;
+ 
+@@ -843,10 +833,10 @@ int bch2_journal_reclaim_start(struct journal *j)
+ static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
+ unsigned types)
+ {
++ guard(spinlock)(&j->lock);
++
+ struct journal_entry_pin_list *pin_list;
+ u64 seq;
+-
+- spin_lock(&j->lock);
+ fifo_for_each_entry_ptr(pin_list, &j->pin, seq) {
+ if (seq > seq_to_flush)
+ break;
+@@ -854,12 +844,9 @@ static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush,
+ for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++)
+ if ((BIT(i) & types) &&
+ (!list_empty(&pin_list->unflushed[i]) ||
+- !list_empty(&pin_list->flushed[i]))) {
+- spin_unlock(&j->lock);
++ !list_empty(&pin_list->flushed[i])))
+ return true;
+- }
+ }
+- spin_unlock(&j->lock);
+ 
+ return false;
+ }
+@@ -880,32 +867,54 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
+ if (ret)
+ return ret;
+ 
+- mutex_lock(&j->reclaim_lock);
++ guard(mutex)(&j->reclaim_lock);
+ 
+ for (int type = JOURNAL_PIN_TYPE_NR - 1;
+ type >= 0;
+ --type)
+ if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
+ *did_work = true;
+- goto unlock;
++
++ /*
++ * Question from Dan Carpenter, on the early return:
++ *
++ * If journal_flush_pins_or_still_flushing() returns
++ * true, then the flush hasn't completed and we must
++ * return 0; we want the outer closure_wait_event() in
++ * journal_flush_pins() to continue.
++ *
++ * The early return is there because we don't want to
++ * call journal_entry_close() until we've finished
++ * flushing all outstanding journal pins - otherwise
++ * seq_to_flush can be U64_MAX, and we'll close a bunch
++ * of journal entries and write tiny ones, completely
++ * unnecessarily.
++ *
++ * Having the early return be in the loop where we loop
++ * over types is important, because flushing one journal
++ * pin can cause new journal pins to be added (even of
++ * the same type, btree node writes may generate more
++ * btree node writes, when updating the parent pointer
++ * hits a full node and has to trigger a split/compact).
++ *
++ * This is part of our shutdown sequence, where order of
++ * flushing is important in order to make sure that it
++ * terminates...
++ */
++ return 0;
+ }
+ 
+ if (seq_to_flush > journal_cur_seq(j))
+ bch2_journal_entry_close(j);
+ 
+- spin_lock(&j->lock);
+ /*
+ * If journal replay hasn't completed, the unreplayed journal entries
+ * hold refs on their corresponding sequence numbers
+ */
++ guard(spinlock)(&j->lock);
+ ret = !test_bit(JOURNAL_replay_done, &j->flags) ||
+ journal_last_seq(j) > seq_to_flush ||
+ !fifo_used(&j->pin);
+-
+- spin_unlock(&j->lock);
+-unlock:
+- mutex_unlock(&j->reclaim_lock);
+-
+ return ret;
+ }
+ 
+@@ -930,13 +939,12 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
+ u64 iter, seq = 0;
+ int ret = 0;
+ 
+- spin_lock(&j->lock);
+- fifo_for_each_entry_ptr(p, &j->pin, iter)
+- if (dev_idx >= 0
+- ? bch2_dev_list_has_dev(p->devs, dev_idx)
+- : p->devs.nr < c->opts.metadata_replicas)
+- seq = iter;
+- spin_unlock(&j->lock);
++ scoped_guard(spinlock, &j->lock)
++ fifo_for_each_entry_ptr(p, &j->pin, iter)
++ if (dev_idx >= 0
++ ? 
bch2_dev_list_has_dev(p->devs, dev_idx) ++ : p->devs.nr < c->opts.metadata_replicas) ++ seq = iter; + + bch2_journal_flush_pins(j, seq); + +@@ -944,7 +952,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) + if (ret) + return ret; + +- mutex_lock(&c->replicas_gc_lock); ++ guard(mutex)(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); + + /* +@@ -959,29 +967,25 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) + goto err; + + seq = 0; +- spin_lock(&j->lock); +- while (!ret) { +- struct bch_replicas_padded replicas; ++ scoped_guard(spinlock, &j->lock) ++ while (!ret) { ++ union bch_replicas_padded replicas; + +- seq = max(seq, journal_last_seq(j)); +- if (seq >= j->pin.back) +- break; +- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, +- journal_seq_pin(j, seq)->devs); +- seq++; ++ seq = max(seq, journal_last_seq(j)); ++ if (seq >= j->pin.back) ++ break; ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, ++ journal_seq_pin(j, seq)->devs); ++ seq++; + +- if (replicas.e.nr_devs) { +- spin_unlock(&j->lock); +- ret = bch2_mark_replicas(c, &replicas.e); +- spin_lock(&j->lock); ++ if (replicas.e.nr_devs) { ++ spin_unlock(&j->lock); ++ ret = bch2_mark_replicas(c, &replicas.e); ++ spin_lock(&j->lock); ++ } + } +- } +- spin_unlock(&j->lock); + err: +- ret = bch2_replicas_gc_end(c, ret); +- mutex_unlock(&c->replicas_gc_lock); +- +- return ret; ++ return bch2_replicas_gc_end(c, ret); + } + + bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) +@@ -989,20 +993,16 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; + +- spin_lock(&j->lock); +- if (!test_bit(JOURNAL_running, &j->flags)) { +- spin_unlock(&j->lock); ++ guard(spinlock)(&j->lock); ++ guard(printbuf_atomic)(out); ++ ++ if (!test_bit(JOURNAL_running, &j->flags)) + return true; +- } + + *seq = max(*seq, j->pin.front); + +- if (*seq >= j->pin.back) { +- spin_unlock(&j->lock); ++ if (*seq >= j->pin.back) + return true; +- } +- +- out->atomic++; + + pin_list = journal_seq_pin(j, *seq); + +@@ -1021,9 +1021,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 + + printbuf_indent_sub(out, 2); + +- --out->atomic; +- spin_unlock(&j->lock); +- + return false; + } + +diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c +index 62b910f2fb27..0cb9b93f13e7 100644 +--- a/fs/bcachefs/journal_sb.c ++++ b/fs/bcachefs/journal_sb.c +@@ -210,7 +210,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, + j = bch2_sb_field_resize(&ca->disk_sb, journal_v2, + (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); + if (!j) +- return -BCH_ERR_ENOSPC_sb_journal; ++ return bch_err_throw(c, ENOSPC_sb_journal); + + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); + +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +index e463d2d95359..399db5b77d9f 100644 +--- a/fs/bcachefs/journal_seq_blacklist.c ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -49,7 +49,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) + unsigned i = 0, nr; + int ret = 0; + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); + nr = blacklist_nr_entries(bl); + +@@ -77,10 +77,8 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) + + bl = 
bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, + sb_blacklist_u64s(nr + 1)); +- if (!bl) { +- ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist; +- goto out; +- } ++ if (!bl) ++ return bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist); + + array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) { + .start = cpu_to_le64(start), +@@ -89,8 +87,6 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) + c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); + + ret = bch2_write_super(c); +-out: +- mutex_unlock(&c->sb_lock); + + return ret ?: bch2_blacklist_table_initialize(c); + } +@@ -103,6 +99,52 @@ static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r) + return cmp_int(l->start, r->start); + } + ++static int journal_seq_blacklist_table_end_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_seq_blacklist_table_entry *l = _l; ++ const struct journal_seq_blacklist_table_entry *r = _r; ++ ++ return cmp_int(l->end, r->end); ++} ++ ++u64 bch2_journal_seq_next_blacklisted(struct bch_fs *c, u64 seq) ++{ ++ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; ++ ++ if (!t) ++ return U64_MAX; ++ ++ struct journal_seq_blacklist_table_entry search = { .end = seq }; ++ int idx = eytzinger0_find_gt(t->entries, t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_end_cmp, ++ &search); ++ if (idx < 0) ++ return U64_MAX; ++ ++ return max(seq, t->entries[idx].start); ++} ++ ++u64 bch2_journal_seq_next_nonblacklisted(struct bch_fs *c, u64 seq) ++{ ++ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; ++ ++ if (!t) ++ return seq; ++ ++ while (true) { ++ struct journal_seq_blacklist_table_entry search = { .start = seq }; ++ int idx = eytzinger0_find_le(t->entries, t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ &search); ++ if (idx < 0 || t->entries[idx].end <= seq) ++ return seq; ++ ++ seq = t->entries[idx].end; ++ } ++} ++ + bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, + bool dirty) + { +@@ -130,6 +172,16 @@ bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, + return true; + } + ++u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c) ++{ ++ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; ++ ++ if (!t || !t->nr) ++ return 0; ++ ++ return t->entries[eytzinger0_last(t->nr)].end - 1; ++} ++ + int bch2_blacklist_table_initialize(struct bch_fs *c) + { + struct bch_sb_field_journal_seq_blacklist *bl = +@@ -142,7 +194,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) + + t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); + if (!t) +- return -BCH_ERR_ENOMEM_blacklist_table_init; ++ return bch_err_throw(c, ENOMEM_blacklist_table_init); + + t->nr = nr; + +diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h +index d47636f96fdc..389b789b26f4 100644 +--- a/fs/bcachefs/journal_seq_blacklist.h ++++ b/fs/bcachefs/journal_seq_blacklist.h +@@ -11,7 +11,11 @@ blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) + : 0; + } + ++u64 bch2_journal_seq_next_blacklisted(struct bch_fs *, u64); ++u64 bch2_journal_seq_next_nonblacklisted(struct bch_fs *, u64); ++ + bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); ++u64 bch2_journal_last_blacklisted_seq(struct bch_fs *); + int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); + int bch2_blacklist_table_initialize(struct bch_fs *); + +diff --git 
a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 8e0eba776b9d..51104bbb99da 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -151,8 +151,6 @@ enum journal_flags { + #undef x + }; + +-typedef DARRAY(u64) darray_u64; +- + struct journal_bio { + struct bch_dev *ca; + unsigned buf_idx; +diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c +index 75f27ec26f85..38cdacc6b067 100644 +--- a/fs/bcachefs/logged_ops.c ++++ b/fs/bcachefs/logged_ops.c +@@ -35,7 +35,7 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, + { + struct bch_fs *c = trans->c; + u32 restart_count = trans->restart_count; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + fsck_err_on(test_bit(BCH_FS_clean_recovery, &c->flags), +@@ -56,21 +56,18 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, + + bch2_bkey_buf_exit(&sk, c); + fsck_err: +- printbuf_exit(&buf); + return ret ?: trans_was_restarted(trans, restart_count); + } + + int bch2_resume_logged_ops(struct bch_fs *c) + { +- int ret = bch2_trans_run(c, +- for_each_btree_key_max(trans, iter, ++ CLASS(btree_trans, trans)(c); ++ return for_each_btree_key_max(trans, iter, + BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM_logged_ops, 0), + POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), + BTREE_ITER_prefetch, k, +- resume_logged_op(trans, &iter, k))); +- bch_err_fn(c, ret); +- return ret; ++ resume_logged_op(trans, &iter, k)); + } + + static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) +@@ -84,7 +81,7 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) + k->k.p = iter.pos; + + ret = bch2_trans_update(trans, &iter, k, 0); +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -107,12 +104,11 @@ int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k) + */ + if (ret) { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + bch2_fs_fatal_error(c, "deleting logged operation %s: %s", + buf.buf, bch2_err_str(ret)); +- printbuf_exit(&buf); + } + + return ret; +diff --git a/fs/bcachefs/logged_ops.h b/fs/bcachefs/logged_ops.h +index 30ae9ef737dd..6dea6e2ac7a8 100644 +--- a/fs/bcachefs/logged_ops.h ++++ b/fs/bcachefs/logged_ops.h +@@ -10,7 +10,7 @@ + + static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i *op) + { +- return bch2_btree_insert_nonextent(trans, BTREE_ID_logged_ops, op, 0); ++ return bch2_btree_insert_trans(trans, BTREE_ID_logged_ops, op, BTREE_ITER_cached); + } + + int bch2_resume_logged_ops(struct bch_fs *); +diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c +index 2f63fc6d456f..b9c0834498dd 100644 +--- a/fs/bcachefs/lru.c ++++ b/fs/bcachefs/lru.c +@@ -9,6 +9,7 @@ + #include "ec.h" + #include "error.h" + #include "lru.h" ++#include "progress.h" + #include "recovery.h" + + /* KEY_TYPE_lru is obsolete: */ +@@ -86,11 +87,9 @@ int bch2_lru_check_set(struct btree_trans *trans, + struct bkey_buf *last_flushed) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; +- struct btree_iter lru_iter; +- struct bkey_s_c lru_k = +- bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, +- lru_pos(lru_id, dev_bucket, time), 0); ++ CLASS(printbuf, buf)(); ++ CLASS(btree_iter, lru_iter)(trans, BTREE_ID_lru, lru_pos(lru_id, dev_bucket, time), 0); ++ struct bkey_s_c lru_k = bch2_btree_iter_peek_slot(&lru_iter); + 
int ret = bkey_err(lru_k); + if (ret) + return ret; +@@ -98,7 +97,7 @@ int bch2_lru_check_set(struct btree_trans *trans, + if (lru_k.k->type != KEY_TYPE_set) { + ret = bch2_btree_write_buffer_maybe_flush(trans, referring_k, last_flushed); + if (ret) +- goto err; ++ return ret; + + if (fsck_err(trans, alloc_key_to_missing_lru_entry, + "missing %s lru entry\n%s", +@@ -106,13 +105,10 @@ int bch2_lru_check_set(struct btree_trans *trans, + (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { + ret = bch2_lru_set(trans, lru_id, dev_bucket, time); + if (ret) +- goto err; ++ return ret; + } + } +-err: + fsck_err: +- bch2_trans_iter_exit(trans, &lru_iter); +- printbuf_exit(&buf); + return ret; + } + +@@ -145,13 +141,11 @@ static u64 bkey_lru_type_idx(struct bch_fs *c, + case BCH_LRU_fragmentation: { + a = bch2_alloc_to_v4(k, &a_convert); + +- rcu_read_lock(); ++ guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); +- u64 idx = ca ++ return ca + ? alloc_lru_idx_fragmentation(*a, ca) + : 0; +- rcu_read_unlock(); +- return idx; + } + case BCH_LRU_stripes: + return k.k->type == KEY_TYPE_stripe +@@ -168,16 +162,16 @@ static int bch2_check_lru_key(struct btree_trans *trans, + struct bkey_buf *last_flushed) + { + struct bch_fs *c = trans->c; +- struct printbuf buf1 = PRINTBUF; +- struct printbuf buf2 = PRINTBUF; ++ CLASS(printbuf, buf1)(); ++ CLASS(printbuf, buf2)(); + + struct bbpos bp = lru_pos_to_bp(lru_k); + +- struct btree_iter iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0); ++ CLASS(btree_iter, iter)(trans, bp.btree, bp.pos, 0); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + enum bch_lru_type type = lru_type(lru_k); + u64 idx = bkey_lru_type_idx(c, type, k); +@@ -185,7 +179,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, + if (lru_pos_time(lru_k.k->p) != idx) { + ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed); + if (ret) +- goto err; ++ return ret; + + if (fsck_err(trans, lru_entry_bad, + "incorrect lru entry: lru %s time %llu\n" +@@ -195,13 +189,9 @@ static int bch2_check_lru_key(struct btree_trans *trans, + lru_pos_time(lru_k.k->p), + (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) +- ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); ++ return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); + } +-err: + fsck_err: +- bch2_trans_iter_exit(trans, &iter); +- printbuf_exit(&buf2); +- printbuf_exit(&buf1); + return ret; + } + +@@ -212,14 +202,18 @@ int bch2_check_lrus(struct bch_fs *c) + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_lru)); ++ ++ CLASS(btree_trans, trans)(c); ++ int ret = for_each_btree_key_commit(trans, iter, + BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, +- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- bch2_check_lru_key(trans, &iter, k, &last_flushed))); ++ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ progress_update_iter(trans, &progress, &iter); ++ bch2_check_lru_key(trans, &iter, k, &last_flushed); ++ })); + + bch2_bkey_buf_exit(&last_flushed, c); +- bch_err_fn(c, ret); + return ret; + + } +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +index 90dcf80bd64a..a66d01d04e57 100644 +--- 
a/fs/bcachefs/migrate.c ++++ b/fs/bcachefs/migrate.c +@@ -4,10 +4,13 @@ + */ + + #include "bcachefs.h" ++#include "backpointers.h" + #include "bkey_buf.h" + #include "btree_update.h" + #include "btree_update_interior.h" ++#include "btree_write_buffer.h" + #include "buckets.h" ++#include "ec.h" + #include "errcode.h" + #include "extents.h" + #include "io_write.h" +@@ -20,7 +23,7 @@ + #include "super-io.h" + + static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, +- unsigned dev_idx, int flags, bool metadata) ++ unsigned dev_idx, unsigned flags, bool metadata) + { + unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; + unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; +@@ -32,16 +35,33 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, + nr_good = bch2_bkey_durability(c, k.s_c); + if ((!nr_good && !(flags & lost)) || + (nr_good < replicas && !(flags & degraded))) +- return -BCH_ERR_remove_would_lose_data; ++ return bch_err_throw(c, remove_would_lose_data); + + return 0; + } + ++static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter, ++ struct btree *b, unsigned dev_idx, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_buf k; ++ ++ bch2_bkey_buf_init(&k); ++ bch2_bkey_buf_copy(&k, c, &b->key); ++ ++ int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?: ++ bch2_btree_node_update_key(trans, iter, b, k.k, 0, false); ++ ++ bch_err_fn(c, ret); ++ bch2_bkey_buf_exit(&k, c); ++ return ret; ++} ++ + static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + unsigned dev_idx, +- int flags) ++ unsigned flags) + { + struct bch_fs *c = trans->c; + struct bkey_i *n; +@@ -77,38 +97,51 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, + return 0; + } + ++static int bch2_dev_btree_drop_key(struct btree_trans *trans, ++ struct bkey_s_c_backpointer bp, ++ unsigned dev_idx, ++ struct bkey_buf *last_flushed, ++ unsigned flags) ++{ ++ struct btree_iter iter; ++ struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed); ++ int ret = PTR_ERR_OR_ZERO(b); ++ if (ret) ++ return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 
0 : ret; ++ ++ ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); ++ ++ bch2_trans_iter_exit(&iter); ++ return ret; ++} ++ + static int bch2_dev_usrdata_drop(struct bch_fs *c, + struct progress_indicator_state *progress, +- unsigned dev_idx, int flags) ++ unsigned dev_idx, unsigned flags) + { +- struct btree_trans *trans = bch2_trans_get(c); +- enum btree_id id; +- int ret = 0; ++ CLASS(btree_trans, trans)(c); + +- for (id = 0; id < BTREE_ID_NR; id++) { ++ for (unsigned id = 0; id < BTREE_ID_NR; id++) { + if (!btree_type_has_ptrs(id)) + continue; + +- ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, ++ int ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + bch2_progress_update_iter(trans, progress, &iter, "dropping user data"); + bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); + })); + if (ret) +- break; ++ return ret; + } + +- bch2_trans_put(trans); +- +- return ret; ++ return 0; + } + + static int bch2_dev_metadata_drop(struct bch_fs *c, + struct progress_indicator_state *progress, +- unsigned dev_idx, int flags) ++ unsigned dev_idx, unsigned flags) + { +- struct btree_trans *trans; + struct btree_iter iter; + struct closure cl; + struct btree *b; +@@ -118,9 +151,9 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, + + /* don't handle this yet: */ + if (flags & BCH_FORCE_IF_METADATA_LOST) +- return -BCH_ERR_remove_with_metadata_missing_unimplemented; ++ return bch_err_throw(c, remove_with_metadata_missing_unimplemented); + +- trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + bch2_bkey_buf_init(&k); + closure_init_stack(&cl); + +@@ -130,36 +163,28 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, + retry: + ret = 0; + while (bch2_trans_begin(trans), +- (b = bch2_btree_iter_peek_node(trans, &iter)) && ++ (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { + bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); + + if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) + goto next; + +- bch2_bkey_buf_copy(&k, c, &b->key); +- +- ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), +- dev_idx, flags, true); +- if (ret) +- break; +- +- ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); ++ ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + +- bch_err_msg(c, ret, "updating btree node key"); + if (ret) + break; + next: +- bch2_btree_iter_next_node(trans, &iter); ++ bch2_btree_iter_next_node(&iter); + } + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + + if (ret) + goto err; +@@ -169,14 +194,71 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, + ret = 0; + err: + bch2_bkey_buf_exit(&k, c); +- bch2_trans_put(trans); + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + return ret; + } + +-int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx, ++ struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed, ++ unsigned flags) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, ++ last_flushed); ++ int ret = bkey_err(k); ++ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) ++ return 0; ++ if (ret) ++ return ret; ++ ++ if (!k.k || 
!bch2_bkey_has_device_c(k, dev_idx)) ++ goto out; ++ ++ /* ++ * XXX: pass flags arg to invalidate_stripe_to_dev and handle it ++ * properly ++ */ ++ ++ if (bkey_is_btree_ptr(k.k)) ++ ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags); ++ else if (k.k->type == KEY_TYPE_stripe) ++ ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags); ++ else ++ ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); ++out: ++ bch2_trans_iter_exit(&iter); ++ return ret; ++} ++ ++int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags) ++{ ++ CLASS(btree_trans, trans)(c); ++ ++ struct bkey_buf last_flushed; ++ bch2_bkey_buf_init(&last_flushed); ++ bkey_init(&last_flushed.k->k); ++ ++ int ret = bch2_btree_write_buffer_flush_sync(trans) ?: ++ for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, ++ POS(dev_idx, 0), ++ POS(dev_idx, U64_MAX), 0, k, ++ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ if (k.k->type != KEY_TYPE_backpointer) ++ continue; ++ ++ data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), ++ &last_flushed, flags); ++ ++ })); ++ ++ bch2_bkey_buf_exit(&last_flushed, trans->c); ++ bch_err_fn(c, ret); ++ return ret; ++} ++ ++int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags) + { + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, +diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h +index 027efaa0d575..30018140711b 100644 +--- a/fs/bcachefs/migrate.h ++++ b/fs/bcachefs/migrate.h +@@ -2,6 +2,7 @@ + #ifndef _BCACHEFS_MIGRATE_H + #define _BCACHEFS_MIGRATE_H + +-int bch2_dev_data_drop(struct bch_fs *, unsigned, int); ++int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned); ++int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned); + + #endif /* _BCACHEFS_MIGRATE_H */ +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index dfdbb9259985..30fe269d531d 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -38,36 +38,77 @@ const char * const bch2_data_ops_strs[] = { + NULL + }; + +-static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, +- struct bch_io_opts *io_opts, +- struct data_update_opts *data_opts) ++struct evacuate_bucket_arg { ++ struct bpos bucket; ++ int gen; ++ struct data_update_opts data_opts; ++}; ++ ++static bool evacuate_bucket_pred(struct bch_fs *, void *, ++ enum btree_id, struct bkey_s_c, ++ struct bch_io_opts *, ++ struct data_update_opts *); ++ ++static noinline void ++trace_io_move2(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts) + { +- if (trace_io_move_enabled()) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + +- bch2_bkey_val_to_text(&buf, c, k); +- prt_newline(&buf); +- bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); +- trace_io_move(c, buf.buf); +- printbuf_exit(&buf); +- } ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_newline(&buf); ++ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); ++ trace_io_move(c, buf.buf); + } + +-static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) ++static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) + { +- if (trace_io_move_read_enabled()) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + +- bch2_bkey_val_to_text(&buf, c, k); +- trace_io_move_read(c, buf.buf); +- printbuf_exit(&buf); ++ bch2_bkey_val_to_text(&buf, c, k); ++ trace_io_move_read(c, buf.buf); ++} ++ ++static noinline void 
++trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_update_opts *data_opts, ++ move_pred_fn pred, void *_arg, bool p) ++{ ++ CLASS(printbuf, buf)(); ++ ++ prt_printf(&buf, "%ps: %u", pred, p); ++ ++ if (pred == evacuate_bucket_pred) { ++ struct evacuate_bucket_arg *arg = _arg; ++ prt_printf(&buf, " gen=%u", arg->gen); + } ++ ++ prt_newline(&buf); ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_newline(&buf); ++ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); ++ trace_io_move_pred(c, buf.buf); ++} ++ ++static noinline void ++trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen) ++{ ++ struct printbuf buf = PRINTBUF; ++ ++ prt_printf(&buf, "bucket: "); ++ bch2_bpos_to_text(&buf, bucket); ++ prt_printf(&buf, " gen: %i\n", gen); ++ ++ trace_io_move_evacuate_bucket(c, buf.buf); ++ printbuf_exit(&buf); + } + + struct moving_io { + struct list_head read_list; + struct list_head io_list; +- struct move_bucket_in_flight *b; ++ struct move_bucket *b; + struct closure cl; + bool read_completed; + +@@ -84,10 +125,9 @@ static void move_free(struct moving_io *io) + if (io->b) + atomic_dec(&io->b->count); + +- mutex_lock(&ctxt->lock); +- list_del(&io->io_list); ++ scoped_guard(mutex, &ctxt->lock) ++ list_del(&io->io_list); + wake_up(&ctxt->wait); +- mutex_unlock(&ctxt->lock); + + if (!io->write.data_opts.scrub) { + bch2_data_update_exit(&io->write); +@@ -106,12 +146,9 @@ static void move_write_done(struct bch_write_op *op) + + if (op->error) { + if (trace_io_move_write_fail_enabled()) { +- struct printbuf buf = PRINTBUF; +- ++ CLASS(printbuf, buf)(); + bch2_write_op_to_text(&buf, op); +- prt_printf(&buf, "ret\t%s\n", bch2_err_str(op->error)); + trace_io_move_write_fail(c, buf.buf); +- printbuf_exit(&buf); + } + this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]); + +@@ -126,31 +163,43 @@ static void move_write_done(struct bch_write_op *op) + + static void move_write(struct moving_io *io) + { ++ struct bch_fs *c = io->write.op.c; + struct moving_context *ctxt = io->write.ctxt; ++ struct bch_read_bio *rbio = &io->write.rbio; + + if (ctxt->stats) { +- if (io->write.rbio.bio.bi_status) ++ if (rbio->bio.bi_status) + atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, + &ctxt->stats->sectors_error_uncorrected); +- else if (io->write.rbio.saw_error) ++ else if (rbio->saw_error) + atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, + &ctxt->stats->sectors_error_corrected); + } + +- if (unlikely(io->write.rbio.ret || +- io->write.rbio.bio.bi_status || +- io->write.data_opts.scrub)) { ++ /* ++ * If the extent has been bitrotted, we're going to have to give it a ++ * new checksum in order to move it - but the poison bit will ensure ++ * that userspace still gets the appropriate error. 
++ */ ++ if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err && ++ (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) { ++ struct bch_extent_crc_unpacked crc = rbio->pick.crc; ++ struct nonce nonce = extent_nonce(rbio->version, crc); ++ ++ rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, ++ nonce, &rbio->bio); ++ rbio->ret = 0; ++ } ++ ++ if (unlikely(rbio->ret || io->write.data_opts.scrub)) { + move_free(io); + return; + } + + if (trace_io_move_write_enabled()) { +- struct bch_fs *c = io->write.op.c; +- struct printbuf buf = PRINTBUF; +- ++ CLASS(printbuf, buf)(); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); + trace_io_move_write(c, buf.buf); +- printbuf_exit(&buf); + } + + closure_get(&io->write.ctxt->cl); +@@ -219,9 +268,8 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) + EBUG_ON(atomic_read(&ctxt->read_sectors)); + EBUG_ON(atomic_read(&ctxt->read_ios)); + +- mutex_lock(&c->moving_context_lock); +- list_del(&ctxt->list); +- mutex_unlock(&c->moving_context_lock); ++ scoped_guard(mutex, &c->moving_context_lock) ++ list_del(&ctxt->list); + + /* + * Generally, releasing a transaction within a transaction restart means +@@ -257,9 +305,8 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt, + INIT_LIST_HEAD(&ctxt->ios); + init_waitqueue_head(&ctxt->wait); + +- mutex_lock(&c->moving_context_lock); +- list_add(&ctxt->list, &c->moving_context_list); +- mutex_unlock(&c->moving_context_lock); ++ scoped_guard(mutex, &c->moving_context_lock) ++ list_add(&ctxt->list, &c->moving_context_list); + } + + void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c) +@@ -275,7 +322,7 @@ void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) + } + + int bch2_move_extent(struct moving_context *ctxt, +- struct move_bucket_in_flight *bucket_in_flight, ++ struct move_bucket *bucket_in_flight, + struct btree_iter *iter, + struct bkey_s_c k, + struct bch_io_opts io_opts, +@@ -283,9 +330,10 @@ int bch2_move_extent(struct moving_context *ctxt, + { + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; +- int ret = -ENOMEM; ++ int ret = 0; + +- trace_io_move2(c, k, &io_opts, &data_opts); ++ if (trace_io_move_enabled()) ++ trace_io_move2(c, k, &io_opts, &data_opts); + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); + + if (ctxt->stats) +@@ -296,19 +344,20 @@ int bch2_move_extent(struct moving_context *ctxt, + if (!data_opts.rewrite_ptrs && + !data_opts.extra_replicas && + !data_opts.scrub) { +- if (data_opts.kill_ptrs) ++ if (data_opts.kill_ptrs) { ++ this_cpu_add(c->counters[BCH_COUNTER_io_move_drop_only], k.k->size); + return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); +- return 0; ++ } else { ++ this_cpu_add(c->counters[BCH_COUNTER_io_move_noop], k.k->size); ++ return 0; ++ } + } + +- /* +- * Before memory allocations & taking nocow locks in +- * bch2_data_update_init(): +- */ +- bch2_trans_unlock(trans); +- +- struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); +- if (!io) ++ struct moving_io *io = allocate_dropping_locks(trans, ret, ++ kzalloc(sizeof(struct moving_io), _gfp)); ++ if (!io && !ret) ++ ret = bch_err_throw(c, ENOMEM_move_extent); ++ if (ret) + goto err; + + INIT_LIST_HEAD(&io->io_list); +@@ -320,7 +369,7 @@ int bch2_move_extent(struct moving_context *ctxt, + ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, + &io_opts, data_opts, iter->btree_id, k); + if (ret) +- goto err_free; ++ goto 
err; + + io->write.op.end_io = move_write_done; + } else { +@@ -330,9 +379,11 @@ int bch2_move_extent(struct moving_context *ctxt, + io->write.op.c = c; + io->write.data_opts = data_opts; + ++ bch2_trans_unlock(trans); ++ + ret = bch2_data_update_bios_init(&io->write, c, &io_opts); + if (ret) +- goto err_free; ++ goto err; + } + + io->write.rbio.bio.bi_end_io = move_read_endio; +@@ -351,15 +402,16 @@ int bch2_move_extent(struct moving_context *ctxt, + atomic_inc(&io->b->count); + } + +- trace_io_move_read2(c, k); ++ if (trace_io_move_read_enabled()) ++ trace_io_move_read2(c, k); + +- mutex_lock(&ctxt->lock); +- atomic_add(io->read_sectors, &ctxt->read_sectors); +- atomic_inc(&ctxt->read_ios); ++ scoped_guard(mutex, &ctxt->lock) { ++ atomic_add(io->read_sectors, &ctxt->read_sectors); ++ atomic_inc(&ctxt->read_ios); + +- list_add_tail(&io->read_list, &ctxt->reads); +- list_add_tail(&io->io_list, &ctxt->ios); +- mutex_unlock(&ctxt->lock); ++ list_add_tail(&io->read_list, &ctxt->reads); ++ list_add_tail(&io->io_list, &ctxt->ios); ++ } + + /* + * dropped by move_read_endio() - guards against use after free of +@@ -374,12 +426,8 @@ int bch2_move_extent(struct moving_context *ctxt, + BCH_READ_last_fragment, + data_opts.scrub ? data_opts.read_dev : -1); + return 0; +-err_free: +- kfree(io); + err: +- if (bch2_err_matches(ret, BCH_ERR_data_update_done)) +- return 0; +- ++ kfree(io); + if (bch2_err_matches(ret, EROFS) || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; +@@ -387,18 +435,19 @@ int bch2_move_extent(struct moving_context *ctxt, + count_event(c, io_move_start_fail); + + if (trace_io_move_start_fail_enabled()) { +- struct printbuf buf = PRINTBUF; +- ++ CLASS(printbuf, buf)(); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, ": "); + prt_str(&buf, bch2_err_str(ret)); + trace_io_move_start_fail(c, buf.buf); +- printbuf_exit(&buf); + } ++ ++ if (bch2_err_matches(ret, BCH_ERR_data_update_done)) ++ return 0; + return ret; + } + +-static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, ++struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *io_opts, + struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ + struct btree_iter *extent_iter, +@@ -409,6 +458,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, + struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; + int ret = 0; + ++ if (btree_iter_path(trans, extent_iter)->level) ++ return opts_ret; ++ + if (extent_k.k->type == KEY_TYPE_reflink_v) + goto out; + +@@ -463,24 +515,22 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, + *io_opts = bch2_opts_to_inode_opts(c->opts); + + /* reflink btree? 
*/ +- if (!extent_k.k->p.inode) +- goto out; +- +- struct btree_iter inode_iter; +- struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, +- SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), +- BTREE_ITER_cached); +- int ret = bkey_err(inode_k); +- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +- return ret; ++ if (extent_k.k->p.inode) { ++ CLASS(btree_iter, inode_iter)(trans, BTREE_ID_inodes, ++ SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), ++ BTREE_ITER_cached); ++ struct bkey_s_c inode_k = bch2_btree_iter_peek_slot(&inode_iter); ++ int ret = bkey_err(inode_k); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ return ret; + +- if (!ret && bkey_is_inode(inode_k.k)) { +- struct bch_inode_unpacked inode; +- bch2_inode_unpack(inode_k, &inode); +- bch2_inode_opts_get(io_opts, c, &inode); ++ if (!ret && bkey_is_inode(inode_k.k)) { ++ struct bch_inode_unpacked inode; ++ bch2_inode_unpack(inode_k, &inode); ++ bch2_inode_opts_get(io_opts, c, &inode); ++ } + } +- bch2_trans_iter_exit(trans, &inode_iter); +-out: ++ + return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); + } + +@@ -545,25 +595,25 @@ static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans * + BTREE_ID_reflink, reflink_pos, + BTREE_ITER_not_extents); + +- struct bkey_s_c k = bch2_btree_iter_peek(trans, iter); ++ struct bkey_s_c k = bch2_btree_iter_peek(iter); + if (!k.k || bkey_err(k)) { +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + return k; + } + + if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) { +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + return bkey_s_c_null; + } + + return k; + } + +-static int bch2_move_data_btree(struct moving_context *ctxt, +- struct bpos start, +- struct bpos end, +- move_pred_fn pred, void *arg, +- enum btree_id btree_id) ++int bch2_move_data_btree(struct moving_context *ctxt, ++ struct bpos start, ++ struct bpos end, ++ move_pred_fn pred, void *arg, ++ enum btree_id btree_id, unsigned level) + { + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; +@@ -589,11 +639,55 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + ctxt->stats->pos = BBPOS(btree_id, start); + } + ++retry_root: + bch2_trans_begin(trans); +- bch2_trans_iter_init(trans, &iter, btree_id, start, +- BTREE_ITER_prefetch| +- BTREE_ITER_not_extents| +- BTREE_ITER_all_snapshots); ++ ++ if (level == bch2_btree_id_root(c, btree_id)->level + 1) { ++ bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1, ++ BTREE_ITER_prefetch| ++ BTREE_ITER_not_extents| ++ BTREE_ITER_all_snapshots); ++ struct btree *b = bch2_btree_iter_peek_node(&iter); ++ ret = PTR_ERR_OR_ZERO(b); ++ if (ret) ++ goto root_err; ++ ++ if (b != btree_node_root(c, b)) { ++ bch2_trans_iter_exit(&iter); ++ goto retry_root; ++ } ++ ++ k = bkey_i_to_s_c(&b->key); ++ ++ io_opts = &snapshot_io_opts.fs_io_opts; ++ ret = PTR_ERR_OR_ZERO(io_opts); ++ if (ret) ++ goto root_err; ++ ++ memset(&data_opts, 0, sizeof(data_opts)); ++ if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts)) ++ goto out; ++ ++ ++ if (!data_opts.scrub) ++ ret = bch2_btree_node_rewrite_pos(trans, btree_id, level, ++ k.k->p, data_opts.target, 0); ++ else ++ ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); ++ ++root_err: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ++ bch2_trans_iter_exit(&iter); ++ goto retry_root; ++ } ++ ++ goto out; ++ } ++ ++ 
bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level, ++ BTREE_ITER_prefetch| ++ BTREE_ITER_not_extents| ++ BTREE_ITER_all_snapshots); + + if (ctxt->rate) + bch2_ratelimit_reset(ctxt->rate); +@@ -603,7 +697,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + + bch2_trans_begin(trans); + +- k = bch2_btree_iter_peek(trans, &iter); ++ k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + +@@ -613,7 +707,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + if (ret) + break; + +- if (bkey_ge(bkey_start_pos(k.k), end)) ++ if (bkey_gt(bkey_start_pos(k.k), end)) + break; + + if (ctxt->stats) +@@ -624,7 +718,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + +- bch2_trans_iter_exit(trans, &reflink_iter); ++ bch2_trans_iter_exit(&reflink_iter); + k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +@@ -653,7 +747,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + continue; + + memset(&data_opts, 0, sizeof(data_opts)); +- if (!pred(c, arg, k, io_opts, &data_opts)) ++ if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts)) + goto next; + + /* +@@ -663,7 +757,14 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + +- ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); ++ if (!level) ++ ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); ++ else if (!data_opts.scrub) ++ ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level, ++ k.k->p, data_opts.target, 0); ++ else ++ ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); ++ + if (ret2) { + if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) + continue; +@@ -681,83 +782,86 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + if (ctxt->stats) + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + next_nondata: +- bch2_btree_iter_advance(trans, &iter); ++ if (!bch2_btree_iter_advance(&iter)) ++ break; + } +- +- bch2_trans_iter_exit(trans, &reflink_iter); +- bch2_trans_iter_exit(trans, &iter); ++out: ++ bch2_trans_iter_exit(&reflink_iter); ++ bch2_trans_iter_exit(&iter); + bch2_bkey_buf_exit(&sk, c); + per_snapshot_io_opts_exit(&snapshot_io_opts); + + return ret; + } + +-int __bch2_move_data(struct moving_context *ctxt, +- struct bbpos start, +- struct bbpos end, +- move_pred_fn pred, void *arg) ++static int bch2_move_data(struct bch_fs *c, ++ struct bbpos start, ++ struct bbpos end, ++ unsigned min_depth, ++ struct bch_ratelimit *rate, ++ struct bch_move_stats *stats, ++ struct write_point_specifier wp, ++ bool wait_on_copygc, ++ move_pred_fn pred, void *arg) + { +- struct bch_fs *c = ctxt->trans->c; +- enum btree_id id; + int ret = 0; + +- for (id = start.btree; ++ struct moving_context ctxt; ++ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); ++ ++ for (enum btree_id id = start.btree; + id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1); + id++) { +- ctxt->stats->pos = BBPOS(id, POS_MIN); ++ ctxt.stats->pos = BBPOS(id, POS_MIN); + +- if (!btree_type_has_ptrs(id) || +- !bch2_btree_id_root(c, id)->b) ++ if (!bch2_btree_id_root(c, id)->b) + continue; + +- ret = bch2_move_data_btree(ctxt, +- id == start.btree ? start.pos : POS_MIN, +- id == end.btree ? 
end.pos : POS_MAX, +- pred, arg, id); ++ unsigned min_depth_this_btree = min_depth; ++ ++ if (!btree_type_has_ptrs(id)) ++ min_depth_this_btree = max(min_depth_this_btree, 1); ++ ++ for (unsigned level = min_depth_this_btree; ++ level < BTREE_MAX_DEPTH; ++ level++) { ++ ret = bch2_move_data_btree(&ctxt, ++ id == start.btree ? start.pos : POS_MIN, ++ id == end.btree ? end.pos : POS_MAX, ++ pred, arg, id, level); ++ if (ret) ++ break; ++ } ++ + if (ret) + break; + } + +- return ret; +-} +- +-int bch2_move_data(struct bch_fs *c, +- struct bbpos start, +- struct bbpos end, +- struct bch_ratelimit *rate, +- struct bch_move_stats *stats, +- struct write_point_specifier wp, +- bool wait_on_copygc, +- move_pred_fn pred, void *arg) +-{ +- struct moving_context ctxt; +- +- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); +- int ret = __bch2_move_data(&ctxt, start, end, pred, arg); + bch2_moving_ctxt_exit(&ctxt); +- + return ret; + } + + static int __bch2_move_data_phys(struct moving_context *ctxt, +- struct move_bucket_in_flight *bucket_in_flight, ++ struct move_bucket *bucket_in_flight, + unsigned dev, + u64 bucket_start, + u64 bucket_end, + unsigned data_types, ++ bool copygc, + move_pred_fn pred, void *arg) + { + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + bool is_kthread = current->flags & PF_KTHREAD; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); +- struct btree_iter iter = {}, bp_iter = {}; ++ struct btree_iter iter = {}; + struct bkey_buf sk; + struct bkey_s_c k; + struct bkey_buf last_flushed; ++ u64 check_mismatch_done = bucket_start; + int ret = 0; + +- struct bch_dev *ca = bch2_dev_tryget(c, dev); ++ CLASS(bch2_dev_tryget, ca)(c, dev); + if (!ca) + return 0; + +@@ -765,8 +869,6 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, + + struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); + struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); +- bch2_dev_put(ca); +- ca = NULL; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); +@@ -777,11 +879,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, + */ + bch2_trans_begin(trans); + +- bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); +- +- bch_err_msg(c, ret, "looking up alloc key"); +- if (ret) +- goto err; ++ CLASS(btree_iter, bp_iter)(trans, BTREE_ID_backpointers, bp_start, 0); + + ret = bch2_btree_write_buffer_tryflush(trans); + if (!bch2_err_matches(ret, EROFS)) +@@ -795,7 +893,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, + + bch2_trans_begin(trans); + +- k = bch2_btree_iter_peek(trans, &bp_iter); ++ k = bch2_btree_iter_peek(&bp_iter); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; +@@ -805,6 +903,14 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, + if (!k.k || bkey_gt(k.k->p, bp_end)) + break; + ++ if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { ++ while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { ++ bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, ++ copygc, &last_flushed); ++ } ++ continue; ++ } ++ + if (k.k->type != KEY_TYPE_backpointer) + goto next; + +@@ -831,21 +937,27 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, + if (!bp.v->level) { + ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); + if (ret) { +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + continue; + } + } + + 
struct data_update_opts data_opts = {}; +- if (!pred(c, arg, k, &io_opts, &data_opts)) { +- bch2_trans_iter_exit(trans, &iter); ++ bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts); ++ ++ if (trace_io_move_pred_enabled()) ++ trace_io_move_pred2(c, k, &io_opts, &data_opts, ++ pred, arg, p); ++ ++ if (!p) { ++ bch2_trans_iter_exit(&iter); + goto next; + } + + if (data_opts.scrub && + !bch2_dev_idx_is_online(c, data_opts.read_dev)) { +- bch2_trans_iter_exit(trans, &iter); +- ret = -BCH_ERR_device_offline; ++ bch2_trans_iter_exit(&iter); ++ ret = bch_err_throw(c, device_offline); + break; + } + +@@ -858,11 +970,12 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, + if (!bp.v->level) + ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); + else if (!data_opts.scrub) +- ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, k.k->p, 0); ++ ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, ++ k.k->p, data_opts.target, 0); + else + ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); + +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; +@@ -877,47 +990,48 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, + if (ctxt->stats) + atomic64_add(sectors, &ctxt->stats->sectors_seen); + next: +- bch2_btree_iter_advance(trans, &bp_iter); ++ bch2_btree_iter_advance(&bp_iter); + } ++ ++ while (check_mismatch_done < bucket_end) ++ bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, ++ copygc, &last_flushed); + err: +- bch2_trans_iter_exit(trans, &bp_iter); + bch2_bkey_buf_exit(&sk, c); + bch2_bkey_buf_exit(&last_flushed, c); + return ret; + } + +-static int bch2_move_data_phys(struct bch_fs *c, +- unsigned dev, +- u64 start, +- u64 end, +- unsigned data_types, +- struct bch_ratelimit *rate, +- struct bch_move_stats *stats, +- struct write_point_specifier wp, +- bool wait_on_copygc, +- move_pred_fn pred, void *arg) ++int bch2_move_data_phys(struct bch_fs *c, ++ unsigned dev, ++ u64 start, ++ u64 end, ++ unsigned data_types, ++ struct bch_ratelimit *rate, ++ struct bch_move_stats *stats, ++ struct write_point_specifier wp, ++ bool wait_on_copygc, ++ move_pred_fn pred, void *arg) + { + struct moving_context ctxt; + +- bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); +- + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); +- ctxt.stats->phys = true; +- ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; ++ bch2_btree_write_buffer_flush_sync(ctxt.trans); ++ ++ if (ctxt.stats) { ++ ctxt.stats->phys = true; ++ ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; ++ } + +- int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, data_types, pred, arg); ++ int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, ++ data_types, false, pred, arg); + bch2_moving_ctxt_exit(&ctxt); + + return ret; + } + +-struct evacuate_bucket_arg { +- struct bpos bucket; +- int gen; +- struct data_update_opts data_opts; +-}; +- +-static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k, ++static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, ++ enum btree_id btree, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) + { +@@ -938,17 +1052,23 @@ static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, struct bkey_s_c k + } + + int bch2_evacuate_bucket(struct 
moving_context *ctxt, +- struct move_bucket_in_flight *bucket_in_flight, +- struct bpos bucket, int gen, +- struct data_update_opts data_opts) ++ struct move_bucket *bucket_in_flight, ++ struct bpos bucket, int gen, ++ struct data_update_opts data_opts) + { ++ struct bch_fs *c = ctxt->trans->c; + struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; + ++ count_event(c, io_move_evacuate_bucket); ++ if (trace_io_move_evacuate_bucket_enabled()) ++ trace_io_move_evacuate_bucket2(c, bucket, gen); ++ + return __bch2_move_data_phys(ctxt, bucket_in_flight, + bucket.inode, + bucket.offset, + bucket.offset + 1, + ~0, ++ true, + evacuate_bucket_pred, &arg); + } + +@@ -992,7 +1112,7 @@ static int bch2_move_btree(struct bch_fs *c, + retry: + ret = 0; + while (bch2_trans_begin(trans), +- (b = bch2_btree_iter_peek_node(trans, &iter)) && ++ (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { + if (kthread && kthread_should_stop()) + break; +@@ -1006,18 +1126,18 @@ static int bch2_move_btree(struct bch_fs *c, + if (!pred(c, arg, b, &io_opts, &data_opts)) + goto next; + +- ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; ++ ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + next: +- bch2_btree_iter_next_node(trans, &iter); ++ bch2_btree_iter_next_node(&iter); + } + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + + if (kthread && kthread_should_stop()) + break; +@@ -1031,7 +1151,7 @@ static int bch2_move_btree(struct bch_fs *c, + } + + static bool rereplicate_pred(struct bch_fs *c, void *arg, +- struct bkey_s_c k, ++ enum btree_id btree, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) + { +@@ -1040,7 +1160,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, + ? 
c->opts.metadata_replicas + : io_opts->data_replicas; + +- rcu_read_lock(); ++ guard(rcu)(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { +@@ -1050,7 +1170,6 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, + data_opts->kill_ptrs |= BIT(i); + i++; + } +- rcu_read_unlock(); + + if (!data_opts->kill_ptrs && + (!nr_good || nr_good >= replicas)) +@@ -1063,7 +1182,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, + } + + static bool migrate_pred(struct bch_fs *c, void *arg, +- struct bkey_s_c k, ++ enum btree_id btree, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) + { +@@ -1085,14 +1204,6 @@ static bool migrate_pred(struct bch_fs *c, void *arg, + return data_opts->rewrite_ptrs != 0; + } + +-static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, +- struct btree *b, +- struct bch_io_opts *io_opts, +- struct data_update_opts *data_opts) +-{ +- return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +-} +- + /* + * Ancient versions of bcachefs produced packed formats which could represent + * keys that the in memory format cannot represent; this checks for those +@@ -1133,12 +1244,11 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) + BBPOS_MAX, + rewrite_old_nodes_pred, c, stats); + if (!ret) { +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + c->disk_sb.sb->version_min = c->disk_sb.sb->version; + bch2_write_super(c); +- mutex_unlock(&c->sb_lock); + } + + bch_err_fn(c, ret); +@@ -1146,7 +1256,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) + } + + static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, +- struct bkey_s_c k, ++ enum btree_id btree, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) + { +@@ -1158,7 +1268,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, + struct extent_ptr_decoded p; + unsigned i = 0; + +- rcu_read_lock(); ++ guard(rcu)(); + bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { + unsigned d = bch2_extent_ptr_durability(c, &p); + +@@ -1169,21 +1279,12 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, + + i++; + } +- rcu_read_unlock(); + + return data_opts->kill_ptrs != 0; + } + +-static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, +- struct btree *b, +- struct bch_io_opts *io_opts, +- struct data_update_opts *data_opts) +-{ +- return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +-} +- + static bool scrub_pred(struct bch_fs *c, void *_arg, +- struct bkey_s_c k, ++ enum btree_id btree, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) + { +@@ -1208,18 +1309,18 @@ static bool scrub_pred(struct bch_fs *c, void *_arg, + + int bch2_data_job(struct bch_fs *c, + struct bch_move_stats *stats, +- struct bch_ioctl_data op) ++ struct bch_ioctl_data *op) + { +- struct bbpos start = BBPOS(op.start_btree, op.start_pos); +- struct bbpos end = BBPOS(op.end_btree, op.end_pos); ++ struct bbpos start = BBPOS(op->start_btree, op->start_pos); ++ struct bbpos end = BBPOS(op->end_btree, op->end_pos); + int ret = 0; + +- if (op.op >= BCH_DATA_OP_NR) ++ if (op->op >= BCH_DATA_OP_NR) + return -EINVAL; + +- 
bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); ++ bch2_move_stats_init(stats, bch2_data_ops_strs[op->op]); + +- switch (op.op) { ++ switch (op->op) { + case BCH_DATA_OP_scrub: + /* + * prevent tests from spuriously failing, make sure we see all +@@ -1227,41 +1328,38 @@ int bch2_data_job(struct bch_fs *c, + */ + bch2_btree_interior_updates_flush(c); + +- ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, +- op.scrub.data_types, ++ ret = bch2_move_data_phys(c, op->scrub.dev, 0, U64_MAX, ++ op->scrub.data_types, + NULL, + stats, + writepoint_hashed((unsigned long) current), + false, +- scrub_pred, &op) ?: ret; ++ scrub_pred, op) ?: ret; + break; + + case BCH_DATA_OP_rereplicate: + stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, -1); +- ret = bch2_move_btree(c, start, end, +- rereplicate_btree_pred, c, stats) ?: ret; +- ret = bch2_move_data(c, start, end, +- NULL, +- stats, ++ ret = bch2_move_data(c, start, end, 0, NULL, stats, + writepoint_hashed((unsigned long) current), + true, + rereplicate_pred, c) ?: ret; ++ bch2_btree_interior_updates_flush(c); + ret = bch2_replicas_gc2(c) ?: ret; + break; + case BCH_DATA_OP_migrate: +- if (op.migrate.dev >= c->sb.nr_devices) ++ if (op->migrate.dev >= c->sb.nr_devices) + return -EINVAL; + + stats->data_type = BCH_DATA_journal; +- ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); +- ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX, ++ ret = bch2_journal_flush_device_pins(&c->journal, op->migrate.dev); ++ ret = bch2_move_data_phys(c, op->migrate.dev, 0, U64_MAX, + ~0, + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, +- migrate_pred, &op) ?: ret; ++ migrate_pred, op) ?: ret; + bch2_btree_interior_updates_flush(c); + ret = bch2_replicas_gc2(c) ?: ret; + break; +@@ -1269,12 +1367,10 @@ int bch2_data_job(struct bch_fs *c, + ret = bch2_scan_old_btree_nodes(c, stats); + break; + case BCH_DATA_OP_drop_extra_replicas: +- ret = bch2_move_btree(c, start, end, +- drop_extra_replicas_btree_pred, c, stats) ?: ret; +- ret = bch2_move_data(c, start, end, NULL, stats, +- writepoint_hashed((unsigned long) current), +- true, +- drop_extra_replicas_pred, c) ?: ret; ++ ret = bch2_move_data(c, start, end, 0, NULL, stats, ++ writepoint_hashed((unsigned long) current), ++ true, ++ drop_extra_replicas_pred, c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; + break; + default: +@@ -1333,11 +1429,11 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str + + printbuf_indent_add(out, 2); + +- mutex_lock(&ctxt->lock); +- struct moving_io *io; +- list_for_each_entry(io, &ctxt->ios, io_list) +- bch2_data_update_inflight_to_text(out, &io->write); +- mutex_unlock(&ctxt->lock); ++ scoped_guard(mutex, &ctxt->lock) { ++ struct moving_io *io; ++ list_for_each_entry(io, &ctxt->ios, io_list) ++ bch2_data_update_inflight_to_text(out, &io->write); ++ } + + printbuf_indent_sub(out, 4); + } +@@ -1346,10 +1442,9 @@ void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) + { + struct moving_context *ctxt; + +- mutex_lock(&c->moving_context_lock); +- list_for_each_entry(ctxt, &c->moving_context_list, list) +- bch2_moving_ctxt_to_text(out, c, ctxt); +- mutex_unlock(&c->moving_context_lock); ++ scoped_guard(mutex, &c->moving_context_lock) ++ list_for_each_entry(ctxt, &c->moving_context_list, list) ++ bch2_moving_ctxt_to_text(out, c, ctxt); + } + + void bch2_fs_move_init(struct bch_fs *c) +diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h +index 
51e0505a8156..481026ff99ab 100644
+--- a/fs/bcachefs/move.h
++++ b/fs/bcachefs/move.h
+@@ -72,7 +72,7 @@ do {									\
+ break; \
+ } while (1)
+ 
+-typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
++typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c,
+ struct bch_io_opts *, struct data_update_opts *);
+ 
+ extern const char * const bch2_data_ops_strs[];
+@@ -116,32 +116,31 @@ int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *,
+ int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
+ 
+ int bch2_move_extent(struct moving_context *,
+- struct move_bucket_in_flight *,
++ struct move_bucket *,
+ struct btree_iter *,
+ struct bkey_s_c,
+ struct bch_io_opts,
+ struct data_update_opts);
+ 
+-int __bch2_move_data(struct moving_context *,
+- struct bbpos,
+- struct bbpos,
+- move_pred_fn, void *);
+-int bch2_move_data(struct bch_fs *,
+- struct bbpos start,
+- struct bbpos end,
+- struct bch_ratelimit *,
+- struct bch_move_stats *,
+- struct write_point_specifier,
+- bool,
+- move_pred_fn, void *);
++struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
++ struct per_snapshot_io_opts *, struct bpos,
++ struct btree_iter *, struct bkey_s_c);
++
++int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos,
++ move_pred_fn, void *, enum btree_id, unsigned);
++
++int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned,
++ struct bch_ratelimit *, struct bch_move_stats *,
++ struct write_point_specifier, bool,
++ move_pred_fn, void *);
+ 
+ int bch2_evacuate_bucket(struct moving_context *,
+- struct move_bucket_in_flight *,
++ struct move_bucket *,
+ struct bpos, int,
+ struct data_update_opts);
+ int bch2_data_job(struct bch_fs *,
+ struct bch_move_stats *,
+- struct bch_ioctl_data);
++ struct bch_ioctl_data *);
+ 
+ void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
+ void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
+diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h
+index 807f779f6f76..c5c62cd600de 100644
+--- a/fs/bcachefs/move_types.h
++++ b/fs/bcachefs/move_types.h
+@@ -36,14 +36,10 @@ struct move_bucket_key {
+ };
+ 
+ struct move_bucket {
++ struct move_bucket *next;
++ struct rhash_head hash;
+ struct move_bucket_key k;
+ unsigned sectors;
+-};
+-
+-struct move_bucket_in_flight {
+- struct move_bucket_in_flight *next;
+- struct rhash_head hash;
+- struct move_bucket bucket;
+ atomic_t count;
+ };
+ 
+diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
+index 96873372b516..b0cbe3c1aab6 100644
+--- a/fs/bcachefs/movinggc.c
++++ b/fs/bcachefs/movinggc.c
+@@ -8,6 +8,7 @@
+ #include "bcachefs.h"
+ #include "alloc_background.h"
+ #include "alloc_foreground.h"
++#include "backpointers.h"
+ #include "btree_iter.h"
+ #include "btree_update.h"
+ #include "btree_write_buffer.h"
+@@ -27,47 +28,32 @@
+ #include 
+ 
+ struct buckets_in_flight {
+- struct rhashtable table;
+- struct move_bucket_in_flight *first;
+- struct move_bucket_in_flight *last;
+- size_t nr;
+- size_t sectors;
++ struct rhashtable *table;
++ struct move_bucket *first;
++ struct move_bucket *last;
++ size_t nr;
++ size_t sectors;
++
++ DARRAY(struct move_bucket *) to_evacuate;
+ };
+ 
+ static const struct rhashtable_params bch_move_bucket_params = {
+- .head_offset = offsetof(struct move_bucket_in_flight, hash),
+- .key_offset = offsetof(struct move_bucket_in_flight, bucket.k),
++ .head_offset = offsetof(struct move_bucket, hash),
++ .key_offset = offsetof(struct move_bucket, k),
+ .key_len = sizeof(struct move_bucket_key),
+ .automatic_shrinking = true,
+ };
+ 
+-static struct move_bucket_in_flight *
+-move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
++static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b)
+ {
+- struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL);
+- int ret;
+-
+- if (!new)
+- return ERR_PTR(-ENOMEM);
+-
+- new->bucket = b;
+-
+- ret = rhashtable_lookup_insert_fast(&list->table, &new->hash,
+- bch_move_bucket_params);
+- if (ret) {
+- kfree(new);
+- return ERR_PTR(ret);
+- }
+-
+ if (!list->first)
+- list->first = new;
++ list->first = b;
+ else
+- list->last->next = new;
++ list->last->next = b;
+ 
+- list->last = new;
++ list->last = b;
+ list->nr++;
+- list->sectors += b.sectors;
+- return new;
++ list->sectors += b->sectors;
+ }
+ 
+ static int bch2_bucket_is_movable(struct btree_trans *trans,
+@@ -78,20 +64,22 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
+ if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset))
+ return 0;
+ 
+- struct btree_iter iter;
+- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc,
+- b->k.bucket, BTREE_ITER_cached);
++ CLASS(btree_iter, iter)(trans, BTREE_ID_alloc, b->k.bucket, BTREE_ITER_cached);
++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+ 
+- struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode);
++ CLASS(bch2_dev_bucket_tryget, ca)(c, k.k->p);
+ if (!ca)
+- goto out;
++ return 0;
++
++ if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset))
++ return 0;
+ 
+ if (ca->mi.state != BCH_MEMBER_STATE_rw ||
+ !bch2_dev_is_online(ca))
+- goto out_put;
++ return 0;
+ 
+ struct bch_alloc_v4 _a;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+@@ -99,20 +87,23 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
+ b->sectors = bch2_bucket_sectors_dirty(*a);
+ u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
+ 
+- ret = lru_idx && lru_idx <= time;
+-out_put:
+- bch2_dev_put(ca);
+-out:
+- bch2_trans_iter_exit(trans, &iter);
+- return ret;
++ return lru_idx && lru_idx <= time;
++}
++
++static void move_bucket_free(struct buckets_in_flight *list,
++ struct move_bucket *b)
++{
++ int ret = rhashtable_remove_fast(list->table, &b->hash,
++ bch_move_bucket_params);
++ BUG_ON(ret);
++ kfree(b);
+ }
+ 
+ static void move_buckets_wait(struct moving_context *ctxt,
+ struct buckets_in_flight *list,
+ bool flush)
+ {
+- struct move_bucket_in_flight *i;
+- int ret;
++ struct move_bucket *i;
+ 
+ while ((i = list->first)) {
+ if (flush)
+@@ -126,12 +117,9 @@ static void move_buckets_wait(struct moving_context *ctxt,
+ list->last = NULL;
+ 
+ list->nr--;
+- list->sectors -= i->bucket.sectors;
++ list->sectors -= i->sectors;
+ 
+- ret = rhashtable_remove_fast(&list->table, &i->hash,
+- bch_move_bucket_params);
+- BUG_ON(ret);
+- kfree(i);
++ move_bucket_free(list, i);
+ }
+ 
+ bch2_trans_unlock_long(ctxt->trans);
+@@ -140,14 +128,11 @@ static void move_buckets_wait(struct moving_context *ctxt,
+ static bool bucket_in_flight(struct buckets_in_flight *list,
+ struct move_bucket_key k)
+ {
+- return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params);
++ return rhashtable_lookup_fast(list->table, &k, bch_move_bucket_params);
+ }
+ 
+-typedef DARRAY(struct move_bucket) move_buckets;
+-
+ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
+- struct 
buckets_in_flight *buckets_in_flight, +- move_buckets *buckets) ++ struct buckets_in_flight *buckets_in_flight) + { + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; +@@ -164,8 +149,6 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, + if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) + return ret; + +- bch2_trans_begin(trans); +- + ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, + lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), + lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), +@@ -184,20 +167,34 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, + else if (bucket_in_flight(buckets_in_flight, b.k)) + in_flight++; + else { +- ret2 = darray_push(buckets, b); ++ struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); ++ ret2 = b_i ? 0 : -ENOMEM; + if (ret2) + goto err; ++ ++ *b_i = b; ++ ++ ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i); ++ if (ret2) { ++ kfree(b_i); ++ goto err; ++ } ++ ++ ret2 = rhashtable_lookup_insert_fast(buckets_in_flight->table, &b_i->hash, ++ bch_move_bucket_params); ++ BUG_ON(ret2); ++ + sectors += b.sectors; + } + +- ret2 = buckets->nr >= nr_to_get; ++ ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get; + err: + ret2; + })); + + pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", + buckets_in_flight->nr, buckets_in_flight->sectors, +- saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret); ++ saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret); + + return ret < 0 ? ret : 0; + } +@@ -212,40 +209,30 @@ static int bch2_copygc(struct moving_context *ctxt, + struct data_update_opts data_opts = { + .btree_insert_flags = BCH_WATERMARK_copygc, + }; +- move_buckets buckets = { 0 }; +- struct move_bucket_in_flight *f; + u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); + u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); + int ret = 0; + +- ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); ++ ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight); + if (ret) + goto err; + +- darray_for_each(buckets, i) { ++ darray_for_each(buckets_in_flight->to_evacuate, i) { + if (kthread_should_stop() || freezing(current)) + break; + +- f = move_bucket_in_flight_add(buckets_in_flight, *i); +- ret = PTR_ERR_OR_ZERO(f); +- if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */ +- ret = 0; +- continue; +- } +- if (ret == -ENOMEM) { /* flush IO, continue later */ +- ret = 0; +- break; +- } ++ struct move_bucket *b = *i; ++ *i = NULL; ++ ++ move_bucket_in_flight_add(buckets_in_flight, b); + +- ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, +- f->bucket.k.gen, data_opts); ++ ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts); + if (ret) + goto err; + + *did_work = true; + } + err: +- + /* no entries in LRU btree found, or got to end: */ + if (bch2_err_matches(ret, ENOENT)) + ret = 0; +@@ -255,12 +242,34 @@ static int bch2_copygc(struct moving_context *ctxt, + + sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; + sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; +- trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); ++ trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved); + +- darray_exit(&buckets); ++ 
darray_for_each(buckets_in_flight->to_evacuate, i) ++ if (*i) ++ move_bucket_free(buckets_in_flight, *i); ++ darray_exit(&buckets_in_flight->to_evacuate); + return ret; + } + ++static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca) ++{ ++ struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); ++ struct bch_dev_usage usage; ++ ++ for (unsigned i = 0; i < BCH_DATA_NR; i++) ++ usage.buckets[i] = usage_full.d[i].buckets; ++ ++ s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * ++ ca->mi.bucket_size) >> 1); ++ s64 fragmented = 0; ++ ++ for (unsigned i = 0; i < BCH_DATA_NR; i++) ++ if (data_type_movable(i)) ++ fragmented += usage_full.d[i].fragmented; ++ ++ return max(0LL, fragmented_allowed - fragmented); ++} ++ + /* + * Copygc runs when the amount of fragmented data is above some arbitrary + * threshold: +@@ -275,28 +284,13 @@ static int bch2_copygc(struct moving_context *ctxt, + * often and continually reduce the amount of fragmented space as the device + * fills up. So, we increase the threshold by half the current free space. + */ +-unsigned long bch2_copygc_wait_amount(struct bch_fs *c) ++u64 bch2_copygc_wait_amount(struct bch_fs *c) + { +- s64 wait = S64_MAX, fragmented_allowed, fragmented; +- +- for_each_rw_member(c, ca) { +- struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); +- struct bch_dev_usage usage; +- +- for (unsigned i = 0; i < BCH_DATA_NR; i++) +- usage.buckets[i] = usage_full.d[i].buckets; +- +- fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * +- ca->mi.bucket_size) >> 1); +- fragmented = 0; +- +- for (unsigned i = 0; i < BCH_DATA_NR; i++) +- if (data_type_movable(i)) +- fragmented += usage_full.d[i].fragmented; +- +- wait = min(wait, max(0LL, fragmented_allowed - fragmented)); +- } ++ u64 wait = U64_MAX; + ++ guard(rcu)(); ++ for_each_rw_member_rcu(c, ca) ++ wait = min(wait, bch2_copygc_dev_wait_amount(ca)); + return wait; + } + +@@ -318,15 +312,22 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) + c->copygc_wait_at) << 9); + prt_newline(out); + +- prt_printf(out, "Currently calculated wait:\t"); +- prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); +- prt_newline(out); ++ bch2_printbuf_make_room(out, 4096); + +- rcu_read_lock(); +- struct task_struct *t = rcu_dereference(c->copygc_thread); +- if (t) +- get_task_struct(t); +- rcu_read_unlock(); ++ struct task_struct *t; ++ scoped_guard(rcu) { ++ guard(printbuf_atomic)(out); ++ prt_printf(out, "Currently calculated wait:\n"); ++ for_each_rw_member_rcu(c, ca) { ++ prt_printf(out, " %s:\t", ca->name); ++ prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); ++ prt_newline(out); ++ } ++ ++ t = rcu_dereference(c->copygc_thread); ++ if (t) ++ get_task_struct(t); ++ } + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); +@@ -340,19 +341,16 @@ static int bch2_copygc_thread(void *arg) + struct moving_context ctxt; + struct bch_move_stats move_stats; + struct io_clock *clock = &c->io_clock[WRITE]; +- struct buckets_in_flight *buckets; ++ struct buckets_in_flight buckets = {}; + u64 last, wait; +- int ret = 0; + +- buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL); +- if (!buckets) +- return -ENOMEM; +- ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); ++ buckets.table = kzalloc(sizeof(*buckets.table), GFP_KERNEL); ++ int ret = !buckets.table ++ ? 
-ENOMEM
++ : rhashtable_init(buckets.table, &bch_move_bucket_params);
+ bch_err_msg(c, ret, "allocating copygc buckets in flight");
+- if (ret) {
+- kfree(buckets);
+- return ret;
+- }
++ if (ret)
++ goto err;
+ 
+ set_freezable();
+ 
+@@ -360,7 +358,7 @@ static int bch2_copygc_thread(void *arg)
+ * Data move operations can't run until after check_snapshots has
+ * completed, and bch2_snapshot_is_ancestor() is available.
+ */
+- kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots ||
++ kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots ||
+ kthread_should_stop());
+ 
+ bch2_move_stats_init(&move_stats, "copygc");
+@@ -375,13 +373,13 @@ static int bch2_copygc_thread(void *arg)
+ cond_resched();
+ 
+ if (!c->opts.copygc_enabled) {
+- move_buckets_wait(&ctxt, buckets, true);
++ move_buckets_wait(&ctxt, &buckets, true);
+ kthread_wait_freezable(c->opts.copygc_enabled ||
+ kthread_should_stop());
+ }
+ 
+ if (unlikely(freezing(current))) {
+- move_buckets_wait(&ctxt, buckets, true);
++ move_buckets_wait(&ctxt, &buckets, true);
+ __refrigerator(false);
+ continue;
+ }
+@@ -392,7 +390,7 @@ static int bch2_copygc_thread(void *arg)
+ if (wait > clock->max_slop) {
+ c->copygc_wait_at = last;
+ c->copygc_wait = last + wait;
+- move_buckets_wait(&ctxt, buckets, true);
++ move_buckets_wait(&ctxt, &buckets, true);
+ trace_and_count(c, copygc_wait, c, wait, last + wait);
+ bch2_kthread_io_clock_wait(clock, last + wait,
+ MAX_SCHEDULE_TIMEOUT);
+@@ -402,7 +400,7 @@ static int bch2_copygc_thread(void *arg)
+ c->copygc_wait = 0;
+ 
+ c->copygc_running = true;
+- ret = bch2_copygc(&ctxt, buckets, &did_work);
++ ret = bch2_copygc(&ctxt, &buckets, &did_work);
+ c->copygc_running = false;
+ 
+ wake_up(&c->copygc_running_wq);
+@@ -413,20 +411,19 @@ static int bch2_copygc_thread(void *arg)
+ if (min_member_capacity == U64_MAX)
+ min_member_capacity = 128 * 2048;
+ 
+- move_buckets_wait(&ctxt, buckets, true);
++ move_buckets_wait(&ctxt, &buckets, true);
+ bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6),
+ MAX_SCHEDULE_TIMEOUT);
+ }
+ }
+ 
+- move_buckets_wait(&ctxt, buckets, true);
+-
+- rhashtable_destroy(&buckets->table);
+- kfree(buckets);
++ move_buckets_wait(&ctxt, &buckets, true);
++ rhashtable_destroy(buckets.table);
+ bch2_moving_ctxt_exit(&ctxt);
+ bch2_move_stats_exit(&move_stats, c);
+-
+- return 0;
++err:
++ kfree(buckets.table);
++ return ret;
+ }
+ 
+ void bch2_copygc_stop(struct bch_fs *c)
+diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h
+index d1885cf67a45..f615910d6f98 100644
+--- a/fs/bcachefs/movinggc.h
++++ b/fs/bcachefs/movinggc.h
+@@ -2,16 +2,15 @@
+ #ifndef _BCACHEFS_MOVINGGC_H
+ #define _BCACHEFS_MOVINGGC_H
+ 
+-unsigned long bch2_copygc_wait_amount(struct bch_fs *);
++u64 bch2_copygc_wait_amount(struct bch_fs *);
+ void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *);
+ 
+ static inline void bch2_copygc_wakeup(struct bch_fs *c)
+ {
+- rcu_read_lock();
++ guard(rcu)();
+ struct task_struct *p = rcu_dereference(c->copygc_thread);
+ if (p)
+ wake_up_process(p);
+- rcu_read_unlock();
+ }
+ 
+ void bch2_copygc_stop(struct bch_fs *);
+diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c
+index 9136a9097789..d1019052f182 100644
+--- a/fs/bcachefs/namei.c
++++ b/fs/bcachefs/namei.c
+@@ -11,6 +11,14 @@
+ 
+ #include 
+ 
++static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode)
++{
++ return (subvol_inum) {
++ .subvol = inode->bi_parent_subvol ?: inum.subvol,
++ .inum = inode->bi_dir,
++ };
++}
++
+ static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode)
+ {
+ return S_ISDIR(inode->bi_mode) && !inode->bi_subvol;
+@@ -28,8 +36,8 @@ int bch2_create_trans(struct btree_trans *trans,
+ unsigned flags)
+ {
+ struct bch_fs *c = trans->c;
+- struct btree_iter dir_iter = {};
+- struct btree_iter inode_iter = {};
++ struct btree_iter dir_iter = { NULL };
++ struct btree_iter inode_iter = { NULL };
+ subvol_inum new_inum = dir;
+ u64 now = bch2_current_time(c);
+ u64 cpu = raw_smp_processor_id();
+@@ -49,7 +57,7 @@ int bch2_create_trans(struct btree_trans *trans,
+ 
+ if (!(flags & BCH_CREATE_SNAPSHOT)) {
+ /* Normal create path - allocate a new inode: */
+- bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u);
++ bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u);
+ 
+ if (flags & BCH_CREATE_TMPFILE)
+ new_inode->bi_flags |= BCH_INODE_unlinked;
+@@ -91,7 +99,9 @@ int bch2_create_trans(struct btree_trans *trans,
+ * If we're not root, we have to own the subvolume being
+ * snapshotted:
+ */
+- if (uid && new_inode->bi_uid != uid) {
++ if (uid &&
++ !capable(CAP_FOWNER) &&
++ new_inode->bi_uid != uid) {
+ ret = -EPERM;
+ goto err;
+ }
+@@ -123,8 +133,8 @@ int bch2_create_trans(struct btree_trans *trans,
+ if (ret)
+ goto err;
+ 
+- bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot);
+- ret = bch2_btree_iter_traverse(trans, &dir_iter);
++ bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot);
++ ret = bch2_btree_iter_traverse(&dir_iter);
+ if (ret)
+ goto err;
+ }
+@@ -167,18 +177,28 @@ int bch2_create_trans(struct btree_trans *trans,
+ new_inode->bi_dir_offset = dir_offset;
+ }
+ 
++ if (S_ISDIR(mode)) {
++ ret = bch2_maybe_propagate_has_case_insensitive(trans,
++ (subvol_inum) {
++ new_inode->bi_subvol ?: dir.subvol,
++ new_inode->bi_inum },
++ new_inode);
++ if (ret)
++ goto err;
++ }
++
+ if (S_ISDIR(mode) &&
+ !new_inode->bi_subvol)
+ new_inode->bi_depth = dir_u->bi_depth + 1;
+ 
+ inode_iter.flags &= ~BTREE_ITER_all_snapshots;
+- bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot);
++ bch2_btree_iter_set_snapshot(&inode_iter, snapshot);
+ 
+- ret = bch2_btree_iter_traverse(trans, &inode_iter) ?:
++ ret = bch2_btree_iter_traverse(&inode_iter) ?:
+ bch2_inode_write(trans, &inode_iter, new_inode);
+ err:
+- bch2_trans_iter_exit(trans, &inode_iter);
+- bch2_trans_iter_exit(trans, &dir_iter);
++ bch2_trans_iter_exit(&inode_iter);
++ bch2_trans_iter_exit(&dir_iter);
+ return ret;
+ }
+ 
+@@ -188,8 +208,8 @@ int bch2_link_trans(struct btree_trans *trans,
+ const struct qstr *name)
+ {
+ struct bch_fs *c = trans->c;
+- struct btree_iter dir_iter = {};
+- struct btree_iter inode_iter = {};
++ struct btree_iter dir_iter = { NULL };
++ struct btree_iter inode_iter = { NULL };
+ struct bch_hash_info dir_hash;
+ u64 now = bch2_current_time(c);
+ u64 dir_offset = 0;
+@@ -234,8 +254,8 @@ int bch2_link_trans(struct btree_trans *trans,
+ ret = bch2_inode_write(trans, &dir_iter, dir_u) ?:
+ bch2_inode_write(trans, &inode_iter, inode_u);
+ err:
+- bch2_trans_iter_exit(trans, &dir_iter);
+- bch2_trans_iter_exit(trans, &inode_iter);
++ bch2_trans_iter_exit(&dir_iter);
++ bch2_trans_iter_exit(&inode_iter);
+ return ret;
+ }
+ 
+@@ -247,9 +267,9 @@ int bch2_unlink_trans(struct btree_trans *trans,
+ bool deleting_subvol)
+ {
+ struct bch_fs *c = trans->c;
+- struct btree_iter dir_iter = {};
+- struct btree_iter dirent_iter = {};
+- struct btree_iter inode_iter = {};
++ struct btree_iter dir_iter = { NULL };
++ struct btree_iter 
dirent_iter = { NULL }; ++ struct btree_iter inode_iter = { NULL }; + struct bch_hash_info dir_hash; + subvol_inum inum; + u64 now = bch2_current_time(c); +@@ -279,7 +299,7 @@ int bch2_unlink_trans(struct btree_trans *trans, + } + + if (deleting_subvol && !inode_u->bi_subvol) { +- ret = -BCH_ERR_ENOENT_not_subvol; ++ ret = bch_err_throw(c, ENOENT_not_subvol); + goto err; + } + +@@ -295,7 +315,7 @@ int bch2_unlink_trans(struct btree_trans *trans, + if (ret) + goto err; + +- k = bch2_btree_iter_peek_slot(trans, &dirent_iter); ++ k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; +@@ -304,8 +324,8 @@ int bch2_unlink_trans(struct btree_trans *trans, + * If we're deleting a subvolume, we need to really delete the + * dirent, not just emit a whiteout in the current snapshot: + */ +- bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot); +- ret = bch2_btree_iter_traverse(trans, &dirent_iter); ++ bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); ++ ret = bch2_btree_iter_traverse(&dirent_iter); + if (ret) + goto err; + } else { +@@ -327,9 +347,9 @@ int bch2_unlink_trans(struct btree_trans *trans, + bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); + err: +- bch2_trans_iter_exit(trans, &inode_iter); +- bch2_trans_iter_exit(trans, &dirent_iter); +- bch2_trans_iter_exit(trans, &dir_iter); ++ bch2_trans_iter_exit(&inode_iter); ++ bch2_trans_iter_exit(&dirent_iter); ++ bch2_trans_iter_exit(&dir_iter); + return ret; + } + +@@ -363,9 +383,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, + + static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent) + { +- struct btree_iter iter; + struct bkey_i_subvolume *s = +- bch2_bkey_get_mut_typed(trans, &iter, ++ bch2_bkey_get_mut_typed(trans, + BTREE_ID_subvolumes, POS(0, subvol), + BTREE_ITER_cached, subvolume); + int ret = PTR_ERR_OR_ZERO(s); +@@ -373,7 +392,6 @@ static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_p + return ret; + + s->v.fs_path_parent = cpu_to_le32(new_parent); +- bch2_trans_iter_exit(trans, &iter); + return 0; + } + +@@ -387,10 +405,10 @@ int bch2_rename_trans(struct btree_trans *trans, + enum bch_rename_mode mode) + { + struct bch_fs *c = trans->c; +- struct btree_iter src_dir_iter = {}; +- struct btree_iter dst_dir_iter = {}; +- struct btree_iter src_inode_iter = {}; +- struct btree_iter dst_inode_iter = {}; ++ struct btree_iter src_dir_iter = { NULL }; ++ struct btree_iter dst_dir_iter = { NULL }; ++ struct btree_iter src_inode_iter = { NULL }; ++ struct btree_iter dst_inode_iter = { NULL }; + struct bch_hash_info src_hash, dst_hash; + subvol_inum src_inum, dst_inum; + u64 src_offset, dst_offset; +@@ -404,8 +422,7 @@ int bch2_rename_trans(struct btree_trans *trans, + + src_hash = bch2_hash_info_init(c, src_dir_u); + +- if (dst_dir.inum != src_dir.inum || +- dst_dir.subvol != src_dir.subvol) { ++ if (!subvol_inum_eq(dst_dir, src_dir)) { + ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, + BTREE_ITER_intent); + if (ret) +@@ -418,8 +435,8 @@ int bch2_rename_trans(struct btree_trans *trans, + } + + ret = bch2_dirent_rename(trans, +- src_dir, &src_hash, &src_dir_u->bi_size, +- dst_dir, &dst_hash, &dst_dir_u->bi_size, ++ src_dir, &src_hash, ++ dst_dir, &dst_hash, + src_name, &src_inum, &src_offset, + dst_name, &dst_inum, &dst_offset, + mode); +@@ -497,32 +514,41 @@ int bch2_rename_trans(struct btree_trans *trans, + } + } + +- if 
(bch2_reinherit_attrs(src_inode_u, dst_dir_u) && +- S_ISDIR(src_inode_u->bi_mode)) { +- ret = -EXDEV; +- goto err; +- } ++ if (!subvol_inum_eq(dst_dir, src_dir)) { ++ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && ++ S_ISDIR(src_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } + +- if (mode == BCH_RENAME_EXCHANGE && +- bch2_reinherit_attrs(dst_inode_u, src_dir_u) && +- S_ISDIR(dst_inode_u->bi_mode)) { +- ret = -EXDEV; +- goto err; +- } ++ if (mode == BCH_RENAME_EXCHANGE && ++ bch2_reinherit_attrs(dst_inode_u, src_dir_u) && ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } + +- if (is_subdir_for_nlink(src_inode_u)) { +- src_dir_u->bi_nlink--; +- dst_dir_u->bi_nlink++; +- } ++ ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?: ++ (mode == BCH_RENAME_EXCHANGE ++ ? bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u) ++ : 0); ++ if (ret) ++ goto err; + +- if (S_ISDIR(src_inode_u->bi_mode) && +- !src_inode_u->bi_subvol) +- src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; ++ if (is_subdir_for_nlink(src_inode_u)) { ++ src_dir_u->bi_nlink--; ++ dst_dir_u->bi_nlink++; ++ } + +- if (mode == BCH_RENAME_EXCHANGE && +- S_ISDIR(dst_inode_u->bi_mode) && +- !dst_inode_u->bi_subvol) +- dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; ++ if (S_ISDIR(src_inode_u->bi_mode) && ++ !src_inode_u->bi_subvol) ++ src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ S_ISDIR(dst_inode_u->bi_mode) && ++ !dst_inode_u->bi_subvol) ++ dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; ++ } + + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { + dst_dir_u->bi_nlink--; +@@ -554,15 +580,31 @@ int bch2_rename_trans(struct btree_trans *trans, + ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) + : 0); + err: +- bch2_trans_iter_exit(trans, &dst_inode_iter); +- bch2_trans_iter_exit(trans, &src_inode_iter); +- bch2_trans_iter_exit(trans, &dst_dir_iter); +- bch2_trans_iter_exit(trans, &src_dir_iter); ++ bch2_trans_iter_exit(&dst_inode_iter); ++ bch2_trans_iter_exit(&src_inode_iter); ++ bch2_trans_iter_exit(&dst_dir_iter); ++ bch2_trans_iter_exit(&src_dir_iter); + return ret; + } + + /* inum_to_path */ + ++static inline void reverse_bytes(void *b, size_t n) ++{ ++ char *e = b + n, *s = b; ++ ++ while (s < e) { ++ --e; ++ swap(*s, *e); ++ s++; ++ } ++} ++ ++static inline void printbuf_reverse_from(struct printbuf *out, unsigned pos) ++{ ++ reverse_bytes(out->buf + pos, out->pos - pos); ++} ++ + static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) + { + bch2_printbuf_make_room(out, n); +@@ -582,78 +624,116 @@ static inline void prt_str_reversed(struct printbuf *out, const char *s) + prt_bytes_reversed(out, s, strlen(s)); + } + +-static inline void reverse_bytes(void *b, size_t n) ++__printf(2, 3) ++static inline void prt_printf_reversed(struct printbuf *out, const char *fmt, ...) 
+ { +- char *e = b + n, *s = b; ++ unsigned orig_pos = out->pos; + +- while (s < e) { +- --e; +- swap(*s, *e); +- s++; +- } ++ va_list args; ++ va_start(args, fmt); ++ prt_vprintf(out, fmt, args); ++ va_end(args); ++ ++ printbuf_reverse_from(out, orig_pos); + } + +-/* XXX: we don't yet attempt to print paths when we don't know the subvol */ +-int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path) ++static int __bch2_inum_to_path(struct btree_trans *trans, ++ u32 subvol, u64 inum, u32 snapshot, ++ struct printbuf *path) + { + unsigned orig_pos = path->pos; + int ret = 0; ++ DARRAY(subvol_inum) inums = {}; + +- while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL && +- inum.inum == BCACHEFS_ROOT_INO)) { +- struct bch_inode_unpacked inode; +- ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); ++ if (!snapshot) { ++ ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); + if (ret) + goto disconnected; ++ } + +- if (!inode.bi_dir && !inode.bi_dir_offset) { +- ret = -BCH_ERR_ENOENT_inode_no_backpointer; +- goto disconnected; ++ while (true) { ++ subvol_inum n = (subvol_inum) { subvol ?: snapshot, inum }; ++ ++ if (darray_find_p(inums, i, i->subvol == n.subvol && i->inum == n.inum)) { ++ prt_printf_reversed(path, "(loop at %llu:%u)", inum, snapshot); ++ break; + } + +- inum.subvol = inode.bi_parent_subvol ?: inum.subvol; +- inum.inum = inode.bi_dir; ++ ret = darray_push(&inums, n); ++ if (ret) ++ goto err; + +- u32 snapshot; +- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ struct bch_inode_unpacked inode; ++ ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); + if (ret) + goto disconnected; + +- struct btree_iter d_iter; +- struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, +- BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), +- 0, dirent); ++ if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL && ++ inode.bi_inum == BCACHEFS_ROOT_INO) ++ break; ++ ++ if (!inode.bi_dir && !inode.bi_dir_offset) { ++ ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer); ++ goto disconnected; ++ } ++ ++ inum = inode.bi_dir; ++ if (inode.bi_parent_subvol) { ++ subvol = inode.bi_parent_subvol; ++ ret = bch2_subvolume_get_snapshot(trans, inode.bi_parent_subvol, &snapshot); ++ if (ret) ++ goto disconnected; ++ } ++ ++ CLASS(btree_iter, d_iter)(trans, BTREE_ID_dirents, ++ SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), 0); ++ struct bkey_s_c_dirent d = bch2_bkey_get_typed(&d_iter, dirent); + ret = bkey_err(d.s_c); + if (ret) + goto disconnected; + + struct qstr dirent_name = bch2_dirent_get_name(d); ++ + prt_bytes_reversed(path, dirent_name.name, dirent_name.len); + + prt_char(path, '/'); +- +- bch2_trans_iter_exit(trans, &d_iter); + } + + if (orig_pos == path->pos) + prt_char(path, '/'); + out: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto err; ++ + ret = path->allocation_failure ? 
-ENOMEM : 0; + if (ret) + goto err; + +- reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); ++ printbuf_reverse_from(path, orig_pos); ++ darray_exit(&inums); + return 0; + err: ++ darray_exit(&inums); + return ret; + disconnected: +- if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +- goto err; +- +- prt_str_reversed(path, "(disconnected)"); ++ prt_printf_reversed(path, "(disconnected at %llu.%u)", inum, snapshot); + goto out; + } + ++int bch2_inum_to_path(struct btree_trans *trans, ++ subvol_inum inum, ++ struct printbuf *path) ++{ ++ return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path); ++} ++ ++int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot, ++ snapshot_id_list *snapshot_overwrites, ++ struct printbuf *path) ++{ ++ return __bch2_inum_to_path(trans, 0, inum, snapshot, path); ++} ++ + /* fsck */ + + static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, +@@ -662,15 +742,14 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, + bool in_fsck) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + struct btree_iter bp_iter = {}; + int ret = 0; + + if (inode_points_to_dirent(target, d)) + return 0; + +- if (!target->bi_dir && +- !target->bi_dir_offset) { ++ if (!bch2_inode_has_backpointer(target)) { + fsck_err_on(S_ISDIR(target->bi_mode), + trans, inode_dir_missing_backpointer, + "directory with missing backpointer\n%s", +@@ -695,19 +774,9 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, + return __bch2_fsck_write_inode(trans, target); + } + +- if (bch2_inode_should_have_single_bp(target) && +- !fsck_err(trans, inode_wrong_backpointer, +- "dirent points to inode that does not point back:\n%s", +- (bch2_bkey_val_to_text(&buf, c, d.s_c), +- prt_newline(&buf), +- bch2_inode_unpacked_to_text(&buf, target), +- buf.buf))) +- goto err; +- +- struct bkey_s_c_dirent bp_dirent = +- bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, +- SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), +- 0, dirent); ++ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_dirents, ++ SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), 0); ++ struct bkey_s_c_dirent bp_dirent = bch2_bkey_get_typed(&bp_iter, dirent); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; +@@ -730,6 +799,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, + ret = __bch2_fsck_write_inode(trans, target); + } + } else { ++ printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); +@@ -778,8 +848,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, + out: + err: + fsck_err: +- bch2_trans_iter_exit(trans, &bp_iter); +- printbuf_exit(&buf); ++ bch2_trans_iter_exit(&bp_iter); + bch_err_fn(c, ret); + return ret; + } +@@ -791,7 +860,7 @@ int __bch2_check_dirent_target(struct btree_trans *trans, + bool in_fsck) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck); +@@ -819,13 +888,157 @@ int __bch2_check_dirent_target(struct btree_trans *trans, + n->v.d_inum = cpu_to_le64(target->bi_inum); + } + +- ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); ++ ret = bch2_trans_update(trans, dirent_iter, &n->k_i, ++ BTREE_UPDATE_internal_snapshot_node); + if (ret) + goto err; + } + err: + 
fsck_err: +- printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; + } ++ ++/* ++ * BCH_INODE_has_case_insensitive: ++ * We have to track whether directories have any descendent directory that is ++ * casefolded - for overlayfs: ++ */ ++ ++static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum) ++{ ++ struct btree_iter iter = {}; ++ int ret = 0; ++ ++ while (true) { ++ struct bch_inode_unpacked inode; ++ ret = bch2_inode_peek(trans, &iter, &inode, inum, ++ BTREE_ITER_intent|BTREE_ITER_with_updates); ++ if (ret) ++ break; ++ ++ if (inode.bi_flags & BCH_INODE_has_case_insensitive) ++ break; ++ ++ inode.bi_flags |= BCH_INODE_has_case_insensitive; ++ ret = bch2_inode_write(trans, &iter, &inode); ++ if (ret) ++ break; ++ ++ bch2_trans_iter_exit(&iter); ++ if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM)) ++ break; ++ ++ inum = parent_inum(inum, &inode); ++ } ++ ++ bch2_trans_iter_exit(&iter); ++ return ret; ++} ++ ++int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_unpacked *inode) ++{ ++ if (!bch2_inode_casefold(trans->c, inode)) ++ return 0; ++ ++ inode->bi_flags |= BCH_INODE_has_case_insensitive; ++ ++ return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode)); ++} ++ ++int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ snapshot_id_list *snapshot_overwrites, ++ bool *do_update) ++{ ++ CLASS(printbuf, buf)(); ++ bool repairing_parents = false; ++ int ret = 0; ++ ++ if (!S_ISDIR(inode->bi_mode)) { ++ /* ++ * Old versions set bi_casefold for non dirs, but that's ++ * unnecessary and wasteful ++ */ ++ if (inode->bi_casefold) { ++ inode->bi_casefold = 0; ++ *do_update = true; ++ } ++ return 0; ++ } ++ ++ if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive) ++ return 0; ++ ++ if (bch2_inode_casefold(trans->c, inode) && ++ !(inode->bi_flags & BCH_INODE_has_case_insensitive)) { ++ prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ", ++ inode->bi_inum, inode->bi_snapshot); ++ ++ ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot, ++ snapshot_overwrites, &buf); ++ if (ret) ++ return ret; ++ ++ if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) { ++ inode->bi_flags |= BCH_INODE_has_case_insensitive; ++ *do_update = true; ++ } ++ } ++ ++ if (!(inode->bi_flags & BCH_INODE_has_case_insensitive)) ++ goto out; ++ ++ struct bch_inode_unpacked dir = *inode; ++ u32 snapshot = dir.bi_snapshot; ++ ++ while (!(dir.bi_inum == BCACHEFS_ROOT_INO && ++ dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { ++ if (dir.bi_parent_subvol) { ++ ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot); ++ if (ret) ++ return ret; ++ ++ snapshot_overwrites = NULL; ++ } ++ ++ ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0); ++ if (ret) ++ return ret; ++ ++ if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) { ++ prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n"); ++ ++ ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot, ++ snapshot_overwrites, &buf); ++ if (ret) ++ return ret; ++ ++ if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) { ++ dir.bi_flags |= BCH_INODE_has_case_insensitive; ++ ret = __bch2_fsck_write_inode(trans, &dir); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ /* ++ * We only need to check the first parent, unless we find an ++ 
* inconsistency ++ */ ++ if (!repairing_parents) ++ break; ++ } ++out: ++fsck_err: ++ if (ret) ++ return ret; ++ ++ if (repairing_parents) { ++ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: ++ bch_err_throw(trans->c, transaction_restart_nested); ++ } ++ ++ return 0; ++} +diff --git a/fs/bcachefs/namei.h b/fs/bcachefs/namei.h +index 2e6f6364767f..ae6ebc2d0785 100644 +--- a/fs/bcachefs/namei.h ++++ b/fs/bcachefs/namei.h +@@ -43,6 +43,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, + struct bch_inode_unpacked *); + + int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); ++int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32, ++ snapshot_id_list *, struct printbuf *); + + int __bch2_check_dirent_target(struct btree_trans *, + struct btree_iter *, +@@ -69,4 +71,9 @@ static inline int bch2_check_dirent_target(struct btree_trans *trans, + return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); + } + ++int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum, ++ struct bch_inode_unpacked *); ++int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *, ++ snapshot_id_list *, bool *); ++ + #endif /* _BCACHEFS_NAMEI_H */ +diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c +index 3c21981a4a1c..58cfd540c6d6 100644 +--- a/fs/bcachefs/nocow_locking.c ++++ b/fs/bcachefs/nocow_locking.c +@@ -47,7 +47,7 @@ bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, + int v, lock_val = flags ? 1 : -1; + unsigned i; + +- spin_lock(&l->lock); ++ guard(spinlock)(&l->lock); + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket) +@@ -58,21 +58,19 @@ bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, + l->b[i] = dev_bucket; + goto take_lock; + } +-fail: +- spin_unlock(&l->lock); ++ + return false; + got_entry: + v = atomic_read(&l->l[i]); + if (lock_val > 0 ? v < 0 : v > 0) +- goto fail; ++ return false; + take_lock: + v = atomic_read(&l->l[i]); + /* Overflow? 
*/ + if (v && sign(v + lock_val) != sign(v)) +- goto fail; ++ return false; + + atomic_add(lock_val, &l->l[i]); +- spin_unlock(&l->lock); + return true; + } + +@@ -133,12 +131,10 @@ void bch2_fs_nocow_locking_exit(struct bch_fs *c) + BUG_ON(atomic_read(&l->l[j])); + } + +-int bch2_fs_nocow_locking_init(struct bch_fs *c) ++void bch2_fs_nocow_locking_init_early(struct bch_fs *c) + { + struct bucket_nocow_lock_table *t = &c->nocow_locks; + + for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) + spin_lock_init(&l->lock); +- +- return 0; + } +diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h +index f9d6a426a960..48b8a003c0d2 100644 +--- a/fs/bcachefs/nocow_locking.h ++++ b/fs/bcachefs/nocow_locking.h +@@ -45,6 +45,6 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, + void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); + + void bch2_fs_nocow_locking_exit(struct bch_fs *); +-int bch2_fs_nocow_locking_init(struct bch_fs *); ++void bch2_fs_nocow_locking_init_early(struct bch_fs *); + + #endif /* _BCACHEFS_NOCOW_LOCKING_H */ +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index af3258814822..921f9049912d 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -7,7 +7,9 @@ + #include "compress.h" + #include "disk_groups.h" + #include "error.h" ++#include "movinggc.h" + #include "opts.h" ++#include "rebalance.h" + #include "recovery_passes.h" + #include "super-io.h" + #include "util.h" +@@ -19,6 +21,11 @@ const char * const bch2_error_actions[] = { + NULL + }; + ++const char * const bch2_degraded_actions[] = { ++ BCH_DEGRADED_ACTIONS() ++ NULL ++}; ++ + const char * const bch2_fsck_fix_opts[] = { + BCH_FIX_ERRORS_OPTS() + NULL +@@ -273,20 +280,20 @@ int bch2_opt_lookup(const char *name) + return -1; + } + +-struct synonym { ++struct opt_synonym { + const char *s1, *s2; + }; + +-static const struct synonym bch_opt_synonyms[] = { ++static const struct opt_synonym bch2_opt_synonyms[] = { + { "quota", "usrquota" }, + }; + + static int bch2_mount_opt_lookup(const char *name) + { +- const struct synonym *i; ++ const struct opt_synonym *i; + +- for (i = bch_opt_synonyms; +- i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); ++ for (i = bch2_opt_synonyms; ++ i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms); + i++) + if (!strcmp(name, i->s1)) + name = i->s2; +@@ -294,6 +301,30 @@ static int bch2_mount_opt_lookup(const char *name) + return bch2_opt_lookup(name); + } + ++struct opt_val_synonym { ++ const char *opt, *v1, *v2; ++}; ++ ++static const struct opt_val_synonym bch2_opt_val_synonyms[] = { ++ { "degraded", "true", "yes" }, ++ { "degraded", "false", "no" }, ++ { "degraded", "1", "yes" }, ++ { "degraded", "0", "no" }, ++}; ++ ++static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val) ++{ ++ const struct opt_val_synonym *i; ++ ++ for (i = bch2_opt_val_synonyms; ++ i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms); ++ i++) ++ if (!strcmp(opt, i->opt) && !strcmp(val, i->v1)) ++ return i->v2; ++ ++ return val; ++} ++ + int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) + { + if (v < opt->min) { +@@ -337,21 +368,22 @@ int bch2_opt_parse(struct bch_fs *c, + { + ssize_t ret; + ++ if (err) ++ printbuf_indent_add_nextline(err, 2); ++ + switch (opt->type) { + case BCH_OPT_BOOL: +- if (val) { +- ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); +- if (ret != -BCH_ERR_option_not_bool) { +- *res = ret; +- } 
else { +- if (err) +- prt_printf(err, "%s: must be bool", opt->attr.name); +- return ret; +- } ++ if (!val) ++ val = "1"; ++ ++ ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); ++ if (ret != -BCH_ERR_option_not_bool) { ++ *res = ret; + } else { +- *res = 1; ++ if (err) ++ prt_printf(err, "%s: must be bool", opt->attr.name); ++ return ret; + } +- + break; + case BCH_OPT_UINT: + if (!val) { +@@ -360,9 +392,15 @@ int bch2_opt_parse(struct bch_fs *c, + return -EINVAL; + } + +- ret = opt->flags & OPT_HUMAN_READABLE +- ? bch2_strtou64_h(val, res) +- : kstrtou64(val, 10, res); ++ if (*val != '-') { ++ ret = opt->flags & OPT_HUMAN_READABLE ++ ? bch2_strtou64_h(val, res) ++ : kstrtou64(val, 10, res); ++ } else { ++ prt_printf(err, "%s: must be a non-negative number", opt->attr.name); ++ return -BCH_ERR_option_negative; ++ } ++ + if (ret < 0) { + if (err) + prt_printf(err, "%s: must be a number", +@@ -480,7 +518,7 @@ void bch2_opts_to_text(struct printbuf *out, + } + } + +-int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) ++int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v) + { + int ret = 0; + +@@ -498,15 +536,17 @@ int bch2_opt_check_may_set(struct bch_fs *c, struct bch_dev *ca, int id, u64 v) + if (v) + bch2_check_set_feature(c, BCH_FEATURE_ec); + break; ++ default: ++ break; + } + + return ret; + } + +-int bch2_opts_check_may_set(struct bch_fs *c) ++int bch2_opts_hooks_pre_set(struct bch_fs *c) + { + for (unsigned i = 0; i < bch2_opts_nr; i++) { +- int ret = bch2_opt_check_may_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); ++ int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); + if (ret) + return ret; + } +@@ -514,11 +554,64 @@ int bch2_opts_check_may_set(struct bch_fs *c) + return 0; + } + ++void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, ++ struct bch_opts *new_opts, enum bch_opt_id id) ++{ ++ switch (id) { ++ case Opt_foreground_target: ++ if (new_opts->foreground_target && ++ !new_opts->background_target) ++ bch2_set_rebalance_needs_scan(c, inum); ++ break; ++ case Opt_compression: ++ if (new_opts->compression && ++ !new_opts->background_compression) ++ bch2_set_rebalance_needs_scan(c, inum); ++ break; ++ case Opt_background_target: ++ if (new_opts->background_target) ++ bch2_set_rebalance_needs_scan(c, inum); ++ break; ++ case Opt_background_compression: ++ if (new_opts->background_compression) ++ bch2_set_rebalance_needs_scan(c, inum); ++ break; ++ case Opt_rebalance_enabled: ++ bch2_rebalance_wakeup(c); ++ break; ++ case Opt_copygc_enabled: ++ bch2_copygc_wakeup(c); ++ break; ++ case Opt_discard: ++ if (!ca) { ++ guard(mutex)(&c->sb_lock); ++ for_each_member_device(c, ca) { ++ struct bch_member *m = ++ bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx); ++ SET_BCH_MEMBER_DISCARD(m, c->opts.discard); ++ } ++ ++ bch2_write_super(c); ++ } ++ break; ++ case Opt_version_upgrade: ++ /* ++ * XXX: in the future we'll likely want to do compatible ++ * upgrades at runtime as well, but right now there's nothing ++ * that does that: ++ */ ++ if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible) ++ bch2_sb_upgrade_incompat(c); ++ break; ++ default: ++ break; ++ } ++} ++ + int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, + struct printbuf *parse_later, + const char *name, const char *val) + { +- struct printbuf err = PRINTBUF; + u64 v; + int ret, id; + +@@ -536,47 +629,43 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, 
struct bch_opts *opts, + if (id < 0) + return 0; + ++ /* must have a value for synonym lookup - but OPT_FN is weird */ ++ if (!val && bch2_opt_table[id].type != BCH_OPT_FN) ++ val = "1"; ++ ++ val = bch2_opt_val_synonym_lookup(name, val); ++ + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) +- goto bad_opt; ++ return -BCH_ERR_option_name; + + if (id == Opt_acl && + !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) +- goto bad_opt; ++ return -BCH_ERR_option_name; + + if ((id == Opt_usrquota || + id == Opt_grpquota) && + !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) +- goto bad_opt; ++ return -BCH_ERR_option_name; + ++ CLASS(printbuf, err)(); + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + if (ret == -BCH_ERR_option_needs_open_fs) { +- ret = 0; +- + if (parse_later) { + prt_printf(parse_later, "%s=%s,", name, val); + if (parse_later->allocation_failure) +- ret = -ENOMEM; ++ return -ENOMEM; + } + +- goto out; ++ return 0; + } + + if (ret < 0) +- goto bad_val; ++ return -BCH_ERR_option_value; + + if (opts) + bch2_opt_set_by_id(opts, id, v); + +- ret = 0; +-out: +- printbuf_exit(&err); +- return ret; +-bad_opt: +- ret = -BCH_ERR_option_name; +- goto out; +-bad_val: +- ret = -BCH_ERR_option_value; +- goto out; ++ return 0; + } + + int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, +@@ -667,9 +756,11 @@ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) + return 0; + } + +-void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, ++bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, + const struct bch_option *opt, u64 v) + { ++ bool changed = false; ++ + if (opt->flags & OPT_SB_FIELD_SECTORS) + v >>= 9; + +@@ -679,26 +770,34 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, + if (opt->flags & OPT_SB_FIELD_ONE_BIAS) + v++; + +- if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) ++ if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) { ++ changed = v != opt->get_sb(sb); ++ + opt->set_sb(sb, v); ++ } + + if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) { + if (WARN(!bch2_member_exists(sb, dev_idx), + "tried to set device option %s on nonexistent device %i", + opt->attr.name, dev_idx)) +- return; ++ return false; + +- opt->set_member(bch2_members_v2_get_mut(sb, dev_idx), v); ++ struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); ++ changed = v != opt->get_member(m); ++ opt->set_member(m, v); + } ++ ++ return changed; + } + +-void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, ++bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, + const struct bch_option *opt, u64 v) + { +- mutex_lock(&c->sb_lock); +- __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v); +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); ++ bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? 
ca->dev_idx : -1, opt, v); ++ if (changed) ++ bch2_write_super(c); ++ return changed; + } + + /* io opts: */ +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index dfb14810124c..84ce69a7f131 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -11,6 +11,7 @@ + struct bch_fs; + + extern const char * const bch2_error_actions[]; ++extern const char * const bch2_degraded_actions[]; + extern const char * const bch2_fsck_fix_opts[]; + extern const char * const bch2_version_upgrade_opts[]; + extern const char * const bch2_sb_features[]; +@@ -149,12 +150,12 @@ enum fsck_err_opts { + NULL, "Number of consecutive write errors allowed before kicking out a device")\ + x(metadata_replicas, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +- OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ OPT_UINT(1, BCH_REPLICAS_MAX + 1), \ + BCH_SB_META_REPLICAS_WANT, 1, \ + "#", "Number of metadata replicas") \ + x(data_replicas, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +- OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ OPT_UINT(1, BCH_REPLICAS_MAX + 1), \ + BCH_SB_DATA_REPLICAS_WANT, 1, \ + "#", "Number of data replicas") \ + x(metadata_replicas_required, u8, \ +@@ -164,7 +165,7 @@ enum fsck_err_opts { + "#", NULL) \ + x(data_replicas_required, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ +- OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ OPT_UINT(1, BCH_REPLICAS_MAX + 1), \ + BCH_SB_DATA_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(encoded_extent_max, u32, \ +@@ -233,6 +234,11 @@ enum fsck_err_opts { + OPT_BOOL(), \ + BCH_SB_CASEFOLD, false, \ + NULL, "Dirent lookups are casefolded") \ ++ x(casefold_disabled, u8, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, false, \ ++ NULL, "Disable casefolding filesystem wide") \ + x(inodes_32bit, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ +@@ -307,14 +313,9 @@ enum fsck_err_opts { + NULL, "Enable project quotas") \ + x(degraded, u8, \ + OPT_FS|OPT_MOUNT, \ +- OPT_BOOL(), \ +- BCH2_NO_SB_OPT, false, \ ++ OPT_STR(bch2_degraded_actions), \ ++ BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \ + NULL, "Allow mounting in degraded mode") \ +- x(very_degraded, u8, \ +- OPT_FS|OPT_MOUNT, \ +- OPT_BOOL(), \ +- BCH2_NO_SB_OPT, false, \ +- NULL, "Allow mounting in when data will be missing") \ + x(no_splitbrain_check, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ +@@ -383,6 +384,11 @@ enum fsck_err_opts { + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Exit recovery immediately prior to journal replay")\ ++ x(journal_rewind, u64, \ ++ OPT_FS|OPT_MOUNT, \ ++ OPT_UINT(0, U64_MAX), \ ++ BCH2_NO_SB_OPT, 0, \ ++ NULL, "Rewind journal") \ + x(recovery_passes, u64, \ + OPT_FS|OPT_MOUNT, \ + OPT_BITFIELD(bch2_recovery_passes), \ +@@ -454,7 +460,7 @@ enum fsck_err_opts { + BCH2_NO_SB_OPT, false, \ + NULL, "Reconstruct alloc btree") \ + x(version_upgrade, u8, \ +- OPT_FS|OPT_MOUNT, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_version_upgrade_opts), \ + BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ + NULL, "Set superblock to latest version,\n" \ +@@ -494,6 +500,17 @@ enum fsck_err_opts { + BCH2_NO_SB_OPT, true, \ + NULL, "Enable rebalance: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ ++ x(rebalance_on_ac_only, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_REBALANCE_AC_ONLY, false, \ ++ NULL, "Enable rebalance while on mains power only\n") \ ++ x(auto_snapshot_deletion, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Enable automatic 
snapshot deletion: disable for debugging, or to\n"\ ++ "quiet the system when doing performance testing\n")\ + x(no_data_io, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ +@@ -512,7 +529,7 @@ enum fsck_err_opts { + "size", "Specifies the bucket size; must be greater than the btree node size")\ + x(durability, u8, \ + OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \ +- OPT_UINT(0, BCH_REPLICAS_MAX), \ ++ OPT_UINT(0, BCH_REPLICAS_MAX + 1), \ + BCH_MEMBER_DURABILITY, 1, \ + "n", "Data written to this device will be considered\n"\ + "to have already been replicated n times") \ +@@ -522,7 +539,7 @@ enum fsck_err_opts { + BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ + "types", "Allowed data types for this device: journal, btree, and/or user")\ + x(discard, u8, \ +- OPT_MOUNT|OPT_DEVICE|OPT_RUNTIME, \ ++ OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_MEMBER_DISCARD, true, \ + NULL, "Enable discard/TRIM support") \ +@@ -530,7 +547,7 @@ enum fsck_err_opts { + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ +- NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ ++ NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\ + " prefetched sequentially") + + struct bch_opts { +@@ -616,10 +633,10 @@ void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); + + u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); + int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); +-void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); ++bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); + + struct bch_dev; +-void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); ++bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); + + int bch2_opt_lookup(const char *); + int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); +@@ -636,8 +653,11 @@ void bch2_opts_to_text(struct printbuf *, + struct bch_fs *, struct bch_sb *, + unsigned, unsigned, unsigned); + +-int bch2_opt_check_may_set(struct bch_fs *, struct bch_dev *, int, u64); +-int bch2_opts_check_may_set(struct bch_fs *); ++int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64); ++int bch2_opts_hooks_pre_set(struct bch_fs *); ++void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, ++ struct bch_opts *, enum bch_opt_id); ++ + int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, + struct printbuf *, const char *, const char *); + int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, +diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h +index 1ca476adbf6f..907e5c97550b 100644 +--- a/fs/bcachefs/printbuf.h ++++ b/fs/bcachefs/printbuf.h +@@ -140,6 +140,14 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], + .size = _size, \ + }) + ++static inline struct printbuf bch2_printbuf_init(void) ++{ ++ return PRINTBUF; ++} ++ ++DEFINE_CLASS(printbuf, struct printbuf, ++ bch2_printbuf_exit(&_T), bch2_printbuf_init(), void) ++ + /* + * Returns size remaining of output buffer: + */ +@@ -287,4 +295,8 @@ static inline void printbuf_atomic_dec(struct printbuf *buf) + buf->atomic--; + } + ++DEFINE_GUARD(printbuf_atomic, struct printbuf *, ++ printbuf_atomic_inc(_T), ++ printbuf_atomic_dec(_T)); ++ + #endif /* _BCACHEFS_PRINTBUF_H */ +diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c +index d09898566abe..792fc6fef270 100644 +--- 
a/fs/bcachefs/progress.c ++++ b/fs/bcachefs/progress.c +@@ -46,16 +46,16 @@ void bch2_progress_update_iter(struct btree_trans *trans, + s->last_node = b; + + if (progress_update_p(s)) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + unsigned percent = s->nodes_total + ? div64_u64(s->nodes_seen * 100, s->nodes_total) + : 0; + + prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", +- msg, percent, s->nodes_seen, s->nodes_total); ++ strip_bch2(msg), ++ percent, s->nodes_seen, s->nodes_total); + bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); + + bch_info(c, "%s", buf.buf); +- printbuf_exit(&buf); + } + } +diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h +index 23fb1811f943..972a73087ffe 100644 +--- a/fs/bcachefs/progress.h ++++ b/fs/bcachefs/progress.h +@@ -26,4 +26,7 @@ void bch2_progress_update_iter(struct btree_trans *, + struct btree_iter *, + const char *); + ++#define progress_update_iter(trans, p, iter) \ ++ bch2_progress_update_iter(trans, p, iter, __func__) ++ + #endif /* _BCACHEFS_PROGRESS_H */ +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 3d4755d73af7..eaa43ad9baa6 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -394,12 +394,10 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, + dq = bkey_s_c_to_quota(k); + q = &c->quotas[k.k->p.inode]; + +- mutex_lock(&q->lock); ++ guard(mutex)(&q->lock); + mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); +- if (!mq) { +- mutex_unlock(&q->lock); ++ if (!mq) + return -ENOMEM; +- } + + for (i = 0; i < Q_COUNTERS; i++) { + mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); +@@ -414,8 +412,6 @@ static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, + mq->c[Q_INO].timer = qdq->d_ino_timer; + if (qdq && qdq->d_fieldmask & QC_INO_WARNS) + mq->c[Q_INO].warns = qdq->d_ino_warns; +- +- mutex_unlock(&q->lock); + } + + return 0; +@@ -516,30 +512,27 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, + bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, + KEY_TYPE_QUOTA_NOCHECK); + advance: +- bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); ++ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + return 0; + } + + int bch2_fs_quota_read(struct bch_fs *c) + { ++ scoped_guard(mutex, &c->sb_lock) { ++ struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); ++ if (!sb_quota) ++ return bch_err_throw(c, ENOSPC_sb_quota); + +- mutex_lock(&c->sb_lock); +- struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); +- if (!sb_quota) { +- mutex_unlock(&c->sb_lock); +- return -BCH_ERR_ENOSPC_sb_quota; ++ bch2_sb_quota_read(c); + } + +- bch2_sb_quota_read(c); +- mutex_unlock(&c->sb_lock); +- +- int ret = bch2_trans_run(c, +- for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, ++ CLASS(btree_trans, trans)(c); ++ int ret = for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN, + BTREE_ITER_prefetch, k, + __bch2_quota_set(c, k, NULL)) ?: + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, +- bch2_fs_quota_read_inode(trans, &iter, k))); ++ bch2_fs_quota_read_inode(trans, &iter, k)); + bch_err_fn(c, ret); + return ret; + } +@@ -550,7 +543,6 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) + { + struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; +- int ret = 0; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; +@@ -569,11 +561,12 @@ static int 
bch2_quota_enable(struct super_block *sb, unsigned uflags) + if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) + return -EINVAL; + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { +- ret = -BCH_ERR_ENOSPC_sb_quota; +- goto unlock; ++ int ret = bch_err_throw(c, ENOSPC_sb_quota); ++ bch_err_fn(c, ret); ++ return ret; + } + + if (uflags & FS_QUOTA_UDQ_ENFD) +@@ -586,10 +579,7 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) + SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); + + bch2_write_super(c); +-unlock: +- mutex_unlock(&c->sb_lock); +- +- return bch2_err_class(ret); ++ return 0; + } + + static int bch2_quota_disable(struct super_block *sb, unsigned uflags) +@@ -599,7 +589,7 @@ static int bch2_quota_disable(struct super_block *sb, unsigned uflags) + if (sb->s_flags & SB_RDONLY) + return -EROFS; + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); + +@@ -610,8 +600,6 @@ static int bch2_quota_disable(struct super_block *sb, unsigned uflags) + SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); + + bch2_write_super(c); +- mutex_unlock(&c->sb_lock); +- + return 0; + } + +@@ -700,14 +688,12 @@ static int bch2_quota_set_info(struct super_block *sb, int type, + { + struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; +- int ret = 0; + + if (0) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + qc_info_to_text(&buf, info); + pr_info("setting:\n%s", buf.buf); +- printbuf_exit(&buf); + } + + if (sb->s_flags & SB_RDONLY) +@@ -723,11 +709,12 @@ static int bch2_quota_set_info(struct super_block *sb, int type, + ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) + return -EINVAL; + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { +- ret = -BCH_ERR_ENOSPC_sb_quota; +- goto unlock; ++ int ret = bch_err_throw(c, ENOSPC_sb_quota); ++ bch_err_fn(c, ret); ++ return bch2_err_class(ret); + } + + if (info->i_fieldmask & QC_SPC_TIMER) +@@ -749,10 +736,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type, + bch2_sb_quota_read(c); + + bch2_write_super(c); +-unlock: +- mutex_unlock(&c->sb_lock); +- +- return bch2_err_class(ret); ++ return 0; + } + + /* Get/set individual quotas: */ +@@ -778,15 +762,13 @@ static int bch2_get_quota(struct super_block *sb, struct kqid kqid, + struct bch_fs *c = sb->s_fs_info; + struct bch_memquota_type *q = &c->quotas[kqid.type]; + qid_t qid = from_kqid(&init_user_ns, kqid); +- struct bch_memquota *mq; + + memset(qdq, 0, sizeof(*qdq)); + +- mutex_lock(&q->lock); +- mq = genradix_ptr(&q->table, qid); ++ guard(mutex)(&q->lock); ++ struct bch_memquota *mq = genradix_ptr(&q->table, qid); + if (mq) + __bch2_quota_get(qdq, mq); +- mutex_unlock(&q->lock); + + return 0; + } +@@ -799,34 +781,27 @@ static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, + qid_t qid = from_kqid(&init_user_ns, *kqid); + struct genradix_iter iter; + struct bch_memquota *mq; +- int ret = 0; + +- mutex_lock(&q->lock); ++ guard(mutex)(&q->lock); + + genradix_for_each_from(&q->table, iter, mq, qid) + if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { + __bch2_quota_get(qdq, mq); + *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); +- goto found; ++ return 0; + } + +- ret = -ENOENT; +-found: +- mutex_unlock(&q->lock); +- return bch2_err_class(ret); ++ return -ENOENT; + 
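
The same hunks convert bare `-BCH_ERR_*` returns into bch_err_throw(c, ...). As we read the conversion, the helper ties the error to a specific filesystem at the point where it is generated, so an error code that can surface from many sites stays attributable. A toy model; the counter and message are purely our illustration, not bcachefs's actual bookkeeping:

    #include <stdio.h>

    enum errcode { ERR_enospc_sb_quota = 2048 };    /* value is ours */

    struct fs_model { unsigned long errs_thrown; };

    /* Note the throw site, then yield the negative code so callers
     * still see an ordinary errno-style return value. */
    #define err_throw(c, e)                             \
        ({                                              \
            (c)->errs_thrown++;                         \
            fprintf(stderr, "threw %s at %s:%d\n",      \
                    #e, __FILE__, __LINE__);            \
            -(int)(e);                                  \
        })

    static int quota_enable(struct fs_model *c, int have_space)
    {
        return have_space ? 0 : err_throw(c, ERR_enospc_sb_quota);
    }
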
} + + static int bch2_set_quota_trans(struct btree_trans *trans, + struct bkey_i_quota *new_quota, + struct qc_dqblk *qdq) + { +- struct btree_iter iter; +- struct bkey_s_c k; +- int ret; +- +- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, +- BTREE_ITER_slots|BTREE_ITER_intent); +- ret = bkey_err(k); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_quotas, new_quota->k.p, ++ BTREE_ITER_slots|BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); ++ int ret = bkey_err(k); + if (unlikely(ret)) + return ret; + +@@ -843,33 +818,29 @@ static int bch2_set_quota_trans(struct btree_trans *trans, + if (qdq->d_fieldmask & QC_INO_HARD) + new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); + +- ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return bch2_trans_update(trans, &iter, &new_quota->k_i, 0); + } + + static int bch2_set_quota(struct super_block *sb, struct kqid qid, + struct qc_dqblk *qdq) + { + struct bch_fs *c = sb->s_fs_info; +- struct bkey_i_quota new_quota; +- int ret; + + if (0) { +- struct printbuf buf = PRINTBUF; +- ++ CLASS(printbuf, buf)(); + qc_dqblk_to_text(&buf, qdq); + pr_info("setting:\n%s", buf.buf); +- printbuf_exit(&buf); + } + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + ++ struct bkey_i_quota new_quota; + bkey_quota_init(&new_quota.k_i); + new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); + +- ret = bch2_trans_commit_do(c, NULL, NULL, 0, ++ CLASS(btree_trans, trans)(c); ++ int ret = commit_do(trans, NULL, NULL, 0, + bch2_set_quota_trans(trans, &new_quota, qdq)) ?: + __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); + +diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c +index bef2aa1b8bcd..b1438be9d690 100644 +--- a/fs/bcachefs/rcu_pending.c ++++ b/fs/bcachefs/rcu_pending.c +@@ -182,11 +182,6 @@ static inline void kfree_bulk(size_t nr, void ** p) + while (nr--) + kfree(*p); + } +- +-#define local_irq_save(flags) \ +-do { \ +- flags = 0; \ +-} while (0) + #endif + + static noinline void __process_finished_items(struct rcu_pending *pending, +@@ -429,9 +424,15 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, + + BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); + +- local_irq_save(flags); +- p = this_cpu_ptr(pending->p); +- spin_lock(&p->lock); ++ /* We could technically be scheduled before taking the lock and end up ++ * using a different cpu's rcu_pending_pcpu: that's ok, it needs a lock ++ * anyways ++ * ++ * And we have to do it this way to avoid breaking PREEMPT_RT, which ++ * redefines how spinlocks work: ++ */ ++ p = raw_cpu_ptr(pending->p); ++ spin_lock_irqsave(&p->lock, flags); + rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); + restart: + if (may_sleep && +@@ -520,9 +521,8 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, + goto free_node; + } + +- local_irq_save(flags); +- p = this_cpu_ptr(pending->p); +- spin_lock(&p->lock); ++ p = raw_cpu_ptr(pending->p); ++ spin_lock_irqsave(&p->lock, flags); + goto restart; + } + +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index 623273556aa9..c0c5fe961a83 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -15,6 +15,7 @@ + #include "inode.h" + #include "io_write.h" + #include "move.h" ++#include "progress.h" + #include "rebalance.h" + #include "subvolume.h" + #include "super-io.h" +@@ -80,6 +81,7 @@ static inline unsigned 
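
The rcu_pending hunk replaces local_irq_save() + this_cpu_ptr() + spin_lock() with raw_cpu_ptr() + spin_lock_irqsave(). On PREEMPT_RT spinlocks become sleeping locks, so interrupts must not be disabled before taking one, and, as the new comment notes, migrating off the sampled CPU between the pointer read and the lock is harmless: any CPU's structure works once its lock is held. A userspace model of that last point (bucket count and lookup are ours):

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <sched.h>

    #define NR_BUCKETS 8

    struct bucket { pthread_mutex_t lock; unsigned items; };

    static struct bucket buckets[NR_BUCKETS] = {
        [0 ... NR_BUCKETS - 1] = { PTHREAD_MUTEX_INITIALIZER, 0 }
    };

    /* Sample "this cpu's" bucket without pinning ourselves to it: if we
     * migrate before the lock is taken we simply use another cpu's
     * bucket, which is still correct because every access holds the
     * bucket lock. */
    static void enqueue_one(void)
    {
        int cpu = sched_getcpu();
        struct bucket *b = &buckets[(cpu > 0 ? cpu : 0) % NR_BUCKETS];

        pthread_mutex_lock(&b->lock);
        b->items++;
        pthread_mutex_unlock(&b->lock);
    }
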
bch2_bkey_ptrs_need_move(struct bch_fs *c, + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + ++ guard(rcu)(); + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) + rewrite_ptrs |= ptr_bit; +@@ -95,6 +97,9 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + ++ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ++ return 0; ++ + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | + bch2_bkey_ptrs_need_move(c, opts, ptrs); + } +@@ -107,6 +112,9 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) + if (!opts) + return 0; + ++ if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ++ return 0; ++ + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 sectors = 0; +@@ -126,10 +134,13 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) + } + } + incompressible: +- if (opts->background_target) ++ if (opts->background_target) { ++ guard(rcu)(); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) ++ if (!p.ptr.cached && ++ !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) + sectors += p.crc.compressed_size; ++ } + + return sectors; + } +@@ -210,7 +221,7 @@ int bch2_get_update_rebalance_opts(struct btree_trans *trans, + return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0) ?: +- -BCH_ERR_transaction_restart_nested; ++ bch_err_throw(trans->c, transaction_restart_nested); + } + + #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) +@@ -224,43 +235,34 @@ static const char * const bch2_rebalance_state_strs[] = { + + int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) + { +- struct btree_iter iter; +- struct bkey_s_c k; +- struct bkey_i_cookie *cookie; +- u64 v; +- int ret; +- +- bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, +- SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), +- BTREE_ITER_intent); +- k = bch2_btree_iter_peek_slot(trans, &iter); +- ret = bkey_err(k); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work, ++ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), ++ BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); ++ int ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + +- v = k.k->type == KEY_TYPE_cookie ++ u64 v = k.k->type == KEY_TYPE_cookie + ? 
le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) + : 0; + +- cookie = bch2_trans_kmalloc(trans, sizeof(*cookie)); ++ struct bkey_i_cookie *cookie = bch2_trans_kmalloc(trans, sizeof(*cookie)); + ret = PTR_ERR_OR_ZERO(cookie); + if (ret) +- goto err; ++ return ret; + + bkey_cookie_init(&cookie->k_i); + cookie->k.p = iter.pos; + cookie->v.cookie = cpu_to_le64(v + 1); + +- ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return bch2_trans_update(trans, &iter, &cookie->k_i, 0); + } + + int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) + { +- int ret = bch2_trans_commit_do(c, NULL, NULL, +- BCH_TRANS_COMMIT_no_enospc, ++ CLASS(btree_trans, trans)(c); ++ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_set_rebalance_needs_scan_trans(trans, inum)); + bch2_rebalance_wakeup(c); + return ret; +@@ -273,35 +275,28 @@ int bch2_set_fs_needs_rebalance(struct bch_fs *c) + + static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie) + { +- struct btree_iter iter; +- struct bkey_s_c k; +- u64 v; +- int ret; +- +- bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, +- SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), +- BTREE_ITER_intent); +- k = bch2_btree_iter_peek_slot(trans, &iter); +- ret = bkey_err(k); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work, ++ SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), ++ BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); ++ int ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + +- v = k.k->type == KEY_TYPE_cookie ++ u64 v = k.k->type == KEY_TYPE_cookie + ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie) + : 0; + +- if (v == cookie) +- ret = bch2_btree_delete_at(trans, &iter, 0); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return v == cookie ++ ? bch2_btree_delete_at(trans, &iter, 0) ++ : 0; + } + + static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, + struct btree_iter *work_iter) + { + return !kthread_should_stop() +- ? bch2_btree_iter_peek(trans, work_iter) ++ ? bch2_btree_iter_peek(work_iter) + : bkey_s_c_null; + } + +@@ -330,12 +325,12 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + +- bch2_trans_iter_exit(trans, extent_iter); ++ bch2_trans_iter_exit(extent_iter); + bch2_trans_iter_init(trans, extent_iter, + work_pos.inode ? 
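
bch2_set_rebalance_needs_scan_trans() and bch2_clear_rebalance_needs_scan() above form a token handshake: requesting a scan stores cookie v + 1, the scanner remembers the cookie it started from, and the entry is deleted only if the cookie still matches what the scanner saw, so a request that races with an in-flight scan survives for the next pass. A compact model (the struct is ours):

    #include <stdbool.h>
    #include <stdint.h>

    struct scan_entry { uint64_t cookie; bool present; };

    static uint64_t request_scan(struct scan_entry *e)
    {
        e->present = true;
        return ++e->cookie;     /* set_rebalance_needs_scan */
    }

    static void scan_done(struct scan_entry *e, uint64_t seen)
    {
        if (e->present && e->cookie == seen)    /* clear_..._scan */
            e->present = false;
        /* else a newer request raced in; leave it for the next scan */
    }
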
BTREE_ID_extents : BTREE_ID_reflink, + work_pos, + BTREE_ITER_all_snapshots); +- struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter); + if (bkey_err(k)) + return k; + +@@ -363,7 +358,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, + } + + if (trace_rebalance_extent_enabled()) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); +@@ -389,7 +384,6 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, + } + + trace_rebalance_extent(c, buf.buf); +- printbuf_exit(&buf); + } + + return k; +@@ -433,7 +427,7 @@ static int do_rebalance_extent(struct moving_context *ctxt, + if (bch2_err_matches(ret, ENOMEM)) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); +- ret = -BCH_ERR_transaction_restart_nested; ++ ret = bch_err_throw(c, transaction_restart_nested); + } + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +@@ -447,22 +441,11 @@ static int do_rebalance_extent(struct moving_context *ctxt, + return ret; + } + +-static bool rebalance_pred(struct bch_fs *c, void *arg, +- struct bkey_s_c k, +- struct bch_io_opts *io_opts, +- struct data_update_opts *data_opts) +-{ +- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); +- data_opts->target = io_opts->background_target; +- data_opts->write_flags |= BCH_WRITE_only_specified_devs; +- return data_opts->rewrite_ptrs != 0; +-} +- + static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) + { + struct btree_trans *trans = ctxt->trans; ++ struct bch_fs *c = trans->c; + struct bch_fs_rebalance *r = &trans->c->rebalance; +- int ret; + + bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); + ctxt->stats = &r->scan_stats; +@@ -477,11 +460,34 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) + + r->state = BCH_REBALANCE_scanning; + +- ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: +- commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- bch2_clear_rebalance_needs_scan(trans, inum, cookie)); ++ struct per_snapshot_io_opts snapshot_io_opts; ++ per_snapshot_io_opts_init(&snapshot_io_opts, c); ++ ++ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, ++ r->scan_start.pos, r->scan_end.pos, ++ BTREE_ITER_all_snapshots| ++ BTREE_ITER_not_extents| ++ BTREE_ITER_prefetch, k, ({ ++ ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + ++ struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, ++ &snapshot_io_opts, iter.pos, &iter, k); ++ PTR_ERR_OR_ZERO(io_opts); ++ })) ?: ++ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ++ bch2_clear_rebalance_needs_scan(trans, inum, cookie)); ++ ++ per_snapshot_io_opts_exit(&snapshot_io_opts); + bch2_move_stats_exit(&r->scan_stats, trans->c); ++ ++ /* ++ * Ensure that the rebalance_work entries we created are seen by the ++ * next iteration of do_rebalance(), so we don't end up stuck in ++ * rebalance_wait(): ++ */ ++ atomic64_inc(&r->scan_stats.sectors_seen); ++ bch2_btree_write_buffer_flush_sync(trans); ++ + return ret; + } + +@@ -503,7 +509,14 @@ static void rebalance_wait(struct bch_fs *c) + r->state = BCH_REBALANCE_waiting; + } + +- bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); ++ bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); ++} ++ ++static bool 
bch2_rebalance_enabled(struct bch_fs *c) ++{ ++ return c->opts.rebalance_enabled && ++ !(c->opts.rebalance_on_ac_only && ++ c->rebalance.on_battery); + } + + static int do_rebalance(struct moving_context *ctxt) +@@ -511,8 +524,9 @@ static int do_rebalance(struct moving_context *ctxt) + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; + struct bch_fs_rebalance *r = &c->rebalance; +- struct btree_iter rebalance_work_iter, extent_iter = {}; ++ struct btree_iter extent_iter = { NULL }; + struct bkey_s_c k; ++ u32 kick = r->kick; + int ret = 0; + + bch2_trans_begin(trans); +@@ -520,14 +534,14 @@ static int do_rebalance(struct moving_context *ctxt) + bch2_move_stats_init(&r->work_stats, "rebalance_work"); + bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); + +- bch2_trans_iter_init(trans, &rebalance_work_iter, +- BTREE_ID_rebalance_work, POS_MIN, +- BTREE_ITER_all_snapshots); ++ CLASS(btree_iter, rebalance_work_iter)(trans, ++ BTREE_ID_rebalance_work, POS_MIN, ++ BTREE_ITER_all_snapshots); + + while (!bch2_move_ratelimit(ctxt)) { +- if (!c->opts.rebalance_enabled) { ++ if (!bch2_rebalance_enabled(c)) { + bch2_moving_ctxt_flush_all(ctxt); +- kthread_wait_freezable(c->opts.rebalance_enabled || ++ kthread_wait_freezable(bch2_rebalance_enabled(c) || + kthread_should_stop()); + } + +@@ -552,17 +566,17 @@ static int do_rebalance(struct moving_context *ctxt) + if (ret) + break; + +- bch2_btree_iter_advance(trans, &rebalance_work_iter); ++ bch2_btree_iter_advance(&rebalance_work_iter); + } + +- bch2_trans_iter_exit(trans, &extent_iter); +- bch2_trans_iter_exit(trans, &rebalance_work_iter); ++ bch2_trans_iter_exit(&extent_iter); + bch2_move_stats_exit(&r->scan_stats, c); + + if (!ret && + !kthread_should_stop() && + !atomic64_read(&r->work_stats.sectors_seen) && +- !atomic64_read(&r->scan_stats.sectors_seen)) { ++ !atomic64_read(&r->scan_stats.sectors_seen) && ++ kick == r->kick) { + bch2_moving_ctxt_flush_all(ctxt); + bch2_trans_unlock_long(trans); + rebalance_wait(c); +@@ -585,7 +599,7 @@ static int bch2_rebalance_thread(void *arg) + * Data move operations can't run until after check_snapshots has + * completed, and bch2_snapshot_is_ancestor() is available. 
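
do_rebalance() now samples `kick = r->kick` on entry and only sleeps if the counter is unchanged after it finds no work; bch2_rebalance_wakeup() increments the counter (see the rebalance.h hunk below). That closes the window where a wakeup lands between "no work found" and "go to sleep". A userspace model, with C11 atomics standing in for the ordering the kernel gets from its wakeup primitives:

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_uint kick;

    static void rebalance_wakeup(void)
    {
        atomic_fetch_add(&kick, 1);
        /* ...wake_up_process()... */
    }

    static bool scan_found_work(void) { return false; }    /* stand-in */

    static void worker_iteration(void)
    {
        unsigned seen = atomic_load(&kick);

        if (!scan_found_work() && atomic_load(&kick) == seen) {
            /* idle and nobody kicked us mid-scan: safe to sleep */
        } else {
            /* work exists, or a kick raced in: loop immediately */
        }
    }
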
+ */ +- kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || ++ kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_should_stop()); + + bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, +@@ -646,11 +660,12 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) + } + prt_newline(out); + +- rcu_read_lock(); +- struct task_struct *t = rcu_dereference(c->rebalance.thread); +- if (t) +- get_task_struct(t); +- rcu_read_unlock(); ++ struct task_struct *t; ++ scoped_guard(rcu) { ++ t = rcu_dereference(c->rebalance.thread); ++ if (t) ++ get_task_struct(t); ++ } + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); +@@ -681,17 +696,15 @@ void bch2_rebalance_stop(struct bch_fs *c) + + int bch2_rebalance_start(struct bch_fs *c) + { +- struct task_struct *p; +- int ret; +- + if (c->rebalance.thread) + return 0; + + if (c->opts.nochanges) + return 0; + +- p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); +- ret = PTR_ERR_OR_ZERO(p); ++ struct task_struct *p = ++ kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); ++ int ret = PTR_ERR_OR_ZERO(p); + bch_err_msg(c, ret, "creating rebalance thread"); + if (ret) + return ret; +@@ -702,7 +715,152 @@ int bch2_rebalance_start(struct bch_fs *c) + return 0; + } + +-void bch2_fs_rebalance_init(struct bch_fs *c) ++#ifdef CONFIG_POWER_SUPPLY ++#include ++ ++static int bch2_rebalance_power_notifier(struct notifier_block *nb, ++ unsigned long event, void *data) ++{ ++ struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier); ++ ++ c->rebalance.on_battery = !power_supply_is_system_supplied(); ++ bch2_rebalance_wakeup(c); ++ return NOTIFY_OK; ++} ++#endif ++ ++void bch2_fs_rebalance_exit(struct bch_fs *c) ++{ ++#ifdef CONFIG_POWER_SUPPLY ++ power_supply_unreg_notifier(&c->rebalance.power_notifier); ++#endif ++} ++ ++int bch2_fs_rebalance_init(struct bch_fs *c) ++{ ++ struct bch_fs_rebalance *r = &c->rebalance; ++ ++ bch2_pd_controller_init(&r->pd); ++ ++#ifdef CONFIG_POWER_SUPPLY ++ r->power_notifier.notifier_call = bch2_rebalance_power_notifier; ++ int ret = power_supply_reg_notifier(&r->power_notifier); ++ if (ret) ++ return ret; ++ ++ r->on_battery = !power_supply_is_system_supplied(); ++#endif ++ return 0; ++} ++ ++static int check_rebalance_work_one(struct btree_trans *trans, ++ struct btree_iter *extent_iter, ++ struct btree_iter *rebalance_iter, ++ struct bkey_buf *last_flushed) + { +- bch2_pd_controller_init(&c->rebalance.pd); ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c extent_k, rebalance_k; ++ CLASS(printbuf, buf)(); ++ ++ int ret = bkey_err(extent_k = bch2_btree_iter_peek(extent_iter)) ?: ++ bkey_err(rebalance_k = bch2_btree_iter_peek(rebalance_iter)); ++ if (ret) ++ return ret; ++ ++ if (!extent_k.k && ++ extent_iter->btree_id == BTREE_ID_reflink && ++ (!rebalance_k.k || ++ rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) { ++ bch2_trans_iter_exit(extent_iter); ++ bch2_trans_iter_init(trans, extent_iter, ++ BTREE_ID_extents, POS_MIN, ++ BTREE_ITER_prefetch| ++ BTREE_ITER_all_snapshots); ++ return bch_err_throw(c, transaction_restart_nested); ++ } ++ ++ if (!extent_k.k && !rebalance_k.k) ++ return 1; ++ ++ int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX, ++ rebalance_k.k ? 
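
Together with bch2_rebalance_enabled() at the top of this hunk, the CONFIG_POWER_SUPPLY notifier lets rebalance hold off while on battery: the notifier keeps rebalance.on_battery current and wakes the worker, which re-evaluates the predicate on every loop. Reduced to a standalone function (the struct layout is ours; the logic mirrors the patch):

    #include <stdbool.h>

    struct opts  { bool rebalance_enabled, rebalance_on_ac_only; };
    struct state { bool on_battery; };

    static bool rebalance_should_run(const struct opts *o,
                                     const struct state *s)
    {
        return o->rebalance_enabled &&
               !(o->rebalance_on_ac_only && s->on_battery);
    }
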
rebalance_k.k->p : SPOS_MAX); ++ ++ struct bkey deleted; ++ bkey_init(&deleted); ++ ++ if (cmp < 0) { ++ deleted.p = extent_k.k->p; ++ rebalance_k.k = &deleted; ++ } else if (cmp > 0) { ++ deleted.p = rebalance_k.k->p; ++ extent_k.k = &deleted; ++ } ++ ++ bool should_have_rebalance = ++ bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; ++ bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; ++ ++ if (should_have_rebalance != have_rebalance) { ++ ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed); ++ if (ret) ++ return ret; ++ ++ bch2_bkey_val_to_text(&buf, c, extent_k); ++ } ++ ++ if (fsck_err_on(!should_have_rebalance && have_rebalance, ++ trans, rebalance_work_incorrectly_set, ++ "rebalance work incorrectly set\n%s", buf.buf)) { ++ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, ++ extent_k.k->p, false); ++ if (ret) ++ return ret; ++ } ++ ++ if (fsck_err_on(should_have_rebalance && !have_rebalance, ++ trans, rebalance_work_incorrectly_unset, ++ "rebalance work incorrectly unset\n%s", buf.buf)) { ++ ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, ++ extent_k.k->p, true); ++ if (ret) ++ return ret; ++ } ++ ++ if (cmp <= 0) ++ bch2_btree_iter_advance(extent_iter); ++ if (cmp >= 0) ++ bch2_btree_iter_advance(rebalance_iter); ++fsck_err: ++ return ret; ++} ++ ++int bch2_check_rebalance_work(struct bch_fs *c) ++{ ++ CLASS(btree_trans, trans)(c); ++ CLASS(btree_iter, extent_iter)(trans, BTREE_ID_reflink, POS_MIN, ++ BTREE_ITER_prefetch); ++ CLASS(btree_iter, rebalance_iter)(trans, BTREE_ID_rebalance_work, POS_MIN, ++ BTREE_ITER_prefetch); ++ ++ struct bkey_buf last_flushed; ++ bch2_bkey_buf_init(&last_flushed); ++ bkey_init(&last_flushed.k->k); ++ ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_rebalance_work)); ++ ++ int ret = 0; ++ while (!ret) { ++ progress_update_iter(trans, &progress, &rebalance_iter); ++ ++ bch2_trans_begin(trans); ++ ++ ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); ++ ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ ret = 0; ++ } ++ ++ bch2_bkey_buf_exit(&last_flushed, c); ++ return ret < 0 ? 
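
check_rebalance_work_one() above walks the extents and rebalance_work btrees in lockstep, substituting a synthetic deleted key for whichever side is behind, then repairs entries that are incorrectly set or unset. Stripped of the btree machinery, the traversal is a sorted merge-join; here arrays stand in for the two iterators:

    #include <stdint.h>
    #include <stdio.h>

    static void check(const uint64_t *need, int n_need,    /* should be set */
                      const uint64_t *have, int n_have)    /* actually set */
    {
        int i = 0, j = 0;

        while (i < n_need || j < n_have) {
            if (j >= n_have || (i < n_need && need[i] < have[j]))
                printf("incorrectly unset at %llu\n",
                       (unsigned long long)need[i++]);
            else if (i >= n_need || have[j] < need[i])
                printf("incorrectly set at %llu\n",
                       (unsigned long long)have[j++]);
            else
                i++, j++;       /* both agree; advance both sides */
        }
    }
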
ret : 0; + } +diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h +index e5e8eb4a2dd1..7a565ea7dbfc 100644 +--- a/fs/bcachefs/rebalance.h ++++ b/fs/bcachefs/rebalance.h +@@ -39,19 +39,21 @@ int bch2_set_fs_needs_rebalance(struct bch_fs *); + + static inline void bch2_rebalance_wakeup(struct bch_fs *c) + { +- struct task_struct *p; +- +- rcu_read_lock(); +- p = rcu_dereference(c->rebalance.thread); ++ c->rebalance.kick++; ++ guard(rcu)(); ++ struct task_struct *p = rcu_dereference(c->rebalance.thread); + if (p) + wake_up_process(p); +- rcu_read_unlock(); + } + + void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); + + void bch2_rebalance_stop(struct bch_fs *); + int bch2_rebalance_start(struct bch_fs *); +-void bch2_fs_rebalance_init(struct bch_fs *); ++ ++void bch2_fs_rebalance_exit(struct bch_fs *); ++int bch2_fs_rebalance_init(struct bch_fs *); ++ ++int bch2_check_rebalance_work(struct bch_fs *); + + #endif /* _BCACHEFS_REBALANCE_H */ +diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h +index fe5098c17dfc..c659da149fa3 100644 +--- a/fs/bcachefs/rebalance_types.h ++++ b/fs/bcachefs/rebalance_types.h +@@ -18,6 +18,7 @@ enum bch_rebalance_states { + + struct bch_fs_rebalance { + struct task_struct __rcu *thread; ++ u32 kick; + struct bch_pd_controller pd; + + enum bch_rebalance_states state; +@@ -30,6 +31,11 @@ struct bch_fs_rebalance { + struct bbpos scan_start; + struct bbpos scan_end; + struct bch_move_stats scan_stats; ++ ++ bool on_battery; ++#ifdef CONFIG_POWER_SUPPLY ++ struct notifier_block power_notifier; ++#endif + }; + + #endif /* _BCACHEFS_REBALANCE_TYPES_H */ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index d6c4ef819d40..c57ff235a97a 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -33,77 +33,86 @@ + #include + #include + +- +-int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) ++int bch2_btree_lost_data(struct bch_fs *c, ++ struct printbuf *msg, ++ enum btree_id btree) + { +- u64 b = BIT_ULL(btree); + int ret = 0; + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); ++ bool write_sb = false; + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + +- if (!(c->sb.btrees_lost_data & b)) { +- struct printbuf buf = PRINTBUF; +- bch2_btree_id_to_text(&buf, btree); +- bch_err(c, "flagging btree %s lost data", buf.buf); +- printbuf_exit(&buf); +- ext->btrees_lost_data |= cpu_to_le64(b); ++ if (!(c->sb.btrees_lost_data & BIT_ULL(btree))) { ++ prt_printf(msg, "flagging btree "); ++ bch2_btree_id_to_text(msg, btree); ++ prt_printf(msg, " lost data\n"); ++ ++ write_sb |= !__test_and_set_bit_le64(btree, &ext->btrees_lost_data); + } + + /* Once we have runtime self healing for topology errors we won't need this: */ +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret; + + /* Btree node accounting will be off: */ +- __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; ++ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0, &write_sb) ?: ret; + + #ifdef CONFIG_BCACHEFS_DEBUG + /* + * These are much more minor, and don't need to be corrected right away, + * 
but in debug mode we want the next fsck run to be clean: + */ +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0, &write_sb) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0, &write_sb) ?: ret; + #endif + ++ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); ++ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_backpointer_to_missing_ptr, ext->errors_silent); ++ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); ++ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); ++ + switch (btree) { + case BTREE_ID_alloc: +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; +- +- __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); +- __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); +- __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); +- __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); +- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); +- __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; ++ ++ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); ++ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); ++ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); ++ write_sb |= !__test_and_set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); + goto out; + case BTREE_ID_backpointers: +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0, &write_sb) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0, &write_sb) ?: ret; + goto out; + case BTREE_ID_need_discard: +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; + goto out; + case BTREE_ID_freespace: +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; + goto out; + case BTREE_ID_bucket_gens: +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; + goto out; + case BTREE_ID_lru: +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, 
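
The rewritten bch2_btree_lost_data() threads a write_sb flag through every superblock mutation: __test_and_set_bit_le64() reports whether the bit was already set, and bch2_write_super() runs once at the end, only if something actually changed. The shape of the pattern, with a plain uint64_t in place of the kernel's little-endian bitmap:

    #include <stdbool.h>
    #include <stdint.h>

    static bool test_and_set_bit64(unsigned nr, uint64_t *addr)
    {
        uint64_t mask = 1ULL << nr;
        bool old = *addr & mask;

        *addr |= mask;
        return old;     /* previous value, like the kernel helper */
    }

    static void flag_lost_data(uint64_t *btrees_lost, unsigned btree)
    {
        bool write_sb = false;

        write_sb |= !test_and_set_bit64(btree, btrees_lost);
        /* ...more flag updates, each OR-ing into write_sb... */

        if (write_sb)
            ;           /* bch2_write_super() */
    }
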
BCH_RECOVERY_PASS_check_alloc_info, 0, &write_sb) ?: ret; + goto out; + case BTREE_ID_accounting: +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0, &write_sb) ?: ret; ++ goto out; ++ case BTREE_ID_snapshots: ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0, &write_sb) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0, &write_sb) ?: ret; + goto out; + default: +- ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0, &write_sb) ?: ret; ++ ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0, &write_sb) ?: ret; + goto out; + } + out: +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); +- ++ if (write_sb) ++ bch2_write_super(c); + return ret; + } + +@@ -114,12 +123,9 @@ static void kill_btree(struct bch_fs *c, enum btree_id btree) + } + + /* for -o reconstruct_alloc: */ +-static void bch2_reconstruct_alloc(struct bch_fs *c) ++void bch2_reconstruct_alloc(struct bch_fs *c) + { +- bch2_journal_log_msg(c, "dropping alloc info"); +- bch_info(c, "dropping and reconstructing all alloc info"); +- +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); +@@ -160,8 +166,9 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) + + c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + ++ c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info)); ++ + bch2_write_super(c); +- mutex_unlock(&c->sb_lock); + + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) + if (btree_id_is_alloc(i)) +@@ -199,7 +206,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans, + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + BTREE_ITER_intent); +- int ret = bch2_btree_iter_traverse(trans, &iter); ++ int ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + +@@ -227,7 +234,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans, + + ret = bch2_trans_update(trans, &iter, new, BTREE_TRIGGER_norun); + out: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -262,16 +269,38 @@ static int bch2_journal_replay_key(struct btree_trans *trans, + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, + iter_flags); +- ret = bch2_btree_iter_traverse(trans, &iter); ++ ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + + struct btree_path *path = btree_iter_path(trans, &iter); + if (unlikely(!btree_path_node(path, k->level))) { +- bch2_trans_iter_exit(trans, &iter); ++ struct bch_fs *c = trans->c; ++ ++ CLASS(printbuf, buf)(); ++ prt_str(&buf, "btree="); ++ bch2_btree_id_to_text(&buf, k->btree_id); ++ prt_printf(&buf, " level=%u ", k->level); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k->k)); ++ ++ if (!(c->recovery.passes_complete & (BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes)| ++ 
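
The switch above amounts to a table from lost btree to the recovery passes that can rebuild it from other state: the alloc-adjacent btrees (alloc, need_discard, freespace, bucket_gens, lru) all funnel into check_alloc_info, backpointers into the two backpointer passes, and snapshots into reconstruct_snapshots plus the topology checks. The same information expressed as data, over a toy subset (enum values are ours):

    #include <stdint.h>

    enum btree { BT_ALLOC, BT_BACKPOINTERS, BT_LRU, BT_NR };
    enum rpass { P_CHECK_ALLOC_INFO, P_CHECK_BTREE_BACKPOINTERS,
                 P_CHECK_EXTENTS_TO_BACKPOINTERS };

    static const uint64_t passes_for_lost_btree[BT_NR] = {
        [BT_ALLOC]        = 1ULL << P_CHECK_ALLOC_INFO,
        [BT_BACKPOINTERS] = 1ULL << P_CHECK_BTREE_BACKPOINTERS |
                            1ULL << P_CHECK_EXTENTS_TO_BACKPOINTERS,
        [BT_LRU]          = 1ULL << P_CHECK_ALLOC_INFO,
    };
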
BIT_ULL(BCH_RECOVERY_PASS_check_topology)))) { ++ bch_err(c, "have key in journal replay for btree depth that does not exist, confused\n%s", ++ buf.buf); ++ ret = -EINVAL; ++ } ++ ++ if (!k->allocated) { ++ bch_notice(c, "dropping key in journal replay for depth that does not exist because we're recovering from scan\n%s", ++ buf.buf); ++ k->overwritten = true; ++ goto out; ++ } ++ ++ bch2_trans_iter_exit(&iter); + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, 0, iter_flags); +- ret = bch2_btree_iter_traverse(trans, &iter) ?: ++ ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_increase_depth(trans, iter.path, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; +@@ -282,13 +311,18 @@ static int bch2_journal_replay_key(struct btree_trans *trans, + goto out; + + if (k->k->k.type == KEY_TYPE_accounting) { +- ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k); ++ struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s); ++ ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ goto out; ++ ++ bkey_copy(n, k->k); + goto out; + } + + ret = bch2_trans_update(trans, &iter, k->k, update_flags); + out: +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -306,14 +340,15 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r) + return cmp_int(l->journal_seq - 1, r->journal_seq - 1); + } + ++DEFINE_DARRAY_NAMED(darray_journal_keys, struct journal_key *) ++ + int bch2_journal_replay(struct bch_fs *c) + { + struct journal_keys *keys = &c->journal_keys; +- DARRAY(struct journal_key *) keys_sorted = { 0 }; ++ CLASS(darray_journal_keys, keys_sorted)(); + struct journal *j = &c->journal; + u64 start_seq = c->journal_replay_seq_start; + u64 end_seq = c->journal_replay_seq_start; +- struct btree_trans *trans = NULL; + bool immediate_flush = false; + int ret = 0; + +@@ -321,13 +356,13 @@ int bch2_journal_replay(struct bch_fs *c) + ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", + keys->nr, start_seq, end_seq); + if (ret) +- goto err; ++ return ret; + } + + BUG_ON(!atomic_read(&keys->ref)); + + move_gap(keys, keys->nr); +- trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + + /* + * Replay accounting keys first: we can't allow the write buffer to +@@ -347,7 +382,7 @@ int bch2_journal_replay(struct bch_fs *c) + BCH_WATERMARK_reclaim, + bch2_journal_replay_accounting_key(trans, k)); + if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret))) +- goto err; ++ return ret; + + k->overwritten = true; + } +@@ -381,7 +416,7 @@ int bch2_journal_replay(struct bch_fs *c) + if (ret) { + ret = darray_push(&keys_sorted, k); + if (ret) +- goto err; ++ return ret; + } + } + +@@ -412,25 +447,19 @@ int bch2_journal_replay(struct bch_fs *c) + : 0), + bch2_journal_replay_key(trans, k)); + if (ret) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_btree_id_level_to_text(&buf, k->btree_id, k->level); + bch_err_msg(c, ret, "while replaying key at %s:", buf.buf); +- printbuf_exit(&buf); +- goto err; ++ return ret; + } + + BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); + } + +- /* +- * We need to put our btree_trans before calling flush_all_pins(), since +- * that will use a btree_trans internally +- */ +- bch2_trans_put(trans); +- trans = NULL; ++ bch2_trans_unlock_long(trans); + + if (!c->opts.retain_recovery_info && +- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) ++ 
c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) + bch2_journal_keys_put_initial(c); + + replay_now_at(j, j->replay_journal_seq_end); +@@ -446,12 +475,7 @@ int bch2_journal_replay(struct bch_fs *c) + + if (keys->nr) + bch2_journal_log_msg(c, "journal replay finished"); +-err: +- if (trans) +- bch2_trans_put(trans); +- darray_exit(&keys_sorted); +- bch_err_fn(c, ret); +- return ret; ++ return 0; + } + + /* journal replay early: */ +@@ -563,7 +587,7 @@ static int journal_replay_early(struct bch_fs *c, + + static int read_btree_roots(struct bch_fs *c) + { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { +@@ -585,9 +609,7 @@ static int read_btree_roots(struct bch_fs *c) + buf.buf, bch2_err_str(ret))) { + if (btree_id_is_alloc(i)) + r->error = 0; +- +- ret = bch2_btree_lost_data(c, i); +- BUG_ON(ret); ++ ret = 0; + } + } + +@@ -601,7 +623,6 @@ static int read_btree_roots(struct bch_fs *c) + } + } + fsck_err: +- printbuf_exit(&buf); + return ret; + } + +@@ -635,7 +656,7 @@ static bool check_version_upgrade(struct bch_fs *c) + } + + if (new_version > old_version) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + if (old_version < bcachefs_metadata_required_upgrade_below) + prt_str(&buf, "Version upgrade required:\n"); +@@ -667,15 +688,13 @@ static bool check_version_upgrade(struct bch_fs *c) + bch2_recovery_passes_from_stable(le64_to_cpu(passes))); + } + +- bch_info(c, "%s", buf.buf); +- printbuf_exit(&buf); +- ++ bch_notice(c, "%s", buf.buf); + ret = true; + } + +- if (new_version > c->sb.version_incompat && ++ if (new_version > c->sb.version_incompat_allowed && + c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + prt_str(&buf, "Now allowing incompatible features up to "); + bch2_version_to_text(&buf, new_version); +@@ -683,9 +702,7 @@ static bool check_version_upgrade(struct bch_fs *c) + bch2_version_to_text(&buf, c->sb.version_incompat_allowed); + prt_newline(&buf); + +- bch_info(c, "%s", buf.buf); +- printbuf_exit(&buf); +- ++ bch_notice(c, "%s", buf.buf); + ret = true; + } + +@@ -733,7 +750,24 @@ int bch2_fs_recovery(struct bch_fs *c) + ? 
min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) + : BCH_RECOVERY_PASS_snapshots_read; + c->opts.nochanges = true; ++ } ++ ++ if (c->opts.nochanges) + c->opts.read_only = true; ++ ++ if (c->opts.journal_rewind) { ++ bch_info(c, "rewinding journal, fsck required"); ++ c->opts.fsck = true; ++ } ++ ++ if (go_rw_in_recovery(c)) { ++ /* ++ * start workqueues/kworkers early - kthread creation checks for ++ * pending signals, which is _very_ annoying ++ */ ++ ret = bch2_fs_init_rw(c); ++ if (ret) ++ goto err; + } + + mutex_lock(&c->sb_lock); +@@ -748,15 +782,14 @@ int bch2_fs_recovery(struct bch_fs *c) + + u64 sb_passes = bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (sb_passes) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + prt_str(&buf, "superblock requires following recovery passes to be run:\n "); + prt_bitflags(&buf, bch2_recovery_passes, sb_passes); + bch_info(c, "%s", buf.buf); +- printbuf_exit(&buf); + } + + if (bch2_check_version_downgrade(c)) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + prt_str(&buf, "Version downgrade required:"); + +@@ -772,7 +805,6 @@ int bch2_fs_recovery(struct bch_fs *c) + } + + bch_info(c, "%s", buf.buf); +- printbuf_exit(&buf); + write_sb = true; + } + +@@ -790,11 +822,11 @@ int bch2_fs_recovery(struct bch_fs *c) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +- if (c->opts.fsck) +- set_bit(BCH_FS_fsck_running, &c->flags); + if (c->sb.clean) + set_bit(BCH_FS_clean_recovery, &c->flags); +- set_bit(BCH_FS_recovery_running, &c->flags); ++ if (c->opts.fsck) ++ set_bit(BCH_FS_in_fsck, &c->flags); ++ set_bit(BCH_FS_in_recovery, &c->flags); + + ret = bch2_blacklist_table_initialize(c); + if (ret) { +@@ -873,7 +905,7 @@ int bch2_fs_recovery(struct bch_fs *c) + use_clean: + if (!clean) { + bch_err(c, "no superblock clean section found"); +- ret = -BCH_ERR_fsck_repair_impossible; ++ ret = bch_err_throw(c, fsck_repair_impossible); + goto err; + + } +@@ -889,8 +921,36 @@ int bch2_fs_recovery(struct bch_fs *c) + if (ret) + goto err; + +- if (c->opts.reconstruct_alloc) ++ scoped_guard(rwsem_write, &c->state_lock) ++ ret = bch2_fs_resize_on_mount(c); ++ if (ret) ++ goto err; ++ ++ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { ++ bch_info(c, "filesystem is an unresized image file, mounting ro"); ++ c->opts.read_only = true; ++ } ++ ++ if (!c->opts.read_only && ++ (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) { ++ bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); ++ ++ bch2_reconstruct_alloc(c); ++ } else if (c->opts.reconstruct_alloc) { ++ bch2_journal_log_msg(c, "dropping alloc info"); ++ bch_info(c, "dropping and reconstructing all alloc info"); ++ + bch2_reconstruct_alloc(c); ++ } ++ ++ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { ++ /* We can't go RW to fix errors without alloc info */ ++ if (c->opts.fix_errors == FSCK_FIX_yes || ++ c->opts.fix_errors == FSCK_FIX_ask) ++ c->opts.fix_errors = FSCK_FIX_no; ++ if (c->opts.errors == BCH_ON_ERROR_fix_safe) ++ c->opts.errors = BCH_ON_ERROR_continue; ++ } + + /* + * After an unclean shutdown, skip then next few journal sequence +@@ -915,7 +975,7 @@ int bch2_fs_recovery(struct bch_fs *c) + + ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", + journal_seq, last_seq, blacklist_seq - 1) ?: +- bch2_fs_journal_start(&c->journal, journal_seq); ++ bch2_fs_journal_start(&c->journal, last_seq, journal_seq); + if (ret) + goto err; + +@@ 
-933,8 +993,10 @@ int bch2_fs_recovery(struct bch_fs *c) + set_bit(BCH_FS_btree_running, &c->flags); + + ret = bch2_sb_set_upgrade_extra(c); ++ if (ret) ++ goto err; + +- ret = bch2_run_recovery_passes(c); ++ ret = bch2_run_recovery_passes(c, 0); + if (ret) + goto err; + +@@ -945,8 +1007,7 @@ int bch2_fs_recovery(struct bch_fs *c) + * multithreaded use: + */ + set_bit(BCH_FS_may_go_rw, &c->flags); +- clear_bit(BCH_FS_fsck_running, &c->flags); +- clear_bit(BCH_FS_recovery_running, &c->flags); ++ clear_bit(BCH_FS_in_fsck, &c->flags); + + /* in case we don't run journal replay, i.e. norecovery mode */ + set_bit(BCH_FS_accounting_replay_done, &c->flags); +@@ -969,9 +1030,8 @@ int bch2_fs_recovery(struct bch_fs *c) + bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); + clear_bit(BCH_FS_errors_fixed, &c->flags); + +- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; +- +- ret = bch2_run_recovery_passes(c); ++ ret = bch2_run_recovery_passes(c, ++ BCH_RECOVERY_PASS_check_alloc_info); + if (ret) + goto err; + +@@ -1015,7 +1075,7 @@ int bch2_fs_recovery(struct bch_fs *c) + + if (c->opts.fsck && + !test_bit(BCH_FS_error, &c->flags) && +- c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && ++ c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 && + ext->btrees_lost_data) { + ext->btrees_lost_data = 0; + write_sb = true; +@@ -1042,10 +1102,9 @@ int bch2_fs_recovery(struct bch_fs *c) + + bch2_move_stats_init(&stats, "recovery"); + +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + bch2_version_to_text(&buf, c->sb.version_min); + bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf); +- printbuf_exit(&buf); + + ret = bch2_fs_read_write_early(c) ?: + bch2_scan_old_btree_nodes(c, &stats); +@@ -1058,13 +1117,6 @@ int bch2_fs_recovery(struct bch_fs *c) + out: + bch2_flush_fsck_errs(c); + +- if (!c->opts.retain_recovery_info) { +- bch2_journal_keys_put_initial(c); +- bch2_find_btree_nodes_exit(&c->found_btree_nodes); +- } +- if (!IS_ERR(clean)) +- kfree(clean); +- + if (!ret && + test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) && + !c->opts.nochanges) { +@@ -1073,11 +1125,22 @@ int bch2_fs_recovery(struct bch_fs *c) + } + + bch_err_fn(c, ret); ++final_out: ++ if (!IS_ERR(clean)) ++ kfree(clean); + return ret; + err: + fsck_err: +- bch2_fs_emergency_read_only(c); +- goto out; ++ { ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(c, &buf); ++ ++ prt_printf(&buf, "error in recovery: %s\n", bch2_err_str(ret)); ++ bch2_fs_emergency_read_only2(c, &buf); ++ ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ } ++ goto final_out; + } + + int bch2_fs_initialize(struct bch_fs *c) +@@ -1085,58 +1148,36 @@ int bch2_fs_initialize(struct bch_fs *c) + struct bch_inode_unpacked root_inode, lostfound_inode; + struct bkey_inode_buf packed_inode; + struct qstr lostfound = QSTR("lost+found"); +- struct bch_member *m; + int ret; + + bch_notice(c, "initializing new filesystem"); + set_bit(BCH_FS_new_fs, &c->flags); + +- mutex_lock(&c->sb_lock); +- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); +- c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); ++ scoped_guard(mutex, &c->sb_lock) { ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); ++ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + +- bch2_check_version_downgrade(c); ++ bch2_check_version_downgrade(c); + +- if (c->opts.version_upgrade != 
BCH_VERSION_UPGRADE_none) { +- bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); +- SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); +- bch2_write_super(c); +- } ++ if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { ++ bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); ++ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); ++ bch2_write_super(c); ++ } + +- for_each_member_device(c, ca) { +- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); +- SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); +- ca->mi = bch2_mi_to_cpu(m); +- } ++ for_each_member_device(c, ca) { ++ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); ++ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false); ++ } + +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); ++ bch2_write_super(c); ++ } + + set_bit(BCH_FS_btree_running, &c->flags); +- set_bit(BCH_FS_may_go_rw, &c->flags); + + for (unsigned i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc_fake(c, i, 0); + +- ret = bch2_fs_journal_alloc(c); +- if (ret) +- goto err; +- +- /* +- * journal_res_get() will crash if called before this has +- * set up the journal.pin FIFO and journal.cur pointer: +- */ +- ret = bch2_fs_journal_start(&c->journal, 1); +- if (ret) +- goto err; +- +- ret = bch2_fs_read_write_early(c); +- if (ret) +- goto err; +- +- set_bit(BCH_FS_accounting_replay_done, &c->flags); +- bch2_journal_set_replay_done(&c->journal); +- + for_each_member_device(c, ca) { + ret = bch2_dev_usage_init(ca, false); + if (ret) { +@@ -1155,6 +1196,27 @@ int bch2_fs_initialize(struct bch_fs *c) + if (ret) + goto err; + ++ ret = bch2_fs_journal_alloc(c); ++ if (ret) ++ goto err; ++ ++ /* ++ * journal_res_get() will crash if called before this has ++ * set up the journal.pin FIFO and journal.cur pointer: ++ */ ++ ret = bch2_fs_journal_start(&c->journal, 1, 1); ++ if (ret) ++ goto err; ++ ++ set_bit(BCH_FS_may_go_rw, &c->flags); ++ ret = bch2_fs_read_write_early(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_journal_replay(c); ++ if (ret) ++ goto err; ++ + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; +@@ -1193,7 +1255,7 @@ int bch2_fs_initialize(struct bch_fs *c) + if (ret) + goto err; + +- c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; ++ c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1; + + bch2_copygc_wakeup(c); + bch2_rebalance_wakeup(c); +@@ -1209,14 +1271,13 @@ int bch2_fs_initialize(struct bch_fs *c) + if (ret) + goto err; + +- mutex_lock(&c->sb_lock); +- SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); +- SET_BCH_SB_CLEAN(c->disk_sb.sb, false); +- +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); ++ scoped_guard(mutex, &c->sb_lock) { ++ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ bch2_write_super(c); ++ } + +- c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; ++ c->recovery.curr_pass = BCH_RECOVERY_PASS_NR; + return 0; + err: + bch_err_fn(c, ret); +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index b0d55754b21b..c023f52fc2d6 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -2,7 +2,8 @@ + #ifndef _BCACHEFS_RECOVERY_H + #define _BCACHEFS_RECOVERY_H + +-int bch2_btree_lost_data(struct bch_fs *, enum btree_id); ++int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id); ++void bch2_reconstruct_alloc(struct bch_fs *); + + int bch2_journal_replay(struct bch_fs *); + +diff --git a/fs/bcachefs/recovery_passes.c 
b/fs/bcachefs/recovery_passes.c +index 22f72bb5b853..bd442652d0f5 100644 +--- a/fs/bcachefs/recovery_passes.c ++++ b/fs/bcachefs/recovery_passes.c +@@ -28,6 +28,176 @@ const char * const bch2_recovery_passes[] = { + NULL + }; + ++static const u8 passes_to_stable_map[] = { ++#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, ++ BCH_RECOVERY_PASSES() ++#undef x ++}; ++ ++static const u8 passes_from_stable_map[] = { ++#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, ++ BCH_RECOVERY_PASSES() ++#undef x ++}; ++ ++static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) ++{ ++ return passes_to_stable_map[pass]; ++} ++ ++u64 bch2_recovery_passes_to_stable(u64 v) ++{ ++ u64 ret = 0; ++ for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) ++ if (v & BIT_ULL(i)) ++ ret |= BIT_ULL(passes_to_stable_map[i]); ++ return ret; ++} ++ ++static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass) ++{ ++ return pass < ARRAY_SIZE(passes_from_stable_map) ++ ? passes_from_stable_map[pass] ++ : 0; ++} ++ ++u64 bch2_recovery_passes_from_stable(u64 v) ++{ ++ u64 ret = 0; ++ for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++) ++ if (v & BIT_ULL(i)) ++ ret |= BIT_ULL(passes_from_stable_map[i]); ++ return ret; ++} ++ ++static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f, ++ enum bch_validate_flags flags, struct printbuf *err) ++{ ++ return 0; ++} ++ ++static void bch2_sb_recovery_passes_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_recovery_passes *r = ++ field_to_type(f, recovery_passes); ++ unsigned nr = recovery_passes_nr_entries(r); ++ ++ if (out->nr_tabstops < 1) ++ printbuf_tabstop_push(out, 32); ++ if (out->nr_tabstops < 2) ++ printbuf_tabstop_push(out, 16); ++ ++ prt_printf(out, "Pass\tLast run\tLast runtime\n"); ++ ++ for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) { ++ if (!i->last_run) ++ continue; ++ ++ unsigned idx = i - r->start; ++ ++ prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]); ++ ++ bch2_prt_datetime(out, le64_to_cpu(i->last_run)); ++ prt_tab(out); ++ ++ bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); ++ ++ if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) ++ prt_str(out, " (no ratelimit)"); ++ ++ prt_newline(out); ++ } ++} ++ ++static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c, ++ enum bch_recovery_pass pass) ++{ ++ enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ struct bch_sb_field_recovery_passes *r = ++ bch2_sb_field_get(c->disk_sb.sb, recovery_passes); ++ ++ if (stable >= recovery_passes_nr_entries(r)) { ++ unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64); ++ ++ r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); ++ if (!r) { ++ bch_err(c, "error creating recovery_passes sb section"); ++ return NULL; ++ } ++ } ++ ++ return r->start + stable; ++} ++ ++static void bch2_sb_recovery_pass_complete(struct bch_fs *c, ++ enum bch_recovery_pass pass, ++ s64 start_time) ++{ ++ guard(mutex)(&c->sb_lock); ++ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); ++ __clear_bit_le64(bch2_recovery_pass_to_stable(pass), ++ ext->recovery_passes_required); ++ ++ struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); ++ if (e) { ++ s64 
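
Recovery pass numbers are persisted in the superblock, so the on-disk ("stable") numbering must survive any reordering of the in-memory enum; the two generated tables above translate whole bitmasks in either direction. The translation itself, with toy three-entry tables standing in for the ones generated from BCH_RECOVERY_PASSES():

    #include <stdint.h>

    static const uint8_t to_stable[]   = { 0, 2, 1 };
    static const uint8_t from_stable[] = { 0, 2, 1 };

    static uint64_t remap(uint64_t v, const uint8_t *map, unsigned n)
    {
        uint64_t ret = 0;

        for (unsigned i = 0; i < n; i++)
            if (v & (1ULL << i))
                ret |= 1ULL << map[i];
        return ret;
    }

    /* remap(mask, to_stable, 3) before writing the superblock,
     * remap(mask, from_stable, 3) after reading it back. */
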
end_time = ktime_get_real_seconds(); ++ e->last_run = cpu_to_le64(end_time); ++ e->last_runtime = cpu_to_le32(max(0, end_time - start_time)); ++ SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); ++ } ++ ++ bch2_write_super(c); ++} ++ ++void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, ++ enum bch_recovery_pass pass) ++{ ++ guard(mutex)(&c->sb_lock); ++ ++ struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); ++ if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) { ++ SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); ++ bch2_write_super(c); ++ } ++} ++ ++static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) ++{ ++ enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); ++ bool ret = false; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ struct bch_sb_field_recovery_passes *r = ++ bch2_sb_field_get(c->disk_sb.sb, recovery_passes); ++ ++ if (stable < recovery_passes_nr_entries(r)) { ++ struct recovery_pass_entry *i = r->start + stable; ++ ++ /* ++ * Ratelimit if the last runtime was more than 1% of the time ++ * since we last ran ++ */ ++ ret = (u64) le32_to_cpu(i->last_runtime) * 100 > ++ ktime_get_real_seconds() - le64_to_cpu(i->last_run); ++ ++ if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) ++ ret = false; ++ } ++ ++ return ret; ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { ++ .validate = bch2_sb_recovery_passes_validate, ++ .to_text = bch2_sb_recovery_passes_to_text ++}; ++ + /* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ + static int bch2_recovery_pass_empty(struct bch_fs *c) + { +@@ -47,268 +217,436 @@ static int bch2_set_may_go_rw(struct bch_fs *c) + + set_bit(BCH_FS_may_go_rw, &c->flags); + +- if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) ++ if (go_rw_in_recovery(c)) { ++ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { ++ bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); ++ bch2_reconstruct_alloc(c); ++ } ++ + return bch2_fs_read_write_early(c); ++ } + return 0; + } + ++/* ++ * Make sure root inode is readable while we're still in recovery and can rewind ++ * for repair: ++ */ ++static int bch2_lookup_root_inode(struct bch_fs *c) ++{ ++ subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM; ++ struct bch_inode_unpacked inode_u; ++ struct bch_subvolume subvol; ++ CLASS(btree_trans, trans)(c); ++ ++ return lockrestart_do(trans, ++ bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: ++ bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); ++} ++ + struct recovery_pass_fn { + int (*fn)(struct bch_fs *); ++ const char *name; + unsigned when; + }; + + static struct recovery_pass_fn recovery_pass_fns[] = { +-#define x(_fn, _id, _when) { .fn = bch2_##_fn, .when = _when }, ++#define x(_fn, _id, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when }, + BCH_RECOVERY_PASSES() + #undef x + }; + +-static const u8 passes_to_stable_map[] = { +-#define x(n, id, ...) 
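
bch2_recovery_pass_want_ratelimit() above throttles automatically rescheduled passes by cost: a pass is skipped while its last runtime exceeds 1% of the wall-clock time elapsed since that run, so a pass that took 60 seconds will not be auto-rerun for roughly 6000 seconds, unless its no-ratelimit bit is set. As a standalone predicate (seconds throughout, matching the patch):

    #include <stdbool.h>
    #include <stdint.h>

    static bool want_ratelimit(uint32_t last_runtime_s, uint64_t last_run_s,
                               uint64_t now_s, bool no_ratelimit)
    {
        if (no_ratelimit)
            return false;
        return (uint64_t)last_runtime_s * 100 > now_s - last_run_s;
    }
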
[BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
+- BCH_RECOVERY_PASSES()
+-#undef x
+-};
++static u64 bch2_recovery_passes_match(unsigned flags)
++{
++ u64 ret = 0;
+
+-static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
++ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
++ if (recovery_pass_fns[i].when & flags)
++ ret |= BIT_ULL(i);
++ return ret;
++}
++
++u64 bch2_fsck_recovery_passes(void)
+ {
+- return passes_to_stable_map[pass];
++ return bch2_recovery_passes_match(PASS_FSCK);
+ }
+
+-u64 bch2_recovery_passes_to_stable(u64 v)
++static void bch2_run_async_recovery_passes(struct bch_fs *c)
+ {
+- u64 ret = 0;
+- for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
+- if (v & BIT_ULL(i))
+- ret |= BIT_ULL(passes_to_stable_map[i]);
+- return ret;
++ if (down_trylock(&c->recovery.run_lock))
++ return;
++
++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes))
++ goto unlock;
++
++ if (queue_work(system_long_wq, &c->recovery.work))
++ return;
++
++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes);
++unlock:
++ up(&c->recovery.run_lock);
+ }
+
+-u64 bch2_recovery_passes_from_stable(u64 v)
++static bool recovery_pass_needs_set(struct bch_fs *c,
++ enum bch_recovery_pass pass,
++ enum bch_run_recovery_pass_flags *flags)
+ {
+- static const u8 map[] = {
+-#define x(n, id, ...) [BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
+- BCH_RECOVERY_PASSES()
+-#undef x
+- };
++ struct bch_fs_recovery *r = &c->recovery;
+
+- u64 ret = 0;
+- for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+- if (v & BIT_ULL(i))
+- ret |= BIT_ULL(map[i]);
+- return ret;
++ /*
++ * Never run scan_for_btree_nodes persistently: check_topology will run
++ * it if required
++ */
++ if (pass == BCH_RECOVERY_PASS_scan_for_btree_nodes)
++ *flags |= RUN_RECOVERY_PASS_nopersistent;
++
++ if ((*flags & RUN_RECOVERY_PASS_ratelimit) &&
++ !bch2_recovery_pass_want_ratelimit(c, pass))
++ *flags &= ~RUN_RECOVERY_PASS_ratelimit;
++
++ /*
++ * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do
++ * anything if the pass has already run: these mean we need a prior pass
++ * to run before we continue to repair, we don't expect that pass to fix
++ * the damage we encountered.
++ *
++ * Otherwise, we run run_explicit_recovery_pass when we find damage, so
++ * it should run again even if it's already run:
++ */
++ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags);
++ bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent);
++ bool rewind = in_recovery &&
++ r->curr_pass > pass &&
++ !(r->passes_complete & BIT_ULL(pass));
++
++ if (persistent
++ ?
!(c->sb.recovery_passes_required & BIT_ULL(pass)) ++ : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass))) ++ return true; ++ ++ if (!(*flags & RUN_RECOVERY_PASS_ratelimit) && ++ (r->passes_ratelimiting & BIT_ULL(pass))) ++ return true; ++ ++ if (rewind) ++ return true; ++ ++ return false; + } + + /* + * For when we need to rewind recovery passes and run a pass we skipped: + */ +-static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, +- enum bch_recovery_pass pass) ++int __bch2_run_explicit_recovery_pass(struct bch_fs *c, ++ struct printbuf *out, ++ enum bch_recovery_pass pass, ++ enum bch_run_recovery_pass_flags flags, ++ bool *write_sb) + { +- if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) +- return -BCH_ERR_not_in_recovery; ++ struct bch_fs_recovery *r = &c->recovery; ++ int ret = 0; ++ ++ lockdep_assert_held(&c->sb_lock); + +- if (c->recovery_passes_complete & BIT_ULL(pass)) ++ bch2_printbuf_make_room(out, 1024); ++ guard(printbuf_atomic)(out); ++ guard(spinlock_irq)(&r->lock); ++ ++ if (!recovery_pass_needs_set(c, pass, &flags)) + return 0; + +- bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); ++ bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); ++ bool rewind = in_recovery && ++ r->curr_pass > pass && ++ !(r->passes_complete & BIT_ULL(pass)); ++ bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; ++ ++ if (!(flags & RUN_RECOVERY_PASS_nopersistent)) { ++ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); ++ *write_sb |= !__test_and_set_bit_le64(bch2_recovery_pass_to_stable(pass), ++ ext->recovery_passes_required); ++ } + + if (pass < BCH_RECOVERY_PASS_set_may_go_rw && +- c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { +- if (print) +- bch_info(c, "need recovery pass %s (%u), but already rw", +- bch2_recovery_passes[pass], pass); +- return -BCH_ERR_cannot_rewind_recovery; ++ (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { ++ prt_printf(out, "need recovery pass %s (%u), but already rw\n", ++ bch2_recovery_passes[pass], pass); ++ return bch_err_throw(c, cannot_rewind_recovery); + } + +- if (print) +- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", +- bch2_recovery_passes[pass], pass, +- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); ++ if (ratelimit) ++ r->passes_ratelimiting |= BIT_ULL(pass); ++ else ++ r->passes_ratelimiting &= ~BIT_ULL(pass); ++ ++ if (in_recovery && !ratelimit) { ++ prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", ++ bch2_recovery_passes[pass], pass, ++ bch2_recovery_passes[r->curr_pass], r->curr_pass, ++ rewind ? " - rewinding" : ""); + +- c->opts.recovery_passes |= BIT_ULL(pass); ++ r->passes_to_run |= BIT_ULL(pass); + +- if (c->curr_recovery_pass > pass) { +- c->next_recovery_pass = pass; +- c->recovery_passes_complete &= (1ULL << pass) >> 1; +- return -BCH_ERR_restart_recovery; ++ if (rewind) { ++ r->next_pass = pass; ++ r->passes_complete &= (1ULL << pass) >> 1; ++ ret = bch_err_throw(c, restart_recovery); ++ } + } else { +- return 0; ++ prt_printf(out, "scheduling recovery pass %s (%u)%s\n", ++ bch2_recovery_passes[pass], pass, ++ ratelimit ? 
" - ratelimiting" : ""); ++ ++ struct recovery_pass_fn *p = recovery_pass_fns + pass; ++ if (p->when & PASS_ONLINE) ++ bch2_run_async_recovery_passes(c); + } ++ ++ return ret; + } + + int bch2_run_explicit_recovery_pass(struct bch_fs *c, +- enum bch_recovery_pass pass) ++ struct printbuf *out, ++ enum bch_recovery_pass pass, ++ enum bch_run_recovery_pass_flags flags) + { +- unsigned long flags; +- spin_lock_irqsave(&c->recovery_pass_lock, flags); +- int ret = __bch2_run_explicit_recovery_pass(c, pass); +- spin_unlock_irqrestore(&c->recovery_pass_lock, flags); ++ /* ++ * With RUN_RECOVERY_PASS_ratelimit, recovery_pass_needs_set needs ++ * sb_lock ++ */ ++ if (!(flags & RUN_RECOVERY_PASS_ratelimit) && ++ !recovery_pass_needs_set(c, pass, &flags)) ++ return 0; ++ ++ guard(mutex)(&c->sb_lock); ++ bool write_sb = false; ++ int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb); ++ if (write_sb) ++ bch2_write_super(c); + return ret; + } + +-int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, +- enum bch_recovery_pass pass) ++/* ++ * Returns 0 if @pass has run recently, otherwise one of ++ * -BCH_ERR_restart_recovery ++ * -BCH_ERR_recovery_pass_will_run ++ */ ++int bch2_require_recovery_pass(struct bch_fs *c, ++ struct printbuf *out, ++ enum bch_recovery_pass pass) + { +- lockdep_assert_held(&c->sb_lock); +- +- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); +- __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); ++ if (test_bit(BCH_FS_in_recovery, &c->flags) && ++ c->recovery.passes_complete & BIT_ULL(pass)) ++ return 0; + +- return bch2_run_explicit_recovery_pass(c, pass); +-} ++ guard(mutex)(&c->sb_lock); + +-int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, +- enum bch_recovery_pass pass) +-{ +- enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); ++ if (bch2_recovery_pass_want_ratelimit(c, pass)) ++ return 0; + +- mutex_lock(&c->sb_lock); +- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); ++ enum bch_run_recovery_pass_flags flags = 0; + +- if (!test_bit_le64(s, ext->recovery_passes_required)) { +- __set_bit_le64(s, ext->recovery_passes_required); ++ bool write_sb = false; ++ int ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags, &write_sb) ?: ++ bch_err_throw(c, recovery_pass_will_run); ++ if (write_sb) + bch2_write_super(c); +- } +- mutex_unlock(&c->sb_lock); +- +- return bch2_run_explicit_recovery_pass(c, pass); ++ return ret; + } + +-static void bch2_clear_recovery_pass_required(struct bch_fs *c, +- enum bch_recovery_pass pass) ++int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) + { +- enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); ++ enum bch_run_recovery_pass_flags flags = 0; + +- mutex_lock(&c->sb_lock); +- struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); ++ if (!recovery_pass_needs_set(c, pass, &flags)) ++ return 0; + +- if (test_bit_le64(s, ext->recovery_passes_required)) { +- __clear_bit_le64(s, ext->recovery_passes_required); +- bch2_write_super(c); +- } +- mutex_unlock(&c->sb_lock); +-} ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(c, &buf); + +-u64 bch2_fsck_recovery_passes(void) +-{ +- u64 ret = 0; ++ guard(mutex)(&c->sb_lock); ++ bool write_sb = false; ++ int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass, ++ RUN_RECOVERY_PASS_nopersistent, ++ &write_sb); + +- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) +- if 
(recovery_pass_fns[i].when & PASS_FSCK) +- ret |= BIT_ULL(i); ++ bch2_print_str(c, KERN_NOTICE, buf.buf); + return ret; + } + +-static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +-{ +- struct recovery_pass_fn *p = recovery_pass_fns + pass; +- +- if (c->opts.recovery_passes_exclude & BIT_ULL(pass)) +- return false; +- if (c->opts.recovery_passes & BIT_ULL(pass)) +- return true; +- if ((p->when & PASS_FSCK) && c->opts.fsck) +- return true; +- if ((p->when & PASS_UNCLEAN) && !c->sb.clean) +- return true; +- if (p->when & PASS_ALWAYS) +- return true; +- return false; +-} +- + static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) + { ++ struct bch_fs_recovery *r = &c->recovery; + struct recovery_pass_fn *p = recovery_pass_fns + pass; +- int ret; + + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), + bch2_recovery_passes[pass]); +- ret = p->fn(c); +- if (ret) ++ ++ s64 start_time = ktime_get_real_seconds(); ++ int ret = p->fn(c); ++ ++ r->passes_to_run &= ~BIT_ULL(pass); ++ ++ if (ret) { ++ bch_err(c, "%s(): error %s", p->name, bch2_err_str(ret)); ++ r->passes_failing |= BIT_ULL(pass); + return ret; ++ } ++ ++ r->passes_failing = 0; ++ ++ if (!test_bit(BCH_FS_error, &c->flags)) ++ bch2_sb_recovery_pass_complete(c, pass, start_time); ++ + if (!(p->when & PASS_SILENT)) + bch2_print(c, KERN_CONT " done\n"); + + return 0; + } + +-int bch2_run_online_recovery_passes(struct bch_fs *c) ++static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, ++ bool online) + { +- for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { +- struct recovery_pass_fn *p = recovery_pass_fns + i; ++ struct bch_fs_recovery *r = &c->recovery; ++ int ret = 0; + +- if (!(p->when & PASS_ONLINE)) +- continue; ++ spin_lock_irq(&r->lock); + +- int ret = bch2_run_recovery_pass(c, i); +- if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { +- i = c->curr_recovery_pass; +- continue; +- } +- if (ret) +- return ret; +- } ++ if (online) ++ orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE); + +- return 0; +-} +- +-int bch2_run_recovery_passes(struct bch_fs *c) +-{ +- int ret = 0; ++ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) ++ orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC); + + /* +- * We can't allow set_may_go_rw to be excluded; that would cause us to +- * use the journal replay keys for updates where it's not expected. ++ * A failed recovery pass will be retried after another pass succeeds - ++ * but not this iteration. ++ * ++ * This is because some passes depend on repair done by other passes: we ++ * may want to retry, but we don't want to loop on failing passes. 
+ */ +- c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; + +- spin_lock_irq(&c->recovery_pass_lock); ++ orig_passes_to_run &= ~r->passes_failing; + +- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { +- unsigned prev_done = c->recovery_pass_done; +- unsigned pass = c->curr_recovery_pass; ++ r->passes_to_run = orig_passes_to_run; + +- c->next_recovery_pass = pass + 1; ++ while (r->passes_to_run) { ++ unsigned prev_done = r->pass_done; ++ unsigned pass = __ffs64(r->passes_to_run); ++ r->curr_pass = pass; ++ r->next_pass = r->curr_pass + 1; ++ r->passes_to_run &= ~BIT_ULL(pass); + +- if (c->opts.recovery_pass_last && +- c->curr_recovery_pass > c->opts.recovery_pass_last) +- break; ++ spin_unlock_irq(&r->lock); ++ ++ int ret2 = bch2_run_recovery_pass(c, pass) ?: ++ bch2_journal_flush(&c->journal); + +- if (should_run_recovery_pass(c, pass)) { +- spin_unlock_irq(&c->recovery_pass_lock); +- ret = bch2_run_recovery_pass(c, pass) ?: +- bch2_journal_flush(&c->journal); +- +- if (!ret && !test_bit(BCH_FS_error, &c->flags)) +- bch2_clear_recovery_pass_required(c, pass); +- spin_lock_irq(&c->recovery_pass_lock); +- +- if (c->next_recovery_pass < c->curr_recovery_pass) { +- /* +- * bch2_run_explicit_recovery_pass() was called: we +- * can't always catch -BCH_ERR_restart_recovery because +- * it may have been called from another thread (btree +- * node read completion) +- */ +- ret = 0; +- c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); +- } else { +- c->recovery_passes_complete |= BIT_ULL(pass); +- c->recovery_pass_done = max(c->recovery_pass_done, pass); +- } ++ spin_lock_irq(&r->lock); ++ ++ if (r->next_pass < r->curr_pass) { ++ /* Rewind: */ ++ r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass); ++ } else if (!ret2) { ++ r->pass_done = max(r->pass_done, pass); ++ r->passes_complete |= BIT_ULL(pass); ++ } else { ++ ret = ret2; + } + +- c->curr_recovery_pass = c->next_recovery_pass; ++ if (ret && !online) ++ break; + + if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && +- c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots) { ++ r->pass_done > BCH_RECOVERY_PASS_check_snapshots) { + bch2_copygc_wakeup(c); + bch2_rebalance_wakeup(c); + } + } + +- spin_unlock_irq(&c->recovery_pass_lock); ++ clear_bit(BCH_FS_in_recovery, &c->flags); ++ spin_unlock_irq(&r->lock); ++ ++ return ret; ++} ++ ++static void bch2_async_recovery_passes_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, recovery.work); ++ struct bch_fs_recovery *r = &c->recovery; ++ ++ __bch2_run_recovery_passes(c, ++ c->sb.recovery_passes_required & ~r->passes_ratelimiting, ++ true); ++ ++ up(&r->run_lock); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); ++} ++ ++int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes) ++{ ++ return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true); ++} ++ ++int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) ++{ ++ u64 passes = ++ bch2_recovery_passes_match(PASS_ALWAYS) | ++ (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) | ++ (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) | ++ c->opts.recovery_passes | ++ c->sb.recovery_passes_required; ++ ++ if (c->opts.recovery_pass_last) ++ passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1; ++ ++ /* ++ * We can't allow set_may_go_rw to be excluded; that would cause us to ++ * use the journal replay keys for updates where it's not expected. 
++ */ ++ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; ++ passes &= ~c->opts.recovery_passes_exclude; ++ ++ passes &= ~(BIT_ULL(from) - 1); ++ ++ down(&c->recovery.run_lock); ++ int ret = __bch2_run_recovery_passes(c, passes, false); ++ up(&c->recovery.run_lock); + + return ret; + } ++ ++static void prt_passes(struct printbuf *out, const char *msg, u64 passes) ++{ ++ prt_printf(out, "%s:\t", msg); ++ prt_bitflags(out, bch2_recovery_passes, passes); ++ prt_newline(out); ++} ++ ++void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct bch_fs_recovery *r = &c->recovery; ++ ++ printbuf_tabstop_push(out, 32); ++ prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required); ++ prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required & ++ bch2_recovery_passes_match(PASS_ONLINE)); ++ prt_passes(out, "Complete passes", r->passes_complete); ++ prt_passes(out, "Failing passes", r->passes_failing); ++ ++ if (r->curr_pass) { ++ prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]); ++ prt_passes(out, "Current passes", r->passes_to_run); ++ } ++ ++ prt_printf(out, "Pass done:\t%s\n", bch2_recovery_passes[r->pass_done]); ++} ++ ++void bch2_fs_recovery_passes_init(struct bch_fs *c) ++{ ++ spin_lock_init(&c->recovery.lock); ++ sema_init(&c->recovery.run_lock, 1); ++ ++ INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work); ++} +diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h +index 7d7339c8fa29..95e3612bb96c 100644 +--- a/fs/bcachefs/recovery_passes.h ++++ b/fs/bcachefs/recovery_passes.h +@@ -3,16 +3,53 @@ + + extern const char * const bch2_recovery_passes[]; + ++extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes; ++ + u64 bch2_recovery_passes_to_stable(u64 v); + u64 bch2_recovery_passes_from_stable(u64 v); + + u64 bch2_fsck_recovery_passes(void); + +-int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); +-int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass); +-int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); ++void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass); ++ ++enum bch_run_recovery_pass_flags { ++ RUN_RECOVERY_PASS_nopersistent = BIT(0), ++ RUN_RECOVERY_PASS_ratelimit = BIT(1), ++}; ++ ++static inline bool go_rw_in_recovery(struct bch_fs *c) ++{ ++ return (c->journal_keys.nr || ++ !c->opts.read_only || ++ !c->sb.clean || ++ c->opts.recovery_passes || ++ (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))); ++} ++ ++static inline bool recovery_pass_will_run(struct bch_fs *c, enum bch_recovery_pass pass) ++{ ++ return unlikely(test_bit(BCH_FS_in_recovery, &c->flags) && ++ c->recovery.passes_to_run & BIT_ULL(pass)); ++} ++ ++int bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); ++ ++int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, ++ enum bch_recovery_pass, ++ enum bch_run_recovery_pass_flags, ++ bool *); ++int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, ++ enum bch_recovery_pass, ++ enum bch_run_recovery_pass_flags); ++ ++int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *, ++ enum bch_recovery_pass); ++ ++int bch2_run_online_recovery_passes(struct bch_fs *, u64); ++int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); ++ ++void bch2_recovery_pass_status_to_text(struct printbuf *, 
struct bch_fs *); + +-int bch2_run_online_recovery_passes(struct bch_fs *); +-int bch2_run_recovery_passes(struct bch_fs *); ++void bch2_fs_recovery_passes_init(struct bch_fs *); + + #endif /* _BCACHEFS_RECOVERY_PASSES_H */ +diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h +new file mode 100644 +index 000000000000..b63c20558d3d +--- /dev/null ++++ b/fs/bcachefs/recovery_passes_format.h +@@ -0,0 +1,106 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H ++#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H ++ ++#define PASS_SILENT BIT(0) ++#define PASS_FSCK BIT(1) ++#define PASS_UNCLEAN BIT(2) ++#define PASS_ALWAYS BIT(3) ++#define PASS_ONLINE BIT(4) ++#define PASS_ALLOC BIT(5) ++#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC) ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++#define PASS_FSCK_DEBUG BIT(1) ++#else ++#define PASS_FSCK_DEBUG 0 ++#endif ++ ++/* ++ * Passes may be reordered, but the second field is a persistent identifier and ++ * must never change: ++ */ ++#define BCH_RECOVERY_PASSES() \ ++ x(recovery_pass_empty, 41, PASS_SILENT) \ ++ x(scan_for_btree_nodes, 37, 0) \ ++ x(check_topology, 4, 0) \ ++ x(accounting_read, 39, PASS_ALWAYS) \ ++ x(alloc_read, 0, PASS_ALWAYS) \ ++ x(stripes_read, 1, 0) \ ++ x(initialize_subvolumes, 2, 0) \ ++ x(snapshots_read, 3, PASS_ALWAYS) \ ++ x(check_allocations, 5, PASS_FSCK_ALLOC) \ ++ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ ++ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ ++ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ ++ x(journal_replay, 9, PASS_ALWAYS) \ ++ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \ ++ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \ ++ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \ ++ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ ++ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \ ++ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \ ++ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ ++ x(bucket_gens_init, 17, 0) \ ++ x(reconstruct_snapshots, 38, 0) \ ++ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ ++ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ ++ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ ++ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ ++ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ ++ x(fs_upgrade_for_subvolumes, 22, 0) \ ++ x(check_inodes, 24, PASS_FSCK) \ ++ x(check_extents, 25, PASS_FSCK) \ ++ x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ ++ x(check_dirents, 27, PASS_FSCK) \ ++ x(check_xattrs, 28, PASS_FSCK) \ ++ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ ++ x(check_unreachable_inodes, 40, PASS_FSCK) \ ++ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ ++ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ ++ x(check_nlinks, 31, PASS_FSCK) \ ++ x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \ ++ x(resume_logged_ops, 23, PASS_ALWAYS) \ ++ x(delete_dead_inodes, 32, PASS_ALWAYS) \ ++ x(fix_reflink_p, 33, 0) \ ++ x(set_fs_needs_rebalance, 34, 0) \ ++ x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT) ++ ++/* We normally enumerate recovery passes in the order we run them: */ ++enum bch_recovery_pass { ++#define x(n, id, when) BCH_RECOVERY_PASS_##n, ++ BCH_RECOVERY_PASSES() ++#undef x ++ BCH_RECOVERY_PASS_NR ++}; ++ ++/* But we also need stable identifiers that can be used in the superblock */ ++enum bch_recovery_pass_stable { ++#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, ++ 
BCH_RECOVERY_PASSES() ++#undef x ++}; ++ ++struct recovery_pass_entry { ++ __le64 last_run; ++ __le32 last_runtime; ++ __le32 flags; ++}; ++ ++LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT, struct recovery_pass_entry, flags, 0, 1) ++ ++struct bch_sb_field_recovery_passes { ++ struct bch_sb_field field; ++ struct recovery_pass_entry start[]; ++}; ++ ++static inline unsigned ++recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r) ++{ ++ return r ++ ? ((vstruct_end(&r->field) - (void *) &r->start[0]) / ++ sizeof(struct recovery_pass_entry)) ++ : 0; ++} ++ ++#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */ +diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h +index e89b9c783285..aa9526938cc3 100644 +--- a/fs/bcachefs/recovery_passes_types.h ++++ b/fs/bcachefs/recovery_passes_types.h +@@ -2,79 +2,26 @@ + #ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H + #define _BCACHEFS_RECOVERY_PASSES_TYPES_H + +-#define PASS_SILENT BIT(0) +-#define PASS_FSCK BIT(1) +-#define PASS_UNCLEAN BIT(2) +-#define PASS_ALWAYS BIT(3) +-#define PASS_ONLINE BIT(4) +- +-#ifdef CONFIG_BCACHEFS_DEBUG +-#define PASS_FSCK_DEBUG BIT(1) +-#else +-#define PASS_FSCK_DEBUG 0 +-#endif +- +-/* +- * Passes may be reordered, but the second field is a persistent identifier and +- * must never change: +- */ +-#define BCH_RECOVERY_PASSES() \ +- x(recovery_pass_empty, 41, PASS_SILENT) \ +- x(scan_for_btree_nodes, 37, 0) \ +- x(check_topology, 4, 0) \ +- x(accounting_read, 39, PASS_ALWAYS) \ +- x(alloc_read, 0, PASS_ALWAYS) \ +- x(stripes_read, 1, 0) \ +- x(initialize_subvolumes, 2, 0) \ +- x(snapshots_read, 3, PASS_ALWAYS) \ +- x(check_allocations, 5, PASS_FSCK) \ +- x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ +- x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ +- x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ +- x(journal_replay, 9, PASS_ALWAYS) \ +- x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ +- x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ +- x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ +- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ +- x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ +- x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ +- x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ +- x(bucket_gens_init, 17, 0) \ +- x(reconstruct_snapshots, 38, 0) \ +- x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ +- x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ +- x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ +- x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ +- x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ +- x(fs_upgrade_for_subvolumes, 22, 0) \ +- x(check_inodes, 24, PASS_FSCK) \ +- x(check_extents, 25, PASS_FSCK) \ +- x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ +- x(check_dirents, 27, PASS_FSCK) \ +- x(check_xattrs, 28, PASS_FSCK) \ +- x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ +- x(check_unreachable_inodes, 40, PASS_FSCK) \ +- x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ +- x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ +- x(check_nlinks, 31, PASS_FSCK) \ +- x(resume_logged_ops, 23, PASS_ALWAYS) \ +- x(delete_dead_inodes, 32, PASS_ALWAYS) \ +- x(fix_reflink_p, 33, 0) \ +- x(set_fs_needs_rebalance, 34, 0) +- +-/* We normally enumerate recovery passes in the order we run them: */ +-enum bch_recovery_pass { +-#define x(n, id, when) BCH_RECOVERY_PASS_##n, +- BCH_RECOVERY_PASSES() +-#undef x +- BCH_RECOVERY_PASS_NR +-}; +- +-/* But we also need stable identifiers that can be used in 
the superblock */ +-enum bch_recovery_pass_stable { +-#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, +- BCH_RECOVERY_PASSES() +-#undef x ++struct bch_fs_recovery { ++ /* ++ * Two different uses: ++ * "Has this fsck pass?" - i.e. should this type of error be an ++ * emergency read-only ++ * And, in certain situations fsck will rewind to an earlier pass: used ++ * for signaling to the toplevel code which pass we want to run now. ++ */ ++ enum bch_recovery_pass curr_pass; ++ enum bch_recovery_pass next_pass; ++ /* never rewinds version of curr_pass */ ++ enum bch_recovery_pass pass_done; ++ u64 passes_to_run; ++ /* bitmask of recovery passes that we actually ran */ ++ u64 passes_complete; ++ u64 passes_failing; ++ u64 passes_ratelimiting; ++ spinlock_t lock; ++ struct semaphore run_lock; ++ struct work_struct work; + }; + + #endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index 710178e3da4c..238a362de19e 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -3,6 +3,7 @@ + #include "bkey_buf.h" + #include "btree_update.h" + #include "buckets.h" ++#include "enumerated_ref.h" + #include "error.h" + #include "extents.h" + #include "inode.h" +@@ -63,6 +64,9 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, + REFLINK_P_IDX(p.v), + le32_to_cpu(p.v->front_pad), + le32_to_cpu(p.v->back_pad)); ++ ++ if (REFLINK_P_ERROR(p.v)) ++ prt_str(out, " error"); + } + + bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +@@ -163,7 +167,7 @@ static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bk + return 0; + + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: +- -BCH_ERR_transaction_restart_nested; ++ bch_err_throw(trans->c, transaction_restart_nested); + } + + static int bch2_indirect_extent_missing_error(struct btree_trans *trans, +@@ -179,7 +183,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, + u64 live_end = REFLINK_P_IDX(p.v) + p.k->size; + u64 refd_start = live_start - le32_to_cpu(p.v->front_pad); + u64 refd_end = live_end + le32_to_cpu(p.v->back_pad); +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; + + BUG_ON(missing_start < refd_start); +@@ -191,7 +195,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, + prt_printf(&buf, "pointer to missing indirect extent in "); + ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos); + if (ret) +- goto err; ++ return ret; + + prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9); + bch2_bkey_val_to_text(&buf, c, p.s_c); +@@ -203,7 +207,7 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, + struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); + ret = PTR_ERR_OR_ZERO(new); + if (ret) +- goto err; ++ return ret; + + /* + * Is the missing range not actually needed? 
+@@ -234,15 +238,13 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, + + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); + if (ret) +- goto err; ++ return ret; + + if (should_commit) + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: +- -BCH_ERR_transaction_restart_nested; ++ bch_err_throw(c, transaction_restart_nested); + } +-err: + fsck_err: +- printbuf_exit(&buf); + return ret; + } + +@@ -262,33 +264,32 @@ struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, + + u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent; + +- struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink, +- POS(0, reflink_offset), iter_flags); +- if (bkey_err(k)) +- return k; ++ bch2_trans_iter_init(trans, iter, BTREE_ID_reflink, POS(0, reflink_offset), iter_flags); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ++ int ret = bkey_err(k); ++ if (ret) ++ goto err; + + if (unlikely(!bkey_extent_is_reflink_data(k.k))) { +- unsigned size = min((u64) k.k->size, +- REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - +- reflink_offset); +- bch2_key_resize(&iter->k, size); +- +- int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, +- k.k->p.offset, should_commit); +- if (ret) { +- bch2_trans_iter_exit(trans, iter); +- return bkey_s_c_err(ret); +- } ++ u64 missing_end = min(k.k->p.offset, ++ REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad)); ++ BUG_ON(reflink_offset == missing_end); ++ ++ ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, ++ missing_end, should_commit); ++ if (ret) ++ goto err; + } else if (unlikely(REFLINK_P_ERROR(p.v))) { +- int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); +- if (ret) { +- bch2_trans_iter_exit(trans, iter); +- return bkey_s_c_err(ret); +- } ++ ret = bch2_indirect_extent_not_missing(trans, p, should_commit); ++ if (ret) ++ goto err; + } + + *offset_into_extent = reflink_offset - bkey_start_offset(k.k); + return k; ++err: ++ bch2_trans_iter_exit(iter); ++ return bkey_s_c_err(ret); + } + + /* reflink pointer trigger */ +@@ -298,7 +299,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, + enum btree_iter_update_trigger_flags flags) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v); + struct btree_iter iter; +@@ -311,7 +312,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, + + if (!bkey_refcount_c(k)) { + if (!(flags & BTREE_TRIGGER_overwrite)) +- ret = -BCH_ERR_missing_indirect_extent; ++ ret = bch_err_throw(c, missing_indirect_extent); + goto next; + } + +@@ -356,8 +357,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, + *idx = k.k->p.offset; + err: + fsck_err: +- bch2_trans_iter_exit(trans, &iter); +- printbuf_exit(&buf); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -371,7 +371,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, + int add = !(flags & BTREE_TRIGGER_overwrite) ? 
1 : -1; + u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); + s64 ret = 0; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + if (r_idx >= c->reflink_gc_nr) + goto not_found; +@@ -391,12 +391,10 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, + if (flags & BTREE_TRIGGER_check_repair) { + ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); + if (ret) +- goto err; ++ return ret; + } + + *idx = next_idx; +-err: +- printbuf_exit(&buf); + return ret; + } + +@@ -495,22 +493,16 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + bool reflink_p_may_update_opts_field) + { + struct bch_fs *c = trans->c; +- struct btree_iter reflink_iter = {}; +- struct bkey_s_c k; +- struct bkey_i *r_v; +- struct bkey_i_reflink_p *r_p; +- __le64 *refcount; +- int ret; + + if (orig->k.type == KEY_TYPE_inline_data) + bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); + +- bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, +- BTREE_ITER_intent); +- k = bch2_btree_iter_peek_prev(trans, &reflink_iter); +- ret = bkey_err(k); ++ CLASS(btree_iter, reflink_iter)(trans, BTREE_ID_reflink, POS_MAX, ++ BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek_prev(&reflink_iter); ++ int ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + /* + * XXX: we're assuming that 56 bits will be enough for the life of the +@@ -520,10 +512,10 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size))) + return -ENOSPC; + +- r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ++ struct bkey_i *r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); + ret = PTR_ERR_OR_ZERO(r_v); + if (ret) +- goto err; ++ return ret; + + bkey_init(&r_v->k); + r_v->k.type = bkey_type_to_indirect(&orig->k); +@@ -533,20 +525,21 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + + set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); + +- refcount = bkey_refcount(bkey_i_to_s(r_v)); ++ __le64 *refcount = bkey_refcount(bkey_i_to_s(r_v)); + *refcount = 0; + memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); + + ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); + if (ret) +- goto err; ++ return ret; + + /* + * orig is in a bkey_buf which statically allocates 5 64s for the val, + * so we know it will be big enough: + */ + orig->k.type = KEY_TYPE_reflink_p; +- r_p = bkey_i_to_reflink_p(orig); ++ ++ struct bkey_i_reflink_p *r_p = bkey_i_to_reflink_p(orig); + set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + + /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */ +@@ -561,21 +554,16 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + if (reflink_p_may_update_opts_field) + SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true); + +- ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, +- BTREE_UPDATE_internal_snapshot_node); +-err: +- bch2_trans_iter_exit(trans, &reflink_iter); +- +- return ret; ++ return bch2_trans_update(trans, extent_iter, &r_p->k_i, ++ BTREE_UPDATE_internal_snapshot_node); + } + +-static struct bkey_s_c get_next_src(struct btree_trans *trans, +- struct btree_iter *iter, struct bpos end) ++static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) + { + struct bkey_s_c k; + int ret; + +- for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) { ++ 
for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) { + if (bkey_extent_is_unwritten(k)) + continue; + +@@ -584,7 +572,7 @@ static struct bkey_s_c get_next_src(struct btree_trans *trans, + } + + if (bkey_ge(iter->pos, end)) +- bch2_btree_iter_set_pos(trans, iter, end); ++ bch2_btree_iter_set_pos(iter, end); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; + } + +@@ -595,7 +583,6 @@ s64 bch2_remap_range(struct bch_fs *c, + u64 new_i_size, s64 *i_sectors_delta, + bool may_change_src_io_path_opts) + { +- struct btree_trans *trans; + struct btree_iter dst_iter, src_iter; + struct bkey_s_c src_k; + struct bkey_buf new_dst, new_src; +@@ -610,8 +597,8 @@ s64 bch2_remap_range(struct bch_fs *c, + !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); + int ret = 0, ret2 = 0; + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) +- return -BCH_ERR_erofs_no_writes; ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink)) ++ return bch_err_throw(c, erofs_no_writes); + + bch2_check_set_feature(c, BCH_FEATURE_reflink); + +@@ -620,7 +607,7 @@ s64 bch2_remap_range(struct bch_fs *c, + + bch2_bkey_buf_init(&new_dst); + bch2_bkey_buf_init(&new_src); +- trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + + ret = bch2_inum_opts_get(trans, src_inum, &opts); + if (ret) +@@ -648,27 +635,27 @@ s64 bch2_remap_range(struct bch_fs *c, + if (ret) + continue; + +- bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot); ++ bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); + + ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol, + &dst_snapshot); + if (ret) + continue; + +- bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot); ++ bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + + if (dst_inum.inum < src_inum.inum) { + /* Avoid some lock cycle transaction restarts */ +- ret = bch2_btree_iter_traverse(trans, &dst_iter); ++ ret = bch2_btree_iter_traverse(&dst_iter); + if (ret) + continue; + } + + dst_done = dst_iter.pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + dst_done); +- bch2_btree_iter_set_pos(trans, &src_iter, src_want); ++ bch2_btree_iter_set_pos(&src_iter, src_want); + +- src_k = get_next_src(trans, &src_iter, src_end); ++ src_k = get_next_src(&src_iter, src_end); + ret = bkey_err(src_k); + if (ret) + continue; +@@ -710,7 +697,8 @@ s64 bch2_remap_range(struct bch_fs *c, + SET_REFLINK_P_IDX(&dst_p->v, offset); + + if (reflink_p_may_update_opts_field && +- may_change_src_io_path_opts) ++ may_change_src_io_path_opts && ++ REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v)) + SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); + } else { + BUG(); +@@ -728,8 +716,8 @@ s64 bch2_remap_range(struct bch_fs *c, + true); + bch2_disk_reservation_put(c, &disk_res); + } +- bch2_trans_iter_exit(trans, &dst_iter); +- bch2_trans_iter_exit(trans, &src_iter); ++ bch2_trans_iter_exit(&dst_iter); ++ bch2_trans_iter_exit(&src_iter); + + BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end)); + BUG_ON(bkey_gt(dst_iter.pos, dst_end)); +@@ -739,7 +727,7 @@ s64 bch2_remap_range(struct bch_fs *c, + + do { + struct bch_inode_unpacked inode_u; +- struct btree_iter inode_iter = {}; ++ struct btree_iter inode_iter = { NULL }; + + bch2_trans_begin(trans); + +@@ -754,14 +742,13 @@ s64 bch2_remap_range(struct bch_fs *c, + BCH_TRANS_COMMIT_no_enospc); + } + +- bch2_trans_iter_exit(trans, &inode_iter); ++ bch2_trans_iter_exit(&inode_iter); + } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); + err: +- 
bch2_trans_put(trans); + bch2_bkey_buf_exit(&new_src, c); + bch2_bkey_buf_exit(&new_dst, c); + +- bch2_write_ref_put(c, BCH_WRITE_REF_reflink); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink); + + return dst_done ?: ret ?: ret2; + } +@@ -775,7 +762,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, + { + struct bch_fs *c = trans->c; + const __le64 *refcount = bkey_refcount_c(k); +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + struct reflink_gc *r; + int ret = 0; + +@@ -803,7 +790,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) +- goto out; ++ return ret; + + if (!r->refcount) + new->k.type = KEY_TYPE_deleted; +@@ -811,32 +798,30 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, + *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); + ret = bch2_trans_update(trans, iter, new, 0); + } +-out: + fsck_err: +- printbuf_exit(&buf); + return ret; + } + + int bch2_gc_reflink_done(struct bch_fs *c) + { ++ CLASS(btree_trans, trans)(c); + size_t idx = 0; + +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, ++ int ret = for_each_btree_key_commit(trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- bch2_gc_write_reflink_key(trans, &iter, k, &idx))); ++ bch2_gc_write_reflink_key(trans, &iter, k, &idx)); + c->reflink_gc_nr = 0; + return ret; + } + + int bch2_gc_reflink_start(struct bch_fs *c) + { ++ CLASS(btree_trans, trans)(c); + c->reflink_gc_nr = 0; + +- int ret = bch2_trans_run(c, +- for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, ++ int ret = for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch, k, ({ + const __le64 *refcount = bkey_refcount_c(k); + +@@ -846,7 +831,7 @@ int bch2_gc_reflink_start(struct bch_fs *c) + struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, + c->reflink_gc_nr++, GFP_KERNEL); + if (!r) { +- ret = -BCH_ERR_ENOMEM_gc_reflink_start; ++ ret = bch_err_throw(c, ENOMEM_gc_reflink_start); + break; + } + +@@ -854,7 +839,7 @@ int bch2_gc_reflink_start(struct bch_fs *c) + r->size = k.k->size; + r->refcount = 0; + 0; +- }))); ++ })); + + bch_err_fn(c, ret); + return ret; +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +index 477ef0997949..0784283ce78c 100644 +--- a/fs/bcachefs/replicas.c ++++ b/fs/bcachefs/replicas.c +@@ -119,7 +119,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, + return 0; + bad: + bch2_replicas_entry_to_text(err, r); +- return -BCH_ERR_invalid_replicas_entry; ++ return bch_err_throw(c, invalid_replicas_entry); + } + + void bch2_cpu_replicas_to_text(struct printbuf *out, +@@ -286,11 +286,8 @@ bool bch2_replicas_marked_locked(struct bch_fs *c, + bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry_v1 *search) + { +- percpu_down_read(&c->mark_lock); +- bool ret = bch2_replicas_marked_locked(c, search); +- percpu_up_read(&c->mark_lock); +- +- return ret; ++ guard(percpu_read)(&c->mark_lock); ++ return bch2_replicas_marked_locked(c, search); + } + + noinline +@@ -305,27 +302,27 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, + memset(&new_r, 0, sizeof(new_r)); + memset(&new_gc, 0, sizeof(new_gc)); + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + + if (c->replicas_gc.entries && + !__replicas_has_entry(&c->replicas_gc, new_entry)) { + new_gc = cpu_replicas_add_entry(c, 
&c->replicas_gc, new_entry); + if (!new_gc.entries) { +- ret = -BCH_ERR_ENOMEM_cpu_replicas; +- goto err; ++ ret = bch_err_throw(c, ENOMEM_cpu_replicas); ++ goto out; + } + } + + if (!__replicas_has_entry(&c->replicas, new_entry)) { + new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); + if (!new_r.entries) { +- ret = -BCH_ERR_ENOMEM_cpu_replicas; +- goto err; ++ ret = bch_err_throw(c, ENOMEM_cpu_replicas); ++ goto out; + } + + ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); + if (ret) +- goto err; ++ goto out; + } + + if (!new_r.entries && +@@ -338,22 +335,18 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, + bch2_write_super(c); + + /* don't update in memory replicas until changes are persistent */ +- percpu_down_write(&c->mark_lock); +- if (new_r.entries) +- swap(c->replicas, new_r); +- if (new_gc.entries) +- swap(new_gc, c->replicas_gc); +- percpu_up_write(&c->mark_lock); ++ scoped_guard(percpu_write, &c->mark_lock) { ++ if (new_r.entries) ++ swap(c->replicas, new_r); ++ if (new_gc.entries) ++ swap(new_gc, c->replicas_gc); ++ } + out: +- mutex_unlock(&c->sb_lock); +- + kfree(new_r.entries); + kfree(new_gc.entries); + +- return ret; +-err: + bch_err_msg(c, ret, "adding replicas entry"); +- goto out; ++ return ret; + } + + int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) +@@ -371,24 +364,20 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) + { + lockdep_assert_held(&c->replicas_gc_lock); + +- mutex_lock(&c->sb_lock); +- percpu_down_write(&c->mark_lock); +- +- ret = ret ?: +- bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); +- if (!ret) +- swap(c->replicas, c->replicas_gc); +- +- kfree(c->replicas_gc.entries); +- c->replicas_gc.entries = NULL; ++ guard(mutex)(&c->sb_lock); ++ scoped_guard(percpu_write, &c->mark_lock) { ++ ret = ret ?: ++ bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); ++ if (!ret) ++ swap(c->replicas, c->replicas_gc); + +- percpu_up_write(&c->mark_lock); ++ kfree(c->replicas_gc.entries); ++ c->replicas_gc.entries = NULL; ++ } + + if (!ret) + bch2_write_super(c); + +- mutex_unlock(&c->sb_lock); +- + return ret; + } + +@@ -399,7 +388,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) + + lockdep_assert_held(&c->replicas_gc_lock); + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + BUG_ON(c->replicas_gc.entries); + + c->replicas_gc.nr = 0; +@@ -420,9 +409,8 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) + c->replicas_gc.entry_size, + GFP_KERNEL); + if (!c->replicas_gc.entries) { +- mutex_unlock(&c->sb_lock); + bch_err(c, "error allocating c->replicas_gc"); +- return -BCH_ERR_ENOMEM_replicas_gc; ++ return bch_err_throw(c, ENOMEM_replicas_gc); + } + + for_each_cpu_replicas_entry(&c->replicas, e) +@@ -432,8 +420,6 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) + e, c->replicas_gc.entry_size); + + bch2_cpu_replicas_sort(&c->replicas_gc); +- mutex_unlock(&c->sb_lock); +- + return 0; + } + +@@ -458,58 +444,51 @@ int bch2_replicas_gc2(struct bch_fs *c) + new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); + if (!new.entries) { + bch_err(c, "error allocating c->replicas_gc"); +- return -BCH_ERR_ENOMEM_replicas_gc; +- } +- +- mutex_lock(&c->sb_lock); +- percpu_down_write(&c->mark_lock); +- +- if (nr != c->replicas.nr || +- new.entry_size != c->replicas.entry_size) { +- percpu_up_write(&c->mark_lock); +- mutex_unlock(&c->sb_lock); +- kfree(new.entries); +- goto retry; ++ return bch_err_throw(c, ENOMEM_replicas_gc); + } + +- for (unsigned i = 0; i < 
c->replicas.nr; i++) { +- struct bch_replicas_entry_v1 *e = +- cpu_replicas_entry(&c->replicas, i); ++ guard(mutex)(&c->sb_lock); ++ scoped_guard(percpu_write, &c->mark_lock) { ++ if (nr != c->replicas.nr || ++ new.entry_size != c->replicas.entry_size) { ++ kfree(new.entries); ++ goto retry; ++ } + +- struct disk_accounting_pos k = { +- .type = BCH_DISK_ACCOUNTING_replicas, +- }; ++ for (unsigned i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry_v1 *e = ++ cpu_replicas_entry(&c->replicas, i); + +- unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e), +- "embedded variable length struct"); ++ struct disk_accounting_pos k = { ++ .type = BCH_DISK_ACCOUNTING_replicas, ++ }; + +- struct bpos p = disk_accounting_pos_to_bpos(&k); ++ unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e), ++ "embedded variable length struct"); + +- struct bch_accounting_mem *acc = &c->accounting; +- bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), +- accounting_pos_cmp, &p) >= acc->k.nr; ++ struct bpos p = disk_accounting_pos_to_bpos(&k); + +- if (e->data_type == BCH_DATA_journal || !kill) +- memcpy(cpu_replicas_entry(&new, new.nr++), +- e, new.entry_size); +- } ++ struct bch_accounting_mem *acc = &c->accounting; ++ bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), ++ accounting_pos_cmp, &p) >= acc->k.nr; + +- bch2_cpu_replicas_sort(&new); ++ if (e->data_type == BCH_DATA_journal || !kill) ++ memcpy(cpu_replicas_entry(&new, new.nr++), ++ e, new.entry_size); ++ } + +- ret = bch2_cpu_replicas_to_sb_replicas(c, &new); ++ bch2_cpu_replicas_sort(&new); + +- if (!ret) +- swap(c->replicas, new); ++ ret = bch2_cpu_replicas_to_sb_replicas(c, &new); + +- kfree(new.entries); ++ if (!ret) ++ swap(c->replicas, new); + +- percpu_up_write(&c->mark_lock); ++ kfree(new.entries); ++ } + + if (!ret) + bch2_write_super(c); +- +- mutex_unlock(&c->sb_lock); +- + return ret; + } + +@@ -597,9 +576,8 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) + + bch2_cpu_replicas_sort(&new_r); + +- percpu_down_write(&c->mark_lock); ++ guard(percpu_write)(&c->mark_lock); + swap(c->replicas, new_r); +- percpu_up_write(&c->mark_lock); + + kfree(new_r.entries); + +@@ -622,7 +600,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, + sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, + DIV_ROUND_UP(bytes, sizeof(u64))); + if (!sb_r) +- return -BCH_ERR_ENOSPC_sb_replicas; ++ return bch_err_throw(c, ENOSPC_sb_replicas); + + bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); + sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); +@@ -667,7 +645,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, + sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, + DIV_ROUND_UP(bytes, sizeof(u64))); + if (!sb_r) +- return -BCH_ERR_ENOSPC_sb_replicas; ++ return bch_err_throw(c, ENOSPC_sb_replicas); + + bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); + sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas); +@@ -809,9 +787,8 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + unsigned flags, bool print) + { + struct bch_replicas_entry_v1 *e; +- bool ret = true; + +- percpu_down_read(&c->mark_lock); ++ guard(percpu_read)(&c->mark_lock); + for_each_cpu_replicas_entry(&c->replicas, e) { + unsigned nr_online = 0, nr_failed = 0, dflags = 0; + bool metadata = e->data_type < BCH_DATA_user; +@@ -819,19 +796,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + if (e->data_type == BCH_DATA_cached) + continue; 
+ +- rcu_read_lock(); +- for (unsigned i = 0; i < e->nr_devs; i++) { +- if (e->devs[i] == BCH_SB_MEMBER_INVALID) { +- nr_failed++; +- continue; +- } ++ scoped_guard(rcu) ++ for (unsigned i = 0; i < e->nr_devs; i++) { ++ if (e->devs[i] == BCH_SB_MEMBER_INVALID) { ++ nr_failed++; ++ continue; ++ } + +- nr_online += test_bit(e->devs[i], devs.d); ++ nr_online += test_bit(e->devs[i], devs.d); + +- struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); +- nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; +- } +- rcu_read_unlock(); ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); ++ nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; ++ } + + if (nr_online + nr_failed == e->nr_devs) + continue; +@@ -848,21 +824,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + + if (dflags & ~flags) { + if (print) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + bch2_replicas_entry_to_text(&buf, e); + bch_err(c, "insufficient devices online (%u) for replicas entry %s", + nr_online, buf.buf); +- printbuf_exit(&buf); + } +- ret = false; +- break; ++ return false; + } + + } +- percpu_up_read(&c->mark_lock); + +- return ret; ++ return true; + } + + unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) +@@ -905,11 +878,8 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) + + unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) + { +- mutex_lock(&c->sb_lock); +- unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); +- mutex_unlock(&c->sb_lock); +- +- return ret; ++ guard(mutex)(&c->sb_lock); ++ return bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); + } + + void bch2_fs_replicas_exit(struct bch_fs *c) +diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c +index 59c8770e4a0e..a5916984565e 100644 +--- a/fs/bcachefs/sb-clean.c ++++ b/fs/bcachefs/sb-clean.c +@@ -89,8 +89,8 @@ int bch2_verify_superblock_clean(struct bch_fs *c, + { + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; +- struct printbuf buf1 = PRINTBUF; +- struct printbuf buf2 = PRINTBUF; ++ CLASS(printbuf, buf1)(); ++ CLASS(printbuf, buf2)(); + int ret = 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, +@@ -140,8 +140,6 @@ int bch2_verify_superblock_clean(struct bch_fs *c, + l2, buf2.buf); + } + fsck_err: +- printbuf_exit(&buf2); +- printbuf_exit(&buf1); + return ret; + } + +@@ -150,7 +148,7 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) + struct bch_sb_field_clean *clean, *sb_clean; + int ret; + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + sb_clean = bch2_sb_field_get(c->disk_sb.sb, clean); + + if (fsck_err_on(!sb_clean, c, +@@ -158,29 +156,22 @@ struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; +- mutex_unlock(&c->sb_lock); + return ERR_PTR(-BCH_ERR_invalid_sb_clean); + } + + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); +- if (!clean) { +- mutex_unlock(&c->sb_lock); ++ if (!clean) + return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); +- } + + ret = bch2_sb_clean_validate_late(c, clean, READ); + if (ret) { + kfree(clean); +- mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); + } + +- mutex_unlock(&c->sb_lock); +- + return clean; + fsck_err: +- mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); + } + +@@ -265,21 +256,16 @@ const struct bch_sb_field_ops bch_sb_field_ops_clean = { + 
+ int bch2_fs_mark_dirty(struct bch_fs *c) + { +- int ret; +- + /* + * Unconditionally write superblock, to verify it hasn't changed before + * we go rw: + */ + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); + +- ret = bch2_write_super(c); +- mutex_unlock(&c->sb_lock); +- +- return ret; ++ return bch2_write_super(c); + } + + void bch2_fs_mark_clean(struct bch_fs *c) +@@ -289,9 +275,9 @@ void bch2_fs_mark_clean(struct bch_fs *c) + unsigned u64s; + int ret; + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + if (BCH_SB_CLEAN(c->disk_sb.sb)) +- goto out; ++ return; + + SET_BCH_SB_CLEAN(c->disk_sb.sb, true); + +@@ -305,7 +291,7 @@ void bch2_fs_mark_clean(struct bch_fs *c) + sb_clean = bch2_sb_field_resize(&c->disk_sb, clean, u64s); + if (!sb_clean) { + bch_err(c, "error resizing superblock while setting filesystem clean"); +- goto out; ++ return; + } + + sb_clean->flags = 0; +@@ -329,12 +315,10 @@ void bch2_fs_mark_clean(struct bch_fs *c) + ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); + if (ret) { + bch_err(c, "error writing marking filesystem clean: validate error"); +- goto out; ++ return; + } + + bch2_journal_pos_from_member_info_set(c); + + bch2_write_super(c); +-out: +- mutex_unlock(&c->sb_lock); + } +diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h +index fa27ec59a647..f3ea53a55384 100644 +--- a/fs/bcachefs/sb-counters_format.h ++++ b/fs/bcachefs/sb-counters_format.h +@@ -12,10 +12,17 @@ enum counters_flags { + x(io_read_inline, 80, TYPE_SECTORS) \ + x(io_read_hole, 81, TYPE_SECTORS) \ + x(io_read_promote, 30, TYPE_COUNTER) \ ++ x(io_read_nopromote, 85, TYPE_COUNTER) \ ++ x(io_read_nopromote_may_not, 86, TYPE_COUNTER) \ ++ x(io_read_nopromote_already_promoted, 87, TYPE_COUNTER) \ ++ x(io_read_nopromote_unwritten, 88, TYPE_COUNTER) \ ++ x(io_read_nopromote_congested, 89, TYPE_COUNTER) \ ++ x(io_read_nopromote_in_flight, 90, TYPE_COUNTER) \ + x(io_read_bounce, 31, TYPE_COUNTER) \ + x(io_read_split, 33, TYPE_COUNTER) \ + x(io_read_reuse_race, 34, TYPE_COUNTER) \ + x(io_read_retry, 32, TYPE_COUNTER) \ ++ x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ + x(io_write, 1, TYPE_SECTORS) \ + x(io_move, 2, TYPE_SECTORS) \ + x(io_move_read, 35, TYPE_SECTORS) \ +@@ -24,6 +31,10 @@ enum counters_flags { + x(io_move_fail, 38, TYPE_COUNTER) \ + x(io_move_write_fail, 82, TYPE_COUNTER) \ + x(io_move_start_fail, 39, TYPE_COUNTER) \ ++ x(io_move_drop_only, 91, TYPE_COUNTER) \ ++ x(io_move_noop, 92, TYPE_COUNTER) \ ++ x(io_move_created_rebalance, 83, TYPE_COUNTER) \ ++ x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \ + x(bucket_invalidate, 3, TYPE_COUNTER) \ + x(bucket_discard, 4, TYPE_COUNTER) \ + x(bucket_discard_fast, 79, TYPE_COUNTER) \ +diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c +index badd0e17ada5..de56a1ee79db 100644 +--- a/fs/bcachefs/sb-downgrade.c ++++ b/fs/bcachefs/sb-downgrade.c +@@ -100,7 +100,11 @@ + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(stripe_backpointers, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ +- BCH_FSCK_ERR_ptr_to_missing_backpointer) ++ BCH_FSCK_ERR_ptr_to_missing_backpointer) \ ++ x(inode_has_case_insensitive, \ ++ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ ++ BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \ ++ BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set) + + #define DOWNGRADE_TABLE() \ + x(bucket_stripe_sectors, \ +@@ -187,7 +191,7 @@ int 
bch2_sb_set_upgrade_extra(struct bch_fs *c) + bool write_sb = false; + int ret = 0; + +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (old_version < bcachefs_metadata_version_bucket_stripe_sectors && +@@ -201,7 +205,6 @@ int bch2_sb_set_upgrade_extra(struct bch_fs *c) + + if (write_sb) + bch2_write_super(c); +- mutex_unlock(&c->sb_lock); + + return ret < 0 ? ret : 0; + } +@@ -249,6 +252,7 @@ DOWNGRADE_TABLE() + + static int downgrade_table_extra(struct bch_fs *c, darray_char *table) + { ++ unsigned dst_offset = table->nr; + struct bch_sb_field_downgrade_entry *dst = (void *) &darray_top(*table); + unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * le16_to_cpu(dst->nr_errors); + int ret = 0; +@@ -264,6 +268,9 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table) + if (ret) + return ret; + ++ dst = (void *) &table->data[dst_offset]; ++ dst->nr_errors = cpu_to_le16(nr_errors + 1); ++ + /* open coded __set_bit_le64, as dst is packed and + * dst->recovery_passes is misaligned */ + unsigned b = BCH_RECOVERY_PASS_STABLE_check_allocations; +@@ -274,7 +281,6 @@ static int downgrade_table_extra(struct bch_fs *c, darray_char *table) + break; + } + +- dst->nr_errors = cpu_to_le16(nr_errors); + return ret; + } + +@@ -365,7 +371,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) + if (!test_bit(BCH_FS_btree_running, &c->flags)) + return 0; + +- darray_char table = {}; ++ CLASS(darray_char, table)(); + int ret = 0; + + for (const struct upgrade_downgrade_entry *src = downgrade_table; +@@ -374,12 +380,15 @@ int bch2_sb_downgrade_update(struct bch_fs *c) + if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) + continue; + ++ if (src->version < c->sb.version_incompat) ++ continue; ++ + struct bch_sb_field_downgrade_entry *dst; + unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors; + + ret = darray_make_room(&table, bytes); + if (ret) +- goto out; ++ return ret; + + dst = (void *) &darray_top(table); + dst->version = cpu_to_le16(src->version); +@@ -391,7 +400,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) + + ret = downgrade_table_extra(c, &table); + if (ret) +- goto out; ++ return ret; + + if (!dst->recovery_passes[0] && + !dst->recovery_passes[1] && +@@ -406,18 +415,14 @@ int bch2_sb_downgrade_update(struct bch_fs *c) + unsigned sb_u64s = DIV_ROUND_UP(sizeof(*d) + table.nr, sizeof(u64)); + + if (d && le32_to_cpu(d->field.u64s) > sb_u64s) +- goto out; ++ return 0; + + d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s); +- if (!d) { +- ret = -BCH_ERR_ENOSPC_sb_downgrade; +- goto out; +- } ++ if (!d) ++ return bch_err_throw(c, ENOSPC_sb_downgrade); + + memcpy(d->entries, table.data, table.nr); + memset_u64s_tail(d->entries, 0, table.nr); +-out: +- darray_exit(&table); + return ret; + } + +diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c +index 013a96883b4e..41a259eab4fb 100644 +--- a/fs/bcachefs/sb-errors.c ++++ b/fs/bcachefs/sb-errors.c +@@ -78,6 +78,28 @@ const struct bch_sb_field_ops bch_sb_field_ops_errors = { + .to_text = bch2_sb_errors_to_text, + }; + ++void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ if (out->nr_tabstops < 1) ++ printbuf_tabstop_push(out, 48); ++ if (out->nr_tabstops < 2) ++ printbuf_tabstop_push(out, 8); ++ if (out->nr_tabstops < 3) ++ printbuf_tabstop_push(out, 16); ++ ++ guard(mutex)(&c->fsck_error_counts_lock); ++ ++ bch_sb_errors_cpu *e = 
&c->fsck_error_counts; ++ darray_for_each(*e, i) { ++ bch2_sb_error_id_to_text(out, i->id); ++ prt_tab(out); ++ prt_u64(out, i->nr); ++ prt_tab(out); ++ bch2_prt_datetime(out, i->last_error_time); ++ prt_newline(out); ++ } ++} ++ + void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) + { + bch_sb_errors_cpu *e = &c->fsck_error_counts; +@@ -88,75 +110,66 @@ void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) + }; + unsigned i; + +- mutex_lock(&c->fsck_error_counts_lock); ++ guard(mutex)(&c->fsck_error_counts_lock); ++ + for (i = 0; i < e->nr; i++) { + if (err == e->data[i].id) { + e->data[i].nr++; + e->data[i].last_error_time = n.last_error_time; +- goto out; ++ return; + } + if (err < e->data[i].id) + break; + } + + if (darray_make_room(e, 1)) +- goto out; ++ return; + + darray_insert_item(e, i, n); +-out: +- mutex_unlock(&c->fsck_error_counts_lock); + } + + void bch2_sb_errors_from_cpu(struct bch_fs *c) + { +- bch_sb_errors_cpu *src = &c->fsck_error_counts; +- struct bch_sb_field_errors *dst; +- unsigned i; +- +- mutex_lock(&c->fsck_error_counts_lock); +- +- dst = bch2_sb_field_resize(&c->disk_sb, errors, +- bch2_sb_field_errors_u64s(src->nr)); ++ guard(mutex)(&c->fsck_error_counts_lock); + ++ bch_sb_errors_cpu *src = &c->fsck_error_counts; ++ struct bch_sb_field_errors *dst = ++ bch2_sb_field_resize(&c->disk_sb, errors, ++ bch2_sb_field_errors_u64s(src->nr)); + if (!dst) +- goto err; ++ return; + +- for (i = 0; i < src->nr; i++) { ++ for (unsigned i = 0; i < src->nr; i++) { + SET_BCH_SB_ERROR_ENTRY_ID(&dst->entries[i], src->data[i].id); + SET_BCH_SB_ERROR_ENTRY_NR(&dst->entries[i], src->data[i].nr); + dst->entries[i].last_error_time = cpu_to_le64(src->data[i].last_error_time); + } +- +-err: +- mutex_unlock(&c->fsck_error_counts_lock); + } + + static int bch2_sb_errors_to_cpu(struct bch_fs *c) + { ++ guard(mutex)(&c->fsck_error_counts_lock); ++ + struct bch_sb_field_errors *src = bch2_sb_field_get(c->disk_sb.sb, errors); + bch_sb_errors_cpu *dst = &c->fsck_error_counts; +- unsigned i, nr = bch2_sb_field_errors_nr_entries(src); +- int ret; ++ unsigned nr = bch2_sb_field_errors_nr_entries(src); + + if (!nr) + return 0; + +- mutex_lock(&c->fsck_error_counts_lock); +- ret = darray_make_room(dst, nr); ++ int ret = darray_make_room(dst, nr); + if (ret) +- goto err; ++ return ret; + + dst->nr = nr; + +- for (i = 0; i < nr; i++) { ++ for (unsigned i = 0; i < nr; i++) { + dst->data[i].id = BCH_SB_ERROR_ENTRY_ID(&src->entries[i]); + dst->data[i].nr = BCH_SB_ERROR_ENTRY_NR(&src->entries[i]); + dst->data[i].last_error_time = le64_to_cpu(src->entries[i].last_error_time); + } +-err: +- mutex_unlock(&c->fsck_error_counts_lock); + +- return ret; ++ return 0; + } + + void bch2_fs_sb_errors_exit(struct bch_fs *c) +diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h +index b2357b8e6107..e86267264692 100644 +--- a/fs/bcachefs/sb-errors.h ++++ b/fs/bcachefs/sb-errors.h +@@ -7,6 +7,7 @@ + extern const char * const bch2_sb_error_strs[]; + + void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id); ++void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *); + + extern const struct bch_sb_field_ops bch_sb_field_ops_errors; + +diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h +index 4036a20c6adc..5317b1bfe2e5 100644 +--- a/fs/bcachefs/sb-errors_format.h ++++ b/fs/bcachefs/sb-errors_format.h +@@ -3,9 +3,11 @@ + #define _BCACHEFS_SB_ERRORS_FORMAT_H + + enum bch_fsck_flags { +- FSCK_CAN_FIX = 1 << 0, +- FSCK_CAN_IGNORE = 1 << 
1, +- FSCK_AUTOFIX = 1 << 2, ++ FSCK_CAN_FIX = BIT(0), ++ FSCK_CAN_IGNORE = BIT(1), ++ FSCK_AUTOFIX = BIT(2), ++ FSCK_ERR_NO_LOG = BIT(3), ++ FSCK_ERR_SILENT = BIT(4), + }; + + #define BCH_SB_ERRS() \ +@@ -74,6 +76,8 @@ enum bch_fsck_flags { + x(btree_node_read_error, 62, FSCK_AUTOFIX) \ + x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \ + x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \ ++ x(btree_node_topology_bad_root_min_key, 323, FSCK_AUTOFIX) \ ++ x(btree_node_topology_bad_root_max_key, 324, FSCK_AUTOFIX) \ + x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \ + x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \ + x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \ +@@ -134,7 +138,7 @@ enum bch_fsck_flags { + x(bucket_gens_to_invalid_buckets, 121, FSCK_AUTOFIX) \ + x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \ + x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \ +- x(need_discard_freespace_key_bad, 124, 0) \ ++ x(need_discard_freespace_key_bad, 124, FSCK_AUTOFIX) \ + x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ + x(backpointer_bucket_offset_wrong, 125, 0) \ + x(backpointer_level_bad, 294, 0) \ +@@ -156,6 +160,7 @@ enum bch_fsck_flags { + x(extent_ptrs_unwritten, 140, 0) \ + x(extent_ptrs_written_and_unwritten, 141, 0) \ + x(ptr_to_invalid_device, 142, 0) \ ++ x(ptr_to_removed_device, 322, 0) \ + x(ptr_to_duplicate_device, 143, 0) \ + x(ptr_after_last_bucket, 144, 0) \ + x(ptr_before_first_bucket, 145, 0) \ +@@ -165,7 +170,7 @@ enum bch_fsck_flags { + x(ptr_to_missing_replicas_entry, 149, FSCK_AUTOFIX) \ + x(ptr_to_missing_stripe, 150, 0) \ + x(ptr_to_incorrect_stripe, 151, 0) \ +- x(ptr_gen_newer_than_bucket_gen, 152, 0) \ ++ x(ptr_gen_newer_than_bucket_gen, 152, FSCK_AUTOFIX) \ + x(ptr_too_stale, 153, 0) \ + x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ + x(ptr_bucket_data_type_mismatch, 155, 0) \ +@@ -209,7 +214,7 @@ enum bch_fsck_flags { + x(subvol_to_missing_root, 188, 0) \ + x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \ + x(bkey_in_missing_snapshot, 190, 0) \ +- x(bkey_in_deleted_snapshot, 315, 0) \ ++ x(bkey_in_deleted_snapshot, 315, FSCK_AUTOFIX) \ + x(inode_pos_inode_nonzero, 191, 0) \ + x(inode_pos_blockdev_range, 192, 0) \ + x(inode_alloc_cursor_inode_bad, 301, 0) \ +@@ -217,7 +222,7 @@ enum bch_fsck_flags { + x(inode_str_hash_invalid, 194, 0) \ + x(inode_v3_fields_start_bad, 195, 0) \ + x(inode_snapshot_mismatch, 196, 0) \ +- x(snapshot_key_missing_inode_snapshot, 314, 0) \ ++ x(snapshot_key_missing_inode_snapshot, 314, FSCK_AUTOFIX) \ + x(inode_unlinked_but_clean, 197, 0) \ + x(inode_unlinked_but_nlink_nonzero, 198, 0) \ + x(inode_unlinked_and_not_open, 281, 0) \ +@@ -232,10 +237,11 @@ enum bch_fsck_flags { + x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \ + x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \ + x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \ ++ x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \ + x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ + x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ + x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ +- x(inode_has_child_snapshots_wrong, 287, 0) \ ++ x(inode_has_child_snapshots_wrong, 287, FSCK_AUTOFIX) \ + x(inode_unreachable, 210, FSCK_AUTOFIX) \ + x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ + x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \ +@@ -243,26 +249,27 @@ enum bch_fsck_flags { + x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \ + x(vfs_inode_i_blocks_underflow, 311, 
FSCK_AUTOFIX) \ + x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \ ++ x(vfs_bad_inode_rm, 320, 0) \ + x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ + x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ + x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ + x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \ + x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \ + x(extent_overlapping, 215, 0) \ +- x(key_in_missing_inode, 216, 0) \ ++ x(key_in_missing_inode, 216, FSCK_AUTOFIX) \ + x(key_in_wrong_inode_type, 217, 0) \ +- x(extent_past_end_of_inode, 218, 0) \ ++ x(extent_past_end_of_inode, 218, FSCK_AUTOFIX) \ + x(dirent_empty_name, 219, 0) \ + x(dirent_val_too_big, 220, 0) \ + x(dirent_name_too_long, 221, 0) \ + x(dirent_name_embedded_nul, 222, 0) \ + x(dirent_name_dot_or_dotdot, 223, 0) \ + x(dirent_name_has_slash, 224, 0) \ +- x(dirent_d_type_wrong, 225, 0) \ ++ x(dirent_d_type_wrong, 225, FSCK_AUTOFIX) \ + x(inode_bi_parent_wrong, 226, 0) \ + x(dirent_in_missing_dir_inode, 227, 0) \ + x(dirent_in_non_dir_inode, 228, 0) \ +- x(dirent_to_missing_inode, 229, 0) \ ++ x(dirent_to_missing_inode, 229, FSCK_AUTOFIX) \ + x(dirent_to_overwritten_inode, 302, 0) \ + x(dirent_to_missing_subvol, 230, 0) \ + x(dirent_to_itself, 231, 0) \ +@@ -276,9 +283,9 @@ enum bch_fsck_flags { + x(root_subvol_missing, 238, 0) \ + x(root_dir_missing, 239, 0) \ + x(root_inode_not_dir, 240, 0) \ +- x(dir_loop, 241, 0) \ +- x(hash_table_key_duplicate, 242, 0) \ +- x(hash_table_key_wrong_offset, 243, 0) \ ++ x(dir_loop, 241, FSCK_AUTOFIX) \ ++ x(hash_table_key_duplicate, 242, FSCK_AUTOFIX) \ ++ x(hash_table_key_wrong_offset, 243, FSCK_AUTOFIX) \ + x(unlinked_inode_not_on_deleted_list, 244, FSCK_AUTOFIX) \ + x(reflink_p_front_pad_bad, 245, 0) \ + x(journal_entry_dup_same_device, 246, 0) \ +@@ -287,18 +294,19 @@ enum bch_fsck_flags { + x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \ + x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \ + x(inode_bi_parent_nonzero, 251, 0) \ ++ x(missing_inode_with_contents, 321, FSCK_AUTOFIX) \ + x(dirent_to_missing_parent_subvol, 252, 0) \ + x(dirent_not_visible_in_parent_subvol, 253, 0) \ + x(subvol_fs_path_parent_wrong, 254, 0) \ + x(subvol_root_fs_path_parent_nonzero, 255, 0) \ + x(subvol_children_not_set, 256, 0) \ + x(subvol_children_bad, 257, 0) \ +- x(subvol_loop, 258, 0) \ ++ x(subvol_loop, 258, FSCK_AUTOFIX) \ + x(subvol_unreachable, 259, FSCK_AUTOFIX) \ + x(btree_node_bkey_bad_u64s, 260, 0) \ + x(btree_node_topology_empty_interior_node, 261, 0) \ + x(btree_ptr_v2_min_key_bad, 262, 0) \ +- x(btree_root_unreadable_and_scan_found_nothing, 263, FSCK_AUTOFIX) \ ++ x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ + x(snapshot_node_missing, 264, FSCK_AUTOFIX) \ + x(dup_backpointer_to_bad_csum_extent, 265, 0) \ + x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ +@@ -311,7 +319,7 @@ enum bch_fsck_flags { + x(accounting_mismatch, 272, FSCK_AUTOFIX) \ + x(accounting_replicas_not_marked, 273, 0) \ + x(accounting_to_invalid_device, 289, 0) \ +- x(invalid_btree_id, 274, 0) \ ++ x(invalid_btree_id, 274, FSCK_AUTOFIX) \ + x(alloc_key_io_time_bad, 275, 0) \ + x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \ + x(accounting_key_junk_at_end, 277, FSCK_AUTOFIX) \ +@@ -328,7 +336,7 @@ enum bch_fsck_flags { + x(dirent_stray_data_after_cf_name, 305, 0) \ + x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ + x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ +- x(MAX, 319, 0) ++ x(MAX, 325, 0) + + enum bch_sb_error_id { + #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, +diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c +index 72779912939b..e3c73d903898 100644 +--- a/fs/bcachefs/sb-members.c ++++ b/fs/bcachefs/sb-members.c +@@ -5,14 +5,41 @@ + #include "disk_groups.h" + #include "error.h" + #include "opts.h" ++#include "recovery_passes.h" + #include "replicas.h" + #include "sb-members.h" + #include "super-io.h" + +-void bch2_dev_missing(struct bch_fs *c, unsigned dev) ++int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) ++{ ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(c, &buf); ++ ++ bool removed = test_bit(dev, c->devs_removed.d); ++ ++ prt_printf(&buf, "pointer to %s device %u in key\n", ++ removed ? "removed" : "nonexistent", dev); ++ bch2_bkey_val_to_text(&buf, c, k); ++ prt_newline(&buf); ++ ++ bool print = removed ++ ? bch2_count_fsck_err(c, ptr_to_removed_device, &buf) ++ : bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); ++ ++ int ret = bch2_run_explicit_recovery_pass(c, &buf, ++ BCH_RECOVERY_PASS_check_allocations, 0); ++ ++ if (print) ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ return ret; ++} ++ ++void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev) + { + if (dev != BCH_SB_MEMBER_INVALID) +- bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); ++ bch2_fs_inconsistent(c, "pointer to %s device %u", ++ test_bit(dev, c->devs_removed.d) ++ ? "removed" : "nonexistent", dev); + } + + void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket) +@@ -41,34 +68,13 @@ struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i) + return __bch2_members_v2_get_mut(bch2_sb_field_get(sb, members_v2), i); + } + +-static struct bch_member members_v2_get(struct bch_sb_field_members_v2 *mi, int i) +-{ +- struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i); +- memset(&ret, 0, sizeof(ret)); +- memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); +- return ret; +-} +- +-static struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i) +-{ +- return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES); +-} +- +-static struct bch_member members_v1_get(struct bch_sb_field_members_v1 *mi, int i) +-{ +- struct bch_member ret, *p = members_v1_get_mut(mi, i); +- memset(&ret, 0, sizeof(ret)); +- memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret))); +- return ret; +-} +- + struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i) + { + struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(sb, members_v2); + if (mi2) +- return members_v2_get(mi2, i); ++ return bch2_members_v2_get(mi2, i); + struct bch_sb_field_members_v1 *mi1 = bch2_sb_field_get(sb, members_v1); +- return members_v1_get(mi1, i); ++ return bch2_members_v1_get(mi1, i); + } + + static int sb_members_v2_resize_entries(struct bch_fs *c) +@@ -81,7 +87,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c) + + mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); + if (!mi) +- return -BCH_ERR_ENOSPC_sb_members_v2; ++ return bch_err_throw(c, ENOSPC_sb_members_v2); + + for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { + void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); +@@ -119,6 +125,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) + struct bch_sb_field_members_v1 *mi1; + struct bch_sb_field_members_v2 *mi2; + ++ if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) { ++ bch2_sb_field_resize(disk_sb, members_v1, 0); ++ return 0; ++ } ++ + mi1 = 
bch2_sb_field_resize(disk_sb, members_v1, + DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * + disk_sb->sb->nr_devices, sizeof(u64))); +@@ -170,42 +181,34 @@ static int validate_member(struct printbuf *err, + return -BCH_ERR_invalid_sb_members; + } + ++ if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) && ++ sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) { ++ prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i); ++ return -BCH_ERR_invalid_sb_members; ++ } ++ + return 0; + } + +-static void member_to_text(struct printbuf *out, +- struct bch_member m, +- struct bch_sb_field_disk_groups *gi, +- struct bch_sb *sb, +- int i) ++void bch2_member_to_text(struct printbuf *out, ++ struct bch_member *m, ++ struct bch_sb_field_disk_groups *gi, ++ struct bch_sb *sb, ++ unsigned idx) + { +- unsigned data_have = bch2_sb_dev_has_data(sb, i); +- u64 bucket_size = le16_to_cpu(m.bucket_size); +- u64 device_size = le64_to_cpu(m.nbuckets) * bucket_size; +- +- if (!bch2_member_alive(&m)) +- return; +- +- prt_printf(out, "Device:\t%u\n", i); +- +- printbuf_indent_add(out, 2); ++ u64 bucket_size = le16_to_cpu(m->bucket_size); ++ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; + + prt_printf(out, "Label:\t"); +- if (BCH_MEMBER_GROUP(&m)) { +- unsigned idx = BCH_MEMBER_GROUP(&m) - 1; +- +- if (idx < disk_groups_nr(gi)) +- prt_printf(out, "%s (%u)", +- gi->entries[idx].label, idx); +- else +- prt_printf(out, "(bad disk labels section)"); +- } else { ++ if (BCH_MEMBER_GROUP(m)) ++ bch2_disk_path_to_text_sb(out, sb, ++ BCH_MEMBER_GROUP(m) - 1); ++ else + prt_printf(out, "(none)"); +- } + prt_newline(out); + + prt_printf(out, "UUID:\t"); +- pr_uuid(out, m.uuid.b); ++ pr_uuid(out, m->uuid.b); + prt_newline(out); + + prt_printf(out, "Size:\t"); +@@ -213,40 +216,41 @@ static void member_to_text(struct printbuf *out, + prt_newline(out); + + for (unsigned i = 0; i < BCH_MEMBER_ERROR_NR; i++) +- prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m.errors[i])); ++ prt_printf(out, "%s errors:\t%llu\n", bch2_member_error_strs[i], le64_to_cpu(m->errors[i])); + + for (unsigned i = 0; i < BCH_IOPS_NR; i++) +- prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m.iops[i])); ++ prt_printf(out, "%s iops:\t%u\n", bch2_iops_measurements[i], le32_to_cpu(m->iops[i])); + + prt_printf(out, "Bucket size:\t"); + prt_units_u64(out, bucket_size << 9); + prt_newline(out); + +- prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m.first_bucket)); +- prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m.nbuckets)); ++ prt_printf(out, "First bucket:\t%u\n", le16_to_cpu(m->first_bucket)); ++ prt_printf(out, "Buckets:\t%llu\n", le64_to_cpu(m->nbuckets)); + + prt_printf(out, "Last mount:\t"); +- if (m.last_mount) +- bch2_prt_datetime(out, le64_to_cpu(m.last_mount)); ++ if (m->last_mount) ++ bch2_prt_datetime(out, le64_to_cpu(m->last_mount)); + else + prt_printf(out, "(never)"); + prt_newline(out); + +- prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m.seq)); ++ prt_printf(out, "Last superblock write:\t%llu\n", le64_to_cpu(m->seq)); + + prt_printf(out, "State:\t%s\n", +- BCH_MEMBER_STATE(&m) < BCH_MEMBER_STATE_NR +- ? bch2_member_states[BCH_MEMBER_STATE(&m)] ++ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR ++ ? 
bch2_member_states[BCH_MEMBER_STATE(m)] + : "unknown"); + + prt_printf(out, "Data allowed:\t"); +- if (BCH_MEMBER_DATA_ALLOWED(&m)) +- prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m)); ++ if (BCH_MEMBER_DATA_ALLOWED(m)) ++ prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Has data:\t"); ++ unsigned data_have = bch2_sb_dev_has_data(sb, idx); + if (data_have) + prt_bitflags(out, __bch2_data_types, data_have); + else +@@ -254,21 +258,36 @@ static void member_to_text(struct printbuf *out, + prt_newline(out); + + prt_printf(out, "Btree allocated bitmap blocksize:\t"); +- if (m.btree_bitmap_shift < 64) +- prt_units_u64(out, 1ULL << m.btree_bitmap_shift); ++ if (m->btree_bitmap_shift < 64) ++ prt_units_u64(out, 1ULL << m->btree_bitmap_shift); + else +- prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift); ++ prt_printf(out, "(invalid shift %u)", m->btree_bitmap_shift); + prt_newline(out); + + prt_printf(out, "Btree allocated bitmap:\t"); +- bch2_prt_u64_base2_nbits(out, le64_to_cpu(m.btree_allocated_bitmap), 64); ++ bch2_prt_u64_base2_nbits(out, le64_to_cpu(m->btree_allocated_bitmap), 64); + prt_newline(out); + +- prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1); ++ prt_printf(out, "Durability:\t%llu\n", BCH_MEMBER_DURABILITY(m) ? BCH_MEMBER_DURABILITY(m) - 1 : 1); ++ ++ prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(m)); ++ prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(m)); ++ prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(m)); ++} ++ ++static void member_to_text(struct printbuf *out, ++ struct bch_member m, ++ struct bch_sb_field_disk_groups *gi, ++ struct bch_sb *sb, ++ unsigned idx) ++{ ++ if (!bch2_member_alive(&m)) ++ return; + +- prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); +- prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); ++ prt_printf(out, "Device:\t%u\n", idx); + ++ printbuf_indent_add(out, 2); ++ bch2_member_to_text(out, &m, gi, sb, idx); + printbuf_indent_sub(out, 2); + } + +@@ -284,7 +303,7 @@ static int bch2_sb_members_v1_validate(struct bch_sb *sb, struct bch_sb_field *f + } + + for (i = 0; i < sb->nr_devices; i++) { +- struct bch_member m = members_v1_get(mi, i); ++ struct bch_member m = bch2_members_v1_get(mi, i); + + int ret = validate_member(err, m, sb, i); + if (ret) +@@ -299,10 +318,18 @@ static void bch2_sb_members_v1_to_text(struct printbuf *out, struct bch_sb *sb, + { + struct bch_sb_field_members_v1 *mi = field_to_type(f, members_v1); + struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); +- unsigned i; + +- for (i = 0; i < sb->nr_devices; i++) +- member_to_text(out, members_v1_get(mi, i), gi, sb, i); ++ if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { ++ prt_printf(out, "field ends before start of entries"); ++ return; ++ } ++ ++ unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / sizeof(mi->_members[0]); ++ if (nr != sb->nr_devices) ++ prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); ++ ++ for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) ++ member_to_text(out, bch2_members_v1_get(mi, i), gi, sb, i); + } + + const struct bch_sb_field_ops bch_sb_field_ops_members_v1 = { +@@ -315,10 +342,28 @@ static void bch2_sb_members_v2_to_text(struct printbuf *out, struct bch_sb *sb, + { + struct 
bch_sb_field_members_v2 *mi = field_to_type(f, members_v2); + struct bch_sb_field_disk_groups *gi = bch2_sb_field_get(sb, disk_groups); +- unsigned i; + +- for (i = 0; i < sb->nr_devices; i++) +- member_to_text(out, members_v2_get(mi, i), gi, sb, i); ++ if (vstruct_end(&mi->field) <= (void *) &mi->_members[0]) { ++ prt_printf(out, "field ends before start of entries"); ++ return; ++ } ++ ++ if (!le16_to_cpu(mi->member_bytes)) { ++ prt_printf(out, "member_bytes 0"); ++ return; ++ } ++ ++ unsigned nr = (vstruct_end(&mi->field) - (void *) &mi->_members[0]) / le16_to_cpu(mi->member_bytes); ++ if (nr != sb->nr_devices) ++ prt_printf(out, "nr_devices mismatch: have %i entries, should be %u", nr, sb->nr_devices); ++ ++ /* ++ * We call to_text() on superblock sections that haven't passed ++ * validate, so we can't trust sb->nr_devices. ++ */ ++ ++ for (unsigned i = 0; i < min(sb->nr_devices, nr); i++) ++ member_to_text(out, bch2_members_v2_get(mi, i), gi, sb, i); + } + + static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f, +@@ -335,7 +380,7 @@ static int bch2_sb_members_v2_validate(struct bch_sb *sb, struct bch_sb_field *f + } + + for (unsigned i = 0; i < sb->nr_devices; i++) { +- int ret = validate_member(err, members_v2_get(mi, i), sb, i); ++ int ret = validate_member(err, bch2_members_v2_get(mi, i), sb, i); + if (ret) + return ret; + } +@@ -352,14 +397,29 @@ void bch2_sb_members_from_cpu(struct bch_fs *c) + { + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + +- rcu_read_lock(); ++ guard(rcu)(); + for_each_member_device_rcu(c, ca, NULL) { + struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); + + for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++) + m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); + } +- rcu_read_unlock(); ++} ++ ++void bch2_sb_members_to_cpu(struct bch_fs *c) ++{ ++ for_each_member_device(c, ca) { ++ struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); ++ ca->mi = bch2_mi_to_cpu(&m); ++ } ++ ++ struct bch_sb_field_members_v2 *mi2 = bch2_sb_field_get(c->disk_sb.sb, members_v2); ++ if (mi2) ++ for (unsigned i = 0; i < c->sb.nr_devices; i++) { ++ struct bch_member m = bch2_members_v2_get(mi2, i); ++ bool removed = uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); ++ mod_bit(i, c->devs_removed.d, removed); ++ } + } + + void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) +@@ -367,9 +427,8 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) + struct bch_fs *c = ca->fs; + struct bch_member m; + +- mutex_lock(&ca->fs->sb_lock); +- m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); +- mutex_unlock(&ca->fs->sb_lock); ++ scoped_guard(mutex, &ca->fs->sb_lock) ++ m = bch2_sb_member_get(c->disk_sb.sb, ca->dev_idx); + + printbuf_tabstop_push(out, 12); + +@@ -396,16 +455,15 @@ void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) + void bch2_dev_errors_reset(struct bch_dev *ca) + { + struct bch_fs *c = ca->fs; +- struct bch_member *m; + +- mutex_lock(&c->sb_lock); +- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); ++ guard(mutex)(&c->sb_lock); ++ ++ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++) + m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i])); + m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds()); + + bch2_write_super(c); +- mutex_unlock(&c->sb_lock); + } + + /* +@@ -417,20 +475,14 @@ void 
bch2_dev_errors_reset(struct bch_dev *ca) + + bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) + { +- bool ret = true; +- rcu_read_lock(); ++ guard(rcu)(); + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); +- if (!ca) +- continue; +- +- if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) { +- ret = false; +- break; +- } ++ if (ca && ++ !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) ++ return false; + } +- rcu_read_unlock(); +- return ret; ++ return true; + } + + static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, +@@ -493,6 +545,7 @@ int bch2_sb_member_alloc(struct bch_fs *c) + unsigned u64s; + int best = -1; + u64 best_last_mount = 0; ++ unsigned nr_deleted = 0; + + if (dev_idx < BCH_SB_MEMBERS_MAX) + goto have_slot; +@@ -503,7 +556,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) + continue; + + struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); +- if (bch2_member_alive(&m)) ++ ++ nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); ++ ++ if (!bch2_is_zero(&m.uuid, sizeof(m.uuid))) + continue; + + u64 last_mount = le64_to_cpu(m.last_mount); +@@ -517,6 +573,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) + goto have_slot; + } + ++ if (nr_deleted) ++ bch_err(c, "unable to allocate new member, but have %u deleted: run fsck", ++ nr_deleted); ++ + return -BCH_ERR_ENOSPC_sb_members; + have_slot: + nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); +@@ -532,3 +592,21 @@ int bch2_sb_member_alloc(struct bch_fs *c) + c->disk_sb.sb->nr_devices = nr_devices; + return dev_idx; + } ++ ++void bch2_sb_members_clean_deleted(struct bch_fs *c) ++{ ++ guard(mutex)(&c->sb_lock); ++ bool write_sb = false; ++ ++ for (unsigned i = 0; i < c->sb.nr_devices; i++) { ++ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i); ++ ++ if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) { ++ memset(&m->uuid, 0, sizeof(m->uuid)); ++ write_sb = true; ++ } ++ } ++ ++ if (write_sb) ++ bch2_write_super(c); ++} +diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h +index 42786657522c..6de999cf71cb 100644 +--- a/fs/bcachefs/sb-members.h ++++ b/fs/bcachefs/sb-members.h +@@ -4,6 +4,7 @@ + + #include "darray.h" + #include "bkey_types.h" ++#include "enumerated_ref.h" + + extern char * const bch2_member_error_strs[]; + +@@ -13,26 +14,48 @@ __bch2_members_v2_get_mut(struct bch_sb_field_members_v2 *mi, unsigned i) + return (void *) mi->_members + (i * le16_to_cpu(mi->member_bytes)); + } + ++static inline struct bch_member bch2_members_v2_get(struct bch_sb_field_members_v2 *mi, int i) ++{ ++ struct bch_member ret, *p = __bch2_members_v2_get_mut(mi, i); ++ memset(&ret, 0, sizeof(ret)); ++ memcpy(&ret, p, min_t(size_t, le16_to_cpu(mi->member_bytes), sizeof(ret))); ++ return ret; ++} ++ ++static inline struct bch_member *members_v1_get_mut(struct bch_sb_field_members_v1 *mi, int i) ++{ ++ return (void *) mi->_members + (i * BCH_MEMBER_V1_BYTES); ++} ++ ++static inline struct bch_member bch2_members_v1_get(struct bch_sb_field_members_v1 *mi, int i) ++{ ++ struct bch_member ret, *p = members_v1_get_mut(mi, i); ++ memset(&ret, 0, sizeof(ret)); ++ memcpy(&ret, p, min_t(size_t, BCH_MEMBER_V1_BYTES, sizeof(ret))); ++ return ret; ++} ++ + int bch2_sb_members_v2_init(struct bch_fs *c); + int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb); + struct bch_member *bch2_members_v2_get_mut(struct bch_sb *sb, int i); + 
struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); + ++void bch2_member_to_text(struct printbuf *, struct bch_member *, ++ struct bch_sb_field_disk_groups *, ++ struct bch_sb *, unsigned); ++ + static inline bool bch2_dev_is_online(struct bch_dev *ca) + { +- return !percpu_ref_is_zero(&ca->io_ref[READ]); ++ return !enumerated_ref_is_zero(&ca->io_ref[READ]); + } + + static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); + + static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) + { +- rcu_read_lock(); ++ guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); +- bool ret = ca && bch2_dev_is_online(ca); +- rcu_read_unlock(); +- +- return ret; ++ return ca && bch2_dev_is_online(ca); + } + + static inline bool bch2_dev_is_healthy(struct bch_dev *ca) +@@ -104,6 +127,12 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * + for (struct bch_dev *_ca = NULL; \ + (_ca = __bch2_next_dev((_c), _ca, (_mask)));) + ++#define for_each_online_member_rcu(_c, _ca) \ ++ for_each_member_device_rcu(_c, _ca, &(_c)->online_devs) ++ ++#define for_each_rw_member_rcu(_c, _ca) \ ++ for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free]) ++ + static inline void bch2_dev_get(struct bch_dev *ca) + { + #ifdef CONFIG_BCACHEFS_DEBUG +@@ -129,18 +158,16 @@ static inline void __bch2_dev_put(struct bch_dev *ca) + + static inline void bch2_dev_put(struct bch_dev *ca) + { +- if (ca) ++ if (!IS_ERR_OR_NULL(ca)) + __bch2_dev_put(ca); + } + + static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) + { +- rcu_read_lock(); ++ guard(rcu)(); + bch2_dev_put(ca); + if ((ca = __bch2_next_dev(c, ca, NULL))) + bch2_dev_get(ca); +- rcu_read_unlock(); +- + return ca; + } + +@@ -157,33 +184,32 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev + static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, + struct bch_dev *ca, + unsigned state_mask, +- int rw) ++ int rw, unsigned ref_idx) + { +- rcu_read_lock(); ++ guard(rcu)(); + if (ca) +- percpu_ref_put(&ca->io_ref[rw]); ++ enumerated_ref_put(&ca->io_ref[rw], ref_idx); + + while ((ca = __bch2_next_dev(c, ca, NULL)) && + (!((1 << ca->mi.state) & state_mask) || +- !percpu_ref_tryget(&ca->io_ref[rw]))) ++ !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))) + ; +- rcu_read_unlock(); + + return ca; + } + +-#define __for_each_online_member(_c, _ca, state_mask, rw) \ ++#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \ + for (struct bch_dev *_ca = NULL; \ +- (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw));) ++ (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));) + +-#define for_each_online_member(c, ca) \ +- __for_each_online_member(c, ca, ~0, READ) ++#define for_each_online_member(c, ca, ref_idx) \ ++ __for_each_online_member(c, ca, ~0, READ, ref_idx) + +-#define for_each_rw_member(c, ca) \ +- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE) ++#define for_each_rw_member(c, ca, ref_idx) \ ++ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx) + +-#define for_each_readable_member(c, ca) \ +- __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ) ++#define for_each_readable_member(c, ca, ref_idx) \ ++ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx) + + static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) + { +@@ -218,34 +244,43 @@ static inline 
struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned de + : NULL; + } + +-void bch2_dev_missing(struct bch_fs *, unsigned); ++int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned); ++ ++void bch2_dev_missing_atomic(struct bch_fs *, unsigned); + + static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) + { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); + if (unlikely(!ca)) +- bch2_dev_missing(c, dev); ++ bch2_dev_missing_atomic(c, dev); + return ca; + } + + static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) + { +- rcu_read_lock(); ++ guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); + if (ca) + bch2_dev_get(ca); +- rcu_read_unlock(); + return ca; + } + ++DEFINE_CLASS(bch2_dev_tryget_noerror, struct bch_dev *, ++ bch2_dev_put(_T), bch2_dev_tryget_noerror(c, dev), ++ struct bch_fs *c, unsigned dev); ++ + static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) + { + struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); + if (unlikely(!ca)) +- bch2_dev_missing(c, dev); ++ bch2_dev_missing_atomic(c, dev); + return ca; + } + ++DEFINE_CLASS(bch2_dev_tryget, struct bch_dev *, ++ bch2_dev_put(_T), bch2_dev_tryget(c, dev), ++ struct bch_fs *c, unsigned dev); ++ + static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket) + { + struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode); +@@ -256,6 +291,10 @@ static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, s + return ca; + } + ++DEFINE_CLASS(bch2_dev_bucket_tryget_noerror, struct bch_dev *, ++ bch2_dev_put(_T), bch2_dev_bucket_tryget_noerror(c, bucket), ++ struct bch_fs *c, struct bpos bucket); ++ + void bch2_dev_bucket_missing(struct bch_dev *, u64); + + static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket) +@@ -269,6 +308,10 @@ static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bp + return ca; + } + ++DEFINE_CLASS(bch2_dev_bucket_tryget, struct bch_dev *, ++ bch2_dev_put(_T), bch2_dev_bucket_tryget(c, bucket), ++ struct bch_fs *c, struct bpos bucket); ++ + static inline struct bch_dev *bch2_dev_iterate_noerror(struct bch_fs *c, struct bch_dev *ca, unsigned dev_idx) + { + if (ca && ca->dev_idx == dev_idx) +@@ -285,43 +328,31 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev + return bch2_dev_tryget(c, dev_idx); + } + +-static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) ++static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, ++ int rw, unsigned ref_idx) + { + might_sleep(); + +- rcu_read_lock(); ++ guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); +- if (ca && !percpu_ref_tryget(&ca->io_ref[rw])) +- ca = NULL; +- rcu_read_unlock(); ++ if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) ++ return NULL; + +- if (ca && +- (ca->mi.state == BCH_MEMBER_STATE_rw || +- (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))) ++ if (ca->mi.state == BCH_MEMBER_STATE_rw || ++ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) + return ca; + +- if (ca) +- percpu_ref_put(&ca->io_ref[rw]); ++ enumerated_ref_put(&ca->io_ref[rw], ref_idx); + return NULL; + } + +-/* XXX kill, move to struct bch_fs */ +-static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) +-{ +- struct bch_devs_mask devs; +- +- memset(&devs, 0, sizeof(devs)); +- for_each_online_member(c, ca) +- 
__set_bit(ca->dev_idx, devs.d); +- return devs; +-} +- + extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; + extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; + + static inline bool bch2_member_alive(struct bch_member *m) + { +- return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); ++ return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) && ++ !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID); + } + + static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) +@@ -351,6 +382,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) + ? BCH_MEMBER_DURABILITY(mi) - 1 + : 1, + .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), ++ .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi), + .valid = bch2_member_alive(mi), + .btree_bitmap_shift = mi->btree_bitmap_shift, + .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), +@@ -358,6 +390,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) + } + + void bch2_sb_members_from_cpu(struct bch_fs *); ++void bch2_sb_members_to_cpu(struct bch_fs *); + + void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *); + void bch2_dev_errors_reset(struct bch_dev *); +@@ -381,5 +414,6 @@ bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); + void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); + + int bch2_sb_member_alloc(struct bch_fs *); ++void bch2_sb_members_clean_deleted(struct bch_fs *); + + #endif /* _BCACHEFS_SB_MEMBERS_H */ +diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h +index 3affec823b3f..b2b892687cdd 100644 +--- a/fs/bcachefs/sb-members_format.h ++++ b/fs/bcachefs/sb-members_format.h +@@ -13,7 +13,11 @@ + */ + #define BCH_SB_MEMBER_INVALID 255 + +-#define BCH_MIN_NR_NBUCKETS (1 << 6) ++#define BCH_SB_MEMBER_DELETED_UUID \ ++ UUID_INIT(0xffffffff, 0xffff, 0xffff, \ ++ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) ++ ++#define BCH_MIN_NR_NBUCKETS (1 << 9) + + #define BCH_IOPS_MEASUREMENTS() \ + x(seqread, 0) \ +@@ -88,6 +92,8 @@ LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) + LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) + LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, + struct bch_member, flags, 30, 31) ++LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT, ++ struct bch_member, flags, 31, 32) + + #if 0 + LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); +diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h +index c0eda888fe39..d6443e186872 100644 +--- a/fs/bcachefs/sb-members_types.h ++++ b/fs/bcachefs/sb-members_types.h +@@ -13,6 +13,7 @@ struct bch_member_cpu { + u8 data_allowed; + u8 durability; + u8 freespace_initialized; ++ u8 resize_on_mount; + u8 valid; + u8 btree_bitmap_shift; + u64 btree_allocated_bitmap; +diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c +index 7c403427fbdb..08083d6ca8bc 100644 +--- a/fs/bcachefs/six.c ++++ b/fs/bcachefs/six.c +@@ -152,16 +152,16 @@ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, + * here. 
+ */ + if (type == SIX_LOCK_read && lock->readers) { +- preempt_disable(); +- this_cpu_inc(*lock->readers); /* signal that we own lock */ ++ scoped_guard(preempt) { ++ this_cpu_inc(*lock->readers); /* signal that we own lock */ + +- smp_mb(); ++ smp_mb(); + +- old = atomic_read(&lock->state); +- ret = !(old & l[type].lock_fail); ++ old = atomic_read(&lock->state); ++ ret = !(old & l[type].lock_fail); + +- this_cpu_sub(*lock->readers, !ret); +- preempt_enable(); ++ this_cpu_sub(*lock->readers, !ret); ++ } + + if (!ret) { + smp_mb(); +@@ -339,12 +339,9 @@ static inline bool six_owner_running(struct six_lock *lock) + * acquiring the lock and setting the owner field. If we're an RT task + * that will live-lock because we won't let the owner complete. + */ +- rcu_read_lock(); ++ guard(rcu)(); + struct task_struct *owner = READ_ONCE(lock->owner); +- bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); +- rcu_read_unlock(); +- +- return ret; ++ return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); + } + + static inline bool six_optimistic_spin(struct six_lock *lock, +@@ -363,7 +360,7 @@ static inline bool six_optimistic_spin(struct six_lock *lock, + if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN) + return false; + +- preempt_disable(); ++ guard(preempt)(); + end_time = sched_clock() + 10 * NSEC_PER_USEC; + + while (!need_resched() && six_owner_running(lock)) { +@@ -372,10 +369,8 @@ static inline bool six_optimistic_spin(struct six_lock *lock, + * wait->lock_acquired: pairs with the smp_store_release in + * __six_lock_wakeup + */ +- if (smp_load_acquire(&wait->lock_acquired)) { +- preempt_enable(); ++ if (smp_load_acquire(&wait->lock_acquired)) + return true; +- } + + if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { + six_set_bitmask(lock, SIX_LOCK_NOSPIN); +@@ -391,7 +386,6 @@ static inline bool six_optimistic_spin(struct six_lock *lock, + cpu_relax(); + } + +- preempt_enable(); + return false; + } + +diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c +index fec569c7deb1..84f987d3a02a 100644 +--- a/fs/bcachefs/snapshot.c ++++ b/fs/bcachefs/snapshot.c +@@ -1,14 +1,17 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include "bcachefs.h" ++#include "bbpos.h" + #include "bkey_buf.h" + #include "btree_cache.h" + #include "btree_key_cache.h" + #include "btree_update.h" + #include "buckets.h" ++#include "enumerated_ref.h" + #include "errcode.h" + #include "error.h" + #include "fs.h" ++#include "progress.h" + #include "recovery_passes.h" + #include "snapshot.h" + +@@ -52,7 +55,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, + BTREE_ITER_with_updates, snapshot_tree, s); + + if (bch2_err_matches(ret, ENOENT)) +- ret = -BCH_ERR_ENOENT_snapshot_tree; ++ ret = bch_err_throw(trans->c, ENOENT_snapshot_tree); + return ret; + } + +@@ -65,13 +68,13 @@ __bch2_snapshot_tree_create(struct btree_trans *trans) + struct bkey_i_snapshot_tree *s_t; + + if (ret == -BCH_ERR_ENOSPC_btree_slot) +- ret = -BCH_ERR_ENOSPC_snapshot_tree; ++ ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree); + if (ret) + return ERR_PTR(ret); + + s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret ? 
ERR_PTR(ret) : s_t; + } + +@@ -103,11 +106,8 @@ static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, + + static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) + { +- rcu_read_lock(); +- bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); +- rcu_read_unlock(); +- +- return ret; ++ guard(rcu)(); ++ return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); + } + + static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) +@@ -136,28 +136,25 @@ static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor) + + bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) + { +- bool ret; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ u32 orig_id = id; ++#endif + +- rcu_read_lock(); ++ guard(rcu)(); + struct snapshot_table *t = rcu_dereference(c->snapshots); + +- if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) { +- ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); +- goto out; +- } ++ if (unlikely(recovery_pass_will_run(c, BCH_RECOVERY_PASS_check_snapshots))) ++ return __bch2_snapshot_is_ancestor_early(t, id, ancestor); + + if (likely(ancestor >= IS_ANCESTOR_BITMAP)) + while (id && id < ancestor - IS_ANCESTOR_BITMAP) + id = get_ancestor_below(t, id, ancestor); + +- ret = id && id < ancestor ++ bool ret = id && id < ancestor + ? test_ancestor_bitmap(t, id, ancestor) + : id == ancestor; + +- EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); +-out: +- rcu_read_unlock(); +- ++ EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, orig_id, ancestor)); + return ret; + } + +@@ -209,9 +206,14 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + { + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + +- prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", +- BCH_SNAPSHOT_SUBVOL(s.v), +- BCH_SNAPSHOT_DELETED(s.v), ++ if (BCH_SNAPSHOT_SUBVOL(s.v)) ++ prt_str(out, "subvol "); ++ if (BCH_SNAPSHOT_WILL_DELETE(s.v)) ++ prt_str(out, "will_delete "); ++ if (BCH_SNAPSHOT_DELETED(s.v)) ++ prt_str(out, "deleted "); ++ ++ prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u", + le32_to_cpu(s.v->parent), + le32_to_cpu(s.v->children[0]), + le32_to_cpu(s.v->children[1]), +@@ -281,6 +283,14 @@ int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, + return ret; + } + ++static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id) ++{ ++ guard(mutex)(&c->snapshot_table_lock); ++ return snapshot_t_mut(c, id) ++ ? 0 ++ : bch_err_throw(c, ENOMEM_mark_snapshot); ++} ++ + static int __bch2_mark_snapshot(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, +@@ -289,20 +299,19 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct snapshot_t *t; + u32 id = new.k->p.offset; +- int ret = 0; + +- mutex_lock(&c->snapshot_table_lock); ++ guard(mutex)(&c->snapshot_table_lock); + + t = snapshot_t_mut(c, id); +- if (!t) { +- ret = -BCH_ERR_ENOMEM_mark_snapshot; +- goto err; +- } ++ if (!t) ++ return bch_err_throw(c, ENOMEM_mark_snapshot); + + if (new.k->type == KEY_TYPE_snapshot) { + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + +- t->live = true; ++ t->state = !BCH_SNAPSHOT_DELETED(s.v) ++ ? 
SNAPSHOT_ID_live ++ : SNAPSHOT_ID_deleted; + t->parent = le32_to_cpu(s.v->parent); + t->children[0] = le32_to_cpu(s.v->children[0]); + t->children[1] = le32_to_cpu(s.v->children[1]); +@@ -327,17 +336,16 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); + +- if (BCH_SNAPSHOT_DELETED(s.v)) { ++ if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); +- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) ++ if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots) + bch2_delete_dead_snapshots_async(c); + } + } else { + memset(t, 0, sizeof(*t)); + } +-err: +- mutex_unlock(&c->snapshot_table_lock); +- return ret; ++ ++ return 0; + } + + int bch2_mark_snapshot(struct btree_trans *trans, +@@ -357,31 +365,32 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, + + /* fsck: */ + +-static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) ++static u32 bch2_snapshot_child(struct snapshot_table *t, ++ u32 id, unsigned child) + { +- return snapshot_t(c, id)->children[child]; ++ return __snapshot_t(t, id)->children[child]; + } + +-static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) ++static u32 bch2_snapshot_left_child(struct snapshot_table *t, u32 id) + { +- return bch2_snapshot_child(c, id, 0); ++ return bch2_snapshot_child(t, id, 0); + } + +-static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) ++static u32 bch2_snapshot_right_child(struct snapshot_table *t, u32 id) + { +- return bch2_snapshot_child(c, id, 1); ++ return bch2_snapshot_child(t, id, 1); + } + +-static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) ++static u32 bch2_snapshot_tree_next(struct snapshot_table *t, u32 id) + { + u32 n, parent; + +- n = bch2_snapshot_left_child(c, id); ++ n = bch2_snapshot_left_child(t, id); + if (n) + return n; + +- while ((parent = bch2_snapshot_parent(c, id))) { +- n = bch2_snapshot_right_child(c, parent); ++ while ((parent = __bch2_snapshot_parent(t, id))) { ++ n = bch2_snapshot_right_child(t, parent); + if (n && n != id) + return n; + id = parent; +@@ -390,21 +399,30 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) + return 0; + } + +-u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) ++u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root, ++ snapshot_id_list *skip) + { +- u32 id = snapshot_root; +- u32 subvol = 0, s; +- +- rcu_read_lock(); +- while (id && bch2_snapshot_exists(c, id)) { +- s = snapshot_t(c, id)->subvol; +- +- if (s && (!subvol || s < subvol)) +- subvol = s; ++ guard(rcu)(); ++ struct snapshot_table *t = rcu_dereference(c->snapshots); ++ u32 id, subvol = 0, s; ++retry: ++ id = snapshot_root; ++ while (id && __bch2_snapshot_exists(t, id)) { ++ if (!(skip && snapshot_list_has_id(skip, id))) { ++ s = __snapshot_t(t, id)->subvol; ++ ++ if (s && (!subvol || s < subvol)) ++ subvol = s; ++ } ++ id = bch2_snapshot_tree_next(t, id); ++ if (id == snapshot_root) ++ break; ++ } + +- id = bch2_snapshot_tree_next(c, id); ++ if (!subvol && skip) { ++ skip = NULL; ++ goto retry; + } +- rcu_read_unlock(); + + return subvol; + } +@@ -413,9 +431,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, + u32 snapshot_root, u32 *subvol_id) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; + struct bkey_s_c k; +- bool found = false; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 
+@@ -428,28 +444,23 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, + continue; + if (!BCH_SUBVOLUME_SNAP(s.v)) { + *subvol_id = s.k->p.offset; +- found = true; +- break; ++ return 0; + } + } +- bch2_trans_iter_exit(trans, &iter); +- +- if (!ret && !found) { +- struct bkey_i_subvolume *u; +- +- *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); ++ if (ret) ++ return ret; + +- u = bch2_bkey_get_mut_typed(trans, &iter, +- BTREE_ID_subvolumes, POS(0, *subvol_id), +- 0, subvolume); +- ret = PTR_ERR_OR_ZERO(u); +- if (ret) +- return ret; ++ *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL); + +- SET_BCH_SUBVOLUME_SNAP(&u->v, false); +- } ++ struct bkey_i_subvolume *u = ++ bch2_bkey_get_mut_typed(trans, BTREE_ID_subvolumes, POS(0, *subvol_id), ++ 0, subvolume); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; + +- return ret; ++ SET_BCH_SUBVOLUME_SNAP(&u->v, false); ++ return 0; + } + + static int check_snapshot_tree(struct btree_trans *trans, +@@ -457,27 +468,21 @@ static int check_snapshot_tree(struct btree_trans *trans, + struct bkey_s_c k) + { + struct bch_fs *c = trans->c; +- struct bkey_s_c_snapshot_tree st; +- struct bch_snapshot s; +- struct bch_subvolume subvol; +- struct printbuf buf = PRINTBUF; +- struct btree_iter snapshot_iter = {}; +- u32 root_id; +- int ret; ++ CLASS(printbuf, buf)(); + + if (k.k->type != KEY_TYPE_snapshot_tree) + return 0; + +- st = bkey_s_c_to_snapshot_tree(k); +- root_id = le32_to_cpu(st.v->root_snapshot); ++ struct bkey_s_c_snapshot_tree st = bkey_s_c_to_snapshot_tree(k); ++ u32 root_id = le32_to_cpu(st.v->root_snapshot); + +- struct bkey_s_c_snapshot snapshot_k = +- bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, +- POS(0, root_id), 0, snapshot); +- ret = bkey_err(snapshot_k); ++ CLASS(btree_iter, snapshot_iter)(trans, BTREE_ID_snapshots, POS(0, root_id), 0); ++ struct bkey_s_c_snapshot snapshot_k = bch2_bkey_get_typed(&snapshot_iter, snapshot); ++ int ret = bkey_err(snapshot_k); + if (ret && !bch2_err_matches(ret, ENOENT)) +- goto err; ++ return ret; + ++ struct bch_snapshot s; + if (!ret) + bkey_val_copy(&s, snapshot_k); + +@@ -491,17 +496,16 @@ static int check_snapshot_tree(struct btree_trans *trans, + ret + ? 
prt_printf(&buf, "(%s)", bch2_err_str(ret)) + : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c), +- buf.buf))) { +- ret = bch2_btree_delete_at(trans, iter, 0); +- goto err; +- } ++ buf.buf))) ++ return bch2_btree_delete_at(trans, iter, 0); + + if (!st.v->master_subvol) +- goto out; ++ return 0; + ++ struct bch_subvolume subvol; + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) +- goto err; ++ return ret; + + if (fsck_err_on(ret, + trans, snapshot_tree_to_missing_subvol, +@@ -526,27 +530,21 @@ static int check_snapshot_tree(struct btree_trans *trans, + ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); + bch_err_fn(c, ret); + +- if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */ +- ret = 0; +- goto err; +- } ++ if (bch2_err_matches(ret, ENOENT)) /* nothing to be done here */ ++ return 0; + + if (ret) +- goto err; ++ return ret; + + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(u); + if (ret) +- goto err; ++ return ret; + + u->v.master_subvol = cpu_to_le32(subvol_id); + st = snapshot_tree_i_to_s_c(u); + } +-out: +-err: + fsck_err: +- bch2_trans_iter_exit(trans, &snapshot_iter); +- printbuf_exit(&buf); + return ret; + } + +@@ -559,14 +557,12 @@ static int check_snapshot_tree(struct btree_trans *trans, + */ + int bch2_check_snapshot_trees(struct bch_fs *c) + { +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, ++ CLASS(btree_trans, trans)(c); ++ return for_each_btree_key_commit(trans, iter, + BTREE_ID_snapshot_trees, POS_MIN, + BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- check_snapshot_tree(trans, &iter, k))); +- bch_err_fn(c, ret); +- return ret; ++ check_snapshot_tree(trans, &iter, k)); + } + + /* +@@ -589,18 +585,14 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans, + + u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) + { +- const struct snapshot_t *s; +- + if (!id) + return 0; + +- rcu_read_lock(); +- s = snapshot_t(c, id); +- if (s->parent) +- id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); +- rcu_read_unlock(); +- +- return id; ++ guard(rcu)(); ++ const struct snapshot_t *s = snapshot_t(c, id); ++ return s->parent ++ ? 
bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)) ++ : id; + } + + static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) +@@ -630,22 +622,19 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, + struct bch_snapshot *s) + { + struct bch_fs *c = trans->c; +- struct btree_iter root_iter; +- struct bch_snapshot_tree s_t; +- struct bkey_s_c_snapshot root; + struct bkey_i_snapshot *u; +- u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; +- int ret; ++ u32 root_id = bch2_snapshot_root(c, k.k->p.offset); + +- root = bch2_bkey_get_iter_typed(trans, &root_iter, +- BTREE_ID_snapshots, POS(0, root_id), +- BTREE_ITER_with_updates, snapshot); +- ret = bkey_err(root); ++ CLASS(btree_iter, root_iter)(trans, BTREE_ID_snapshots, POS(0, root_id), ++ BTREE_ITER_with_updates); ++ struct bkey_s_c_snapshot root = bch2_bkey_get_typed(&root_iter, snapshot); ++ int ret = bkey_err(root); + if (ret) +- goto err; ++ return ret; + +- tree_id = le32_to_cpu(root.v->tree); ++ u32 tree_id = le32_to_cpu(root.v->tree); + ++ struct bch_snapshot_tree s_t; + ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; +@@ -654,10 +643,10 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, + u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u) ?: + bch2_snapshot_tree_create(trans, root_id, +- bch2_snapshot_tree_oldest_subvol(c, root_id), ++ bch2_snapshot_oldest_subvol(c, root_id, NULL), + &tree_id); + if (ret) +- goto err; ++ return ret; + + u->v.tree = cpu_to_le32(tree_id); + if (k.k->p.offset == root_id) +@@ -668,14 +657,13 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) +- goto err; ++ return ret; + + u->v.tree = cpu_to_le32(tree_id); + *s = u->v; + } +-err: +- bch2_trans_iter_exit(trans, &root_iter); +- return ret; ++ ++ return 0; + } + + static int check_snapshot(struct btree_trans *trans, +@@ -689,7 +677,7 @@ static int check_snapshot(struct btree_trans *trans, + struct bkey_i_snapshot *u; + u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); + u32 real_depth; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + u32 i, id; + int ret = 0; + +@@ -699,6 +687,9 @@ static int check_snapshot(struct btree_trans *trans, + memset(&s, 0, sizeof(s)); + memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); + ++ if (BCH_SNAPSHOT_DELETED(&s)) ++ return 0; ++ + id = le32_to_cpu(s.parent); + if (id) { + ret = bch2_snapshot_lookup(trans, id, &v); +@@ -736,7 +727,7 @@ static int check_snapshot(struct btree_trans *trans, + } + + bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && +- !BCH_SNAPSHOT_DELETED(&s); ++ !BCH_SNAPSHOT_WILL_DELETE(&s); + + if (should_have_subvol) { + id = le32_to_cpu(s.subvol); +@@ -819,7 +810,6 @@ static int check_snapshot(struct btree_trans *trans, + ret = 0; + err: + fsck_err: +- printbuf_exit(&buf); + return ret; + } + +@@ -829,14 +819,12 @@ int bch2_check_snapshots(struct bch_fs *c) + * We iterate backwards as checking/fixing the depth field requires that + * the parent's depth already be correct: + */ +- int ret = bch2_trans_run(c, +- for_each_btree_key_reverse_commit(trans, iter, ++ CLASS(btree_trans, trans)(c); ++ return for_each_btree_key_reverse_commit(trans, iter, + BTREE_ID_snapshots, POS_MAX, + BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- 
check_snapshot(trans, &iter, k))); +- bch_err_fn(c, ret); +- return ret; ++ check_snapshot(trans, &iter, k)); + } + + static int check_snapshot_exists(struct btree_trans *trans, u32 id) +@@ -844,19 +832,18 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) + struct bch_fs *c = trans->c; + + /* Do we need to reconstruct the snapshot_tree entry as well? */ +- struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + u32 tree_id = 0; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN, + 0, k, ret) { +- if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { ++ if (k.k->type == KEY_TYPE_snapshot_tree && ++ le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) { + tree_id = k.k->p.offset; + break; + } + } +- bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; +@@ -879,17 +866,16 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { +- if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { ++ if (k.k->type == KEY_TYPE_subvolume && ++ le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) { + snapshot->v.subvol = cpu_to_le32(k.k->p.offset); + SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true); + break; + } + } +- bch2_trans_iter_exit(trans, &iter); + +- return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: +- bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, +- bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); ++ return bch2_snapshot_table_make_room(c, id) ?: ++ bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0); + } + + /* Figure out which snapshot nodes belong in the same tree: */ +@@ -917,10 +903,7 @@ static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpo + + static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) + { +- darray_for_each(*l, i) +- if (snapshot_list_has_id(r, *i)) +- return true; +- return false; ++ return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL; + } + + static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) +@@ -962,17 +945,21 @@ static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct + + int bch2_reconstruct_snapshots(struct bch_fs *c) + { +- struct btree_trans *trans = bch2_trans_get(c); +- struct printbuf buf = PRINTBUF; ++ CLASS(btree_trans, trans)(c); ++ CLASS(printbuf, buf)(); + struct snapshot_tree_reconstruct r = {}; + int ret = 0; + ++ struct progress_indicator_state progress; ++ bch2_progress_init(&progress, c, btree_has_snapshots_mask); ++ + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { + if (btree_type_has_snapshots(btree)) { + r.btree = btree; + + ret = for_each_btree_key(trans, iter, btree, POS_MIN, + BTREE_ITER_all_snapshots|BTREE_ITER_prefetch, k, ({ ++ progress_update_iter(trans, &progress, &iter); + get_snapshot_trees(c, &r, k.k->p); + })); + if (ret) +@@ -987,12 +974,12 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) + snapshot_id_list_to_text(&buf, t); + + darray_for_each(*t, id) { +- if (fsck_err_on(!bch2_snapshot_exists(c, *id), ++ if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty, + trans, snapshot_node_missing, + "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { + if (t->nr > 1) { + bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); +- ret = -BCH_ERR_fsck_repair_unimplemented; ++ ret = bch_err_throw(c, 
fsck_repair_unimplemented); + goto err; + } + +@@ -1005,31 +992,90 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) + } + fsck_err: + err: +- bch2_trans_put(trans); + snapshot_tree_reconstruct_exit(&r); +- printbuf_exit(&buf); +- bch_err_fn(c, ret); + return ret; + } + +-int bch2_check_key_has_snapshot(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_s_c k) ++int __bch2_check_key_has_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) + { + struct bch_fs *c = trans->c; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + int ret = 0; ++ enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot); + +- if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), +- trans, bkey_in_missing_snapshot, +- "key in missing snapshot %s, delete?", ++ /* Snapshot was definitively deleted, this error is marked autofix */ ++ if (fsck_err_on(state == SNAPSHOT_ID_deleted, ++ trans, bkey_in_deleted_snapshot, ++ "key in deleted snapshot %s, delete?", + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, +- BTREE_UPDATE_internal_snapshot_node) ?: 1; ++ BTREE_UPDATE_internal_snapshot_node) ?: 1; ++ ++ if (state == SNAPSHOT_ID_empty) { ++ /* ++ * Snapshot missing: we should have caught this with btree_lost_data and ++ * kicked off reconstruct_snapshots, so if we end up here we have no ++ * idea what happened. ++ * ++ * Do not delete unless we know that subvolumes and snapshots ++ * are consistent: ++ * ++ * XXX: ++ * ++ * We could be smarter here, and instead of using the generic ++ * recovery pass ratelimiting, track if there have been any ++ * changes to the snapshots or inodes btrees since those passes ++ * last ran. ++ */ ++ ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret; ++ ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret; ++ ++ if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)) ++ ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; ++ ++ unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? 
FSCK_CAN_FIX : 0); ++ ++ if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot, ++ "key in missing snapshot %s, delete?", ++ (bch2_btree_id_to_text(&buf, iter->btree_id), ++ prt_char(&buf, ' '), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_internal_snapshot_node) ?: 1; ++ } ++ } + fsck_err: +- printbuf_exit(&buf); ++ return ret; ++} ++ ++int __bch2_get_snapshot_overwrites(struct btree_trans *trans, ++ enum btree_id btree, struct bpos pos, ++ snapshot_id_list *s) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos), ++ BTREE_ITER_all_snapshots, k, ret) { ++ if (!bkey_eq(k.k->p, pos)) ++ break; ++ ++ if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) || ++ snapshot_list_has_ancestor(c, s, k.k->p.snapshot)) ++ continue; ++ ++ ret = snapshot_list_add(c, s, k.k->p.snapshot); ++ if (ret) ++ break; ++ } ++ if (ret) ++ darray_exit(s); ++ + return ret; + } + +@@ -1038,28 +1084,21 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, + */ + int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) + { +- struct btree_iter iter; + struct bkey_i_snapshot *s = +- bch2_bkey_get_mut_typed(trans, &iter, +- BTREE_ID_snapshots, POS(0, id), +- 0, snapshot); ++ bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshots, POS(0, id), 0, snapshot); + int ret = PTR_ERR_OR_ZERO(s); +- if (unlikely(ret)) { +- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), +- trans->c, "missing snapshot %u", id); ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, "missing snapshot %u", id); ++ if (unlikely(ret)) + return ret; +- } + + /* already deleted? */ +- if (BCH_SNAPSHOT_DELETED(&s->v)) +- goto err; ++ if (BCH_SNAPSHOT_WILL_DELETE(&s->v)) ++ return 0; + +- SET_BCH_SNAPSHOT_DELETED(&s->v, true); ++ SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); + s->v.subvol = 0; +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return 0; + } + + static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) +@@ -1071,39 +1110,33 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) + static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter, p_iter = {}; +- struct btree_iter c_iter = {}; +- struct btree_iter tree_iter = {}; +- struct bkey_s_c_snapshot s; + u32 parent_id, child_id; + unsigned i; +- int ret = 0; + +- s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), +- BTREE_ITER_intent, snapshot); +- ret = bkey_err(s); ++ struct bkey_i_snapshot *s = ++ bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshots, POS(0, id), 0, snapshot); ++ int ret = PTR_ERR_OR_ZERO(s); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", id); + + if (ret) +- goto err; ++ return ret; + +- BUG_ON(s.v->children[1]); ++ BUG_ON(BCH_SNAPSHOT_DELETED(&s->v)); ++ BUG_ON(s->v.children[1]); + +- parent_id = le32_to_cpu(s.v->parent); +- child_id = le32_to_cpu(s.v->children[0]); ++ parent_id = le32_to_cpu(s->v.parent); ++ child_id = le32_to_cpu(s->v.children[0]); + + if (parent_id) { +- struct bkey_i_snapshot *parent; +- +- parent = bch2_bkey_get_mut_typed(trans, &p_iter, +- BTREE_ID_snapshots, POS(0, parent_id), +- 0, snapshot); ++ struct bkey_i_snapshot *parent = ++ bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshots, POS(0, parent_id), ++ 
0, snapshot); + ret = PTR_ERR_OR_ZERO(parent); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", parent_id); + if (unlikely(ret)) +- goto err; ++ return ret; + + /* find entry in parent->children for node being deleted */ + for (i = 0; i < 2; i++) +@@ -1113,7 +1146,7 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) + if (bch2_fs_inconsistent_on(i == 2, c, + "snapshot %u missing child pointer to %u", + parent_id, id)) +- goto err; ++ return bch_err_throw(c, ENOENT_snapshot); + + parent->v.children[i] = cpu_to_le32(child_id); + +@@ -1121,16 +1154,14 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) + } + + if (child_id) { +- struct bkey_i_snapshot *child; +- +- child = bch2_bkey_get_mut_typed(trans, &c_iter, +- BTREE_ID_snapshots, POS(0, child_id), +- 0, snapshot); ++ struct bkey_i_snapshot *child = ++ bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshots, POS(0, child_id), ++ 0, snapshot); + ret = PTR_ERR_OR_ZERO(child); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", child_id); + if (unlikely(ret)) +- goto err; ++ return ret; + + child->v.parent = cpu_to_le32(parent_id); + +@@ -1147,32 +1178,41 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) + * snapshot_tree entry to point to the new root, or delete it if + * this is the last snapshot ID in this tree: + */ +- struct bkey_i_snapshot_tree *s_t; + +- BUG_ON(s.v->children[1]); ++ BUG_ON(s->v.children[1]); + +- s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, +- BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), ++ struct bkey_i_snapshot_tree *s_t = bch2_bkey_get_mut_typed(trans, ++ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + if (ret) +- goto err; ++ return ret; + +- if (s.v->children[0]) { +- s_t->v.root_snapshot = s.v->children[0]; ++ if (s->v.children[0]) { ++ s_t->v.root_snapshot = s->v.children[0]; + } else { + s_t->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s_t->k, 0); + } + } + +- ret = bch2_btree_delete_at(trans, &iter, 0); +-err: +- bch2_trans_iter_exit(trans, &tree_iter); +- bch2_trans_iter_exit(trans, &p_iter); +- bch2_trans_iter_exit(trans, &c_iter); +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) { ++ SET_BCH_SNAPSHOT_DELETED(&s->v, true); ++ s->v.parent = 0; ++ s->v.children[0] = 0; ++ s->v.children[1] = 0; ++ s->v.subvol = 0; ++ s->v.tree = 0; ++ s->v.depth = 0; ++ s->v.skip[0] = 0; ++ s->v.skip[1] = 0; ++ s->v.skip[2] = 0; ++ } else { ++ s->k.type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(&s->k, 0); ++ } ++ ++ return 0; + } + + static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, +@@ -1181,35 +1221,29 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, + unsigned nr_snapids) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; + struct bkey_i_snapshot *n; +- struct bkey_s_c k; +- unsigned i, j; + u32 depth = bch2_snapshot_depth(c, parent); +- int ret; + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, +- POS_MIN, BTREE_ITER_intent); +- k = bch2_btree_iter_peek(trans, &iter); +- ret = bkey_err(k); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek(&iter); ++ int ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + +- for (i = 0; i < nr_snapids; i++) { +- k = 
bch2_btree_iter_prev_slot(trans, &iter); ++ for (unsigned i = 0; i < nr_snapids; i++) { ++ k = bch2_btree_iter_prev_slot(&iter); + ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + if (!k.k || !k.k->p.offset) { +- ret = -BCH_ERR_ENOSPC_snapshot_create; +- goto err; ++ return bch_err_throw(c, ENOSPC_snapshot_create); + } + + n = bch2_bkey_alloc(trans, &iter, 0, snapshot); + ret = PTR_ERR_OR_ZERO(n); + if (ret) +- goto err; ++ return ret; + + n->v.flags = 0; + n->v.parent = cpu_to_le32(parent); +@@ -1219,7 +1253,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, + n->v.btime.lo = cpu_to_le64(bch2_current_time(c)); + n->v.btime.hi = 0; + +- for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) ++ for (unsigned j = 0; j < ARRAY_SIZE(n->v.skip); j++) + n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); + + bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); +@@ -1228,13 +1262,12 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, + ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + if (ret) +- goto err; ++ return ret; + + new_snapids[i] = iter.pos.offset; + } +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ ++ return 0; + } + + /* +@@ -1245,14 +1278,9 @@ static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 par + u32 *snapshot_subvols, + unsigned nr_snapids) + { +- struct btree_iter iter; +- struct bkey_i_snapshot *n_parent; +- int ret = 0; +- +- n_parent = bch2_bkey_get_mut_typed(trans, &iter, +- BTREE_ID_snapshots, POS(0, parent), +- 0, snapshot); +- ret = PTR_ERR_OR_ZERO(n_parent); ++ struct bkey_i_snapshot *n_parent = ++ bch2_bkey_get_mut_typed(trans, BTREE_ID_snapshots, POS(0, parent), 0, snapshot); ++ int ret = PTR_ERR_OR_ZERO(n_parent); + if (unlikely(ret)) { + if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot %u not found", parent); +@@ -1261,22 +1289,19 @@ static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 par + + if (n_parent->v.children[0] || n_parent->v.children[1]) { + bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); +- ret = -EINVAL; +- goto err; ++ return -EINVAL; + } + + ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), + new_snapids, snapshot_subvols, nr_snapids); + if (ret) +- goto err; ++ return ret; + + n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); + n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); + n_parent->v.subvol = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return 0; + } + + /* +@@ -1336,67 +1361,47 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + * that key to snapshot leaf nodes, where we can mutate it + */ + +-struct snapshot_interior_delete { +- u32 id; +- u32 live_child; +-}; +-typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; +- + static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) + { +- darray_for_each(*l, i) +- if (i->id == id) +- return i->live_child; +- return 0; ++ struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id); ++ return i ? 
i->live_child : 0; + } + +-static unsigned __live_child(struct snapshot_table *t, u32 id, +- snapshot_id_list *delete_leaves, +- interior_delete_list *delete_interior) ++static unsigned live_child(struct bch_fs *c, u32 start) + { +- struct snapshot_t *s = __snapshot_t(t, id); +- if (!s) +- return 0; ++ struct snapshot_delete *d = &c->snapshot_delete; + +- for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) +- if (s->children[i] && +- !snapshot_list_has_id(delete_leaves, s->children[i]) && +- !interior_delete_has_id(delete_interior, s->children[i])) +- return s->children[i]; +- +- for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) { +- u32 live_child = s->children[i] +- ? __live_child(t, s->children[i], delete_leaves, delete_interior) +- : 0; +- if (live_child) +- return live_child; +- } ++ guard(rcu)(); ++ struct snapshot_table *t = rcu_dereference(c->snapshots); ++ ++ for (u32 id = bch2_snapshot_tree_next(t, start); ++ id && id != start; ++ id = bch2_snapshot_tree_next(t, id)) ++ if (bch2_snapshot_is_leaf(c, id) && ++ !snapshot_list_has_id(&d->delete_leaves, id) && ++ !interior_delete_has_id(&d->delete_interior, id)) ++ return id; + + return 0; + } + +-static unsigned live_child(struct bch_fs *c, u32 id, +- snapshot_id_list *delete_leaves, +- interior_delete_list *delete_interior) ++static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id) + { +- rcu_read_lock(); +- u32 ret = __live_child(rcu_dereference(c->snapshots), id, +- delete_leaves, delete_interior); +- rcu_read_unlock(); +- return ret; ++ return snapshot_list_has_id(&d->delete_leaves, id) || ++ interior_delete_has_id(&d->delete_interior, id) != 0; + } + + static int delete_dead_snapshots_process_key(struct btree_trans *trans, + struct btree_iter *iter, +- struct bkey_s_c k, +- snapshot_id_list *delete_leaves, +- interior_delete_list *delete_interior) ++ struct bkey_s_c k) + { +- if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) ++ struct snapshot_delete *d = &trans->c->snapshot_delete; ++ ++ if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot)) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); + +- u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); ++ u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot); + if (live_child) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + int ret = PTR_ERR_OR_ZERO(new); +@@ -1405,86 +1410,241 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, + + new->k.p.snapshot = live_child; + +- struct btree_iter dst_iter; +- struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, +- iter->btree_id, new->k.p, +- BTREE_ITER_all_snapshots| +- BTREE_ITER_intent); ++ CLASS(btree_iter, dst_iter)(trans, iter->btree_id, new->k.p, ++ BTREE_ITER_all_snapshots|BTREE_ITER_intent); ++ struct bkey_s_c dst_k = bch2_btree_iter_peek_slot(&dst_iter); + ret = bkey_err(dst_k); + if (ret) + return ret; + +- ret = (bkey_deleted(dst_k.k) ++ return (bkey_deleted(dst_k.k) + ? bch2_trans_update(trans, &dst_iter, new, + BTREE_UPDATE_internal_snapshot_node) + : 0) ?: + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); +- bch2_trans_iter_exit(trans, &dst_iter); +- return ret; + } + + return 0; + } + ++static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum) ++{ ++ struct bch_fs *c = trans->c; ++ struct snapshot_delete *d = &c->snapshot_delete; ++ ++ u64 inum = iter->btree_id != BTREE_ID_inodes ++ ? 
iter->pos.inode ++ : iter->pos.offset; ++ ++ if (*prev_inum == inum) ++ return false; ++ ++ *prev_inum = inum; ++ ++ bool ret = !snapshot_list_has_id(&d->deleting_from_trees, ++ bch2_snapshot_tree(c, iter->pos.snapshot)); ++ if (unlikely(ret)) { ++ struct bpos pos = iter->pos; ++ pos.snapshot = 0; ++ if (iter->btree_id != BTREE_ID_inodes) ++ pos.offset = U64_MAX; ++ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(pos)); ++ } ++ ++ return ret; ++} ++ ++static int delete_dead_snapshot_keys_v1(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct snapshot_delete *d = &c->snapshot_delete; ++ ++ for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { ++ struct disk_reservation res = { 0 }; ++ u64 prev_inum = 0; ++ ++ d->pos.pos = POS_MIN; ++ ++ if (!btree_type_has_snapshots(d->pos.btree)) ++ continue; ++ ++ int ret = for_each_btree_key_commit(trans, iter, ++ d->pos.btree, POS_MIN, ++ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ++ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ d->pos.pos = iter.pos; ++ ++ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) ++ continue; ++ ++ delete_dead_snapshots_process_key(trans, &iter, k); ++ })); ++ ++ bch2_disk_reservation_put(c, &res); ++ ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree, ++ struct bpos start, struct bpos end) ++{ ++ struct bch_fs *c = trans->c; ++ struct snapshot_delete *d = &c->snapshot_delete; ++ struct disk_reservation res = { 0 }; ++ ++ d->pos.btree = btree; ++ d->pos.pos = POS_MIN; ++ ++ int ret = for_each_btree_key_max_commit(trans, iter, ++ btree, start, end, ++ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ++ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ d->pos.pos = iter.pos; ++ delete_dead_snapshots_process_key(trans, &iter, k); ++ })); ++ ++ bch2_disk_reservation_put(c, &res); ++ return ret; ++} ++ ++static int delete_dead_snapshot_keys_v2(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct snapshot_delete *d = &c->snapshot_delete; ++ struct disk_reservation res = { 0 }; ++ u64 prev_inum = 0; ++ int ret = 0; ++ ++ struct btree_iter iter; ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN, ++ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); ++ ++ while (1) { ++ struct bkey_s_c k; ++ ret = lockrestart_do(trans, ++ bkey_err(k = bch2_btree_iter_peek(&iter))); ++ if (ret) ++ break; ++ ++ if (!k.k) ++ break; ++ ++ d->pos.btree = iter.btree_id; ++ d->pos.pos = iter.pos; ++ ++ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) ++ continue; ++ ++ if (snapshot_id_dying(d, k.k->p.snapshot)) { ++ struct bpos start = POS(k.k->p.offset, 0); ++ struct bpos end = POS(k.k->p.offset, U64_MAX); ++ ++ ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?: ++ delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?: ++ delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end); ++ if (ret) ++ break; ++ ++ bch2_btree_iter_set_pos(&iter, POS(0, k.k->p.offset + 1)); ++ } else { ++ bch2_btree_iter_advance(&iter); ++ } ++ } ++ bch2_trans_iter_exit(&iter); ++ ++ if (ret) ++ goto err; ++ ++ prev_inum = 0; ++ ret = for_each_btree_key_commit(trans, iter, ++ BTREE_ID_inodes, POS_MIN, ++ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ++ &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ d->pos.btree = iter.btree_id; ++ d->pos.pos = iter.pos; ++ ++ if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) ++ continue; ++ ++ 
delete_dead_snapshots_process_key(trans, &iter, k); ++ })); ++err: ++ bch2_disk_reservation_put(c, &res); ++ return ret; ++} ++ + /* + * For a given snapshot, if it doesn't have a subvolume that points to it, and + * it doesn't have child snapshot nodes - it's now redundant and we can mark it + * as deleted. + */ +-static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, +- snapshot_id_list *delete_leaves, +- interior_delete_list *delete_interior) ++static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k) + { + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + struct bch_fs *c = trans->c; ++ struct snapshot_delete *d = &c->snapshot_delete; + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + unsigned live_children = 0; ++ int ret = 0; + + if (BCH_SNAPSHOT_SUBVOL(s.v)) + return 0; + ++ if (BCH_SNAPSHOT_DELETED(s.v)) ++ return 0; ++ ++ guard(mutex)(&d->progress_lock); + for (unsigned i = 0; i < 2; i++) { + u32 child = le32_to_cpu(s.v->children[i]); + + live_children += child && +- !snapshot_list_has_id(delete_leaves, child); ++ !snapshot_list_has_id(&d->delete_leaves, child); + } + ++ u32 tree = bch2_snapshot_tree(c, s.k->p.offset); ++ + if (live_children == 0) { +- return snapshot_list_add(c, delete_leaves, s.k->p.offset); ++ ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: ++ snapshot_list_add(c, &d->delete_leaves, s.k->p.offset); + } else if (live_children == 1) { +- struct snapshot_interior_delete d = { ++ struct snapshot_interior_delete n = { + .id = s.k->p.offset, +- .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), ++ .live_child = live_child(c, s.k->p.offset), + }; + +- if (!d.live_child) { +- bch_err(c, "error finding live child of snapshot %u", d.id); +- return -EINVAL; ++ if (!n.live_child) { ++ bch_err(c, "error finding live child of snapshot %u", n.id); ++ ret = -EINVAL; ++ } else { ++ ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: ++ darray_push(&d->delete_interior, n); + } +- +- return darray_push(delete_interior, d); +- } else { +- return 0; + } ++ ++ return ret; + } + + static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, + interior_delete_list *skip) + { +- rcu_read_lock(); ++ guard(rcu)(); ++ struct snapshot_table *t = rcu_dereference(c->snapshots); ++ + while (interior_delete_has_id(skip, id)) +- id = __bch2_snapshot_parent(c, id); ++ id = __bch2_snapshot_parent(t, id); + + while (n--) { + do { +- id = __bch2_snapshot_parent(c, id); ++ id = __bch2_snapshot_parent(t, id); + } while (interior_delete_has_id(skip, id)); + } +- rcu_read_unlock(); + + return id; + } +@@ -1498,6 +1658,9 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, + struct bkey_i_snapshot *s; + int ret; + ++ if (!bch2_snapshot_exists(c, k.k->p.offset)) ++ return 0; ++ + if (k.k->type != KEY_TYPE_snapshot) + return 0; + +@@ -1545,69 +1708,73 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, + return bch2_trans_update(trans, iter, &s->k_i, 0); + } + +-int bch2_delete_dead_snapshots(struct bch_fs *c) ++static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d) + { +- if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) +- return 0; ++ prt_printf(out, "deleting from trees"); ++ darray_for_each(d->deleting_from_trees, i) ++ prt_printf(out, " %u", *i); ++ ++ prt_printf(out, "deleting leaves"); ++ darray_for_each(d->delete_leaves, i) ++ prt_printf(out, " 
%u", *i); ++ prt_newline(out); ++ ++ prt_printf(out, "interior"); ++ darray_for_each(d->delete_interior, i) ++ prt_printf(out, " %u->%u", i->id, i->live_child); ++ prt_newline(out); ++} + +- struct btree_trans *trans = bch2_trans_get(c); +- snapshot_id_list delete_leaves = {}; +- interior_delete_list delete_interior = {}; ++int __bch2_delete_dead_snapshots(struct bch_fs *c) ++{ ++ struct snapshot_delete *d = &c->snapshot_delete; + int ret = 0; + ++ if (!mutex_trylock(&d->lock)) ++ return 0; ++ ++ if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) { ++ mutex_unlock(&d->lock); ++ return 0; ++ } ++ ++ CLASS(btree_trans, trans)(c); ++ + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ ++ d->running = true; ++ d->pos = BBPOS_MIN; ++ + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, +- check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); ++ check_should_delete_snapshot(trans, k)); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "walking snapshots"); + if (ret) + goto err; + +- if (!delete_leaves.nr && !delete_interior.nr) ++ if (!d->delete_leaves.nr && !d->delete_interior.nr) + goto err; + + { +- struct printbuf buf = PRINTBUF; +- prt_printf(&buf, "deleting leaves"); +- darray_for_each(delete_leaves, i) +- prt_printf(&buf, " %u", *i); +- +- prt_printf(&buf, " interior"); +- darray_for_each(delete_interior, i) +- prt_printf(&buf, " %u->%u", i->id, i->live_child); ++ CLASS(printbuf, buf)(); ++ bch2_snapshot_delete_nodes_to_text(&buf, d); + + ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); +- printbuf_exit(&buf); + if (ret) + goto err; + } + +- for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { +- struct disk_reservation res = { 0 }; +- +- if (!btree_type_has_snapshots(btree)) +- continue; +- +- ret = for_each_btree_key_commit(trans, iter, +- btree, POS_MIN, +- BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, +- &res, NULL, BCH_TRANS_COMMIT_no_enospc, +- delete_dead_snapshots_process_key(trans, &iter, k, +- &delete_leaves, +- &delete_interior)); +- +- bch2_disk_reservation_put(c, &res); +- +- if (!bch2_err_matches(ret, EROFS)) +- bch_err_msg(c, ret, "deleting keys from dying snapshots"); +- if (ret) +- goto err; +- } ++ ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2) ++ ? 
delete_dead_snapshot_keys_v2(trans) ++ : delete_dead_snapshot_keys_v1(trans); ++ if (!bch2_err_matches(ret, EROFS)) ++ bch_err_msg(c, ret, "deleting keys from dying snapshots"); ++ if (ret) ++ goto err; + +- darray_for_each(delete_leaves, i) { ++ darray_for_each(d->delete_leaves, i) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); + if (!bch2_err_matches(ret, EROFS)) +@@ -1624,11 +1791,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, + BTREE_ITER_intent, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); ++ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior)); + if (ret) + goto err; + +- darray_for_each(delete_interior, i) { ++ darray_for_each(d->delete_interior, i) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, i->id)); + if (!bch2_err_matches(ret, EROFS)) +@@ -1637,33 +1804,64 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) + goto err; + } + err: +- darray_exit(&delete_interior); +- darray_exit(&delete_leaves); +- bch2_trans_put(trans); +- if (!bch2_err_matches(ret, EROFS)) +- bch_err_fn(c, ret); ++ scoped_guard(mutex, &d->progress_lock) { ++ darray_exit(&d->deleting_from_trees); ++ darray_exit(&d->delete_interior); ++ darray_exit(&d->delete_leaves); ++ d->running = false; ++ } ++ ++ bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots); ++ ++ mutex_unlock(&d->lock); + return ret; + } + ++int bch2_delete_dead_snapshots(struct bch_fs *c) ++{ ++ if (!c->opts.auto_snapshot_deletion) ++ return 0; ++ ++ return __bch2_delete_dead_snapshots(c); ++} ++ + void bch2_delete_dead_snapshots_work(struct work_struct *work) + { +- struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); ++ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work); + + set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); + + bch2_delete_dead_snapshots(c); +- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); + } + + void bch2_delete_dead_snapshots_async(struct bch_fs *c) + { +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots)) ++ if (!c->opts.auto_snapshot_deletion) ++ return; ++ ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots)) + return; + + BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + +- if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) +- bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); ++ if (!queue_work(system_long_wq, &c->snapshot_delete.work)) ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); ++} ++ ++void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct snapshot_delete *d = &c->snapshot_delete; ++ ++ if (!d->running) { ++ prt_str(out, "(not running)"); ++ return; ++ } ++ ++ scoped_guard(mutex, &d->progress_lock) { ++ bch2_snapshot_delete_nodes_to_text(out, d); ++ bch2_bbpos_to_text(out, d->pos); ++ } + } + + int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, +@@ -1671,7 +1869,6 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + struct bpos pos) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; + struct bkey_s_c k; + int ret; + +@@ -1682,12 +1879,9 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + if (!bkey_eq(pos, k.k->p)) + break; 
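(Aside, illustrative only: the recurring conversion throughout these hunks is from
manual rcu_read_lock()/rcu_read_unlock() pairs to the scope-based cleanup helpers
from <linux/cleanup.h>.  A minimal sketch of the pattern — lookup_tree() is a
hypothetical helper, modelled on the bch2_snapshot_tree() conversion in the
snapshot.h hunks below:

	static u32 lookup_tree(struct bch_fs *c, u32 id)
	{
		guard(rcu)();	/* rcu_read_unlock() runs at every scope exit */
		const struct snapshot_t *s = snapshot_t(c, id);
		return s ? s->tree : 0;	/* no explicit unlock on any return path */
	}

The same idea backs the CLASS(btree_trans, ...), CLASS(btree_iter, ...) and
CLASS(printbuf, ...) conversions in this patch: the destructor registered for
the class replaces the manual bch2_trans_put()/bch2_trans_iter_exit()/
printbuf_exit() calls that the removed error-path labels used to perform.)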
+ +- if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { +- ret = 1; +- break; +- } ++ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) ++ return 1; + } +- bch2_trans_iter_exit(trans, &iter); + + return ret; + } +@@ -1704,7 +1898,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct + return 0; + + struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); +- if (BCH_SNAPSHOT_DELETED(snap.v) || ++ if (BCH_SNAPSHOT_WILL_DELETE(snap.v) || + interior_snapshot_needs_delete(snap)) + set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); + +@@ -1717,11 +1911,11 @@ int bch2_snapshots_read(struct bch_fs *c) + * Initializing the is_ancestor bitmaps requires ancestors to already be + * initialized - so mark in reverse: + */ +- int ret = bch2_trans_run(c, +- for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots, ++ CLASS(btree_trans, trans)(c); ++ int ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots, + POS_MAX, 0, k, + __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: +- bch2_check_snapshot_needs_deletion(trans, k))); ++ bch2_check_snapshot_needs_deletion(trans, k)); + bch_err_fn(c, ret); + + /* +@@ -1733,10 +1927,6 @@ int bch2_snapshots_read(struct bch_fs *c) + BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && + test_bit(BCH_FS_may_go_rw, &c->flags)); + +- if (bch2_err_matches(ret, EIO) || +- (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))) +- ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots); +- + return ret; + } + +@@ -1744,3 +1934,11 @@ void bch2_fs_snapshots_exit(struct bch_fs *c) + { + kvfree(rcu_dereference_protected(c->snapshots, true)); + } ++ ++void bch2_fs_snapshots_init_early(struct bch_fs *c) ++{ ++ INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work); ++ mutex_init(&c->snapshot_delete.lock); ++ mutex_init(&c->snapshot_delete.progress_lock); ++ mutex_init(&c->snapshots_unlinked_lock); ++} +diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h +index 81180181d7c9..fef32a0118c4 100644 +--- a/fs/bcachefs/snapshot.h ++++ b/fs/bcachefs/snapshot.h +@@ -46,12 +46,9 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) + + static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) + { +- rcu_read_lock(); ++ guard(rcu)(); + const struct snapshot_t *s = snapshot_t(c, id); +- id = s ? s->tree : 0; +- rcu_read_unlock(); +- +- return id; ++ return s ? 
s->tree : 0; + } + + static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +@@ -62,87 +59,84 @@ static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) + + static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) + { +- rcu_read_lock(); +- id = __bch2_snapshot_parent_early(c, id); +- rcu_read_unlock(); +- +- return id; ++ guard(rcu)(); ++ return __bch2_snapshot_parent_early(c, id); + } + +-static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) ++static inline u32 __bch2_snapshot_parent(struct snapshot_table *t, u32 id) + { +- const struct snapshot_t *s = snapshot_t(c, id); ++ const struct snapshot_t *s = __snapshot_t(t, id); + if (!s) + return 0; + + u32 parent = s->parent; + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + parent && +- s->depth != snapshot_t(c, parent)->depth + 1) ++ s->depth != __snapshot_t(t, parent)->depth + 1) + panic("id %u depth=%u parent %u depth=%u\n", +- id, snapshot_t(c, id)->depth, +- parent, snapshot_t(c, parent)->depth); ++ id, __snapshot_t(t, id)->depth, ++ parent, __snapshot_t(t, parent)->depth); + + return parent; + } + + static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) + { +- rcu_read_lock(); +- id = __bch2_snapshot_parent(c, id); +- rcu_read_unlock(); +- +- return id; ++ guard(rcu)(); ++ return __bch2_snapshot_parent(rcu_dereference(c->snapshots), id); + } + + static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) + { +- rcu_read_lock(); +- while (n--) +- id = __bch2_snapshot_parent(c, id); +- rcu_read_unlock(); ++ guard(rcu)(); ++ struct snapshot_table *t = rcu_dereference(c->snapshots); + ++ while (n--) ++ id = __bch2_snapshot_parent(t, id); + return id; + } + +-u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *, u32); ++u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *); + u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); + + static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) + { +- u32 parent; ++ guard(rcu)(); ++ struct snapshot_table *t = rcu_dereference(c->snapshots); + +- rcu_read_lock(); +- while ((parent = __bch2_snapshot_parent(c, id))) ++ u32 parent; ++ while ((parent = __bch2_snapshot_parent(t, id))) + id = parent; +- rcu_read_unlock(); +- + return id; + } + +-static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) ++static inline enum snapshot_id_state __bch2_snapshot_id_state(struct snapshot_table *t, u32 id) + { +- const struct snapshot_t *s = snapshot_t(c, id); +- return s ? s->live : 0; ++ const struct snapshot_t *s = __snapshot_t(t, id); ++ return s ? s->state : SNAPSHOT_ID_empty; + } + +-static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) ++static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id) + { +- rcu_read_lock(); +- bool ret = __bch2_snapshot_exists(c, id); +- rcu_read_unlock(); ++ guard(rcu)(); ++ return __bch2_snapshot_id_state(rcu_dereference(c->snapshots), id); ++} + +- return ret; ++static inline bool __bch2_snapshot_exists(struct snapshot_table *t, u32 id) ++{ ++ return __bch2_snapshot_id_state(t, id) == SNAPSHOT_ID_live; ++} ++ ++static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) ++{ ++ return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live; + } + + static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) + { +- rcu_read_lock(); ++ guard(rcu)(); + const struct snapshot_t *s = snapshot_t(c, id); +- int ret = s ? 
s->children[0] : -BCH_ERR_invalid_snapshot_node; +- rcu_read_unlock(); +- +- return ret; ++ return s ? s->children[0] : bch_err_throw(c, invalid_snapshot_node); + } + + static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) +@@ -155,13 +149,8 @@ static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) + + static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) + { +- u32 depth; +- +- rcu_read_lock(); +- depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; +- rcu_read_unlock(); +- +- return depth; ++ guard(rcu)(); ++ return parent ? snapshot_t(c, parent)->depth + 1 : 0; + } + + bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); +@@ -175,20 +164,14 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances + + static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) + { +- rcu_read_lock(); ++ guard(rcu)(); + const struct snapshot_t *t = snapshot_t(c, id); +- bool ret = t && (t->children[0]|t->children[1]) != 0; +- rcu_read_unlock(); +- +- return ret; ++ return t && (t->children[0]|t->children[1]) != 0; + } + + static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) + { +- darray_for_each(*s, i) +- if (*i == id) +- return true; +- return false; ++ return darray_find(*s, id) != NULL; + } + + static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) +@@ -241,10 +224,38 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, + int bch2_check_snapshot_trees(struct bch_fs *); + int bch2_check_snapshots(struct bch_fs *); + int bch2_reconstruct_snapshots(struct bch_fs *); +-int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); ++ ++int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); ++ ++static inline int bch2_check_key_has_snapshot(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot)) ++ ? 0 ++ : __bch2_check_key_has_snapshot(trans, iter, k); ++} ++ ++int __bch2_get_snapshot_overwrites(struct btree_trans *, ++ enum btree_id, struct bpos, ++ snapshot_id_list *); ++ ++/* ++ * Get a list of snapshot IDs that have overwritten a given key: ++ */ ++static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans, ++ enum btree_id btree, struct bpos pos, ++ snapshot_id_list *s) ++{ ++ darray_init(s); ++ ++ return bch2_snapshot_has_children(trans->c, pos.snapshot) ++ ? 
__bch2_get_snapshot_overwrites(trans, btree, pos, s) ++ : 0; ++ ++} + + int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); +-void bch2_delete_dead_snapshots_work(struct work_struct *); + + int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); + +@@ -259,7 +270,14 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + return __bch2_key_has_snapshot_overwrites(trans, id, pos); + } + ++int __bch2_delete_dead_snapshots(struct bch_fs *); ++int bch2_delete_dead_snapshots(struct bch_fs *); ++void bch2_delete_dead_snapshots_work(struct work_struct *); ++void bch2_delete_dead_snapshots_async(struct bch_fs *); ++void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *); ++ + int bch2_snapshots_read(struct bch_fs *); + void bch2_fs_snapshots_exit(struct bch_fs *); ++void bch2_fs_snapshots_init_early(struct bch_fs *); + + #endif /* _BCACHEFS_SNAPSHOT_H */ +diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h +index aabcd3a74cd9..9bccae1f3590 100644 +--- a/fs/bcachefs/snapshot_format.h ++++ b/fs/bcachefs/snapshot_format.h +@@ -15,10 +15,10 @@ struct bch_snapshot { + bch_le128 btime; + }; + +-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) +- ++LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1) + /* True if a subvolume points to this snapshot node: */ + LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) ++LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3) + + /* + * Snapshot trees: +diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h +new file mode 100644 +index 000000000000..a826c9c83c11 +--- /dev/null ++++ b/fs/bcachefs/snapshot_types.h +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SNAPSHOT_TYPES_H ++#define _BCACHEFS_SNAPSHOT_TYPES_H ++ ++#include "bbpos_types.h" ++#include "darray.h" ++#include "subvolume_types.h" ++ ++DEFINE_DARRAY_NAMED(snapshot_id_list, u32); ++ ++#define IS_ANCESTOR_BITMAP 128 ++ ++struct snapshot_t { ++ enum snapshot_id_state { ++ SNAPSHOT_ID_empty, ++ SNAPSHOT_ID_live, ++ SNAPSHOT_ID_deleted, ++ } state; ++ u32 parent; ++ u32 skip[3]; ++ u32 depth; ++ u32 children[2]; ++ u32 subvol; /* Nonzero only if a subvolume points to this node: */ ++ u32 tree; ++ unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; ++}; ++ ++struct snapshot_table { ++ struct rcu_head rcu; ++ size_t nr; ++#ifndef RUST_BINDGEN ++ DECLARE_FLEX_ARRAY(struct snapshot_t, s); ++#else ++ struct snapshot_t s[0]; ++#endif ++}; ++ ++struct snapshot_interior_delete { ++ u32 id; ++ u32 live_child; ++}; ++typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; ++ ++struct snapshot_delete { ++ struct mutex lock; ++ struct work_struct work; ++ ++ struct mutex progress_lock; ++ snapshot_id_list deleting_from_trees; ++ snapshot_id_list delete_leaves; ++ interior_delete_list delete_interior; ++ ++ bool running; ++ struct bbpos pos; ++}; ++ ++#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */ +diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c +index a90bf7b8a2b4..ce2a54902a64 100644 +--- a/fs/bcachefs/str_hash.c ++++ b/fs/bcachefs/str_hash.c +@@ -18,27 +18,27 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir + return ret; + return !ret; + } else { +- struct btree_iter iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, ++ CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, + SPOS(0, 
le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + return ret; + +- ret = bkey_is_inode(k.k); +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return bkey_is_inode(k.k); + } + } + +-static noinline int fsck_rename_dirent(struct btree_trans *trans, +- struct snapshots_seen *s, +- const struct bch_hash_desc desc, +- struct bch_hash_info *hash_info, +- struct bkey_s_c_dirent old) ++static int bch2_fsck_rename_dirent(struct btree_trans *trans, ++ struct snapshots_seen *s, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *hash_info, ++ struct bkey_s_c_dirent old, ++ bool *updated_before_k_pos) + { ++ struct bch_fs *c = trans->c; + struct qstr old_name = bch2_dirent_get_name(old); +- struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); ++ struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; +@@ -47,28 +47,39 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans, + dirent_copy_target(new, old); + new->k.p = old.k->p; + ++ char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20); ++ ret = PTR_ERR_OR_ZERO(renamed_buf); ++ if (ret) ++ return ret; ++ + for (unsigned i = 0; i < 1000; i++) { +- unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", +- old_name.len, old_name.name, i); +- unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0); ++ new->k.u64s = BKEY_U64s_MAX; + +- if (u64s > U8_MAX) +- return -EINVAL; ++ struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf, ++ sprintf(renamed_buf, "%.*s.fsck_renamed-%u", ++ old_name.len, old_name.name, i)); + +- new->k.u64s = u64s; ++ ret = bch2_dirent_init_name(c, new, hash_info, &renamed_name, NULL); ++ if (ret) ++ return ret; + + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + (subvol_inum) { 0, old.k->p.inode }, + old.k->p.snapshot, &new->k_i, +- BTREE_UPDATE_internal_snapshot_node); +- if (!bch2_err_matches(ret, EEXIST)) ++ BTREE_UPDATE_internal_snapshot_node| ++ STR_HASH_must_create); ++ if (ret && !bch2_err_matches(ret, EEXIST)) + break; ++ if (!ret) { ++ if (bpos_lt(new->k.p, old.k->p)) ++ *updated_before_k_pos = true; ++ break; ++ } + } + +- if (ret) +- return ret; +- +- return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); ++ ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); ++ bch_err_fn(c, ret); ++ return ret; + } + + static noinline int hash_pick_winner(struct btree_trans *trans, +@@ -101,17 +112,24 @@ static noinline int hash_pick_winner(struct btree_trans *trans, + } + } + +-static int repair_inode_hash_info(struct btree_trans *trans, +- struct bch_inode_unpacked *snapshot_root) ++/* ++ * str_hash lookups across snapshots break in wild ways if hash_info in ++ * different snapshot versions doesn't match - so if we find one mismatch, check ++ * them all ++ */ ++int bch2_repair_inode_hash_info(struct btree_trans *trans, ++ struct bch_inode_unpacked *snapshot_root) + { +- struct btree_iter iter; ++ struct bch_fs *c = trans->c; + struct bkey_s_c k; ++ CLASS(printbuf, buf)(); ++ bool need_commit = false; + int ret = 0; + +- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, +- SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), +- BTREE_ITER_all_snapshots, k, ret) { +- if (k.k->p.offset != snapshot_root->bi_inum) ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, ++ 
POS(0, snapshot_root->bi_inum), ++ BTREE_ITER_all_snapshots, k, ret) { ++ if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot))) + break; + if (!bkey_is_inode(k.k)) + continue; +@@ -121,20 +139,68 @@ static int repair_inode_hash_info(struct btree_trans *trans, + if (ret) + break; + +- if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || +- INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), +- trans, inode_snapshot_mismatch, +- "inode hash info in different snapshots don't match")) { ++ if (inode.bi_hash_seed == snapshot_root->bi_hash_seed && ++ INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) { ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root); ++ struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); ++ ++ BUG_ON(hash1.type != hash2.type || ++ memcmp(&hash1.siphash_key, ++ &hash2.siphash_key, ++ sizeof(hash1.siphash_key))); ++#endif ++ continue; ++ } ++ ++ printbuf_reset(&buf); ++ prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n", ++ snapshot_root->bi_inum, ++ inode.bi_snapshot, ++ snapshot_root->bi_snapshot); ++ ++ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode)); ++ prt_printf(&buf, " %llx\n", inode.bi_hash_seed); ++ ++ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); ++ prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed); ++ ++ if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) { + inode.bi_hash_seed = snapshot_root->bi_hash_seed; + SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); +- ret = __bch2_fsck_write_inode(trans, &inode) ?: +- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: +- -BCH_ERR_transaction_restart_nested; +- break; ++ ++ ret = __bch2_fsck_write_inode(trans, &inode); ++ if (ret) ++ break; ++ need_commit = true; + } + } ++ ++ if (ret) ++ return ret; ++ ++ if (!need_commit) { ++ printbuf_reset(&buf); ++ bch2_log_msg_start(c, &buf); ++ ++ prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", ++ snapshot_root->bi_inum); ++ ++ prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot); ++ bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); ++ prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed); ++#if 0 ++ prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); ++ bch2_prt_str_hash_type(&buf, hash_info->type); ++ prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); ++#endif ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ return bch_err_throw(c, fsck_repair_unimplemented); ++ } ++ ++ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: ++ bch_err_throw(c, transaction_restart_nested); + fsck_err: +- bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -144,47 +210,121 @@ static int repair_inode_hash_info(struct btree_trans *trans, + */ + static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, + struct bch_hash_info *hash_info) ++{ ++ struct bch_inode_unpacked snapshot_root; ++ int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root); ++ if (ret) ++ return ret; ++ ++ struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root); ++ if (hash_info->type != hash_root.type || ++ memcmp(&hash_info->siphash_key, ++ &hash_root.siphash_key, ++ sizeof(hash_root.siphash_key))) ++ ret = bch2_repair_inode_hash_info(trans, &snapshot_root); ++ ++ return ret; ++} ++ ++/* Put a str_hash key in its proper location, checking 
for duplicates */ ++int bch2_str_hash_repair_key(struct btree_trans *trans, ++ struct snapshots_seen *s, ++ const struct bch_hash_desc *desc, ++ struct bch_hash_info *hash_info, ++ struct btree_iter *k_iter, struct bkey_s_c k, ++ struct btree_iter *dup_iter, struct bkey_s_c dup_k, ++ bool *updated_before_k_pos) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; +- struct bkey_s_c k; ++ CLASS(printbuf, buf)(); ++ bool free_snapshots_seen = false; + int ret = 0; + +- for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), +- BTREE_ITER_all_snapshots, k, ret) { +- if (k.k->p.offset != inum) +- break; +- if (bkey_is_inode(k.k)) +- goto found; ++ if (!s) { ++ s = bch2_trans_kmalloc(trans, sizeof(*s)); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ goto out; ++ ++ s->pos = k_iter->pos; ++ darray_init(&s->ids); ++ ++ ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids); ++ if (ret) ++ goto out; ++ ++ free_snapshots_seen = true; + } +- bch_err(c, "%s(): inum %llu not found", __func__, inum); +- ret = -BCH_ERR_fsck_repair_unimplemented; +- goto err; +-found:; +- struct bch_inode_unpacked inode; +- ret = bch2_inode_unpack(k, &inode); +- if (ret) +- goto err; + +- struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); +- if (hash_info->type != hash2.type || +- memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) { +- ret = repair_inode_hash_info(trans, &inode); +- if (!ret) { +- bch_err(c, "inode hash info mismatch with root, but mismatch not found\n" +- "%u %llx %llx\n" +- "%u %llx %llx", +- hash_info->type, +- hash_info->siphash_key.k0, +- hash_info->siphash_key.k1, +- hash2.type, +- hash2.siphash_key.k0, +- hash2.siphash_key.k1); +- ret = -BCH_ERR_fsck_repair_unimplemented; ++ if (!dup_k.k) { ++ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ goto out; ++ ++ dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info, ++ (subvol_inum) { 0, new->k.p.inode }, ++ new->k.p.snapshot, new, ++ STR_HASH_must_create| ++ BTREE_ITER_with_updates| ++ BTREE_UPDATE_internal_snapshot_node); ++ ret = bkey_err(dup_k); ++ if (ret) ++ goto out; ++ if (dup_k.k) ++ goto duplicate_entries; ++ ++ if (bpos_lt(new->k.p, k.k->p)) ++ *updated_before_k_pos = true; ++ ++ ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id, ++ k_iter->pos, new->k.p) ?: ++ bch2_hash_delete_at(trans, *desc, hash_info, k_iter, ++ BTREE_ITER_with_updates| ++ BTREE_UPDATE_internal_snapshot_node) ?: ++ bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: ++ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: ++ bch_err_throw(c, transaction_restart_commit); ++ } else { ++duplicate_entries: ++ ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k); ++ if (ret < 0) ++ goto out; ++ ++ if (!fsck_err(trans, hash_table_key_duplicate, ++ "duplicate hash table keys%s:\n%s", ++ ret != 2 ? 
"" : ", both point to valid inodes", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), ++ prt_newline(&buf), ++ bch2_bkey_val_to_text(&buf, c, dup_k), ++ buf.buf))) ++ goto out; ++ ++ switch (ret) { ++ case 0: ++ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); ++ break; ++ case 1: ++ ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0); ++ break; ++ case 2: ++ ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info, ++ bkey_s_c_to_dirent(k), ++ updated_before_k_pos) ?: ++ bch2_hash_delete_at(trans, *desc, hash_info, k_iter, ++ BTREE_ITER_with_updates); ++ goto out; + } ++ ++ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: ++ bch_err_throw(c, transaction_restart_commit); + } +-err: +- bch2_trans_iter_exit(trans, &iter); ++out: ++fsck_err: ++ bch2_trans_iter_exit(dup_iter); ++ if (free_snapshots_seen) ++ darray_exit(&s->ids); + return ret; + } + +@@ -192,11 +332,12 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, +- struct btree_iter *k_iter, struct bkey_s_c hash_k) ++ struct btree_iter *k_iter, struct bkey_s_c hash_k, ++ bool *updated_before_k_pos) + { + struct bch_fs *c = trans->c; + struct btree_iter iter = {}; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + struct bkey_s_c k; + int ret = 0; + +@@ -204,92 +345,49 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, + if (hash_k.k->p.offset < hash) + goto bad_hash; + +- for_each_btree_key_norestart(trans, iter, desc->btree_id, +- SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), +- BTREE_ITER_slots, k, ret) { ++ bch2_trans_iter_init(trans, &iter, desc->btree_id, ++ SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), ++ BTREE_ITER_slots| ++ BTREE_ITER_with_updates); ++ ++ for_each_btree_key_continue_norestart(iter, ++ BTREE_ITER_slots| ++ BTREE_ITER_with_updates, k, ret) { + if (bkey_eq(k.k->p, hash_k.k->p)) + break; + + if (k.k->type == desc->key_type && +- !desc->cmp_bkey(k, hash_k)) +- goto duplicate_entries; ++ !desc->cmp_bkey(k, hash_k)) { ++ ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, ++ hash_info) ?: ++ bch2_str_hash_repair_key(trans, s, desc, hash_info, ++ k_iter, hash_k, ++ &iter, k, updated_before_k_pos); ++ break; ++ } + +- if (bkey_deleted(k.k)) { +- bch2_trans_iter_exit(trans, &iter); ++ if (bkey_deleted(k.k)) + goto bad_hash; +- } + } +-out: +- bch2_trans_iter_exit(trans, &iter); +- printbuf_exit(&buf); ++ bch2_trans_iter_exit(&iter); ++fsck_err: + return ret; + bad_hash: ++ bch2_trans_iter_exit(&iter); + /* + * Before doing any repair, check hash_info itself: + */ + ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info); + if (ret) +- goto out; ++ return ret; + + if (fsck_err(trans, hash_table_key_wrong_offset, +- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", +- bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, +- (printbuf_reset(&buf), +- bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { +- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); +- if (IS_ERR(new)) +- return PTR_ERR(new); +- +- k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info, +- (subvol_inum) { 0, hash_k.k->p.inode }, +- hash_k.k->p.snapshot, new, +- STR_HASH_must_create| +- BTREE_ITER_with_updates| +- BTREE_UPDATE_internal_snapshot_node); +- ret = bkey_err(k); +- if (ret) +- goto out; +- if (k.k) +- goto duplicate_entries; +- +- ret = 
bch2_hash_delete_at(trans, *desc, hash_info, k_iter, +- BTREE_UPDATE_internal_snapshot_node) ?: +- bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: +- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: +- -BCH_ERR_transaction_restart_nested; +- goto out; +- } +-fsck_err: +- goto out; +-duplicate_entries: +- ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k); +- if (ret < 0) +- goto out; +- +- if (!fsck_err(trans, hash_table_key_duplicate, +- "duplicate hash table keys%s:\n%s", +- ret != 2 ? "" : ", both point to valid inodes", +- (printbuf_reset(&buf), +- bch2_bkey_val_to_text(&buf, c, hash_k), +- prt_newline(&buf), +- bch2_bkey_val_to_text(&buf, c, k), +- buf.buf))) +- goto out; +- +- switch (ret) { +- case 0: +- ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); +- break; +- case 1: +- ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0); +- break; +- case 2: +- ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: +- bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); +- goto out; +- } +- +- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: +- -BCH_ERR_transaction_restart_nested; +- goto out; ++ "hash table key at wrong offset: should be at %llu\n%s", ++ hash, ++ (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) ++ ret = bch2_str_hash_repair_key(trans, s, desc, hash_info, ++ k_iter, hash_k, ++ &iter, bkey_s_c_null, ++ updated_before_k_pos); ++ return ret; + } +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index 0c1a00539bd1..8c0fb44929cc 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -32,6 +32,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) + } + + struct bch_hash_info { ++ u32 inum_snapshot; + u8 type; + struct unicode_map *cf_encoding; + /* +@@ -45,11 +46,10 @@ static inline struct bch_hash_info + bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) + { + struct bch_hash_info info = { +- .type = INODE_STR_HASH(bi), +-#ifdef CONFIG_UNICODE +- .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, +-#endif +- .siphash_key = { .k0 = bi->bi_hash_seed } ++ .inum_snapshot = bi->bi_snapshot, ++ .type = INODE_STR_HASH(bi), ++ .cf_encoding = bch2_inode_casefold(c, bi) ? 
c->cf_encoding : NULL, ++ .siphash_key = { .k0 = bi->bi_hash_seed } + }; + + if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { +@@ -159,8 +159,11 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + +- for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, +- SPOS(inum.inum, desc.hash_key(info, key), snapshot), ++ bch2_trans_iter_init(trans, iter, ++ desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), ++ BTREE_ITER_slots|flags); ++ ++ for_each_btree_key_max_continue_norestart(*iter, + POS(inum.inum, U64_MAX), + BTREE_ITER_slots|flags, k, ret) { + if (is_visible_key(desc, inum, k)) { +@@ -173,9 +176,9 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, + break; + } + } +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + +- return bkey_s_c_err(ret ?: -BCH_ERR_ENOENT_str_hash_lookup); ++ return bkey_s_c_err(ret ?: bch_err_throw(trans->c, ENOENT_str_hash_lookup)); + } + + static __always_inline struct bkey_s_c +@@ -209,15 +212,18 @@ bch2_hash_hole(struct btree_trans *trans, + if (ret) + return ret; + +- for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, +- SPOS(inum.inum, desc.hash_key(info, key), snapshot), ++ bch2_trans_iter_init(trans, iter, desc.btree_id, ++ SPOS(inum.inum, desc.hash_key(info, key), snapshot), ++ BTREE_ITER_slots|BTREE_ITER_intent); ++ ++ for_each_btree_key_max_continue_norestart(*iter, + POS(inum.inum, U64_MAX), + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) + if (!is_visible_key(desc, inum, k)) + return 0; +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(iter); + +- return ret ?: -BCH_ERR_ENOSPC_str_hash_create; ++ return ret ?: bch_err_throw(trans->c, ENOSPC_str_hash_create); + } + + static __always_inline +@@ -230,11 +236,11 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + +- bch2_trans_copy_iter(trans, &iter, start); ++ bch2_trans_copy_iter(&iter, start); + +- bch2_btree_iter_advance(trans, &iter); ++ bch2_btree_iter_advance(&iter); + +- for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) { ++ for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) { + if (k.k->type != desc.key_type && + k.k->type != KEY_TYPE_hash_whiteout) + break; +@@ -246,7 +252,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, + } + } + +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + +@@ -259,15 +265,19 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, + struct bkey_i *insert, + enum btree_iter_update_trigger_flags flags) + { ++ struct bch_fs *c = trans->c; + struct btree_iter slot = {}; + struct bkey_s_c k; + bool found = false; + int ret; + +- for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, ++ bch2_trans_iter_init(trans, iter, desc.btree_id, + SPOS(insert->k.p.inode, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), ++ BTREE_ITER_slots|BTREE_ITER_intent|flags); ++ ++ for_each_btree_key_max_continue_norestart(*iter, + POS(insert->k.p.inode, U64_MAX), + BTREE_ITER_slots|BTREE_ITER_intent|flags, k, ret) { + if (is_visible_key(desc, inum, k)) { +@@ -279,26 +289,26 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, + } + + if (!slot.path && !(flags & STR_HASH_must_replace)) +- bch2_trans_copy_iter(trans, &slot, iter); ++ bch2_trans_copy_iter(&slot, iter); + + if (k.k->type != KEY_TYPE_hash_whiteout) + goto not_found; + } + + if (!ret) +- ret = 
-BCH_ERR_ENOSPC_str_hash_create; ++ ret = bch_err_throw(c, ENOSPC_str_hash_create); + out: +- bch2_trans_iter_exit(trans, &slot); +- bch2_trans_iter_exit(trans, iter); ++ bch2_trans_iter_exit(&slot); ++ bch2_trans_iter_exit(iter); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; + found: + found = true; + not_found: + if (found && (flags & STR_HASH_must_create)) { +- bch2_trans_iter_exit(trans, &slot); ++ bch2_trans_iter_exit(&slot); + return k; + } else if (!found && (flags & STR_HASH_must_replace)) { +- ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; ++ ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace); + } else { + if (!found && slot.path) + swap(*iter, slot); +@@ -325,8 +335,8 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, + if (ret) + return ret; + if (k.k) { +- bch2_trans_iter_exit(trans, &iter); +- return -BCH_ERR_EEXIST_str_hash_set; ++ bch2_trans_iter_exit(&iter); ++ return bch_err_throw(trans->c, EEXIST_str_hash_set); + } + + return 0; +@@ -388,22 +398,34 @@ int bch2_hash_delete(struct btree_trans *trans, + return ret; + + ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_exit(&iter); + return ret; + } + ++int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *); ++ + struct snapshots_seen; ++int bch2_str_hash_repair_key(struct btree_trans *, ++ struct snapshots_seen *, ++ const struct bch_hash_desc *, ++ struct bch_hash_info *, ++ struct btree_iter *, struct bkey_s_c, ++ struct btree_iter *, struct bkey_s_c, ++ bool *); ++ + int __bch2_str_hash_check_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc *, + struct bch_hash_info *, +- struct btree_iter *, struct bkey_s_c); ++ struct btree_iter *, struct bkey_s_c, ++ bool *); + + static inline int bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, +- struct btree_iter *k_iter, struct bkey_s_c hash_k) ++ struct btree_iter *k_iter, struct bkey_s_c hash_k, ++ bool *updated_before_k_pos) + { + if (hash_k.k->type != desc->key_type) + return 0; +@@ -411,7 +433,8 @@ static inline int bch2_str_hash_check_key(struct btree_trans *trans, + if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) + return 0; + +- return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k); ++ return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k, ++ updated_before_k_pos); + } + + #endif /* _BCACHEFS_STR_HASH_H */ +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index d0209f7658bb..6023ae46ca72 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -3,6 +3,7 @@ + #include "bcachefs.h" + #include "btree_key_cache.h" + #include "btree_update.h" ++#include "enumerated_ref.h" + #include "errcode.h" + #include "error.h" + #include "fs.h" +@@ -14,6 +15,21 @@ + + static int bch2_subvolume_delete(struct btree_trans *, u32); + ++static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid) ++{ ++ CLASS(printbuf, buf)(); ++ bch2_log_msg_start(c, &buf); ++ ++ prt_printf(&buf, "missing subvolume %u", subvolid); ++ bool print = bch2_count_fsck_err(c, subvol_missing, &buf); ++ ++ int ret = bch2_run_explicit_recovery_pass(c, &buf, ++ BCH_RECOVERY_PASS_check_inodes, 0); ++ if (print) ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ return ret; ++} ++ + static struct bpos subvolume_children_pos(struct bkey_s_c k) + { + if (k.k->type != KEY_TYPE_subvolume) +@@ 
-30,144 +46,142 @@ static int check_subvol(struct btree_trans *trans, + struct bkey_s_c k) + { + struct bch_fs *c = trans->c; +- struct bkey_s_c_subvolume subvol; +- struct btree_iter subvol_children_iter = {}; ++ struct bch_subvolume subvol; + struct bch_snapshot snapshot; +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + unsigned snapid; + int ret = 0; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + +- subvol = bkey_s_c_to_subvolume(k); +- snapid = le32_to_cpu(subvol.v->snapshot); ++ bkey_val_copy(&subvol, bkey_s_c_to_subvolume(k)); ++ snapid = le32_to_cpu(subvol.snapshot); + ret = bch2_snapshot_lookup(trans, snapid, &snapshot); + + if (bch2_err_matches(ret, ENOENT)) +- return bch2_run_explicit_recovery_pass(c, ++ return bch2_run_print_explicit_recovery_pass(c, + BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; + if (ret) + return ret; + +- if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { ++ if (BCH_SUBVOLUME_UNLINKED(&subvol)) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); +- return ret ?: -BCH_ERR_transaction_restart_nested; ++ return ret ?: bch_err_throw(c, transaction_restart_nested); + } + +- if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && +- subvol.v->fs_path_parent, ++ if (fsck_err_on(k.k->p.offset == BCACHEFS_ROOT_SUBVOL && ++ subvol.fs_path_parent, + trans, subvol_root_fs_path_parent_nonzero, + "root subvolume has nonzero fs_path_parent\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bkey_i_subvolume *n = +- bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); ++ bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) +- goto err; ++ return ret; + + n->v.fs_path_parent = 0; + } + +- if (subvol.v->fs_path_parent) { +- struct bpos pos = subvolume_children_pos(k); +- +- struct bkey_s_c subvol_children_k = +- bch2_bkey_get_iter(trans, &subvol_children_iter, +- BTREE_ID_subvolume_children, pos, 0); ++ if (subvol.fs_path_parent) { ++ CLASS(btree_iter, subvol_children_iter)(trans, ++ BTREE_ID_subvolume_children, subvolume_children_pos(k), 0); ++ struct bkey_s_c subvol_children_k = bch2_btree_iter_peek_slot(&subvol_children_iter); + ret = bkey_err(subvol_children_k); + if (ret) +- goto err; ++ return ret; + + if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, + trans, subvol_children_not_set, + "subvolume not set in subvolume_children btree at %llu:%llu\n%s", +- pos.inode, pos.offset, ++ subvol_children_iter.pos.inode, subvol_children_iter.pos.offset, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { +- ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true); ++ ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, subvol_children_iter.pos, true); + if (ret) +- goto err; ++ return ret; + } + } + + struct bch_inode_unpacked inode; + ret = bch2_inode_find_by_inum_nowarn_trans(trans, +- (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, ++ (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.inode) }, + &inode); + if (!ret) { +- if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, ++ if (fsck_err_on(inode.bi_subvol != k.k->p.offset, + trans, subvol_root_wrong_bi_subvol, + "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", + inode.bi_inum, inode.bi_snapshot, +- inode.bi_subvol, subvol.k->p.offset)) { +- inode.bi_subvol = subvol.k->p.offset; +- inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot); ++ inode.bi_subvol, k.k->p.offset)) 
{ ++ inode.bi_subvol = k.k->p.offset; ++ inode.bi_snapshot = le32_to_cpu(subvol.snapshot); + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) +- goto err; ++ return ret; + } + } else if (bch2_err_matches(ret, ENOENT)) { + if (fsck_err(trans, subvol_to_missing_root, + "subvolume %llu points to missing subvolume root %llu:%u", +- k.k->p.offset, le64_to_cpu(subvol.v->inode), +- le32_to_cpu(subvol.v->snapshot))) { +- ret = bch2_subvolume_delete(trans, iter->pos.offset); +- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); +- ret = ret ?: -BCH_ERR_transaction_restart_nested; +- goto err; ++ k.k->p.offset, le64_to_cpu(subvol.inode), ++ le32_to_cpu(subvol.snapshot))) { ++ /* ++ * Recreate - any contents that are still disconnected ++ * will then get reattached under lost+found ++ */ ++ bch2_inode_init_early(c, &inode); ++ bch2_inode_init_late(c, &inode, bch2_current_time(c), ++ 0, 0, S_IFDIR|0700, 0, NULL); ++ inode.bi_inum = le64_to_cpu(subvol.inode); ++ inode.bi_snapshot = le32_to_cpu(subvol.snapshot); ++ inode.bi_subvol = k.k->p.offset; ++ inode.bi_parent_subvol = le32_to_cpu(subvol.fs_path_parent); ++ ret = __bch2_fsck_write_inode(trans, &inode); ++ if (ret) ++ return ret; + } + } else { +- goto err; ++ return ret; + } + +- if (!BCH_SUBVOLUME_SNAP(subvol.v)) { +- u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); +- u32 snapshot_tree; +- struct bch_snapshot_tree st; +- +- rcu_read_lock(); +- snapshot_tree = snapshot_t(c, snapshot_root)->tree; +- rcu_read_unlock(); ++ if (!BCH_SUBVOLUME_SNAP(&subvol)) { ++ u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.snapshot)); ++ u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root); + ++ struct bch_snapshot_tree st; + ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "%s: snapshot tree %u not found", __func__, snapshot_tree); + + if (ret) +- goto err; ++ return ret; + +- if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, ++ if (fsck_err_on(le32_to_cpu(st.master_subvol) != k.k->p.offset, + trans, subvol_not_master_and_not_snapshot, + "subvolume %llu is not set as snapshot but is not master subvolume", + k.k->p.offset)) { + struct bkey_i_subvolume *s = +- bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); ++ bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) +- goto err; ++ return ret; + + SET_BCH_SUBVOLUME_SNAP(&s->v, true); + } + } +-err: + fsck_err: +- bch2_trans_iter_exit(trans, &subvol_children_iter); +- printbuf_exit(&buf); + return ret; + } + + int bch2_check_subvols(struct bch_fs *c) + { +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, ++ CLASS(btree_trans, trans)(c); ++ return for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- check_subvol(trans, &iter, k))); +- bch_err_fn(c, ret); +- return ret; ++ check_subvol(trans, &iter, k)); + } + + static int check_subvol_child(struct btree_trans *trans, +@@ -196,13 +210,11 @@ static int check_subvol_child(struct btree_trans *trans, + + int bch2_check_subvol_children(struct bch_fs *c) + { +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, ++ CLASS(btree_trans, trans)(c); ++ return for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- check_subvol_child(trans, &iter, 
k))); +- bch_err_fn(c, ret); +- return 0; ++ check_subvol_child(trans, &iter, k)); + } + + /* Subvolumes: */ +@@ -242,6 +254,13 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, + prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); + prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); + } ++ ++ if (BCH_SUBVOLUME_RO(s.v)) ++ prt_printf(out, " ro"); ++ if (BCH_SUBVOLUME_SNAP(s.v)) ++ prt_printf(out, " snapshot"); ++ if (BCH_SUBVOLUME_UNLINKED(s.v)) ++ prt_printf(out, " unlinked"); + } + + static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) +@@ -273,14 +292,11 @@ int bch2_subvolume_trigger(struct btree_trans *trans, + + int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) + { +- struct btree_iter iter; +- +- bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); +- struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter); +- bch2_trans_iter_exit(trans, &iter); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_subvolume_children, POS(subvol, 0), 0); ++ struct bkey_s_c k = bch2_btree_iter_peek(&iter); + + return bkey_err(k) ?: k.k && k.k->p.inode == subvol +- ? -BCH_ERR_ENOTEMPTY_subvol_not_empty ++ ? bch_err_throw(trans->c, ENOTEMPTY_subvol_not_empty) + : 0; + } + +@@ -292,9 +308,8 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), + BTREE_ITER_cached| + BTREE_ITER_with_updates, subvolume, s); +- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && +- inconsistent_if_not_found, +- trans->c, "missing subvolume %u", subvol); ++ if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found) ++ ret = bch2_subvolume_missing(trans->c, subvol) ?: ret; + return ret; + } + +@@ -319,7 +334,8 @@ int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) + + int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol) + { +- return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol)); ++ CLASS(btree_trans, trans)(c); ++ return lockrestart_do(trans, bch2_subvol_is_ro_trans(trans, subvol)); + } + + int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, +@@ -334,22 +350,16 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, + int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, + u32 *snapid, bool warn) + { +- struct btree_iter iter; +- struct bkey_s_c_subvolume subvol; +- int ret; +- +- subvol = bch2_bkey_get_iter_typed(trans, &iter, +- BTREE_ID_subvolumes, POS(0, subvolid), +- BTREE_ITER_cached|BTREE_ITER_with_updates, +- subvolume); +- ret = bkey_err(subvol); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_subvolumes, POS(0, subvolid), ++ BTREE_ITER_cached|BTREE_ITER_with_updates); ++ struct bkey_s_c_subvolume subvol = bch2_bkey_get_typed(&iter, subvolume); ++ int ret = bkey_err(subvol); + +- bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c, +- "missing subvolume %u", subvolid); ++ if (bch2_err_matches(ret, ENOENT)) ++ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; + + if (likely(!ret)) + *snapid = le32_to_cpu(subvol.v->snapshot); +- bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -410,42 +420,35 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d + */ + static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) + { +- struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {}; +- +- struct 
bkey_s_c_subvolume subvol = +- bch2_bkey_get_iter_typed(trans, &subvol_iter, +- BTREE_ID_subvolumes, POS(0, subvolid), +- BTREE_ITER_cached|BTREE_ITER_intent, +- subvolume); ++ CLASS(btree_iter, subvol_iter)(trans, BTREE_ID_subvolumes, POS(0, subvolid), ++ BTREE_ITER_cached|BTREE_ITER_intent); ++ struct bkey_s_c_subvolume subvol = bch2_bkey_get_typed(&subvol_iter, subvolume); + int ret = bkey_err(subvol); +- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, +- "missing subvolume %u", subvolid); ++ if (bch2_err_matches(ret, ENOENT)) ++ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; + if (ret) +- goto err; ++ return ret; + + u32 snapid = le32_to_cpu(subvol.v->snapshot); + +- struct bkey_s_c_snapshot snapshot = +- bch2_bkey_get_iter_typed(trans, &snapshot_iter, +- BTREE_ID_snapshots, POS(0, snapid), +- 0, snapshot); ++ CLASS(btree_iter, snapshot_iter)(trans, BTREE_ID_snapshots, POS(0, snapid), 0); ++ struct bkey_s_c_snapshot snapshot = bch2_bkey_get_typed(&snapshot_iter, snapshot); + ret = bkey_err(snapshot); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing snapshot %u", snapid); + if (ret) +- goto err; ++ return ret; + + u32 treeid = le32_to_cpu(snapshot.v->tree); + ++ CLASS(btree_iter, snapshot_tree_iter)(trans, BTREE_ID_snapshot_trees, POS(0, treeid), 0); + struct bkey_s_c_snapshot_tree snapshot_tree = +- bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter, +- BTREE_ID_snapshot_trees, POS(0, treeid), +- 0, snapshot_tree); ++ bch2_bkey_get_typed(&snapshot_tree_iter, snapshot_tree); + ret = bkey_err(snapshot_tree); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing snapshot tree %u", treeid); + if (ret) +- goto err; ++ return ret; + + if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) { + struct bkey_i_snapshot_tree *snapshot_tree_mut = +@@ -454,48 +457,48 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(snapshot_tree_mut); + if (ret) +- goto err; ++ return ret; + + snapshot_tree_mut->v.master_subvol = 0; + } + +- ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?: ++ return bch2_btree_delete_at(trans, &subvol_iter, 0) ?: + bch2_snapshot_node_set_deleted(trans, snapid); +-err: +- bch2_trans_iter_exit(trans, &snapshot_tree_iter); +- bch2_trans_iter_exit(trans, &snapshot_iter); +- bch2_trans_iter_exit(trans, &subvol_iter); +- return ret; + } + + static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) + { +- return bch2_subvolumes_reparent(trans, subvolid) ?: ++ int ret = bch2_subvolumes_reparent(trans, subvolid) ?: + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_subvolume_delete(trans, subvolid)); ++ ++ bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols); ++ return ret; + } + + static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) + { + struct bch_fs *c = container_of(work, struct bch_fs, + snapshot_wait_for_pagecache_and_delete_work); +- snapshot_id_list s; +- u32 *id; + int ret = 0; + + while (!ret) { +- mutex_lock(&c->snapshots_unlinked_lock); +- s = c->snapshots_unlinked; +- darray_init(&c->snapshots_unlinked); +- mutex_unlock(&c->snapshots_unlinked_lock); ++ snapshot_id_list s; ++ ++ scoped_guard(mutex, &c->snapshots_unlinked_lock) { ++ s = c->snapshots_unlinked; ++ darray_init(&c->snapshots_unlinked); ++ } + + if (!s.nr) + break; + + bch2_evict_subvolume_inodes(c, &s); + +- for (id = s.data; id < s.data + s.nr; id++) { +- ret = 
bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); ++ CLASS(btree_trans, trans)(c); ++ ++ darray_for_each(s, id) { ++ ret = bch2_subvolume_delete(trans, *id); + bch_err_msg(c, ret, "deleting subvolume %u", *id); + if (ret) + break; +@@ -504,7 +507,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor + darray_exit(&s); + } + +- bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); + } + + struct subvolume_unlink_hook { +@@ -519,31 +522,25 @@ static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans + struct bch_fs *c = trans->c; + int ret = 0; + +- mutex_lock(&c->snapshots_unlinked_lock); +- if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) +- ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); +- mutex_unlock(&c->snapshots_unlinked_lock); ++ scoped_guard(mutex, &c->snapshots_unlinked_lock) ++ if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) ++ ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); + + if (ret) + return ret; + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache)) + return -EROFS; + + if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) +- bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); + return 0; + } + + int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) + { +- struct btree_iter iter; +- struct bkey_i_subvolume *n; +- struct subvolume_unlink_hook *h; +- int ret = 0; +- +- h = bch2_trans_kmalloc(trans, sizeof(*h)); +- ret = PTR_ERR_OR_ZERO(h); ++ struct subvolume_unlink_hook *h = bch2_trans_kmalloc(trans, sizeof(*h)); ++ int ret = PTR_ERR_OR_ZERO(h); + if (ret) + return ret; + +@@ -551,19 +548,17 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) + h->subvol = subvolid; + bch2_trans_commit_hook(trans, &h->h); + +- n = bch2_bkey_get_mut_typed(trans, &iter, +- BTREE_ID_subvolumes, POS(0, subvolid), +- BTREE_ITER_cached, subvolume); ++ struct bkey_i_subvolume *n = ++ bch2_bkey_get_mut_typed(trans, BTREE_ID_subvolumes, POS(0, subvolid), ++ BTREE_ITER_cached, subvolume); + ret = PTR_ERR_OR_ZERO(n); +- if (unlikely(ret)) { +- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, +- "missing subvolume %u", subvolid); ++ if (bch2_err_matches(ret, ENOENT)) ++ ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; ++ if (unlikely(ret)) + return ret; +- } + + SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); + n->v.fs_path_parent = 0; +- bch2_trans_iter_exit(trans, &iter); + return ret; + } + +@@ -575,7 +570,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + bool ro) + { + struct bch_fs *c = trans->c; +- struct btree_iter dst_iter, src_iter = {}; ++ struct btree_iter dst_iter; + struct bkey_i_subvolume *new_subvol = NULL; + struct bkey_i_subvolume *src_subvol = NULL; + u32 parent = 0, new_nodes[2], snapshot_subvols[2]; +@@ -584,7 +579,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + ret = bch2_bkey_get_empty_slot(trans, &dst_iter, + BTREE_ID_subvolumes, POS(0, U32_MAX)); + if (ret == -BCH_ERR_ENOSPC_btree_slot) +- ret = -BCH_ERR_ENOSPC_subvolume_create; ++ ret = bch_err_throw(c, ENOSPC_subvolume_create); + if (ret) + return ret; + +@@ -594,15 +589,13 @@ int bch2_subvolume_create(struct btree_trans 
*trans, u64 inode, + if (src_subvolid) { + /* Creating a snapshot: */ + +- src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, +- BTREE_ID_subvolumes, POS(0, src_subvolid), +- BTREE_ITER_cached, subvolume); ++ src_subvol = bch2_bkey_get_mut_typed(trans, BTREE_ID_subvolumes, POS(0, src_subvolid), ++ BTREE_ITER_cached, subvolume); + ret = PTR_ERR_OR_ZERO(src_subvol); +- if (unlikely(ret)) { +- bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, +- "subvolume %u not found", src_subvolid); ++ if (bch2_err_matches(ret, ENOENT)) ++ ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret; ++ if (unlikely(ret)) + goto err; +- } + + parent = le32_to_cpu(src_subvol->v.snapshot); + } +@@ -613,12 +606,8 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + if (ret) + goto err; + +- if (src_subvolid) { ++ if (src_subvolid) + src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); +- ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); +- if (ret) +- goto err; +- } + + new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume); + ret = PTR_ERR_OR_ZERO(new_subvol); +@@ -639,8 +628,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + *new_subvolid = new_subvol->k.p.offset; + *new_snapshotid = new_nodes[0]; + err: +- bch2_trans_iter_exit(trans, &src_iter); +- bch2_trans_iter_exit(trans, &dst_iter); ++ bch2_trans_iter_exit(&dst_iter); + return ret; + } + +@@ -649,7 +637,6 @@ int bch2_initialize_subvolumes(struct bch_fs *c) + struct bkey_i_snapshot_tree root_tree; + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; +- int ret; + + bkey_snapshot_tree_init(&root_tree.k_i); + root_tree.k.p.offset = 1; +@@ -670,57 +657,44 @@ int bch2_initialize_subvolumes(struct bch_fs *c) + root_volume.v.snapshot = cpu_to_le32(U32_MAX); + root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); + +- ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?: ++ return bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0, 0); +- bch_err_fn(c, ret); +- return ret; + } + + static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) + { +- struct btree_iter iter; +- struct bkey_s_c k; +- struct bch_inode_unpacked inode; +- int ret; +- +- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, +- SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); +- ret = bkey_err(k); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_inodes, SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); ++ int ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_is_inode(k.k)) { +- bch_err(trans->c, "root inode not found"); +- ret = -BCH_ERR_ENOENT_inode; +- goto err; ++ struct bch_fs *c = trans->c; ++ bch_err(c, "root inode not found"); ++ return bch_err_throw(c, ENOENT_inode); + } + ++ struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(k, &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + +- ret = bch2_inode_write(trans, &iter, &inode); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return bch2_inode_write(trans, &iter, &inode); + } + + /* set bi_subvol on root inode */ + int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) + { +- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- __bch2_fs_upgrade_for_subvolumes(trans)); +- bch_err_fn(c, ret); +- return ret; ++ 
CLASS(btree_trans, trans)(c); ++ return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ++ __bch2_fs_upgrade_for_subvolumes(trans)); + } + +-int bch2_fs_subvolumes_init(struct bch_fs *c) ++void bch2_fs_subvolumes_init_early(struct bch_fs *c) + { +- INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); + INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, + bch2_subvolume_wait_for_pagecache_and_delete); +- mutex_init(&c->snapshots_unlinked_lock); +- return 0; + } +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +index f640c1e3d639..b6d7c1f4a256 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -33,59 +33,52 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32); + int bch2_subvol_is_ro(struct bch_fs *, u32); + + static inline struct bkey_s_c +-bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter, +- struct bpos end, u32 subvolid, unsigned flags) ++bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end, ++ u32 subvolid, unsigned flags) + { + u32 snapshot; +- int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot); ++ int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot); + if (ret) + return bkey_s_c_err(ret); + +- bch2_btree_iter_set_snapshot(trans, iter, snapshot); +- return bch2_btree_iter_peek_max_type(trans, iter, end, flags); ++ bch2_btree_iter_set_snapshot(iter, snapshot); ++ return bch2_btree_iter_peek_max_type(iter, end, flags); + } + + #define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ + _end, _subvolid, _flags, _k, _do) \ + ({ \ +- struct bkey_s_c _k; \ + int _ret3 = 0; \ + \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ +- (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\ ++ struct bkey_s_c _k = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter),\ + _end, _subvolid, (_flags)); \ + if (!(_k).k) \ + break; \ + \ + bkey_err(_k) ?: (_do); \ + })); \ +- } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ ++ } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ + \ +- bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret3; \ + }) + + #define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \ + _start, _end, _subvolid, _flags, _k, _do) \ + ({ \ +- struct btree_iter _iter; \ +- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ +- (_start), (_flags)); \ ++ CLASS(btree_iter, _iter)((_trans), (_btree_id), (_start), (_flags)); \ + \ + for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ + _end, _subvolid, _flags, _k, _do); \ + }) + +-int bch2_delete_dead_snapshots(struct bch_fs *); +-void bch2_delete_dead_snapshots_async(struct bch_fs *); +- + int bch2_subvolume_unlink(struct btree_trans *, u32); + int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); + + int bch2_initialize_subvolumes(struct bch_fs *); + int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); + +-int bch2_fs_subvolumes_init(struct bch_fs *); ++void bch2_fs_subvolumes_init_early(struct bch_fs *); + + #endif /* _BCACHEFS_SUBVOLUME_H */ +diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h +index 1549d6daf7af..9d634b906dcd 100644 +--- a/fs/bcachefs/subvolume_types.h ++++ b/fs/bcachefs/subvolume_types.h +@@ -2,33 +2,6 @@ + #ifndef _BCACHEFS_SUBVOLUME_TYPES_H + #define _BCACHEFS_SUBVOLUME_TYPES_H + +-#include "darray.h" +- +-typedef DARRAY(u32) snapshot_id_list; +- +-#define IS_ANCESTOR_BITMAP 128 +- +-struct snapshot_t { +- bool live; 
+- u32 parent; +- u32 skip[3]; +- u32 depth; +- u32 children[2]; +- u32 subvol; /* Nonzero only if a subvolume points to this node: */ +- u32 tree; +- unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; +-}; +- +-struct snapshot_table { +- struct rcu_head rcu; +- size_t nr; +-#ifndef RUST_BINDGEN +- DECLARE_FLEX_ARRAY(struct snapshot_t, s); +-#else +- struct snapshot_t s[0]; +-#endif +-}; +- + typedef struct { + /* we can't have padding in this struct: */ + u64 subvol; +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index cb5d960aed92..be7ed612d28f 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -68,36 +68,35 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta + + int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) + { +- int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && +- version <= c->sb.version_incompat_allowed) +- ? 0 +- : -BCH_ERR_may_not_use_incompat_feature; +- +- mutex_lock(&c->sb_lock); +- if (!ret) { +- SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, +- max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); +- bch2_write_super(c); ++ if (((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && ++ version <= c->sb.version_incompat_allowed)) { ++ guard(mutex)(&c->sb_lock); ++ ++ if (version > c->sb.version_incompat) { ++ SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, ++ max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); ++ bch2_write_super(c); ++ } ++ return 0; + } else { +- darray_for_each(c->incompat_versions_requested, i) +- if (version == *i) +- goto out; +- +- darray_push(&c->incompat_versions_requested, version); +- struct printbuf buf = PRINTBUF; +- prt_str(&buf, "requested incompat feature "); +- bch2_version_to_text(&buf, version); +- prt_str(&buf, " currently not enabled"); +- prt_printf(&buf, "\n set version_upgrade=incompat to enable"); +- +- bch_notice(c, "%s", buf.buf); +- printbuf_exit(&buf); +- } ++ BUILD_BUG_ON(BCH_VERSION_MAJOR(bcachefs_metadata_version_current) != 1); + +-out: +- mutex_unlock(&c->sb_lock); ++ unsigned minor = BCH_VERSION_MINOR(version); + +- return ret; ++ if (!test_bit(minor, c->incompat_versions_requested) && ++ !test_and_set_bit(minor, c->incompat_versions_requested)) { ++ CLASS(printbuf, buf)(); ++ prt_str(&buf, "requested incompat feature "); ++ bch2_version_to_text(&buf, version); ++ prt_str(&buf, " currently not enabled, allowed up to "); ++ bch2_version_to_text(&buf, version); ++ prt_printf(&buf, "\n set version_upgrade=incompat to enable"); ++ ++ bch_notice(c, "%s", buf.buf); ++ } ++ ++ return bch_err_throw(c, may_not_use_incompat_feature); ++ } + } + + const char * const bch2_sb_fields[] = { +@@ -202,12 +201,11 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) + u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; + + if (new_bytes > max_bytes) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + prt_bdevname(&buf, sb->bdev); + prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes); + pr_err("%s", buf.buf); +- printbuf_exit(&buf); + return -BCH_ERR_ENOSPC_sb; + } + } +@@ -260,11 +258,11 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, + + /* XXX: we're not checking that offline device have enough space */ + +- for_each_online_member(c, ca) { ++ for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) { + struct bch_sb_handle *dev_sb = &ca->disk_sb; + + if (bch2_sb_realloc(dev_sb, 
le32_to_cpu(dev_sb->sb->u64s) + d)) { +- percpu_ref_put(&ca->io_ref[READ]); ++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize); + return NULL; + } + } +@@ -384,7 +382,6 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) + int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, + enum bch_validate_flags flags, struct printbuf *out) + { +- struct bch_sb_field_members_v1 *mi; + enum bch_opt_id opt_id; + int ret; + +@@ -468,6 +465,9 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); + } + ++ if (sb->nr_devices > 1) ++ SET_BCH_SB_MULTI_DEVICE(sb, true); ++ + if (!flags) { + /* + * Been seeing a bug where these are getting inexplicably +@@ -536,14 +536,17 @@ int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, + } + } + ++ struct bch_sb_field *mi = ++ bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?: ++ bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1); ++ + /* members must be validated first: */ +- mi = bch2_sb_field_get(sb, members_v1); + if (!mi) { + prt_printf(out, "Invalid superblock: member info area missing"); + return -BCH_ERR_invalid_sb_members_missing; + } + +- ret = bch2_sb_field_validate(sb, &mi->field, flags, out); ++ ret = bch2_sb_field_validate(sb, mi, flags, out); + if (ret) + return ret; + +@@ -612,20 +615,21 @@ static void bch2_sb_update(struct bch_fs *c) + + c->sb.features = le64_to_cpu(src->features[0]); + c->sb.compat = le64_to_cpu(src->compat[0]); ++ c->sb.multi_device = BCH_SB_MULTI_DEVICE(src); + + memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); + + struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); + if (ext) { ++ c->sb.recovery_passes_required = ++ bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); ++ + le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, + sizeof(c->sb.errors_silent) * 8); + c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); + } + +- for_each_member_device(c, ca) { +- struct bch_member m = bch2_sb_member_get(src, ca->dev_idx); +- ca->mi = bch2_mi_to_cpu(&m); +- } ++ bch2_sb_members_to_cpu(c); + } + + static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) +@@ -776,8 +780,8 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, + { + u64 offset = opt_get(*opts, sb); + struct bch_sb_layout layout; +- struct printbuf err = PRINTBUF; +- struct printbuf err2 = PRINTBUF; ++ CLASS(printbuf, err)(); ++ CLASS(printbuf, err2)(); + __le64 *i; + int ret; + #ifndef __KERNEL__ +@@ -852,7 +856,6 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, + else + bch2_print_opts(opts, KERN_ERR "%s", err2.buf); + +- printbuf_exit(&err2); + printbuf_reset(&err); + + /* +@@ -918,15 +921,14 @@ static int __bch2_read_super(const char *path, struct bch_opts *opts, + path, err.buf); + goto err_no_print; + } +-out: +- printbuf_exit(&err); +- return ret; ++ ++ return 0; + err: + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n", + path, err.buf); + err_no_print: + bch2_free_super(sb); +- goto out; ++ return ret; + } + + int bch2_read_super(const char *path, struct bch_opts *opts, +@@ -961,7 +963,7 @@ static void write_super_endio(struct bio *bio) + } + + closure_put(&ca->fs->sb_write); +- percpu_ref_put(&ca->io_ref[READ]); ++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); + } + + static void read_back_super(struct bch_fs *c, struct bch_dev *ca) +@@ -979,7 +981,7 
@@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); + +- percpu_ref_get(&ca->io_ref[READ]); ++ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); + closure_bio_submit(bio, &c->sb_write); + } + +@@ -994,7 +996,12 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) + sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), + null_nonce(), sb); + +- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); ++ /* ++ * blk-wbt.c throttles all writes except those that have both REQ_SYNC ++ * and REQ_IDLE set... ++ */ ++ ++ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_IDLE|REQ_META); + bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; +@@ -1005,14 +1012,14 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], + bio_sectors(bio)); + +- percpu_ref_get(&ca->io_ref[READ]); ++ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); + closure_bio_submit(bio, &c->sb_write); + } + + int bch2_write_super(struct bch_fs *c) + { + struct closure *cl = &c->sb_write; +- struct printbuf err = PRINTBUF; ++ CLASS(printbuf, err)(); + unsigned sb = 0, nr_wrote; + struct bch_devs_mask sb_written; + bool wrote, can_mount_without_written, can_mount_with_written; +@@ -1022,7 +1029,7 @@ int bch2_write_super(struct bch_fs *c) + + trace_and_count(c, write_super, c, _RET_IP_); + +- if (c->opts.very_degraded) ++ if (c->opts.degraded == BCH_DEGRADED_very) + degraded_flags |= BCH_FORCE_IF_LOST; + + lockdep_assert_held(&c->sb_lock); +@@ -1037,13 +1044,13 @@ int bch2_write_super(struct bch_fs *c) + * For now, we expect to be able to call write_super() when we're not + * yet RW: + */ +- for_each_online_member(c, ca) { ++ for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) { + ret = darray_push(&online_devices, ca); + if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) { +- percpu_ref_put(&ca->io_ref[READ]); ++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); + goto out; + } +- percpu_ref_get(&ca->io_ref[READ]); ++ enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); + } + + /* Make sure we're using the new magic numbers: */ +@@ -1094,15 +1101,14 @@ int bch2_write_super(struct bch_fs *c) + goto out; + + if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + prt_printf(&buf, "attempting to write superblock that wasn't version downgraded ("); + bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version)); + prt_str(&buf, " > "); + bch2_version_to_text(&buf, bcachefs_metadata_version_current); + prt_str(&buf, ")"); + bch2_fs_fatal_error(c, ": %s", buf.buf); +- printbuf_exit(&buf); +- ret = -BCH_ERR_sb_not_downgraded; ++ ret = bch_err_throw(c, sb_not_downgraded); + goto out; + } + +@@ -1122,7 +1128,7 @@ int bch2_write_super(struct bch_fs *c) + continue; + + if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + prt_char(&buf, ' '); + prt_bdevname(&buf, ca->disk_sb.bdev); + prt_printf(&buf, +@@ -1132,17 +1138,15 @@ int bch2_write_super(struct bch_fs *c) + + if (c->opts.errors != BCH_ON_ERROR_continue && + c->opts.errors != BCH_ON_ERROR_fix_safe) { +- ret = -BCH_ERR_erofs_sb_err; ++ ret = 
bch_err_throw(c, erofs_sb_err); + bch2_fs_fatal_error(c, "%s", buf.buf); + } else { + bch_err(c, "%s", buf.buf); + } +- +- printbuf_exit(&buf); + } + + if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + prt_char(&buf, ' '); + prt_bdevname(&buf, ca->disk_sb.bdev); + prt_printf(&buf, +@@ -1150,8 +1154,7 @@ int bch2_write_super(struct bch_fs *c) + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); + bch2_fs_fatal_error(c, "%s", buf.buf); +- printbuf_exit(&buf); +- ret = -BCH_ERR_erofs_sb_err; ++ ret = bch_err_throw(c, erofs_sb_err); + } + } + +@@ -1205,26 +1208,24 @@ int bch2_write_super(struct bch_fs *c) + !can_mount_with_written), c, + ": Unable to write superblock to sufficient devices (from %ps)", + (void *) _RET_IP_)) +- ret = -BCH_ERR_erofs_sb_err; ++ ret = bch_err_throw(c, erofs_sb_err); + out: + /* Make new options visible after they're persistent: */ + bch2_sb_update(c); + darray_for_each(online_devices, ca) +- percpu_ref_put(&(*ca)->io_ref[READ]); ++ enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super); + darray_exit(&online_devices); +- printbuf_exit(&err); + return ret; + } + + void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) + { +- mutex_lock(&c->sb_lock); +- if (!(c->sb.features & (1ULL << feat))) { +- c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); ++ guard(mutex)(&c->sb_lock); ++ if (!(c->sb.features & BIT_ULL(feat))) { ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BIT_ULL(feat)); + + bch2_write_super(c); + } +- mutex_unlock(&c->sb_lock); + } + + /* Downgrade if superblock is at a higher version than currently supported: */ +@@ -1270,6 +1271,29 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) + } + } + ++void bch2_sb_upgrade_incompat(struct bch_fs *c) ++{ ++ guard(mutex)(&c->sb_lock); ++ ++ if (c->sb.version == c->sb.version_incompat_allowed) ++ return; ++ ++ CLASS(printbuf, buf)(); ++ ++ prt_str(&buf, "Now allowing incompatible features up to "); ++ bch2_version_to_text(&buf, c->sb.version); ++ prt_str(&buf, ", previously allowed up to "); ++ bch2_version_to_text(&buf, c->sb.version_incompat_allowed); ++ prt_newline(&buf); ++ ++ bch_notice(c, "%s", buf.buf); ++ ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); ++ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, ++ max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version)); ++ bch2_write_super(c); ++} ++ + static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) + { +@@ -1333,7 +1357,7 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) + { + unsigned type = le32_to_cpu(f->type); +- struct printbuf field_err = PRINTBUF; ++ CLASS(printbuf, field_err)(); + const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); + int ret; + +@@ -1345,7 +1369,6 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, + bch2_sb_field_to_text(err, sb, f); + } + +- printbuf_exit(&field_err); + return ret; + } + +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +index 78f708a6fbcd..a3b7a90f2533 100644 +--- a/fs/bcachefs/super-io.h ++++ b/fs/bcachefs/super-io.h +@@ -107,6 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) + + bool bch2_check_version_downgrade(struct bch_fs *); + void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); ++void 
bch2_sb_upgrade_incompat(struct bch_fs *); + + void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index 84a37d971ffd..b0019488f586 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -10,6 +10,8 @@ + #include "bcachefs.h" + #include "alloc_background.h" + #include "alloc_foreground.h" ++#include "async_objs.h" ++#include "backpointers.h" + #include "bkey_sort.h" + #include "btree_cache.h" + #include "btree_gc.h" +@@ -28,6 +30,7 @@ + #include "disk_accounting.h" + #include "disk_groups.h" + #include "ec.h" ++#include "enumerated_ref.h" + #include "errcode.h" + #include "error.h" + #include "fs.h" +@@ -48,6 +51,7 @@ + #include "quota.h" + #include "rebalance.h" + #include "recovery.h" ++#include "recovery_passes.h" + #include "replicas.h" + #include "sb-clean.h" + #include "sb-counters.h" +@@ -75,15 +79,56 @@ MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Kent Overstreet "); + MODULE_DESCRIPTION("bcachefs filesystem"); + +-const char * const bch2_fs_flag_strs[] = { ++typedef DARRAY(struct bch_sb_handle) bch_sb_handles; ++ + #define x(n) #n, ++const char * const bch2_fs_flag_strs[] = { + BCH_FS_FLAGS() +-#undef x + NULL + }; + +-void bch2_print_str(struct bch_fs *c, const char *str) ++const char * const bch2_write_refs[] = { ++ BCH_WRITE_REFS() ++ NULL ++}; ++ ++const char * const bch2_dev_read_refs[] = { ++ BCH_DEV_READ_REFS() ++ NULL ++}; ++ ++const char * const bch2_dev_write_refs[] = { ++ BCH_DEV_WRITE_REFS() ++ NULL ++}; ++#undef x ++ ++static bool should_print_loglevel(struct bch_fs *c, const char *fmt) ++{ ++ unsigned loglevel_opt = c->loglevel ?: c->opts.verbose ? 7: 6; ++ ++ bool have_soh = fmt[0] == KERN_SOH[0]; ++ bool have_loglevel = have_soh && fmt[1] >= '0' && fmt[1] <= '9'; ++ ++ unsigned loglevel = have_loglevel ++ ? fmt[1] - '0' ++ : c->prev_loglevel; ++ ++ if (have_loglevel) ++ c->prev_loglevel = loglevel; ++ ++ return loglevel <= loglevel_opt; ++} ++ ++void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str) + { ++ if (!should_print_loglevel(c, prefix)) ++ return; ++ ++#ifndef __KERNEL__ ++ prefix = ""; ++#endif ++ + #ifdef __KERNEL__ + struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); + +@@ -92,7 +137,7 @@ void bch2_print_str(struct bch_fs *c, const char *str) + return; + } + #endif +- bch2_print_string_as_lines(KERN_ERR, str); ++ bch2_print_string_as_lines(prefix, str); + } + + __printf(2, 0) +@@ -122,6 +167,14 @@ void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) + + void __bch2_print(struct bch_fs *c, const char *fmt, ...) 
+ { ++ if (!should_print_loglevel(c, fmt)) ++ return; ++ ++#ifndef __KERNEL__ ++ if (fmt[0] == KERN_SOH[0]) ++ fmt += 2; ++#endif ++ + struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); + + va_list args; +@@ -186,23 +239,17 @@ static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); + + struct bch_fs *bch2_dev_to_fs(dev_t dev) + { +- struct bch_fs *c; +- +- mutex_lock(&bch_fs_list_lock); +- rcu_read_lock(); ++ guard(mutex)(&bch_fs_list_lock); ++ guard(rcu)(); + ++ struct bch_fs *c; + list_for_each_entry(c, &bch_fs_list, list) + for_each_member_device_rcu(c, ca, NULL) + if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { + closure_get(&c->cl); +- goto found; ++ return c; + } +- c = NULL; +-found: +- rcu_read_unlock(); +- mutex_unlock(&bch_fs_list_lock); +- +- return c; ++ return NULL; + } + + static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) +@@ -220,14 +267,11 @@ static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) + + struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) + { +- struct bch_fs *c; ++ guard(mutex)(&bch_fs_list_lock); + +- mutex_lock(&bch_fs_list_lock); +- c = __bch2_uuid_to_fs(uuid); ++ struct bch_fs *c = __bch2_uuid_to_fs(uuid); + if (c) + closure_get(&c->cl); +- mutex_unlock(&bch_fs_list_lock); +- + return c; + } + +@@ -297,15 +341,13 @@ static void __bch2_fs_read_only(struct bch_fs *c) + } + } + +-#ifndef BCH_WRITE_REF_DEBUG +-static void bch2_writes_disabled(struct percpu_ref *writes) ++static void bch2_writes_disabled(struct enumerated_ref *writes) + { + struct bch_fs *c = container_of(writes, struct bch_fs, writes); + + set_bit(BCH_FS_write_disable_complete, &c->flags); + wake_up(&bch2_read_only_wait); + } +-#endif + + void bch2_fs_read_only(struct bch_fs *c) + { +@@ -323,12 +365,7 @@ void bch2_fs_read_only(struct bch_fs *c) + * writes will return -EROFS: + */ + set_bit(BCH_FS_going_ro, &c->flags); +-#ifndef BCH_WRITE_REF_DEBUG +- percpu_ref_kill(&c->writes); +-#else +- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) +- bch2_write_ref_put(c, i); +-#endif ++ enumerated_ref_stop_async(&c->writes); + + /* + * If we're not doing an emergency shutdown, we want to wait on +@@ -366,7 +403,7 @@ void bch2_fs_read_only(struct bch_fs *c) + !test_bit(BCH_FS_emergency_ro, &c->flags) && + test_bit(BCH_FS_started, &c->flags) && + test_bit(BCH_FS_clean_shutdown, &c->flags) && +- c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { ++ c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) { + BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); + BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); + BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); +@@ -378,9 +415,8 @@ void bch2_fs_read_only(struct bch_fs *c) + bch2_fs_mark_clean(c); + } else { + /* Make sure error counts/counters are persisted */ +- mutex_lock(&c->sb_lock); ++ guard(mutex)(&c->sb_lock); + bch2_write_super(c); +- mutex_unlock(&c->sb_lock); + + bch_verbose(c, "done going read-only, filesystem not clean"); + } +@@ -391,9 +427,8 @@ static void bch2_fs_read_only_work(struct work_struct *work) + struct bch_fs *c = + container_of(work, struct bch_fs, read_only_work); + +- down_write(&c->state_lock); ++ guard(rwsem_write)(&c->state_lock); + bch2_fs_read_only(c); +- up_write(&c->state_lock); + } + + static void bch2_fs_read_only_async(struct bch_fs *c) +@@ -412,6 +447,30 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) + return ret; + } + ++static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out, ++ bool locked) ++{ ++ bool ret = 
!test_and_set_bit(BCH_FS_emergency_ro, &c->flags); ++ ++ if (!locked) ++ bch2_journal_halt(&c->journal); ++ else ++ bch2_journal_halt_locked(&c->journal); ++ bch2_fs_read_only_async(c); ++ wake_up(&bch2_read_only_wait); ++ ++ if (ret) ++ prt_printf(out, "emergency read only at seq %llu\n", ++ journal_cur_seq(&c->journal)); ++ ++ return ret; ++} ++ ++bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out) ++{ ++ return __bch2_fs_emergency_read_only2(c, out, false); ++} ++ + bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) + { + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); +@@ -429,9 +488,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + + BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + ++ if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) ++ return bch_err_throw(c, erofs_no_alloc_info); ++ + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { + bch_err(c, "cannot go rw, unfixed btree errors"); +- return -BCH_ERR_erofs_unfixed_errors; ++ return bch_err_throw(c, erofs_unfixed_errors); ++ } ++ ++ if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { ++ bch_err(c, "cannot go rw, filesystem is an unresized image file"); ++ return bch_err_throw(c, erofs_filesystem_full); + } + + if (test_bit(BCH_FS_rw, &c->flags)) +@@ -439,16 +506,27 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + + bch_info(c, "going read-write"); + ++ ret = bch2_fs_init_rw(c); ++ if (ret) ++ return ret; ++ + ret = bch2_sb_members_v2_init(c); + if (ret) +- goto err; ++ return ret; ++ ++ ret = bch2_fs_mark_dirty(c); ++ if (ret) ++ return ret; + + clear_bit(BCH_FS_clean_shutdown, &c->flags); + +- __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { +- bch2_dev_allocator_add(c, ca); +- percpu_ref_reinit(&ca->io_ref[WRITE]); +- } ++ scoped_guard(rcu) ++ for_each_online_member_rcu(c, ca) ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) { ++ bch2_dev_allocator_add(c, ca); ++ enumerated_ref_start(&ca->io_ref[WRITE]); ++ } ++ + bch2_recalc_capacity(c); + + /* +@@ -457,15 +535,16 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + * overwriting whatever was there previously, and there must always be + * at least one non-flush write in the journal or recovery will fail: + */ +- spin_lock(&c->journal.lock); +- set_bit(JOURNAL_need_flush_write, &c->journal.flags); +- set_bit(JOURNAL_running, &c->journal.flags); +- bch2_journal_space_available(&c->journal); +- spin_unlock(&c->journal.lock); ++ scoped_guard(spinlock, &c->journal.lock) { ++ set_bit(JOURNAL_need_flush_write, &c->journal.flags); ++ set_bit(JOURNAL_running, &c->journal.flags); ++ bch2_journal_space_available(&c->journal); ++ } + +- ret = bch2_fs_mark_dirty(c); +- if (ret) +- goto err; ++ /* ++ * Don't jump to our error path, and call bch2_fs_read_only(), unless we ++ * successfully marked the filesystem dirty ++ */ + + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) +@@ -474,14 +553,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + set_bit(BCH_FS_rw, &c->flags); + set_bit(BCH_FS_was_rw, &c->flags); + +-#ifndef BCH_WRITE_REF_DEBUG +- percpu_ref_reinit(&c->writes); +-#else +- for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { +- BUG_ON(atomic_long_read(&c->writes[i])); +- atomic_long_inc(&c->writes[i]); +- } +-#endif ++ enumerated_ref_start(&c->writes); + + ret = bch2_copygc_start(c); + if (ret) { +@@ -512,21 +584,21 @@ int bch2_fs_read_write(struct bch_fs *c) + { + if (c->opts.recovery_pass_last && + c->opts.recovery_pass_last 
< BCH_RECOVERY_PASS_journal_replay) +- return -BCH_ERR_erofs_norecovery; ++ return bch_err_throw(c, erofs_norecovery); + + if (c->opts.nochanges) +- return -BCH_ERR_erofs_nochanges; ++ return bch_err_throw(c, erofs_nochanges); ++ ++ if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) ++ return bch_err_throw(c, erofs_no_alloc_info); + + return __bch2_fs_read_write(c, false); + } + + int bch2_fs_read_write_early(struct bch_fs *c) + { +- down_write(&c->state_lock); +- int ret = __bch2_fs_read_write(c, true); +- up_write(&c->state_lock); +- +- return ret; ++ guard(rwsem_write)(&c->state_lock); ++ return __bch2_fs_read_write(c, true); + } + + /* Filesystem startup/shutdown: */ +@@ -536,42 +608,44 @@ static void __bch2_fs_free(struct bch_fs *c) + for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_exit(&c->times[i]); + +-#ifdef CONFIG_UNICODE ++#if IS_ENABLED(CONFIG_UNICODE) + utf8_unload(c->cf_encoding); + #endif + + bch2_find_btree_nodes_exit(&c->found_btree_nodes); + bch2_free_pending_node_rewrites(c); + bch2_free_fsck_errs(c); +- bch2_fs_accounting_exit(c); +- bch2_fs_sb_errors_exit(c); +- bch2_fs_counters_exit(c); ++ bch2_fs_vfs_exit(c); + bch2_fs_snapshots_exit(c); ++ bch2_fs_sb_errors_exit(c); ++ bch2_fs_replicas_exit(c); ++ bch2_fs_rebalance_exit(c); + bch2_fs_quota_exit(c); ++ bch2_fs_nocow_locking_exit(c); ++ bch2_fs_journal_exit(&c->journal); + bch2_fs_fs_io_direct_exit(c); + bch2_fs_fs_io_buffered_exit(c); + bch2_fs_fsio_exit(c); +- bch2_fs_vfs_exit(c); +- bch2_fs_ec_exit(c); +- bch2_fs_encryption_exit(c); +- bch2_fs_nocow_locking_exit(c); + bch2_fs_io_write_exit(c); + bch2_fs_io_read_exit(c); ++ bch2_fs_encryption_exit(c); ++ bch2_fs_ec_exit(c); ++ bch2_fs_counters_exit(c); ++ bch2_fs_compress_exit(c); ++ bch2_io_clock_exit(&c->io_clock[WRITE]); ++ bch2_io_clock_exit(&c->io_clock[READ]); + bch2_fs_buckets_waiting_for_journal_exit(c); +- bch2_fs_btree_interior_update_exit(c); ++ bch2_fs_btree_write_buffer_exit(c); + bch2_fs_btree_key_cache_exit(&c->btree_key_cache); +- bch2_fs_btree_cache_exit(c); + bch2_fs_btree_iter_exit(c); +- bch2_fs_replicas_exit(c); +- bch2_fs_journal_exit(&c->journal); +- bch2_io_clock_exit(&c->io_clock[WRITE]); +- bch2_io_clock_exit(&c->io_clock[READ]); +- bch2_fs_compress_exit(c); +- bch2_fs_btree_gc_exit(c); ++ bch2_fs_btree_interior_update_exit(c); ++ bch2_fs_btree_cache_exit(c); ++ bch2_fs_accounting_exit(c); ++ bch2_fs_async_obj_exit(c); + bch2_journal_keys_put_initial(c); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); ++ + BUG_ON(atomic_read(&c->journal_keys.ref)); +- bch2_fs_btree_write_buffer_exit(c); + percpu_free_rwsem(&c->mark_lock); + if (c->online_reserved) { + u64 v = percpu_u64_get(c->online_reserved); +@@ -579,7 +653,6 @@ static void __bch2_fs_free(struct bch_fs *c) + free_percpu(c->online_reserved); + } + +- darray_exit(&c->incompat_versions_requested); + darray_exit(&c->btree_roots_extra); + free_percpu(c->pcpu); + free_percpu(c->usage); +@@ -587,9 +660,7 @@ static void __bch2_fs_free(struct bch_fs *c) + mempool_exit(&c->btree_bounce_pool); + bioset_exit(&c->btree_bio); + mempool_exit(&c->fill_iter); +-#ifndef BCH_WRITE_REF_DEBUG +- percpu_ref_exit(&c->writes); +-#endif ++ enumerated_ref_exit(&c->writes); + kfree(rcu_dereference_protected(c->disk_groups, 1)); + kfree(c->journal_seq_blacklist_table); + +@@ -601,8 +672,8 @@ static void __bch2_fs_free(struct bch_fs *c) + destroy_workqueue(c->btree_read_complete_wq); + if (c->copygc_wq) + destroy_workqueue(c->copygc_wq); +- if (c->btree_io_complete_wq) +- 
destroy_workqueue(c->btree_io_complete_wq); ++ if (c->btree_write_complete_wq) ++ destroy_workqueue(c->btree_write_complete_wq); + if (c->btree_update_wq) + destroy_workqueue(c->btree_update_wq); + +@@ -624,9 +695,14 @@ void __bch2_fs_stop(struct bch_fs *c) + + set_bit(BCH_FS_stopping, &c->flags); + +- down_write(&c->state_lock); +- bch2_fs_read_only(c); +- up_write(&c->state_lock); ++ scoped_guard(rwsem_write, &c->state_lock) ++ bch2_fs_read_only(c); ++ ++ for (unsigned i = 0; i < c->sb.nr_devices; i++) { ++ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); ++ if (ca) ++ bch2_dev_io_ref_stop(ca, READ); ++ } + + for_each_member_device(c, ca) + bch2_dev_unlink(ca); +@@ -652,20 +728,19 @@ void __bch2_fs_stop(struct bch_fs *c) + cancel_work_sync(&ca->io_error_work); + + cancel_work_sync(&c->read_only_work); ++ ++ flush_work(&c->btree_interior_update_work); + } + + void bch2_fs_free(struct bch_fs *c) + { +- unsigned i; +- +- mutex_lock(&bch_fs_list_lock); +- list_del(&c->list); +- mutex_unlock(&bch_fs_list_lock); ++ scoped_guard(mutex, &bch_fs_list_lock) ++ list_del(&c->list); + + closure_sync(&c->cl); + closure_debug_destroy(&c->cl); + +- for (i = 0; i < c->sb.nr_devices; i++) { ++ for (unsigned i = 0; i < c->sb.nr_devices; i++) { + struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); + + if (ca) { +@@ -693,9 +768,10 @@ static int bch2_fs_online(struct bch_fs *c) + + lockdep_assert_held(&bch_fs_list_lock); + +- if (__bch2_uuid_to_fs(c->sb.uuid)) { ++ if (c->sb.multi_device && ++ __bch2_uuid_to_fs(c->sb.uuid)) { + bch_err(c, "filesystem UUID already open"); +- return -EINVAL; ++ return bch_err_throw(c, filesystem_uuid_already_open); + } + + ret = bch2_fs_chardev_init(c); +@@ -706,7 +782,9 @@ static int bch2_fs_online(struct bch_fs *c) + + bch2_fs_debug_init(c); + +- ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: ++ ret = (c->sb.multi_device ++ ? 
kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ++ : kobject_add(&c->kobj, NULL, "%s", c->name)) ?: + kobject_add(&c->internal, &c->kobj, "internal") ?: + kobject_add(&c->opts_dir, &c->kobj, "options") ?: + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +@@ -719,29 +797,57 @@ static int bch2_fs_online(struct bch_fs *c) + return ret; + } + +- down_write(&c->state_lock); ++ guard(rwsem_write)(&c->state_lock); + + for_each_member_device(c, ca) { + ret = bch2_dev_sysfs_online(c, ca); + if (ret) { + bch_err(c, "error creating sysfs objects"); + bch2_dev_put(ca); +- goto err; ++ return ret; + } + } + + BUG_ON(!list_empty(&c->list)); + list_add(&c->list, &bch_fs_list); +-err: +- up_write(&c->state_lock); + return ret; + } + +-static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) ++int bch2_fs_init_rw(struct bch_fs *c) ++{ ++ if (test_bit(BCH_FS_rw_init_done, &c->flags)) ++ return 0; ++ ++ if (!(c->btree_update_wq = alloc_workqueue("bcachefs", ++ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || ++ !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete", ++ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || ++ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", ++ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", ++ WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || ++ !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", ++ WQ_FREEZABLE, 0))) ++ return bch_err_throw(c, ENOMEM_fs_other_alloc); ++ ++ int ret = bch2_fs_btree_interior_update_init(c) ?: ++ bch2_fs_btree_write_buffer_init(c) ?: ++ bch2_fs_fs_io_buffered_init(c) ?: ++ bch2_fs_io_write_init(c) ?: ++ bch2_fs_journal_init(&c->journal); ++ if (ret) ++ return ret; ++ ++ set_bit(BCH_FS_rw_init_done, &c->flags); ++ return 0; ++} ++ ++static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, ++ bch_sb_handles *sbs) + { + struct bch_fs *c; +- struct printbuf name = PRINTBUF; + unsigned i, iter_size; ++ CLASS(printbuf, name)(); + int ret = 0; + + c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); +@@ -750,7 +856,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + goto out; + } + +- c->stdio = (void *)(unsigned long) opts.stdio; ++ c->stdio = (void *)(unsigned long) opts->stdio; + + __module_get(THIS_MODULE); + +@@ -774,24 +880,29 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + refcount_set(&c->ro_ref, 1); + init_waitqueue_head(&c->ro_ref_wait); +- spin_lock_init(&c->recovery_pass_lock); +- sema_init(&c->online_fsck_mutex, 1); + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_init(&c->times[i]); + +- bch2_fs_copygc_init(c); +- bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); +- bch2_fs_btree_iter_init_early(c); +- bch2_fs_btree_interior_update_init_early(c); +- bch2_fs_journal_keys_init(c); + bch2_fs_allocator_background_init(c); + bch2_fs_allocator_foreground_init(c); +- bch2_fs_rebalance_init(c); +- bch2_fs_quota_init(c); ++ bch2_fs_btree_cache_init_early(&c->btree_cache); ++ bch2_fs_btree_gc_init_early(c); ++ bch2_fs_btree_interior_update_init_early(c); ++ bch2_fs_btree_iter_init_early(c); ++ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); ++ bch2_fs_btree_write_buffer_init_early(c); ++ bch2_fs_copygc_init(c); + bch2_fs_ec_init_early(c); ++ bch2_fs_journal_init_early(&c->journal); ++ bch2_fs_journal_keys_init(c); + bch2_fs_move_init(c); ++ bch2_fs_nocow_locking_init_early(c); ++ 
bch2_fs_quota_init(c); ++ bch2_fs_recovery_passes_init(c); + bch2_fs_sb_errors_init_early(c); ++ bch2_fs_snapshots_init_early(c); ++ bch2_fs_subvolumes_init_early(c); + + INIT_LIST_HEAD(&c->list); + +@@ -817,29 +928,18 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; + c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; + +- bch2_fs_btree_cache_init_early(&c->btree_cache); +- + mutex_init(&c->sectors_available_lock); + + ret = percpu_init_rwsem(&c->mark_lock); + if (ret) + goto err; + +- mutex_lock(&c->sb_lock); +- ret = bch2_sb_to_fs(c, sb); +- mutex_unlock(&c->sb_lock); +- +- if (ret) +- goto err; ++ scoped_guard(mutex, &c->sb_lock) ++ ret = bch2_sb_to_fs(c, sb); + +- pr_uuid(&name, c->sb.user_uuid.b); +- ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; + if (ret) + goto err; + +- strscpy(c->name, name.buf, sizeof(c->name)); +- printbuf_exit(&name); +- + /* Compat: */ + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && + !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) +@@ -854,7 +954,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + if (ret) + goto err; + +- bch2_opts_apply(&c->opts, opts); ++ bch2_opts_apply(&c->opts, *opts); ++ ++ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && ++ c->opts.block_size > PAGE_SIZE) { ++ bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE"); ++ ret = -EINVAL; ++ goto err; ++ } + + c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; + if (c->opts.inodes_use_key_cache) +@@ -870,26 +977,25 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + goto err; + } + ++ if (c->sb.multi_device) ++ pr_uuid(&name, c->sb.user_uuid.b); ++ else ++ prt_bdevname(&name, sbs->data[0].bdev); ++ ++ ret = name.allocation_failure ? 
-BCH_ERR_ENOMEM_fs_name_alloc : 0; ++ if (ret) ++ goto err; ++ ++ strscpy(c->name, name.buf, sizeof(c->name)); ++ + iter_size = sizeof(struct sort_iter) + + (btree_blocks(c) + 1) * 2 * + sizeof(struct sort_iter_set); + +- if (!(c->btree_update_wq = alloc_workqueue("bcachefs", +- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || +- !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", +- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || +- !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", +- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || +- !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", ++ if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || +- !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", +- WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || +- !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", +- WQ_FREEZABLE, 0)) || +-#ifndef BCH_WRITE_REF_DEBUG +- percpu_ref_init(&c->writes, bch2_writes_disabled, +- PERCPU_REF_INIT_DEAD, GFP_KERNEL) || +-#endif ++ enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR, ++ bch2_writes_disabled) || + mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || + bioset_init(&c->btree_bio, 1, + max(offsetof(struct btree_read_bio, bio), +@@ -901,51 +1007,54 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, + c->opts.btree_node_size) || + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { +- ret = -BCH_ERR_ENOMEM_fs_other_alloc; ++ ret = bch_err_throw(c, ENOMEM_fs_other_alloc); + goto err; + } + +- ret = bch2_fs_counters_init(c) ?: +- bch2_fs_sb_errors_init(c) ?: +- bch2_io_clock_init(&c->io_clock[READ]) ?: +- bch2_io_clock_init(&c->io_clock[WRITE]) ?: +- bch2_fs_journal_init(&c->journal) ?: +- bch2_fs_btree_iter_init(c) ?: ++ ret = ++ bch2_fs_async_obj_init(c) ?: + bch2_fs_btree_cache_init(c) ?: ++ bch2_fs_btree_iter_init(c) ?: + bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: +- bch2_fs_btree_interior_update_init(c) ?: +- bch2_fs_btree_gc_init(c) ?: + bch2_fs_buckets_waiting_for_journal_init(c) ?: +- bch2_fs_btree_write_buffer_init(c) ?: +- bch2_fs_subvolumes_init(c) ?: +- bch2_fs_io_read_init(c) ?: +- bch2_fs_io_write_init(c) ?: +- bch2_fs_nocow_locking_init(c) ?: +- bch2_fs_encryption_init(c) ?: ++ bch2_io_clock_init(&c->io_clock[READ]) ?: ++ bch2_io_clock_init(&c->io_clock[WRITE]) ?: + bch2_fs_compress_init(c) ?: ++ bch2_fs_counters_init(c) ?: + bch2_fs_ec_init(c) ?: +- bch2_fs_vfs_init(c) ?: ++ bch2_fs_encryption_init(c) ?: + bch2_fs_fsio_init(c) ?: +- bch2_fs_fs_io_buffered_init(c) ?: +- bch2_fs_fs_io_direct_init(c); ++ bch2_fs_fs_io_direct_init(c) ?: ++ bch2_fs_io_read_init(c) ?: ++ bch2_fs_rebalance_init(c) ?: ++ bch2_fs_sb_errors_init(c) ?: ++ bch2_fs_vfs_init(c); + if (ret) + goto err; + +-#ifdef CONFIG_UNICODE +- /* Default encoding until we can potentially have more as an option. */ +- c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); +- if (IS_ERR(c->cf_encoding)) { +- printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. 
Version: %u.%u.%u", +- unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), +- unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), +- unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); +- ret = -EINVAL; +- goto err; ++ if (go_rw_in_recovery(c)) { ++ /* ++ * start workqueues/kworkers early - kthread creation checks for ++ * pending signals, which is _very_ annoying ++ */ ++ ret = bch2_fs_init_rw(c); ++ if (ret) ++ goto err; ++ } ++ ++#if IS_ENABLED(CONFIG_UNICODE) ++ if (!bch2_fs_casefold_enabled(c)) { ++ /* Default encoding until we can potentially have more as an option. */ ++ c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); ++ if (IS_ERR(c->cf_encoding)) { ++ printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. Version: %u.%u.%u", ++ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), ++ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), ++ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); ++ ret = -EINVAL; ++ goto err; ++ } + } +- bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", +- unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), +- unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), +- unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + #else + if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { + printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); +@@ -969,12 +1078,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + &c->clock_journal_res, + (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); + +- mutex_lock(&bch_fs_list_lock); +- ret = bch2_fs_online(c); +- mutex_unlock(&bch_fs_list_lock); ++ scoped_guard(mutex, &bch_fs_list_lock) ++ ret = bch2_fs_online(c); + + if (ret) + goto err; ++ ++ c->recovery_task = current; + out: + return c; + err: +@@ -987,12 +1097,13 @@ noinline_for_stack + static void print_mount_opts(struct bch_fs *c) + { + enum bch_opt_id i; +- struct printbuf p = PRINTBUF; +- bool first = true; ++ CLASS(printbuf, p)(); ++ bch2_log_msg_start(c, &p); + + prt_str(&p, "starting version "); + bch2_version_to_text(&p, c->sb.version); + ++ bool first = true; + for (i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); +@@ -1009,30 +1120,41 @@ static void print_mount_opts(struct bch_fs *c) + } + + if (c->sb.version_incompat_allowed != c->sb.version) { +- prt_printf(&p, "\n allowing incompatible features above "); ++ prt_printf(&p, "\nallowing incompatible features above "); + bch2_version_to_text(&p, c->sb.version_incompat_allowed); + } + +- bch_info(c, "%s", p.buf); +- printbuf_exit(&p); ++ if (c->opts.verbose) { ++ prt_printf(&p, "\nfeatures: "); ++ prt_bitflags(&p, bch2_sb_features, c->sb.features); ++ } ++ ++ if (c->sb.multi_device) { ++ prt_printf(&p, "\nwith devices"); ++ for_each_online_member(c, ca, BCH_DEV_READ_REF_bch2_online_devs) { ++ prt_char(&p, ' '); ++ prt_str(&p, ca->name); ++ } ++ } ++ ++ bch2_print_str(c, KERN_INFO, p.buf); + } + + static bool bch2_fs_may_start(struct bch_fs *c) + { + struct bch_dev *ca; +- unsigned i, flags = 0; ++ unsigned flags = 0; + +- if (c->opts.very_degraded) ++ switch (c->opts.degraded) { ++ case BCH_DEGRADED_very: + flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; +- +- if (c->opts.degraded) ++ break; ++ case BCH_DEGRADED_yes: + flags |= BCH_FORCE_IF_DEGRADED; +- +- if (!c->opts.degraded && +- !c->opts.very_degraded) { +- mutex_lock(&c->sb_lock); +- +- for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ break; ++ default: { ++ guard(mutex)(&c->sb_lock); ++ for (unsigned i = 0; i < 
c->disk_sb.sb->nr_devices; i++) { + if (!bch2_member_exists(c->disk_sb.sb, i)) + continue; + +@@ -1040,15 +1162,14 @@ static bool bch2_fs_may_start(struct bch_fs *c) + + if (!bch2_dev_is_online(ca) && + (ca->mi.state == BCH_MEMBER_STATE_rw || +- ca->mi.state == BCH_MEMBER_STATE_ro)) { +- mutex_unlock(&c->sb_lock); ++ ca->mi.state == BCH_MEMBER_STATE_ro)) + return false; +- } + } +- mutex_unlock(&c->sb_lock); ++ break; ++ } + } + +- return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); ++ return bch2_have_enough_devs(c, c->online_devs, flags, true); + } + + int bch2_fs_start(struct bch_fs *c) +@@ -1056,42 +1177,42 @@ int bch2_fs_start(struct bch_fs *c) + time64_t now = ktime_get_real_seconds(); + int ret = 0; + ++ BUG_ON(test_bit(BCH_FS_started, &c->flags)); ++ + print_mount_opts(c); + ++ if (c->cf_encoding) ++ bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", ++ unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), ++ unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), ++ unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); ++ + if (!bch2_fs_may_start(c)) +- return -BCH_ERR_insufficient_devices_to_start; ++ return bch_err_throw(c, insufficient_devices_to_start); + +- down_write(&c->state_lock); +- mutex_lock(&c->sb_lock); ++ scoped_guard(rwsem_write, &c->state_lock) { ++ guard(mutex)(&c->sb_lock); ++ if (!bch2_sb_field_get_minsize(&c->disk_sb, ext, ++ sizeof(struct bch_sb_field_ext) / sizeof(u64))) { ++ ret = bch_err_throw(c, ENOSPC_sb); ++ goto err; ++ } + +- BUG_ON(test_bit(BCH_FS_started, &c->flags)); ++ ret = bch2_sb_members_v2_init(c); ++ if (ret) ++ goto err; + +- if (!bch2_sb_field_get_minsize(&c->disk_sb, ext, +- sizeof(struct bch_sb_field_ext) / sizeof(u64))) { +- mutex_unlock(&c->sb_lock); +- up_write(&c->state_lock); +- ret = -BCH_ERR_ENOSPC_sb; +- goto err; +- } ++ scoped_guard(rcu) ++ for_each_online_member_rcu(c, ca) { ++ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = ++ cpu_to_le64(now); ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) ++ bch2_dev_allocator_add(c, ca); ++ } + +- ret = bch2_sb_members_v2_init(c); +- if (ret) { +- mutex_unlock(&c->sb_lock); +- up_write(&c->state_lock); +- goto err; ++ bch2_recalc_capacity(c); + } + +- for_each_online_member(c, ca) +- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); +- +- mutex_unlock(&c->sb_lock); +- +- for_each_rw_member(c, ca) +- bch2_dev_allocator_add(c, ca); +- bch2_recalc_capacity(c); +- up_write(&c->state_lock); +- +- c->recovery_task = current; + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) + ? 
bch2_fs_recovery(c) + : bch2_fs_initialize(c); +@@ -1100,25 +1221,24 @@ int bch2_fs_start(struct bch_fs *c) + if (ret) + goto err; + +- ret = bch2_opts_check_may_set(c); ++ ret = bch2_opts_hooks_pre_set(c); + if (ret) + goto err; + + if (bch2_fs_init_fault("fs_start")) { +- ret = -BCH_ERR_injected_fs_start; ++ ret = bch_err_throw(c, injected_fs_start); + goto err; + } + + set_bit(BCH_FS_started, &c->flags); + wake_up(&c->ro_ref_wait); + +- down_write(&c->state_lock); +- if (c->opts.read_only) +- bch2_fs_read_only(c); +- else if (!test_bit(BCH_FS_rw, &c->flags)) +- ret = bch2_fs_read_write(c); +- up_write(&c->state_lock); +- ++ scoped_guard(rwsem_write, &c->state_lock) { ++ if (c->opts.read_only) ++ bch2_fs_read_only(c); ++ else if (!test_bit(BCH_FS_rw, &c->flags)) ++ ret = bch2_fs_read_write(c); ++ } + err: + if (ret) + bch_err_msg(c, ret, "starting filesystem"); +@@ -1132,11 +1252,11 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) + struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); + + if (le16_to_cpu(sb->block_size) != block_sectors(c)) +- return -BCH_ERR_mismatched_block_size; ++ return bch_err_throw(c, mismatched_block_size); + + if (le16_to_cpu(m.bucket_size) < + BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) +- return -BCH_ERR_bucket_size_too_small; ++ return bch_err_throw(c, bucket_size_too_small); + + return 0; + } +@@ -1163,7 +1283,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, + + if (fs->sb->seq == sb->sb->seq && + fs->sb->write_time != sb->sb->write_time) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + prt_str(&buf, "Split brain detected between "); + prt_bdevname(&buf, sb->bdev); +@@ -1188,7 +1308,6 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, + prt_printf(&buf, "Not using older sb"); + + pr_err("%s", buf.buf); +- printbuf_exit(&buf); + + if (!opts->no_splitbrain_check) + return -BCH_ERR_device_splitbrain; +@@ -1199,7 +1318,7 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, + u64 seq_from_member = le64_to_cpu(sb->sb->seq); + + if (seq_from_fs && seq_from_fs < seq_from_member) { +- struct printbuf buf = PRINTBUF; ++ CLASS(printbuf, buf)(); + + prt_str(&buf, "Split brain detected between "); + prt_bdevname(&buf, sb->bdev); +@@ -1221,7 +1340,6 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, + } + + pr_err("%s", buf.buf); +- printbuf_exit(&buf); + + if (!opts->no_splitbrain_check) + return -BCH_ERR_device_splitbrain; +@@ -1234,11 +1352,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, + + static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) + { +- if (!percpu_ref_is_zero(&ca->io_ref[rw])) { +- reinit_completion(&ca->io_ref_completion[rw]); +- percpu_ref_kill(&ca->io_ref[rw]); +- wait_for_completion(&ca->io_ref_completion[rw]); +- } ++ if (rw == READ) ++ clear_bit(ca->dev_idx, ca->fs->online_devs.d); ++ ++ if (!enumerated_ref_is_zero(&ca->io_ref[rw])) ++ enumerated_ref_stop(&ca->io_ref[rw], ++ rw == READ ++ ? 
bch2_dev_read_refs ++ : bch2_dev_write_refs); + } + + static void bch2_dev_release(struct kobject *kobj) +@@ -1250,8 +1371,8 @@ static void bch2_dev_release(struct kobject *kobj) + + static void bch2_dev_free(struct bch_dev *ca) + { +- WARN_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); +- WARN_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); ++ WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); ++ WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); + + cancel_work_sync(&ca->io_error_work); + +@@ -1260,6 +1381,9 @@ static void bch2_dev_free(struct bch_dev *ca) + if (ca->kobj.state_in_sysfs) + kobject_del(&ca->kobj); + ++ bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); ++ bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); ++ + bch2_free_super(&ca->disk_sb); + bch2_dev_allocator_background_exit(ca); + bch2_dev_journal_exit(ca); +@@ -1271,8 +1395,8 @@ static void bch2_dev_free(struct bch_dev *ca) + bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); + bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); + +- percpu_ref_exit(&ca->io_ref[WRITE]); +- percpu_ref_exit(&ca->io_ref[READ]); ++ enumerated_ref_exit(&ca->io_ref[WRITE]); ++ enumerated_ref_exit(&ca->io_ref[READ]); + #ifndef CONFIG_BCACHEFS_DEBUG + percpu_ref_exit(&ca->ref); + #endif +@@ -1284,7 +1408,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) + + lockdep_assert_held(&c->state_lock); + +- if (percpu_ref_is_zero(&ca->io_ref[READ])) ++ if (enumerated_ref_is_zero(&ca->io_ref[READ])) + return; + + __bch2_dev_read_only(c, ca); +@@ -1306,20 +1430,6 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref) + } + #endif + +-static void bch2_dev_io_ref_read_complete(struct percpu_ref *ref) +-{ +- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[READ]); +- +- complete(&ca->io_ref_completion[READ]); +-} +- +-static void bch2_dev_io_ref_write_complete(struct percpu_ref *ref) +-{ +- struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref[WRITE]); +- +- complete(&ca->io_ref_completion[WRITE]); +-} +- + static void bch2_dev_unlink(struct bch_dev *ca) + { + struct kobject *b; +@@ -1381,8 +1491,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + + kobject_init(&ca->kobj, &bch2_dev_ktype); + init_completion(&ca->ref_completion); +- init_completion(&ca->io_ref_completion[READ]); +- init_completion(&ca->io_ref_completion[WRITE]); + + INIT_WORK(&ca->io_error_work, bch2_io_error_work); + +@@ -1406,12 +1514,13 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + atomic_long_set(&ca->ref, 1); + #endif + ++ mutex_init(&ca->bucket_backpointer_mismatch.lock); ++ mutex_init(&ca->bucket_backpointer_empty.lock); ++ + bch2_dev_allocator_background_init(ca); + +- if (percpu_ref_init(&ca->io_ref[READ], bch2_dev_io_ref_read_complete, +- PERCPU_REF_INIT_DEAD, GFP_KERNEL) || +- percpu_ref_init(&ca->io_ref[WRITE], bch2_dev_io_ref_write_complete, +- PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) || ++ enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) || + !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || + bch2_dev_buckets_alloc(c, ca) || + !(ca->io_done = alloc_percpu(*ca->io_done))) +@@ -1428,7 +1537,9 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, + { + ca->dev_idx = dev_idx; + __set_bit(ca->dev_idx, ca->self.d); +- scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ++ ++ if (!ca->name[0]) ++ scnprintf(ca->name, sizeof(ca->name), "dev-%u", 
dev_idx); + + ca->fs = c; + rcu_assign_pointer(c->devs[ca->dev_idx], ca); +@@ -1443,18 +1554,16 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) + struct bch_dev *ca = NULL; + + if (bch2_fs_init_fault("dev_alloc")) +- goto err; ++ return bch_err_throw(c, ENOMEM_dev_alloc); + + ca = __bch2_dev_alloc(c, &member); + if (!ca) +- goto err; ++ return bch_err_throw(c, ENOMEM_dev_alloc); + + ca->fs = c; + + bch2_dev_attach(c, ca, dev_idx); + return 0; +-err: +- return -BCH_ERR_ENOMEM_dev_alloc; + } + + static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) +@@ -1464,22 +1573,29 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) + if (bch2_dev_is_online(ca)) { + bch_err(ca, "already have device online in slot %u", + sb->sb->dev_idx); +- return -BCH_ERR_device_already_online; ++ return bch_err_throw(ca->fs, device_already_online); + } + + if (get_capacity(sb->bdev->bd_disk) < + ca->mi.bucket_size * ca->mi.nbuckets) { +- bch_err(ca, "cannot online: device too small"); +- return -BCH_ERR_device_size_too_small; ++ bch_err(ca, "cannot online: device too small (capacity %llu filesystem size %llu nbuckets %llu)", ++ get_capacity(sb->bdev->bd_disk), ++ ca->mi.bucket_size * ca->mi.nbuckets, ++ ca->mi.nbuckets); ++ return bch_err_throw(ca->fs, device_size_too_small); + } + +- BUG_ON(!percpu_ref_is_zero(&ca->io_ref[READ])); +- BUG_ON(!percpu_ref_is_zero(&ca->io_ref[WRITE])); ++ BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); ++ BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); + + ret = bch2_dev_journal_init(ca, sb->sb); + if (ret) + return ret; + ++ CLASS(printbuf, name)(); ++ prt_bdevname(&name, sb->bdev); ++ strscpy(ca->name, name.buf, sizeof(ca->name)); ++ + /* Commit: */ + ca->disk_sb = *sb; + memset(sb, 0, sizeof(*sb)); +@@ -1493,7 +1609,7 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) + + ca->dev = ca->disk_sb.bdev->bd_dev; + +- percpu_ref_reinit(&ca->io_ref[READ]); ++ enumerated_ref_start(&ca->io_ref[READ]); + + return 0; + } +@@ -1517,16 +1633,9 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) + if (ret) + return ret; + +- bch2_dev_sysfs_online(c, ca); +- +- struct printbuf name = PRINTBUF; +- prt_bdevname(&name, ca->disk_sb.bdev); +- +- if (c->sb.nr_devices == 1) +- strscpy(c->name, name.buf, sizeof(c->name)); +- strscpy(ca->name, name.buf, sizeof(ca->name)); ++ set_bit(ca->dev_idx, c->online_devs.d); + +- printbuf_exit(&name); ++ bch2_dev_sysfs_online(c, ca); + + bch2_rebalance_wakeup(c); + return 0; +@@ -1578,7 +1687,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, + return true; + + /* do we have enough devices to read from? 
*/ +- new_online_devs = bch2_online_devs(c); ++ new_online_devs = c->online_devs; + __clear_bit(ca->dev_idx, new_online_devs.d); + + return bch2_have_enough_devs(c, new_online_devs, flags, false); +@@ -1608,8 +1717,8 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + +- if (percpu_ref_is_zero(&ca->io_ref[WRITE])) +- percpu_ref_reinit(&ca->io_ref[WRITE]); ++ if (enumerated_ref_is_zero(&ca->io_ref[WRITE])) ++ enumerated_ref_start(&ca->io_ref[WRITE]); + + bch2_dev_do_discards(ca); + } +@@ -1617,25 +1726,24 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) + int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) + { +- struct bch_member *m; + int ret = 0; + + if (ca->mi.state == new_state) + return 0; + + if (!bch2_dev_state_allowed(c, ca, new_state, flags)) +- return -BCH_ERR_device_state_not_allowed; ++ return bch_err_throw(c, device_state_not_allowed); + + if (new_state != BCH_MEMBER_STATE_rw) + __bch2_dev_read_only(c, ca); + + bch_notice(ca, "%s", bch2_member_states[new_state]); + +- mutex_lock(&c->sb_lock); +- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); +- SET_BCH_MEMBER_STATE(m, new_state); +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); ++ scoped_guard(mutex, &c->sb_lock) { ++ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); ++ SET_BCH_MEMBER_STATE(m, new_state); ++ bch2_write_super(c); ++ } + + if (new_state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); +@@ -1648,24 +1756,20 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) + { +- int ret; +- +- down_write(&c->state_lock); +- ret = __bch2_dev_set_state(c, ca, new_state, flags); +- up_write(&c->state_lock); +- +- return ret; ++ guard(rwsem_write)(&c->state_lock); ++ return __bch2_dev_set_state(c, ca, new_state, flags); + } + + /* Device add/removal: */ + + int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + { +- struct bch_member *m; + unsigned dev_idx = ca->dev_idx, data; ++ bool fast_device_removal = !bch2_request_incompat_feature(c, ++ bcachefs_metadata_version_fast_device_removal); + int ret; + +- down_write(&c->state_lock); ++ guard(rwsem_write)(&c->state_lock); + + /* + * We consume a reference to ca->ref, regardless of whether we succeed +@@ -1675,17 +1779,31 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { + bch_err(ca, "Cannot remove without losing data"); +- ret = -BCH_ERR_device_state_not_allowed; ++ ret = bch_err_throw(c, device_state_not_allowed); + goto err; + } + + __bch2_dev_read_only(c, ca); + +- ret = bch2_dev_data_drop(c, ca->dev_idx, flags); +- bch_err_msg(ca, ret, "bch2_dev_data_drop()"); ++ ret = fast_device_removal ++ ? 
bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags) ++ : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?: ++ bch2_dev_remove_stripes(c, ca->dev_idx, flags)); + if (ret) + goto err; + ++ /* Check if device still has data before blowing away alloc info */ ++ struct bch_dev_usage usage = bch2_dev_usage_read(ca); ++ for (unsigned i = 0; i < BCH_DATA_NR; i++) ++ if (!data_type_is_empty(i) && ++ !data_type_is_hidden(i) && ++ usage.buckets[i]) { ++ bch_err(ca, "Remove failed: still has data (%s, %llu buckets)", ++ __bch2_data_types[i], usage.buckets[i]); ++ ret = -EBUSY; ++ goto err; ++ } ++ + ret = bch2_dev_remove_alloc(c, ca); + bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); + if (ret) +@@ -1718,20 +1836,17 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + + data = bch2_dev_has_data(c, ca); + if (data) { +- struct printbuf data_has = PRINTBUF; +- ++ CLASS(printbuf, data_has)(); + prt_bitflags(&data_has, __bch2_data_types, data); + bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); +- printbuf_exit(&data_has); + ret = -EBUSY; + goto err; + } + + __bch2_dev_offline(c, ca); + +- mutex_lock(&c->sb_lock); +- rcu_assign_pointer(c->devs[ca->dev_idx], NULL); +- mutex_unlock(&c->sb_lock); ++ scoped_guard(mutex, &c->sb_lock) ++ rcu_assign_pointer(c->devs[ca->dev_idx], NULL); + + #ifndef CONFIG_BCACHEFS_DEBUG + percpu_ref_kill(&ca->ref); +@@ -1747,21 +1862,23 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + * Free this device's slot in the bch_member array - all pointers to + * this device must be gone: + */ +- mutex_lock(&c->sb_lock); +- m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); +- memset(&m->uuid, 0, sizeof(m->uuid)); ++ scoped_guard(mutex, &c->sb_lock) { ++ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); + +- bch2_write_super(c); ++ if (fast_device_removal) ++ m->uuid = BCH_SB_MEMBER_DELETED_UUID; ++ else ++ memset(&m->uuid, 0, sizeof(m->uuid)); ++ ++ bch2_write_super(c); ++ } + +- mutex_unlock(&c->sb_lock); +- up_write(&c->state_lock); + return 0; + err: + if (test_bit(BCH_FS_rw, &c->flags) && + ca->mi.state == BCH_MEMBER_STATE_rw && +- !percpu_ref_is_zero(&ca->io_ref[READ])) ++ !enumerated_ref_is_zero(&ca->io_ref[READ])) + __bch2_dev_read_write(c, ca); +- up_write(&c->state_lock); + return ret; + } + +@@ -1769,11 +1886,10 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) + int bch2_dev_add(struct bch_fs *c, const char *path) + { + struct bch_opts opts = bch2_opts_empty(); +- struct bch_sb_handle sb; ++ struct bch_sb_handle sb = {}; + struct bch_dev *ca = NULL; +- struct printbuf errbuf = PRINTBUF; +- struct printbuf label = PRINTBUF; +- int ret; ++ CLASS(printbuf, label)(); ++ int ret = 0; + + ret = bch2_read_super(path, &opts, &sb); + bch_err_msg(c, ret, "reading super"); +@@ -1790,6 +1906,20 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + } + } + ++ if (list_empty(&c->list)) { ++ scoped_guard(mutex, &bch_fs_list_lock) { ++ if (__bch2_uuid_to_fs(c->sb.uuid)) ++ ret = bch_err_throw(c, filesystem_uuid_already_open); ++ else ++ list_add(&c->list, &bch_fs_list); ++ } ++ ++ if (ret) { ++ bch_err(c, "filesystem UUID already open"); ++ goto err; ++ } ++ } ++ + ret = bch2_dev_may_add(sb.sb, c); + if (ret) + goto err; +@@ -1804,81 +1934,95 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + if (ret) + goto err; + +- down_write(&c->state_lock); +- mutex_lock(&c->sb_lock); ++ scoped_guard(rwsem_write, &c->state_lock) { ++ scoped_guard(mutex, &c->sb_lock) { ++ 
SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true); + +- ret = bch2_sb_from_fs(c, ca); +- bch_err_msg(c, ret, "setting up new superblock"); +- if (ret) +- goto err_unlock; +- +- if (dynamic_fault("bcachefs:add:no_slot")) +- goto err_unlock; ++ ret = bch2_sb_from_fs(c, ca); ++ bch_err_msg(c, ret, "setting up new superblock"); ++ if (ret) ++ goto err; + +- ret = bch2_sb_member_alloc(c); +- if (ret < 0) { +- bch_err_msg(c, ret, "setting up new superblock"); +- goto err_unlock; +- } +- unsigned dev_idx = ret; ++ if (dynamic_fault("bcachefs:add:no_slot")) ++ goto err; + +- /* success: */ ++ ret = bch2_sb_member_alloc(c); ++ if (ret < 0) { ++ bch_err_msg(c, ret, "setting up new superblock"); ++ goto err; ++ } ++ unsigned dev_idx = ret; ++ ret = 0; + +- dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); +- *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; ++ /* success: */ + +- ca->disk_sb.sb->dev_idx = dev_idx; +- bch2_dev_attach(c, ca, dev_idx); ++ dev_mi.last_mount = cpu_to_le64(ktime_get_real_seconds()); ++ *bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx) = dev_mi; + +- if (BCH_MEMBER_GROUP(&dev_mi)) { +- ret = __bch2_dev_group_set(c, ca, label.buf); +- bch_err_msg(c, ret, "creating new label"); +- if (ret) +- goto err_unlock; +- } ++ ca->disk_sb.sb->dev_idx = dev_idx; ++ bch2_dev_attach(c, ca, dev_idx); + +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); ++ set_bit(ca->dev_idx, c->online_devs.d); + +- ret = bch2_dev_usage_init(ca, false); +- if (ret) +- goto err_late; ++ if (BCH_MEMBER_GROUP(&dev_mi)) { ++ ret = __bch2_dev_group_set(c, ca, label.buf); ++ bch_err_msg(c, ret, "creating new label"); ++ if (ret) ++ goto err_late; ++ } + +- ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); +- bch_err_msg(ca, ret, "marking new superblock"); +- if (ret) +- goto err_late; ++ bch2_write_super(c); ++ } + +- ret = bch2_fs_freespace_init(c); +- bch_err_msg(ca, ret, "initializing free space"); +- if (ret) +- goto err_late; ++ ret = bch2_dev_usage_init(ca, false); ++ if (ret) ++ goto err_late; ++ ++ if (test_bit(BCH_FS_started, &c->flags)) { ++ ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); ++ bch_err_msg(ca, ret, "marking new superblock"); ++ if (ret) ++ goto err_late; ++ ++ ret = bch2_fs_freespace_init(c); ++ bch_err_msg(ca, ret, "initializing free space"); ++ if (ret) ++ goto err_late; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_rw) ++ __bch2_dev_read_write(c, ca); ++ ++ ret = bch2_dev_journal_alloc(ca, false); ++ bch_err_msg(c, ret, "allocating journal"); ++ if (ret) ++ goto err_late; ++ } + +- if (ca->mi.state == BCH_MEMBER_STATE_rw) +- __bch2_dev_read_write(c, ca); ++ /* ++ * We just changed the superblock UUID, invalidate cache and send a ++ * uevent to update /dev/disk/by-uuid ++ */ ++ invalidate_bdev(ca->disk_sb.bdev); + +- ret = bch2_dev_journal_alloc(ca, false); +- bch_err_msg(c, ret, "allocating journal"); +- if (ret) +- goto err_late; ++ char uuid_str[37]; ++ snprintf(uuid_str, sizeof(uuid_str), "UUID=%pUb", &c->sb.uuid); + +- up_write(&c->state_lock); ++ char *envp[] = { ++ "CHANGE=uuid", ++ uuid_str, ++ NULL, ++ }; ++ kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp); ++ } + out: +- printbuf_exit(&label); +- printbuf_exit(&errbuf); + bch_err_fn(c, ret); + return ret; +- +-err_unlock: +- mutex_unlock(&c->sb_lock); +- up_write(&c->state_lock); + err: + if (ca) + bch2_dev_free(ca); + bch2_free_super(&sb); + goto out; + err_late: +- up_write(&c->state_lock); + ca = NULL; + goto err; + } +@@ -1892,13 +2036,11 @@ int 
bch2_dev_online(struct bch_fs *c, const char *path) + unsigned dev_idx; + int ret; + +- down_write(&c->state_lock); ++ guard(rwsem_write)(&c->state_lock); + + ret = bch2_read_super(path, &opts, &sb); +- if (ret) { +- up_write(&c->state_lock); ++ if (ret) + return ret; +- } + + dev_idx = sb.sb->dev_idx; + +@@ -1935,104 +2077,139 @@ int bch2_dev_online(struct bch_fs *c, const char *path) + goto err; + } + +- mutex_lock(&c->sb_lock); +- bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = +- cpu_to_le64(ktime_get_real_seconds()); +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); ++ scoped_guard(mutex, &c->sb_lock) { ++ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = ++ cpu_to_le64(ktime_get_real_seconds()); ++ bch2_write_super(c); ++ } + +- up_write(&c->state_lock); + return 0; + err: +- up_write(&c->state_lock); + bch2_free_super(&sb); + return ret; + } + + int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) + { +- down_write(&c->state_lock); ++ guard(rwsem_write)(&c->state_lock); + + if (!bch2_dev_is_online(ca)) { + bch_err(ca, "Already offline"); +- up_write(&c->state_lock); + return 0; + } + + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { + bch_err(ca, "Cannot offline required disk"); +- up_write(&c->state_lock); +- return -BCH_ERR_device_state_not_allowed; ++ return bch_err_throw(c, device_state_not_allowed); + } + + __bch2_dev_offline(c, ca); +- +- up_write(&c->state_lock); + return 0; + } + ++static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets) ++{ ++ struct bch_fs *c = ca->fs; ++ u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 }; ++ ++ return bch2_trans_commit_do(ca->fs, NULL, NULL, 0, ++ bch2_disk_accounting_mod2(trans, false, v, dev_data_type, ++ .dev = ca->dev_idx, ++ .data_type = BCH_DATA_free)) ?: ++ bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets); ++} ++ + int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + { +- struct bch_member *m; + u64 old_nbuckets; + int ret = 0; + +- down_write(&c->state_lock); ++ guard(rwsem_write)(&c->state_lock); + old_nbuckets = ca->mi.nbuckets; + + if (nbuckets < ca->mi.nbuckets) { + bch_err(ca, "Cannot shrink yet"); +- ret = -EINVAL; +- goto err; ++ return -EINVAL; + } + + if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { + bch_err(ca, "New device size too big (%llu greater than max %u)", + nbuckets, BCH_MEMBER_NBUCKETS_MAX); +- ret = -BCH_ERR_device_size_too_big; +- goto err; ++ return bch_err_throw(c, device_size_too_big); + } + + if (bch2_dev_is_online(ca) && + get_capacity(ca->disk_sb.bdev->bd_disk) < + ca->mi.bucket_size * nbuckets) { + bch_err(ca, "New size larger than device"); +- ret = -BCH_ERR_device_size_too_small; +- goto err; ++ return bch_err_throw(c, device_size_too_small); + } + + ret = bch2_dev_buckets_resize(c, ca, nbuckets); + bch_err_msg(ca, ret, "resizing buckets"); + if (ret) +- goto err; ++ return ret; + + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); + if (ret) +- goto err; ++ return ret; + +- mutex_lock(&c->sb_lock); +- m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); +- m->nbuckets = cpu_to_le64(nbuckets); ++ scoped_guard(mutex, &c->sb_lock) { ++ struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); ++ m->nbuckets = cpu_to_le64(nbuckets); + +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); ++ bch2_write_super(c); ++ } + + if (ca->mi.freespace_initialized) { +- u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; +- +- ret = 
bch2_trans_commit_do(ca->fs, NULL, NULL, 0, +- bch2_disk_accounting_mod2(trans, false, v, dev_data_type, +- .dev = ca->dev_idx, +- .data_type = BCH_DATA_free)) ?: +- bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); ++ ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets); + if (ret) +- goto err; ++ return ret; + } + + bch2_recalc_capacity(c); +-err: +- up_write(&c->state_lock); +- return ret; ++ return 0; ++} ++ ++int bch2_fs_resize_on_mount(struct bch_fs *c) ++{ ++ for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) { ++ u64 old_nbuckets = ca->mi.nbuckets; ++ u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk), ++ ca->mi.bucket_size); ++ ++ if (ca->mi.resize_on_mount && ++ new_nbuckets > ca->mi.nbuckets) { ++ bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size); ++ int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets); ++ bch_err_fn(ca, ret); ++ if (ret) { ++ enumerated_ref_put(&ca->io_ref[READ], ++ BCH_DEV_READ_REF_fs_resize_on_mount); ++ return ret; ++ } ++ ++ scoped_guard(mutex, &c->sb_lock) { ++ struct bch_member *m = ++ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); ++ m->nbuckets = cpu_to_le64(new_nbuckets); ++ SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false); ++ ++ c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image)); ++ bch2_write_super(c); ++ } ++ ++ if (ca->mi.freespace_initialized) { ++ ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets); ++ if (ret) { ++ enumerated_ref_put(&ca->io_ref[READ], ++ BCH_DEV_READ_REF_fs_resize_on_mount); ++ return ret; ++ } ++ } ++ } ++ } ++ return 0; + } + + /* return with ref on ca->ref: */ +@@ -2065,6 +2242,10 @@ static struct bch_fs *bdev_get_fs(struct block_device *bdev) + return c; + } + ++DEFINE_CLASS(bdev_get_fs, struct bch_fs *, ++ bch2_ro_ref_put(_T), bdev_get_fs(bdev), ++ struct block_device *bdev); ++ + /* returns with ref on ca->ref */ + static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev) + { +@@ -2076,7 +2257,7 @@ static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bd + + static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) + { +- struct bch_fs *c = bdev_get_fs(bdev); ++ CLASS(bdev_get_fs, c)(bdev); + if (!c) + return; + +@@ -2090,36 +2271,45 @@ static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) + down_read(&sb->s_umount); + } + +- down_write(&c->state_lock); ++ guard(rwsem_write)(&c->state_lock); ++ + struct bch_dev *ca = bdev_to_bch_dev(c, bdev); +- if (!ca) +- goto unlock; ++ if (ca) { ++ bool dev = bch2_dev_state_allowed(c, ca, ++ BCH_MEMBER_STATE_failed, ++ BCH_FORCE_IF_DEGRADED); + +- if (bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, BCH_FORCE_IF_DEGRADED)) { +- __bch2_dev_offline(c, ca); +- } else { +- if (sb) { ++ if (!dev && sb) { + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + evict_inodes(sb); + } + +- bch2_journal_flush(&c->journal); +- bch2_fs_emergency_read_only(c); ++ CLASS(printbuf, buf)(); ++ __bch2_log_msg_start(ca->name, &buf); ++ ++ prt_printf(&buf, "offline from block layer"); ++ ++ if (dev) { ++ __bch2_dev_offline(c, ca); ++ } else { ++ bch2_journal_flush(&c->journal); ++ bch2_fs_emergency_read_only2(c, &buf); ++ } ++ ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ ++ bch2_dev_put(ca); + } + +- bch2_dev_put(ca); +-unlock: + if (sb) + up_read(&sb->s_umount); +- up_write(&c->state_lock); +- bch2_ro_ref_put(c); + } + + static void bch2_fs_bdev_sync(struct block_device *bdev) + { +- struct 
bch_fs *c = bdev_get_fs(bdev); ++ CLASS(bdev_get_fs, c)(bdev); + if (!c) + return; + +@@ -2130,12 +2320,9 @@ static void bch2_fs_bdev_sync(struct block_device *bdev) + * unmounted - we only take this to avoid a warning in + * sync_filesystem: + */ +- down_read(&sb->s_umount); ++ guard(rwsem_read)(&sb->s_umount); + sync_filesystem(sb); +- up_read(&sb->s_umount); + } +- +- bch2_ro_ref_put(c); + } + + const struct blk_holder_ops bch2_sb_handle_bdev_ops = { +@@ -2151,39 +2338,38 @@ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) + cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); + } + +-struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, +- struct bch_opts opts) ++struct bch_fs *bch2_fs_open(darray_const_str *devices, ++ struct bch_opts *opts) + { +- DARRAY(struct bch_sb_handle) sbs = { 0 }; ++ bch_sb_handles sbs = {}; + struct bch_fs *c = NULL; + struct bch_sb_handle *best = NULL; +- struct printbuf errbuf = PRINTBUF; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-ENODEV); + +- if (!nr_devices) { ++ if (!devices->nr) { + ret = -EINVAL; + goto err; + } + +- ret = darray_make_room(&sbs, nr_devices); ++ ret = darray_make_room(&sbs, devices->nr); + if (ret) + goto err; + +- for (unsigned i = 0; i < nr_devices; i++) { ++ darray_for_each(*devices, i) { + struct bch_sb_handle sb = { NULL }; + +- ret = bch2_read_super(devices[i], &opts, &sb); ++ ret = bch2_read_super(*i, opts, &sb); + if (ret) + goto err; + + BUG_ON(darray_push(&sbs, sb)); + } + +- if (opts.nochanges && !opts.read_only) { +- ret = -BCH_ERR_erofs_nochanges; ++ if (opts->nochanges && !opts->read_only) { ++ ret = bch_err_throw(c, erofs_nochanges); + goto err_print; + } + +@@ -2192,7 +2378,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + best = sb; + + darray_for_each_reverse(sbs, sb) { +- ret = bch2_dev_in_fs(best, sb, &opts); ++ ret = bch2_dev_in_fs(best, sb, opts); + + if (ret == -BCH_ERR_device_has_been_removed || + ret == -BCH_ERR_device_splitbrain) { +@@ -2207,20 +2393,17 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + goto err_print; + } + +- c = bch2_fs_alloc(best->sb, opts); ++ c = bch2_fs_alloc(best->sb, opts, &sbs); + ret = PTR_ERR_OR_ZERO(c); + if (ret) + goto err; + +- down_write(&c->state_lock); +- darray_for_each(sbs, sb) { +- ret = bch2_dev_attach_bdev(c, sb); +- if (ret) { +- up_write(&c->state_lock); +- goto err; ++ scoped_guard(rwsem_write, &c->state_lock) ++ darray_for_each(sbs, sb) { ++ ret = bch2_dev_attach_bdev(c, sb); ++ if (ret) ++ goto err; + } +- } +- up_write(&c->state_lock); + + if (!c->opts.nostart) { + ret = bch2_fs_start(c); +@@ -2231,12 +2414,11 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + darray_for_each(sbs, sb) + bch2_free_super(sb); + darray_exit(&sbs); +- printbuf_exit(&errbuf); + module_put(THIS_MODULE); + return c; + err_print: + pr_err("bch_fs_open err opening %s: %s", +- devices[0], bch2_err_str(ret)); ++ devices->data[0], bch2_err_str(ret)); + err: + if (!IS_ERR_OR_NULL(c)) + bch2_fs_stop(c); +@@ -2273,9 +2455,47 @@ static int __init bcachefs_init(void) + return -ENOMEM; + } + +-#define BCH_DEBUG_PARAM(name, description) \ +- bool bch2_##name; \ +- module_param_named(name, bch2_##name, bool, 0644); \ ++#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name); ++BCH_DEBUG_PARAMS_ALL() ++#undef BCH_DEBUG_PARAM ++ ++static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp) ++{ ++ /* 
Match bool exactly, by re-using it. */ ++ struct static_key *key = kp->arg; ++ struct kernel_param boolkp = *kp; ++ bool v; ++ int ret; ++ ++ boolkp.arg = &v; ++ ++ ret = param_set_bool(val, &boolkp); ++ if (ret) ++ return ret; ++ if (v) ++ static_key_enable(key); ++ else ++ static_key_disable(key); ++ return 0; ++} ++ ++static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp) ++{ ++ struct static_key *key = kp->arg; ++ return sprintf(buffer, "%c\n", static_key_enabled(key) ? 'N' : 'Y'); ++} ++ ++/* this is unused in userspace - silence the warning */ ++__maybe_unused ++static const struct kernel_param_ops bch2_param_ops_static_key_t = { ++ .flags = KERNEL_PARAM_OPS_FL_NOARG, ++ .set = bch2_param_set_static_key_t, ++ .get = bch2_param_get_static_key_t, ++}; ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\ ++ __MODULE_PARM_TYPE(name, "static_key_t"); \ + MODULE_PARM_DESC(name, description); + BCH_DEBUG_PARAMS() + #undef BCH_DEBUG_PARAM +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +index 23533bce5709..e90bab9afe78 100644 +--- a/fs/bcachefs/super.h ++++ b/fs/bcachefs/super.h +@@ -9,6 +9,9 @@ + #include + + extern const char * const bch2_fs_flag_strs[]; ++extern const char * const bch2_write_refs[]; ++extern const char * const bch2_dev_read_refs[]; ++extern const char * const bch2_dev_write_refs[]; + + struct bch_fs *bch2_dev_to_fs(dev_t); + struct bch_fs *bch2_uuid_to_fs(__uuid_t); +@@ -29,18 +32,23 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); + struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); + + bool bch2_fs_emergency_read_only(struct bch_fs *); ++bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *); ++ + bool bch2_fs_emergency_read_only_locked(struct bch_fs *); + void bch2_fs_read_only(struct bch_fs *); + + int bch2_fs_read_write(struct bch_fs *); + int bch2_fs_read_write_early(struct bch_fs *); + ++int bch2_fs_resize_on_mount(struct bch_fs *); ++ + void __bch2_fs_stop(struct bch_fs *); + void bch2_fs_free(struct bch_fs *); + void bch2_fs_stop(struct bch_fs *); + ++int bch2_fs_init_rw(struct bch_fs *); + int bch2_fs_start(struct bch_fs *); +-struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); ++struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *); + + extern const struct blk_holder_ops bch2_sb_handle_bdev_ops; + +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 82ee333ddd21..bd3fa9c3372d 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -18,6 +18,7 @@ + #include "btree_key_cache.h" + #include "btree_update.h" + #include "btree_update_interior.h" ++#include "btree_write_buffer.h" + #include "btree_gc.h" + #include "buckets.h" + #include "clock.h" +@@ -25,6 +26,8 @@ + #include "disk_accounting.h" + #include "disk_groups.h" + #include "ec.h" ++#include "enumerated_ref.h" ++#include "error.h" + #include "inode.h" + #include "journal.h" + #include "journal_reclaim.h" +@@ -34,12 +37,15 @@ + #include "nocow_locking.h" + #include "opts.h" + #include "rebalance.h" ++#include "recovery_passes.h" + #include "replicas.h" ++#include "sb-errors.h" + #include "super-io.h" + #include "tests.h" + + #include + #include ++#include + #include + + #include "util.h" +@@ -57,7 +63,7 @@ static ssize_t fn ## _to_text(struct printbuf *, \ + static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ + char *buf) \ + { \ +- struct printbuf out = PRINTBUF; \ ++ 
CLASS(printbuf, out)(); \ + ssize_t ret = fn ## _to_text(&out, kobj, attr); \ + \ + if (out.pos && out.buf[out.pos - 1] != '\n') \ +@@ -70,7 +76,6 @@ static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ + ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ + memcpy(buf, out.buf, ret); \ + } \ +- printbuf_exit(&out); \ + return bch2_err_class(ret); \ + } \ + \ +@@ -141,13 +146,19 @@ do { \ + write_attribute(trigger_gc); + write_attribute(trigger_discards); + write_attribute(trigger_invalidates); ++write_attribute(trigger_journal_commit); + write_attribute(trigger_journal_flush); + write_attribute(trigger_journal_writes); + write_attribute(trigger_btree_cache_shrink); + write_attribute(trigger_btree_key_cache_shrink); +-write_attribute(trigger_freelist_wakeup); ++write_attribute(trigger_btree_write_buffer_flush); + write_attribute(trigger_btree_updates); ++write_attribute(trigger_freelist_wakeup); ++write_attribute(trigger_recalc_capacity); ++write_attribute(trigger_delete_dead_snapshots); ++write_attribute(trigger_emergency_read_only); + read_attribute(gc_gens_pos); ++__sysfs_attribute(read_fua_test, 0400); + + read_attribute(uuid); + read_attribute(minor); +@@ -162,12 +173,15 @@ read_attribute(io_latency_read); + read_attribute(io_latency_write); + read_attribute(io_latency_stats_read); + read_attribute(io_latency_stats_write); ++#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + read_attribute(congested); ++#endif + + read_attribute(btree_write_stats); + + read_attribute(btree_cache_size); + read_attribute(compression_stats); ++read_attribute(errors); + read_attribute(journal_debug); + read_attribute(btree_cache); + read_attribute(btree_key_cache); +@@ -176,25 +190,9 @@ read_attribute(open_buckets); + read_attribute(open_buckets_partial); + read_attribute(nocow_lock_table); + +-#ifdef BCH_WRITE_REF_DEBUG ++read_attribute(read_refs); + read_attribute(write_refs); + +-static const char * const bch2_write_refs[] = { +-#define x(n) #n, +- BCH_WRITE_REFS() +-#undef x +- NULL +-}; +- +-static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) +-{ +- bch2_printbuf_tabstop_push(out, 24); +- +- for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) +- prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i])); +-} +-#endif +- + read_attribute(internal_uuid); + read_attribute(disk_groups); + +@@ -212,6 +210,8 @@ read_attribute(copy_gc_wait); + + sysfs_pd_controller_attribute(rebalance); + read_attribute(rebalance_status); ++read_attribute(snapshot_delete_status); ++read_attribute(recovery_status); + + read_attribute(new_stripes); + +@@ -236,14 +236,13 @@ static size_t bch2_btree_cache_size(struct bch_fs *c) + size_t ret = 0; + struct btree *b; + +- mutex_lock(&bc->lock); ++ guard(mutex)(&bc->lock); + list_for_each_entry(b, &bc->live[0].list, list) + ret += btree_buf_bytes(b); + list_for_each_entry(b, &bc->live[1].list, list) + ret += btree_buf_bytes(b); + list_for_each_entry(b, &bc->freeable, list) + ret += btree_buf_bytes(b); +- mutex_unlock(&bc->lock); + return ret; + } + +@@ -308,6 +307,116 @@ static void bch2_fs_usage_base_to_text(struct printbuf *out, struct bch_fs *c) + prt_printf(out, "nr_inodes:\t%llu\n", b.nr_inodes); + } + ++static int bch2_read_fua_test(struct printbuf *out, struct bch_dev *ca) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bio *bio = NULL; ++ void *buf = NULL; ++ unsigned bs = c->opts.block_size, iters; ++ u64 end, test_duration = NSEC_PER_SEC * 2; ++ struct bch2_time_stats stats_nofua, stats_fua, stats_random; ++ int ret = 0; 
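++	/*
++	 * The loops below take three latency samples via submit_bio_wait(),
++	 * each capped at 1000 iterations or two seconds: repeated reads of
++	 * the same block without REQ_FUA, the same reads with REQ_FUA, and
++	 * reads at random offsets across the device. If plain re-reads are
++	 * at least twice as fast as random reads, the device is serving them
++	 * from its cache; if FUA latency then also falls below the midpoint
++	 * of the cached and random latencies, FUA reads appear to be cached
++	 * as well, which the verdict at the end reports as unsafe.
++	 */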
++ ++ bch2_time_stats_init_no_pcpu(&stats_nofua); ++ bch2_time_stats_init_no_pcpu(&stats_fua); ++ bch2_time_stats_init_no_pcpu(&stats_random); ++ ++ if (!bch2_dev_get_ioref(c, ca->dev_idx, READ, BCH_DEV_READ_REF_read_fua_test)) { ++ prt_str(out, "offline\n"); ++ return 0; ++ } ++ ++ struct block_device *bdev = ca->disk_sb.bdev; ++ ++ bio = bio_kmalloc(1, GFP_KERNEL); ++ if (!bio) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ buf = kmalloc(bs, GFP_KERNEL); ++ if (!buf) ++ goto err; ++ ++ end = ktime_get_ns() + test_duration; ++ for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { ++ bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); ++ bch2_bio_map(bio, buf, bs); ++ ++ u64 submit_time = ktime_get_ns(); ++ ret = submit_bio_wait(bio); ++ bch2_time_stats_update(&stats_nofua, submit_time); ++ ++ if (ret) ++ goto err; ++ } ++ ++ end = ktime_get_ns() + test_duration; ++ for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { ++ bio_init(bio, bdev, bio->bi_inline_vecs, 1, REQ_FUA|READ); ++ bch2_bio_map(bio, buf, bs); ++ ++ u64 submit_time = ktime_get_ns(); ++ ret = submit_bio_wait(bio); ++ bch2_time_stats_update(&stats_fua, submit_time); ++ ++ if (ret) ++ goto err; ++ } ++ ++ u64 dev_size = ca->mi.nbuckets * bucket_bytes(ca); ++ ++ end = ktime_get_ns() + test_duration; ++ for (iters = 0; iters < 1000 && time_before64(ktime_get_ns(), end); iters++) { ++ bio_init(bio, bdev, bio->bi_inline_vecs, 1, READ); ++ bio->bi_iter.bi_sector = (bch2_get_random_u64_below(dev_size) & ~((u64) bs - 1)) >> 9; ++ bch2_bio_map(bio, buf, bs); ++ ++ u64 submit_time = ktime_get_ns(); ++ ret = submit_bio_wait(bio); ++ bch2_time_stats_update(&stats_random, submit_time); ++ ++ if (ret) ++ goto err; ++ } ++ ++ u64 ns_nofua = mean_and_variance_get_mean(stats_nofua.duration_stats); ++ u64 ns_fua = mean_and_variance_get_mean(stats_fua.duration_stats); ++ u64 ns_rand = mean_and_variance_get_mean(stats_random.duration_stats); ++ ++ u64 stddev_nofua = mean_and_variance_get_stddev(stats_nofua.duration_stats); ++ u64 stddev_fua = mean_and_variance_get_stddev(stats_fua.duration_stats); ++ u64 stddev_rand = mean_and_variance_get_stddev(stats_random.duration_stats); ++ ++ printbuf_tabstop_push(out, 8); ++ printbuf_tabstop_push(out, 12); ++ printbuf_tabstop_push(out, 12); ++ prt_printf(out, "This test must be run on an idle drive for accurate results\n"); ++ prt_printf(out, "%s\n", dev_name(&ca->disk_sb.bdev->bd_device)); ++ prt_printf(out, "fua support advertized: %s\n", str_yes_no(bdev_fua(bdev))); ++ prt_newline(out); ++ prt_printf(out, "ns:\tlatency\rstddev\r\n"); ++ prt_printf(out, "nofua\t%llu\r%llu\r\n", ns_nofua, stddev_nofua); ++ prt_printf(out, "fua\t%llu\r%llu\r\n", ns_fua, stddev_fua); ++ prt_printf(out, "random\t%llu\r%llu\r\n", ns_rand, stddev_rand); ++ ++ bool read_cache = ns_nofua * 2 < ns_rand; ++ bool fua_cached = read_cache && ns_fua < (ns_nofua + ns_rand) / 2; ++ ++ if (!read_cache) ++ prt_str(out, "reads don't appear to be cached - safe\n"); ++ else if (!fua_cached) ++ prt_str(out, "fua reads don't appear to be cached - safe\n"); ++ else ++ prt_str(out, "fua reads appear to be cached - unsafe\n"); ++err: ++ kfree(buf); ++ kfree(bio); ++ enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_read_fua_test); ++ bch_err_fn(c, ret); ++ return ret; ++} ++ + SHOW(bch2_fs) + { + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); +@@ -334,6 +443,12 @@ SHOW(bch2_fs) + if (attr == &sysfs_rebalance_status) + bch2_rebalance_status_to_text(out, c); + ++ if (attr == 
&sysfs_snapshot_delete_status) ++ bch2_snapshot_delete_status_to_text(out, c); ++ ++ if (attr == &sysfs_recovery_status) ++ bch2_recovery_pass_status_to_text(out, c); ++ + /* Debugging: */ + + if (attr == &sysfs_journal_debug) +@@ -357,6 +472,9 @@ SHOW(bch2_fs) + if (attr == &sysfs_compression_stats) + bch2_compression_stats_to_text(out, c); + ++ if (attr == &sysfs_errors) ++ bch2_fs_errors_to_text(out, c); ++ + if (attr == &sysfs_new_stripes) + bch2_new_stripes_to_text(out, c); + +@@ -369,10 +487,8 @@ SHOW(bch2_fs) + if (attr == &sysfs_moving_ctxts) + bch2_fs_moving_ctxts_to_text(out, c); + +-#ifdef BCH_WRITE_REF_DEBUG + if (attr == &sysfs_write_refs) +- bch2_write_refs_to_text(out, c); +-#endif ++ enumerated_ref_to_text(out, &c->writes, bch2_write_refs); + + if (attr == &sysfs_nocow_lock_table) + bch2_nocow_locks_to_text(out, &c->nocow_locks); +@@ -405,7 +521,7 @@ STORE(bch2_fs) + if (attr == &sysfs_trigger_btree_updates) + queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) ++ if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)) + return -EROFS; + + if (attr == &sysfs_trigger_btree_cache_shrink) { +@@ -425,6 +541,11 @@ STORE(bch2_fs) + c->btree_key_cache.shrink->scan_objects(c->btree_key_cache.shrink, &sc); + } + ++ if (attr == &sysfs_trigger_btree_write_buffer_flush) ++ bch2_trans_do(c, ++ (bch2_btree_write_buffer_flush_sync(trans), ++ bch2_trans_begin(trans))); ++ + if (attr == &sysfs_trigger_gc) + bch2_gc_gens(c); + +@@ -434,6 +555,9 @@ STORE(bch2_fs) + if (attr == &sysfs_trigger_invalidates) + bch2_do_invalidates(c); + ++ if (attr == &sysfs_trigger_journal_commit) ++ bch2_journal_flush(&c->journal); ++ + if (attr == &sysfs_trigger_journal_flush) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_meta(&c->journal); +@@ -445,6 +569,24 @@ STORE(bch2_fs) + if (attr == &sysfs_trigger_freelist_wakeup) + closure_wake_up(&c->freelist_wait); + ++ if (attr == &sysfs_trigger_recalc_capacity) { ++ guard(rwsem_read)(&c->state_lock); ++ bch2_recalc_capacity(c); ++ } ++ ++ if (attr == &sysfs_trigger_delete_dead_snapshots) ++ __bch2_delete_dead_snapshots(c); ++ ++ if (attr == &sysfs_trigger_emergency_read_only) { ++ struct printbuf buf = PRINTBUF; ++ bch2_log_msg_start(c, &buf); ++ ++ prt_printf(&buf, "shutdown by sysfs\n"); ++ bch2_fs_emergency_read_only2(c, &buf); ++ bch2_print_str(c, KERN_ERR, buf.buf); ++ printbuf_exit(&buf); ++ } ++ + #ifdef CONFIG_BCACHEFS_TESTS + if (attr == &sysfs_perf_test) { + char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; +@@ -465,7 +607,7 @@ STORE(bch2_fs) + size = ret; + } + #endif +- bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); + return size; + } + SYSFS_OPS(bch2_fs); +@@ -476,8 +618,11 @@ struct attribute *bch2_fs_files[] = { + &sysfs_btree_write_stats, + + &sysfs_rebalance_status, ++ &sysfs_snapshot_delete_status, ++ &sysfs_recovery_status, + + &sysfs_compression_stats, ++ &sysfs_errors, + + #ifdef CONFIG_BCACHEFS_TESTS + &sysfs_perf_test, +@@ -558,9 +703,7 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_new_stripes, + &sysfs_open_buckets, + &sysfs_open_buckets_partial, +-#ifdef BCH_WRITE_REF_DEBUG + &sysfs_write_refs, +-#endif + &sysfs_nocow_lock_table, + &sysfs_io_timers_read, + &sysfs_io_timers_write, +@@ -568,12 +711,17 @@ struct attribute *bch2_fs_internal_files[] = { + &sysfs_trigger_gc, + &sysfs_trigger_discards, + &sysfs_trigger_invalidates, ++ &sysfs_trigger_journal_commit, + 
&sysfs_trigger_journal_flush, + &sysfs_trigger_journal_writes, + &sysfs_trigger_btree_cache_shrink, + &sysfs_trigger_btree_key_cache_shrink, +- &sysfs_trigger_freelist_wakeup, ++ &sysfs_trigger_btree_write_buffer_flush, + &sysfs_trigger_btree_updates, ++ &sysfs_trigger_freelist_wakeup, ++ &sysfs_trigger_recalc_capacity, ++ &sysfs_trigger_delete_dead_snapshots, ++ &sysfs_trigger_emergency_read_only, + + &sysfs_gc_gens_pos, + +@@ -626,7 +774,7 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, + * We don't need to take c->writes for correctness, but it eliminates an + * unsightly error message in the dmesg log when we're RO: + */ +- if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) ++ if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))) + return -EROFS; + + char *tmp = kstrdup(buf, GFP_KERNEL); +@@ -637,40 +785,34 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, + + u64 v; + ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: +- bch2_opt_check_may_set(c, ca, id, v); ++ bch2_opt_hook_pre_set(c, ca, id, v); + kfree(tmp); + + if (ret < 0) + goto err; + +- bch2_opt_set_sb(c, ca, opt, v); +- bch2_opt_set_by_id(&c->opts, id, v); +- +- if (v && +- (id == Opt_background_target || +- (id == Opt_foreground_target && !c->opts.background_target) || +- id == Opt_background_compression || +- (id == Opt_compression && !c->opts.background_compression))) +- bch2_set_rebalance_needs_scan(c, 0); +- +- if (v && id == Opt_rebalance_enabled) +- bch2_rebalance_wakeup(c); ++ bool is_sb = opt->get_sb || opt->get_member; ++ bool changed = false; + +- if (v && id == Opt_copygc_enabled) +- bch2_copygc_wakeup(c); ++ if (is_sb) { ++ changed = bch2_opt_set_sb(c, ca, opt, v); ++ } else if (!ca) { ++ changed = bch2_opt_get_by_id(&c->opts, id) != v; ++ } else { ++ /* device options that aren't superblock options aren't ++ * supported */ ++ BUG(); ++ } + +- if (id == Opt_discard && !ca) { +- mutex_lock(&c->sb_lock); +- for_each_member_device(c, ca) +- opt->set_member(bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx), v); ++ if (!ca) ++ bch2_opt_set_by_id(&c->opts, id, v); + +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); +- } ++ if (changed) ++ bch2_opt_hook_post_set(c, ca, 0, &c->opts, id); + + ret = size; + err: +- bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); ++ enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); + return ret; + } + +@@ -807,9 +949,10 @@ SHOW(bch2_dev) + if (attr == &sysfs_io_latency_stats_write) + bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats); + +- sysfs_printf(congested, "%u%%", +- clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) +- * 100 / CONGESTED_MAX); ++#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT ++ if (attr == &sysfs_congested) ++ bch2_dev_congested_to_text(out, ca); ++#endif + + if (attr == &sysfs_alloc_debug) + bch2_dev_alloc_debug_to_text(out, ca); +@@ -817,10 +960,19 @@ SHOW(bch2_dev) + if (attr == &sysfs_open_buckets) + bch2_open_buckets_to_text(out, c, ca); + ++ if (attr == &sysfs_read_fua_test) ++ return bch2_read_fua_test(out, ca); ++ + int opt_id = bch2_opt_lookup(attr->name); + if (opt_id >= 0) + return sysfs_opt_show(c, ca, opt_id, out); + ++ if (attr == &sysfs_read_refs) ++ enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs); ++ ++ if (attr == &sysfs_write_refs) ++ enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs); ++ + return 0; + } + +@@ -871,11 +1023,18 @@ struct attribute *bch2_dev_files[] = { + &sysfs_io_latency_write, + &sysfs_io_latency_stats_read, + &sysfs_io_latency_stats_write, ++#ifndef 
CONFIG_BCACHEFS_NO_LATENCY_ACCT + &sysfs_congested, ++#endif ++ ++ &sysfs_read_fua_test, + + /* debug: */ + &sysfs_alloc_debug, + &sysfs_open_buckets, ++ ++ &sysfs_read_refs, ++ &sysfs_write_refs, + NULL + }; + +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index 782a05fe7656..baaaedf68422 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -31,78 +31,66 @@ static void delete_test_keys(struct bch_fs *c) + + static int test_delete(struct bch_fs *c, u64 nr) + { +- struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; + struct bkey_i_cookie k; +- int ret; +- + bkey_cookie_init(&k.k_i); + k.k.p.snapshot = U32_MAX; + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, +- BTREE_ITER_intent); ++ CLASS(btree_trans, trans)(c); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, k.k.p, BTREE_ITER_intent); + +- ret = commit_do(trans, NULL, NULL, 0, +- bch2_btree_iter_traverse(trans, &iter) ?: ++ int ret = commit_do(trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, &k.k_i, 0)); + bch_err_msg(c, ret, "update error"); + if (ret) +- goto err; ++ return ret; + + pr_info("deleting once"); + ret = commit_do(trans, NULL, NULL, 0, +- bch2_btree_iter_traverse(trans, &iter) ?: ++ bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, 0)); + bch_err_msg(c, ret, "delete error (first)"); + if (ret) +- goto err; ++ return ret; + + pr_info("deleting twice"); + ret = commit_do(trans, NULL, NULL, 0, +- bch2_btree_iter_traverse(trans, &iter) ?: ++ bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, 0)); + bch_err_msg(c, ret, "delete error (second)"); + if (ret) +- goto err; +-err: +- bch2_trans_iter_exit(trans, &iter); +- bch2_trans_put(trans); +- return ret; ++ return ret; ++ ++ return 0; + } + + static int test_delete_written(struct bch_fs *c, u64 nr) + { +- struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; + struct bkey_i_cookie k; +- int ret; +- + bkey_cookie_init(&k.k_i); + k.k.p.snapshot = U32_MAX; + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, k.k.p, +- BTREE_ITER_intent); ++ CLASS(btree_trans, trans)(c); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, k.k.p, BTREE_ITER_intent); + +- ret = commit_do(trans, NULL, NULL, 0, +- bch2_btree_iter_traverse(trans, &iter) ?: ++ int ret = commit_do(trans, NULL, NULL, 0, ++ bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, &k.k_i, 0)); + bch_err_msg(c, ret, "update error"); + if (ret) +- goto err; ++ return ret; + + bch2_trans_unlock(trans); + bch2_journal_flush_all_pins(&c->journal); + + ret = commit_do(trans, NULL, NULL, 0, +- bch2_btree_iter_traverse(trans, &iter) ?: ++ bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(trans, &iter, 0)); + bch_err_msg(c, ret, "delete error"); + if (ret) +- goto err; +-err: +- bch2_trans_iter_exit(trans, &iter); +- bch2_trans_put(trans); +- return ret; ++ return ret; ++ ++ return 0; + } + + static int test_iterate(struct bch_fs *c, u64 nr) +@@ -130,13 +118,14 @@ static int test_iterate(struct bch_fs *c, u64 nr) + pr_info("iterating forwards"); + i = 0; + +- ret = bch2_trans_run(c, +- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, +- SPOS(0, 0, U32_MAX), POS(0, U64_MAX), +- 0, k, ({ ++ CLASS(btree_trans, trans)(c); ++ ++ ret = for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), ++ 0, k, ({ + BUG_ON(k.k->p.offset != i++); + 0; +- }))); ++ })); + bch_err_msg(c, ret, "error iterating forwards"); + if 
(ret) + return ret; +@@ -145,12 +134,11 @@ static int test_iterate(struct bch_fs *c, u64 nr) + + pr_info("iterating backwards"); + +- ret = bch2_trans_run(c, +- for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, ++ ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs, + SPOS(0, U64_MAX, U32_MAX), 0, k, ({ + BUG_ON(k.k->p.offset != --i); + 0; +- }))); ++ })); + bch_err_msg(c, ret, "error iterating backwards"); + if (ret) + return ret; +@@ -185,14 +173,15 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) + pr_info("iterating forwards"); + i = 0; + +- ret = bch2_trans_run(c, +- for_each_btree_key_max(trans, iter, BTREE_ID_extents, +- SPOS(0, 0, U32_MAX), POS(0, U64_MAX), +- 0, k, ({ ++ CLASS(btree_trans, trans)(c); ++ ++ ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), ++ 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; + 0; +- }))); ++ })); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) + return ret; +@@ -201,13 +190,12 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) + + pr_info("iterating backwards"); + +- ret = bch2_trans_run(c, +- for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, ++ ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents, + SPOS(0, U64_MAX, U32_MAX), 0, k, ({ + BUG_ON(k.k->p.offset != i); + i = bkey_start_offset(k.k); + 0; +- }))); ++ })); + bch_err_msg(c, ret, "error iterating backwards"); + if (ret) + return ret; +@@ -241,14 +229,15 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + pr_info("iterating forwards"); + i = 0; + +- ret = bch2_trans_run(c, +- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, +- SPOS(0, 0, U32_MAX), POS(0, U64_MAX), +- 0, k, ({ ++ CLASS(btree_trans, trans)(c); ++ ++ ret = for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), ++ 0, k, ({ + BUG_ON(k.k->p.offset != i); + i += 2; + 0; +- }))); ++ })); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) + return ret; +@@ -258,10 +247,9 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + pr_info("iterating forwards by slots"); + i = 0; + +- ret = bch2_trans_run(c, +- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, +- SPOS(0, 0, U32_MAX), POS(0, U64_MAX), +- BTREE_ITER_slots, k, ({ ++ ret = for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, ++ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), ++ BTREE_ITER_slots, k, ({ + if (i >= nr * 2) + break; + +@@ -270,7 +258,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + + i++; + 0; +- }))); ++ })); + bch_err_msg(c, ret, "error iterating forwards by slots"); + return ret; + } +@@ -301,15 +289,16 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + pr_info("iterating forwards"); + i = 0; + +- ret = bch2_trans_run(c, +- for_each_btree_key_max(trans, iter, BTREE_ID_extents, +- SPOS(0, 0, U32_MAX), POS(0, U64_MAX), +- 0, k, ({ ++ CLASS(btree_trans, trans)(c); ++ ++ ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), ++ 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; + 0; +- }))); ++ })); + bch_err_msg(c, ret, "error iterating forwards"); + if (ret) + return ret; +@@ -319,10 +308,9 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + pr_info("iterating forwards by slots"); + i = 0; + +- ret = bch2_trans_run(c, +- for_each_btree_key_max(trans, iter, BTREE_ID_extents, +- SPOS(0, 0, U32_MAX), POS(0, U64_MAX), +- BTREE_ITER_slots, 
k, ({ ++ ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, ++ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), ++ BTREE_ITER_slots, k, ({ + if (i == nr) + break; + BUG_ON(bkey_deleted(k.k) != !(i % 16)); +@@ -331,7 +319,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + BUG_ON(k.k->size != 8); + i = k.k->p.offset; + 0; +- }))); ++ })); + bch_err_msg(c, ret, "error iterating forwards by slots"); + return ret; + } +@@ -344,21 +332,16 @@ static int test_peek_end(struct bch_fs *c, u64 nr) + { + delete_test_keys(c); + +- struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; +- struct bkey_s_c k; ++ CLASS(btree_trans, trans)(c); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, +- SPOS(0, 0, U32_MAX), 0); +- +- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); ++ struct bkey_s_c k; ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + +- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + +- bch2_trans_iter_exit(trans, &iter); +- bch2_trans_put(trans); + return 0; + } + +@@ -366,21 +349,16 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) + { + delete_test_keys(c); + +- struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; +- struct bkey_s_c k; +- +- bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, +- SPOS(0, 0, U32_MAX), 0); ++ CLASS(btree_trans, trans)(c); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); + +- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); ++ struct bkey_s_c k; ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + +- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + +- bch2_trans_iter_exit(trans, &iter); +- bch2_trans_put(trans); + return 0; + } + +@@ -392,15 +370,13 @@ static int insert_test_extent(struct bch_fs *c, + u64 start, u64 end) + { + struct bkey_i_cookie k; +- int ret; +- + bkey_cookie_init(&k.k_i); + k.k_i.k.p.offset = end; + k.k_i.k.p.snapshot = U32_MAX; + k.k_i.k.size = end - start; + k.k_i.k.bversion.lo = test_version++; + +- ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); ++ int ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, NULL, 0, 0); + bch_err_fn(c, ret); + return ret; + } +@@ -446,15 +422,14 @@ static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) + static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid) + { + struct bkey_i_cookie k; +- int ret; +- + bkey_cookie_init(&k.k_i); + k.k_i.k.p.inode = inum; + k.k_i.k.p.offset = start + len; + k.k_i.k.p.snapshot = snapid; + k.k_i.k.size = len; + +- ret = bch2_trans_commit_do(c, NULL, NULL, 0, ++ CLASS(btree_trans, trans)(c); ++ int ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i, + BTREE_UPDATE_internal_snapshot_node)); + bch_err_fn(c, ret); +@@ -477,48 +452,43 @@ static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) + /* Test skipping over keys in unrelated snapshots: */ + static int 
test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) + { +- struct btree_trans *trans; +- struct btree_iter iter; +- struct bkey_s_c k; + struct bkey_i_cookie cookie; +- int ret; +- + bkey_cookie_init(&cookie.k_i); + cookie.k.p.snapshot = snapid_hi; +- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); ++ int ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); + if (ret) + return ret; + +- trans = bch2_trans_get(c); +- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, +- SPOS(0, 0, snapid_lo), 0); +- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); ++ CLASS(btree_trans, trans)(c); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); ++ ++ struct bkey_s_c k; ++ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + + BUG_ON(k.k->p.snapshot != U32_MAX); + +- bch2_trans_iter_exit(trans, &iter); +- bch2_trans_put(trans); + return ret; + } + + static int test_snapshots(struct bch_fs *c, u64 nr) + { + struct bkey_i_cookie cookie; +- u32 snapids[2]; +- u32 snapid_subvols[2] = { 1, 1 }; +- int ret; +- + bkey_cookie_init(&cookie.k_i); + cookie.k.p.snapshot = U32_MAX; +- ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); ++ ++ int ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, NULL, 0, 0); + if (ret) + return ret; + +- ret = bch2_trans_commit_do(c, NULL, NULL, 0, +- bch2_snapshot_node_create(trans, U32_MAX, +- snapids, +- snapid_subvols, +- 2)); ++ u32 snapids[2]; ++ u32 snapid_subvols[2] = { 1, 1 }; ++ ++ CLASS(btree_trans, trans)(c); ++ ret = commit_do(trans, NULL, NULL, 0, ++ bch2_snapshot_node_create(trans, U32_MAX, ++ snapids, ++ snapid_subvols, ++ 2)); + if (ret) + return ret; + +@@ -542,42 +512,37 @@ static u64 test_rand(void) + + static int rand_insert(struct bch_fs *c, u64 nr) + { +- struct btree_trans *trans = bch2_trans_get(c); +- struct bkey_i_cookie k; +- int ret = 0; +- u64 i; ++ CLASS(btree_trans, trans)(c); + +- for (i = 0; i < nr; i++) { ++ for (u64 i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; + bkey_cookie_init(&k.k_i); + k.k.p.offset = test_rand(); + k.k.p.snapshot = U32_MAX; + +- ret = commit_do(trans, NULL, NULL, 0, ++ int ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k.k_i, 0)); + if (ret) +- break; ++ return ret; + } + +- bch2_trans_put(trans); +- return ret; ++ return 0; + } + + static int rand_insert_multi(struct bch_fs *c, u64 nr) + { +- struct btree_trans *trans = bch2_trans_get(c); ++ CLASS(btree_trans, trans)(c); + struct bkey_i_cookie k[8]; +- int ret = 0; + unsigned j; +- u64 i; + +- for (i = 0; i < nr; i += ARRAY_SIZE(k)) { ++ for (u64 i = 0; i < nr; i += ARRAY_SIZE(k)) { + for (j = 0; j < ARRAY_SIZE(k); j++) { + bkey_cookie_init(&k[j].k_i); + k[j].k.p.offset = test_rand(); + k[j].k.p.snapshot = U32_MAX; + } + +- ret = commit_do(trans, NULL, NULL, 0, ++ int ret = commit_do(trans, NULL, NULL, 0, + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: +@@ -587,36 +552,27 @@ static int rand_insert_multi(struct bch_fs *c, u64 nr) + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_xattrs, &k[7].k_i, 0)); + if (ret) +- break; ++ return ret; + } + +- bch2_trans_put(trans); +- return ret; ++ return 0; + } + + static int rand_lookup(struct 
bch_fs *c, u64 nr) + { +- struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; +- struct bkey_s_c k; +- int ret = 0; +- u64 i; +- +- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, +- SPOS(0, 0, U32_MAX), 0); ++ CLASS(btree_trans, trans)(c); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); + +- for (i = 0; i < nr; i++) { +- bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX)); ++ for (u64 i = 0; i < nr; i++) { ++ bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); + +- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter))); +- ret = bkey_err(k); ++ struct bkey_s_c k; ++ int ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + if (ret) +- break; ++ return ret; + } + +- bch2_trans_iter_exit(trans, &iter); +- bch2_trans_put(trans); +- return ret; ++ return 0; + } + + static int rand_mixed_trans(struct btree_trans *trans, +@@ -627,9 +583,9 @@ static int rand_mixed_trans(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + +- bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX)); ++ bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); + +- k = bch2_btree_iter_peek(trans, iter); ++ k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + bch_err_msg(trans->c, ret, "lookup error"); + if (ret) +@@ -646,77 +602,59 @@ static int rand_mixed_trans(struct btree_trans *trans, + + static int rand_mixed(struct bch_fs *c, u64 nr) + { +- struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; +- struct bkey_i_cookie cookie; +- int ret = 0; +- u64 i, rand; ++ CLASS(btree_trans, trans)(c); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, +- SPOS(0, 0, U32_MAX), 0); +- +- for (i = 0; i < nr; i++) { +- rand = test_rand(); +- ret = commit_do(trans, NULL, NULL, 0, ++ for (u64 i = 0; i < nr; i++) { ++ u64 rand = test_rand(); ++ struct bkey_i_cookie cookie; ++ int ret = commit_do(trans, NULL, NULL, 0, + rand_mixed_trans(trans, &iter, &cookie, i, rand)); + if (ret) +- break; ++ return ret; + } + +- bch2_trans_iter_exit(trans, &iter); +- bch2_trans_put(trans); +- return ret; ++ return 0; + } + + static int __do_delete(struct btree_trans *trans, struct bpos pos) + { +- struct btree_iter iter; +- struct bkey_s_c k; +- int ret = 0; +- +- bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, +- BTREE_ITER_intent); +- k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)); +- ret = bkey_err(k); ++ CLASS(btree_iter, iter)(trans, BTREE_ID_xattrs, pos, ++ BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)); ++ int ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + if (!k.k) +- goto err; ++ return 0; + +- ret = bch2_btree_delete_at(trans, &iter, 0); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; ++ return bch2_btree_delete_at(trans, &iter, 0); + } + + static int rand_delete(struct bch_fs *c, u64 nr) + { +- struct btree_trans *trans = bch2_trans_get(c); +- int ret = 0; +- u64 i; ++ CLASS(btree_trans, trans)(c); + +- for (i = 0; i < nr; i++) { ++ for (u64 i = 0; i < nr; i++) { + struct bpos pos = SPOS(0, test_rand(), U32_MAX); + +- ret = commit_do(trans, NULL, NULL, 0, ++ int ret = commit_do(trans, NULL, NULL, 0, + __do_delete(trans, pos)); + if (ret) +- break; ++ return ret; + } + +- bch2_trans_put(trans); +- return ret; ++ return 0; + } + + static int seq_insert(struct bch_fs *c, u64 nr) + { + struct bkey_i_cookie insert; +- + 
bkey_cookie_init(&insert.k_i); + +- return bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, ++ CLASS(btree_trans, trans)(c); ++ return for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + BTREE_ITER_slots|BTREE_ITER_intent, k, + NULL, NULL, 0, ({ +@@ -724,22 +662,22 @@ static int seq_insert(struct bch_fs *c, u64 nr) + break; + insert.k.p = iter.pos; + bch2_trans_update(trans, &iter, &insert.k_i, 0); +- }))); ++ })); + } + + static int seq_lookup(struct bch_fs *c, u64 nr) + { +- return bch2_trans_run(c, +- for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, ++ CLASS(btree_trans, trans)(c); ++ return for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, +- 0)); ++ 0); + } + + static int seq_overwrite(struct bch_fs *c, u64 nr) + { +- return bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, ++ CLASS(btree_trans, trans)(c); ++ return for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + BTREE_ITER_intent, k, + NULL, NULL, 0, ({ +@@ -747,7 +685,7 @@ static int seq_overwrite(struct bch_fs *c, u64 nr) + + bkey_reassemble(&u.k_i, k); + bch2_trans_update(trans, &iter, &u.k_i, 0); +- }))); ++ })); + } + + static int seq_delete(struct bch_fs *c, u64 nr) +@@ -808,8 +746,8 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, + { + struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; + char name_buf[20]; +- struct printbuf nr_buf = PRINTBUF; +- struct printbuf per_sec_buf = PRINTBUF; ++ CLASS(printbuf, nr_buf)(); ++ CLASS(printbuf, per_sec_buf)(); + unsigned i; + u64 time; + +@@ -883,8 +821,6 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname, + div_u64(time, NSEC_PER_SEC), + div_u64(time * nr_threads, nr), + per_sec_buf.buf); +- printbuf_exit(&per_sec_buf); +- printbuf_exit(&nr_buf); + return j.ret; + } + +diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c +index 314a24d15d4e..c2eae0ab7765 100644 +--- a/fs/bcachefs/thread_with_file.c ++++ b/fs/bcachefs/thread_with_file.c +@@ -60,8 +60,7 @@ int bch2_run_thread_with_file(struct thread_with_file *thr, + err: + if (fd >= 0) + put_unused_fd(fd); +- if (thr->task) +- kthread_stop(thr->task); ++ kthread_stop(thr->task); + return ret; + } + +@@ -185,23 +184,23 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu + break; + } + +- spin_lock(&buf->lock); +- size_t makeroom = b; +- if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr)) +- makeroom = min_t(ssize_t, makeroom, +- max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr, +- 0)); +- darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT); +- +- b = min(len, darray_room(buf->buf)); +- +- if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { +- buf->buf.nr += b; +- ubuf += b; +- len -= b; +- copied += b; ++ scoped_guard(spinlock, &buf->lock) { ++ size_t makeroom = b; ++ if (!buf->waiting_for_line || memchr(buf->buf.data, '\n', buf->buf.nr)) ++ makeroom = min_t(ssize_t, makeroom, ++ max_t(ssize_t, STDIO_REDIRECT_BUFSIZE - buf->buf.nr, ++ 0)); ++ darray_make_room_gfp(&buf->buf, makeroom, GFP_NOWAIT); ++ ++ b = min(len, darray_room(buf->buf)); ++ ++ if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { ++ buf->buf.nr += b; ++ ubuf += b; ++ len -= b; ++ copied += b; ++ } + } +- spin_unlock(&buf->lock); + + if (b) { + wake_up(&buf->wait); +@@ -349,14 +348,15 @@ int bch2_stdio_redirect_read(struct stdio_redirect 
*stdio, char *ubuf, size_t le + if (stdio->done) + return -1; + +- spin_lock(&buf->lock); +- int ret = min(len, buf->buf.nr); +- buf->buf.nr -= ret; +- memcpy(ubuf, buf->buf.data, ret); +- memmove(buf->buf.data, +- buf->buf.data + ret, +- buf->buf.nr); +- spin_unlock(&buf->lock); ++ int ret; ++ scoped_guard(spinlock, &buf->lock) { ++ ret = min(len, buf->buf.nr); ++ buf->buf.nr -= ret; ++ memcpy(ubuf, buf->buf.data, ret); ++ memmove(buf->buf.data, ++ buf->buf.data + ret, ++ buf->buf.nr); ++ } + + wake_up(&buf->wait); + return ret; +diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c +index 2c34fe4be912..7b5fa44807d7 100644 +--- a/fs/bcachefs/time_stats.c ++++ b/fs/bcachefs/time_stats.c +@@ -138,10 +138,8 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); + } else { +- struct time_stat_buffer *b; +- +- preempt_disable(); +- b = this_cpu_ptr(stats->buffer); ++ guard(preempt)(); ++ struct time_stat_buffer *b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct time_stat_buffer_entry) { +@@ -151,7 +149,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) + + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) + time_stats_clear_buffer(stats, b); +- preempt_enable(); + } + } + +diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h +index 519d00d62ae7..3776a1403104 100644 +--- a/fs/bcachefs/trace.h ++++ b/fs/bcachefs/trace.h +@@ -92,58 +92,6 @@ DECLARE_EVENT_CLASS(trans_str_nocaller, + __entry->trans_fn, __get_str(str)) + ); + +-DECLARE_EVENT_CLASS(btree_node_nofs, +- TP_PROTO(struct bch_fs *c, struct btree *b), +- TP_ARGS(c, b), +- +- TP_STRUCT__entry( +- __field(dev_t, dev ) +- __field(u8, level ) +- __field(u8, btree_id ) +- TRACE_BPOS_entries(pos) +- ), +- +- TP_fast_assign( +- __entry->dev = c->dev; +- __entry->level = b->c.level; +- __entry->btree_id = b->c.btree_id; +- TRACE_BPOS_assign(pos, b->key.k.p); +- ), +- +- TP_printk("%d,%d %u %s %llu:%llu:%u", +- MAJOR(__entry->dev), MINOR(__entry->dev), +- __entry->level, +- bch2_btree_id_str(__entry->btree_id), +- __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) +-); +- +-DECLARE_EVENT_CLASS(btree_node, +- TP_PROTO(struct btree_trans *trans, struct btree *b), +- TP_ARGS(trans, b), +- +- TP_STRUCT__entry( +- __field(dev_t, dev ) +- __array(char, trans_fn, 32 ) +- __field(u8, level ) +- __field(u8, btree_id ) +- TRACE_BPOS_entries(pos) +- ), +- +- TP_fast_assign( +- __entry->dev = trans->c->dev; +- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); +- __entry->level = b->c.level; +- __entry->btree_id = b->c.btree_id; +- TRACE_BPOS_assign(pos, b->key.k.p); +- ), +- +- TP_printk("%d,%d %s %u %s %llu:%llu:%u", +- MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn, +- __entry->level, +- bch2_btree_id_str(__entry->btree_id), +- __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) +-); +- + DECLARE_EVENT_CLASS(bch_fs, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c), +@@ -199,6 +147,50 @@ DECLARE_EVENT_CLASS(bio, + (unsigned long long)__entry->sector, __entry->nr_sector) + ); + ++/* errors */ ++ ++TRACE_EVENT(error_throw, ++ TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip), ++ TP_ARGS(c, bch_err, ip), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(int, err ) ++ __array(char, err_str, 32 ) ++ __array(char, ip, 32 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->err = bch_err; ++ 
strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str)); ++ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); ++ ), ++ ++ TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->ip, __entry->err_str) ++); ++ ++TRACE_EVENT(error_downcast, ++ TP_PROTO(int bch_err, int std_err, unsigned long ip), ++ TP_ARGS(bch_err, std_err, ip), ++ ++ TP_STRUCT__entry( ++ __array(char, bch_err, 32 ) ++ __array(char, std_err, 32 ) ++ __array(char, ip, 32 ) ++ ), ++ ++ TP_fast_assign( ++ strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); ++ strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); ++ snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); ++ ), ++ ++ TP_printk("%s ret %s -> %s %s", __entry->ip, ++ __entry->bch_err, __entry->std_err, __entry->ip) ++); ++ + /* disk_accounting.c */ + + TRACE_EVENT(accounting_mem_insert, +@@ -300,23 +292,9 @@ DEFINE_EVENT(bio, io_read_promote, + TP_ARGS(bio) + ); + +-TRACE_EVENT(io_read_nopromote, +- TP_PROTO(struct bch_fs *c, int ret), +- TP_ARGS(c, ret), +- +- TP_STRUCT__entry( +- __field(dev_t, dev ) +- __array(char, ret, 32 ) +- ), +- +- TP_fast_assign( +- __entry->dev = c->dev; +- strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); +- ), +- +- TP_printk("%d,%d ret %s", +- MAJOR(__entry->dev), MINOR(__entry->dev), +- __entry->ret) ++DEFINE_EVENT(fs_str, io_read_nopromote, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) + ); + + DEFINE_EVENT(bio, io_read_bounce, +@@ -339,6 +317,11 @@ DEFINE_EVENT(bio, io_read_reuse_race, + TP_ARGS(bio) + ); + ++DEFINE_EVENT(bio, io_read_fail_and_poison, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ + /* ec.c */ + + TRACE_EVENT(stripe_create, +@@ -478,9 +461,9 @@ TRACE_EVENT(btree_cache_scan, + __entry->nr_to_scan, __entry->can_free, __entry->ret) + ); + +-DEFINE_EVENT(btree_node_nofs, btree_cache_reap, +- TP_PROTO(struct bch_fs *c, struct btree *b), +- TP_ARGS(c, b) ++DEFINE_EVENT(fs_str, btree_cache_reap, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) + ); + + DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail, +@@ -505,39 +488,24 @@ DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock, + + /* Btree */ + +-DEFINE_EVENT(btree_node, btree_node_read, +- TP_PROTO(struct btree_trans *trans, struct btree *b), +- TP_ARGS(trans, b) ++DEFINE_EVENT(fs_str, btree_node_read, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) + ); + +-TRACE_EVENT(btree_node_write, +- TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), +- TP_ARGS(b, bytes, sectors), +- +- TP_STRUCT__entry( +- __field(enum btree_node_type, type) +- __field(unsigned, bytes ) +- __field(unsigned, sectors ) +- ), +- +- TP_fast_assign( +- __entry->type = btree_node_type(b); +- __entry->bytes = bytes; +- __entry->sectors = sectors; +- ), +- +- TP_printk("bkey type %u bytes %u sectors %u", +- __entry->type , __entry->bytes, __entry->sectors) ++DEFINE_EVENT(fs_str, btree_node_write, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) + ); + +-DEFINE_EVENT(btree_node, btree_node_alloc, +- TP_PROTO(struct btree_trans *trans, struct btree *b), +- TP_ARGS(trans, b) ++DEFINE_EVENT(fs_str, btree_node_alloc, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) + ); + +-DEFINE_EVENT(btree_node, btree_node_free, +- TP_PROTO(struct btree_trans *trans, struct btree *b), +- TP_ARGS(trans, b) ++DEFINE_EVENT(fs_str, btree_node_free, ++ TP_PROTO(struct bch_fs *c, 
const char *str), ++ TP_ARGS(c, str) + ); + + TRACE_EVENT(btree_reserve_get_fail, +@@ -568,29 +536,29 @@ TRACE_EVENT(btree_reserve_get_fail, + __entry->ret) + ); + +-DEFINE_EVENT(btree_node, btree_node_compact, +- TP_PROTO(struct btree_trans *trans, struct btree *b), +- TP_ARGS(trans, b) ++DEFINE_EVENT(fs_str, btree_node_set_root, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) + ); + +-DEFINE_EVENT(btree_node, btree_node_merge, +- TP_PROTO(struct btree_trans *trans, struct btree *b), +- TP_ARGS(trans, b) ++DEFINE_EVENT(fs_str, btree_node_rewrite, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) + ); + +-DEFINE_EVENT(btree_node, btree_node_split, +- TP_PROTO(struct btree_trans *trans, struct btree *b), +- TP_ARGS(trans, b) ++DEFINE_EVENT(fs_str, btree_node_merge, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) + ); + +-DEFINE_EVENT(btree_node, btree_node_rewrite, +- TP_PROTO(struct btree_trans *trans, struct btree *b), +- TP_ARGS(trans, b) ++DEFINE_EVENT(fs_str, btree_node_compact, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) + ); + +-DEFINE_EVENT(btree_node, btree_node_set_root, +- TP_PROTO(struct btree_trans *trans, struct btree *b), +- TP_ARGS(trans, b) ++DEFINE_EVENT(fs_str, btree_node_split, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) + ); + + TRACE_EVENT(btree_path_relock_fail, +@@ -1031,34 +999,14 @@ TRACE_EVENT(trans_blocked_journal_reclaim, + __entry->must_wait) + ); + +-TRACE_EVENT(trans_restart_journal_preres_get, +- TP_PROTO(struct btree_trans *trans, +- unsigned long caller_ip, +- unsigned flags), +- TP_ARGS(trans, caller_ip, flags), +- +- TP_STRUCT__entry( +- __array(char, trans_fn, 32 ) +- __field(unsigned long, caller_ip ) +- __field(unsigned, flags ) +- ), +- +- TP_fast_assign( +- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); +- __entry->caller_ip = caller_ip; +- __entry->flags = flags; +- ), +- +- TP_printk("%s %pS %x", __entry->trans_fn, +- (void *) __entry->caller_ip, +- __entry->flags) +-); +- ++#if 0 ++/* todo: bring back dynamic fault injection */ + DEFINE_EVENT(transaction_event, trans_restart_fault_inject, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) + ); ++#endif + + DEFINE_EVENT(transaction_event, trans_traverse_all, + TP_PROTO(struct btree_trans *trans, +@@ -1122,51 +1070,9 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, + TP_ARGS(trans, caller_ip, path) + ); + +-TRACE_EVENT(trans_restart_upgrade, +- TP_PROTO(struct btree_trans *trans, +- unsigned long caller_ip, +- struct btree_path *path, +- unsigned old_locks_want, +- unsigned new_locks_want, +- struct get_locks_fail *f), +- TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f), +- +- TP_STRUCT__entry( +- __array(char, trans_fn, 32 ) +- __field(unsigned long, caller_ip ) +- __field(u8, btree_id ) +- __field(u8, old_locks_want ) +- __field(u8, new_locks_want ) +- __field(u8, level ) +- __field(u32, path_seq ) +- __field(u32, node_seq ) +- TRACE_BPOS_entries(pos) +- ), +- +- TP_fast_assign( +- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); +- __entry->caller_ip = caller_ip; +- __entry->btree_id = path->btree_id; +- __entry->old_locks_want = old_locks_want; +- __entry->new_locks_want = new_locks_want; +- __entry->level = f->l; +- __entry->path_seq = path->l[f->l].lock_seq; +- __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 
0 : f->b->c.lock.seq; +- TRACE_BPOS_assign(pos, path->pos) +- ), +- +- TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u", +- __entry->trans_fn, +- (void *) __entry->caller_ip, +- bch2_btree_id_str(__entry->btree_id), +- __entry->pos_inode, +- __entry->pos_offset, +- __entry->pos_snapshot, +- __entry->old_locks_want, +- __entry->new_locks_want, +- __entry->level, +- __entry->path_seq, +- __entry->node_seq) ++DEFINE_EVENT(fs_str, trans_restart_upgrade, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) + ); + + DEFINE_EVENT(trans_str, trans_restart_relock, +@@ -1188,19 +1094,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, + TP_ARGS(trans, caller_ip, path) + ); + +-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, +- TP_PROTO(struct btree_trans *trans, +- unsigned long caller_ip, +- struct btree_path *path), +- TP_ARGS(trans, caller_ip, path) +-); +- +-DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade, +- TP_PROTO(struct btree_trans *trans, +- unsigned long caller_ip), +- TP_ARGS(trans, caller_ip) +-); +- + DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, +@@ -1222,13 +1115,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, + TP_ARGS(trans, caller_ip, path) + ); + +-DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, +- TP_PROTO(struct btree_trans *trans, +- unsigned long caller_ip, +- struct btree_path *path), +- TP_ARGS(trans, caller_ip, path) +-); +- + DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, +@@ -1287,44 +1173,6 @@ TRACE_EVENT(trans_restart_mem_realloced, + __entry->bytes) + ); + +-TRACE_EVENT(trans_restart_key_cache_key_realloced, +- TP_PROTO(struct btree_trans *trans, +- unsigned long caller_ip, +- struct btree_path *path, +- unsigned old_u64s, +- unsigned new_u64s), +- TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s), +- +- TP_STRUCT__entry( +- __array(char, trans_fn, 32 ) +- __field(unsigned long, caller_ip ) +- __field(enum btree_id, btree_id ) +- TRACE_BPOS_entries(pos) +- __field(u32, old_u64s ) +- __field(u32, new_u64s ) +- ), +- +- TP_fast_assign( +- strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); +- __entry->caller_ip = caller_ip; +- +- __entry->btree_id = path->btree_id; +- TRACE_BPOS_assign(pos, path->pos); +- __entry->old_u64s = old_u64s; +- __entry->new_u64s = new_u64s; +- ), +- +- TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", +- __entry->trans_fn, +- (void *) __entry->caller_ip, +- bch2_btree_id_str(__entry->btree_id), +- __entry->pos_inode, +- __entry->pos_offset, +- __entry->pos_snapshot, +- __entry->old_u64s, +- __entry->new_u64s) +-); +- + DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), +@@ -1468,23 +1316,49 @@ DEFINE_EVENT(fs_str, data_update, + TP_ARGS(c, str) + ); + +-TRACE_EVENT(error_downcast, +- TP_PROTO(int bch_err, int std_err, unsigned long ip), +- TP_ARGS(bch_err, std_err, ip), ++DEFINE_EVENT(fs_str, data_update_done_no_rw_devs, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) ++); + +- TP_STRUCT__entry( +- __array(char, bch_err, 32 ) +- __array(char, std_err, 32 ) +- __array(char, ip, 32 ) +- ), ++DEFINE_EVENT(fs_str, io_move_pred, ++ TP_PROTO(struct 
bch_fs *c, const char *str), ++ TP_ARGS(c, str) ++); + +- TP_fast_assign( +- strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); +- strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); +- snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); +- ), ++DEFINE_EVENT(fs_str, io_move_created_rebalance, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) ++); + +- TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip) ++DEFINE_EVENT(fs_str, io_move_evacuate_bucket, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) ++); ++ ++DEFINE_EVENT(fs_str, extent_trim_atomic, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) ++); ++ ++DEFINE_EVENT(fs_str, btree_iter_peek_slot, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) ++); ++ ++DEFINE_EVENT(fs_str, __btree_iter_peek, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) ++); ++ ++DEFINE_EVENT(fs_str, btree_iter_peek_max, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) ++); ++ ++DEFINE_EVENT(fs_str, btree_iter_peek_prev_min, ++ TP_PROTO(struct bch_fs *c, const char *str), ++ TP_ARGS(c, str) + ); + + #ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS +@@ -1899,21 +1773,6 @@ TRACE_EVENT(btree_path_free, + __entry->dup_locked) + ); + +-TRACE_EVENT(btree_path_free_trans_begin, +- TP_PROTO(btree_path_idx_t path), +- TP_ARGS(path), +- +- TP_STRUCT__entry( +- __field(btree_path_idx_t, idx ) +- ), +- +- TP_fast_assign( +- __entry->idx = path; +- ), +- +- TP_printk(" path %3u", __entry->idx) +-); +- + #else /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ + #ifndef _TRACE_BCACHEFS_H + +@@ -1931,7 +1790,6 @@ static inline void trace_btree_path_traverse_start(struct btree_trans *trans, st + static inline void trace_btree_path_traverse_end(struct btree_trans *trans, struct btree_path *path) {} + static inline void trace_btree_path_set_pos(struct btree_trans *trans, struct btree_path *path, struct bpos *new_pos) {} + static inline void trace_btree_path_free(struct btree_trans *trans, btree_path_idx_t path, struct btree_path *dup) {} +-static inline void trace_btree_path_free_trans_begin(btree_path_idx_t path) {} + + #endif + #endif /* CONFIG_BCACHEFS_PATH_TRACEPOINTS */ +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +index 87af551692f4..2ded7f3c835f 100644 +--- a/fs/bcachefs/util.c ++++ b/fs/bcachefs/util.c +@@ -252,8 +252,17 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v) + bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); + } + +-static void __bch2_print_string_as_lines(const char *prefix, const char *lines, +- bool nonblocking) ++static bool string_is_spaces(const char *str) ++{ ++ while (*str) { ++ if (*str != ' ') ++ return false; ++ str++; ++ } ++ return true; ++} ++ ++void bch2_print_string_as_lines(const char *prefix, const char *lines) + { + bool locked = false; + const char *p; +@@ -263,15 +272,13 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines, + return; + } + +- if (!nonblocking) { +- console_lock(); +- locked = true; +- } else { +- locked = console_trylock(); +- } ++ locked = console_trylock(); + + while (*lines) { + p = strchrnul(lines, '\n'); ++ if (!*p && string_is_spaces(lines)) ++ break; ++ + printk("%s%.*s\n", prefix, (int) (p - lines), lines); + if (!*p) + break; +@@ -281,16 +288,6 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines, + console_unlock(); + } + +-void bch2_print_string_as_lines(const char *prefix, const 
char *lines) +-{ +- return __bch2_print_string_as_lines(prefix, lines, false); +-} +- +-void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines) +-{ +- return __bch2_print_string_as_lines(prefix, lines, true); +-} +- + int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, + gfp_t gfp) + { +@@ -302,17 +299,12 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigne + if (ret) + return ret; + +- if (!down_read_trylock(&task->signal->exec_update_lock)) +- return -1; +- + do { + nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1); + } while (nr_entries == stack->size && + !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp))); + + stack->nr = nr_entries; +- up_read(&task->signal->exec_update_lock); +- + return ret; + #else + return 0; +@@ -329,11 +321,10 @@ void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) + + int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp) + { +- bch_stacktrace stack = { 0 }; ++ CLASS(bch_stacktrace, stack)(); + int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp); + + bch2_prt_backtrace(out, &stack); +- darray_exit(&stack); + return ret; + } + +@@ -620,17 +611,10 @@ void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_contro + + void bch2_bio_map(struct bio *bio, void *base, size_t size) + { +- while (size) { +- struct page *page = is_vmalloc_addr(base) +- ? vmalloc_to_page(base) +- : virt_to_page(base); +- unsigned offset = offset_in_page(base); +- unsigned len = min_t(size_t, PAGE_SIZE - offset, size); +- +- BUG_ON(!bio_add_page(bio, page, len, offset)); +- size -= len; +- base += len; +- } ++ if (is_vmalloc_addr(base)) ++ bio_add_vmalloc(bio, base, size); ++ else ++ bio_add_virt_nofail(bio, base, size); + } + + int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) +@@ -725,6 +709,16 @@ void bch2_corrupt_bio(struct bio *bio) + } + #endif + ++void bch2_bio_to_text(struct printbuf *out, struct bio *bio) ++{ ++ prt_printf(out, "bi_remaining:\t%u\n", ++ atomic_read(&bio->__bi_remaining)); ++ prt_printf(out, "bi_end_io:\t%ps\n", ++ bio->bi_end_io); ++ prt_printf(out, "bi_status:\t%u\n", ++ bio->bi_status); ++} ++ + #if 0 + void eytzinger1_test(void) + { +@@ -987,9 +981,8 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) + int cpu; + + /* access to pcpu vars has to be blocked by other locking */ +- preempt_disable(); +- ret = this_cpu_ptr(p); +- preempt_enable(); ++ scoped_guard(preempt) ++ ret = this_cpu_ptr(p); + + for_each_possible_cpu(cpu) { + u64 *i = per_cpu_ptr(p, cpu); +@@ -1003,14 +996,14 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) + return ret; + } + +-void bch2_darray_str_exit(darray_str *d) ++void bch2_darray_str_exit(darray_const_str *d) + { + darray_for_each(*d, i) + kfree(*i); + darray_exit(d); + } + +-int bch2_split_devs(const char *_dev_name, darray_str *ret) ++int bch2_split_devs(const char *_dev_name, darray_const_str *ret) + { + darray_init(ret); + +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index 3e52c7f8ddd2..31e8a4575e4b 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -55,15 +56,16 @@ static inline size_t buf_pages(void *p, size_t len) + PAGE_SIZE); + } + +-static inline void *bch2_kvmalloc(size_t n, gfp_t flags) ++static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t 
flags)
+ {
+ void *p = unlikely(n >= INT_MAX)
+- ? vmalloc(n)
+- : kvmalloc(n, flags & ~__GFP_ZERO);
++ ? vmalloc_noprof(n)
++ : kvmalloc_noprof(n, flags & ~__GFP_ZERO);
+ if (p && (flags & __GFP_ZERO))
+ memset(p, 0, n);
+ return p;
+ }
++#define bch2_kvmalloc(...) alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__))
+
+ #define init_heap(heap, _size, gfp) \
+ ({ \
+@@ -211,10 +213,10 @@ u64 bch2_read_flag_list(const char *, const char * const[]);
+ void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
+ void bch2_prt_u64_base2(struct printbuf *, u64);
+
+-void bch2_print_string_as_lines(const char *prefix, const char *lines);
+-void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines);
++void bch2_print_string_as_lines(const char *, const char *);
++
++DEFINE_DARRAY_NAMED(bch_stacktrace, unsigned long);
+
+-typedef DARRAY(unsigned long) bch_stacktrace;
+ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
+ void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
+ int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
+@@ -419,6 +421,8 @@ static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio)
+ #define bch2_maybe_corrupt_bio(...) do {} while (0)
+ #endif
+
++void bch2_bio_to_text(struct printbuf *, struct bio *);
++
+ static inline void memcpy_u64s_small(void *dst, const void *src,
+ unsigned u64s)
+ {
+@@ -688,8 +692,8 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r)
+ return l.len == r.len && !memcmp(l.name, r.name, l.len);
+ }
+
+-void bch2_darray_str_exit(darray_str *);
+-int bch2_split_devs(const char *, darray_str *);
++void bch2_darray_str_exit(darray_const_str *);
++int bch2_split_devs(const char *, darray_const_str *);
+
+ #ifdef __KERNEL__
+
+@@ -730,6 +734,13 @@ static inline bool test_bit_le64(size_t bit, __le64 *addr)
+ return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
+ }
+
++static inline bool __test_and_set_bit_le64(size_t bit, __le64 *addr)
++{
++ bool ret = test_bit_le64(bit, addr);
++ __set_bit_le64(bit, addr);
++ return ret;
++}
++
+ static inline void memcpy_swab(void *_dst, void *_src, size_t len)
+ {
+ u8 *dst = _dst + len;
+diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
+index e6be32003f3b..6094b568dd33 100644
+--- a/fs/bcachefs/xattr.c
++++ b/fs/bcachefs/xattr.c
+@@ -38,7 +38,7 @@ static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
+ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k);
+
+ return bch2_xattr_hash(info,
+- &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len));
++ &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len));
+ }
+
+ static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
+@@ -48,7 +48,7 @@ static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r)
+
+ return l.v->x_type != r->type ||
+ l.v->x_name_len != r->name.len ||
+- memcmp(l.v->x_name, r->name.name, r->name.len);
++ memcmp(l.v->x_name_and_value, r->name.name, r->name.len);
+ }
+
+ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+@@ -58,7 +58,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
+
+ return l.v->x_type != r.v->x_type ||
+ l.v->x_name_len != r.v->x_name_len ||
+- memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len);
++ memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len);
+ }
+
+ const struct bch_hash_desc bch2_xattr_hash_desc = {
+@@ -96,7 +96,7 @@ int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k,
+ c, xattr_invalid_type,
+ "invalid type (%u)", xattr.v->x_type);
+
+- bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len),
++ bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len),
+ c, xattr_name_invalid_chars,
+ "xattr name has invalid characters");
+ fsck_err:
+@@ -120,13 +120,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c,
+ unsigned name_len = xattr.v->x_name_len;
+ unsigned val_len = le16_to_cpu(xattr.v->x_val_len);
+ unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) -
+- offsetof(struct bch_xattr, x_name);
++ offsetof(struct bch_xattr, x_name_and_value);
+
+ val_len = min_t(int, val_len, max_name_val_bytes - name_len);
+ name_len = min(name_len, max_name_val_bytes);
+
+ prt_printf(out, "%.*s:%.*s",
+- name_len, xattr.v->x_name,
++ name_len, xattr.v->x_name_and_value,
+ val_len, (char *) xattr_val(xattr.v));
+
+ if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS ||
+@@ -157,7 +157,7 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info
+ else
+ memcpy(buffer, xattr_val(xattr.v), ret);
+ }
+- bch2_trans_iter_exit(trans, &iter);
++ bch2_trans_iter_exit(&iter);
+ return ret;
+ }
+
+@@ -168,7 +168,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
+ int type, int flags)
+ {
+ struct bch_fs *c = trans->c;
+- struct btree_iter inode_iter = {};
++ struct btree_iter inode_iter = { NULL };
+ int ret;
+
+ ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?:
+@@ -176,10 +176,15 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
+ if (ret)
+ return ret;
+
++ /*
++ * Besides the ctime update, extents, dirents and xattrs updates require
++ * that an inode update also happens - to ensure that if a key exists in
++ * one of those btrees with a given snapshot ID an inode is also present
++ */
+ inode_u->bi_ctime = bch2_current_time(c);
+
+ ret = bch2_inode_write(trans, &inode_iter, inode_u);
+- bch2_trans_iter_exit(trans, &inode_iter);
++ bch2_trans_iter_exit(&inode_iter);
+
+ if (ret)
+ return ret;
+@@ -202,7 +207,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum,
+ xattr->v.x_type = type;
+ xattr->v.x_name_len = namelen;
+ xattr->v.x_val_len = cpu_to_le16(size);
+- memcpy(xattr->v.x_name, name, namelen);
++ memcpy(xattr->v.x_name_and_value, name, namelen);
+ memcpy(xattr_val(&xattr->v), value, size);
+
+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+@@ -270,7 +275,7 @@ static int bch2_xattr_emit(struct dentry *dentry,
+ if (!prefix)
+ return 0;
+
+- return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf);
++ return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf);
+ }
+
+ static int bch2_xattr_list_bcachefs(struct bch_fs *c,
+@@ -308,8 +313,8 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+ struct xattr_buf buf = { .buf = buffer, .len = buffer_size };
+ u64 offset = 0, inum = inode->ei_inode.bi_inum;
+
+- int ret = bch2_trans_run(c,
+- for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs,
++ CLASS(btree_trans, trans)(c);
++ int ret = for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs,
+ POS(inum, offset),
+ POS(inum, U64_MAX),
+ inode->ei_inum.subvol, 0, k, ({
+@@ -317,7 +322,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size)
+ continue;
+
+ bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf);
+- }))) ?:
++ })) ?:
+ bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false) ?:
+ bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true);
+
+@@ -330,9 +335,10 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
+ {
+ struct bch_inode_info *inode = to_bch_ei(vinode);
+ struct bch_fs *c = inode->v.i_sb->s_fs_info;
+- int ret = bch2_trans_do(c,
+- bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
++ CLASS(btree_trans, trans)(c);
+
++ int ret = lockrestart_do(trans,
++ bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
+ if (ret < 0 && bch2_err_matches(ret, ENOENT))
+ ret = -ENODATA;
+
+@@ -351,12 +357,12 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler,
+ struct bch_inode_unpacked inode_u;
+ int ret;
+
+- ret = bch2_trans_run(c,
+- commit_do(trans, NULL, NULL, 0,
++ CLASS(btree_trans, trans)(c);
++ ret = commit_do(trans, NULL, NULL, 0,
+ bch2_xattr_set(trans, inode_inum(inode), &inode_u,
+ &hash, name, value, size,
+ handler->flags, flags)) ?:
+- (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0));
++ (bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME), 0);
+
+ return bch2_err_class(ret);
+ }
+@@ -413,7 +419,6 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+ bch2_inode_opts_to_opts(&inode->ei_inode);
+ const struct bch_option *opt;
+ int id, inode_opt_id;
+- struct printbuf out = PRINTBUF;
+ int ret;
+ u64 v;
+
+@@ -434,6 +439,7 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id)))
+ return -ENODATA;
+
++ CLASS(printbuf, out)();
+ v = bch2_opt_get_by_id(&opts, id);
+ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0);
+
+@@ -448,7 +454,6 @@ static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler,
+ memcpy(buffer, out.buf, out.pos);
+ }
+
+- printbuf_exit(&out);
+ return ret;
+ }
+
+@@ -527,11 +532,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+ kfree(buf);
+
+ if (ret < 0)
+- goto err_class_exit;
++ goto err;
+
+- ret = bch2_opt_check_may_set(c, NULL, opt_id, v);
++ ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v);
+ if (ret < 0)
+- goto err_class_exit;
++ goto err;
+
+ s.v = v + 1;
+ s.defined = true;
+@@ -543,7 +548,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+ * rename() also has to deal with keeping inherited options up
+ * to date - see bch2_reinherit_attrs()
+ */
+- spin_lock(&dentry->d_lock);
++ guard(spinlock)(&dentry->d_lock);
+ if (!IS_ROOT(dentry)) {
+ struct bch_inode_info *dir =
+ to_bch_ei(d_inode(dentry->d_parent));
+@@ -552,26 +557,24 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
+ } else {
+ s.v = 0;
+ }
+- spin_unlock(&dentry->d_lock);
+
+ s.defined = false;
+ }
+
+- mutex_lock(&inode->ei_update_lock);
+- if (inode_opt_id == Inode_opt_project) {
+- /*
+- * inode fields accessible via the xattr interface are stored
+- * with a +1 bias, so that 0 means unset:
+- */
+- ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
+- if (ret)
+- goto err;
+- }
++ scoped_guard(mutex, &inode->ei_update_lock) {
++ if (inode_opt_id == Inode_opt_project) {
++ /*
++ * inode fields accessible via the xattr interface are stored
++ * with a +1 bias, so that 0 means unset:
++ */
++ ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
++ if (ret)
++ goto err;
++ }
+
+- ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
++ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0);
++ }
+ err:
+- mutex_unlock(&inode->ei_update_lock);
+-err_class_exit:
+ return bch2_err_class(ret);
+ }
+
+diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h
+index 132fbbd15a66..1139bf345f70 100644
+--- a/fs/bcachefs/xattr.h
++++ b/fs/bcachefs/xattr.h
+@@ -18,12 +18,12 @@ void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+ static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
+ {
+- return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
++ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) +
+ name_len + val_len, sizeof(u64));
+ }
+
+ #define xattr_val(_xattr) \
+- ((void *) (_xattr)->x_name + (_xattr)->x_name_len)
++ ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len)
+
+ struct xattr_search_key {
+ u8 type;
+diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
+index 67426e33d04e..4121b78d9a92 100644
+--- a/fs/bcachefs/xattr_format.h
++++ b/fs/bcachefs/xattr_format.h
+@@ -16,10 +16,10 @@ struct bch_xattr {
+ /*
+ * x_name contains the name and value counted by
+ * x_name_len + x_val_len. The introduction of
+- * __counted_by(x_name_len) caused a false positive
++ * __counted_by(x_name_len) previously caused a false positive
+ * detection of an out of bounds write.
+ */
+- __u8 x_name[];
++ __u8 x_name_and_value[];
+ } __packed __aligned(8);
+
+ #endif /* _BCACHEFS_XATTR_FORMAT_H */
+--
+2.49.1
+
diff --git a/sys-kernel/hardened-kernel/files/linux-6.15/1194_ovl-support-layers-on-case-folding-capable-filesyste.patch b/sys-kernel/hardened-kernel/files/linux-6.15/1194_ovl-support-layers-on-case-folding-capable-filesyste.patch
new file mode 100644
index 0000000..5ffc0ff
--- /dev/null
+++ b/sys-kernel/hardened-kernel/files/linux-6.15/1194_ovl-support-layers-on-case-folding-capable-filesyste.patch
@@ -0,0 +1,177 @@
+From 81f83264e1127666cfc72cc998b69103ae44b881 Mon Sep 17 00:00:00 2001
+From: Amir Goldstein
+Date: Mon, 2 Jun 2025 19:17:02 +0200
+Subject: [PATCH] ovl: support layers on case-folding capable filesystems
+Content-Type: text/plain; charset="utf-8"
+Content-Transfer-Encoding: 8bit
+
+Case folding is often applied to subtrees and not on an entire
+filesystem.
+
+Disallowing layers from filesystems that support case folding is overly
+limiting.
+
+Replace the rule that case-folding capable filesystems are not allowed
+as layers with a rule that case-folded directories are not allowed in a
+merged directory stack.
+
+Should case folding be enabled on an underlying directory while
+overlayfs is mounted, the outcome is generally undefined.
+
+Specifically in ovl_lookup(), we check the base underlying directory
+and fail with -ESTALE and write a warning to kmsg if case folding is
+enabled on an underlying directory.
+
+Suggested-by: Kent Overstreet
+Link: https://lore.kernel.org/linux-fsdevel/20250520051600.1903319-1-kent.overstreet@linux.dev/
+Signed-off-by: Amir Goldstein
+Signed-off-by: Kent Overstreet
+Signed-off-by: Alexander Miroshnichenko
+---
+ fs/overlayfs/namei.c | 31 ++++++++++++++++++++++++++++---
+ fs/overlayfs/overlayfs.h | 6 ++++++
+ fs/overlayfs/params.c | 10 ++++------
+ fs/overlayfs/util.c | 15 +++++++++++----
+ 4 files changed, 49 insertions(+), 13 deletions(-)
+
+diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
+index 2043f0369059..76d6248b625e 100644
+--- a/fs/overlayfs/namei.c
++++ b/fs/overlayfs/namei.c
+@@ -230,13 +230,26 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
+ struct dentry **ret, bool drop_negative)
+ {
+ struct ovl_fs *ofs = OVL_FS(d->sb);
+- struct dentry *this;
++ struct dentry *this = NULL;
++ const char *warn;
+ struct path path;
+ int err;
+ bool last_element = !post[0];
+ bool is_upper = d->layer->idx == 0;
+ char val;
+
++ /*
++ * We allow filesystems that are case-folding capable but deny composing
++ * ovl stack from case-folded directories. If someone has enabled case
++ * folding on a directory on underlying layer, the warranty of the ovl
++ * stack is voided.
++ */
++ if (ovl_dentry_casefolded(base)) {
++ warn = "case folded parent";
++ err = -ESTALE;
++ goto out_warn;
++ }
++
+ this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative);
+ if (IS_ERR(this)) {
+ err = PTR_ERR(this);
+@@ -246,10 +259,17 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
+ goto out_err;
+ }
+
++ if (ovl_dentry_casefolded(this)) {
++ warn = "case folded child";
++ err = -EREMOTE;
++ goto out_warn;
++ }
++
+ if (ovl_dentry_weird(this)) {
+ /* Don't support traversing automounts and other weirdness */
++ warn = "unsupported object type";
+ err = -EREMOTE;
+- goto out_err;
++ goto out_warn;
+ }
+
+ path.dentry = this;
+@@ -283,8 +303,9 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
+ } else {
+ if (ovl_lookup_trap_inode(d->sb, this)) {
+ /* Caught in a trap of overlapping layers */
++ warn = "overlapping layers";
+ err = -ELOOP;
+- goto out_err;
++ goto out_warn;
+ }
+
+ if (last_element)
+@@ -316,6 +337,10 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
+ this = NULL;
+ goto out;
+
++out_warn:
++ pr_warn_ratelimited("failed lookup in %s (%pd2, name='%.*s', err=%i): %s\n",
++ is_upper ? "upper" : "lower", base,
++ namelen, name, err, warn);
+ out_err:
+ dput(this);
+ return err;
+diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
+index 497323128e5f..55806bd36faa 100644
+--- a/fs/overlayfs/overlayfs.h
++++ b/fs/overlayfs/overlayfs.h
+@@ -448,6 +448,12 @@ void ovl_dentry_init_reval(struct dentry *dentry, struct dentry *upperdentry,
+ void ovl_dentry_init_flags(struct dentry *dentry, struct dentry *upperdentry,
+ struct ovl_entry *oe, unsigned int mask);
+ bool ovl_dentry_weird(struct dentry *dentry);
++
++static inline bool ovl_dentry_casefolded(struct dentry *dentry)
++{
++ return sb_has_encoding(dentry->d_sb) && IS_CASEFOLDED(d_inode(dentry));
++}
++
+ enum ovl_path_type ovl_path_type(struct dentry *dentry);
+ void ovl_path_upper(struct dentry *dentry, struct path *path);
+ void ovl_path_lower(struct dentry *dentry, struct path *path);
+diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
+index f42488c01957..2b9b31524c38 100644
+--- a/fs/overlayfs/params.c
++++ b/fs/overlayfs/params.c
+@@ -282,13 +282,11 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
+ return invalfc(fc, "%s is not a directory", name);
+
+ /*
+- * Root dentries of case-insensitive capable filesystems might
+- * not have the dentry operations set, but still be incompatible
+- * with overlayfs. Check explicitly to prevent post-mount
+- * failures.
++ * Allow filesystems that are case-folding capable but deny composing
++ * ovl stack from case-folded directories.
+ */
+- if (sb_has_encoding(path->mnt->mnt_sb))
+- return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name);
++ if (ovl_dentry_casefolded(path->dentry))
++ return invalfc(fc, "case-insensitive directory on %s not supported", name);
+
+ if (ovl_dentry_weird(path->dentry))
+ return invalfc(fc, "filesystem on %s not supported", name);
+diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
+index dcccb4b4a66c..593c4da107d6 100644
+--- a/fs/overlayfs/util.c
++++ b/fs/overlayfs/util.c
+@@ -206,10 +206,17 @@ bool ovl_dentry_weird(struct dentry *dentry)
+ if (!d_can_lookup(dentry) && !d_is_file(dentry) && !d_is_symlink(dentry))
+ return true;
+
+- return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT |
+- DCACHE_MANAGE_TRANSIT |
+- DCACHE_OP_HASH |
+- DCACHE_OP_COMPARE);
++ if (dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | DCACHE_MANAGE_TRANSIT))
++ return true;
++
++ /*
++ * Allow filesystems that are case-folding capable but deny composing
++ * ovl stack from case-folded directories.
++ */
++ if (sb_has_encoding(dentry->d_sb))
++ return IS_CASEFOLDED(d_inode(dentry));
++
++ return dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE);
+ }
+
+ enum ovl_path_type ovl_path_type(struct dentry *dentry)
+--
+2.49.1
+