From 789801c993dea3becd2396c8d9dee19b9e40ad3a Mon Sep 17 00:00:00 2001 From: Alexander Miroshnichenko Date: Mon, 3 Feb 2025 07:40:48 +0300 Subject: [PATCH] sys-kernel/hardened-kernel: update v6.12.10 bcachefs cherry-pick updates from bcachefs-for-upstream 5d9ccda Signed-off-by: Alexander Miroshnichenko --- ...s-from-bcachefs-for-upstream-5d9ccda.patch | 24737 ++++++++++++++++ 1 file changed, 24737 insertions(+) create mode 100644 sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-cherry-pick-updates-from-bcachefs-for-upstream-5d9ccda.patch diff --git a/sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-cherry-pick-updates-from-bcachefs-for-upstream-5d9ccda.patch b/sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-cherry-pick-updates-from-bcachefs-for-upstream-5d9ccda.patch new file mode 100644 index 0000000..697d602 --- /dev/null +++ b/sys-kernel/hardened-kernel/files/linux-6.12/1191-bcachefs-cherry-pick-updates-from-bcachefs-for-upstream-5d9ccda.patch @@ -0,0 +1,24737 @@ +From 4696907078c0fda263fdbf0e0f68d9579085e03e Mon Sep 17 00:00:00 2001 +From: Alexander Miroshnichenko +Date: Mon, 3 Feb 2025 07:16:47 +0300 +Subject: [PATCH] bcachefs: cherry-pick updates from bcachefs-for-upstream + 5d9ccda +Content-Type: text/plain; charset="utf-8" +Content-Transfer-Encoding: 8bit + +Signed-off-by: Alexander Miroshnichenko +--- + .../filesystems/bcachefs/CodingStyle.rst | 2 +- + fs/bcachefs/Kconfig | 2 +- + fs/bcachefs/Makefile | 1 + + fs/bcachefs/acl.c | 11 +- + fs/bcachefs/alloc_background.c | 558 +++++------ + fs/bcachefs/alloc_background.h | 18 +- + fs/bcachefs/alloc_background_format.h | 4 +- + fs/bcachefs/alloc_foreground.c | 315 +++---- + fs/bcachefs/alloc_foreground.h | 21 +- + fs/bcachefs/backpointers.c | 864 +++++++++++------- + fs/bcachefs/backpointers.h | 97 +- + fs/bcachefs/bbpos.h | 2 +- + fs/bcachefs/bcachefs.h | 70 +- + fs/bcachefs/bcachefs_format.h | 106 ++- + fs/bcachefs/bkey.h | 7 - + fs/bcachefs/bkey_methods.c | 29 +- + 
fs/bcachefs/bkey_methods.h | 15 +- + fs/bcachefs/bkey_types.h | 28 + + fs/bcachefs/btree_cache.c | 64 +- + fs/bcachefs/btree_cache.h | 14 +- + fs/bcachefs/btree_gc.c | 178 +--- + fs/bcachefs/btree_gc.h | 4 +- + fs/bcachefs/btree_io.c | 225 +++-- + fs/bcachefs/btree_io.h | 6 +- + fs/bcachefs/btree_iter.c | 593 +++++++----- + fs/bcachefs/btree_iter.h | 134 ++- + fs/bcachefs/btree_journal_iter.c | 237 ++++- + fs/bcachefs/btree_journal_iter.h | 22 +- + fs/bcachefs/btree_journal_iter_types.h | 36 + + fs/bcachefs/btree_key_cache.c | 73 +- + fs/bcachefs/btree_locking.c | 78 +- + fs/bcachefs/btree_locking.h | 50 +- + fs/bcachefs/btree_node_scan.c | 153 ++-- + fs/bcachefs/btree_node_scan_types.h | 1 - + fs/bcachefs/btree_trans_commit.c | 207 ++--- + fs/bcachefs/btree_types.h | 42 +- + fs/bcachefs/btree_update.c | 70 +- + fs/bcachefs/btree_update.h | 29 +- + fs/bcachefs/btree_update_interior.c | 313 ++++--- + fs/bcachefs/btree_update_interior.h | 7 +- + fs/bcachefs/btree_write_buffer.c | 83 +- + fs/bcachefs/buckets.c | 133 +-- + fs/bcachefs/buckets.h | 30 +- + fs/bcachefs/buckets_types.h | 2 +- + fs/bcachefs/chardev.c | 219 +---- + fs/bcachefs/checksum.c | 10 +- + fs/bcachefs/checksum.h | 2 +- + fs/bcachefs/compress.c | 127 ++- + fs/bcachefs/compress.h | 4 +- + fs/bcachefs/darray.h | 2 +- + fs/bcachefs/data_update.c | 295 ++++-- + fs/bcachefs/data_update.h | 9 +- + fs/bcachefs/debug.c | 5 +- + fs/bcachefs/dirent.c | 10 +- + fs/bcachefs/dirent.h | 9 +- + fs/bcachefs/disk_accounting.c | 150 +-- + fs/bcachefs/disk_accounting.h | 73 +- + fs/bcachefs/ec.c | 267 +++--- + fs/bcachefs/ec.h | 5 +- + fs/bcachefs/ec_format.h | 17 + + fs/bcachefs/errcode.h | 26 +- + fs/bcachefs/error.c | 187 ++-- + fs/bcachefs/error.h | 58 +- + fs/bcachefs/extent_update.c | 4 +- + fs/bcachefs/extents.c | 290 ++---- + fs/bcachefs/extents.h | 18 +- + fs/bcachefs/extents_format.h | 15 +- + fs/bcachefs/fs-common.c | 119 ++- + fs/bcachefs/fs-common.h | 2 + + fs/bcachefs/fs-io-buffered.c | 68 +- + 
fs/bcachefs/fs-io-direct.c | 25 +- + fs/bcachefs/fs-io-pagecache.c | 4 +- + fs/bcachefs/fs-io.c | 54 +- + fs/bcachefs/fs-ioctl.c | 7 +- + fs/bcachefs/fs.c | 101 +- + fs/bcachefs/fs.h | 1 + + fs/bcachefs/fsck.c | 772 ++++++++++------ + fs/bcachefs/fsck.h | 11 + + fs/bcachefs/inode.c | 169 ++-- + fs/bcachefs/inode.h | 43 +- + fs/bcachefs/inode_format.h | 15 +- + fs/bcachefs/io_misc.c | 22 +- + fs/bcachefs/io_read.c | 726 ++++++++------- + fs/bcachefs/io_read.h | 98 +- + fs/bcachefs/io_write.c | 184 ++-- + fs/bcachefs/io_write.h | 31 +- + fs/bcachefs/io_write_types.h | 2 +- + fs/bcachefs/journal.c | 252 ++--- + fs/bcachefs/journal.h | 18 +- + fs/bcachefs/journal_io.c | 223 +++-- + fs/bcachefs/journal_io.h | 2 +- + fs/bcachefs/journal_reclaim.c | 161 +++- + fs/bcachefs/journal_reclaim.h | 3 + + fs/bcachefs/journal_types.h | 18 +- + fs/bcachefs/logged_ops.c | 11 +- + fs/bcachefs/logged_ops_format.h | 5 + + fs/bcachefs/lru.c | 4 +- + fs/bcachefs/lru.h | 2 +- + fs/bcachefs/move.c | 248 ++--- + fs/bcachefs/move.h | 5 +- + fs/bcachefs/movinggc.c | 17 +- + fs/bcachefs/opts.c | 26 +- + fs/bcachefs/opts.h | 61 +- + fs/bcachefs/printbuf.h | 15 +- + fs/bcachefs/quota.c | 2 +- + fs/bcachefs/quota.h | 4 +- + fs/bcachefs/rcu_pending.c | 38 +- + fs/bcachefs/rebalance.c | 270 +++++- + fs/bcachefs/rebalance.h | 10 + + fs/bcachefs/rebalance_format.h | 53 ++ + fs/bcachefs/rebalance_types.h | 2 - + fs/bcachefs/recovery.c | 212 +++-- + fs/bcachefs/recovery.h | 2 +- + fs/bcachefs/recovery_passes.c | 112 ++- + fs/bcachefs/recovery_passes.h | 1 + + fs/bcachefs/recovery_passes_types.h | 92 +- + fs/bcachefs/reflink.c | 496 +++++++--- + fs/bcachefs/reflink.h | 20 +- + fs/bcachefs/reflink_format.h | 7 +- + fs/bcachefs/sb-clean.c | 6 +- + fs/bcachefs/sb-counters_format.h | 165 ++-- + fs/bcachefs/sb-downgrade.c | 28 +- + fs/bcachefs/sb-errors_format.h | 56 +- + fs/bcachefs/six.c | 27 +- + fs/bcachefs/six.h | 1 + + fs/bcachefs/snapshot.c | 515 +++++------ + fs/bcachefs/snapshot.h | 17 +- + 
fs/bcachefs/str_hash.c | 295 ++++++ + fs/bcachefs/str_hash.h | 28 +- + fs/bcachefs/subvolume.c | 68 +- + fs/bcachefs/subvolume.h | 19 +- + fs/bcachefs/subvolume_types.h | 2 +- + fs/bcachefs/super-io.c | 83 +- + fs/bcachefs/super-io.h | 21 +- + fs/bcachefs/super.c | 54 +- + fs/bcachefs/super.h | 10 - + fs/bcachefs/sysfs.c | 60 +- + fs/bcachefs/tests.c | 26 +- + fs/bcachefs/trace.h | 103 ++- + fs/bcachefs/util.h | 32 + + fs/bcachefs/varint.c | 5 +- + fs/bcachefs/xattr.c | 13 +- + fs/bcachefs/xattr.h | 5 +- + fs/fs_parser.c | 3 +- + include/linux/fs_parser.h | 2 + + include/linux/min_heap.h | 4 +- + 146 files changed, 7996 insertions(+), 5234 deletions(-) + create mode 100644 fs/bcachefs/btree_journal_iter_types.h + create mode 100644 fs/bcachefs/rebalance_format.h + create mode 100644 fs/bcachefs/str_hash.c + +diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst +index 01de555e21d8..b29562a6bf55 100644 +--- a/Documentation/filesystems/bcachefs/CodingStyle.rst ++++ b/Documentation/filesystems/bcachefs/CodingStyle.rst +@@ -183,4 +183,4 @@ even better as a code comment. + A good code comment is wonderful, but even better is the comment that didn't + need to exist because the code was so straightforward as to be obvious; + organized into small clean and tidy modules, with clear and descriptive names +-for functions and variable, where every line of code has a clear purpose. ++for functions and variables, where every line of code has a clear purpose. 
+diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig +index 5bac803ea367..e8549d04dcb8 100644 +--- a/fs/bcachefs/Kconfig ++++ b/fs/bcachefs/Kconfig +@@ -89,7 +89,7 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN + + config BCACHEFS_PATH_TRACEPOINTS + bool "Extra btree_path tracepoints" +- depends on BCACHEFS_FS ++ depends on BCACHEFS_FS && TRACING + help + Enable extra tracepoints for debugging btree_path operations; we don't + normally want these enabled because they happen at very high rates. +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +index 56d20e219f59..d2689388d5e8 100644 +--- a/fs/bcachefs/Makefile ++++ b/fs/bcachefs/Makefile +@@ -82,6 +82,7 @@ bcachefs-y := \ + siphash.o \ + six.o \ + snapshot.o \ ++ str_hash.o \ + subvolume.o \ + super.o \ + super-io.o \ +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +index 87f1be9d4db4..99487727ae64 100644 +--- a/fs/bcachefs/acl.c ++++ b/fs/bcachefs/acl.c +@@ -184,11 +184,6 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, + return ERR_PTR(-EINVAL); + } + +-#define acl_for_each_entry(acl, acl_e) \ +- for (acl_e = acl->a_entries; \ +- acl_e < acl->a_entries + acl->a_count; \ +- acl_e++) +- + /* + * Convert from in-memory to filesystem representation. 
+ */ +@@ -199,11 +194,11 @@ bch2_acl_to_xattr(struct btree_trans *trans, + { + struct bkey_i_xattr *xattr; + bch_acl_header *acl_header; +- const struct posix_acl_entry *acl_e; ++ const struct posix_acl_entry *acl_e, *pe; + void *outptr; + unsigned nr_short = 0, nr_long = 0, acl_len, u64s; + +- acl_for_each_entry(acl, acl_e) { ++ FOREACH_ACL_ENTRY(acl_e, acl, pe) { + switch (acl_e->e_tag) { + case ACL_USER: + case ACL_GROUP: +@@ -241,7 +236,7 @@ bch2_acl_to_xattr(struct btree_trans *trans, + + outptr = (void *) acl_header + sizeof(*acl_header); + +- acl_for_each_entry(acl, acl_e) { ++ FOREACH_ACL_ENTRY(acl_e, acl, pe) { + bch_acl_entry *entry = outptr; + + entry->e_tag = cpu_to_le16(acl_e->e_tag); +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +index c84a91572a1d..fc2ef33b67b3 100644 +--- a/fs/bcachefs/alloc_background.c ++++ b/fs/bcachefs/alloc_background.c +@@ -198,7 +198,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) + } + + int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + int ret = 0; +@@ -213,7 +213,7 @@ int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, + } + + int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_alloc_unpacked u; + int ret = 0; +@@ -226,7 +226,7 @@ int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, + } + + int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_alloc_unpacked u; + int ret = 0; +@@ -239,7 +239,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, + } + + int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + 
struct bch_alloc_v4 a; + int ret = 0; +@@ -322,9 +322,9 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, + void bch2_alloc_v4_swab(struct bkey_s k) + { + struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; +- struct bch_backpointer *bp, *bps; + +- a->journal_seq = swab64(a->journal_seq); ++ a->journal_seq_nonempty = swab64(a->journal_seq_nonempty); ++ a->journal_seq_empty = swab64(a->journal_seq_empty); + a->flags = swab32(a->flags); + a->dirty_sectors = swab32(a->dirty_sectors); + a->cached_sectors = swab32(a->cached_sectors); +@@ -333,13 +333,6 @@ void bch2_alloc_v4_swab(struct bkey_s k) + a->stripe = swab32(a->stripe); + a->nr_external_backpointers = swab32(a->nr_external_backpointers); + a->stripe_sectors = swab32(a->stripe_sectors); +- +- bps = alloc_v4_backpointers(a); +- for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { +- bp->bucket_offset = swab40(bp->bucket_offset); +- bp->bucket_len = swab32(bp->bucket_len); +- bch2_bpos_swab(&bp->pos); +- } + } + + void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +@@ -354,16 +347,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c + prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); + bch2_prt_data_type(out, a->data_type); + prt_newline(out); +- prt_printf(out, "journal_seq %llu\n", a->journal_seq); +- prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); +- prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); +- prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); +- prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); +- prt_printf(out, "cached_sectors %u\n", a->cached_sectors); +- prt_printf(out, "stripe %u\n", a->stripe); +- prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); +- prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); +- prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); ++ prt_printf(out, 
"journal_seq_nonempty %llu\n", a->journal_seq_nonempty); ++ prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); ++ prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); ++ prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); ++ prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); ++ prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); ++ prt_printf(out, "cached_sectors %u\n", a->cached_sectors); ++ prt_printf(out, "stripe %u\n", a->stripe); ++ prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); ++ prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); ++ prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); + + if (ca) + prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca)); +@@ -392,7 +386,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + + *out = (struct bch_alloc_v4) { +- .journal_seq = u.journal_seq, ++ .journal_seq_nonempty = u.journal_seq, + .flags = u.need_discard, + .gen = u.gen, + .oldest_gen = u.oldest_gen, +@@ -517,7 +511,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) + } + + int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + int ret = 0; + +@@ -664,74 +658,80 @@ int bch2_alloc_read(struct bch_fs *c) + + /* Free space/discard btree: */ + ++static int __need_discard_or_freespace_err(struct btree_trans *trans, ++ struct bkey_s_c alloc_k, ++ bool set, bool discard, bool repair) ++{ ++ struct bch_fs *c = trans->c; ++ enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0); ++ enum bch_sb_error_id err_id = discard ++ ? BCH_FSCK_ERR_need_discard_key_wrong ++ : BCH_FSCK_ERR_freespace_key_wrong; ++ enum btree_id btree = discard ? 
BTREE_ID_need_discard : BTREE_ID_freespace; ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf, c, alloc_k); ++ ++ int ret = __bch2_fsck_err(NULL, trans, flags, err_id, ++ "bucket incorrectly %sset in %s btree\n" ++ " %s", ++ set ? "" : "un", ++ bch2_btree_id_str(btree), ++ buf.buf); ++ if (ret == -BCH_ERR_fsck_ignore || ++ ret == -BCH_ERR_fsck_errors_not_fixed) ++ ret = 0; ++ ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++#define need_discard_or_freespace_err(...) \ ++ fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__)) ++ ++#define need_discard_or_freespace_err_on(cond, ...) \ ++ (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false) ++ + static int bch2_bucket_do_index(struct btree_trans *trans, + struct bch_dev *ca, + struct bkey_s_c alloc_k, + const struct bch_alloc_v4 *a, + bool set) + { +- struct bch_fs *c = trans->c; +- struct btree_iter iter; +- struct bkey_s_c old; +- struct bkey_i *k; + enum btree_id btree; +- enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; +- enum bch_bkey_type new_type = set ? 
KEY_TYPE_set : KEY_TYPE_deleted; +- struct printbuf buf = PRINTBUF; +- int ret; ++ struct bpos pos; + + if (a->data_type != BCH_DATA_free && + a->data_type != BCH_DATA_need_discard) + return 0; + +- k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); +- if (IS_ERR(k)) +- return PTR_ERR(k); +- +- bkey_init(&k->k); +- k->k.type = new_type; +- + switch (a->data_type) { + case BCH_DATA_free: + btree = BTREE_ID_freespace; +- k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); +- bch2_key_resize(&k->k, 1); ++ pos = alloc_freespace_pos(alloc_k.k->p, *a); + break; + case BCH_DATA_need_discard: + btree = BTREE_ID_need_discard; +- k->k.p = alloc_k.k->p; ++ pos = alloc_k.k->p; + break; + default: + return 0; + } + +- old = bch2_bkey_get_iter(trans, &iter, btree, +- bkey_start_pos(&k->k), +- BTREE_ITER_intent); +- ret = bkey_err(old); ++ struct btree_iter iter; ++ struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent); ++ int ret = bkey_err(old); + if (ret) + return ret; + +- if (ca->mi.freespace_initialized && +- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && +- bch2_trans_inconsistent_on(old.k->type != old_type, trans, +- "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" +- " for %s", +- set ? 
"setting" : "clearing", +- bch2_btree_id_str(btree), +- iter.pos.inode, +- iter.pos.offset, +- bch2_bkey_types[old.k->type], +- bch2_bkey_types[old_type], +- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { +- ret = -EIO; +- goto err; +- } ++ need_discard_or_freespace_err_on(ca->mi.freespace_initialized && ++ !old.k->type != set, ++ trans, alloc_k, set, ++ btree == BTREE_ID_need_discard, false); + +- ret = bch2_trans_update(trans, &iter, k, 0); +-err: ++ ret = bch2_btree_bit_mod_iter(trans, &iter, set); ++fsck_err: + bch2_trans_iter_exit(trans, &iter); +- printbuf_exit(&buf); + return ret; + } + +@@ -858,7 +858,10 @@ int bch2_trigger_alloc(struct btree_trans *trans, + if (flags & BTREE_TRIGGER_transactional) { + alloc_data_type_set(new_a, new_a->data_type); + +- if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { ++ int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - ++ (int) data_type_is_empty(old_a->data_type); ++ ++ if (is_empty_delta < 0) { + new_a->io_time[READ] = bch2_current_io_time(c, READ); + new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); +@@ -928,37 +931,55 @@ int bch2_trigger_alloc(struct btree_trans *trans, + } + + if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { +- u64 journal_seq = trans->journal_res.seq; +- u64 bucket_journal_seq = new_a->journal_seq; ++ u64 transaction_seq = trans->journal_res.seq; ++ BUG_ON(!transaction_seq); + +- if ((flags & BTREE_TRIGGER_insert) && +- data_type_is_empty(old_a->data_type) != +- data_type_is_empty(new_a->data_type) && +- new.k->type == KEY_TYPE_alloc_v4) { +- struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; ++ if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq, ++ trans, alloc_key_journal_seq_in_future, ++ "bucket journal seq in future (currently at %llu)\n%s", ++ journal_cur_seq(&c->journal), ++ (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf))) ++ 
new_a->journal_seq_nonempty = transaction_seq; + +- /* +- * If the btree updates referring to a bucket weren't flushed +- * before the bucket became empty again, then the we don't have +- * to wait on a journal flush before we can reuse the bucket: +- */ +- v->journal_seq = bucket_journal_seq = +- data_type_is_empty(new_a->data_type) && +- (journal_seq == v->journal_seq || +- bch2_journal_noflush_seq(&c->journal, v->journal_seq)) +- ? 0 : journal_seq; ++ int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - ++ (int) data_type_is_empty(old_a->data_type); ++ ++ /* ++ * Record journal sequence number of empty -> nonempty transition: ++ * Note that there may be multiple empty -> nonempty ++ * transitions, data in a bucket may be overwritten while we're ++ * still writing to it - so be careful to only record the first: ++ * */ ++ if (is_empty_delta < 0 && ++ new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) { ++ new_a->journal_seq_nonempty = transaction_seq; ++ new_a->journal_seq_empty = 0; + } + +- if (!data_type_is_empty(old_a->data_type) && +- data_type_is_empty(new_a->data_type) && +- bucket_journal_seq) { +- ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, +- c->journal.flushed_seq_ondisk, +- new.k->p.inode, new.k->p.offset, +- bucket_journal_seq); +- if (bch2_fs_fatal_err_on(ret, c, +- "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) +- goto err; ++ /* ++ * Bucket becomes empty: mark it as waiting for a journal flush, ++ * unless updates since empty -> nonempty transition were never ++ * flushed - we may need to ask the journal not to flush ++ * intermediate sequence numbers: ++ */ ++ if (is_empty_delta > 0) { ++ if (new_a->journal_seq_nonempty == transaction_seq || ++ bch2_journal_noflush_seq(&c->journal, ++ new_a->journal_seq_nonempty, ++ transaction_seq)) { ++ new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0; ++ } else { ++ new_a->journal_seq_empty = transaction_seq; ++ ++ ret = 
bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, ++ new.k->p.inode, new.k->p.offset, ++ transaction_seq); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "setting bucket_needs_journal_commit: %s", ++ bch2_err_str(ret))) ++ goto err; ++ } + } + + if (new_a->gen != old_a->gen) { +@@ -974,7 +995,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, + + #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) + #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) +-#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) ++#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk) + + if (statechange(a->data_type == BCH_DATA_free) && + bucket_flushed(new_a)) +@@ -1006,6 +1027,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, + rcu_read_unlock(); + } + err: ++fsck_err: + printbuf_exit(&buf); + bch2_dev_put(ca); + return ret; +@@ -1045,7 +1067,7 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos + * btree node min/max is a closed interval, upto takes a half + * open interval: + */ +- k = bch2_btree_iter_peek_upto(&iter2, end); ++ k = bch2_btree_iter_peek_max(&iter2, end); + next = iter2.pos; + bch2_trans_iter_exit(iter->trans, &iter2); + +@@ -1129,7 +1151,6 @@ int bch2_check_alloc_key(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; +- unsigned discard_key_type, freespace_key_type; + unsigned gens_offset; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; +@@ -1149,64 +1170,30 @@ int bch2_check_alloc_key(struct btree_trans *trans, + + a = bch2_alloc_to_v4(alloc_k, &a_convert); + +- discard_key_type = a->data_type == BCH_DATA_need_discard ? 
KEY_TYPE_set : 0; + bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); + k = bch2_btree_iter_peek_slot(discard_iter); + ret = bkey_err(k); + if (ret) + goto err; + +- if (fsck_err_on(k.k->type != discard_key_type, +- trans, need_discard_key_wrong, +- "incorrect key in need_discard btree (got %s should be %s)\n" +- " %s", +- bch2_bkey_types[k.k->type], +- bch2_bkey_types[discard_key_type], +- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { +- struct bkey_i *update = +- bch2_trans_kmalloc(trans, sizeof(*update)); +- +- ret = PTR_ERR_OR_ZERO(update); +- if (ret) +- goto err; +- +- bkey_init(&update->k); +- update->k.type = discard_key_type; +- update->k.p = discard_iter->pos; +- +- ret = bch2_trans_update(trans, discard_iter, update, 0); ++ bool is_discarded = a->data_type == BCH_DATA_need_discard; ++ if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded, ++ trans, alloc_k, !is_discarded, true, true)) { ++ ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded); + if (ret) + goto err; + } + +- freespace_key_type = a->data_type == BCH_DATA_free ? 
KEY_TYPE_set : 0; + bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); + k = bch2_btree_iter_peek_slot(freespace_iter); + ret = bkey_err(k); + if (ret) + goto err; + +- if (fsck_err_on(k.k->type != freespace_key_type, +- trans, freespace_key_wrong, +- "incorrect key in freespace btree (got %s should be %s)\n" +- " %s", +- bch2_bkey_types[k.k->type], +- bch2_bkey_types[freespace_key_type], +- (printbuf_reset(&buf), +- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { +- struct bkey_i *update = +- bch2_trans_kmalloc(trans, sizeof(*update)); +- +- ret = PTR_ERR_OR_ZERO(update); +- if (ret) +- goto err; +- +- bkey_init(&update->k); +- update->k.type = freespace_key_type; +- update->k.p = freespace_iter->pos; +- bch2_key_resize(&update->k, 1); +- +- ret = bch2_trans_update(trans, freespace_iter, update, 0); ++ bool is_free = a->data_type == BCH_DATA_free; ++ if (need_discard_or_freespace_err_on(!!k.k->type != is_free, ++ trans, alloc_k, !is_free, false, true)) { ++ ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free); + if (ret) + goto err; + } +@@ -1368,51 +1355,88 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, + return ret; + } + +-static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, +- struct btree_iter *iter) ++struct check_discard_freespace_key_async { ++ struct work_struct work; ++ struct bch_fs *c; ++ struct bbpos pos; ++}; ++ ++static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0); ++ int ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ u8 gen; ++ ret = k.k->type != KEY_TYPE_set ++ ? 
bch2_check_discard_freespace_key(trans, &iter, &gen, false) ++ : 0; ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static void check_discard_freespace_key_work(struct work_struct *work) ++{ ++ struct check_discard_freespace_key_async *w = ++ container_of(work, struct check_discard_freespace_key_async, work); ++ ++ bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); ++ bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key); ++ kfree(w); ++} ++ ++int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, ++ bool async_repair) + { + struct bch_fs *c = trans->c; +- struct btree_iter alloc_iter; +- struct bkey_s_c alloc_k; +- struct bch_alloc_v4 a_convert; +- const struct bch_alloc_v4 *a; +- u64 genbits; +- struct bpos pos; + enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard + ? BCH_DATA_need_discard + : BCH_DATA_free; + struct printbuf buf = PRINTBUF; +- int ret; + +- pos = iter->pos; +- pos.offset &= ~(~0ULL << 56); +- genbits = iter->pos.offset & (~0ULL << 56); ++ struct bpos bucket = iter->pos; ++ bucket.offset &= ~(~0ULL << 56); ++ u64 genbits = iter->pos.offset & (~0ULL << 56); + +- alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); +- ret = bkey_err(alloc_k); ++ struct btree_iter alloc_iter; ++ struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, ++ BTREE_ID_alloc, bucket, ++ async_repair ? 
BTREE_ITER_cached : 0); ++ int ret = bkey_err(alloc_k); + if (ret) + return ret; + +- if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), +- trans, need_discard_freespace_key_to_invalid_dev_bucket, +- "entry in %s btree for nonexistant dev:bucket %llu:%llu", +- bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) +- goto delete; ++ if (!bch2_dev_bucket_exists(c, bucket)) { ++ if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket, ++ "entry in %s btree for nonexistant dev:bucket %llu:%llu", ++ bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) ++ goto delete; ++ ret = 1; ++ goto out; ++ } + +- a = bch2_alloc_to_v4(alloc_k, &a_convert); ++ struct bch_alloc_v4 a_convert; ++ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); ++ ++ if (a->data_type != state || ++ (state == BCH_DATA_free && ++ genbits != alloc_freespace_genbits(*a))) { ++ if (fsck_err(trans, need_discard_freespace_key_bad, ++ "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", ++ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), ++ bch2_btree_id_str(iter->btree_id), ++ iter->pos.inode, ++ iter->pos.offset, ++ a->data_type == state, ++ genbits >> 56, alloc_freespace_genbits(*a) >> 56)) ++ goto delete; ++ ret = 1; ++ goto out; ++ } + +- if (fsck_err_on(a->data_type != state || +- (state == BCH_DATA_free && +- genbits != alloc_freespace_genbits(*a)), +- trans, need_discard_freespace_key_bad, +- "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", +- (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), +- bch2_btree_id_str(iter->btree_id), +- iter->pos.inode, +- iter->pos.offset, +- a->data_type == state, +- genbits >> 56, alloc_freespace_genbits(*a) >> 56)) +- goto delete; ++ *gen = a->gen; + out: + fsck_err: + bch2_set_btree_iter_dontneed(&alloc_iter); +@@ -1420,11 +1444,40 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran + printbuf_exit(&buf); + return ret; + 
delete: +- ret = bch2_btree_delete_extent_at(trans, iter, +- iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: +- bch2_trans_commit(trans, NULL, NULL, +- BCH_TRANS_COMMIT_no_enospc); +- goto out; ++ if (!async_repair) { ++ ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BCH_TRANS_COMMIT_no_enospc) ?: ++ -BCH_ERR_transaction_restart_commit; ++ goto out; ++ } else { ++ /* ++ * We can't repair here when called from the allocator path: the ++ * commit will recurse back into the allocator ++ */ ++ struct check_discard_freespace_key_async *w = ++ kzalloc(sizeof(*w), GFP_KERNEL); ++ if (!w) ++ goto out; ++ ++ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) { ++ kfree(w); ++ goto out; ++ } ++ ++ INIT_WORK(&w->work, check_discard_freespace_key_work); ++ w->c = c; ++ w->pos = BBPOS(iter->btree_id, iter->pos); ++ queue_work(c->write_ref_wq, &w->work); ++ goto out; ++ } ++} ++ ++static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) ++{ ++ u8 gen; ++ int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false); ++ return ret < 0 ? 
ret : 0; + } + + /* +@@ -1581,7 +1634,7 @@ int bch2_check_alloc_info(struct bch_fs *c) + ret = for_each_btree_key(trans, iter, + BTREE_ID_need_discard, POS_MIN, + BTREE_ITER_prefetch, k, +- bch2_check_discard_freespace_key(trans, &iter)); ++ bch2_check_discard_freespace_key_fsck(trans, &iter)); + if (ret) + goto err; + +@@ -1594,7 +1647,7 @@ int bch2_check_alloc_info(struct bch_fs *c) + break; + + ret = bkey_err(k) ?: +- bch2_check_discard_freespace_key(trans, &iter); ++ bch2_check_discard_freespace_key_fsck(trans, &iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; +@@ -1757,7 +1810,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, + struct bch_dev *ca, + struct btree_iter *need_discard_iter, + struct bpos *discard_pos_done, +- struct discard_buckets_state *s) ++ struct discard_buckets_state *s, ++ bool fastpath) + { + struct bch_fs *c = trans->c; + struct bpos pos = need_discard_iter->pos; +@@ -1793,44 +1847,23 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, + if (ret) + goto out; + +- if (bch2_bucket_sectors_total(a->v)) { +- if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, +- trans, "attempting to discard bucket with dirty data\n%s", +- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) +- ret = -EIO; +- goto out; +- } +- + if (a->v.data_type != BCH_DATA_need_discard) { +- if (data_type_is_empty(a->v.data_type) && +- BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { +- a->v.gen++; +- SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); +- goto write; ++ if (need_discard_or_freespace_err(trans, k, true, true, true)) { ++ ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); ++ if (ret) ++ goto out; ++ goto commit; + } + +- if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, +- trans, "bucket incorrectly set in need_discard btree\n" +- "%s", +- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) +- ret = -EIO; + goto out; + } + +- 
if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { +- if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, +- trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", +- a->v.journal_seq, +- c->journal.flushed_seq_ondisk, +- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) +- ret = -EIO; +- goto out; +- } +- +- if (discard_in_flight_add(ca, iter.pos.offset, true)) +- goto out; ++ if (!fastpath) { ++ if (discard_in_flight_add(ca, iter.pos.offset, true)) ++ goto out; + +- discard_locked = true; ++ discard_locked = true; ++ } + + if (!bkey_eq(*discard_pos_done, iter.pos) && + ca->mi.discard && !c->opts.nochanges) { +@@ -1844,6 +1877,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, + ca->mi.bucket_size, + GFP_KERNEL); + *discard_pos_done = iter.pos; ++ s->discarded++; + + ret = bch2_trans_relock_notrace(trans); + if (ret) +@@ -1851,22 +1885,25 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, + } + + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); +-write: + alloc_data_type_set(&a->v, a->v.data_type); + +- ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: +- bch2_trans_commit(trans, NULL, NULL, +- BCH_WATERMARK_btree| +- BCH_TRANS_COMMIT_no_enospc); ++ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); ++ if (ret) ++ goto out; ++commit: ++ ret = bch2_trans_commit(trans, NULL, NULL, ++ BCH_WATERMARK_btree| ++ BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto out; + + count_event(c, bucket_discard); +- s->discarded++; + out: ++fsck_err: + if (discard_locked) + discard_in_flight_remove(ca, iter.pos.offset); +- s->seen++; ++ if (!ret) ++ s->seen++; + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +@@ -1886,11 +1923,11 @@ static void bch2_do_discards_work(struct work_struct *work) + * successful commit: + */ + ret = bch2_trans_run(c, +- for_each_btree_key_upto(trans, iter, ++ for_each_btree_key_max(trans, iter, + BTREE_ID_need_discard, + POS(ca->dev_idx, 0), + 
POS(ca->dev_idx, U64_MAX), 0, k, +- bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); ++ bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); + + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, + bch2_err_str(ret)); +@@ -1923,27 +1960,29 @@ void bch2_do_discards(struct bch_fs *c) + bch2_dev_do_discards(ca); + } + +-static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) ++static int bch2_do_discards_fast_one(struct btree_trans *trans, ++ struct bch_dev *ca, ++ u64 bucket, ++ struct bpos *discard_pos_done, ++ struct discard_buckets_state *s) + { +- struct btree_iter iter; +- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent); +- struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); +- int ret = bkey_err(k); ++ struct btree_iter need_discard_iter; ++ struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter, ++ BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); ++ int ret = bkey_err(discard_k); + if (ret) +- goto err; +- +- struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); +- ret = PTR_ERR_OR_ZERO(a); +- if (ret) +- goto err; ++ return ret; + +- BUG_ON(a->v.dirty_sectors); +- SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); +- alloc_data_type_set(&a->v, a->v.data_type); ++ if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set, ++ trans, discarding_bucket_not_in_need_discard_btree, ++ "attempting to discard bucket %u:%llu not in need_discard btree", ++ ca->dev_idx, bucket)) ++ goto out; + +- ret = bch2_trans_update(trans, &iter, &a->k_i, 0); +-err: +- bch2_trans_iter_exit(trans, &iter); ++ ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); ++out: ++fsck_err: ++ bch2_trans_iter_exit(trans, &need_discard_iter); + return ret; + } + +@@ -1951,6 +1990,10 @@ static void bch2_do_discards_fast_work(struct work_struct *work) + { + struct bch_dev *ca = container_of(work, struct bch_dev, 
discard_fast_work); + struct bch_fs *c = ca->fs; ++ struct discard_buckets_state s = {}; ++ struct bpos discard_pos_done = POS_MAX; ++ struct btree_trans *trans = bch2_trans_get(c); ++ int ret = 0; + + while (1) { + bool got_bucket = false; +@@ -1971,16 +2014,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) + if (!got_bucket) + break; + +- if (ca->mi.discard && !c->opts.nochanges) +- blkdev_issue_discard(ca->disk_sb.bdev, +- bucket_to_sector(ca, bucket), +- ca->mi.bucket_size, +- GFP_KERNEL); +- +- int ret = bch2_trans_commit_do(c, NULL, NULL, +- BCH_WATERMARK_btree| +- BCH_TRANS_COMMIT_no_enospc, +- bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); ++ ret = lockrestart_do(trans, ++ bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s)); + bch_err_fn(c, ret); + + discard_in_flight_remove(ca, bucket); +@@ -1989,6 +2024,9 @@ static void bch2_do_discards_fast_work(struct work_struct *work) + break; + } + ++ trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); ++ ++ bch2_trans_put(trans); + percpu_ref_put(&ca->io_ref); + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + } +@@ -2030,8 +2068,11 @@ static int invalidate_one_bucket(struct btree_trans *trans, + return 1; + + if (!bch2_dev_bucket_exists(c, bucket)) { +- prt_str(&buf, "lru entry points to invalid bucket"); +- goto err; ++ if (fsck_err(trans, lru_entry_to_invalid_bucket, ++ "lru key points to nonexistent device:bucket %llu:%llu", ++ bucket.inode, bucket.offset)) ++ return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); ++ goto out; + } + + if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) +@@ -2072,28 +2113,9 @@ static int invalidate_one_bucket(struct btree_trans *trans, + trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); + --*nr_to_invalidate; + out: ++fsck_err: + printbuf_exit(&buf); + return ret; +-err: +- prt_str(&buf, "\n lru key: "); 
+- bch2_bkey_val_to_text(&buf, c, lru_k); +- +- prt_str(&buf, "\n lru entry: "); +- bch2_lru_pos_to_text(&buf, lru_iter->pos); +- +- prt_str(&buf, "\n alloc key: "); +- if (!a) +- bch2_bpos_to_text(&buf, bucket); +- else +- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); +- +- bch_err(c, "%s", buf.buf); +- if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { +- bch2_inconsistent_error(c); +- ret = -EINVAL; +- } +- +- goto out; + } + + static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, +@@ -2101,7 +2123,7 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter + { + struct bkey_s_c k; + again: +- k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); ++ k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + if (!k.k && !*wrapped) { + bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); + *wrapped = true; +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +index 163a67b97a40..de25ba4ee94b 100644 +--- a/fs/bcachefs/alloc_background.h ++++ b/fs/bcachefs/alloc_background.h +@@ -8,8 +8,6 @@ + #include "debug.h" + #include "super.h" + +-enum bch_validate_flags; +- + /* How out of date a pointer gen is allowed to be: */ + #define BUCKET_GC_GEN_MAX 96U + +@@ -245,10 +243,14 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s + + int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + +-int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +-int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +-int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +-int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); ++int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); ++int 
bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); ++int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); ++int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); + void bch2_alloc_v4_swab(struct bkey_s); + void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +@@ -282,7 +284,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + }) + + int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, +- enum bch_validate_flags); ++ struct bkey_validate_context); + void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ +@@ -307,6 +309,8 @@ int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *, + int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, + enum btree_iter_update_trigger_flags); ++ ++int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); + int bch2_check_alloc_info(struct bch_fs *); + int bch2_check_alloc_to_lru_refs(struct bch_fs *); + void bch2_dev_do_discards(struct bch_dev *); +diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h +index befdaa95c515..740238369a5a 100644 +--- a/fs/bcachefs/alloc_background_format.h ++++ b/fs/bcachefs/alloc_background_format.h +@@ -58,7 +58,7 @@ LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + + struct bch_alloc_v4 { + struct bch_val v; +- __u64 journal_seq; ++ __u64 journal_seq_nonempty; + __u32 flags; + __u8 gen; + __u8 oldest_gen; +@@ -70,7 +70,7 @@ struct bch_alloc_v4 { + __u32 stripe; + __u32 nr_external_backpointers; + /* end of fields in original version of alloc_v4 */ +- __u64 _fragmentation_lru; /* obsolete */ ++ __u64 journal_seq_empty; + __u32 stripe_sectors; + __u32 pad; + } 
__packed __aligned(8); +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +index 372178c8d416..ecd14962ab01 100644 +--- a/fs/bcachefs/alloc_foreground.c ++++ b/fs/bcachefs/alloc_foreground.c +@@ -107,14 +107,10 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) + return; + } + +- percpu_down_read(&c->mark_lock); + spin_lock(&ob->lock); +- + ob->valid = false; + ob->data_type = 0; +- + spin_unlock(&ob->lock); +- percpu_up_read(&c->mark_lock); + + spin_lock(&c->freelist_lock); + bch2_open_bucket_hash_remove(c, ob); +@@ -156,6 +152,14 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) + return ob; + } + ++static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) ++{ ++ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) ++ return false; ++ ++ return bch2_is_superblock_bucket(ca, b); ++} ++ + static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) + { + BUG_ON(c->open_buckets_partial_nr >= +@@ -175,70 +179,46 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) + closure_wake_up(&c->freelist_wait); + } + +-/* _only_ for allocating the journal on a new device: */ +-long bch2_bucket_alloc_new_fs(struct bch_dev *ca) ++static inline bool may_alloc_bucket(struct bch_fs *c, ++ struct bpos bucket, ++ struct bucket_alloc_state *s) + { +- while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { +- u64 b = ca->new_fs_bucket_idx++; +- +- if (!is_superblock_bucket(ca, b) && +- (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) +- return b; ++ if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { ++ s->skipped_open++; ++ return false; + } + +- return -1; +-} ++ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, ++ c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) { ++ s->skipped_need_journal_commit++; ++ return false; ++ } + +-static inline unsigned open_buckets_reserved(enum bch_watermark 
watermark) +-{ +- switch (watermark) { +- case BCH_WATERMARK_interior_updates: +- return 0; +- case BCH_WATERMARK_reclaim: +- return OPEN_BUCKETS_COUNT / 6; +- case BCH_WATERMARK_btree: +- case BCH_WATERMARK_btree_copygc: +- return OPEN_BUCKETS_COUNT / 4; +- case BCH_WATERMARK_copygc: +- return OPEN_BUCKETS_COUNT / 3; +- default: +- return OPEN_BUCKETS_COUNT / 2; ++ if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { ++ s->skipped_nocow++; ++ return false; + } ++ ++ return true; + } + + static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, +- u64 bucket, ++ u64 bucket, u8 gen, + enum bch_watermark watermark, +- const struct bch_alloc_v4 *a, + struct bucket_alloc_state *s, + struct closure *cl) + { +- struct open_bucket *ob; ++ if (unlikely(is_superblock_bucket(c, ca, bucket))) ++ return NULL; + + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { + s->skipped_nouse++; + return NULL; + } + +- if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { +- s->skipped_open++; +- return NULL; +- } +- +- if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, +- c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { +- s->skipped_need_journal_commit++; +- return NULL; +- } +- +- if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { +- s->skipped_nocow++; +- return NULL; +- } +- + spin_lock(&c->freelist_lock); + +- if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { ++ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(watermark))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + +@@ -254,14 +234,13 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * + return NULL; + } + +- ob = bch2_open_bucket_alloc(c); ++ struct open_bucket *ob = bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); +- + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->dev = ca->dev_idx; +- ob->gen = a->gen; ++ ob->gen 
= gen; + ob->bucket = bucket; + spin_unlock(&ob->lock); + +@@ -276,111 +255,29 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * + } + + static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, +- enum bch_watermark watermark, u64 free_entry, ++ enum bch_watermark watermark, + struct bucket_alloc_state *s, +- struct bkey_s_c freespace_k, ++ struct btree_iter *freespace_iter, + struct closure *cl) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter = { NULL }; +- struct bkey_s_c k; +- struct open_bucket *ob; +- struct bch_alloc_v4 a_convert; +- const struct bch_alloc_v4 *a; +- u64 b = free_entry & ~(~0ULL << 56); +- unsigned genbits = free_entry >> 56; +- struct printbuf buf = PRINTBUF; +- int ret; +- +- if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { +- prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" +- " freespace key ", +- ca->mi.first_bucket, ca->mi.nbuckets); +- bch2_bkey_val_to_text(&buf, c, freespace_k); +- bch2_trans_inconsistent(trans, "%s", buf.buf); +- ob = ERR_PTR(-EIO); +- goto err; +- } +- +- k = bch2_bkey_get_iter(trans, &iter, +- BTREE_ID_alloc, POS(ca->dev_idx, b), +- BTREE_ITER_cached); +- ret = bkey_err(k); +- if (ret) { +- ob = ERR_PTR(ret); +- goto err; +- } +- +- a = bch2_alloc_to_v4(k, &a_convert); +- +- if (a->data_type != BCH_DATA_free) { +- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { +- ob = NULL; +- goto err; +- } +- +- prt_printf(&buf, "non free bucket in freespace btree\n" +- " freespace key "); +- bch2_bkey_val_to_text(&buf, c, freespace_k); +- prt_printf(&buf, "\n "); +- bch2_bkey_val_to_text(&buf, c, k); +- bch2_trans_inconsistent(trans, "%s", buf.buf); +- ob = ERR_PTR(-EIO); +- goto err; +- } +- +- if (genbits != (alloc_freespace_genbits(*a) >> 56) && +- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { +- prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be 
%llu)\n" +- " freespace key ", +- genbits, alloc_freespace_genbits(*a) >> 56); +- bch2_bkey_val_to_text(&buf, c, freespace_k); +- prt_printf(&buf, "\n "); +- bch2_bkey_val_to_text(&buf, c, k); +- bch2_trans_inconsistent(trans, "%s", buf.buf); +- ob = ERR_PTR(-EIO); +- goto err; +- } +- +- if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { +- struct bch_backpointer bp; +- struct bpos bp_pos = POS_MIN; ++ u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); + +- ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1, +- &bp_pos, &bp, +- BTREE_ITER_nopreserve); +- if (ret) { +- ob = ERR_PTR(ret); +- goto err; +- } ++ if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) ++ return NULL; + +- if (!bkey_eq(bp_pos, POS_MAX)) { +- /* +- * Bucket may have data in it - we don't call +- * bc2h_trans_inconnsistent() because fsck hasn't +- * finished yet +- */ +- ob = NULL; +- goto err; +- } +- } ++ u8 gen; ++ int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); ++ if (ret < 0) ++ return ERR_PTR(ret); ++ if (ret) ++ return NULL; + +- ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); +- if (!ob) +- bch2_set_btree_iter_dontneed(&iter); +-err: +- if (iter.path) +- bch2_set_btree_iter_dontneed(&iter); +- bch2_trans_iter_exit(trans, &iter); +- printbuf_exit(&buf); +- return ob; ++ return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); + } + + /* + * This path is for before the freespace btree is initialized: +- * +- * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & +- * journal buckets - journal buckets will be < ca->new_fs_bucket_idx + */ + static noinline struct open_bucket * + bch2_bucket_alloc_early(struct btree_trans *trans, +@@ -389,10 +286,11 @@ bch2_bucket_alloc_early(struct btree_trans *trans, + struct bucket_alloc_state *s, + struct closure *cl) + { ++ struct bch_fs *c = trans->c; + struct btree_iter iter, citer; + struct bkey_s_c k, ck; + struct open_bucket *ob = NULL; +- u64 
first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); ++ u64 first_bucket = ca->mi.first_bucket; + u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 alloc_start = max(first_bucket, *dev_alloc_cursor); + u64 alloc_cursor = alloc_start; +@@ -415,10 +313,6 @@ bch2_bucket_alloc_early(struct btree_trans *trans, + if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) + break; + +- if (ca->new_fs_bucket_idx && +- is_superblock_bucket(ca, k.k->p.offset)) +- continue; +- + if (s->btree_bitmap != BTREE_BITMAP_ANY && + s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { +@@ -452,7 +346,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans, + + s->buckets_seen++; + +- ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); ++ ob = may_alloc_bucket(c, k.k->p, s) ++ ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, ++ watermark, s, cl) ++ : NULL; + next: + bch2_set_btree_iter_dontneed(&citer); + bch2_trans_iter_exit(trans, &citer); +@@ -489,20 +386,21 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); + u64 alloc_cursor = alloc_start; + int ret; +- +- BUG_ON(ca->new_fs_bucket_idx); + again: +- for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, +- POS(ca->dev_idx, alloc_cursor), 0, k, ret) { +- if (k.k->p.inode != ca->dev_idx) +- break; ++ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace, ++ POS(ca->dev_idx, alloc_cursor), ++ POS(ca->dev_idx, U64_MAX), ++ 0, k, ret) { ++ /* ++ * peek normally dosen't trim extents - they can span iter.pos, ++ * which is not what we want here: ++ */ ++ iter.k.size = iter.k.p.offset - iter.pos.offset; + +- for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); +- alloc_cursor < k.k->p.offset; +- alloc_cursor++) { ++ while (iter.k.size) { + s->buckets_seen++; + +- u64 bucket = 
alloc_cursor & ~(~0ULL << 56); ++ u64 bucket = iter.pos.offset & ~(~0ULL << 56); + if (s->btree_bitmap != BTREE_BITMAP_ANY && + s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { +@@ -511,32 +409,36 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + goto fail; + + bucket = sector_to_bucket(ca, +- round_up(bucket_to_sector(ca, bucket) + 1, ++ round_up(bucket_to_sector(ca, bucket + 1), + 1ULL << ca->mi.btree_bitmap_shift)); +- u64 genbits = alloc_cursor >> 56; +- alloc_cursor = bucket | (genbits << 56); ++ alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); + +- if (alloc_cursor > k.k->p.offset) +- bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); ++ bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); + s->skipped_mi_btree_bitmap++; +- continue; ++ goto next; + } + +- ob = try_alloc_bucket(trans, ca, watermark, +- alloc_cursor, s, k, cl); ++ ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); + if (ob) { ++ if (!IS_ERR(ob)) ++ *dev_alloc_cursor = iter.pos.offset; + bch2_set_btree_iter_dontneed(&iter); + break; + } +- } + ++ iter.k.size--; ++ iter.pos.offset++; ++ } ++next: + if (ob || ret) + break; + } + fail: + bch2_trans_iter_exit(trans, &iter); + +- if (!ob && ret) ++ BUG_ON(ob && ret); ++ ++ if (ret) + ob = ERR_PTR(ret); + + if (!ob && alloc_start > ca->mi.first_bucket) { +@@ -544,8 +446,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + goto again; + } + +- *dev_alloc_cursor = alloc_cursor; +- + return ob; + } + +@@ -595,6 +495,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, + * @watermark: how important is this allocation? + * @data_type: BCH_DATA_journal, btree, user... 
+ * @cl: if not NULL, closure to be used to wait if buckets not available ++ * @nowait: if true, do not wait for buckets to become available + * @usage: for secondarily also returning the current device usage + * + * Returns: an open_bucket on success, or an ERR_PTR() on failure. +@@ -629,6 +530,10 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + bch2_dev_do_invalidates(ca); + + if (!avail) { ++ if (watermark > BCH_WATERMARK_normal && ++ c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) ++ goto alloc; ++ + if (cl && !waiting) { + closure_wait(&c->freelist_wait, cl); + waiting = true; +@@ -711,9 +616,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, + unsigned i; + + for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) +- ret.devs[ret.nr++] = i; ++ ret.data[ret.nr++] = i; + +- bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); ++ bubble_sort(ret.data, ret.nr, dev_stripe_cmp); + return ret; + } + +@@ -785,18 +690,13 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + struct closure *cl) + { + struct bch_fs *c = trans->c; +- struct dev_alloc_list devs_sorted = +- bch2_dev_alloc_list(c, stripe, devs_may_alloc); + int ret = -BCH_ERR_insufficient_devices; + + BUG_ON(*nr_effective >= nr_replicas); + +- for (unsigned i = 0; i < devs_sorted.nr; i++) { +- struct bch_dev_usage usage; +- struct open_bucket *ob; +- +- unsigned dev = devs_sorted.devs[i]; +- struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); ++ struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); ++ darray_for_each(devs_sorted, i) { ++ struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); + if (!ca) + continue; + +@@ -805,8 +705,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + continue; + } + +- ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, +- cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); ++ struct bch_dev_usage usage; ++ struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, 
watermark, data_type, ++ cl, flags & BCH_WRITE_alloc_nowait, &usage); + if (!IS_ERR(ob)) + bch2_dev_stripe_increment_inlined(ca, stripe, &usage); + bch2_dev_put(ca); +@@ -850,10 +751,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, + struct closure *cl) + { + struct bch_fs *c = trans->c; +- struct dev_alloc_list devs_sorted; +- struct ec_stripe_head *h; +- struct open_bucket *ob; +- unsigned i, ec_idx; + int ret = 0; + + if (nr_replicas < 2) +@@ -862,34 +759,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, + if (ec_open_bucket(c, ptrs)) + return 0; + +- h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); ++ struct ec_stripe_head *h = ++ bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + if (IS_ERR(h)) + return PTR_ERR(h); + if (!h) + return 0; + +- devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); +- +- for (i = 0; i < devs_sorted.nr; i++) +- for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { ++ struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); ++ darray_for_each(devs_sorted, i) ++ for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + if (!h->s->blocks[ec_idx]) + continue; + +- ob = c->open_buckets + h->s->blocks[ec_idx]; +- if (ob->dev == devs_sorted.devs[i] && +- !test_and_set_bit(ec_idx, h->s->blocks_allocated)) +- goto got_bucket; ++ struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx]; ++ if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) { ++ ob->ec_idx = ec_idx; ++ ob->ec = h->s; ++ ec_stripe_new_get(h->s, STRIPE_REF_io); ++ ++ ret = add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_replicas, nr_effective, ++ have_cache, ob); ++ goto out; ++ } + } +- goto out_put_head; +-got_bucket: +- ob->ec_idx = ec_idx; +- ob->ec = h->s; +- ec_stripe_new_get(h->s, STRIPE_REF_io); +- +- ret = add_new_bucket(c, ptrs, devs_may_alloc, +- nr_replicas, nr_effective, +- have_cache, ob); 
+-out_put_head: ++out: + bch2_ec_stripe_head_put(c, h); + return ret; + } +@@ -1420,7 +1315,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, + if (wp->data_type != BCH_DATA_user) + have_cache = true; + +- if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { ++ if (target && !(flags & BCH_WRITE_only_specified_devs)) { + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, +@@ -1510,7 +1405,7 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, + if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + ret = -BCH_ERR_bucket_alloc_blocked; + +- if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) && ++ if (cl && !(flags & BCH_WRITE_alloc_nowait) && + bch2_err_matches(ret, BCH_ERR_freelist_empty)) + ret = -BCH_ERR_bucket_alloc_blocked; + +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +index 1a16fd5bd4f8..baf5dc163c8a 100644 +--- a/fs/bcachefs/alloc_foreground.h ++++ b/fs/bcachefs/alloc_foreground.h +@@ -20,7 +20,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *); + + struct dev_alloc_list { + unsigned nr; +- u8 devs[BCH_SB_MEMBERS_MAX]; ++ u8 data[BCH_SB_MEMBERS_MAX]; + }; + + struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, +@@ -28,13 +28,28 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, + struct bch_devs_mask *); + void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); + +-long bch2_bucket_alloc_new_fs(struct bch_dev *); +- + static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) + { + return bch2_dev_have_ref(c, ob->dev); + } + ++static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark) ++{ ++ switch (watermark) { ++ case BCH_WATERMARK_interior_updates: ++ return 0; ++ case BCH_WATERMARK_reclaim: ++ return OPEN_BUCKETS_COUNT / 6; ++ case BCH_WATERMARK_btree: ++ case BCH_WATERMARK_btree_copygc: ++ return OPEN_BUCKETS_COUNT / 4; ++ case 
BCH_WATERMARK_copygc: ++ return OPEN_BUCKETS_COUNT / 3; ++ default: ++ return OPEN_BUCKETS_COUNT / 2; ++ } ++} ++ + struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, + enum bch_watermark, enum bch_data_type, + struct closure *); +diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c +index 654a58132a4d..655be2332742 100644 +--- a/fs/bcachefs/backpointers.c ++++ b/fs/bcachefs/backpointers.c +@@ -14,42 +14,8 @@ + + #include + +-static bool extent_matches_bp(struct bch_fs *c, +- enum btree_id btree_id, unsigned level, +- struct bkey_s_c k, +- struct bpos bucket, +- struct bch_backpointer bp) +-{ +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- +- rcu_read_lock(); +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { +- struct bpos bucket2; +- struct bch_backpointer bp2; +- +- if (p.ptr.cached) +- continue; +- +- struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); +- if (!ca) +- continue; +- +- bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); +- if (bpos_eq(bucket, bucket2) && +- !memcmp(&bp, &bp2, sizeof(bp))) { +- rcu_read_unlock(); +- return true; +- } +- } +- rcu_read_unlock(); +- +- return false; +-} +- + int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + int ret = 0; +@@ -59,67 +25,70 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, + "backpointer level bad: %u >= %u", + bp.v->level, BTREE_MAX_DEPTH); + +- rcu_read_lock(); +- struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); +- if (!ca) { +- /* these will be caught by fsck */ +- rcu_read_unlock(); +- return 0; +- } +- +- struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p); +- struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset); +- rcu_read_unlock(); +- +- 
bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || +- !bpos_eq(bp.k->p, bp_pos), +- c, backpointer_bucket_offset_wrong, +- "backpointer bucket_offset wrong"); ++ bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, ++ c, backpointer_dev_bad, ++ "backpointer for BCH_SB_MEMBER_INVALID"); + fsck_err: + return ret; + } + +-void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) ++void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) + { +- prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", +- bch2_btree_id_str(bp->btree_id), +- bp->level, +- (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), +- (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), +- bp->bucket_len); +- bch2_bpos_to_text(out, bp->pos); +-} ++ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + +-void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +-{ + rcu_read_lock(); +- struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); + if (ca) { +- struct bpos bucket = bp_pos_to_bucket(ca, k.k->p); ++ u32 bucket_offset; ++ struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); + rcu_read_unlock(); +- prt_str(out, "bucket="); +- bch2_bpos_to_text(out, bucket); +- prt_str(out, " "); ++ prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); + } else { + rcu_read_unlock(); ++ prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); + } + +- bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); ++ bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); ++ prt_printf(out, " suboffset=%u len=%u gen=%u pos=", ++ (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), ++ bp.v->bucket_len, ++ bp.v->bucket_gen); ++ 
bch2_bpos_to_text(out, bp.v->pos); + } + + void bch2_backpointer_swab(struct bkey_s k) + { + struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); + +- bp.v->bucket_offset = swab40(bp.v->bucket_offset); + bp.v->bucket_len = swab32(bp.v->bucket_len); + bch2_bpos_swab(&bp.v->pos); + } + ++static bool extent_matches_bp(struct bch_fs *c, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c k, ++ struct bkey_s_c_backpointer bp) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ struct bkey_i_backpointer bp2; ++ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2); ++ ++ if (bpos_eq(bp.k->p, bp2.k.p) && ++ !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) ++ return true; ++ } ++ ++ return false; ++} ++ + static noinline int backpointer_mod_err(struct btree_trans *trans, +- struct bch_backpointer bp, +- struct bkey_s_c bp_k, + struct bkey_s_c orig_k, ++ struct bkey_i_backpointer *new_bp, ++ struct bkey_s_c found_bp, + bool insert) + { + struct bch_fs *c = trans->c; +@@ -127,12 +96,12 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, + + if (insert) { + prt_printf(&buf, "existing backpointer found when inserting "); +- bch2_backpointer_to_text(&buf, &bp); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "found "); +- bch2_bkey_val_to_text(&buf, c, bp_k); ++ bch2_bkey_val_to_text(&buf, c, found_bp); + prt_newline(&buf); + + prt_printf(&buf, "for "); +@@ -144,11 +113,11 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "searching for "); +- bch2_backpointer_to_text(&buf, &bp); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); + prt_newline(&buf); + + prt_printf(&buf, "got "); +- bch2_bkey_val_to_text(&buf, c, bp_k); ++ 
bch2_bkey_val_to_text(&buf, c, found_bp); + prt_newline(&buf); + + prt_printf(&buf, "for "); +@@ -167,230 +136,188 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, + } + + int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, +- struct bch_dev *ca, +- struct bpos bucket, +- struct bch_backpointer bp, + struct bkey_s_c orig_k, ++ struct bkey_i_backpointer *bp, + bool insert) + { + struct btree_iter bp_iter; +- struct bkey_s_c k; +- struct bkey_i_backpointer *bp_k; +- int ret; +- +- bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); +- ret = PTR_ERR_OR_ZERO(bp_k); +- if (ret) +- return ret; +- +- bkey_backpointer_init(&bp_k->k_i); +- bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); +- bp_k->v = bp; +- +- if (!insert) { +- bp_k->k.type = KEY_TYPE_deleted; +- set_bkey_val_u64s(&bp_k->k, 0); +- } +- +- k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, +- bp_k->k.p, ++ struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, ++ bp->k.p, + BTREE_ITER_intent| + BTREE_ITER_slots| + BTREE_ITER_with_updates); +- ret = bkey_err(k); ++ int ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + + if (insert + ? 
k.k->type + : (k.k->type != KEY_TYPE_backpointer || +- memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) { +- ret = backpointer_mod_err(trans, bp, k, orig_k, insert); ++ memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { ++ ret = backpointer_mod_err(trans, orig_k, bp, k, insert); + if (ret) + goto err; + } + +- ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); ++ if (!insert) { ++ bp->k.type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(&bp->k, 0); ++ } ++ ++ ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); + err: + bch2_trans_iter_exit(trans, &bp_iter); + return ret; + } + +-/* +- * Find the next backpointer >= *bp_offset: +- */ +-int bch2_get_next_backpointer(struct btree_trans *trans, +- struct bch_dev *ca, +- struct bpos bucket, int gen, +- struct bpos *bp_pos, +- struct bch_backpointer *bp, +- unsigned iter_flags) ++static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) + { +- struct bpos bp_end_pos = bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0); +- struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; +- struct bkey_s_c k; +- int ret = 0; +- +- if (bpos_ge(*bp_pos, bp_end_pos)) +- goto done; +- +- if (gen >= 0) { +- k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, +- bucket, BTREE_ITER_cached|iter_flags); +- ret = bkey_err(k); +- if (ret) +- goto out; +- +- if (k.k->type != KEY_TYPE_alloc_v4 || +- bkey_s_c_to_alloc_v4(k).v->gen != gen) +- goto done; +- } +- +- *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0)); +- +- for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, +- *bp_pos, iter_flags, k, ret) { +- if (bpos_ge(k.k->p, bp_end_pos)) +- break; ++ return (likely(!bch2_backpointers_no_use_write_buffer) ++ ? 
bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) ++ : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: ++ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); ++} + +- *bp_pos = k.k->p; +- *bp = *bkey_s_c_to_backpointer(k).v; +- goto out; +- } +-done: +- *bp_pos = SPOS_MAX; +-out: +- bch2_trans_iter_exit(trans, &bp_iter); +- bch2_trans_iter_exit(trans, &alloc_iter); +- return ret; ++static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, ++ struct bkey_s_c visiting_k, ++ struct bkey_buf *last_flushed) ++{ ++ return likely(!bch2_backpointers_no_use_write_buffer) ++ ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) ++ : 0; + } + +-static void backpointer_not_found(struct btree_trans *trans, +- struct bpos bp_pos, +- struct bch_backpointer bp, +- struct bkey_s_c k) ++static int backpointer_target_not_found(struct btree_trans *trans, ++ struct bkey_s_c_backpointer bp, ++ struct bkey_s_c target_k, ++ struct bkey_buf *last_flushed) + { + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; ++ int ret = 0; + + /* + * If we're using the btree write buffer, the backpointer we were + * looking at may have already been deleted - failure to find what it + * pointed to is not an error: + */ +- if (likely(!bch2_backpointers_no_use_write_buffer)) +- return; +- +- struct bpos bucket; +- if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) +- return; ++ ret = last_flushed ++ ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) ++ : 0; ++ if (ret) ++ return ret; + + prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", +- bp.level ? "btree node" : "extent"); +- prt_printf(&buf, "bucket: "); +- bch2_bpos_to_text(&buf, bucket); +- prt_printf(&buf, "\n "); ++ bp.v->level ? 
"btree node" : "extent"); ++ bch2_bkey_val_to_text(&buf, c, bp.s_c); + +- prt_printf(&buf, "backpointer pos: "); +- bch2_bpos_to_text(&buf, bp_pos); + prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, target_k); + +- bch2_backpointer_to_text(&buf, &bp); +- prt_printf(&buf, "\n "); +- bch2_bkey_val_to_text(&buf, c, k); +- if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) +- bch_err_ratelimited(c, "%s", buf.buf); +- else +- bch2_trans_inconsistent(trans, "%s", buf.buf); ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) ++ if (p.ptr.dev == bp.k->p.inode) { ++ prt_printf(&buf, "\n "); ++ struct bkey_i_backpointer bp2; ++ bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); ++ } + ++ if (fsck_err(trans, backpointer_to_missing_ptr, ++ "%s", buf.buf)) ++ ret = bch2_backpointer_del(trans, bp.k->p); ++fsck_err: + printbuf_exit(&buf); ++ return ret; + } + + struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, ++ struct bkey_s_c_backpointer bp, + struct btree_iter *iter, +- struct bpos bp_pos, +- struct bch_backpointer bp, +- unsigned iter_flags) ++ unsigned iter_flags, ++ struct bkey_buf *last_flushed) + { +- if (likely(!bp.level)) { +- struct bch_fs *c = trans->c; +- +- struct bpos bucket; +- if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) +- return bkey_s_c_err(-EIO); +- +- bch2_trans_node_iter_init(trans, iter, +- bp.btree_id, +- bp.pos, +- 0, 0, +- iter_flags); +- struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); +- if (bkey_err(k)) { +- bch2_trans_iter_exit(trans, iter); +- return k; +- } ++ struct bch_fs *c = trans->c; + +- if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) +- return k; ++ if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) ++ return bkey_s_c_null; + ++ 
bch2_trans_node_iter_init(trans, iter, ++ bp.v->btree_id, ++ bp.v->pos, ++ 0, ++ bp.v->level, ++ iter_flags); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); +- backpointer_not_found(trans, bp_pos, bp, k); +- return bkey_s_c_null; ++ return k; ++ } ++ ++ if (k.k && ++ extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) ++ return k; ++ ++ bch2_trans_iter_exit(trans, iter); ++ ++ if (!bp.v->level) { ++ int ret = backpointer_target_not_found(trans, bp, k, last_flushed); ++ return ret ? bkey_s_c_err(ret) : bkey_s_c_null; + } else { +- struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); ++ struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); ++ if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) ++ return bkey_s_c_null; ++ if (IS_ERR_OR_NULL(b)) ++ return ((struct bkey_s_c) { .k = ERR_CAST(b) }); + +- if (IS_ERR_OR_NULL(b)) { +- bch2_trans_iter_exit(trans, iter); +- return IS_ERR(b) ? 
bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null; +- } + return bkey_i_to_s_c(&b->key); + } + } + + struct btree *bch2_backpointer_get_node(struct btree_trans *trans, ++ struct bkey_s_c_backpointer bp, + struct btree_iter *iter, +- struct bpos bp_pos, +- struct bch_backpointer bp) ++ struct bkey_buf *last_flushed) + { + struct bch_fs *c = trans->c; + +- BUG_ON(!bp.level); +- +- struct bpos bucket; +- if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) +- return ERR_PTR(-EIO); ++ BUG_ON(!bp.v->level); + + bch2_trans_node_iter_init(trans, iter, +- bp.btree_id, +- bp.pos, ++ bp.v->btree_id, ++ bp.v->pos, + 0, +- bp.level - 1, ++ bp.v->level - 1, + 0); + struct btree *b = bch2_btree_iter_peek_node(iter); + if (IS_ERR_OR_NULL(b)) + goto err; + +- BUG_ON(b->c.level != bp.level - 1); ++ BUG_ON(b->c.level != bp.v->level - 1); + +- if (extent_matches_bp(c, bp.btree_id, bp.level, +- bkey_i_to_s_c(&b->key), +- bucket, bp)) ++ if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, ++ bkey_i_to_s_c(&b->key), bp)) + return b; + + if (btree_node_will_make_reachable(b)) { + b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + } else { +- backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); +- b = NULL; ++ int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); ++ b = ret ? 
ERR_PTR(ret) : NULL; + } + err: + bch2_trans_iter_exit(trans, iter); + return b; + } + +-static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, +- struct bkey_s_c k) ++static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, ++ struct bkey_buf *last_flushed) + { ++ if (k.k->type != KEY_TYPE_backpointer) ++ return 0; ++ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter = { NULL }; + struct bkey_s_c alloc_k; +@@ -399,10 +326,14 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ + + struct bpos bucket; + if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { ++ ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); ++ if (ret) ++ goto out; ++ + if (fsck_err(trans, backpointer_to_missing_device, + "backpointer for missing device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) +- ret = bch2_btree_delete_at(trans, bp_iter, 0); ++ ret = bch2_backpointer_del(trans, k.k->p); + goto out; + } + +@@ -411,13 +342,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ + if (ret) + goto out; + +- if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, +- trans, backpointer_to_missing_alloc, +- "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", +- alloc_iter.pos.inode, alloc_iter.pos.offset, +- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { +- ret = bch2_btree_delete_at(trans, bp_iter, 0); +- goto out; ++ if (alloc_k.k->type != KEY_TYPE_alloc_v4) { ++ ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); ++ if (ret) ++ goto out; ++ ++ if (fsck_err(trans, backpointer_to_missing_alloc, ++ "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", ++ alloc_iter.pos.inode, alloc_iter.pos.offset, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ++ ret = bch2_backpointer_del(trans, k.k->p); + } + out: + fsck_err: +@@ -429,18 +363,24 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, 
struct btree_ + /* verify that every backpointer has a corresponding alloc key */ + int bch2_check_btree_backpointers(struct bch_fs *c) + { ++ struct bkey_buf last_flushed; ++ bch2_bkey_buf_init(&last_flushed); ++ bkey_init(&last_flushed.k->k); ++ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_backpointers, POS_MIN, 0, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- bch2_check_btree_backpointer(trans, &iter, k))); ++ bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); ++ ++ bch2_bkey_buf_exit(&last_flushed, c); + bch_err_fn(c, ret); + return ret; + } + + struct extents_to_bp_state { +- struct bpos bucket_start; +- struct bpos bucket_end; ++ struct bpos bp_start; ++ struct bpos bp_end; + struct bkey_buf last_flushed; + }; + +@@ -501,9 +441,13 @@ static int check_extent_checksum(struct btree_trans *trans, + goto err; + + prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); +- prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree)); ++ prt_printf(&buf, "\n "); ++ bch2_btree_id_to_text(&buf, btree); ++ prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, extent); +- prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); ++ prt_printf(&buf, "\n "); ++ bch2_btree_id_to_text(&buf, o_btree); ++ prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, extent2); + + struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); +@@ -524,41 +468,25 @@ static int check_extent_checksum(struct btree_trans *trans, + + static int check_bp_exists(struct btree_trans *trans, + struct extents_to_bp_state *s, +- struct bpos bucket, +- struct bch_backpointer bp, ++ struct bkey_i_backpointer *bp, + struct bkey_s_c orig_k) + { + struct bch_fs *c = trans->c; +- struct btree_iter bp_iter = {}; + struct btree_iter other_extent_iter = {}; + struct printbuf buf = PRINTBUF; +- struct bkey_s_c bp_k; +- int ret = 0; +- +- struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket); +- if (!ca) { +- prt_str(&buf, "extent for 
nonexistent device:bucket "); +- bch2_bpos_to_text(&buf, bucket); +- prt_str(&buf, "\n "); +- bch2_bkey_val_to_text(&buf, c, orig_k); +- bch_err(c, "%s", buf.buf); +- ret = -BCH_ERR_fsck_repair_unimplemented; +- goto err; +- } + +- if (bpos_lt(bucket, s->bucket_start) || +- bpos_gt(bucket, s->bucket_end)) +- goto out; ++ if (bpos_lt(bp->k.p, s->bp_start) || ++ bpos_gt(bp->k.p, s->bp_end)) ++ return 0; + +- bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, +- bucket_pos_to_bp(ca, bucket, bp.bucket_offset), +- 0); +- ret = bkey_err(bp_k); ++ struct btree_iter bp_iter; ++ struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); ++ int ret = bkey_err(bp_k); + if (ret) + goto err; + + if (bp_k.k->type != KEY_TYPE_backpointer || +- memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { ++ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { + ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); + if (ret) + goto err; +@@ -570,7 +498,6 @@ static int check_bp_exists(struct btree_trans *trans, + fsck_err: + bch2_trans_iter_exit(trans, &other_extent_iter); + bch2_trans_iter_exit(trans, &bp_iter); +- bch2_dev_put(ca); + printbuf_exit(&buf); + return ret; + check_existing_bp: +@@ -578,10 +505,10 @@ static int check_bp_exists(struct btree_trans *trans, + if (bp_k.k->type != KEY_TYPE_backpointer) + goto missing; + +- struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v; ++ struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); + + struct bkey_s_c other_extent = +- bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0); ++ bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); + ret = bkey_err(other_extent); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + ret = 0; +@@ -600,19 +527,23 @@ static int check_bp_exists(struct btree_trans *trans, + bch_err(c, "%s", buf.buf); + + if (other_extent.k->size <= 
orig_k.k->size) { +- ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode); ++ ret = drop_dev_and_update(trans, other_bp.v->btree_id, ++ other_extent, bp->k.p.inode); + if (ret) + goto err; + goto out; + } else { +- ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode); ++ ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); + if (ret) + goto err; + goto missing; + } + } + +- ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); ++ ret = check_extent_checksum(trans, ++ other_bp.v->btree_id, other_extent, ++ bp->v.btree_id, orig_k, ++ bp->k.p.inode); + if (ret < 0) + goto err; + if (ret) { +@@ -620,7 +551,8 @@ static int check_bp_exists(struct btree_trans *trans, + goto missing; + } + +- ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode); ++ ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, ++ other_bp.v->btree_id, other_extent, bp->k.p.inode); + if (ret < 0) + goto err; + if (ret) { +@@ -629,7 +561,7 @@ static int check_bp_exists(struct btree_trans *trans, + } + + printbuf_reset(&buf); +- prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode); ++ prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_str(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, other_extent); +@@ -638,21 +570,15 @@ static int check_bp_exists(struct btree_trans *trans, + goto err; + missing: + printbuf_reset(&buf); +- prt_printf(&buf, "missing backpointer for btree=%s l=%u ", +- bch2_btree_id_str(bp.btree_id), bp.level); ++ prt_str(&buf, "missing backpointer\n for: "); + bch2_bkey_val_to_text(&buf, c, orig_k); +- prt_printf(&buf, "\n got: "); ++ prt_printf(&buf, "\n want: "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); ++ prt_printf(&buf, "\n got: "); + bch2_bkey_val_to_text(&buf, c, bp_k); + 
+- struct bkey_i_backpointer n_bp_k; +- bkey_backpointer_init(&n_bp_k.k_i); +- n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); +- n_bp_k.v = bp; +- prt_printf(&buf, "\n want: "); +- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); +- + if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) +- ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true); ++ ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true); + + goto out; + } +@@ -663,31 +589,33 @@ static int check_extent_to_backpointers(struct btree_trans *trans, + struct bkey_s_c k) + { + struct bch_fs *c = trans->c; +- struct bkey_ptrs_c ptrs; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; +- int ret; + +- ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { +- struct bpos bucket_pos = POS_MIN; +- struct bch_backpointer bp; +- + if (p.ptr.cached) + continue; + ++ if (p.ptr.dev == BCH_SB_MEMBER_INVALID) ++ continue; ++ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); +- if (ca) +- bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp); ++ bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); ++ bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); + rcu_read_unlock(); + +- if (!ca) +- continue; ++ if (check || empty) { ++ struct bkey_i_backpointer bp; ++ bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); + +- ret = check_bp_exists(trans, s, bucket_pos, bp, k); +- if (ret) +- return ret; ++ int ret = check ++ ? 
check_bp_exists(trans, s, &bp, k) ++ : bch2_bucket_backpointer_mod(trans, k, &bp, true); ++ if (ret) ++ return ret; ++ } + } + + return 0; +@@ -896,54 +824,330 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, + return 0; + } + ++enum alloc_sector_counter { ++ ALLOC_dirty, ++ ALLOC_cached, ++ ALLOC_stripe, ++ ALLOC_SECTORS_NR ++}; ++ ++static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) ++{ ++ switch (t) { ++ case BCH_DATA_btree: ++ case BCH_DATA_user: ++ return ALLOC_dirty; ++ case BCH_DATA_cached: ++ return ALLOC_cached; ++ case BCH_DATA_stripe: ++ return ALLOC_stripe; ++ default: ++ BUG(); ++ } ++} ++ ++static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); ++ ++static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, ++ struct bkey_buf *last_flushed) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_alloc_v4 a_convert; ++ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); ++ bool need_commit = false; ++ ++ if (a->data_type == BCH_DATA_sb || ++ a->data_type == BCH_DATA_journal || ++ a->data_type == BCH_DATA_parity) ++ return 0; ++ ++ u32 sectors[ALLOC_SECTORS_NR]; ++ memset(sectors, 0, sizeof(sectors)); ++ ++ struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); ++ if (!ca) ++ return 0; ++ ++ struct btree_iter iter; ++ struct bkey_s_c bp_k; ++ int ret = 0; ++ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, ++ bucket_pos_to_bp_start(ca, alloc_k.k->p), ++ bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { ++ if (bp_k.k->type != KEY_TYPE_backpointer) ++ continue; ++ ++ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); ++ ++ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && ++ (bp.v->bucket_gen != a->gen || ++ bp.v->pad)) { ++ ret = bch2_backpointer_del(trans, bp_k.k->p); ++ if (ret) ++ break; ++ ++ 
need_commit = true; ++ continue; ++ } ++ ++ if (bp.v->bucket_gen != a->gen) ++ continue; ++ ++ sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; ++ }; ++ bch2_trans_iter_exit(trans, &iter); ++ if (ret) ++ goto err; ++ ++ if (need_commit) { ++ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); ++ if (ret) ++ goto err; ++ } ++ ++ /* Cached pointers don't have backpointers: */ ++ ++ if (sectors[ALLOC_dirty] != a->dirty_sectors || ++ sectors[ALLOC_stripe] != a->stripe_sectors) { ++ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { ++ ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); ++ if (ret) ++ goto err; ++ } ++ ++ if (sectors[ALLOC_dirty] > a->dirty_sectors || ++ sectors[ALLOC_stripe] > a->stripe_sectors) { ++ ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: ++ -BCH_ERR_transaction_restart_nested; ++ goto err; ++ } ++ ++ if (!sectors[ALLOC_dirty] && ++ !sectors[ALLOC_stripe]) ++ __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); ++ else ++ __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); ++ } ++err: ++ bch2_dev_put(ca); ++ return ret; ++} ++ ++static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr_v2: { ++ bool ret = false; ++ ++ rcu_read_lock(); ++ struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; ++ while (pos.inode <= k.k->p.inode) { ++ if (pos.inode >= c->sb.nr_devices) ++ break; ++ ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); ++ if (!ca) ++ goto next; ++ ++ struct bpos bucket = bp_pos_to_bucket(ca, pos); ++ bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, ++ ca->mi.nbuckets, bucket.offset); ++ if (bucket.offset == ca->mi.nbuckets) ++ goto next; ++ ++ ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p); ++ if (ret) ++ break; ++next: ++ pos = SPOS(pos.inode + 1, 0, 0); ++ } ++ 
rcu_read_unlock(); ++ ++ return ret; ++ } ++ case KEY_TYPE_btree_ptr: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, ++ enum btree_id btree, unsigned level) ++{ ++ struct btree_iter iter; ++ bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); ++ struct btree *b = bch2_btree_iter_peek_node(&iter); ++ int ret = PTR_ERR_OR_ZERO(b); ++ if (ret) ++ goto err; ++ ++ if (b) ++ bch2_node_pin(trans->c, b); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, ++ struct bpos start, struct bpos *end) ++{ ++ struct bch_fs *c = trans->c; ++ int ret = 0; ++ ++ struct bkey_buf tmp; ++ bch2_bkey_buf_init(&tmp); ++ ++ bch2_btree_cache_unpin(c); ++ ++ *end = SPOS_MAX; ++ ++ s64 mem_may_pin = mem_may_pin_bytes(c); ++ struct btree_iter iter; ++ bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, ++ 0, 1, BTREE_ITER_prefetch); ++ ret = for_each_btree_key_continue(trans, iter, 0, k, ({ ++ if (!backpointer_node_has_missing(c, k)) ++ continue; ++ ++ mem_may_pin -= c->opts.btree_node_size; ++ if (mem_may_pin <= 0) ++ break; ++ ++ bch2_bkey_buf_reassemble(&tmp, c, k); ++ struct btree_path *path = btree_iter_path(trans, &iter); ++ ++ BUG_ON(path->level != 1); ++ ++ bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); ++ })); ++ if (ret) ++ return ret; ++ ++ struct bpos pinned = SPOS_MAX; ++ mem_may_pin = mem_may_pin_bytes(c); ++ bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, ++ 0, 1, BTREE_ITER_prefetch); ++ ret = for_each_btree_key_continue(trans, iter, 0, k, ({ ++ if (!backpointer_node_has_missing(c, k)) ++ continue; ++ ++ mem_may_pin -= c->opts.btree_node_size; ++ if (mem_may_pin <= 0) { ++ *end = pinned; ++ break; ++ } ++ ++ bch2_bkey_buf_reassemble(&tmp, c, k); ++ struct btree_path *path = btree_iter_path(trans, &iter); ++ 
++ BUG_ON(path->level != 1); ++ ++ int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1); ++ ++ if (!ret2) ++ pinned = tmp.k->k.p; ++ ++ ret; ++ })); ++ if (ret) ++ return ret; ++ ++ return ret; ++} ++ + int bch2_check_extents_to_backpointers(struct bch_fs *c) + { ++ int ret = 0; ++ ++ /* ++ * Can't allow devices to come/go/resize while we have bucket bitmaps ++ * allocated ++ */ ++ lockdep_assert_held(&c->state_lock); ++ ++ for_each_member_device(c, ca) { ++ BUG_ON(ca->bucket_backpointer_mismatches); ++ ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), ++ sizeof(unsigned long), ++ GFP_KERNEL); ++ ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), ++ sizeof(unsigned long), ++ GFP_KERNEL); ++ if (!ca->bucket_backpointer_mismatches || ++ !ca->bucket_backpointer_empty) { ++ bch2_dev_put(ca); ++ ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; ++ goto err_free_bitmaps; ++ } ++ } ++ + struct btree_trans *trans = bch2_trans_get(c); +- struct extents_to_bp_state s = { .bucket_start = POS_MIN }; +- int ret; ++ struct extents_to_bp_state s = { .bp_start = POS_MIN }; + + bch2_bkey_buf_init(&s.last_flushed); + bkey_init(&s.last_flushed.k->k); + ++ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, ++ POS_MIN, BTREE_ITER_prefetch, k, ({ ++ check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); ++ })); ++ if (ret) ++ goto err; ++ ++ u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; ++ for_each_member_device(c, ca) { ++ nr_buckets += ca->mi.nbuckets; ++ nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); ++ nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); ++ } ++ ++ if (!nr_mismatches && !nr_empty) ++ goto err; ++ ++ bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", ++ nr_mismatches + nr_empty, nr_buckets); ++ + while (1) { +- struct bbpos end; +- ret = bch2_get_btree_in_memory_pos(trans, +- 
BIT_ULL(BTREE_ID_backpointers), +- BIT_ULL(BTREE_ID_backpointers), +- BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); ++ ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); + if (ret) + break; + +- s.bucket_end = end.pos; +- +- if ( bpos_eq(s.bucket_start, POS_MIN) && +- !bpos_eq(s.bucket_end, SPOS_MAX)) ++ if ( bpos_eq(s.bp_start, POS_MIN) && ++ !bpos_eq(s.bp_end, SPOS_MAX)) + bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", + __func__, btree_nodes_fit_in_ram(c)); + +- if (!bpos_eq(s.bucket_start, POS_MIN) || +- !bpos_eq(s.bucket_end, SPOS_MAX)) { ++ if (!bpos_eq(s.bp_start, POS_MIN) || ++ !bpos_eq(s.bp_end, SPOS_MAX)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "check_extents_to_backpointers(): "); +- bch2_bpos_to_text(&buf, s.bucket_start); ++ bch2_bpos_to_text(&buf, s.bp_start); + prt_str(&buf, "-"); +- bch2_bpos_to_text(&buf, s.bucket_end); ++ bch2_bpos_to_text(&buf, s.bp_end); + + bch_verbose(c, "%s", buf.buf); + printbuf_exit(&buf); + } + + ret = bch2_check_extents_to_backpointers_pass(trans, &s); +- if (ret || bpos_eq(s.bucket_end, SPOS_MAX)) ++ if (ret || bpos_eq(s.bp_end, SPOS_MAX)) + break; + +- s.bucket_start = bpos_successor(s.bucket_end); ++ s.bp_start = bpos_successor(s.bp_end); + } ++err: + bch2_trans_put(trans); + bch2_bkey_buf_exit(&s.last_flushed, c); +- + bch2_btree_cache_unpin(c); ++err_free_bitmaps: ++ for_each_member_device(c, ca) { ++ kvfree(ca->bucket_backpointer_empty); ++ ca->bucket_backpointer_empty = NULL; ++ kvfree(ca->bucket_backpointer_mismatches); ++ ca->bucket_backpointer_mismatches = NULL; ++ } + + bch_err_fn(c, ret); + return ret; +@@ -959,44 +1163,43 @@ static int check_one_backpointer(struct btree_trans *trans, + return 0; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); +- struct bch_fs *c = trans->c; +- struct btree_iter iter; + struct bbpos pos = bp_to_bbpos(*bp.v); +- struct bkey_s_c k; +- struct printbuf buf 
= PRINTBUF; +- int ret; + + if (bbpos_cmp(pos, start) < 0 || + bbpos_cmp(pos, end) > 0) + return 0; + +- k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0); +- ret = bkey_err(k); ++ struct btree_iter iter; ++ struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed); ++ int ret = bkey_err(k); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + return 0; + if (ret) + return ret; + +- if (!k.k) { +- ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed); +- if (ret) +- goto out; +- +- if (fsck_err(trans, backpointer_to_missing_ptr, +- "backpointer for missing %s\n %s", +- bp.v->level ? "btree node" : "extent", +- (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { +- ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); +- goto out; +- } +- } +-out: +-fsck_err: + bch2_trans_iter_exit(trans, &iter); +- printbuf_exit(&buf); + return ret; + } + ++static int check_bucket_backpointers_to_extents(struct btree_trans *trans, ++ struct bch_dev *ca, struct bpos bucket) ++{ ++ u32 restart_count = trans->restart_count; ++ struct bkey_buf last_flushed; ++ bch2_bkey_buf_init(&last_flushed); ++ bkey_init(&last_flushed.k->k); ++ ++ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, ++ bucket_pos_to_bp_start(ca, bucket), ++ bucket_pos_to_bp_end(ca, bucket), ++ 0, k, ++ check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) ++ ); ++ ++ bch2_bkey_buf_exit(&last_flushed, trans->c); ++ return ret ?: trans_was_restarted(trans, restart_count); ++} ++ + static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, + struct bbpos start, + struct bbpos end) +@@ -1009,9 +1212,8 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, + bkey_init(&last_flushed.k->k); + progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); + +- int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, +- POS_MIN, 
BTREE_ITER_prefetch, k, +- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ++ int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, ++ POS_MIN, BTREE_ITER_prefetch, k, ({ + progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); + check_one_backpointer(trans, start, end, k, &last_flushed); + })); +diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h +index 3b29fdf519dd..060dad1521ee 100644 +--- a/fs/bcachefs/backpointers.h ++++ b/fs/bcachefs/backpointers.h +@@ -18,14 +18,14 @@ static inline u64 swab40(u64 x) + ((x & 0xff00000000ULL) >> 32)); + } + +-int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags); +-void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); +-void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, ++ struct bkey_validate_context); ++void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + void bch2_backpointer_swab(struct bkey_s); + + #define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ + .key_validate = bch2_backpointer_validate, \ +- .val_to_text = bch2_backpointer_k_to_text, \ ++ .val_to_text = bch2_backpointer_to_text, \ + .swab = bch2_backpointer_swab, \ + .min_val_size = 32, \ + }) +@@ -43,22 +43,24 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos + return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); + } + ++static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos, ++ u32 *bucket_offset) ++{ ++ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; ++ ++ return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset)); ++} ++ + static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) + { + rcu_read_lock(); +- struct bch_dev *ca = 
bch2_dev_rcu(c, bp_pos.inode); ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); + if (ca) + *bucket = bp_pos_to_bucket(ca, bp_pos); + rcu_read_unlock(); + return ca != NULL; + } + +-static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) +-{ +- return !bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket), +- c, "backpointer for missing device %llu", bp_pos.inode); +-} +- + static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, + struct bpos bucket, + u64 bucket_offset) +@@ -80,31 +82,35 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca, + return ret; + } + +-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *, +- struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); ++static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket) ++{ ++ return bucket_pos_to_bp(ca, bucket, 0); ++} ++ ++static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket) ++{ ++ return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0)); ++} ++ ++int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, ++ struct bkey_s_c, ++ struct bkey_i_backpointer *, ++ bool); + + static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, +- struct bch_dev *ca, +- struct bpos bucket, +- struct bch_backpointer bp, + struct bkey_s_c orig_k, ++ struct bkey_i_backpointer *bp, + bool insert) + { + if (unlikely(bch2_backpointers_no_use_write_buffer)) +- return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert); +- +- struct bkey_i_backpointer bp_k; +- +- bkey_backpointer_init(&bp_k.k_i); +- bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); +- bp_k.v = bp; ++ return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); + + if (!insert) { +- bp_k.k.type = KEY_TYPE_deleted; +- 
set_bkey_val_u64s(&bp_k.k, 0); ++ bp->k.type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(&bp->k, 0); + } + +- return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); ++ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i); + } + + static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, +@@ -134,44 +140,29 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, + } + } + +-static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, ++static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + const union bch_extent_entry *entry, +- struct bpos *bucket_pos, struct bch_backpointer *bp, +- u64 sectors) ++ struct bkey_i_backpointer *bp) + { +- u32 bucket_offset; +- *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); +- *bp = (struct bch_backpointer) { ++ bkey_backpointer_init(&bp->k_i); ++ bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset); ++ bp->v = (struct bch_backpointer) { + .btree_id = btree_id, + .level = level, + .data_type = bch2_bkey_ptr_data_type(k, p, entry), +- .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + +- p.crc.offset, +- .bucket_len = sectors, ++ .bucket_gen = p.ptr.gen, ++ .bucket_len = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p), + .pos = k.k->p, + }; + } + +-static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, +- enum btree_id btree_id, unsigned level, +- struct bkey_s_c k, struct extent_ptr_decoded p, +- const union bch_extent_entry *entry, +- struct bpos *bucket_pos, struct bch_backpointer *bp) +-{ +- u64 sectors = ptr_disk_sectors(level ? 
btree_sectors(c) : k.k->size, p); +- +- __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors); +-} +- +-int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int, +- struct bpos *, struct bch_backpointer *, unsigned); +-struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, +- struct bpos, struct bch_backpointer, +- unsigned); +-struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, +- struct bpos, struct bch_backpointer); ++struct bkey_buf; ++struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, ++ struct btree_iter *, unsigned, struct bkey_buf *); ++struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, ++ struct btree_iter *, struct bkey_buf *); + + int bch2_check_btree_backpointers(struct bch_fs *); + int bch2_check_extents_to_backpointers(struct bch_fs *); +diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h +index be2edced5213..63abe17f35ea 100644 +--- a/fs/bcachefs/bbpos.h ++++ b/fs/bcachefs/bbpos.h +@@ -29,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos) + + static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) + { +- prt_str(out, bch2_btree_id_str(pos.btree)); ++ bch2_btree_id_to_text(out, pos.btree); + prt_char(out, ':'); + bch2_bpos_to_text(out, pos.pos); + } +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +index e94a83b8113e..161cf2f05d2a 100644 +--- a/fs/bcachefs/bcachefs.h ++++ b/fs/bcachefs/bcachefs.h +@@ -205,6 +205,7 @@ + #include + + #include "bcachefs_format.h" ++#include "btree_journal_iter_types.h" + #include "disk_accounting_types.h" + #include "errcode.h" + #include "fifo.h" +@@ -293,6 +294,8 @@ do { \ + + #define bch_info(c, fmt, ...) \ + bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_info_ratelimited(c, fmt, ...) 
\ ++ bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) + #define bch_notice(c, fmt, ...) \ + bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) + #define bch_warn(c, fmt, ...) \ +@@ -352,6 +355,12 @@ do { \ + bch_info(c, fmt, ##__VA_ARGS__); \ + } while (0) + ++#define bch_verbose_ratelimited(c, fmt, ...) \ ++do { \ ++ if ((c)->opts.verbose) \ ++ bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \ ++} while (0) ++ + #define pr_verbose_init(opts, fmt, ...) \ + do { \ + if (opt_get(opts, verbose)) \ +@@ -538,20 +547,20 @@ struct bch_dev { + + /* + * Buckets: +- * Per-bucket arrays are protected by c->mark_lock, bucket_lock and +- * gc_gens_lock, for device resize - holding any is sufficient for +- * access: Or rcu_read_lock(), but only for dev_ptr_stale(): ++ * Per-bucket arrays are protected by either rcu_read_lock or ++ * state_lock, for device resize. + */ + GENRADIX(struct bucket) buckets_gc; + struct bucket_gens __rcu *bucket_gens; + u8 *oldest_gen; + unsigned long *buckets_nouse; +- struct rw_semaphore bucket_lock; ++ ++ unsigned long *bucket_backpointer_mismatches; ++ unsigned long *bucket_backpointer_empty; + + struct bch_dev_usage __percpu *usage; + + /* Allocator: */ +- u64 new_fs_bucket_idx; + u64 alloc_cursor[3]; + + unsigned nr_open_buckets; +@@ -606,6 +615,7 @@ struct bch_dev { + x(going_ro) \ + x(write_disable_complete) \ + x(clean_shutdown) \ ++ x(recovery_running) \ + x(fsck_running) \ + x(initial_gc_unfixed) \ + x(need_delete_dead_snapshots) \ +@@ -650,28 +660,6 @@ struct journal_seq_blacklist_table { + } entries[]; + }; + +-struct journal_keys { +- /* must match layout in darray_types.h */ +- size_t nr, size; +- struct journal_key { +- u64 journal_seq; +- u32 journal_offset; +- enum btree_id btree_id:8; +- unsigned level:8; +- bool allocated; +- bool overwritten; +- struct bkey_i *k; +- } *data; +- /* +- * Gap buffer: instead of all the empty space in the array being at the +- * end of the buffer - from @nr to @size - the 
empty space is at @gap. +- * This means that sequential insertions are O(n) instead of O(n^2). +- */ +- size_t gap; +- atomic_t ref; +- bool initial_ref_held; +-}; +- + struct btree_trans_buf { + struct btree_trans *trans; + }; +@@ -680,6 +668,7 @@ struct btree_trans_buf { + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + + #define BCH_WRITE_REFS() \ ++ x(journal) \ + x(trans) \ + x(write) \ + x(promote) \ +@@ -692,6 +681,7 @@ struct btree_trans_buf { + x(dio_write) \ + x(discard) \ + x(discard_fast) \ ++ x(check_discard_freespace_key) \ + x(invalidate) \ + x(delete_dead_snapshots) \ + x(gc_gens) \ +@@ -734,6 +724,12 @@ struct bch_fs { + #else + struct percpu_ref writes; + #endif ++ /* ++ * Certain operations are only allowed in single threaded mode, during ++ * recovery, and we want to assert that this is the case: ++ */ ++ struct task_struct *recovery_task; ++ + /* + * Analagous to c->writes, for asynchronous ops that don't necessarily + * need fs to be read-write +@@ -764,6 +760,8 @@ struct bch_fs { + __uuid_t user_uuid; + + u16 version; ++ u16 version_incompat; ++ u16 version_incompat_allowed; + u16 version_min; + u16 version_upgrade_complete; + +@@ -834,9 +832,10 @@ struct bch_fs { + struct work_struct btree_interior_update_work; + + struct workqueue_struct *btree_node_rewrite_worker; +- +- struct list_head pending_node_rewrites; +- struct mutex pending_node_rewrites_lock; ++ struct list_head btree_node_rewrites; ++ struct list_head btree_node_rewrites_pending; ++ spinlock_t btree_node_rewrites_lock; ++ struct closure_waitlist btree_node_rewrites_wait; + + /* btree_io.c: */ + spinlock_t btree_write_error_lock; +@@ -967,8 +966,7 @@ struct bch_fs { + struct rhashtable promote_table; + + mempool_t compression_bounce[2]; +- mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; +- mempool_t decompress_workspace; ++ mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; + size_t zstd_workspace_size; + + struct crypto_shash *sha256; +@@ -1027,6 +1025,7 
@@ struct bch_fs { + struct list_head vfs_inodes_list; + struct mutex vfs_inodes_lock; + struct rhashtable vfs_inodes_table; ++ struct rhltable vfs_inodes_by_inum_table; + + /* VFS IO PATH - fs-io.c */ + struct bio_set writepage_bioset; +@@ -1048,10 +1047,12 @@ struct bch_fs { + * for signaling to the toplevel code which pass we want to run now. + */ + enum bch_recovery_pass curr_recovery_pass; ++ enum bch_recovery_pass next_recovery_pass; + /* bitmask of recovery passes that we actually ran */ + u64 recovery_passes_complete; + /* never rewinds version of curr_recovery_pass */ + enum bch_recovery_pass recovery_pass_done; ++ spinlock_t recovery_pass_lock; + struct semaphore online_fsck_mutex; + + /* DEBUG JUNK */ +@@ -1062,9 +1063,6 @@ struct bch_fs { + struct btree_node *verify_ondisk; + struct mutex verify_lock; + +- u64 *unused_inode_hints; +- unsigned inode_shard_bits; +- + /* + * A btree node on disk could have too many bsets for an iterator to fit + * on the stack - have to dynamically allocate them +@@ -1086,8 +1084,6 @@ struct bch_fs { + u64 counters_on_mount[BCH_COUNTER_NR]; + u64 __percpu *counters; + +- unsigned copy_gc_enabled:1; +- + struct bch2_time_stats times[BCH_TIME_STAT_NR]; + + struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +index 5004f6ba997c..f70f0108401f 100644 +--- a/fs/bcachefs/bcachefs_format.h ++++ b/fs/bcachefs/bcachefs_format.h +@@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k) + x(snapshot_tree, 31) \ + x(logged_op_truncate, 32) \ + x(logged_op_finsert, 33) \ +- x(accounting, 34) ++ x(accounting, 34) \ ++ x(inode_alloc_cursor, 35) + + enum bch_bkey_type { + #define x(name, nr) KEY_TYPE_##name = nr, +@@ -463,7 +464,8 @@ struct bch_backpointer { + __u8 btree_id; + __u8 level; + __u8 data_type; +- __u64 bucket_offset:40; ++ __u8 bucket_gen; ++ __u32 pad; + __u32 bucket_len; + struct bpos pos; + } __packed __aligned(8); 
+@@ -499,8 +501,6 @@ struct bch_sb_field { + #include "disk_groups_format.h" + #include "extents_format.h" + #include "ec_format.h" +-#include "dirent_format.h" +-#include "disk_groups_format.h" + #include "inode_format.h" + #include "journal_seq_blacklist_format.h" + #include "logged_ops_format.h" +@@ -679,7 +679,14 @@ struct bch_sb_field_ext { + x(disk_accounting_v3, BCH_VERSION(1, 10)) \ + x(disk_accounting_inum, BCH_VERSION(1, 11)) \ + x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ +- x(inode_has_child_snapshots, BCH_VERSION(1, 13)) ++ x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ ++ x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ ++ x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ ++ x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ ++ x(inode_depth, BCH_VERSION(1, 17)) \ ++ x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ ++ x(autofix_errors, BCH_VERSION(1, 19)) \ ++ x(directory_size, BCH_VERSION(1, 20)) + + enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, +@@ -844,6 +851,10 @@ LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, + struct bch_sb, flags[5], 0, 16); + LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, + struct bch_sb, flags[5], 16, 32); ++LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); ++LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, ++ struct bch_sb, flags[5], 48, 64); ++LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); + + static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) + { +@@ -896,21 +907,22 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u + x(new_varint, 15) \ + x(journal_no_flush, 16) \ + x(alloc_v2, 17) \ +- x(extents_across_btree_nodes, 18) ++ x(extents_across_btree_nodes, 18) \ ++ x(incompat_version_field, 19) + + #define BCH_SB_FEATURES_ALWAYS \ +- ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ +- (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ +- (1ULL << BCH_FEATURE_btree_updates_journalled)|\ +- (1ULL << 
BCH_FEATURE_alloc_v2)|\ +- (1ULL << BCH_FEATURE_extents_across_btree_nodes)) ++ (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ ++ BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\ ++ BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\ ++ BIT_ULL(BCH_FEATURE_alloc_v2)|\ ++ BIT_ULL(BCH_FEATURE_extents_across_btree_nodes)) + + #define BCH_SB_FEATURES_ALL \ + (BCH_SB_FEATURES_ALWAYS| \ +- (1ULL << BCH_FEATURE_new_siphash)| \ +- (1ULL << BCH_FEATURE_btree_ptr_v2)| \ +- (1ULL << BCH_FEATURE_new_varint)| \ +- (1ULL << BCH_FEATURE_journal_no_flush)) ++ BIT_ULL(BCH_FEATURE_new_siphash)| \ ++ BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ ++ BIT_ULL(BCH_FEATURE_new_varint)| \ ++ BIT_ULL(BCH_FEATURE_journal_no_flush)) + + enum bch_sb_feature { + #define x(f, n) BCH_FEATURE_##f, +@@ -1032,7 +1044,7 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) + x(crc64, 2) \ + x(xxhash, 3) + +-enum bch_csum_opts { ++enum bch_csum_opt { + #define x(t, n) BCH_CSUM_OPT_##t = n, + BCH_CSUM_OPTS() + #undef x +@@ -1221,6 +1233,15 @@ struct jset_entry_log { + u8 d[]; + } __packed __aligned(8); + ++static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l) ++{ ++ unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d); ++ ++ while (b && !l->d[b - 1]) ++ --b; ++ return b; ++} ++ + struct jset_entry_datetime { + struct jset_entry entry; + __le64 seconds; +@@ -1268,14 +1289,18 @@ LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); + /* Btree: */ + + enum btree_id_flags { +- BTREE_ID_EXTENTS = BIT(0), +- BTREE_ID_SNAPSHOTS = BIT(1), +- BTREE_ID_SNAPSHOT_FIELD = BIT(2), +- BTREE_ID_DATA = BIT(3), ++ BTREE_IS_extents = BIT(0), ++ BTREE_IS_snapshots = BIT(1), ++ BTREE_IS_snapshot_field = BIT(2), ++ BTREE_IS_data = BIT(3), ++ BTREE_IS_write_buffer = BIT(4), + }; + + #define BCH_BTREE_IDS() \ +- x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\ ++ x(extents, 0, \ ++ BTREE_IS_extents| \ ++ BTREE_IS_snapshots| \ ++ BTREE_IS_data, \ + 
BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_error)| \ + BIT_ULL(KEY_TYPE_cookie)| \ +@@ -1283,17 +1308,20 @@ enum btree_id_flags { + BIT_ULL(KEY_TYPE_reservation)| \ + BIT_ULL(KEY_TYPE_reflink_p)| \ + BIT_ULL(KEY_TYPE_inline_data)) \ +- x(inodes, 1, BTREE_ID_SNAPSHOTS, \ ++ x(inodes, 1, \ ++ BTREE_IS_snapshots, \ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_inode)| \ + BIT_ULL(KEY_TYPE_inode_v2)| \ + BIT_ULL(KEY_TYPE_inode_v3)| \ + BIT_ULL(KEY_TYPE_inode_generation)) \ +- x(dirents, 2, BTREE_ID_SNAPSHOTS, \ ++ x(dirents, 2, \ ++ BTREE_IS_snapshots, \ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_hash_whiteout)| \ + BIT_ULL(KEY_TYPE_dirent)) \ +- x(xattrs, 3, BTREE_ID_SNAPSHOTS, \ ++ x(xattrs, 3, \ ++ BTREE_IS_snapshots, \ + BIT_ULL(KEY_TYPE_whiteout)| \ + BIT_ULL(KEY_TYPE_cookie)| \ + BIT_ULL(KEY_TYPE_hash_whiteout)| \ +@@ -1307,7 +1335,9 @@ enum btree_id_flags { + BIT_ULL(KEY_TYPE_quota)) \ + x(stripes, 6, 0, \ + BIT_ULL(KEY_TYPE_stripe)) \ +- x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \ ++ x(reflink, 7, \ ++ BTREE_IS_extents| \ ++ BTREE_IS_data, \ + BIT_ULL(KEY_TYPE_reflink_v)| \ + BIT_ULL(KEY_TYPE_indirect_inline_data)| \ + BIT_ULL(KEY_TYPE_error)) \ +@@ -1315,28 +1345,38 @@ enum btree_id_flags { + BIT_ULL(KEY_TYPE_subvolume)) \ + x(snapshots, 9, 0, \ + BIT_ULL(KEY_TYPE_snapshot)) \ +- x(lru, 10, 0, \ ++ x(lru, 10, \ ++ BTREE_IS_write_buffer, \ + BIT_ULL(KEY_TYPE_set)) \ +- x(freespace, 11, BTREE_ID_EXTENTS, \ ++ x(freespace, 11, \ ++ BTREE_IS_extents, \ + BIT_ULL(KEY_TYPE_set)) \ + x(need_discard, 12, 0, \ + BIT_ULL(KEY_TYPE_set)) \ +- x(backpointers, 13, 0, \ ++ x(backpointers, 13, \ ++ BTREE_IS_write_buffer, \ + BIT_ULL(KEY_TYPE_backpointer)) \ + x(bucket_gens, 14, 0, \ + BIT_ULL(KEY_TYPE_bucket_gens)) \ + x(snapshot_trees, 15, 0, \ + BIT_ULL(KEY_TYPE_snapshot_tree)) \ +- x(deleted_inodes, 16, BTREE_ID_SNAPSHOT_FIELD, \ ++ x(deleted_inodes, 16, \ ++ BTREE_IS_snapshot_field| \ ++ BTREE_IS_write_buffer, \ + BIT_ULL(KEY_TYPE_set)) \ + 
x(logged_ops, 17, 0, \ + BIT_ULL(KEY_TYPE_logged_op_truncate)| \ +- BIT_ULL(KEY_TYPE_logged_op_finsert)) \ +- x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ ++ BIT_ULL(KEY_TYPE_logged_op_finsert)| \ ++ BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ ++ x(rebalance_work, 18, \ ++ BTREE_IS_snapshot_field| \ ++ BTREE_IS_write_buffer, \ + BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ + x(subvolume_children, 19, 0, \ + BIT_ULL(KEY_TYPE_set)) \ +- x(accounting, 20, BTREE_ID_SNAPSHOT_FIELD, \ ++ x(accounting, 20, \ ++ BTREE_IS_snapshot_field| \ ++ BTREE_IS_write_buffer, \ + BIT_ULL(KEY_TYPE_accounting)) \ + + enum btree_id { +@@ -1361,6 +1401,8 @@ static inline bool btree_id_is_alloc(enum btree_id id) + case BTREE_ID_need_discard: + case BTREE_ID_freespace: + case BTREE_ID_bucket_gens: ++ case BTREE_ID_lru: ++ case BTREE_ID_accounting: + return true; + default: + return false; +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +index 41df24a53d97..054e2d5e8448 100644 +--- a/fs/bcachefs/bkey.h ++++ b/fs/bcachefs/bkey.h +@@ -9,13 +9,6 @@ + #include "util.h" + #include "vstructs.h" + +-enum bch_validate_flags { +- BCH_VALIDATE_write = BIT(0), +- BCH_VALIDATE_commit = BIT(1), +- BCH_VALIDATE_journal = BIT(2), +- BCH_VALIDATE_silent = BIT(3), +-}; +- + #if 0 + + /* +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +index e7ac227ba7e8..15c93576b5c2 100644 +--- a/fs/bcachefs/bkey_methods.c ++++ b/fs/bcachefs/bkey_methods.c +@@ -28,7 +28,7 @@ const char * const bch2_bkey_types[] = { + }; + + static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + return 0; + } +@@ -42,7 +42,7 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, + }) + + static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + int ret = 0; + +@@ -59,7 +59,7 @@ static int 
empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, + }) + + static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + return 0; + } +@@ -83,7 +83,7 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, + }) + + static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + return 0; + } +@@ -124,7 +124,7 @@ const struct bkey_ops bch2_bkey_null_ops = { + }; + + int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) + return 0; +@@ -140,7 +140,7 @@ int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, + if (!ops->key_validate) + return 0; + +- ret = ops->key_validate(c, k, flags); ++ ret = ops->key_validate(c, k, from); + fsck_err: + return ret; + } +@@ -161,9 +161,10 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) + } + + int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, +- enum btree_node_type type, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { ++ enum btree_node_type type = __btree_node_type(from.level, from.btree); ++ + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) + return 0; + +@@ -177,7 +178,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, + return 0; + + bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && +- (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) && ++ (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) && + !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), + c, bkey_invalid_type_for_btree, + "invalid key type for btree %s (%s)", +@@ -228,15 +229,15 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, + } + + int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, +- enum 
btree_node_type type, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { +- return __bch2_bkey_validate(c, k, type, flags) ?: +- bch2_bkey_val_validate(c, k, flags); ++ return __bch2_bkey_validate(c, k, from) ?: ++ bch2_bkey_val_validate(c, k, from); + } + + int bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, +- struct bkey_s_c k, enum bch_validate_flags flags) ++ struct bkey_s_c k, ++ struct bkey_validate_context from) + { + int ret = 0; + +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +index 018fb72e32d3..bf34111cdf00 100644 +--- a/fs/bcachefs/bkey_methods.h ++++ b/fs/bcachefs/bkey_methods.h +@@ -22,7 +22,7 @@ extern const struct bkey_ops bch2_bkey_null_ops; + */ + struct bkey_ops { + int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags); ++ struct bkey_validate_context from); + void (*val_to_text)(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + void (*swab)(struct bkey_s); +@@ -48,13 +48,14 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) + : &bch2_bkey_null_ops; + } + +-int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +-int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, +- enum bch_validate_flags); +-int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, +- enum bch_validate_flags); ++int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); ++int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); ++int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); + int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, +- enum bch_validate_flags); ++ struct bkey_validate_context from); + + void bch2_bpos_to_text(struct printbuf *, struct bpos); + void bch2_bkey_to_text(struct printbuf *, const struct bkey *); 
+diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h +index c9ae9e42b385..b4f328f9853c 100644 +--- a/fs/bcachefs/bkey_types.h ++++ b/fs/bcachefs/bkey_types.h +@@ -210,4 +210,32 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ + BCH_BKEY_TYPES(); + #undef x + ++enum bch_validate_flags { ++ BCH_VALIDATE_write = BIT(0), ++ BCH_VALIDATE_commit = BIT(1), ++ BCH_VALIDATE_silent = BIT(2), ++}; ++ ++#define BKEY_VALIDATE_CONTEXTS() \ ++ x(unknown) \ ++ x(superblock) \ ++ x(journal) \ ++ x(btree_root) \ ++ x(btree_node) \ ++ x(commit) ++ ++struct bkey_validate_context { ++ enum { ++#define x(n) BKEY_VALIDATE_##n, ++ BKEY_VALIDATE_CONTEXTS() ++#undef x ++ } from:8; ++ enum bch_validate_flags flags:8; ++ u8 level; ++ enum btree_id btree; ++ bool root:1; ++ unsigned journal_offset; ++ u64 journal_seq; ++}; ++ + #endif /* _BCACHEFS_BKEY_TYPES_H */ +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +index 7123019ab3bc..ca755e8d1a37 100644 +--- a/fs/bcachefs/btree_cache.c ++++ b/fs/bcachefs/btree_cache.c +@@ -24,7 +24,10 @@ do { \ + } while (0) + + const char * const bch2_btree_node_flags[] = { +-#define x(f) #f, ++ "typebit", ++ "typebit", ++ "typebit", ++#define x(f) [BTREE_NODE_##f] = #f, + BTREE_FLAGS() + #undef x + NULL +@@ -222,7 +225,6 @@ void bch2_node_pin(struct bch_fs *c, struct btree *b) + struct btree_cache *bc = &c->btree_cache; + + mutex_lock(&bc->lock); +- BUG_ON(!__btree_node_pinned(bc, b)); + if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { + set_btree_node_pinned(b); + list_move(&b->list, &bc->live[1].list); +@@ -326,7 +328,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans, + if (!IS_ERR_OR_NULL(b)) { + mutex_lock(&c->btree_cache.lock); + +- bch2_btree_node_hash_remove(&c->btree_cache, b); ++ __bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); +@@ -1004,16 +1006,14 @@ static noinline void 
btree_bad_header(struct bch_fs *c, struct btree *b) + return; + + prt_printf(&buf, +- "btree node header doesn't match ptr\n" +- "btree %s level %u\n" +- "ptr: ", +- bch2_btree_id_str(b->c.btree_id), b->c.level); ++ "btree node header doesn't match ptr: "); ++ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); ++ prt_str(&buf, "\nptr: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + +- prt_printf(&buf, "\nheader: btree %s level %llu\n" +- "min ", +- bch2_btree_id_str(BTREE_NODE_ID(b->data)), +- BTREE_NODE_LEVEL(b->data)); ++ prt_str(&buf, "\nheader: "); ++ bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data)); ++ prt_str(&buf, "\nmin "); + bch2_bpos_to_text(&buf, b->data->min_key); + + prt_printf(&buf, "\nmax "); +@@ -1133,7 +1133,7 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->c.lock, lock_type); +- return ERR_PTR(-BCH_ERR_btree_node_read_error); ++ return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); + } + + EBUG_ON(b->c.btree_id != path->btree_id); +@@ -1223,7 +1223,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->c.lock, lock_type); +- return ERR_PTR(-BCH_ERR_btree_node_read_error); ++ return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); + } + + EBUG_ON(b->c.btree_id != path->btree_id); +@@ -1305,7 +1305,7 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, + + if (unlikely(btree_node_read_error(b))) { + six_unlock_read(&b->c.lock); +- b = ERR_PTR(-BCH_ERR_btree_node_read_error); ++ b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); + goto out; + } + +@@ -1398,13 +1398,31 @@ void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) + prt_printf(out, "(unknown btree %u)", btree); + } + ++void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id btree, 
unsigned level) ++{ ++ prt_str(out, "btree="); ++ bch2_btree_id_to_text(out, btree); ++ prt_printf(out, " level=%u", level); ++} ++ ++void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, ++ enum btree_id btree, unsigned level, struct bkey_s_c k) ++{ ++ bch2_btree_id_to_text(out, btree); ++ prt_printf(out, " level %u/", level); ++ struct btree_root *r = bch2_btree_id_root(c, btree); ++ if (r) ++ prt_printf(out, "%u", r->level); ++ else ++ prt_printf(out, "(unknown)"); ++ prt_printf(out, "\n "); ++ ++ bch2_bkey_val_to_text(out, c, k); ++} ++ + void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) + { +- prt_printf(out, "%s level %u/%u\n ", +- bch2_btree_id_str(b->c.btree_id), +- b->c.level, +- bch2_btree_id_root(c, b->c.btree_id)->level); +- bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ __bch2_btree_pos_to_text(out, c, b->c.btree_id, b->c.level, bkey_i_to_s_c(&b->key)); + } + + void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) +@@ -1478,8 +1496,12 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc + prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); + prt_newline(out); + +- for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) +- prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]); ++ for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { ++ bch2_btree_id_to_text(out, i); ++ prt_printf(out, "\t"); ++ prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); ++ prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); ++ } + + prt_newline(out); + prt_printf(out, "freed:\t%zu\n", bc->nr_freed); +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +index 66e86d1a178d..ca3c1b145330 100644 +--- a/fs/bcachefs/btree_cache.h ++++ b/fs/bcachefs/btree_cache.h +@@ -128,19 +128,27 @@ static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned i + } else { + 
unsigned idx = id - BTREE_ID_NR; + +- EBUG_ON(idx >= c->btree_roots_extra.nr); ++ /* This can happen when we're called from btree_node_scan */ ++ if (idx >= c->btree_roots_extra.nr) ++ return NULL; ++ + return &c->btree_roots_extra.data[idx]; + } + } + + static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) + { +- return bch2_btree_id_root(c, b->c.btree_id)->b; ++ struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); ++ ++ return r ? r->b : NULL; + } + +-const char *bch2_btree_id_str(enum btree_id); ++const char *bch2_btree_id_str(enum btree_id); /* avoid */ + void bch2_btree_id_to_text(struct printbuf *, enum btree_id); ++void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned); + ++void __bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, ++ enum btree_id, unsigned, struct bkey_s_c); + void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); + void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); + void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +index 81dcf9e512c0..dd1d9b74076e 100644 +--- a/fs/bcachefs/btree_gc.c ++++ b/fs/bcachefs/btree_gc.c +@@ -29,6 +29,7 @@ + #include "move.h" + #include "recovery_passes.h" + #include "reflink.h" ++#include "recovery.h" + #include "replicas.h" + #include "super-io.h" + #include "trace.h" +@@ -56,8 +57,8 @@ void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p) + { + prt_str(out, bch2_gc_phase_strs[p->phase]); + prt_char(out, ' '); +- bch2_btree_id_to_text(out, p->btree); +- prt_printf(out, " l=%u ", p->level); ++ bch2_btree_id_level_to_text(out, p->btree, p->level); ++ prt_char(out, ' '); + bch2_bpos_to_text(out, p->pos); + } + +@@ -209,8 +210,9 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * + if (bpos_eq(expected_start, cur->data->min_key)) + return 0; + +- 
prt_printf(&buf, " at btree %s level %u:\n parent: ", +- bch2_btree_id_str(b->c.btree_id), b->c.level); ++ prt_printf(&buf, " at "); ++ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); ++ prt_printf(&buf, ":\n parent: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + if (prev) { +@@ -277,8 +279,9 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, + if (bpos_eq(child->key.k.p, b->key.k.p)) + return 0; + +- prt_printf(&buf, "at btree %s level %u:\n parent: ", +- bch2_btree_id_str(b->c.btree_id), b->c.level); ++ prt_printf(&buf, " at "); ++ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); ++ prt_printf(&buf, ":\n parent: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + prt_str(&buf, "\n child: "); +@@ -341,14 +344,14 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct + ret = PTR_ERR_OR_ZERO(cur); + + printbuf_reset(&buf); ++ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1); ++ prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); + + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), +- trans, btree_node_unreadable, +- "Topology repair: unreadable btree node at btree %s level %u:\n" ++ trans, btree_node_read_error, ++ "Topology repair: unreadable btree node at\n" + " %s", +- bch2_btree_id_str(b->c.btree_id), +- b->c.level - 1, + buf.buf)) { + bch2_btree_node_evict(trans, cur_k.k); + cur = NULL; +@@ -357,11 +360,9 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct + if (ret) + break; + +- if (!btree_id_is_alloc(b->c.btree_id)) { +- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); +- if (ret) +- break; +- } ++ ret = bch2_btree_lost_data(c, b->c.btree_id); ++ if (ret) ++ break; + continue; + } + +@@ -370,7 +371,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct + break; + + if (bch2_btree_node_is_stale(c, cur)) { 
+- bch_info(c, "btree node %s older than nodes found by scanning", buf.buf); ++ bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf); + six_unlock_read(&cur->c.lock); + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, +@@ -478,14 +479,13 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct + } + + printbuf_reset(&buf); ++ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); ++ prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + if (mustfix_fsck_err_on(!have_child, + trans, btree_node_topology_interior_node_empty, +- "empty interior btree node at btree %s level %u\n" +- " %s", +- bch2_btree_id_str(b->c.btree_id), +- b->c.level, buf.buf)) ++ "empty interior btree node at %s", buf.buf)) + ret = DROP_THIS_NODE; + err: + fsck_err: +@@ -511,6 +511,7 @@ int bch2_check_topology(struct bch_fs *c) + { + struct btree_trans *trans = bch2_trans_get(c); + struct bpos pulled_from_scan = POS_MIN; ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + bch2_trans_srcu_unlock(trans); +@@ -519,19 +520,22 @@ int bch2_check_topology(struct bch_fs *c) + struct btree_root *r = bch2_btree_id_root(c, i); + bool reconstructed_root = false; + ++ printbuf_reset(&buf); ++ bch2_btree_id_to_text(&buf, i); ++ + if (r->error) { +- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); ++ ret = bch2_btree_lost_data(c, i); + if (ret) + break; + reconstruct_root: +- bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i)); ++ bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); + + r->alive = false; + r->error = 0; + + if (!bch2_btree_has_scanned_nodes(c, i)) { + mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing, +- "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); ++ "no nodes found for btree %s, continue?", buf.buf); + bch2_btree_root_alloc_fake_trans(trans, i, 
0); + } else { + bch2_btree_root_alloc_fake_trans(trans, i, 1); +@@ -560,13 +564,14 @@ int bch2_check_topology(struct bch_fs *c) + if (!reconstructed_root) + goto reconstruct_root; + +- bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); ++ bch_err(c, "empty btree root %s", buf.buf); + bch2_btree_root_alloc_fake_trans(trans, i, 0); + r->alive = false; + ret = 0; + } + } + fsck_err: ++ printbuf_exit(&buf); + bch2_trans_put(trans); + return ret; + } +@@ -713,6 +718,7 @@ static int bch2_gc_btrees(struct bch_fs *c) + { + struct btree_trans *trans = bch2_trans_get(c); + enum btree_id ids[BTREE_ID_NR]; ++ struct printbuf buf = PRINTBUF; + unsigned i; + int ret = 0; + +@@ -727,14 +733,9 @@ static int bch2_gc_btrees(struct bch_fs *c) + continue; + + ret = bch2_gc_btree(trans, btree, true); +- +- if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), +- trans, btree_node_read_error, +- "btree node read error for %s", +- bch2_btree_id_str(btree))) +- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); + } +-fsck_err: ++ ++ printbuf_exit(&buf); + bch2_trans_put(trans); + bch_err_fn(c, ret); + return ret; +@@ -802,7 +803,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + old = bch2_alloc_to_v4(k, &old_convert); + gc = new = *old; + +- percpu_down_read(&c->mark_lock); + __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); + + old_gc = gc; +@@ -813,7 +813,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + gc.data_type = old->data_type; + gc.dirty_sectors = old->dirty_sectors; + } +- percpu_up_read(&c->mark_lock); + + /* + * gc.data_type doesn't yet include need_discard & need_gc_gen states - +@@ -831,11 +830,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, + * safe w.r.t. 
transaction restarts, so fixup the gc_bucket so + * we don't run it twice: + */ +- percpu_down_read(&c->mark_lock); + struct bucket *gc_m = gc_bucket(ca, iter->pos.offset); + gc_m->data_type = gc.data_type; + gc_m->dirty_sectors = gc.dirty_sectors; +- percpu_up_read(&c->mark_lock); + } + + if (fsck_err_on(new.data_type != gc.data_type, +@@ -895,11 +892,11 @@ static int bch2_gc_alloc_done(struct bch_fs *c) + + for_each_member_device(c, ca) { + ret = bch2_trans_run(c, +- for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc, ++ for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + POS(ca->dev_idx, ca->mi.nbuckets - 1), + BTREE_ITER_slots|BTREE_ITER_prefetch, k, +- NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, ++ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_alloc_write_key(trans, &iter, ca, k))); + if (ret) { + bch2_dev_put(ca); +@@ -928,98 +925,6 @@ static int bch2_gc_alloc_start(struct bch_fs *c) + return ret; + } + +-static int bch2_gc_write_reflink_key(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_s_c k, +- size_t *idx) +-{ +- struct bch_fs *c = trans->c; +- const __le64 *refcount = bkey_refcount_c(k); +- struct printbuf buf = PRINTBUF; +- struct reflink_gc *r; +- int ret = 0; +- +- if (!refcount) +- return 0; +- +- while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && +- r->offset < k.k->p.offset) +- ++*idx; +- +- if (!r || +- r->offset != k.k->p.offset || +- r->size != k.k->size) { +- bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); +- return -EINVAL; +- } +- +- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), +- trans, reflink_v_refcount_wrong, +- "reflink key has wrong refcount:\n" +- " %s\n" +- " should be %u", +- (bch2_bkey_val_to_text(&buf, c, k), buf.buf), +- r->refcount)) { +- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); +- ret = PTR_ERR_OR_ZERO(new); +- if (ret) +- goto out; +- +- if (!r->refcount) +- new->k.type = KEY_TYPE_deleted; +- 
else +- *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); +- ret = bch2_trans_update(trans, iter, new, 0); +- } +-out: +-fsck_err: +- printbuf_exit(&buf); +- return ret; +-} +- +-static int bch2_gc_reflink_done(struct bch_fs *c) +-{ +- size_t idx = 0; +- +- int ret = bch2_trans_run(c, +- for_each_btree_key_commit(trans, iter, +- BTREE_ID_reflink, POS_MIN, +- BTREE_ITER_prefetch, k, +- NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- bch2_gc_write_reflink_key(trans, &iter, k, &idx))); +- c->reflink_gc_nr = 0; +- return ret; +-} +- +-static int bch2_gc_reflink_start(struct bch_fs *c) +-{ +- c->reflink_gc_nr = 0; +- +- int ret = bch2_trans_run(c, +- for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, +- BTREE_ITER_prefetch, k, ({ +- const __le64 *refcount = bkey_refcount_c(k); +- +- if (!refcount) +- continue; +- +- struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, +- c->reflink_gc_nr++, GFP_KERNEL); +- if (!r) { +- ret = -BCH_ERR_ENOMEM_gc_reflink_start; +- break; +- } +- +- r->offset = k.k->p.offset; +- r->size = k.k->size; +- r->refcount = 0; +- 0; +- }))); +- +- bch_err_fn(c, ret); +- return ret; +-} +- + static int bch2_gc_write_stripes_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +@@ -1171,7 +1076,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, + if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) + return -EROFS; + +- percpu_down_read(&c->mark_lock); + rcu_read_lock(); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); +@@ -1180,7 +1084,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, + + if (dev_ptr_stale(ca, ptr) > 16) { + rcu_read_unlock(); +- percpu_up_read(&c->mark_lock); + goto update; + } + } +@@ -1195,7 +1098,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, + *gen = ptr->gen; + } + rcu_read_unlock(); +- percpu_up_read(&c->mark_lock); + return 0; + update: + u = bch2_bkey_make_mut(trans, iter, &k, 0); +@@ -1224,7 +1126,6 @@ 
static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev + return ret; + + a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; +- alloc_data_type_set(&a_mut->v, a_mut->v.data_type); + + return bch2_trans_update(trans, iter, &a_mut->k_i, 0); + } +@@ -1337,9 +1238,16 @@ void bch2_gc_gens_async(struct bch_fs *c) + bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); + } + +-void bch2_fs_gc_init(struct bch_fs *c) ++void bch2_fs_btree_gc_exit(struct bch_fs *c) + { +- seqcount_init(&c->gc_pos_lock); ++} + ++int bch2_fs_btree_gc_init(struct bch_fs *c) ++{ ++ seqcount_init(&c->gc_pos_lock); + INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); ++ ++ init_rwsem(&c->gc_lock); ++ mutex_init(&c->gc_gens_lock); ++ return 0; + } +diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +index 8a47e8bd0791..9693a90a48a2 100644 +--- a/fs/bcachefs/btree_gc.h ++++ b/fs/bcachefs/btree_gc.h +@@ -82,6 +82,8 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); + + int bch2_gc_gens(struct bch_fs *); + void bch2_gc_gens_async(struct bch_fs *); +-void bch2_fs_gc_init(struct bch_fs *); ++ ++void bch2_fs_btree_gc_exit(struct bch_fs *); ++int bch2_fs_btree_gc_init(struct bch_fs *); + + #endif /* _BCACHEFS_BTREE_GC_H */ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +index 839d68802e42..e371e60e3133 100644 +--- a/fs/bcachefs/btree_io.c ++++ b/fs/bcachefs/btree_io.c +@@ -25,9 +25,8 @@ + + static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) + { +- prt_printf(out, "btree=%s l=%u seq %llux\n", +- bch2_btree_id_str(BTREE_NODE_ID(bn)), +- (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq); ++ bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); ++ prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn)); + prt_str(out, "min: "); + bch2_bpos_to_text(out, bn->min_key); + prt_newline(out); +@@ -490,8 +489,8 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) + if 
(b->nsets == MAX_BSETS && + !btree_node_write_in_flight(b) && + should_compact_all(c, b)) { +- bch2_btree_node_write(c, b, SIX_LOCK_write, +- BTREE_WRITE_init_next_bset); ++ bch2_btree_node_write_trans(trans, b, SIX_LOCK_write, ++ BTREE_WRITE_init_next_bset); + reinit_iter = true; + } + +@@ -832,13 +831,32 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + return ret; + } + ++static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b, ++ struct bkey_s_c k, ++ enum bch_validate_flags flags) ++{ ++ return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) { ++ .from = BKEY_VALIDATE_btree_node, ++ .level = b->c.level, ++ .btree = b->c.btree_id, ++ .flags = flags ++ }); ++} ++ + static int bset_key_validate(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, +- bool updated_range, int rw) ++ bool updated_range, ++ enum bch_validate_flags flags) + { +- return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?: +- (!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?: +- (rw == WRITE ? bch2_bkey_val_validate(c, k, 0) : 0); ++ struct bkey_validate_context from = (struct bkey_validate_context) { ++ .from = BKEY_VALIDATE_btree_node, ++ .level = b->c.level, ++ .btree = b->c.btree_id, ++ .flags = flags, ++ }; ++ return __bch2_bkey_validate(c, k, from) ?: ++ (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?: ++ (flags & BCH_VALIDATE_write ? 
btree_node_bkey_val_validate(c, b, k, flags) : 0); + } + + static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, +@@ -855,7 +873,21 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, + + struct bkey tmp; + struct bkey_s u = __bkey_disassemble(b, k, &tmp); +- return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); ++ return !__bch2_bkey_validate(c, u.s_c, ++ (struct bkey_validate_context) { ++ .from = BKEY_VALIDATE_btree_node, ++ .level = b->c.level, ++ .btree = b->c.btree_id, ++ .flags = BCH_VALIDATE_silent ++ }); ++} ++ ++static inline int btree_node_read_bkey_cmp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) ++{ ++ return bch2_bkey_cmp_packed(b, l, r) ++ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l); + } + + static int validate_bset_keys(struct bch_fs *c, struct btree *b, +@@ -918,7 +950,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + BSET_BIG_ENDIAN(i), write, + &b->format, k); + +- if (prev && bkey_iter_cmp(b, prev, k) > 0) { ++ if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) { + struct bkey up = bkey_unpack_key(b, prev); + + printbuf_reset(&buf); +@@ -965,6 +997,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, + got_good_key: + le16_add_cpu(&i->u64s, -next_good_key); + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); ++ set_btree_node_need_rewrite(b); + } + fsck_err: + printbuf_exit(&buf); +@@ -1038,39 +1071,51 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + + while (b->written < (ptr_written ?: btree_sectors(c))) { + unsigned sectors; +- struct nonce nonce; + bool first = !b->written; +- bool csum_bad; + +- if (!b->written) { ++ if (first) { ++ bne = NULL; + i = &b->data->keys; ++ } else { ++ bne = write_block(b); ++ i = &bne->keys; + +- btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), +- -BCH_ERR_btree_node_read_err_want_retry, +- c, ca, b, i, NULL, 
+- bset_unknown_csum, +- "unknown checksum type %llu", BSET_CSUM_TYPE(i)); +- +- nonce = btree_nonce(i, b->written << 9); ++ if (i->seq != b->data->keys.seq) ++ break; ++ } + +- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); +- csum_bad = bch2_crc_cmp(b->data->csum, csum); +- if (csum_bad) +- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); ++ struct nonce nonce = btree_nonce(i, b->written << 9); ++ bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); + +- btree_err_on(csum_bad, +- -BCH_ERR_btree_node_read_err_want_retry, +- c, ca, b, i, NULL, +- bset_bad_csum, +- "%s", +- (printbuf_reset(&buf), +- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), +- buf.buf)); +- +- ret = bset_encrypt(c, i, b->written << 9); +- if (bch2_fs_fatal_err_on(ret, c, +- "decrypting btree node: %s", bch2_err_str(ret))) +- goto fsck_err; ++ btree_err_on(!good_csum_type, ++ bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) ++ ? -BCH_ERR_btree_node_read_err_must_retry ++ : -BCH_ERR_btree_node_read_err_want_retry, ++ c, ca, b, i, NULL, ++ bset_unknown_csum, ++ "unknown checksum type %llu", BSET_CSUM_TYPE(i)); ++ ++ if (first) { ++ if (good_csum_type) { ++ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); ++ bool csum_bad = bch2_crc_cmp(b->data->csum, csum); ++ if (csum_bad) ++ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); ++ ++ btree_err_on(csum_bad, ++ -BCH_ERR_btree_node_read_err_want_retry, ++ c, ca, b, i, NULL, ++ bset_bad_csum, ++ "%s", ++ (printbuf_reset(&buf), ++ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), ++ buf.buf)); ++ ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "decrypting btree node: %s", bch2_err_str(ret))) ++ goto fsck_err; ++ } + + btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), +@@ -1081,37 +1126,26 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + + 
sectors = vstruct_sectors(b->data, c->block_bits); + } else { +- bne = write_block(b); +- i = &bne->keys; +- +- if (i->seq != b->data->keys.seq) +- break; +- +- btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), +- -BCH_ERR_btree_node_read_err_want_retry, +- c, ca, b, i, NULL, +- bset_unknown_csum, +- "unknown checksum type %llu", BSET_CSUM_TYPE(i)); +- +- nonce = btree_nonce(i, b->written << 9); +- struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); +- csum_bad = bch2_crc_cmp(bne->csum, csum); +- if (ca && csum_bad) +- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); +- +- btree_err_on(csum_bad, +- -BCH_ERR_btree_node_read_err_want_retry, +- c, ca, b, i, NULL, +- bset_bad_csum, +- "%s", +- (printbuf_reset(&buf), +- bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), +- buf.buf)); +- +- ret = bset_encrypt(c, i, b->written << 9); +- if (bch2_fs_fatal_err_on(ret, c, +- "decrypting btree node: %s", bch2_err_str(ret))) +- goto fsck_err; ++ if (good_csum_type) { ++ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ bool csum_bad = bch2_crc_cmp(bne->csum, csum); ++ if (ca && csum_bad) ++ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); ++ ++ btree_err_on(csum_bad, ++ -BCH_ERR_btree_node_read_err_want_retry, ++ c, ca, b, i, NULL, ++ bset_bad_csum, ++ "%s", ++ (printbuf_reset(&buf), ++ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), ++ buf.buf)); ++ ++ ret = bset_encrypt(c, i, b->written << 9); ++ if (bch2_fs_fatal_err_on(ret, c, ++ "decrypting btree node: %s", bch2_err_str(ret))) ++ goto fsck_err; ++ } + + sectors = vstruct_sectors(bne, c->block_bits); + } +@@ -1216,7 +1250,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + struct bkey tmp; + struct bkey_s u = __bkey_disassemble(b, k, &tmp); + +- ret = bch2_bkey_val_validate(c, u.s_c, READ); ++ ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); + if (ret == -BCH_ERR_fsck_delete_bkey || + (bch2_inject_invalid_keys && + 
!bversion_cmp(u.k->bversion, MAX_VERSION))) { +@@ -1226,6 +1260,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + memmove_u64s_down(k, bkey_p_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + set_btree_bset_end(b, b->set); ++ set_btree_node_need_rewrite(b); + continue; + } + if (ret) +@@ -1339,13 +1374,18 @@ static void btree_node_read_work(struct work_struct *work) + rb->start_time); + bio_put(&rb->bio); + +- if (saw_error && ++ if ((saw_error || ++ btree_node_need_rewrite(b)) && + !btree_node_read_error(b) && + c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { +- printbuf_reset(&buf); +- bch2_bpos_to_text(&buf, b->key.k.p); +- bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", +- __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf); ++ if (saw_error) { ++ printbuf_reset(&buf); ++ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); ++ prt_str(&buf, " "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", ++ __func__, buf.buf); ++ } + + bch2_btree_node_rewrite_async(c, b); + } +@@ -1933,7 +1973,12 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + bool saw_error; + + int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), +- BKEY_TYPE_btree, WRITE); ++ (struct bkey_validate_context) { ++ .from = BKEY_VALIDATE_btree_node, ++ .level = b->c.level + 1, ++ .btree = b->c.btree_id, ++ .flags = BCH_VALIDATE_write, ++ }); + if (ret) { + bch2_fs_inconsistent(c, "invalid btree node key before write"); + return ret; +@@ -2300,6 +2345,34 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, + } + } + ++void bch2_btree_node_write_trans(struct btree_trans *trans, struct btree *b, ++ enum six_lock_type lock_type_held, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ ++ if (lock_type_held == SIX_LOCK_intent || ++ (lock_type_held == SIX_LOCK_read && ++ 
six_lock_tryupgrade(&b->c.lock))) { ++ __bch2_btree_node_write(c, b, flags); ++ ++ /* don't cycle lock unnecessarily: */ ++ if (btree_node_just_written(b) && ++ six_trylock_write(&b->c.lock)) { ++ bch2_btree_post_write_cleanup(c, b); ++ __bch2_btree_node_unlock_write(trans, b); ++ } ++ ++ if (lock_type_held == SIX_LOCK_read) ++ six_lock_downgrade(&b->c.lock); ++ } else { ++ __bch2_btree_node_write(c, b, flags); ++ if (lock_type_held == SIX_LOCK_write && ++ btree_node_just_written(b)) ++ bch2_btree_post_write_cleanup(c, b); ++ } ++} ++ + static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) + { + struct bucket_table *tbl; +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +index 9b01ca3de907..6f9e4a6dacf7 100644 +--- a/fs/bcachefs/btree_io.h ++++ b/fs/bcachefs/btree_io.h +@@ -144,11 +144,13 @@ enum btree_write_flags { + void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); + void bch2_btree_node_write(struct bch_fs *, struct btree *, + enum six_lock_type, unsigned); ++void bch2_btree_node_write_trans(struct btree_trans *, struct btree *, ++ enum six_lock_type, unsigned); + +-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, ++static inline void btree_node_write_if_need(struct btree_trans *trans, struct btree *b, + enum six_lock_type lock_held) + { +- bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); ++ bch2_btree_node_write_trans(trans, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); + } + + bool bch2_btree_flush_all_reads(struct bch_fs *); +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +index eef9b89c561d..5988219c6908 100644 +--- a/fs/bcachefs/btree_iter.c ++++ b/fs/bcachefs/btree_iter.c +@@ -270,8 +270,10 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) + BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) && + iter->pos.snapshot != iter->snapshot); + +- BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || +- bkey_gt(iter->pos, iter->k.p)); 
++ BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) : ++ !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) : ++ (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || ++ bkey_gt(iter->pos, iter->k.p))); + } + + static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) +@@ -327,7 +329,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k + void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos) + { +- bch2_trans_verify_not_unlocked(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + + struct btree_path *path; + struct trans_for_each_path_inorder_iter iter; +@@ -697,6 +699,19 @@ void bch2_trans_node_add(struct btree_trans *trans, + bch2_trans_revalidate_updates_in_node(trans, b); + } + ++void bch2_trans_node_drop(struct btree_trans *trans, ++ struct btree *b) ++{ ++ struct btree_path *path; ++ unsigned i, level = b->c.level; ++ ++ trans_for_each_path(trans, path, i) ++ if (path->l[level].b == b) { ++ btree_node_unlock(trans, path, level); ++ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); ++ } ++} ++ + /* + * A btree node has been modified in such a way as to invalidate iterators - fix + * them: +@@ -720,7 +735,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, + unsigned long trace_ip) + { + struct bch_fs *c = trans->c; +- struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; ++ struct btree_root *r = bch2_btree_id_root(c, path->btree_id); + enum six_lock_type lock_type; + unsigned i; + int ret; +@@ -728,7 +743,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans, + EBUG_ON(path->nodes_locked); + + while (1) { +- b = READ_ONCE(*rootp); ++ struct btree *b = READ_ONCE(r->b); ++ if (unlikely(!b)) { ++ BUG_ON(!r->error); ++ return r->error; ++ } ++ + path->level = READ_ONCE(b->c.level); + + if (unlikely(path->level < depth_want)) { +@@ -748,14 +768,12 @@ 
static inline int btree_path_lock_root(struct btree_trans *trans, + ret = btree_node_lock(trans, path, &b->c, + path->level, lock_type, trace_ip); + if (unlikely(ret)) { +- if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) +- continue; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + BUG(); + } + +- if (likely(b == READ_ONCE(*rootp) && ++ if (likely(b == READ_ONCE(r->b) && + b->c.level == path->level && + !race_fault())) { + for (i = 0; i < path->level; i++) +@@ -825,6 +843,8 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p + + bch2_bkey_buf_init(&tmp); + ++ jiter->fail_if_too_many_whiteouts = true; ++ + while (nr-- && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; +@@ -1000,7 +1020,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) + + bch2_trans_unlock(trans); + cond_resched(); +- trans_set_locked(trans); ++ trans_set_locked(trans, false); + + if (unlikely(trans->memory_allocation_failure)) { + struct closure cl; +@@ -1267,7 +1287,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, + { + int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos); + +- bch2_trans_verify_not_in_restart(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + EBUG_ON(!trans->paths[path_idx].ref); + + trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos); +@@ -1427,17 +1447,31 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_ + (void *) trans->last_begin_ip); + } + +-void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) ++static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) + { ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct printbuf buf = PRINTBUF; ++ bch2_prt_backtrace(&buf, &trans->last_restarted_trace); ++ panic("in transaction restart: %s, last restarted by\n%s", ++ bch2_err_str(trans->restarted), ++ buf.buf); ++#else + panic("in transaction restart: %s, last restarted by 
%pS\n", + bch2_err_str(trans->restarted), + (void *) trans->last_restarted_ip); ++#endif + } + +-void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans) ++void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans) + { +- panic("trans should be locked, unlocked by %pS\n", +- (void *) trans->last_unlock_ip); ++ if (trans->restarted) ++ bch2_trans_in_restart_error(trans); ++ ++ if (!trans->locked) ++ panic("trans should be locked, unlocked by %pS\n", ++ (void *) trans->last_unlock_ip); ++ ++ BUG(); + } + + noinline __cold +@@ -1450,10 +1484,11 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) + trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + +- prt_printf(buf, "update: btree=%s cached=%u %pS\n", +- bch2_btree_id_str(i->btree_id), +- i->cached, +- (void *) i->ip_allocated); ++ prt_str(buf, "update: btree="); ++ bch2_btree_id_to_text(buf, i->btree_id); ++ prt_printf(buf, " cached=%u %pS\n", ++ i->cached, ++ (void *) i->ip_allocated); + + prt_printf(buf, " old "); + bch2_bkey_val_to_text(buf, trans->c, old); +@@ -1486,13 +1521,13 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra + { + struct btree_path *path = trans->paths + path_idx; + +- prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ", ++ prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ", + path_idx, path->ref, path->intent_ref, + path->preserve ? 'P' : ' ', + path->should_be_locked ? 'S' : ' ', +- path->cached ? 'C' : 'B', +- bch2_btree_id_str(path->btree_id), +- path->level); ++ path->cached ? 
'C' : 'B'); ++ bch2_btree_id_level_to_text(out, path->btree_id, path->level); ++ prt_str(out, " pos "); + bch2_bpos_to_text(out, path->pos); + + if (!path->cached && btree_node_locked(path, path->level)) { +@@ -1717,8 +1752,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, + struct trans_for_each_path_inorder_iter iter; + btree_path_idx_t path_pos = 0, path_idx; + +- bch2_trans_verify_not_unlocked(trans); +- bch2_trans_verify_not_in_restart(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + bch2_trans_verify_locks(trans); + + btree_trans_sort_paths(trans); +@@ -1833,7 +1867,7 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * + !bkey_eq(path->pos, ck->key.pos)); + + *u = ck->k->k; +- k = bkey_i_to_s_c(ck->k); ++ k = (struct bkey_s_c) { u, &ck->k->v }; + } + + return k; +@@ -1843,7 +1877,6 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * + return (struct bkey_s_c) { u, NULL }; + } + +- + void bch2_set_btree_iter_dontneed(struct btree_iter *iter) + { + struct btree_trans *trans = iter->trans; +@@ -1870,7 +1903,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) + struct btree_trans *trans = iter->trans; + int ret; + +- bch2_trans_verify_not_unlocked(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + + iter->path = bch2_btree_path_set_pos(trans, iter->path, + btree_iter_search_key(iter), +@@ -1945,7 +1978,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) + int ret; + + EBUG_ON(trans->paths[iter->path].cached); +- bch2_trans_verify_not_in_restart(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + bch2_btree_iter_verify(iter); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); +@@ -2101,7 +2134,7 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, + { + struct btree_path *path = btree_iter_path(trans, iter); + +- return bch2_journal_keys_peek_upto(trans->c, iter->btree_id, ++ return 
bch2_journal_keys_peek_max(trans->c, iter->btree_id, + path->level, + path->pos, + end_pos, +@@ -2124,21 +2157,47 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, + } + + static noinline +-struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_s_c k) ++void btree_trans_peek_journal(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c *k) + { + struct btree_path *path = btree_iter_path(trans, iter); + struct bkey_i *next_journal = + bch2_btree_journal_peek(trans, iter, +- k.k ? k.k->p : path_l(path)->b->key.k.p); +- ++ k->k ? k->k->p : path_l(path)->b->key.k.p); + if (next_journal) { + iter->k = next_journal->k; +- k = bkey_i_to_s_c(next_journal); ++ *k = bkey_i_to_s_c(next_journal); + } ++} + +- return k; ++static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos end_pos) ++{ ++ struct btree_path *path = btree_iter_path(trans, iter); ++ ++ return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, ++ path->level, ++ path->pos, ++ end_pos, ++ &iter->journal_idx); ++} ++ ++static noinline ++void btree_trans_peek_prev_journal(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c *k) ++{ ++ struct btree_path *path = btree_iter_path(trans, iter); ++ struct bkey_i *next_journal = ++ bch2_btree_journal_peek_prev(trans, iter, ++ k->k ? 
k->k->p : path_l(path)->b->key.k.p); ++ ++ if (next_journal) { ++ iter->k = next_journal->k; ++ *k = bkey_i_to_s_c(next_journal); ++ } + } + + /* +@@ -2154,8 +2213,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos + struct bkey_s_c k; + int ret; + +- bch2_trans_verify_not_in_restart(trans); +- bch2_trans_verify_not_unlocked(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + + if ((iter->flags & BTREE_ITER_key_cache_fill) && + bpos_eq(iter->pos, pos)) +@@ -2181,13 +2239,17 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos + if (unlikely(ret)) + return bkey_s_c_err(ret); + +- btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); +- + k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); +- if (k.k && !bkey_err(k)) { +- iter->k = u; +- k.k = &iter->k; +- } ++ if (!k.k) ++ return k; ++ ++ if ((iter->flags & BTREE_ITER_all_snapshots) && ++ !bpos_eq(pos, k.k->p)) ++ return bkey_s_c_null; ++ ++ iter->k = u; ++ k.k = &iter->k; ++ btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); + return k; + } + +@@ -2201,8 +2263,6 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp + bch2_btree_iter_verify(iter); + + while (1) { +- struct btree_path_level *l; +- + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_intent, + btree_iter_ip_allocated(iter)); +@@ -2212,17 +2272,17 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); +- goto out; ++ break; + } + + struct btree_path *path = btree_iter_path(trans, iter); +- l = path_l(path); ++ struct btree_path_level *l = path_l(path); + + if (unlikely(!l->b)) { + /* No btree nodes at requested level: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = 
bkey_s_c_null; +- goto out; ++ break; + } + + btree_path_set_should_be_locked(trans, path); +@@ -2233,15 +2293,14 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp + k.k && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + k = k2; +- ret = bkey_err(k); +- if (ret) { ++ if (bkey_err(k)) { + bch2_btree_iter_set_pos(iter, iter->pos); +- goto out; ++ break; + } + } + + if (unlikely(iter->flags & BTREE_ITER_with_journal)) +- k = btree_trans_peek_journal(trans, iter, k); ++ btree_trans_peek_journal(trans, iter, &k); + + if (unlikely((iter->flags & BTREE_ITER_with_updates) && + trans->nr_updates)) +@@ -2270,32 +2329,32 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp + /* End of btree: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; +- goto out; ++ break; + } + } +-out: +- bch2_btree_iter_verify(iter); + ++ bch2_btree_iter_verify(iter); + return k; + } + + /** +- * bch2_btree_iter_peek_upto() - returns first key greater than or equal to ++ * bch2_btree_iter_peek_max() - returns first key greater than or equal to + * iterator's current position + * @iter: iterator to peek from + * @end: search limit: returns keys less than or equal to @end + * + * Returns: key if found, or an error extractable with bkey_err(). 
+ */ +-struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) ++struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end) + { + struct btree_trans *trans = iter->trans; + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_s_c k; +- struct bpos iter_pos; ++ struct bpos iter_pos = iter->pos; + int ret; + +- bch2_trans_verify_not_unlocked(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); ++ bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); + + if (iter->update_path) { +@@ -2304,8 +2363,6 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e + iter->update_path = 0; + } + +- bch2_btree_iter_verify_entry_exit(iter); +- + while (1) { + k = __bch2_btree_iter_peek(iter, search_key); + if (unlikely(!k.k)) +@@ -2313,75 +2370,75 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e + if (unlikely(bkey_err(k))) + goto out_no_locked; + +- /* +- * We need to check against @end before FILTER_SNAPSHOTS because +- * if we get to a different inode that requested we might be +- * seeing keys for a different snapshot tree that will all be +- * filtered out. +- * +- * But we can't do the full check here, because bkey_start_pos() +- * isn't monotonically increasing before FILTER_SNAPSHOTS, and +- * that's what we check against in extents mode: +- */ +- if (unlikely(!(iter->flags & BTREE_ITER_is_extents) +- ? bkey_gt(k.k->p, end) +- : k.k->p.inode > end.inode)) +- goto end; ++ if (iter->flags & BTREE_ITER_filter_snapshots) { ++ /* ++ * We need to check against @end before FILTER_SNAPSHOTS because ++ * if we get to a different inode that requested we might be ++ * seeing keys for a different snapshot tree that will all be ++ * filtered out. 
++ * ++ * But we can't do the full check here, because bkey_start_pos() ++ * isn't monotonically increasing before FILTER_SNAPSHOTS, and ++ * that's what we check against in extents mode: ++ */ ++ if (unlikely(!(iter->flags & BTREE_ITER_is_extents) ++ ? bkey_gt(k.k->p, end) ++ : k.k->p.inode > end.inode)) ++ goto end; ++ ++ if (iter->update_path && ++ !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { ++ bch2_path_put_nokeep(trans, iter->update_path, ++ iter->flags & BTREE_ITER_intent); ++ iter->update_path = 0; ++ } + +- if (iter->update_path && +- !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { +- bch2_path_put_nokeep(trans, iter->update_path, +- iter->flags & BTREE_ITER_intent); +- iter->update_path = 0; +- } ++ if ((iter->flags & BTREE_ITER_intent) && ++ !(iter->flags & BTREE_ITER_is_extents) && ++ !iter->update_path) { ++ struct bpos pos = k.k->p; + +- if ((iter->flags & BTREE_ITER_filter_snapshots) && +- (iter->flags & BTREE_ITER_intent) && +- !(iter->flags & BTREE_ITER_is_extents) && +- !iter->update_path) { +- struct bpos pos = k.k->p; ++ if (pos.snapshot < iter->snapshot) { ++ search_key = bpos_successor(k.k->p); ++ continue; ++ } + +- if (pos.snapshot < iter->snapshot) { +- search_key = bpos_successor(k.k->p); +- continue; +- } ++ pos.snapshot = iter->snapshot; + +- pos.snapshot = iter->snapshot; ++ /* ++ * advance, same as on exit for iter->path, but only up ++ * to snapshot ++ */ ++ __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); ++ iter->update_path = iter->path; ++ ++ iter->update_path = bch2_btree_path_set_pos(trans, ++ iter->update_path, pos, ++ iter->flags & BTREE_ITER_intent, ++ _THIS_IP_); ++ ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); ++ if (unlikely(ret)) { ++ k = bkey_s_c_err(ret); ++ goto out_no_locked; ++ } ++ } + + /* +- * advance, same as on exit for iter->path, but only up +- * to snapshot ++ * We can never have a key in a leaf node at POS_MAX, so ++ * we don't 
have to check these successor() calls: + */ +- __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); +- iter->update_path = iter->path; +- +- iter->update_path = bch2_btree_path_set_pos(trans, +- iter->update_path, pos, +- iter->flags & BTREE_ITER_intent, +- _THIS_IP_); +- ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); +- if (unlikely(ret)) { +- k = bkey_s_c_err(ret); +- goto out_no_locked; ++ if (!bch2_snapshot_is_ancestor(trans->c, ++ iter->snapshot, ++ k.k->p.snapshot)) { ++ search_key = bpos_successor(k.k->p); ++ continue; + } +- } + +- /* +- * We can never have a key in a leaf node at POS_MAX, so +- * we don't have to check these successor() calls: +- */ +- if ((iter->flags & BTREE_ITER_filter_snapshots) && +- !bch2_snapshot_is_ancestor(trans->c, +- iter->snapshot, +- k.k->p.snapshot)) { +- search_key = bpos_successor(k.k->p); +- continue; +- } +- +- if (bkey_whiteout(k.k) && +- !(iter->flags & BTREE_ITER_all_snapshots)) { +- search_key = bkey_successor(iter, k.k->p); +- continue; ++ if (bkey_whiteout(k.k) && ++ !(iter->flags & BTREE_ITER_key_cache_fill)) { ++ search_key = bkey_successor(iter, k.k->p); ++ continue; ++ } + } + + /* +@@ -2451,127 +2508,204 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) + return bch2_btree_iter_peek(iter); + } + +-/** +- * bch2_btree_iter_peek_prev() - returns first key less than or equal to +- * iterator's current position +- * @iter: iterator to peek from +- * +- * Returns: key if found, or an error extractable with bkey_err(). 
+- */ +-struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) ++static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key) + { + struct btree_trans *trans = iter->trans; +- struct bpos search_key = iter->pos; +- struct bkey_s_c k; +- struct bkey saved_k; +- const struct bch_val *saved_v; +- btree_path_idx_t saved_path = 0; +- int ret; +- +- bch2_trans_verify_not_unlocked(trans); +- EBUG_ON(btree_iter_path(trans, iter)->cached || +- btree_iter_path(trans, iter)->level); +- +- if (iter->flags & BTREE_ITER_with_journal) +- return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); ++ struct bkey_s_c k, k2; + + bch2_btree_iter_verify(iter); +- bch2_btree_iter_verify_entry_exit(iter); +- +- if (iter->flags & BTREE_ITER_filter_snapshots) +- search_key.snapshot = U32_MAX; + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, +- iter->flags & BTREE_ITER_intent, +- btree_iter_ip_allocated(iter)); ++ iter->flags & BTREE_ITER_intent, ++ btree_iter_ip_allocated(iter)); + +- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); ++ int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); +- goto out_no_locked; ++ break; + } + + struct btree_path *path = btree_iter_path(trans, iter); ++ struct btree_path_level *l = path_l(path); ++ ++ if (unlikely(!l->b)) { ++ /* No btree nodes at requested level: */ ++ bch2_btree_iter_set_pos(iter, SPOS_MAX); ++ k = bkey_s_c_null; ++ break; ++ } ++ ++ btree_path_set_should_be_locked(trans, path); ++ ++ k = btree_path_level_peek_all(trans->c, l, &iter->k); ++ if (!k.k || bpos_gt(k.k->p, search_key)) { ++ k = btree_path_level_prev(trans, path, l, &iter->k); + +- k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); +- if (!k.k || +- ((iter->flags & BTREE_ITER_is_extents) +- ? 
bpos_ge(bkey_start_pos(k.k), search_key) +- : bpos_gt(k.k->p, search_key))) +- k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); ++ BUG_ON(k.k && bpos_gt(k.k->p, search_key)); ++ } ++ ++ if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && ++ k.k && ++ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { ++ k = k2; ++ if (bkey_err(k2)) { ++ bch2_btree_iter_set_pos(iter, iter->pos); ++ break; ++ } ++ } ++ ++ if (unlikely(iter->flags & BTREE_ITER_with_journal)) ++ btree_trans_peek_prev_journal(trans, iter, &k); + + if (unlikely((iter->flags & BTREE_ITER_with_updates) && + trans->nr_updates)) + bch2_btree_trans_peek_prev_updates(trans, iter, &k); + +- if (likely(k.k)) { +- if (iter->flags & BTREE_ITER_filter_snapshots) { +- if (k.k->p.snapshot == iter->snapshot) +- goto got_key; ++ if (likely(k.k && !bkey_deleted(k.k))) { ++ break; ++ } else if (k.k) { ++ search_key = bpos_predecessor(k.k->p); ++ } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { ++ /* Advance to previous leaf node: */ ++ search_key = bpos_predecessor(path->l[0].b->data->min_key); ++ } else { ++ /* Start of btree: */ ++ bch2_btree_iter_set_pos(iter, POS_MIN); ++ k = bkey_s_c_null; ++ break; ++ } ++ } ++ ++ bch2_btree_iter_verify(iter); ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to ++ * iterator's current position ++ * @iter: iterator to peek from ++ * @end: search limit: returns keys greater than or equal to @end ++ * ++ * Returns: key if found, or an error extractable with bkey_err(). 
++ */ ++struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) ++{ ++ if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && ++ !bkey_eq(iter->pos, POS_MAX)) { ++ /* ++ * bkey_start_pos(), for extents, is not monotonically ++ * increasing until after filtering for snapshots: ++ * ++ * Thus, for extents we need to search forward until we find a ++ * real visible extents - easiest to just use peek_slot() (which ++ * internally uses peek() for extents) ++ */ ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k)) ++ return k; ++ ++ if (!bkey_deleted(k.k) && ++ (!(iter->flags & BTREE_ITER_is_extents) || ++ bkey_lt(bkey_start_pos(k.k), iter->pos))) ++ return k; ++ } ++ ++ struct btree_trans *trans = iter->trans; ++ struct bpos search_key = iter->pos; ++ struct bkey_s_c k; ++ btree_path_idx_t saved_path = 0; ++ ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); ++ bch2_btree_iter_verify_entry_exit(iter); ++ EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); ++ ++ while (1) { ++ k = __bch2_btree_iter_peek_prev(iter, search_key); ++ if (unlikely(!k.k)) ++ goto end; ++ if (unlikely(bkey_err(k))) ++ goto out_no_locked; ++ ++ if (iter->flags & BTREE_ITER_filter_snapshots) { ++ struct btree_path *s = saved_path ? 
trans->paths + saved_path : NULL; ++ if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) { ++ /* ++ * If we have a saved candidate, and we're past ++ * the last possible snapshot overwrite, return ++ * it: ++ */ ++ bch2_path_put_nokeep(trans, iter->path, ++ iter->flags & BTREE_ITER_intent); ++ iter->path = saved_path; ++ saved_path = 0; ++ k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); ++ break; ++ } ++ ++ /* ++ * We need to check against @end before FILTER_SNAPSHOTS because ++ * if we get to a different inode that requested we might be ++ * seeing keys for a different snapshot tree that will all be ++ * filtered out. ++ */ ++ if (unlikely(bkey_lt(k.k->p, end))) ++ goto end; ++ ++ if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { ++ search_key = bpos_predecessor(k.k->p); ++ continue; ++ } + ++ if (k.k->p.snapshot != iter->snapshot) { + /* +- * If we have a saved candidate, and we're no +- * longer at the same _key_ (not pos), return +- * that candidate ++ * Have a key visible in iter->snapshot, but ++ * might have overwrites: - save it and keep ++ * searching. 
Unless it's a whiteout - then drop ++ * our previous saved candidate: + */ +- if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { +- bch2_path_put_nokeep(trans, iter->path, +- iter->flags & BTREE_ITER_intent); +- iter->path = saved_path; ++ if (saved_path) { ++ bch2_path_put_nokeep(trans, saved_path, ++ iter->flags & BTREE_ITER_intent); + saved_path = 0; +- iter->k = saved_k; +- k.v = saved_v; +- goto got_key; + } + +- if (bch2_snapshot_is_ancestor(trans->c, +- iter->snapshot, +- k.k->p.snapshot)) { +- if (saved_path) +- bch2_path_put_nokeep(trans, saved_path, +- iter->flags & BTREE_ITER_intent); ++ if (!bkey_whiteout(k.k)) { + saved_path = btree_path_clone(trans, iter->path, + iter->flags & BTREE_ITER_intent, + _THIS_IP_); +- path = btree_iter_path(trans, iter); +- trace_btree_path_save_pos(trans, path, trans->paths + saved_path); +- saved_k = *k.k; +- saved_v = k.v; ++ trace_btree_path_save_pos(trans, ++ trans->paths + iter->path, ++ trans->paths + saved_path); + } + + search_key = bpos_predecessor(k.k->p); + continue; + } +-got_key: +- if (bkey_whiteout(k.k) && +- !(iter->flags & BTREE_ITER_all_snapshots)) { ++ ++ if (bkey_whiteout(k.k)) { + search_key = bkey_predecessor(iter, k.k->p); +- if (iter->flags & BTREE_ITER_filter_snapshots) +- search_key.snapshot = U32_MAX; ++ search_key.snapshot = U32_MAX; + continue; + } +- +- btree_path_set_should_be_locked(trans, path); +- break; +- } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { +- /* Advance to previous leaf node: */ +- search_key = bpos_predecessor(path->l[0].b->data->min_key); +- } else { +- /* Start of btree: */ +- bch2_btree_iter_set_pos(iter, POS_MIN); +- k = bkey_s_c_null; +- goto out_no_locked; + } +- } + +- EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); ++ EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) : ++ iter->flags & BTREE_ITER_is_extents ? 
bkey_ge(bkey_start_pos(k.k), iter->pos) : ++ bkey_gt(k.k->p, iter->pos)); ++ ++ if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) : ++ iter->flags & BTREE_ITER_is_extents ? bkey_le(k.k->p, end) : ++ bkey_lt(k.k->p, end))) ++ goto end; ++ ++ break; ++ } + + /* Extents can straddle iter->pos: */ +- if (bkey_lt(k.k->p, iter->pos)) +- iter->pos = k.k->p; ++ iter->pos = bpos_min(iter->pos, k.k->p);; + + if (iter->flags & BTREE_ITER_filter_snapshots) + iter->pos.snapshot = iter->snapshot; +@@ -2581,8 +2715,11 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) + + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); +- + return k; ++end: ++ bch2_btree_iter_set_pos(iter, end); ++ k = bkey_s_c_null; ++ goto out_no_locked; + } + + /** +@@ -2607,7 +2744,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + struct bkey_s_c k; + int ret; + +- bch2_trans_verify_not_unlocked(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); +@@ -2632,6 +2769,10 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + goto out_no_locked; + } + ++ struct btree_path *path = btree_iter_path(trans, iter); ++ if (unlikely(!btree_path_node(path, path->level))) ++ return bkey_s_c_null; ++ + if ((iter->flags & BTREE_ITER_cached) || + !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) { + k = bkey_s_c_null; +@@ -2658,6 +2799,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); + if (unlikely(!k.k)) + goto out_no_locked; ++ ++ if (unlikely(k.k->type == KEY_TYPE_whiteout && ++ (iter->flags & BTREE_ITER_filter_snapshots) && ++ !(iter->flags & BTREE_ITER_key_cache_fill))) ++ iter->k.type = KEY_TYPE_deleted; + } else { + struct 
bpos next; + struct bpos end = iter->pos; +@@ -2671,7 +2817,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + struct btree_iter iter2; + + bch2_trans_copy_iter(&iter2, iter); +- k = bch2_btree_iter_peek_upto(&iter2, end); ++ k = bch2_btree_iter_peek_max(&iter2, end); + + if (k.k && !bkey_err(k)) { + swap(iter->key_cache_path, iter2.key_cache_path); +@@ -2682,7 +2828,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) + } else { + struct bpos pos = iter->pos; + +- k = bch2_btree_iter_peek_upto(iter, end); ++ k = bch2_btree_iter_peek_max(iter, end); + if (unlikely(bkey_err(k))) + bch2_btree_iter_set_pos(iter, pos); + else +@@ -2902,7 +3048,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans, + unsigned flags) + { + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, +- bch2_btree_iter_flags(trans, btree_id, flags), ++ bch2_btree_iter_flags(trans, btree_id, 0, flags), + _RET_IP_); + } + +@@ -2918,8 +3064,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, + flags |= BTREE_ITER_snapshot_field; + flags |= BTREE_ITER_all_snapshots; + ++ if (!depth && btree_id_cached(trans->c, btree_id)) ++ flags |= BTREE_ITER_with_key_cache; ++ + bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, +- __bch2_btree_iter_flags(trans, btree_id, flags), ++ bch2_btree_iter_flags(trans, btree_id, depth, flags), + _RET_IP_); + + iter->min_depth = depth; +@@ -3122,14 +3271,14 @@ u32 bch2_trans_begin(struct btree_trans *trans) + + trans->last_begin_ip = _RET_IP_; + +- trans_set_locked(trans); ++ trans_set_locked(trans, false); + + if (trans->restarted) { + bch2_btree_path_traverse_all(trans); + trans->notrace_relock_fail = false; + } + +- bch2_trans_verify_not_unlocked(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + return trans->restart_count; + } + +@@ -3228,7 +3377,7 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx) + trans->srcu_idx = 
srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; + trans->srcu_held = true; +- trans_set_locked(trans); ++ trans_set_locked(trans, false); + + closure_init_stack_release(&trans->ref); + return trans; +@@ -3262,6 +3411,9 @@ void bch2_trans_put(struct btree_trans *trans) + { + struct bch_fs *c = trans->c; + ++ if (trans->restarted) ++ bch2_trans_in_restart_error(trans); ++ + bch2_trans_unlock(trans); + + trans_for_each_update(trans, i) +@@ -3285,6 +3437,10 @@ void bch2_trans_put(struct btree_trans *trans) + closure_return_sync(&trans->ref); + trans->locking_wait.task = NULL; + ++#ifdef CONFIG_BCACHEFS_DEBUG ++ darray_exit(&trans->last_restarted_trace); ++#endif ++ + unsigned long *paths_allocated = trans->paths_allocated; + trans->paths_allocated = NULL; + trans->paths = NULL; +@@ -3338,8 +3494,9 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, + pid = owner ? owner->pid : 0; + rcu_read_unlock(); + +- prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b', +- b->level, bch2_btree_id_str(b->btree_id)); ++ prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); ++ bch2_btree_id_to_text(out, b->btree_id); ++ prt_printf(out, " l=%u:", b->level); + bch2_bpos_to_text(out, btree_node_pos(b)); + + prt_printf(out, "\t locks %u:%u:%u held by pid %u", +@@ -3378,11 +3535,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) + if (!path->nodes_locked) + continue; + +- prt_printf(out, " path %u %c l=%u %s:", +- idx, +- path->cached ? 'c' : 'b', +- path->level, +- bch2_btree_id_str(path->btree_id)); ++ prt_printf(out, " path %u %c ", ++ idx, ++ path->cached ? 
'c' : 'b'); ++ bch2_btree_id_to_text(out, path->btree_id); ++ prt_printf(out, " l=%u:", path->level); + bch2_bpos_to_text(out, path->pos); + prt_newline(out); + +@@ -3488,7 +3645,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) + #ifdef CONFIG_LOCKDEP + fs_reclaim_acquire(GFP_KERNEL); + struct btree_trans *trans = bch2_trans_get(c); +- trans_set_locked(trans); ++ trans_set_locked(trans, false); + bch2_trans_put(trans); + fs_reclaim_release(GFP_KERNEL); + #endif +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +index 0bda054f80d7..b9538e6e6d65 100644 +--- a/fs/bcachefs/btree_iter.h ++++ b/fs/bcachefs/btree_iter.h +@@ -23,6 +23,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path + { + unsigned idx = path - trans->paths; + ++ EBUG_ON(idx >= trans->nr_paths); + EBUG_ON(!test_bit(idx, trans->paths_allocated)); + if (unlikely(path->ref == U8_MAX)) { + bch2_dump_trans_paths_updates(trans); +@@ -36,6 +37,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path + + static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) + { ++ EBUG_ON(path - trans->paths >= trans->nr_paths); + EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated)); + EBUG_ON(!path->ref); + EBUG_ON(!path->intent_ref && intent); +@@ -234,12 +236,12 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *, + btree_path_idx_t, + unsigned, unsigned long); + +-static inline void bch2_trans_verify_not_unlocked(struct btree_trans *); ++static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *); + + static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + btree_path_idx_t path, unsigned flags) + { +- bch2_trans_verify_not_unlocked(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + + if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) + return 0; +@@ -324,38 +326,33 @@ static inline void 
bch2_trans_verify_not_restarted(struct btree_trans *trans, + bch2_trans_restart_error(trans, restart_count); + } + +-void __noreturn bch2_trans_in_restart_error(struct btree_trans *); ++void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *); + +-static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) ++static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans) + { +- if (trans->restarted) +- bch2_trans_in_restart_error(trans); +-} +- +-void __noreturn bch2_trans_unlocked_error(struct btree_trans *); +- +-static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans) +-{ +- if (!trans->locked) +- bch2_trans_unlocked_error(trans); ++ if (trans->restarted || !trans->locked) ++ bch2_trans_unlocked_or_in_restart_error(trans); + } + + __always_inline +-static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) ++static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) + { + BUG_ON(err <= 0); + BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); + + trans->restarted = err; +- trans->last_restarted_ip = _THIS_IP_; ++ trans->last_restarted_ip = ip; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ darray_exit(&trans->last_restarted_trace); ++ bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); ++#endif + return -err; + } + + __always_inline + static int btree_trans_restart(struct btree_trans *trans, int err) + { +- btree_trans_restart_nounlock(trans, err); +- return -err; ++ return btree_trans_restart_ip(trans, err, _THIS_IP_); + } + + bool bch2_btree_node_upgrade(struct btree_trans *, +@@ -375,6 +372,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans, + void bch2_trans_downgrade(struct btree_trans *); + + void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *); ++void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); + void 
bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); + + int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); +@@ -384,15 +382,21 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *); + struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); + struct btree *bch2_btree_iter_next_node(struct btree_iter *); + +-struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); ++struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos); + struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + + static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) + { +- return bch2_btree_iter_peek_upto(iter, SPOS_MAX); ++ return bch2_btree_iter_peek_max(iter, SPOS_MAX); ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos); ++ ++static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) ++{ ++ return bch2_btree_iter_peek_prev_min(iter, POS_MIN); + } + +-struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); + struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); + + struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); +@@ -443,10 +447,17 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna + + void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); + +-static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, +- unsigned btree_id, +- unsigned flags) ++static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, ++ unsigned btree_id, ++ unsigned level, ++ unsigned flags) + { ++ if (level || !btree_id_cached(trans->c, btree_id)) { ++ flags &= ~BTREE_ITER_cached; ++ flags &= ~BTREE_ITER_with_key_cache; ++ } else if (!(flags & BTREE_ITER_cached)) ++ flags |= BTREE_ITER_with_key_cache; ++ + if (!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) && + btree_id_is_extents(btree_id)) + flags |= BTREE_ITER_is_extents; 
+@@ -465,19 +476,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, + return flags; + } + +-static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, +- unsigned btree_id, +- unsigned flags) +-{ +- if (!btree_id_cached(trans->c, btree_id)) { +- flags &= ~BTREE_ITER_cached; +- flags &= ~BTREE_ITER_with_key_cache; +- } else if (!(flags & BTREE_ITER_cached)) +- flags |= BTREE_ITER_with_key_cache; +- +- return __bch2_btree_iter_flags(trans, btree_id, flags); +-} +- + static inline void bch2_trans_iter_init_common(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, +@@ -514,7 +512,7 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans, + if (__builtin_constant_p(btree_id) && + __builtin_constant_p(flags)) + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, +- bch2_btree_iter_flags(trans, btree_id, flags), ++ bch2_btree_iter_flags(trans, btree_id, 0, flags), + _THIS_IP_); + else + bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); +@@ -593,13 +591,18 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, + bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ + _btree_id, _pos, _flags, KEY_TYPE_##_type)) + ++static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k) ++{ ++ unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k)); ++ memcpy(dst_v, src_k.v, b); ++ if (unlikely(b < dst_size)) ++ memset(dst_v + b, 0, dst_size - b); ++} ++ + #define bkey_val_copy(_dst_v, _src_k) \ + do { \ +- unsigned b = min_t(unsigned, sizeof(*_dst_v), \ +- bkey_val_bytes(_src_k.k)); \ +- memcpy(_dst_v, _src_k.v, b); \ +- if (b < sizeof(*_dst_v)) \ +- memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b); \ ++ BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \ ++ __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \ + } while (0) + + static inline int __bch2_bkey_get_val_typed(struct btree_trans 
*trans, +@@ -608,17 +611,10 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, + unsigned val_size, void *val) + { + struct btree_iter iter; +- struct bkey_s_c k; +- int ret; +- +- k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); +- ret = bkey_err(k); ++ struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); ++ int ret = bkey_err(k); + if (!ret) { +- unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size); +- +- memcpy(val, k.v, b); +- if (unlikely(b < sizeof(*val))) +- memset((void *) val + b, 0, sizeof(*val) - b); ++ __bkey_val_copy(val, val_size, k); + bch2_trans_iter_exit(trans, &iter); + } + +@@ -677,12 +673,12 @@ static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, + bch2_btree_iter_peek(iter); + } + +-static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, ++static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter, + struct bpos end, + unsigned flags) + { + if (!(flags & BTREE_ITER_slots)) +- return bch2_btree_iter_peek_upto(iter, end); ++ return bch2_btree_iter_peek_max(iter, end); + + if (bkey_gt(iter->pos, end)) + return bkey_s_c_null; +@@ -746,7 +742,7 @@ transaction_restart: \ + _ret2 ?: trans_was_restarted(_trans, _restart_count); \ + }) + +-#define for_each_btree_key_upto_continue(_trans, _iter, \ ++#define for_each_btree_key_max_continue(_trans, _iter, \ + _end, _flags, _k, _do) \ + ({ \ + struct bkey_s_c _k; \ +@@ -754,7 +750,7 @@ transaction_restart: \ + \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ +- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ ++ (_k) = bch2_btree_iter_peek_max_type(&(_iter), \ + _end, (_flags)); \ + if (!(_k).k) \ + break; \ +@@ -768,9 +764,9 @@ transaction_restart: \ + }) + + #define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ +- for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) ++ 
for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) + +-#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ ++#define for_each_btree_key_max(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _do) \ + ({ \ + bch2_trans_begin(trans); \ +@@ -779,12 +775,12 @@ transaction_restart: \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ +- for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\ ++ for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\ + }) + + #define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ +- for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \ ++ for_each_btree_key_max(_trans, _iter, _btree_id, _start, \ + SPOS_MAX, _flags, _k, _do) + + #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ +@@ -828,33 +824,33 @@ transaction_restart: \ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + +-#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ ++#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \ + _start, _end, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ +- for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ ++ for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + + struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); + +-#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ ++#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ +- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ ++ (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\ + !((_ret) = bkey_err(_k)) && 
(_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +-#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ ++#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\ + for (; \ +- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ ++ (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + + #define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ +- for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ ++ for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ + SPOS_MAX, _flags, _k, _ret) + + #define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ +@@ -866,7 +862,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); + bch2_btree_iter_rewind(&(_iter))) + + #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ +- for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) ++ for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) + + /* + * This should not be used in a fastpath, without first trying _do in +diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c +index c1657182c275..6d25e3f85ce8 100644 +--- a/fs/bcachefs/btree_journal_iter.c ++++ b/fs/bcachefs/btree_journal_iter.c +@@ -16,6 +16,17 @@ + * operations for the regular btree iter code to use: + */ + ++static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos) ++{ ++ size_t gap_size = keys->size - keys->nr; ++ ++ BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size); ++ ++ if (pos >= keys->gap) ++ pos -= gap_size; ++ return pos; ++} ++ + static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) + { + size_t gap_size = keys->size - keys->nr; +@@ -61,7 +72,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys, + } + + /* 
Returns first non-overwritten key >= search key: */ +-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, ++struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) + { +@@ -84,27 +95,92 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree + } + } + ++ struct bkey_i *ret = NULL; ++ rcu_read_lock(); /* for overwritten_ranges */ ++ + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) +- return NULL; ++ break; + + if (k->overwritten) { +- (*idx)++; ++ if (k->overwritten_range) ++ *idx = rcu_dereference(k->overwritten_range)->end; ++ else ++ *idx += 1; + continue; + } + +- if (__journal_key_cmp(btree_id, level, pos, k) <= 0) +- return k->k; ++ if (__journal_key_cmp(btree_id, level, pos, k) <= 0) { ++ ret = k->k; ++ break; ++ } + + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; ++ rcu_read_unlock(); + goto search; + } + } + +- return NULL; ++ rcu_read_unlock(); ++ return ret; ++} ++ ++struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, ++ unsigned level, struct bpos pos, ++ struct bpos end_pos, size_t *idx) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ unsigned iters = 0; ++ struct journal_key *k; ++ ++ BUG_ON(*idx > keys->nr); ++search: ++ if (!*idx) ++ *idx = __bch2_journal_key_search(keys, btree_id, level, pos); ++ ++ while (*idx && ++ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { ++ (*idx)++; ++ iters++; ++ if (iters == 10) { ++ *idx = 0; ++ goto search; ++ } ++ } ++ ++ struct bkey_i *ret = NULL; ++ rcu_read_lock(); /* for overwritten_ranges */ ++ ++ while ((k = *idx < keys->nr ? 
idx_to_key(keys, *idx) : NULL)) { ++ if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) ++ break; ++ ++ if (k->overwritten) { ++ if (k->overwritten_range) ++ *idx = rcu_dereference(k->overwritten_range)->start - 1; ++ else ++ *idx -= 1; ++ continue; ++ } ++ ++ if (__journal_key_cmp(btree_id, level, pos, k) >= 0) { ++ ret = k->k; ++ break; ++ } ++ ++ --(*idx); ++ iters++; ++ if (iters == 10) { ++ *idx = 0; ++ goto search; ++ } ++ } ++ ++ rcu_read_unlock(); ++ return ret; + } + + struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, +@@ -112,11 +188,12 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree + { + size_t idx = 0; + +- return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); ++ return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx); + } + + static void journal_iter_verify(struct journal_iter *iter) + { ++#ifdef CONFIG_BCACHEFS_DEBUG + struct journal_keys *keys = iter->keys; + size_t gap_size = keys->size - keys->nr; + +@@ -126,10 +203,10 @@ static void journal_iter_verify(struct journal_iter *iter) + if (iter->idx < keys->size) { + struct journal_key *k = keys->data + iter->idx; + +- int cmp = cmp_int(k->btree_id, iter->btree_id) ?: +- cmp_int(k->level, iter->level); +- BUG_ON(cmp < 0); ++ int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); ++ BUG_ON(cmp > 0); + } ++#endif + } + + static void journal_iters_fix(struct bch_fs *c) +@@ -182,7 +259,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + * Ensure these keys are done last by journal replay, to unblock + * journal reclaim: + */ +- .journal_seq = U32_MAX, ++ .journal_seq = U64_MAX, + }; + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); +@@ -290,6 +367,68 @@ bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, + bkey_deleted(&keys->data[idx].k->k)); + } + ++static void 
__bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos) ++{ ++ struct journal_key *k = keys->data + pos; ++ size_t idx = pos_to_idx(keys, pos); ++ ++ k->overwritten = true; ++ ++ struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL; ++ struct journal_key *next = idx + 1 < keys->nr ? keys->data + idx_to_pos(keys, idx + 1) : NULL; ++ ++ bool prev_overwritten = prev && prev->overwritten; ++ bool next_overwritten = next && next->overwritten; ++ ++ struct journal_key_range_overwritten *prev_range = ++ prev_overwritten ? prev->overwritten_range : NULL; ++ struct journal_key_range_overwritten *next_range = ++ next_overwritten ? next->overwritten_range : NULL; ++ ++ BUG_ON(prev_range && prev_range->end != idx); ++ BUG_ON(next_range && next_range->start != idx + 1); ++ ++ if (prev_range && next_range) { ++ prev_range->end = next_range->end; ++ ++ keys->data[pos].overwritten_range = prev_range; ++ for (size_t i = next_range->start; i < next_range->end; i++) { ++ struct journal_key *ip = keys->data + idx_to_pos(keys, i); ++ BUG_ON(ip->overwritten_range != next_range); ++ ip->overwritten_range = prev_range; ++ } ++ ++ kfree_rcu_mightsleep(next_range); ++ } else if (prev_range) { ++ prev_range->end++; ++ k->overwritten_range = prev_range; ++ if (next_overwritten) { ++ prev_range->end++; ++ next->overwritten_range = prev_range; ++ } ++ } else if (next_range) { ++ next_range->start--; ++ k->overwritten_range = next_range; ++ if (prev_overwritten) { ++ next_range->start--; ++ prev->overwritten_range = next_range; ++ } ++ } else if (prev_overwritten || next_overwritten) { ++ struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL); ++ if (!r) ++ return; ++ ++ r->start = idx - (size_t) prev_overwritten; ++ r->end = idx + 1 + (size_t) next_overwritten; ++ ++ rcu_assign_pointer(k->overwritten_range, r); ++ if (prev_overwritten) ++ prev->overwritten_range = r; ++ if (next_overwritten) ++ next->overwritten_range = r; ++ } ++} 
++ + void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos pos) + { +@@ -299,8 +438,12 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + if (idx < keys->size && + keys->data[idx].btree_id == btree && + keys->data[idx].level == level && +- bpos_eq(keys->data[idx].k->k.p, pos)) +- keys->data[idx].overwritten = true; ++ bpos_eq(keys->data[idx].k->k.p, pos) && ++ !keys->data[idx].overwritten) { ++ mutex_lock(&keys->overwrite_lock); ++ __bch2_journal_key_overwritten(keys, idx); ++ mutex_unlock(&keys->overwrite_lock); ++ } + } + + static void bch2_journal_iter_advance(struct journal_iter *iter) +@@ -314,24 +457,32 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) + + static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) + { ++ struct bkey_s_c ret = bkey_s_c_null; ++ + journal_iter_verify(iter); + ++ rcu_read_lock(); + while (iter->idx < iter->keys->size) { + struct journal_key *k = iter->keys->data + iter->idx; + +- int cmp = cmp_int(k->btree_id, iter->btree_id) ?: +- cmp_int(k->level, iter->level); +- if (cmp > 0) ++ int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); ++ if (cmp < 0) + break; + BUG_ON(cmp); + +- if (!k->overwritten) +- return bkey_i_to_s_c(k->k); ++ if (!k->overwritten) { ++ ret = bkey_i_to_s_c(k->k); ++ break; ++ } + +- bch2_journal_iter_advance(iter); ++ if (k->overwritten_range) ++ iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); ++ else ++ bch2_journal_iter_advance(iter); + } ++ rcu_read_unlock(); + +- return bkey_s_c_null; ++ return ret; + } + + static void bch2_journal_iter_exit(struct journal_iter *iter) +@@ -382,6 +533,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter + : (level > 1 ? 
1 : 16); + + iter.prefetch = false; ++ iter.fail_if_too_many_whiteouts = true; + bch2_bkey_buf_init(&tmp); + + while (nr--) { +@@ -400,6 +552,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter + struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) + { + struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; ++ size_t iters = 0; + + if (iter->prefetch && iter->journal.level) + btree_and_journal_iter_prefetch(iter); +@@ -407,6 +560,11 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * + if (iter->at_end) + return bkey_s_c_null; + ++ iters++; ++ ++ if (iters > 20 && iter->fail_if_too_many_whiteouts) ++ return bkey_s_c_null; ++ + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && + bpos_lt(btree_k.k->p, iter->pos)) + bch2_journal_iter_advance_btree(iter); +@@ -481,16 +639,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + + /* sort and dedup all keys in the journal: */ + +-void bch2_journal_entries_free(struct bch_fs *c) +-{ +- struct journal_replay **i; +- struct genradix_iter iter; +- +- genradix_for_each(&c->journal_entries, iter, i) +- kvfree(*i); +- genradix_free(&c->journal_entries); +-} +- + /* + * When keys compare equal, oldest compares first: + */ +@@ -515,15 +663,26 @@ void bch2_journal_keys_put(struct bch_fs *c) + + move_gap(keys, keys->nr); + +- darray_for_each(*keys, i) ++ darray_for_each(*keys, i) { ++ if (i->overwritten_range && ++ (i == &darray_last(*keys) || ++ i->overwritten_range != i[1].overwritten_range)) ++ kfree(i->overwritten_range); ++ + if (i->allocated) + kfree(i->k); ++ } + + kvfree(keys->data); + keys->data = NULL; + keys->nr = keys->gap = keys->size = 0; + +- bch2_journal_entries_free(c); ++ struct journal_replay **i; ++ struct genradix_iter iter; ++ ++ genradix_for_each(&c->journal_entries, iter, i) ++ kvfree(*i); ++ genradix_free(&c->journal_entries); + } + + static void __journal_keys_sort(struct 
journal_keys *keys) +@@ -628,8 +787,20 @@ void bch2_journal_keys_dump(struct bch_fs *c) + + darray_for_each(*keys, i) { + printbuf_reset(&buf); ++ prt_printf(&buf, "btree="); ++ bch2_btree_id_to_text(&buf, i->btree_id); ++ prt_printf(&buf, " l=%u ", i->level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); +- pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf); ++ pr_err("%s", buf.buf); + } + printbuf_exit(&buf); + } ++ ++void bch2_fs_journal_keys_init(struct bch_fs *c) ++{ ++ struct journal_keys *keys = &c->journal_keys; ++ ++ atomic_set(&keys->ref, 1); ++ keys->initial_ref_held = true; ++ mutex_init(&keys->overwrite_lock); ++} +diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h +index 1653de9d609b..2a3082919b8d 100644 +--- a/fs/bcachefs/btree_journal_iter.h ++++ b/fs/bcachefs/btree_journal_iter.h +@@ -26,16 +26,24 @@ struct btree_and_journal_iter { + struct bpos pos; + bool at_end; + bool prefetch; ++ bool fail_if_too_many_whiteouts; + }; + ++static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, ++ unsigned l_level, ++ const struct journal_key *r) ++{ ++ return -cmp_int(l_level, r->level) ?: ++ cmp_int(l_btree_id, r->btree_id); ++} ++ + static inline int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + const struct journal_key *r) + { +- return (cmp_int(l_btree_id, r->btree_id) ?: +- cmp_int(l_level, r->level) ?: +- bpos_cmp(l_pos, r->k->k.p)); ++ return __journal_key_btree_cmp(l_btree_id, l_level, r) ?: ++ bpos_cmp(l_pos, r->k->k.p); + } + + static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) +@@ -43,7 +51,9 @@ static inline int journal_key_cmp(const struct journal_key *l, const struct jour + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); + } + +-struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, ++struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, ++ 
unsigned, struct bpos, struct bpos, size_t *); ++struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id, + unsigned, struct bpos, struct bpos, size_t *); + struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, + unsigned, struct bpos); +@@ -79,8 +89,6 @@ static inline void bch2_journal_keys_put_initial(struct bch_fs *c) + c->journal_keys.initial_ref_held = false; + } + +-void bch2_journal_entries_free(struct bch_fs *); +- + int bch2_journal_keys_sort(struct bch_fs *); + + void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, +@@ -89,4 +97,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, + + void bch2_journal_keys_dump(struct bch_fs *); + ++void bch2_fs_journal_keys_init(struct bch_fs *); ++ + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ +diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h +new file mode 100644 +index 000000000000..8b773823704f +--- /dev/null ++++ b/fs/bcachefs/btree_journal_iter_types.h +@@ -0,0 +1,36 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H ++#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H ++ ++struct journal_key_range_overwritten { ++ size_t start, end; ++}; ++ ++struct journal_key { ++ u64 journal_seq; ++ u32 journal_offset; ++ enum btree_id btree_id:8; ++ unsigned level:8; ++ bool allocated; ++ bool overwritten; ++ struct journal_key_range_overwritten __rcu * ++ overwritten_range; ++ struct bkey_i *k; ++}; ++ ++struct journal_keys { ++ /* must match layout in darray_types.h */ ++ size_t nr, size; ++ struct journal_key *data; ++ /* ++ * Gap buffer: instead of all the empty space in the array being at the ++ * end of the buffer - from @nr to @size - the empty space is at @gap. ++ * This means that sequential insertions are O(n) instead of O(n^2). 
++ */ ++ size_t gap; ++ atomic_t ref; ++ bool initial_ref_held; ++ struct mutex overwrite_lock; ++}; ++ ++#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */ +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +index 244610b1d0b5..c378b97ebeca 100644 +--- a/fs/bcachefs/btree_key_cache.c ++++ b/fs/bcachefs/btree_key_cache.c +@@ -197,7 +197,9 @@ bkey_cached_reuse(struct btree_key_cache *c) + return ck; + } + +-static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path, ++static int btree_key_cache_create(struct btree_trans *trans, ++ struct btree_path *path, ++ struct btree_path *ck_path, + struct bkey_s_c k) + { + struct bch_fs *c = trans->c; +@@ -217,7 +219,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * + key_u64s = min(256U, (key_u64s * 3) / 2); + key_u64s = roundup_pow_of_two(key_u64s); + +- struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s); ++ struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s); + int ret = PTR_ERR_OR_ZERO(ck); + if (ret) + return ret; +@@ -226,19 +228,19 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * + ck = bkey_cached_reuse(bc); + if (unlikely(!ck)) { + bch_err(c, "error allocating memory for key cache item, btree %s", +- bch2_btree_id_str(path->btree_id)); ++ bch2_btree_id_str(ck_path->btree_id)); + return -BCH_ERR_ENOMEM_btree_key_cache_create; + } + } + + ck->c.level = 0; +- ck->c.btree_id = path->btree_id; +- ck->key.btree_id = path->btree_id; +- ck->key.pos = path->pos; ++ ck->c.btree_id = ck_path->btree_id; ++ ck->key.btree_id = ck_path->btree_id; ++ ck->key.pos = ck_path->pos; + ck->flags = 1U << BKEY_CACHED_ACCESSED; + + if (unlikely(key_u64s > ck->u64s)) { +- mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); ++ mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); + + struct bkey_i *new_k = allocate_dropping_locks(trans, ret, + kmalloc(key_u64s * 
sizeof(u64), _gfp)); +@@ -258,22 +260,29 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * + + bkey_reassemble(ck->k, k); + ++ ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c); ++ if (unlikely(ret)) ++ goto err; ++ + ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); ++ ++ bch2_btree_node_unlock_write(trans, path, path_l(path)->b); ++ + if (unlikely(ret)) /* raced with another fill? */ + goto err; + + atomic_long_inc(&bc->nr_keys); + six_unlock_write(&ck->c.lock); + +- enum six_lock_type lock_want = __btree_lock_want(path, 0); ++ enum six_lock_type lock_want = __btree_lock_want(ck_path, 0); + if (lock_want == SIX_LOCK_read) + six_lock_downgrade(&ck->c.lock); +- btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); +- path->uptodate = BTREE_ITER_UPTODATE; ++ btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want); ++ ck_path->uptodate = BTREE_ITER_UPTODATE; + return 0; + err: + bkey_cached_free(bc, ck); +- mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); ++ mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); + + return ret; + } +@@ -283,7 +292,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, + unsigned flags) + { + if (flags & BTREE_ITER_cached_nofill) { +- ck_path->uptodate = BTREE_ITER_UPTODATE; ++ ck_path->l[0].b = NULL; + return 0; + } + +@@ -293,6 +302,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, + int ret; + + bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, ++ BTREE_ITER_intent| + BTREE_ITER_key_cache_fill| + BTREE_ITER_cached_nofill); + iter.flags &= ~BTREE_ITER_with_journal; +@@ -306,9 +316,19 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, + if (unlikely(ret)) + goto out; + +- ret = btree_key_cache_create(trans, ck_path, k); ++ ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), 
ck_path, k); + if (ret) + goto err; ++ ++ if (trace_key_cache_fill_enabled()) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_bpos_to_text(&buf, ck_path->pos); ++ prt_char(&buf, ' '); ++ bch2_bkey_val_to_text(&buf, trans->c, k); ++ trace_key_cache_fill(trans, buf.buf); ++ printbuf_exit(&buf); ++ } + out: + /* We're not likely to need this iterator again: */ + bch2_set_btree_iter_dontneed(&iter); +@@ -424,8 +444,15 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + !test_bit(JOURNAL_space_low, &c->journal.flags)) + commit_flags |= BCH_TRANS_COMMIT_no_journal_res; + +- ret = bch2_btree_iter_traverse(&b_iter) ?: +- bch2_trans_update(trans, &b_iter, ck->k, ++ struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); ++ ret = bkey_err(btree_k); ++ if (ret) ++ goto err; ++ ++ /* * Check that we're not violating cache coherency rules: */ ++ BUG_ON(bkey_deleted(btree_k.k)); ++ ++ ret = bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_key_cache_reclaim| + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun) ?: +@@ -433,7 +460,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| + commit_flags); +- ++err: + bch2_fs_fatal_err_on(ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && +@@ -586,8 +613,18 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, + bkey_cached_free(bc, ck); + + mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); +- btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); +- path->should_be_locked = false; ++ ++ struct btree_path *path2; ++ unsigned i; ++ trans_for_each_path(trans, path2, i) ++ if (path2->l[0].b == (void *) ck) { ++ __bch2_btree_path_unlock(trans, path2); ++ path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); ++ path2->should_be_locked = false; ++ btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE); ++ } ++ ++ 
bch2_trans_verify_locks(trans); + } + + static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, +diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c +index efe2a007b482..10b805a60f52 100644 +--- a/fs/bcachefs/btree_locking.c ++++ b/fs/bcachefs/btree_locking.c +@@ -109,6 +109,12 @@ static noinline void lock_graph_pop_all(struct lock_graph *g) + lock_graph_up(g); + } + ++static noinline void lock_graph_pop_from(struct lock_graph *g, struct trans_waiting_for_lock *i) ++{ ++ while (g->g + g->nr > i) ++ lock_graph_up(g); ++} ++ + static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) + { + g->g[g->nr++] = (struct trans_waiting_for_lock) { +@@ -124,15 +130,20 @@ static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) + __lock_graph_down(g, trans); + } + +-static bool lock_graph_remove_non_waiters(struct lock_graph *g) ++static bool lock_graph_remove_non_waiters(struct lock_graph *g, ++ struct trans_waiting_for_lock *from) + { + struct trans_waiting_for_lock *i; + +- for (i = g->g + 1; i < g->g + g->nr; i++) ++ if (from->trans->locking != from->node_want) { ++ lock_graph_pop_from(g, from); ++ return true; ++ } ++ ++ for (i = from + 1; i < g->g + g->nr; i++) + if (i->trans->locking != i->node_want || + i->trans->locking_wait.start_time != i[-1].lock_start_time) { +- while (g->g + g->nr > i) +- lock_graph_up(g); ++ lock_graph_pop_from(g, i); + return true; + } + +@@ -179,13 +190,14 @@ static int btree_trans_abort_preference(struct btree_trans *trans) + return 3; + } + +-static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) ++static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, ++ struct trans_waiting_for_lock *from) + { + struct trans_waiting_for_lock *i, *abort = NULL; + unsigned best = 0, pref; + int ret; + +- if (lock_graph_remove_non_waiters(g)) ++ if (lock_graph_remove_non_waiters(g, from)) + return 0; + + /* Only checking, for debugfs: */ 
+@@ -195,7 +207,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) + goto out; + } + +- for (i = g->g; i < g->g + g->nr; i++) { ++ for (i = from; i < g->g + g->nr; i++) { + pref = btree_trans_abort_preference(i->trans); + if (pref > best) { + abort = i; +@@ -229,8 +241,9 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) + ret = abort_lock(g, abort); + out: + if (ret) +- while (g->nr) +- lock_graph_up(g); ++ lock_graph_pop_all(g); ++ else ++ lock_graph_pop_from(g, abort); + return ret; + } + +@@ -243,7 +256,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, + for (i = g->g; i < g->g + g->nr; i++) + if (i->trans == trans) { + closure_put(&trans->ref); +- return break_cycle(g, cycle); ++ return break_cycle(g, cycle, i); + } + + if (g->nr == ARRAY_SIZE(g->g)) { +@@ -252,8 +265,7 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, + if (orig_trans->lock_may_not_fail) + return 0; + +- while (g->nr) +- lock_graph_up(g); ++ lock_graph_pop_all(g); + + if (cycle) + return 0; +@@ -281,7 +293,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) + + g.nr = 0; + +- if (trans->lock_must_abort) { ++ if (trans->lock_must_abort && !trans->lock_may_not_fail) { + if (cycle) + return -1; + +@@ -336,7 +348,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) + * structures - which means it can't be blocked + * waiting on a lock: + */ +- if (!lock_graph_remove_non_waiters(&g)) { ++ if (!lock_graph_remove_non_waiters(&g, g.g)) { + /* + * If lock_graph_remove_non_waiters() + * didn't do anything, it must be +@@ -512,7 +524,6 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned level) + { + struct btree *b = path->l[level].b; +- struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level); + + if (!is_btree_node(path, level)) + return false; 
+@@ -536,24 +547,11 @@ bool bch2_btree_node_upgrade(struct btree_trans *trans, + if (race_fault()) + return false; + +- if (btree_node_locked(path, level)) { +- bool ret; +- +- six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]); +- ret = six_lock_tryupgrade(&b->c.lock); +- six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]); +- +- if (ret) +- goto success; +- } else { +- if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) +- goto success; +- } ++ if (btree_node_locked(path, level) ++ ? six_lock_tryupgrade(&b->c.lock) ++ : six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) ++ goto success; + +- /* +- * Do we already have an intent lock via another path? If so, just bump +- * lock count: +- */ + if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(trans, path, level); +@@ -782,7 +780,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) + return bch2_trans_relock_fail(trans, path, &f, trace); + } + +- trans_set_locked(trans); ++ trans_set_locked(trans, true); + out: + bch2_trans_verify_locks(trans); + return 0; +@@ -818,6 +816,17 @@ void bch2_trans_unlock_long(struct btree_trans *trans) + bch2_trans_srcu_unlock(trans); + } + ++void bch2_trans_unlock_write(struct btree_trans *trans) ++{ ++ struct btree_path *path; ++ unsigned i; ++ ++ trans_for_each_path(trans, path, i) ++ for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) ++ if (btree_node_write_locked(path, l)) ++ bch2_btree_node_unlock_write(trans, path, path->l[l].b); ++} ++ + int __bch2_trans_mutex_lock(struct btree_trans *trans, + struct mutex *lock) + { +@@ -856,6 +865,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path) + (want == BTREE_NODE_UNLOCKED || + have != BTREE_NODE_WRITE_LOCKED) && + want != have); ++ ++ BUG_ON(btree_node_locked(path, l) && ++ path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); + } + } + +diff 
--git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +index 7c07f9fa9add..b54ef48eb8cc 100644 +--- a/fs/bcachefs/btree_locking.h ++++ b/fs/bcachefs/btree_locking.h +@@ -16,6 +16,7 @@ + void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); + + void bch2_trans_unlock_noassert(struct btree_trans *); ++void bch2_trans_unlock_write(struct btree_trans *); + + static inline bool is_btree_node(struct btree_path *path, unsigned l) + { +@@ -75,13 +76,6 @@ static inline void mark_btree_node_locked_noreset(struct btree_path *path, + path->nodes_locked |= (type + 1) << (level << 1); + } + +-static inline void mark_btree_node_unlocked(struct btree_path *path, +- unsigned level) +-{ +- EBUG_ON(btree_node_write_locked(path, level)); +- mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); +-} +- + static inline void mark_btree_node_locked(struct btree_trans *trans, + struct btree_path *path, + unsigned level, +@@ -124,19 +118,25 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans, + + /* unlock: */ + ++void bch2_btree_node_unlock_write(struct btree_trans *, ++ struct btree_path *, struct btree *); ++ + static inline void btree_node_unlock(struct btree_trans *trans, + struct btree_path *path, unsigned level) + { + int lock_type = btree_node_locked_type(path, level); + + EBUG_ON(level >= BTREE_MAX_DEPTH); +- EBUG_ON(lock_type == BTREE_NODE_WRITE_LOCKED); + + if (lock_type != BTREE_NODE_UNLOCKED) { ++ if (unlikely(lock_type == BTREE_NODE_WRITE_LOCKED)) { ++ bch2_btree_node_unlock_write(trans, path, path->l[level].b); ++ lock_type = BTREE_NODE_INTENT_LOCKED; ++ } + six_unlock_type(&path->l[level].b->c.lock, lock_type); + btree_trans_lock_hold_time_update(trans, path, level); ++ mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); + } +- mark_btree_node_unlocked(path, level); + } + + static inline int btree_path_lowest_level_locked(struct btree_path *path) +@@ -162,36 +162,40 @@ static inline 
void __bch2_btree_path_unlock(struct btree_trans *trans, + * Updates the saved lock sequence number, so that bch2_btree_node_relock() will + * succeed: + */ ++static inline void ++__bch2_btree_node_unlock_write(struct btree_trans *trans, struct btree *b) ++{ ++ if (!b->c.lock.write_lock_recurse) { ++ struct btree_path *linked; ++ unsigned i; ++ ++ trans_for_each_path_with_node(trans, b, linked, i) ++ linked->l[b->c.level].lock_seq++; ++ } ++ ++ six_unlock_write(&b->c.lock); ++} ++ + static inline void + bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, + struct btree *b) + { +- struct btree_path *linked; +- unsigned i; +- + EBUG_ON(path->l[b->c.level].b != b); + EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); + EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); + + mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); +- +- trans_for_each_path_with_node(trans, b, linked, i) +- linked->l[b->c.level].lock_seq++; +- +- six_unlock_write(&b->c.lock); ++ __bch2_btree_node_unlock_write(trans, b); + } + +-void bch2_btree_node_unlock_write(struct btree_trans *, +- struct btree_path *, struct btree *); +- + int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); + + /* lock: */ + +-static inline void trans_set_locked(struct btree_trans *trans) ++static inline void trans_set_locked(struct btree_trans *trans, bool try) + { + if (!trans->locked) { +- lock_acquire_exclusive(&trans->dep_map, 0, 0, NULL, _THIS_IP_); ++ lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, _THIS_IP_); + trans->locked = true; + trans->last_unlock_ip = 0; + +@@ -282,7 +286,7 @@ static inline int btree_node_lock(struct btree_trans *trans, + int ret = 0; + + EBUG_ON(level >= BTREE_MAX_DEPTH); +- bch2_trans_verify_not_unlocked(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + + if (likely(six_trylock_type(&b->lock, type)) || + btree_node_lock_increment(trans, b, level, (enum 
btree_node_locked_type) type) || +diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c +index 30131c3bdd97..a7f06deee13c 100644 +--- a/fs/bcachefs/btree_node_scan.c ++++ b/fs/bcachefs/btree_node_scan.c +@@ -12,6 +12,7 @@ + #include "recovery_passes.h" + + #include ++#include + #include + + struct find_btree_nodes_worker { +@@ -22,17 +23,15 @@ struct find_btree_nodes_worker { + + static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) + { +- prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ", +- bch2_btree_id_str(n->btree_id), n->level, n->seq, +- n->journal_seq, n->cookie); ++ bch2_btree_id_level_to_text(out, n->btree_id, n->level); ++ prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ", ++ n->seq, n->journal_seq, n->cookie); + bch2_bpos_to_text(out, n->min_key); + prt_str(out, "-"); + bch2_bpos_to_text(out, n->max_key); + + if (n->range_updated) + prt_str(out, " range updated"); +- if (n->overwritten) +- prt_str(out, " overwritten"); + + for (unsigned i = 0; i < n->nr_ptrs; i++) { + prt_char(out, ' '); +@@ -140,6 +139,24 @@ static int found_btree_node_cmp_pos(const void *_l, const void *_r) + -found_btree_node_cmp_time(l, r); + } + ++static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg) ++{ ++ return found_btree_node_cmp_pos(l, r) < 0; ++} ++ ++static inline void found_btree_node_swap(void *_l, void *_r, void *arg) ++{ ++ struct found_btree_node *l = _l; ++ struct found_btree_node *r = _r; ++ ++ swap(*l, *r); ++} ++ ++static const struct min_heap_callbacks found_btree_node_heap_cbs = { ++ .less = found_btree_node_cmp_pos_less, ++ .swp = found_btree_node_swap, ++}; ++ + static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, + struct bio *bio, struct btree_node *bn, u64 offset) + { +@@ -159,6 +176,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, + return; + + if 
(bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { ++ if (!c->chacha20) ++ return; ++ + struct nonce nonce = btree_nonce(&bn->keys, 0); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + +@@ -292,55 +312,48 @@ static int read_btree_nodes(struct find_btree_nodes *f) + return f->ret ?: ret; + } + +-static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) ++static bool nodes_overlap(const struct found_btree_node *l, ++ const struct found_btree_node *r) + { +- while (n + 1 < end && +- found_btree_node_cmp_pos(n, n + 1) > 0) { +- swap(n[0], n[1]); +- n++; +- } ++ return (l->btree_id == r->btree_id && ++ l->level == r->level && ++ bpos_gt(l->max_key, r->min_key)); + } + + static int handle_overwrites(struct bch_fs *c, +- struct found_btree_node *start, +- struct found_btree_node *end) ++ struct found_btree_node *l, ++ found_btree_nodes *nodes_heap) + { +- struct found_btree_node *n; +-again: +- for (n = start + 1; +- n < end && +- n->btree_id == start->btree_id && +- n->level == start->level && +- bpos_lt(n->min_key, start->max_key); +- n++) { +- int cmp = found_btree_node_cmp_time(start, n); ++ struct found_btree_node *r; ++ ++ while ((r = min_heap_peek(nodes_heap)) && ++ nodes_overlap(l, r)) { ++ int cmp = found_btree_node_cmp_time(l, r); + + if (cmp > 0) { +- if (bpos_cmp(start->max_key, n->max_key) >= 0) +- n->overwritten = true; ++ if (bpos_cmp(l->max_key, r->max_key) >= 0) ++ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); + else { +- n->range_updated = true; +- n->min_key = bpos_successor(start->max_key); +- n->range_updated = true; +- bubble_up(n, end); +- goto again; ++ r->range_updated = true; ++ r->min_key = bpos_successor(l->max_key); ++ r->range_updated = true; ++ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); + } + } else if (cmp < 0) { +- BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); ++ BUG_ON(bpos_eq(l->min_key, r->min_key)); + +- start->max_key = 
bpos_predecessor(n->min_key); +- start->range_updated = true; +- } else if (n->level) { +- n->overwritten = true; ++ l->max_key = bpos_predecessor(r->min_key); ++ l->range_updated = true; ++ } else if (r->level) { ++ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); + } else { +- if (bpos_cmp(start->max_key, n->max_key) >= 0) +- n->overwritten = true; ++ if (bpos_cmp(l->max_key, r->max_key) >= 0) ++ min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); + else { +- n->range_updated = true; +- n->min_key = bpos_successor(start->max_key); +- n->range_updated = true; +- bubble_up(n, end); +- goto again; ++ r->range_updated = true; ++ r->min_key = bpos_successor(l->max_key); ++ r->range_updated = true; ++ min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); + } + } + } +@@ -352,6 +365,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) + { + struct find_btree_nodes *f = &c->found_btree_nodes; + struct printbuf buf = PRINTBUF; ++ found_btree_nodes nodes_heap = {}; + size_t dst; + int ret = 0; + +@@ -406,29 +420,57 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) + bch2_print_string_as_lines(KERN_INFO, buf.buf); + } + +- dst = 0; +- darray_for_each(f->nodes, i) { +- if (i->overwritten) +- continue; ++ swap(nodes_heap, f->nodes); ++ ++ { ++ /* darray must have same layout as a heap */ ++ min_heap_char real_heap; ++ BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr)); ++ BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size)); ++ BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr)); ++ BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size)); ++ } ++ ++ min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL); + +- ret = handle_overwrites(c, i, &darray_top(f->nodes)); ++ if (nodes_heap.nr) { ++ ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); + if (ret) + goto err; + +- BUG_ON(i->overwritten); +- f->nodes.data[dst++] = *i; ++ min_heap_pop(&nodes_heap, 
&found_btree_node_heap_cbs, NULL); + } +- f->nodes.nr = dst; + +- if (c->opts.verbose) { ++ while (true) { ++ ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap); ++ if (ret) ++ goto err; ++ ++ if (!nodes_heap.nr) ++ break; ++ ++ ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); ++ if (ret) ++ goto err; ++ ++ min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); ++ } ++ ++ for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++) ++ BUG_ON(nodes_overlap(n, n + 1)); ++ ++ if (0 && c->opts.verbose) { + printbuf_reset(&buf); + prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); + found_btree_nodes_to_text(&buf, c, f->nodes); + bch2_print_string_as_lines(KERN_INFO, buf.buf); ++ } else { ++ bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); + } + + eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); + err: ++ darray_exit(&nodes_heap); + printbuf_exit(&buf); + return ret; + } +@@ -499,7 +541,9 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + +- prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); ++ prt_str(&buf, "recovery "); ++ bch2_btree_id_level_to_text(&buf, btree, level); ++ prt_str(&buf, " "); + bch2_bpos_to_text(&buf, node_min); + prt_str(&buf, " - "); + bch2_bpos_to_text(&buf, node_max); +@@ -533,7 +577,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, + bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); + printbuf_exit(&buf); + +- BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0)); ++ BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), ++ (struct bkey_validate_context) { ++ .from = BKEY_VALIDATE_btree_node, ++ .level = level + 1, ++ .btree = btree, ++ })); + + ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); + if (ret) +diff --git 
a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h +index b6c36c45d0be..2811b6857c97 100644 +--- a/fs/bcachefs/btree_node_scan_types.h ++++ b/fs/bcachefs/btree_node_scan_types.h +@@ -6,7 +6,6 @@ + + struct found_btree_node { + bool range_updated:1; +- bool overwritten:1; + u8 btree_id; + u8 level; + unsigned sectors_written; +diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c +index 9bf471fa4361..2760dd9569ed 100644 +--- a/fs/bcachefs/btree_trans_commit.c ++++ b/fs/bcachefs/btree_trans_commit.c +@@ -133,7 +133,7 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans) + return 0; + } + +-static inline void bch2_trans_unlock_write(struct btree_trans *trans) ++static inline void bch2_trans_unlock_updates_write(struct btree_trans *trans) + { + if (likely(trans->write_locked)) { + trans_for_each_update(trans, i) +@@ -249,7 +249,7 @@ static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + new |= 1 << BTREE_NODE_need_write; + } while (!try_cmpxchg(&b->flags, &old, new)); + +- btree_node_write_if_need(c, b, SIX_LOCK_read); ++ btree_node_write_if_need(trans, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + + bch2_trans_put(trans); +@@ -348,7 +348,7 @@ static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) + { + return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, +- trans->journal_u64s, flags); ++ trans->journal_u64s, flags, trans); + } + + #define JSET_ENTRY_LOG_U64s 4 +@@ -384,7 +384,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, + struct bkey_i *new_k; + int ret; + +- bch2_trans_unlock_write(trans); ++ bch2_trans_unlock_updates_write(trans); + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); +@@ -479,8 +479,7 @@ static int run_one_mem_trigger(struct btree_trans *trans, + old, flags); + } + +-static int run_one_trans_trigger(struct btree_trans 
*trans, struct btree_insert_entry *i, +- bool overwrite) ++static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i) + { + verify_update_old_key(trans, i); + +@@ -507,10 +506,10 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ + return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), + BTREE_TRIGGER_insert| + BTREE_TRIGGER_overwrite|flags) ?: 1; +- } else if (overwrite && !i->overwrite_trigger_run) { ++ } else if (!i->overwrite_trigger_run) { + i->overwrite_trigger_run = true; + return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; +- } else if (!overwrite && !i->insert_trigger_run) { ++ } else if (!i->insert_trigger_run) { + i->insert_trigger_run = true; + return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; + } else { +@@ -519,39 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ + } + + static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, +- unsigned btree_id_start) ++ unsigned *btree_id_updates_start) + { +- for (int overwrite = 1; overwrite >= 0; --overwrite) { +- bool trans_trigger_run; ++ bool trans_trigger_run; + +- /* +- * Running triggers will append more updates to the list of updates as +- * we're walking it: +- */ +- do { +- trans_trigger_run = false; +- +- for (unsigned i = btree_id_start; +- i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; +- i++) { +- if (trans->updates[i].btree_id != btree_id) +- continue; ++ /* ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: ++ */ ++ do { ++ trans_trigger_run = false; + +- int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); +- if (ret < 0) +- return ret; +- if (ret) +- trans_trigger_run = true; ++ for (unsigned i = *btree_id_updates_start; ++ i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; ++ i++) { ++ 
if (trans->updates[i].btree_id < btree_id) { ++ *btree_id_updates_start = i; ++ continue; + } +- } while (trans_trigger_run); +- } ++ ++ int ret = run_one_trans_trigger(trans, trans->updates + i); ++ if (ret < 0) ++ return ret; ++ if (ret) ++ trans_trigger_run = true; ++ } ++ } while (trans_trigger_run); ++ ++ trans_for_each_update(trans, i) ++ BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && ++ i->btree_id == btree_id && ++ btree_node_type_has_trans_triggers(i->bkey_type) && ++ (!i->insert_trigger_run || !i->overwrite_trigger_run)); + + return 0; + } + + static int bch2_trans_commit_run_triggers(struct btree_trans *trans) + { +- unsigned btree_id = 0, btree_id_start = 0; ++ unsigned btree_id = 0, btree_id_updates_start = 0; + int ret = 0; + + /* +@@ -565,27 +570,15 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) + if (btree_id == BTREE_ID_alloc) + continue; + +- while (btree_id_start < trans->nr_updates && +- trans->updates[btree_id_start].btree_id < btree_id) +- btree_id_start++; +- +- ret = run_btree_triggers(trans, btree_id, btree_id_start); ++ ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start); + if (ret) + return ret; + } + +- for (unsigned idx = 0; idx < trans->nr_updates; idx++) { +- struct btree_insert_entry *i = trans->updates + idx; +- +- if (i->btree_id > BTREE_ID_alloc) +- break; +- if (i->btree_id == BTREE_ID_alloc) { +- ret = run_btree_triggers(trans, BTREE_ID_alloc, idx); +- if (ret) +- return ret; +- break; +- } +- } ++ btree_id_updates_start = 0; ++ ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start); ++ if (ret) ++ return ret; + + #ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) +@@ -609,14 +602,6 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) + return 0; + } + +-static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) +-{ +- return (struct bversion) { +- .hi = res->seq >> 32, +- .lo = (res->seq << 32) | 
(res->offset + offset), +- }; +-} +- + static inline int + bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry **stopped_at, +@@ -627,12 +612,11 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, + unsigned u64s = 0; + int ret = 0; + +- bch2_trans_verify_not_unlocked(trans); +- bch2_trans_verify_not_in_restart(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + + if (race_fault()) { + trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); +- return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); ++ return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); + } + + /* +@@ -701,25 +685,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, + struct jset_entry *entry = trans->journal_entries; + + percpu_down_read(&c->mark_lock); +- + for (entry = trans->journal_entries; + entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && + entry->start->k.type == KEY_TYPE_accounting) { +- BUG_ON(!trans->journal_res.ref); +- +- struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start); +- +- a->k.bversion = journal_pos_to_bversion(&trans->journal_res, +- (u64 *) entry - (u64 *) trans->journal_entries); +- BUG_ON(bversion_zero(a->k.bversion)); +- +- if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { +- ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal); +- if (ret) +- goto revert_fs_usage; +- } ++ ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags); ++ if (ret) ++ goto revert_fs_usage; + } + percpu_up_read(&c->mark_lock); + +@@ -739,33 +712,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, + goto fatal_err; + } + +- trans_for_each_update(trans, i) { +- enum 
bch_validate_flags invalid_flags = 0; ++ struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit }; + +- if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) +- invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; +- +- ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), +- i->bkey_type, invalid_flags); +- if (unlikely(ret)){ +- bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", +- trans->fn, (void *) i->ip_allocated); +- goto fatal_err; +- } +- btree_insert_entry_checks(trans, i); +- } ++ if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) ++ validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; + + for (struct jset_entry *i = trans->journal_entries; + i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + i = vstruct_next(i)) { +- enum bch_validate_flags invalid_flags = 0; +- +- if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) +- invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; +- + ret = bch2_journal_entry_validate(c, NULL, i, + bcachefs_metadata_version_current, +- CPU_BIG_ENDIAN, invalid_flags); ++ CPU_BIG_ENDIAN, validate_context); + if (unlikely(ret)) { + bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", + trans->fn); +@@ -773,6 +730,19 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, + } + } + ++ trans_for_each_update(trans, i) { ++ validate_context.level = i->level; ++ validate_context.btree = i->btree_id; ++ ++ ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context); ++ if (unlikely(ret)){ ++ bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", ++ trans->fn, (void *) i->ip_allocated); ++ goto fatal_err; ++ } ++ btree_insert_entry_checks(trans, i); ++ } ++ + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { + struct journal *j = &c->journal; + struct jset_entry *entry; +@@ -833,13 +803,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, + entry2 != entry; + 
entry2 = vstruct_next(entry2)) + if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys && +- entry2->start->k.type == KEY_TYPE_accounting) { +- struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); +- +- bch2_accounting_neg(a); +- bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); +- bch2_accounting_neg(a); +- } ++ entry2->start->k.type == KEY_TYPE_accounting) ++ bch2_accounting_trans_commit_revert(trans, ++ bkey_i_to_accounting(entry2->start), flags); + percpu_up_read(&c->mark_lock); + return ret; + } +@@ -902,7 +868,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags + if (!ret && unlikely(trans->journal_replay_not_finished)) + bch2_drop_overwrites_from_journal(trans); + +- bch2_trans_unlock_write(trans); ++ bch2_trans_unlock_updates_write(trans); + + if (!ret && trans->journal_pin) + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, +@@ -994,24 +960,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, + return ret; + } + +-static noinline int +-bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) +-{ +- struct bch_fs *c = trans->c; +- int ret; +- +- if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) || +- test_bit(BCH_FS_started, &c->flags)) +- return -BCH_ERR_erofs_trans_commit; +- +- ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); +- if (ret) +- return ret; +- +- bch2_write_ref_get(c, BCH_WRITE_REF_trans); +- return 0; +-} +- + /* + * This is for updates done in the early part of fsck - btree_gc - before we've + * gone RW. 
we only add the new key to the list of keys for journal replay to +@@ -1022,6 +970,8 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) + { + struct bch_fs *c = trans->c; + ++ BUG_ON(current != c->recovery_task); ++ + trans_for_each_update(trans, i) { + int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + if (ret) +@@ -1047,8 +997,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) + struct bch_fs *c = trans->c; + int ret = 0; + +- bch2_trans_verify_not_unlocked(trans); +- bch2_trans_verify_not_in_restart(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + + if (!trans->nr_updates && + !trans->journal_entries_u64s) +@@ -1058,16 +1007,13 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) + if (ret) + goto out_reset; + +- if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { +- ret = do_bch2_trans_commit_to_journal_replay(trans); +- goto out_reset; +- } +- + if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && + unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { +- ret = bch2_trans_commit_get_rw_cold(trans, flags); +- if (ret) +- goto out_reset; ++ if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) ++ ret = do_bch2_trans_commit_to_journal_replay(trans); ++ else ++ ret = -BCH_ERR_erofs_trans_commit; ++ goto out_reset; + } + + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); +@@ -1112,8 +1058,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) + } + retry: + errored_at = NULL; +- bch2_trans_verify_not_unlocked(trans); +- bch2_trans_verify_not_in_restart(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +index 4568a41fefaf..a6f251eb4164 100644 +--- a/fs/bcachefs/btree_types.h ++++ 
b/fs/bcachefs/btree_types.h +@@ -513,6 +513,9 @@ struct btree_trans { + u64 last_begin_time; + unsigned long last_begin_ip; + unsigned long last_restarted_ip; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ bch_stacktrace last_restarted_trace; ++#endif + unsigned long last_unlock_ip; + unsigned long srcu_lock_time; + +@@ -787,53 +790,64 @@ static inline bool btree_node_type_has_triggers(enum btree_node_type type) + return BIT_ULL(type) & BTREE_NODE_TYPE_HAS_TRIGGERS; + } + +-static inline bool btree_node_type_is_extents(enum btree_node_type type) ++static inline bool btree_id_is_extents(enum btree_id btree) + { + const u64 mask = 0 +-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << (nr + 1)) ++#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_extents)) << nr) + BCH_BTREE_IDS() + #undef x + ; + +- return BIT_ULL(type) & mask; ++ return BIT_ULL(btree) & mask; + } + +-static inline bool btree_id_is_extents(enum btree_id btree) ++static inline bool btree_node_type_is_extents(enum btree_node_type type) + { +- return btree_node_type_is_extents(__btree_node_type(0, btree)); ++ return type != BKEY_TYPE_btree && btree_id_is_extents(type - 1); ++} ++ ++static inline bool btree_type_has_snapshots(enum btree_id btree) ++{ ++ const u64 mask = 0 ++#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_snapshots)) << nr) ++ BCH_BTREE_IDS() ++#undef x ++ ; ++ ++ return BIT_ULL(btree) & mask; + } + +-static inline bool btree_type_has_snapshots(enum btree_id id) ++static inline bool btree_type_has_snapshot_field(enum btree_id btree) + { + const u64 mask = 0 +-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) ++#define x(name, nr, flags, ...) 
|((!!((flags) & (BTREE_IS_snapshot_field|BTREE_IS_snapshots))) << nr) + BCH_BTREE_IDS() + #undef x + ; + +- return BIT_ULL(id) & mask; ++ return BIT_ULL(btree) & mask; + } + +-static inline bool btree_type_has_snapshot_field(enum btree_id id) ++static inline bool btree_type_has_ptrs(enum btree_id btree) + { + const u64 mask = 0 +-#define x(name, nr, flags, ...) |((!!((flags) & (BTREE_ID_SNAPSHOT_FIELD|BTREE_ID_SNAPSHOTS))) << nr) ++#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_data)) << nr) + BCH_BTREE_IDS() + #undef x + ; + +- return BIT_ULL(id) & mask; ++ return BIT_ULL(btree) & mask; + } + +-static inline bool btree_type_has_ptrs(enum btree_id id) ++static inline bool btree_type_uses_write_buffer(enum btree_id btree) + { + const u64 mask = 0 +-#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_DATA)) << nr) ++#define x(name, nr, flags, ...) |((!!((flags) & BTREE_IS_write_buffer)) << nr) + BCH_BTREE_IDS() + #undef x + ; + +- return BIT_ULL(id) & mask; ++ return BIT_ULL(btree) & mask; + } + + struct btree_root { +diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c +index 5d809e8bd170..13d794f201a5 100644 +--- a/fs/bcachefs/btree_update.c ++++ b/fs/bcachefs/btree_update.c +@@ -144,7 +144,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + !(ret = bkey_err(old_k)) && + bkey_eq(old_pos, old_k.k->p)) { + struct bpos whiteout_pos = +- SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; ++ SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot); + + if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || + snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) +@@ -296,7 +296,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + BTREE_ITER_intent| + BTREE_ITER_with_updates| + BTREE_ITER_not_extents); +- k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); ++ k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + 
goto err; + if (!k.k) +@@ -323,7 +323,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, + goto out; + next: + bch2_btree_iter_advance(&iter); +- k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); ++ k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) +@@ -588,12 +588,9 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi + int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos end) + { +- struct bkey_s_c k; +- int ret = 0; +- +- bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent); +- k = bch2_btree_iter_prev(iter); +- ret = bkey_err(k); ++ bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); ++ struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); ++ int ret = bkey_err(k); + if (ret) + goto err; + +@@ -672,27 +669,19 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, + bch2_btree_insert_trans(trans, id, k, iter_flags)); + } + +-int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, +- unsigned len, unsigned update_flags) ++int bch2_btree_delete_at(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned update_flags) + { +- struct bkey_i *k; +- +- k = bch2_trans_kmalloc(trans, sizeof(*k)); +- if (IS_ERR(k)) +- return PTR_ERR(k); ++ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); ++ int ret = PTR_ERR_OR_ZERO(k); ++ if (ret) ++ return ret; + + bkey_init(&k->k); + k->k.p = iter->pos; +- bch2_key_resize(&k->k, len); + return bch2_trans_update(trans, iter, k, update_flags); + } + +-int bch2_btree_delete_at(struct btree_trans *trans, +- struct btree_iter *iter, unsigned update_flags) +-{ +- return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); +-} +- + int bch2_btree_delete(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + unsigned 
update_flags) +@@ -721,7 +710,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + int ret = 0; + + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); +- while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { ++ while ((k = bch2_btree_iter_peek_max(&iter, end)).k) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(trans->c, 0); + struct bkey_i delete; +@@ -794,8 +783,7 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + return ret; + } + +-int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, +- struct bpos pos, bool set) ++int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set) + { + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + int ret = PTR_ERR_OR_ZERO(k); +@@ -804,13 +792,21 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + + bkey_init(&k->k); + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; +- k->k.p = pos; ++ k->k.p = iter->pos; ++ if (iter->flags & BTREE_ITER_is_extents) ++ bch2_key_resize(&k->k, 1); + ++ return bch2_trans_update(trans, iter, k, 0); ++} ++ ++int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, ++ struct bpos pos, bool set) ++{ + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); + +- ret = bch2_btree_iter_traverse(&iter) ?: +- bch2_trans_update(trans, &iter, k, 0); ++ int ret = bch2_btree_iter_traverse(&iter) ?: ++ bch2_btree_bit_mod_iter(trans, &iter, set); + bch2_trans_iter_exit(trans, &iter); + return ret; + } +@@ -827,10 +823,17 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, + return bch2_trans_update_buffered(trans, btree, &k); + } + +-static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s) ++int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) + { ++ unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); ++ 
prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); ++ ++ int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; ++ if (ret) ++ return ret; ++ + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); +- int ret = PTR_ERR_OR_ZERO(e); ++ ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; + +@@ -865,9 +868,8 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, + memcpy(l->d, buf.buf, buf.pos); + c->journal.early_journal_entries.nr += jset_u64s(u64s); + } else { +- ret = bch2_trans_commit_do(c, NULL, NULL, +- BCH_TRANS_COMMIT_lazy_rw|commit_flags, +- __bch2_trans_log_msg(trans, &buf, u64s)); ++ ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, ++ bch2_trans_log_msg(trans, &buf)); + } + err: + printbuf_exit(&buf); +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +index 70b3c989fac2..8f22ef9a7651 100644 +--- a/fs/bcachefs/btree_update.h ++++ b/fs/bcachefs/btree_update.h +@@ -24,7 +24,6 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, + #define BCH_TRANS_COMMIT_FLAGS() \ + x(no_enospc, "don't check for enospc") \ + x(no_check_rw, "don't attempt to take a ref on c->writes") \ +- x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \ + x(no_journal_res, "don't take a journal reservation, instead " \ + "pin journal entry referred to by trans->journal_res.seq") \ + x(journal_reclaim, "operation required for journal reclaim; may return error" \ +@@ -47,8 +46,6 @@ enum bch_trans_commit_flags { + + void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); + +-int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, +- unsigned, unsigned); + int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); + int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); + +@@ -66,6 +63,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, + 
int bch2_btree_delete_range(struct bch_fs *, enum btree_id, + struct bpos, struct bpos, unsigned, u64 *); + ++int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter *, bool); + int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); + int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); + +@@ -161,6 +159,7 @@ void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); + int __bch2_trans_commit(struct btree_trans *, unsigned); + ++int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); + __printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); + __printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); + +@@ -244,7 +243,8 @@ static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *tra + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + + static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, +- struct bkey_s_c *k, unsigned flags, ++ struct bkey_s_c *k, ++ enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned min_bytes) + { + struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); +@@ -261,8 +261,9 @@ static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, str + return mut; + } + +-static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, +- struct bkey_s_c *k, unsigned flags) ++static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, ++ struct btree_iter *iter, struct bkey_s_c *k, ++ enum btree_iter_update_trigger_flags flags) + { + return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); + } +@@ -274,7 +275,8 @@ static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struc + static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos 
pos, +- unsigned flags, unsigned type, unsigned min_bytes) ++ enum btree_iter_update_trigger_flags flags, ++ unsigned type, unsigned min_bytes) + { + struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, + btree_id, pos, flags|BTREE_ITER_intent, type); +@@ -289,7 +291,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr + static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, +- unsigned flags) ++ enum btree_iter_update_trigger_flags flags) + { + return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); + } +@@ -297,7 +299,8 @@ static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *tran + static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, +- unsigned flags, unsigned type, unsigned min_bytes) ++ enum btree_iter_update_trigger_flags flags, ++ unsigned type, unsigned min_bytes) + { + struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, + btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); +@@ -318,7 +321,8 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, + static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, +- unsigned flags, unsigned min_bytes) ++ enum btree_iter_update_trigger_flags flags, ++ unsigned min_bytes) + { + return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); + } +@@ -326,7 +330,7 @@ static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans + static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, +- unsigned flags) ++ enum btree_iter_update_trigger_flags flags) + { + return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); + 
} +@@ -337,7 +341,8 @@ static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + + static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, +- unsigned flags, unsigned type, unsigned val_size) ++ enum btree_iter_update_trigger_flags flags, ++ unsigned type, unsigned val_size) + { + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); + int ret; +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +index d596ef93239f..ab111fec1701 100644 +--- a/fs/bcachefs/btree_update_interior.c ++++ b/fs/bcachefs/btree_update_interior.c +@@ -58,11 +58,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) + !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, + b->data->min_key)); + ++ bch2_bkey_buf_init(&prev); ++ bkey_init(&prev.k->k); ++ bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); ++ + if (b == btree_node_root(c, b)) { + if (!bpos_eq(b->data->min_key, POS_MIN)) { + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->min_key); +- need_fsck_err(trans, btree_root_bad_min_key, ++ log_fsck_err(trans, btree_root_bad_min_key, + "btree root with incorrect min_key: %s", buf.buf); + goto topology_repair; + } +@@ -70,18 +74,14 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) + if (!bpos_eq(b->data->max_key, SPOS_MAX)) { + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->max_key); +- need_fsck_err(trans, btree_root_bad_max_key, ++ log_fsck_err(trans, btree_root_bad_max_key, + "btree root with incorrect max_key: %s", buf.buf); + goto topology_repair; + } + } + + if (!b->c.level) +- return 0; +- +- bch2_bkey_buf_init(&prev); +- bkey_init(&prev.k->k); +- bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); ++ goto out; + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + if (k.k->type != KEY_TYPE_btree_ptr_v2) +@@ -97,16 
+97,16 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) + bch2_topology_error(c); + + printbuf_reset(&buf); +- prt_str(&buf, "end of prev node doesn't match start of next node\n"), +- prt_printf(&buf, " in btree %s level %u node ", +- bch2_btree_id_str(b->c.btree_id), b->c.level); ++ prt_str(&buf, "end of prev node doesn't match start of next node\n in "); ++ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); ++ prt_str(&buf, " node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n prev "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_str(&buf, "\n next "); + bch2_bkey_val_to_text(&buf, c, k); + +- need_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); ++ log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); + goto topology_repair; + } + +@@ -118,25 +118,25 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) + bch2_topology_error(c); + + printbuf_reset(&buf); +- prt_str(&buf, "empty interior node\n"); +- prt_printf(&buf, " in btree %s level %u node ", +- bch2_btree_id_str(b->c.btree_id), b->c.level); ++ prt_str(&buf, "empty interior node\n in "); ++ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); ++ prt_str(&buf, " node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + +- need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); ++ log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); + goto topology_repair; + } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + bch2_topology_error(c); + + printbuf_reset(&buf); +- prt_str(&buf, "last child node doesn't end at end of parent node\n"); +- prt_printf(&buf, " in btree %s level %u node ", +- bch2_btree_id_str(b->c.btree_id), b->c.level); ++ prt_str(&buf, "last child node doesn't end at end of parent node\n in "); ++ bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); ++ prt_str(&buf, " node 
"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "\n last key "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + +- need_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); ++ log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); + goto topology_repair; + } + out: +@@ -146,13 +146,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) + printbuf_exit(&buf); + return ret; + topology_repair: +- if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && +- c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { +- bch2_inconsistent_error(c); +- ret = -BCH_ERR_btree_need_topology_repair; +- } else { +- ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); +- } ++ ret = bch2_topology_error(c); + goto out; + } + +@@ -244,7 +238,6 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, + struct btree *b) + { + struct bch_fs *c = trans->c; +- unsigned i, level = b->c.level; + + bch2_btree_node_lock_write_nofail(trans, path, &b->c); + +@@ -255,13 +248,9 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans, + mutex_unlock(&c->btree_cache.lock); + + six_unlock_write(&b->c.lock); +- mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); ++ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); + +- trans_for_each_path(trans, path, i) +- if (path->l[level].b == b) { +- btree_node_unlock(trans, path, level); +- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); +- } ++ bch2_trans_node_drop(trans, b); + } + + static void bch2_btree_node_free_never_used(struct btree_update *as, +@@ -270,8 +259,6 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, + { + struct bch_fs *c = as->c; + struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; +- struct btree_path *path; +- unsigned i, level = b->c.level; + + 
BUG_ON(!list_empty(&b->write_blocked)); + BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); +@@ -293,11 +280,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as, + + six_unlock_intent(&b->c.lock); + +- trans_for_each_path(trans, path, i) +- if (path->l[level].b == b) { +- btree_node_unlock(trans, path, level); +- path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); +- } ++ bch2_trans_node_drop(trans, b); + } + + static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, +@@ -809,7 +792,7 @@ static void btree_update_nodes_written(struct btree_update *as) + mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); + six_unlock_write(&b->c.lock); + +- btree_node_write_if_need(c, b, SIX_LOCK_intent); ++ btree_node_write_if_need(trans, b, SIX_LOCK_intent); + btree_node_unlock(trans, path, b->c.level); + bch2_path_put(trans, path_idx, true); + } +@@ -830,7 +813,7 @@ static void btree_update_nodes_written(struct btree_update *as) + b = as->new_nodes[i]; + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); +- btree_node_write_if_need(c, b, SIX_LOCK_read); ++ btree_node_write_if_need(trans, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + } + +@@ -1366,9 +1349,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) + bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); + +- if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), +- btree_node_type(b), BCH_VALIDATE_write) ?: +- bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) { ++ struct bkey_validate_context from = (struct bkey_validate_context) { ++ .from = BKEY_VALIDATE_btree_node, ++ .level = b->c.level, ++ .btree = b->c.btree_id, ++ .flags = BCH_VALIDATE_commit, ++ }; ++ if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?: ++ bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) { + bch2_fs_inconsistent(c, "%s: 
inserting invalid bkey", __func__); + dump_stack(); + } +@@ -1418,15 +1406,26 @@ bch2_btree_insert_keys_interior(struct btree_update *as, + (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) + ; + +- while (!bch2_keylist_empty(keys)) { +- insert = bch2_keylist_front(keys); ++ for (; ++ insert != keys->top && bpos_le(insert->k.p, b->key.k.p); ++ insert = bkey_next(insert)) ++ bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); + +- if (bpos_gt(insert->k.p, b->key.k.p)) +- break; ++ if (bch2_btree_node_check_topology(trans, b)) { ++ struct printbuf buf = PRINTBUF; + +- bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); +- bch2_keylist_pop_front(keys); ++ for (struct bkey_i *k = keys->keys; ++ k != insert; ++ k = bkey_next(k)) { ++ bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); ++ prt_newline(&buf); ++ } ++ ++ panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf); + } ++ ++ memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data); ++ keys->top_p -= insert->_data - keys->keys_p; + } + + static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) +@@ -1575,8 +1574,6 @@ static void btree_split_insert_keys(struct btree_update *as, + bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); + + bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); +- +- BUG_ON(bch2_btree_node_check_topology(trans, b)); + } + } + +@@ -1599,8 +1596,6 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, + if (ret) + return ret; + +- bch2_btree_interior_update_will_free_node(as, b); +- + if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { + struct btree *n[2]; + +@@ -1699,16 +1694,18 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, + if (ret) + goto err; + ++ bch2_btree_interior_update_will_free_node(as, b); ++ + if (n3) { + bch2_btree_update_get_open_buckets(as, n3); +- bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); 
++ bch2_btree_node_write_trans(trans, n3, SIX_LOCK_intent, 0); + } + if (n2) { + bch2_btree_update_get_open_buckets(as, n2); +- bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); ++ bch2_btree_node_write_trans(trans, n2, SIX_LOCK_intent, 0); + } + bch2_btree_update_get_open_buckets(as, n1); +- bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); ++ bch2_btree_node_write_trans(trans, n1, SIX_LOCK_intent, 0); + + /* + * The old node must be freed (in memory) _before_ unlocking the new +@@ -1827,8 +1824,6 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t + + btree_update_updated_node(as, b); + bch2_btree_node_unlock_write(trans, path, b); +- +- BUG_ON(bch2_btree_node_check_topology(trans, b)); + return 0; + split: + /* +@@ -1905,7 +1900,7 @@ static void __btree_increase_depth(struct btree_update *as, struct btree_trans * + BUG_ON(ret); + + bch2_btree_update_get_open_buckets(as, n); +- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); ++ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); + bch2_trans_node_add(trans, path, n); + six_unlock_intent(&n->c.lock); + +@@ -1953,8 +1948,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + u64 start_time = local_clock(); + int ret = 0; + +- bch2_trans_verify_not_in_restart(trans); +- bch2_trans_verify_not_unlocked(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + BUG_ON(!trans->paths[path].should_be_locked); + BUG_ON(!btree_node_locked(&trans->paths[path], level)); + +@@ -2058,9 +2052,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + + trace_and_count(c, btree_node_merge, trans, b); + +- bch2_btree_interior_update_will_free_node(as, b); +- bch2_btree_interior_update_will_free_node(as, m); +- + n = bch2_btree_node_alloc(as, trans, b->c.level); + + SET_BTREE_NODE_SEQ(n->data, +@@ -2096,10 +2087,13 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, + if (ret) + goto err_free_update; + ++ bch2_btree_interior_update_will_free_node(as, 
b); ++ bch2_btree_interior_update_will_free_node(as, m); ++ + bch2_trans_verify_paths(trans); + + bch2_btree_update_get_open_buckets(as, n); +- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); ++ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); + + bch2_btree_node_free_inmem(trans, trans->paths + path, b); + bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m); +@@ -2150,8 +2144,6 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, + if (ret) + goto out; + +- bch2_btree_interior_update_will_free_node(as, b); +- + n = bch2_btree_node_alloc_replacement(as, trans, b); + + bch2_btree_build_aux_trees(n); +@@ -2175,8 +2167,10 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, + if (ret) + goto err; + ++ bch2_btree_interior_update_will_free_node(as, b); ++ + bch2_btree_update_get_open_buckets(as, n); +- bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); ++ bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0); + + bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b); + +@@ -2195,48 +2189,76 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, + goto out; + } + ++int bch2_btree_node_rewrite_key(struct btree_trans *trans, ++ enum btree_id btree, unsigned level, ++ struct bpos pos, unsigned flags) ++{ ++ BUG_ON(!level); ++ ++ /* Traverse one depth lower to get a pointer to the node itself: */ ++ struct btree_iter iter; ++ bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); ++ struct btree *b = bch2_btree_iter_peek_node(&iter); ++ int ret = PTR_ERR_OR_ZERO(b); ++ if (ret) ++ goto err; ++ ++ ret = bch2_btree_node_rewrite(trans, &iter, b, flags); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ + struct async_btree_rewrite { + struct bch_fs *c; + struct work_struct work; + struct list_head list; + enum btree_id btree_id; + unsigned level; +- struct bpos pos; +- __le64 seq; ++ struct bkey_buf key; + }; + + static int async_btree_node_rewrite_trans(struct btree_trans *trans, + struct 
async_btree_rewrite *a) + { +- struct bch_fs *c = trans->c; + struct btree_iter iter; +- struct btree *b; +- int ret; +- +- bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, ++ bch2_trans_node_iter_init(trans, &iter, ++ a->btree_id, a->key.k->k.p, + BTREE_MAX_DEPTH, a->level, 0); +- b = bch2_btree_iter_peek_node(&iter); +- ret = PTR_ERR_OR_ZERO(b); ++ struct btree *b = bch2_btree_iter_peek_node(&iter); ++ int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto out; + +- if (!b || b->data->keys.seq != a->seq) { ++ bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k); ++ ret = found ++ ? bch2_btree_node_rewrite(trans, &iter, b, 0) ++ : -ENOENT; ++ ++#if 0 ++ /* Tracepoint... */ ++ if (!ret || ret == -ENOENT) { ++ struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + +- if (b) +- bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); +- else +- prt_str(&buf, "(null"); +- bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s", +- __func__, a->seq, buf.buf); ++ if (!ret) { ++ prt_printf(&buf, "rewrite node:\n "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); ++ } else { ++ prt_printf(&buf, "node to rewrite not found:\n want: "); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); ++ prt_printf(&buf, "\n got: "); ++ if (b) ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); ++ else ++ prt_str(&buf, "(null)"); ++ } ++ bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); +- goto out; + } +- +- ret = bch2_btree_node_rewrite(trans, &iter, b, 0); ++#endif + out: + bch2_trans_iter_exit(trans, &iter); +- + return ret; + } + +@@ -2247,81 +2269,97 @@ static void async_btree_node_rewrite_work(struct work_struct *work) + struct bch_fs *c = a->c; + + int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); +- bch_err_fn_ratelimited(c, ret); ++ if (ret != -ENOENT) ++ bch_err_fn_ratelimited(c, ret); ++ ++ spin_lock(&c->btree_node_rewrites_lock); ++ list_del(&a->list); ++ 
spin_unlock(&c->btree_node_rewrites_lock); ++ ++ closure_wake_up(&c->btree_node_rewrites_wait); ++ ++ bch2_bkey_buf_exit(&a->key, c); + bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); + kfree(a); + } + + void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) + { +- struct async_btree_rewrite *a; +- int ret; +- +- a = kmalloc(sizeof(*a), GFP_NOFS); +- if (!a) { +- bch_err(c, "%s: error allocating memory", __func__); ++ struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS); ++ if (!a) + return; +- } + + a->c = c; + a->btree_id = b->c.btree_id; + a->level = b->c.level; +- a->pos = b->key.k.p; +- a->seq = b->data->keys.seq; + INIT_WORK(&a->work, async_btree_node_rewrite_work); + +- if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { +- mutex_lock(&c->pending_node_rewrites_lock); +- list_add(&a->list, &c->pending_node_rewrites); +- mutex_unlock(&c->pending_node_rewrites_lock); +- return; +- } ++ bch2_bkey_buf_init(&a->key); ++ bch2_bkey_buf_copy(&a->key, c, &b->key); + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { +- if (test_bit(BCH_FS_started, &c->flags)) { +- bch_err(c, "%s: error getting c->writes ref", __func__); +- kfree(a); +- return; +- } ++ bool now = false, pending = false; + +- ret = bch2_fs_read_write_early(c); +- bch_err_msg(c, ret, "going read-write"); +- if (ret) { +- kfree(a); +- return; +- } ++ spin_lock(&c->btree_node_rewrites_lock); ++ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && ++ bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { ++ list_add(&a->list, &c->btree_node_rewrites); ++ now = true; ++ } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { ++ list_add(&a->list, &c->btree_node_rewrites_pending); ++ pending = true; ++ } ++ spin_unlock(&c->btree_node_rewrites_lock); + +- bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); ++ if (now) { ++ queue_work(c->btree_node_rewrite_worker, &a->work); ++ } else if (pending) { ++ /* bch2_do_pending_node_rewrites will execute */ ++ } else 
{ ++ bch2_bkey_buf_exit(&a->key, c); ++ kfree(a); + } ++} + +- queue_work(c->btree_node_rewrite_worker, &a->work); ++void bch2_async_btree_node_rewrites_flush(struct bch_fs *c) ++{ ++ closure_wait_event(&c->btree_node_rewrites_wait, ++ list_empty(&c->btree_node_rewrites)); + } + + void bch2_do_pending_node_rewrites(struct bch_fs *c) + { +- struct async_btree_rewrite *a, *n; +- +- mutex_lock(&c->pending_node_rewrites_lock); +- list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { +- list_del(&a->list); ++ while (1) { ++ spin_lock(&c->btree_node_rewrites_lock); ++ struct async_btree_rewrite *a = ++ list_pop_entry(&c->btree_node_rewrites_pending, ++ struct async_btree_rewrite, list); ++ if (a) ++ list_add(&a->list, &c->btree_node_rewrites); ++ spin_unlock(&c->btree_node_rewrites_lock); ++ ++ if (!a) ++ break; + + bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + queue_work(c->btree_node_rewrite_worker, &a->work); + } +- mutex_unlock(&c->pending_node_rewrites_lock); + } + + void bch2_free_pending_node_rewrites(struct bch_fs *c) + { +- struct async_btree_rewrite *a, *n; ++ while (1) { ++ spin_lock(&c->btree_node_rewrites_lock); ++ struct async_btree_rewrite *a = ++ list_pop_entry(&c->btree_node_rewrites_pending, ++ struct async_btree_rewrite, list); ++ spin_unlock(&c->btree_node_rewrites_lock); + +- mutex_lock(&c->pending_node_rewrites_lock); +- list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { +- list_del(&a->list); ++ if (!a) ++ break; + ++ bch2_bkey_buf_exit(&a->key, c); + kfree(a); + } +- mutex_unlock(&c->pending_node_rewrites_lock); + } + + static int __bch2_btree_node_update_key(struct btree_trans *trans, +@@ -2575,8 +2613,9 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update + prt_printf(out, "%ps: ", (void *) as->ip_started); + bch2_trans_commit_flags_to_text(out, as->flags); + +- prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", +- 
bch2_btree_id_str(as->btree_id), ++ prt_str(out, " "); ++ bch2_btree_id_to_text(out, as->btree_id); ++ prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", + as->update_level_start, + as->update_level_end, + bch2_btree_update_modes[as->mode], +@@ -2677,6 +2716,9 @@ void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c) + + void bch2_fs_btree_interior_update_exit(struct bch_fs *c) + { ++ WARN_ON(!list_empty(&c->btree_node_rewrites)); ++ WARN_ON(!list_empty(&c->btree_node_rewrites_pending)); ++ + if (c->btree_node_rewrite_worker) + destroy_workqueue(c->btree_node_rewrite_worker); + if (c->btree_interior_update_worker) +@@ -2692,8 +2734,9 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) + mutex_init(&c->btree_interior_update_lock); + INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); + +- INIT_LIST_HEAD(&c->pending_node_rewrites); +- mutex_init(&c->pending_node_rewrites_lock); ++ INIT_LIST_HEAD(&c->btree_node_rewrites); ++ INIT_LIST_HEAD(&c->btree_node_rewrites_pending); ++ spin_lock_init(&c->btree_node_rewrites_lock); + } + + int bch2_fs_btree_interior_update_init(struct bch_fs *c) +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +index 10f400957f21..fa5a88f95d89 100644 +--- a/fs/bcachefs/btree_update_interior.h ++++ b/fs/bcachefs/btree_update_interior.h +@@ -159,7 +159,7 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, + unsigned level, + unsigned flags) + { +- bch2_trans_verify_not_unlocked(trans); ++ bch2_trans_verify_not_unlocked_or_in_restart(trans); + + return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, + btree_prev_sib) ?: +@@ -169,7 +169,11 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, + + int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, + struct btree *, unsigned); ++int bch2_btree_node_rewrite_key(struct btree_trans *, ++ enum 
btree_id, unsigned, ++ struct bpos, unsigned); + void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); ++ + int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, + struct btree *, struct bkey_i *, + unsigned, bool); +@@ -334,6 +338,7 @@ void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); + struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, + struct jset_entry *, unsigned long); + ++void bch2_async_btree_node_rewrites_flush(struct bch_fs *); + void bch2_do_pending_node_rewrites(struct bch_fs *); + void bch2_free_pending_node_rewrites(struct bch_fs *); + +diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c +index 1639c60dffa0..b56c4987b8c9 100644 +--- a/fs/bcachefs/btree_write_buffer.c ++++ b/fs/bcachefs/btree_write_buffer.c +@@ -19,8 +19,6 @@ + static int bch2_btree_write_buffer_journal_flush(struct journal *, + struct journal_entry_pin *, u64); + +-static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); +- + static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) + { + return (cmp_int(l->hi, r->hi) ?: +@@ -314,6 +312,8 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) + darray_for_each(wb->sorted, i) { + struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; + ++ BUG_ON(!btree_type_uses_write_buffer(k->btree)); ++ + for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) + prefetch(&wb->flushing.keys.data[n->idx]); + +@@ -481,21 +481,55 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) + return ret; + } + +-static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) ++static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) ++{ ++ struct journal_keys_to_wb dst; ++ int ret = 0; ++ ++ bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); ++ 
++ for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { ++ jset_entry_for_each_key(entry, k) { ++ ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); ++ if (ret) ++ goto out; ++ } ++ ++ entry->type = BCH_JSET_ENTRY_btree_keys; ++ } ++out: ++ ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; ++ return ret; ++} ++ ++static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) + { + struct journal *j = &c->journal; + struct journal_buf *buf; ++ bool blocked; + int ret = 0; + +- while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { ++ while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) { + ret = bch2_journal_keys_to_write_buffer(c, buf); ++ ++ if (!blocked && !ret) { ++ spin_lock(&j->lock); ++ buf->need_flush_to_write_buffer = false; ++ spin_unlock(&j->lock); ++ } ++ + mutex_unlock(&j->buf_lock); ++ ++ if (blocked) { ++ bch2_journal_unblock(j); ++ break; ++ } + } + + return ret; + } + +-static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, ++static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq, + bool *did_work) + { + struct bch_fs *c = trans->c; +@@ -505,7 +539,7 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, + do { + bch2_trans_unlock(trans); + +- fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); ++ fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq); + + *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; + +@@ -518,8 +552,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, + mutex_unlock(&wb->flushing.lock); + } while (!ret && + (fetch_from_journal_err || +- (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || +- (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); ++ (wb->inc.pin.seq && wb->inc.pin.seq <= max_seq) || ++ (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq))); + + return ret; + } +@@ -600,6 +634,14 @@ int 
bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, + bch2_bkey_buf_init(&tmp); + + if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { ++ if (trace_write_buffer_maybe_flush_enabled()) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_bkey_val_to_text(&buf, c, referring_k); ++ trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf); ++ printbuf_exit(&buf); ++ } ++ + bch2_bkey_buf_reassemble(&tmp, c, referring_k); + + if (bkey_is_btree_ptr(referring_k.k)) { +@@ -771,31 +813,6 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ + return ret; + } + +-static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) +-{ +- struct journal_keys_to_wb dst; +- int ret = 0; +- +- bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); +- +- for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { +- jset_entry_for_each_key(entry, k) { +- ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); +- if (ret) +- goto out; +- } +- +- entry->type = BCH_JSET_ENTRY_btree_keys; +- } +- +- spin_lock(&c->journal.lock); +- buf->need_flush_to_write_buffer = false; +- spin_unlock(&c->journal.lock); +-out: +- ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; +- return ret; +-} +- + static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) + { + if (wb->keys.size >= new_size) +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +index ec7d9a59bea9..345b117a4a4a 100644 +--- a/fs/bcachefs/buckets.c ++++ b/fs/bcachefs/buckets.c +@@ -18,7 +18,9 @@ + #include "error.h" + #include "inode.h" + #include "movinggc.h" ++#include "rebalance.h" + #include "recovery.h" ++#include "recovery_passes.h" + #include "reflink.h" + #include "replicas.h" + #include "subvolume.h" +@@ -260,8 +262,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, + struct printbuf buf = PRINTBUF; + int ret = 0; + +- percpu_down_read(&c->mark_lock); +- + 
bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { + ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); + if (ret) +@@ -362,7 +362,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, + bch_info(c, "new key %s", buf.buf); + } + +- percpu_up_read(&c->mark_lock); + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, + BTREE_ITER_intent|BTREE_ITER_all_snapshots); +@@ -371,8 +370,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun); + bch2_trans_iter_exit(trans, &iter); +- percpu_down_read(&c->mark_lock); +- + if (ret) + goto err; + +@@ -380,7 +377,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, + bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); + } + err: +- percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); + return ret; + } +@@ -401,8 +397,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + BUG_ON(!sectors); + + if (gen_after(ptr->gen, b_gen)) { +- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- ptr_gen_newer_than_bucket_gen, ++ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); ++ log_fsck_err(trans, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, +@@ -415,8 +411,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + } + + if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { +- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- ptr_too_stale, ++ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); ++ log_fsck_err(trans, ptr_too_stale, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, +@@ -435,8 +431,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + } + + if (b_gen != ptr->gen) { +- bch2_fsck_err(trans, 
FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- stale_dirty_ptr, ++ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); ++ log_fsck_err(trans, stale_dirty_ptr, + "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, +@@ -451,8 +447,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + } + + if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { +- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- ptr_bucket_data_type_mismatch, ++ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); ++ log_fsck_err(trans, ptr_bucket_data_type_mismatch, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, +@@ -466,8 +462,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + } + + if ((u64) *bucket_sectors + sectors > U32_MAX) { +- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- bucket_sector_count_overflow, ++ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); ++ log_fsck_err(trans, bucket_sector_count_overflow, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, +@@ -485,7 +481,9 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, + printbuf_exit(&buf); + return ret; + err: ++fsck_err: + bch2_dump_trans_updates(trans); ++ bch2_inconsistent_error(c); + ret = -BCH_ERR_bucket_ref_update; + goto out; + } +@@ -543,7 +541,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, + struct bkey_s_c k, + const struct extent_ptr_decoded *p, + s64 sectors, enum bch_data_type ptr_data_type, +- struct bch_alloc_v4 *a) ++ struct bch_alloc_v4 *a, ++ bool insert) + { + u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : + !p->ptr.cached ? 
&a->dirty_sectors : +@@ -553,8 +552,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, + + if (ret) + return ret; +- +- alloc_data_type_set(a, ptr_data_type); ++ if (insert) ++ alloc_data_type_set(a, ptr_data_type); + return 0; + } + +@@ -570,8 +569,10 @@ static int bch2_trigger_pointer(struct btree_trans *trans, + struct printbuf buf = PRINTBUF; + int ret = 0; + +- u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); +- *sectors = insert ? abs_sectors : -abs_sectors; ++ struct bkey_i_backpointer bp; ++ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp); ++ ++ *sectors = insert ? bp.v.bucket_len : -(s64) bp.v.bucket_len; + + struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); + if (unlikely(!ca)) { +@@ -580,41 +581,36 @@ static int bch2_trigger_pointer(struct btree_trans *trans, + goto err; + } + +- struct bpos bucket; +- struct bch_backpointer bp; +- __bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors); ++ struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); + + if (flags & BTREE_TRIGGER_transactional) { + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); + ret = PTR_ERR_OR_ZERO(a) ?: +- __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v); ++ __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); + if (ret) + goto err; + + if (!p.ptr.cached) { +- ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert); ++ ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); + if (ret) + goto err; + } + } + + if (flags & BTREE_TRIGGER_gc) { +- percpu_down_read(&c->mark_lock); + struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + p.ptr.dev, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = -BCH_ERR_trigger_pointer; +- goto err_unlock; ++ goto err; + } + + bucket_lock(g); + struct bch_alloc_v4 old = 
bucket_m_to_alloc(*g), new = old; +- ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new); ++ ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); + alloc_to_bucket(g, new); + bucket_unlock(g); +-err_unlock: +- percpu_up_read(&c->mark_lock); + + if (!ret) + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); +@@ -951,6 +947,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + enum bch_data_type type, + unsigned sectors) + { ++ struct bch_fs *c = trans->c; + struct btree_iter iter; + int ret = 0; + +@@ -960,8 +957,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + return PTR_ERR(a); + + if (a->v.data_type && type && a->v.data_type != type) { +- bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, +- bucket_metadata_type_mismatch, ++ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); ++ log_fsck_err(trans, bucket_metadata_type_mismatch, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + iter.pos.inode, iter.pos.offset, a->v.gen, +@@ -979,6 +976,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + } + err: ++fsck_err: + bch2_trans_iter_exit(trans, &iter); + return ret; + } +@@ -990,11 +988,10 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * + struct bch_fs *c = trans->c; + int ret = 0; + +- percpu_down_read(&c->mark_lock); + struct bucket *g = gc_bucket(ca, b); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", + ca->dev_idx, bch2_data_type_str(data_type))) +- goto err_unlock; ++ goto err; + + bucket_lock(g); + struct bch_alloc_v4 old = bucket_m_to_alloc(*g); +@@ -1004,26 +1001,24 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * + "different types of data in same bucket: %s, %s", + 
bch2_data_type_str(g->data_type), + bch2_data_type_str(data_type))) +- goto err; ++ goto err_unlock; + + if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, + "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", + ca->dev_idx, b, g->gen, + bch2_data_type_str(g->data_type ?: data_type), + g->dirty_sectors, sectors)) +- goto err; ++ goto err_unlock; + + g->data_type = data_type; + g->dirty_sectors += sectors; + struct bch_alloc_v4 new = bucket_m_to_alloc(*g); + bucket_unlock(g); +- percpu_up_read(&c->mark_lock); + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); + return ret; +-err: +- bucket_unlock(g); + err_unlock: +- percpu_up_read(&c->mark_lock); ++ bucket_unlock(g); ++err: + return -BCH_ERR_metadata_bucket_inconsistency; + } + +@@ -1155,6 +1150,31 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c) + return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); + } + ++bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ u64 b_offset = bucket_to_sector(ca, b); ++ u64 b_end = bucket_to_sector(ca, b + 1); ++ unsigned i; ++ ++ if (!b) ++ return true; ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ u64 end = offset + (1 << layout->sb_max_size_bits); ++ ++ if (!(offset >= b_end || end <= b_offset)) ++ return true; ++ } ++ ++ for (i = 0; i < ca->journal.nr; i++) ++ if (b == ca->journal.buckets[i]) ++ return true; ++ ++ return false; ++} ++ + /* Disk reservations: */ + + #define SECTORS_CACHE 1024 +@@ -1238,7 +1258,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c) + for_each_member_device(c, ca) { + BUG_ON(ca->buckets_nouse); + +- ca->buckets_nouse = kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * ++ ca->buckets_nouse = bch2_kvmalloc(BITS_TO_LONGS(ca->mi.nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO); + if (!ca->buckets_nouse) { +@@ -1264,10 
+1284,15 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + bool resize = ca->bucket_gens != NULL; + int ret; + +- BUG_ON(resize && ca->buckets_nouse); ++ if (resize) ++ lockdep_assert_held(&c->state_lock); ++ ++ if (resize && ca->buckets_nouse) ++ return -BCH_ERR_no_resize_with_buckets_nouse; + +- if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, +- GFP_KERNEL|__GFP_ZERO))) { ++ bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), ++ GFP_KERNEL|__GFP_ZERO); ++ if (!bucket_gens) { + ret = -BCH_ERR_ENOMEM_bucket_gens; + goto err; + } +@@ -1277,19 +1302,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + bucket_gens->nbuckets_minus_first = + bucket_gens->nbuckets - bucket_gens->first_bucket; + +- if (resize) { +- down_write(&ca->bucket_lock); +- percpu_down_write(&c->mark_lock); +- } +- + old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); + + if (resize) { +- size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); +- ++ bucket_gens->nbuckets = min(bucket_gens->nbuckets, ++ old_bucket_gens->nbuckets); ++ bucket_gens->nbuckets_minus_first = ++ bucket_gens->nbuckets - bucket_gens->first_bucket; + memcpy(bucket_gens->b, + old_bucket_gens->b, +- n); ++ bucket_gens->nbuckets); + } + + rcu_assign_pointer(ca->bucket_gens, bucket_gens); +@@ -1297,11 +1319,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) + + nbuckets = ca->mi.nbuckets; + +- if (resize) { +- percpu_up_write(&c->mark_lock); +- up_write(&ca->bucket_lock); +- } +- + ret = 0; + err: + if (bucket_gens) +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +index ccc78bfe2fd4..a9acdd6c0c86 100644 +--- a/fs/bcachefs/buckets.h ++++ b/fs/bcachefs/buckets.h +@@ -82,16 +82,15 @@ static inline void bucket_lock(struct bucket *b) + + static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) + { +- return genradix_ptr(&ca->buckets_gc, b); ++ return 
bucket_valid(ca, b) ++ ? genradix_ptr(&ca->buckets_gc, b) ++ : NULL; + } + + static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) + { + return rcu_dereference_check(ca->bucket_gens, +- !ca->fs || +- percpu_rwsem_is_held(&ca->fs->mark_lock) || +- lockdep_is_held(&ca->fs->state_lock) || +- lockdep_is_held(&ca->bucket_lock)); ++ lockdep_is_held(&ca->fs->state_lock)); + } + + static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) +@@ -308,26 +307,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, + enum btree_iter_update_trigger_flags); + int bch2_trans_mark_dev_sbs(struct bch_fs *); + +-static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) +-{ +- struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; +- u64 b_offset = bucket_to_sector(ca, b); +- u64 b_end = bucket_to_sector(ca, b + 1); +- unsigned i; +- +- if (!b) +- return true; +- +- for (i = 0; i < layout->nr_superblocks; i++) { +- u64 offset = le64_to_cpu(layout->sb_offset[i]); +- u64 end = offset + (1 << layout->sb_max_size_bits); +- +- if (!(offset >= b_end || end <= b_offset)) +- return true; +- } +- +- return false; +-} ++bool bch2_is_superblock_bucket(struct bch_dev *, u64); + + static inline const char *bch2_data_type_str(enum bch_data_type type) + { +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +index 28bd09a253c8..7174047b8e92 100644 +--- a/fs/bcachefs/buckets_types.h ++++ b/fs/bcachefs/buckets_types.h +@@ -24,7 +24,7 @@ struct bucket_gens { + u16 first_bucket; + size_t nbuckets; + size_t nbuckets_minus_first; +- u8 b[]; ++ u8 b[] __counted_by(nbuckets); + }; + + struct bch_dev_usage { +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +index 2182b555c112..46e9e32105a9 100644 +--- a/fs/bcachefs/chardev.c ++++ b/fs/bcachefs/chardev.c +@@ -6,11 +6,11 @@ + #include "buckets.h" + #include "chardev.h" + #include "disk_accounting.h" ++#include "fsck.h" + #include "journal.h" + #include "move.h" + #include "recovery_passes.h" + #include 
"replicas.h" +-#include "super.h" + #include "super-io.h" + #include "thread_with_file.h" + +@@ -127,130 +127,6 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg + } + #endif + +-struct fsck_thread { +- struct thread_with_stdio thr; +- struct bch_fs *c; +- struct bch_opts opts; +-}; +- +-static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) +-{ +- struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); +- kfree(thr); +-} +- +-static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) +-{ +- struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); +- struct bch_fs *c = thr->c; +- +- int ret = PTR_ERR_OR_ZERO(c); +- if (ret) +- return ret; +- +- ret = bch2_fs_start(thr->c); +- if (ret) +- goto err; +- +- if (test_bit(BCH_FS_errors_fixed, &c->flags)) { +- bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); +- ret |= 1; +- } +- if (test_bit(BCH_FS_error, &c->flags)) { +- bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); +- ret |= 4; +- } +-err: +- bch2_fs_stop(c); +- return ret; +-} +- +-static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { +- .exit = bch2_fsck_thread_exit, +- .fn = bch2_fsck_offline_thread_fn, +-}; +- +-static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) +-{ +- struct bch_ioctl_fsck_offline arg; +- struct fsck_thread *thr = NULL; +- darray_str(devs) = {}; +- long ret = 0; +- +- if (copy_from_user(&arg, user_arg, sizeof(arg))) +- return -EFAULT; +- +- if (arg.flags) +- return -EINVAL; +- +- if (!capable(CAP_SYS_ADMIN)) +- return -EPERM; +- +- for (size_t i = 0; i < arg.nr_devs; i++) { +- u64 dev_u64; +- ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); +- if (ret) +- goto err; +- +- char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); +- ret = PTR_ERR_OR_ZERO(dev_str); +- if (ret) +- goto err; +- +- ret = 
darray_push(&devs, dev_str); +- if (ret) { +- kfree(dev_str); +- goto err; +- } +- } +- +- thr = kzalloc(sizeof(*thr), GFP_KERNEL); +- if (!thr) { +- ret = -ENOMEM; +- goto err; +- } +- +- thr->opts = bch2_opts_empty(); +- +- if (arg.opts) { +- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); +- ret = PTR_ERR_OR_ZERO(optstr) ?: +- bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); +- if (!IS_ERR(optstr)) +- kfree(optstr); +- +- if (ret) +- goto err; +- } +- +- opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); +- opt_set(thr->opts, read_only, 1); +- opt_set(thr->opts, ratelimit_errors, 0); +- +- /* We need request_key() to be called before we punt to kthread: */ +- opt_set(thr->opts, nostart, true); +- +- bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); +- +- thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); +- +- if (!IS_ERR(thr->c) && +- thr->c->opts.errors == BCH_ON_ERROR_panic) +- thr->c->opts.errors = BCH_ON_ERROR_ro; +- +- ret = __bch2_run_thread_with_stdio(&thr->thr); +-out: +- darray_for_each(devs, i) +- kfree(*i); +- darray_exit(&devs); +- return ret; +-err: +- if (thr) +- bch2_fsck_thread_exit(&thr->thr); +- pr_err("ret %s", bch2_err_str(ret)); +- goto out; +-} +- + static long bch2_global_ioctl(unsigned cmd, void __user *arg) + { + long ret; +@@ -775,99 +651,6 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, + return ret; + } + +-static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) +-{ +- struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); +- struct bch_fs *c = thr->c; +- +- c->stdio_filter = current; +- c->stdio = &thr->thr.stdio; +- +- /* +- * XXX: can we figure out a way to do this without mucking with c->opts? 
+- */ +- unsigned old_fix_errors = c->opts.fix_errors; +- if (opt_defined(thr->opts, fix_errors)) +- c->opts.fix_errors = thr->opts.fix_errors; +- else +- c->opts.fix_errors = FSCK_FIX_ask; +- +- c->opts.fsck = true; +- set_bit(BCH_FS_fsck_running, &c->flags); +- +- c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; +- int ret = bch2_run_online_recovery_passes(c); +- +- clear_bit(BCH_FS_fsck_running, &c->flags); +- bch_err_fn(c, ret); +- +- c->stdio = NULL; +- c->stdio_filter = NULL; +- c->opts.fix_errors = old_fix_errors; +- +- up(&c->online_fsck_mutex); +- bch2_ro_ref_put(c); +- return ret; +-} +- +-static const struct thread_with_stdio_ops bch2_online_fsck_ops = { +- .exit = bch2_fsck_thread_exit, +- .fn = bch2_fsck_online_thread_fn, +-}; +- +-static long bch2_ioctl_fsck_online(struct bch_fs *c, +- struct bch_ioctl_fsck_online arg) +-{ +- struct fsck_thread *thr = NULL; +- long ret = 0; +- +- if (arg.flags) +- return -EINVAL; +- +- if (!capable(CAP_SYS_ADMIN)) +- return -EPERM; +- +- if (!bch2_ro_ref_tryget(c)) +- return -EROFS; +- +- if (down_trylock(&c->online_fsck_mutex)) { +- bch2_ro_ref_put(c); +- return -EAGAIN; +- } +- +- thr = kzalloc(sizeof(*thr), GFP_KERNEL); +- if (!thr) { +- ret = -ENOMEM; +- goto err; +- } +- +- thr->c = c; +- thr->opts = bch2_opts_empty(); +- +- if (arg.opts) { +- char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); +- +- ret = PTR_ERR_OR_ZERO(optstr) ?: +- bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); +- if (!IS_ERR(optstr)) +- kfree(optstr); +- +- if (ret) +- goto err; +- } +- +- ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); +-err: +- if (ret < 0) { +- bch_err_fn(c, ret); +- if (thr) +- bch2_fsck_thread_exit(&thr->thr); +- up(&c->online_fsck_mutex); +- bch2_ro_ref_put(c); +- } +- return ret; +-} +- + #define BCH_IOCTL(_name, _argtype) \ + do { \ + _argtype i; \ +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +index ce8fc677bef9..23a383577d4c 100644 +--- 
a/fs/bcachefs/checksum.c ++++ b/fs/bcachefs/checksum.c +@@ -2,6 +2,7 @@ + #include "bcachefs.h" + #include "checksum.h" + #include "errcode.h" ++#include "error.h" + #include "super.h" + #include "super-io.h" + +@@ -252,6 +253,10 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, + if (!bch2_csum_type_is_encryption(type)) + return 0; + ++ if (bch2_fs_inconsistent_on(!c->chacha20, ++ c, "attempting to encrypt without encryption key")) ++ return -BCH_ERR_no_encryption_key; ++ + return do_encrypt(c->chacha20, nonce, data, len); + } + +@@ -337,8 +342,9 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, + size_t sgl_len = 0; + int ret = 0; + +- if (!bch2_csum_type_is_encryption(type)) +- return 0; ++ if (bch2_fs_inconsistent_on(!c->chacha20, ++ c, "attempting to encrypt without encryption key")) ++ return -BCH_ERR_no_encryption_key; + + darray_init(&sgl); + +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +index e40499fde9a4..43b9d71f2f2b 100644 +--- a/fs/bcachefs/checksum.h ++++ b/fs/bcachefs/checksum.h +@@ -109,7 +109,7 @@ int bch2_enable_encryption(struct bch_fs *, bool); + void bch2_fs_encryption_exit(struct bch_fs *); + int bch2_fs_encryption_init(struct bch_fs *); + +-static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, ++static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type, + bool data) + { + switch (type) { +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +index 1410365a8891..114bf2f3879f 100644 +--- a/fs/bcachefs/compress.c ++++ b/fs/bcachefs/compress.c +@@ -2,13 +2,34 @@ + #include "bcachefs.h" + #include "checksum.h" + #include "compress.h" ++#include "error.h" + #include "extents.h" ++#include "io_write.h" ++#include "opts.h" + #include "super-io.h" + + #include + #include + #include + ++static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type) ++{ ++ switch (type) { ++ case BCH_COMPRESSION_TYPE_none: ++ case 
BCH_COMPRESSION_TYPE_incompressible: ++ return BCH_COMPRESSION_OPT_none; ++ case BCH_COMPRESSION_TYPE_lz4_old: ++ case BCH_COMPRESSION_TYPE_lz4: ++ return BCH_COMPRESSION_OPT_lz4; ++ case BCH_COMPRESSION_TYPE_gzip: ++ return BCH_COMPRESSION_OPT_gzip; ++ case BCH_COMPRESSION_TYPE_zstd: ++ return BCH_COMPRESSION_OPT_zstd; ++ default: ++ BUG(); ++ } ++} ++ + /* Bounce buffer: */ + struct bbuf { + void *b; +@@ -158,6 +179,19 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + void *workspace; + int ret; + ++ enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); ++ mempool_t *workspace_pool = &c->compress_workspace[opt]; ++ if (unlikely(!mempool_initialized(workspace_pool))) { ++ if (fsck_err(c, compression_type_not_marked_in_sb, ++ "compression type %s set but not marked in superblock", ++ __bch2_compression_types[crc.compression_type])) ++ ret = bch2_check_set_has_compressed_data(c, opt); ++ else ++ ret = -BCH_ERR_compression_workspace_not_initialized; ++ if (ret) ++ goto out; ++ } ++ + src_data = bio_map_or_bounce(c, src, READ); + + switch (crc.compression_type) { +@@ -176,13 +210,13 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + .avail_out = dst_len, + }; + +- workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); ++ workspace = mempool_alloc(workspace_pool, GFP_NOFS); + + zlib_set_workspace(&strm, workspace); + zlib_inflateInit2(&strm, -MAX_WBITS); + ret = zlib_inflate(&strm, Z_FINISH); + +- mempool_free(workspace, &c->decompress_workspace); ++ mempool_free(workspace, workspace_pool); + + if (ret != Z_STREAM_END) + goto err; +@@ -195,14 +229,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + if (real_src_len > src_len - 4) + goto err; + +- workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); ++ workspace = mempool_alloc(workspace_pool, GFP_NOFS); + ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); + + ret = zstd_decompress_dctx(ctx, + dst_data, 
dst_len, + src_data.b + 4, real_src_len); + +- mempool_free(workspace, &c->decompress_workspace); ++ mempool_free(workspace, workspace_pool); + + if (ret != dst_len) + goto err; +@@ -212,6 +246,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + BUG(); + } + ret = 0; ++fsck_err: + out: + bio_unmap_or_unbounce(c, src_data); + return ret; +@@ -220,11 +255,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, + goto out; + } + +-int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, +- struct bch_extent_crc_unpacked *crc) ++int bch2_bio_uncompress_inplace(struct bch_write_op *op, ++ struct bio *bio) + { ++ struct bch_fs *c = op->c; ++ struct bch_extent_crc_unpacked *crc = &op->crc; + struct bbuf data = { NULL }; + size_t dst_len = crc->uncompressed_size << 9; ++ int ret = 0; + + /* bio must own its pages: */ + BUG_ON(!bio->bi_vcnt); +@@ -232,17 +270,26 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, + + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || + crc->compressed_size << 9 > c->opts.encoded_extent_max) { +- bch_err(c, "error rewriting existing data: extent too big"); ++ struct printbuf buf = PRINTBUF; ++ bch2_write_op_error(&buf, op); ++ prt_printf(&buf, "error rewriting existing data: extent too big"); ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); + return -EIO; + } + + data = __bounce_alloc(c, dst_len, WRITE); + + if (__bio_uncompress(c, bio, data.b, *crc)) { +- if (!c->opts.no_data_io) +- bch_err(c, "error rewriting existing data: decompression error"); +- bio_unmap_or_unbounce(c, data); +- return -EIO; ++ if (!c->opts.no_data_io) { ++ struct printbuf buf = PRINTBUF; ++ bch2_write_op_error(&buf, op); ++ prt_printf(&buf, "error rewriting existing data: decompression error"); ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ ret = -EIO; ++ goto err; + } + + /* +@@ -259,9 +306,9 @@ int bch2_bio_uncompress_inplace(struct bch_fs *c, struct 
bio *bio, + crc->uncompressed_size = crc->live_size; + crc->offset = 0; + crc->csum = (struct bch_csum) { 0, 0 }; +- ++err: + bio_unmap_or_unbounce(c, data); +- return 0; ++ return ret; + } + + int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, +@@ -394,8 +441,21 @@ static unsigned __bio_compress(struct bch_fs *c, + unsigned pad; + int ret = 0; + +- BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); +- BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); ++ /* bch2_compression_decode catches unknown compression types: */ ++ BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR); ++ ++ mempool_t *workspace_pool = &c->compress_workspace[compression.type]; ++ if (unlikely(!mempool_initialized(workspace_pool))) { ++ if (fsck_err(c, compression_opt_not_marked_in_sb, ++ "compression opt %s set but not marked in superblock", ++ bch2_compression_opts[compression.type])) { ++ ret = bch2_check_set_has_compressed_data(c, compression.type); ++ if (ret) /* memory allocation failure, don't compress */ ++ return 0; ++ } else { ++ return 0; ++ } ++ } + + /* If it's only one block, don't bother trying to compress: */ + if (src->bi_iter.bi_size <= c->opts.block_size) +@@ -404,7 +464,7 @@ static unsigned __bio_compress(struct bch_fs *c, + dst_data = bio_map_or_bounce(c, dst, WRITE); + src_data = bio_map_or_bounce(c, src, READ); + +- workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS); ++ workspace = mempool_alloc(workspace_pool, GFP_NOFS); + + *src_len = src->bi_iter.bi_size; + *dst_len = dst->bi_iter.bi_size; +@@ -447,7 +507,7 @@ static unsigned __bio_compress(struct bch_fs *c, + *src_len = round_down(*src_len, block_bytes(c)); + } + +- mempool_free(workspace, &c->compress_workspace[compression_type]); ++ mempool_free(workspace, workspace_pool); + + if (ret) + goto err; +@@ -477,6 +537,9 @@ static unsigned __bio_compress(struct bch_fs *c, + err: + ret = BCH_COMPRESSION_TYPE_incompressible; + goto out; ++fsck_err: ++ ret = 0; ++ goto 
out; + } + + unsigned bch2_bio_compress(struct bch_fs *c, +@@ -559,7 +622,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) + { + unsigned i; + +- mempool_exit(&c->decompress_workspace); + for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) + mempool_exit(&c->compress_workspace[i]); + mempool_exit(&c->compression_bounce[WRITE]); +@@ -568,7 +630,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) + + static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) + { +- size_t decompress_workspace_size = 0; + ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), + c->opts.encoded_extent_max); + +@@ -576,19 +637,17 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) + + struct { + unsigned feature; +- enum bch_compression_type type; ++ enum bch_compression_opts type; + size_t compress_workspace; +- size_t decompress_workspace; + } compression_types[] = { +- { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, +- max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS), +- 0 }, +- { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, +- zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), +- zlib_inflate_workspacesize(), }, +- { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, +- c->zstd_workspace_size, +- zstd_dctx_workspace_bound() }, ++ { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4, ++ max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, ++ { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip, ++ max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), ++ zlib_inflate_workspacesize()) }, ++ { BCH_FEATURE_zstd, BCH_COMPRESSION_OPT_zstd, ++ max(c->zstd_workspace_size, ++ zstd_dctx_workspace_bound()) }, + }, *i; + bool have_compressed = false; + +@@ -613,9 +672,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) + for (i = compression_types; + i < compression_types + ARRAY_SIZE(compression_types); + i++) { +- decompress_workspace_size = +- max(decompress_workspace_size, i->decompress_workspace); +- + if (!(features & (1 << i->feature))) + continue; + 
+@@ -628,11 +684,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) + return -BCH_ERR_ENOMEM_compression_workspace_init; + } + +- if (!mempool_initialized(&c->decompress_workspace) && +- mempool_init_kvmalloc_pool(&c->decompress_workspace, +- 1, decompress_workspace_size)) +- return -BCH_ERR_ENOMEM_decompression_workspace_init; +- + return 0; + } + +diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h +index 607fd5e232c9..bec2f05bfd52 100644 +--- a/fs/bcachefs/compress.h ++++ b/fs/bcachefs/compress.h +@@ -47,8 +47,8 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) + return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; + } + +-int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, +- struct bch_extent_crc_unpacked *); ++struct bch_write_op; ++int bch2_bio_uncompress_inplace(struct bch_write_op *, struct bio *); + int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, + struct bvec_iter, struct bch_extent_crc_unpacked); + unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, +diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h +index 8f4c3f0665c4..c6151495985f 100644 +--- a/fs/bcachefs/darray.h ++++ b/fs/bcachefs/darray.h +@@ -83,7 +83,7 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); + for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) + + #define darray_for_each_reverse(_d, _i) \ +- for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) ++ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) + + #define darray_init(_d) \ + do { \ +diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c +index 8e75a852b358..92c6ac20d993 100644 +--- a/fs/bcachefs/data_update.c ++++ b/fs/bcachefs/data_update.c +@@ -20,6 +20,8 @@ + #include "subvolume.h" + #include "trace.h" + ++#include ++ + static void bkey_put_dev_refs(struct bch_fs *c, 
struct bkey_s_c k) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +@@ -33,7 +35,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + bkey_for_each_ptr(ptrs, ptr) { +- if (!bch2_dev_tryget(c, ptr->dev)) { ++ if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; +@@ -91,15 +93,28 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc + return true; + } + +-static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) ++static noinline void trace_move_extent_finish2(struct data_update *u, ++ struct bkey_i *new, ++ struct bkey_i *insert) + { +- if (trace_move_extent_finish_enabled()) { +- struct printbuf buf = PRINTBUF; ++ struct bch_fs *c = u->op.c; ++ struct printbuf buf = PRINTBUF; + +- bch2_bkey_val_to_text(&buf, c, k); +- trace_move_extent_finish(c, buf.buf); +- printbuf_exit(&buf); +- } ++ prt_newline(&buf); ++ ++ bch2_data_update_to_text(&buf, u); ++ prt_newline(&buf); ++ ++ prt_str_indented(&buf, "new replicas:\t"); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); ++ prt_newline(&buf); ++ ++ prt_str_indented(&buf, "insert:\t"); ++ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); ++ prt_newline(&buf); ++ ++ trace_move_extent_finish(c, buf.buf); ++ printbuf_exit(&buf); + } + + static void trace_move_extent_fail2(struct data_update *m, +@@ -110,11 +125,8 @@ static void trace_move_extent_fail2(struct data_update *m, + { + struct bch_fs *c = m->op.c; + struct bkey_s_c old = bkey_i_to_s_c(m->k.k); +- const union bch_extent_entry *entry; +- struct bch_extent_ptr *ptr; +- struct extent_ptr_decoded p; + struct printbuf buf = PRINTBUF; +- unsigned i, rewrites_found = 0; ++ unsigned rewrites_found = 0; + + if (!trace_move_extent_fail_enabled()) + return; +@@ -122,27 +134,25 @@ static void trace_move_extent_fail2(struct data_update *m, + prt_str(&buf, msg); + + if (insert) { +- i = 0; ++ 
const union bch_extent_entry *entry; ++ struct bch_extent_ptr *ptr; ++ struct extent_ptr_decoded p; ++ ++ unsigned ptr_bit = 1; + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { +- if (((1U << i) & m->data_opts.rewrite_ptrs) && ++ if ((ptr_bit & m->data_opts.rewrite_ptrs) && + (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && + !ptr->cached) +- rewrites_found |= 1U << i; +- i++; ++ rewrites_found |= ptr_bit; ++ ptr_bit <<= 1; + } + } + +- prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u", +- (m->data_opts.rewrite_ptrs & (1 << 0)) != 0, +- (m->data_opts.rewrite_ptrs & (1 << 1)) != 0, +- (m->data_opts.rewrite_ptrs & (1 << 2)) != 0, +- (m->data_opts.rewrite_ptrs & (1 << 3)) != 0); ++ prt_str(&buf, "rewrites found:\t"); ++ bch2_prt_u64_base2(&buf, rewrites_found); ++ prt_newline(&buf); + +- prt_printf(&buf, "\nrewrites found: %u%u%u%u", +- (rewrites_found & (1 << 0)) != 0, +- (rewrites_found & (1 << 1)) != 0, +- (rewrites_found & (1 << 2)) != 0, +- (rewrites_found & (1 << 3)) != 0); ++ bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); +@@ -194,7 +204,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + struct bpos next_pos; + bool should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; +- unsigned rewrites_found = 0, durability, i; ++ unsigned rewrites_found = 0, durability, ptr_bit; + + bch2_trans_begin(trans); + +@@ -231,16 +241,16 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + * + * Fist, drop rewrite_ptrs from @new: + */ +- i = 0; ++ ptr_bit = 1; + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { +- if (((1U << i) & m->data_opts.rewrite_ptrs) && ++ if ((ptr_bit & m->data_opts.rewrite_ptrs) && + (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && + !ptr->cached) { + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), ptr); +- rewrites_found |= 1U << 
i; ++ rewrites_found |= ptr_bit; + } +- i++; ++ ptr_bit <<= 1; + } + + if (m->data_opts.rewrite_ptrs && +@@ -323,8 +333,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + * it's been hard to reproduce, so this should give us some more + * information when it does occur: + */ +- int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), +- BCH_VALIDATE_commit); ++ int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), ++ (struct bkey_validate_context) { ++ .btree = m->btree_id, ++ .flags = BCH_VALIDATE_commit, ++ }); + if (invalid) { + struct printbuf buf = PRINTBUF; + +@@ -362,7 +375,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + k.k->p, bkey_start_pos(&insert->k)) ?: + bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, insert->k.p) ?: +- bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: ++ bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: + bch2_trans_update(trans, &iter, insert, + BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, &op->res, +@@ -374,7 +387,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, + bch2_btree_iter_set_pos(&iter, next_pos); + + this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); +- trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); ++ if (trace_move_extent_finish_enabled()) ++ trace_move_extent_finish2(m, &new->k_i, insert); + } + err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +@@ -414,14 +428,15 @@ int bch2_data_update_index_update(struct bch_write_op *op) + return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); + } + +-void bch2_data_update_read_done(struct data_update *m, +- struct bch_extent_crc_unpacked crc) ++void bch2_data_update_read_done(struct data_update *m) + { ++ m->read_done = true; ++ + /* write bio must own pages: */ + BUG_ON(!m->op.wbio.bio.bi_vcnt); + +- m->op.crc = crc; +- m->op.wbio.bio.bi_iter.bi_size = 
crc.compressed_size << 9; ++ m->op.crc = m->rbio.pick.crc; ++ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; + + closure_call(&m->op.cl, bch2_write, NULL, NULL); + } +@@ -431,31 +446,34 @@ void bch2_data_update_exit(struct data_update *update) + struct bch_fs *c = update->op.c; + struct bkey_s_c k = bkey_i_to_s_c(update->k.k); + ++ bch2_bio_free_pages_pool(c, &update->op.wbio.bio); ++ kfree(update->bvecs); ++ update->bvecs = NULL; ++ + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); + bkey_put_dev_refs(c, k); +- bch2_bkey_buf_exit(&update->k, c); + bch2_disk_reservation_put(c, &update->op.res); +- bch2_bio_free_pages_pool(c, &update->op.wbio.bio); ++ bch2_bkey_buf_exit(&update->k, c); + } + +-static void bch2_update_unwritten_extent(struct btree_trans *trans, +- struct data_update *update) ++static int bch2_update_unwritten_extent(struct btree_trans *trans, ++ struct data_update *update) + { + struct bch_fs *c = update->op.c; +- struct bio *bio = &update->op.wbio.bio; + struct bkey_i_extent *e; + struct write_point *wp; + struct closure cl; + struct btree_iter iter; + struct bkey_s_c k; +- int ret; ++ int ret = 0; + + closure_init_stack(&cl); + bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); + +- while (bio_sectors(bio)) { +- unsigned sectors = bio_sectors(bio); ++ while (bpos_lt(update->op.pos, update->k.k->k.p)) { ++ unsigned sectors = update->k.k->k.p.offset - ++ update->op.pos.offset; + + bch2_trans_begin(trans); + +@@ -491,7 +509,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, + bch_err_fn_ratelimited(c, ret); + + if (ret) +- return; ++ break; + + sectors = min(sectors, wp->sectors_free); + +@@ -501,7 +519,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + +- bio_advance(bio, sectors << 9); + update->op.pos.offset += sectors; + + extent_for_each_ptr(extent_i_to_s(e), 
ptr) +@@ -520,41 +537,60 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, + bch2_trans_unlock(trans); + closure_sync(&cl); + } ++ ++ return ret; + } + + void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) + { +- printbuf_tabstop_push(out, 20); +- prt_str(out, "rewrite ptrs:\t"); ++ if (!out->nr_tabstops) ++ printbuf_tabstop_push(out, 20); ++ ++ prt_str_indented(out, "rewrite ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); + prt_newline(out); + +- prt_str(out, "kill ptrs:\t"); ++ prt_str_indented(out, "kill ptrs:\t"); + bch2_prt_u64_base2(out, data_opts->kill_ptrs); + prt_newline(out); + +- prt_str(out, "target:\t"); ++ prt_str_indented(out, "target:\t"); + bch2_target_to_text(out, c, data_opts->target); + prt_newline(out); + +- prt_str(out, "compression:\t"); +- bch2_compression_opt_to_text(out, background_compression(*io_opts)); ++ prt_str_indented(out, "compression:\t"); ++ bch2_compression_opt_to_text(out, io_opts->background_compression); + prt_newline(out); + +- prt_str(out, "opts.replicas:\t"); ++ prt_str_indented(out, "opts.replicas:\t"); + prt_u64(out, io_opts->data_replicas); ++ prt_newline(out); + +- prt_str(out, "extra replicas:\t"); ++ prt_str_indented(out, "extra replicas:\t"); + prt_u64(out, data_opts->extra_replicas); ++ prt_newline(out); + } + + void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) ++{ ++ bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); ++ prt_newline(out); ++ ++ prt_str_indented(out, "old key:\t"); ++ bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); ++} ++ ++void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) + { + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); + prt_newline(out); ++ printbuf_indent_add(out, 2); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); ++ prt_printf(out, 
"read_done:\t\%u\n", m->read_done); ++ bch2_write_op_to_text(out, &m->op); ++ printbuf_indent_sub(out, 2); + } + + int bch2_extent_drop_ptrs(struct btree_trans *trans, +@@ -600,6 +636,40 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + } + ++static bool can_allocate_without_blocking(struct bch_fs *c, ++ struct data_update *m) ++{ ++ if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) ++ return false; ++ ++ unsigned target = m->op.flags & BCH_WRITE_only_specified_devs ++ ? m->op.target ++ : 0; ++ struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); ++ ++ darray_for_each(m->op.devs_have, i) ++ __clear_bit(*i, devs.d); ++ ++ rcu_read_lock(); ++ unsigned nr_replicas = 0, i; ++ for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { ++ struct bch_dev *ca = bch2_dev_rcu(c, i); ++ ++ struct bch_dev_usage usage; ++ bch2_dev_usage_read_fast(ca, &usage); ++ ++ if (!dev_buckets_free(ca, usage, m->op.watermark)) ++ continue; ++ ++ nr_replicas += ca->mi.durability; ++ if (nr_replicas >= m->op.nr_replicas) ++ break; ++ } ++ rcu_read_unlock(); ++ ++ return nr_replicas >= m->op.nr_replicas; ++} ++ + int bch2_data_update_init(struct btree_trans *trans, + struct btree_iter *iter, + struct moving_context *ctxt, +@@ -614,7 +684,7 @@ int bch2_data_update_init(struct btree_trans *trans, + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; +- unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; ++ unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; + int ret = 0; + + /* +@@ -622,17 +692,8 @@ int bch2_data_update_init(struct btree_trans *trans, + * and we have to check for this because we go rw before repairing the + * snapshots table - just skip it, we can move it later. 
+ */ +- if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) +- return -BCH_ERR_data_update_done; +- +- if (!bkey_get_dev_refs(c, k)) +- return -BCH_ERR_data_update_done; +- +- if (c->opts.nocow_enabled && +- !bkey_nocow_lock(c, ctxt, k)) { +- bkey_put_dev_refs(c, k); +- return -BCH_ERR_nocow_lock_blocked; +- } ++ if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) ++ return -BCH_ERR_data_update_done_no_snapshot; + + bch2_bkey_buf_init(&m->k); + bch2_bkey_buf_reassemble(&m->k, c, k); +@@ -647,27 +708,27 @@ int bch2_data_update_init(struct btree_trans *trans, + m->op.target = data_opts.target; + m->op.write_point = wp; + m->op.nr_replicas = 0; +- m->op.flags |= BCH_WRITE_PAGES_STABLE| +- BCH_WRITE_PAGES_OWNED| +- BCH_WRITE_DATA_ENCODED| +- BCH_WRITE_MOVE| ++ m->op.flags |= BCH_WRITE_pages_stable| ++ BCH_WRITE_pages_owned| ++ BCH_WRITE_data_encoded| ++ BCH_WRITE_move| + m->data_opts.write_flags; +- m->op.compression_opt = background_compression(io_opts); ++ m->op.compression_opt = io_opts.background_compression; + m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; + + unsigned durability_have = 0, durability_removing = 0; + +- i = 0; ++ unsigned ptr_bit = 1; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (!p.ptr.cached) { + rcu_read_lock(); +- if (BIT(i) & m->data_opts.rewrite_ptrs) { ++ if (ptr_bit & m->data_opts.rewrite_ptrs) { + if (crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; + + m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); + durability_removing += bch2_extent_ptr_desired_durability(c, &p); +- } else if (!(BIT(i) & m->data_opts.kill_ptrs)) { ++ } else if (!(ptr_bit & m->data_opts.kill_ptrs)) { + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + durability_have += bch2_extent_ptr_durability(c, &p); + } +@@ -687,7 +748,7 @@ int bch2_data_update_init(struct btree_trans *trans, + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) + 
m->op.incompressible = true; + +- i++; ++ ptr_bit <<= 1; + } + + unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); +@@ -724,7 +785,15 @@ int bch2_data_update_init(struct btree_trans *trans, + /* if iter == NULL, it's just a promote */ + if (iter) + ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); +- goto out; ++ if (!ret) ++ ret = -BCH_ERR_data_update_done_no_writes_needed; ++ goto out_bkey_buf_exit; ++ } ++ ++ if ((m->op.flags & BCH_WRITE_alloc_nowait) && ++ !can_allocate_without_blocking(c, m)) { ++ ret = -BCH_ERR_data_update_done_would_block; ++ goto out_bkey_buf_exit; + } + + if (reserve_sectors) { +@@ -733,31 +802,77 @@ int bch2_data_update_init(struct btree_trans *trans, + ? 0 + : BCH_DISK_RESERVATION_NOFAIL); + if (ret) +- goto out; ++ goto out_bkey_buf_exit; ++ } ++ ++ if (!bkey_get_dev_refs(c, k)) { ++ ret = -BCH_ERR_data_update_done_no_dev_refs; ++ goto out_put_disk_res; ++ } ++ ++ if (c->opts.nocow_enabled && ++ !bkey_nocow_lock(c, ctxt, k)) { ++ ret = -BCH_ERR_nocow_lock_blocked; ++ goto out_put_dev_refs; + } + + if (bkey_extent_is_unwritten(k)) { +- bch2_update_unwritten_extent(trans, m); +- goto out; ++ ret = bch2_update_unwritten_extent(trans, m) ?: ++ -BCH_ERR_data_update_done_unwritten; ++ goto out_nocow_unlock; + } + ++ /* write path might have to decompress data: */ ++ unsigned buf_bytes = 0; ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); ++ ++ unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); ++ ++ m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); ++ if (!m->bvecs) ++ goto enomem; ++ ++ bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); ++ bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); ++ ++ if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) ++ goto enomem; ++ ++ rbio_init(&m->rbio.bio, c, io_opts, NULL); ++ m->rbio.bio.bi_iter.bi_size = buf_bytes; ++ 
m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); ++ ++ bio_set_prio(&m->op.wbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ + return 0; +-out: +- bch2_data_update_exit(m); +- return ret ?: -BCH_ERR_data_update_done; ++enomem: ++ ret = -ENOMEM; ++ kfree(m->bvecs); ++ m->bvecs = NULL; ++out_nocow_unlock: ++ if (c->opts.nocow_enabled) ++ bkey_nocow_unlock(c, k); ++out_put_dev_refs: ++ bkey_put_dev_refs(c, k); ++out_put_disk_res: ++ bch2_disk_reservation_put(c, &m->op.res); ++out_bkey_buf_exit: ++ bch2_bkey_buf_exit(&m->k, c); ++ return ret; + } + + void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- unsigned i = 0; ++ unsigned ptr_bit = 1; + + bkey_for_each_ptr(ptrs, ptr) { +- if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { +- opts->kill_ptrs |= 1U << i; +- opts->rewrite_ptrs ^= 1U << i; ++ if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) { ++ opts->kill_ptrs |= ptr_bit; ++ opts->rewrite_ptrs ^= ptr_bit; + } + +- i++; ++ ptr_bit <<= 1; + } + } +diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h +index e4b50723428e..f4cf5d17cc37 100644 +--- a/fs/bcachefs/data_update.h ++++ b/fs/bcachefs/data_update.h +@@ -4,6 +4,7 @@ + #define _BCACHEFS_DATA_UPDATE_H + + #include "bkey_buf.h" ++#include "io_read.h" + #include "io_write_types.h" + + struct moving_context; +@@ -22,20 +23,24 @@ void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, + + struct data_update { + /* extent being updated: */ ++ bool read_done; + enum btree_id btree_id; + struct bkey_buf k; + struct data_update_opts data_opts; + struct moving_context *ctxt; + struct bch_move_stats *stats; ++ ++ struct bch_read_bio rbio; + struct bch_write_op op; ++ struct bio_vec *bvecs; + }; + + void bch2_data_update_to_text(struct printbuf *, struct data_update *); ++void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); + + int 
bch2_data_update_index_update(struct bch_write_op *); + +-void bch2_data_update_read_done(struct data_update *, +- struct bch_extent_crc_unpacked); ++void bch2_data_update_read_done(struct data_update *); + + int bch2_extent_drop_ptrs(struct btree_trans *, + struct btree_iter *, +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +index 45aec1afdb0e..55333e82d1fe 100644 +--- a/fs/bcachefs/debug.c ++++ b/fs/bcachefs/debug.c +@@ -20,6 +20,7 @@ + #include "extents.h" + #include "fsck.h" + #include "inode.h" ++#include "journal_reclaim.h" + #include "super.h" + + #include +@@ -472,7 +473,9 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + +- prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level); ++ prt_printf(out, "%px ", b); ++ bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); ++ prt_printf(out, "\n"); + + printbuf_indent_add(out, 2); + +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +index faffc98d5605..600eee936f13 100644 +--- a/fs/bcachefs/dirent.c ++++ b/fs/bcachefs/dirent.c +@@ -101,7 +101,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { + }; + + int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + struct qstr d_name = bch2_dirent_get_name(d); +@@ -120,7 +120,7 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, + * Check new keys don't exceed the max length + * (older keys may be larger.) 
+ */ +- bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, ++ bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, + c, dirent_name_too_long, + "dirent name too big (%u > %u)", + d_name.len, BCH_NAME_MAX); +@@ -266,7 +266,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, + } else { + target->subvol = le32_to_cpu(d.v->d_child_subvol); + +- ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s); ++ ret = bch2_subvolume_get(trans, target->subvol, true, &s); + + target->inum = le64_to_cpu(s.inode); + } +@@ -500,7 +500,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 + struct bkey_s_c k; + int ret; + +- for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, ++ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, + SPOS(dir, 0, snapshot), + POS(dir, U64_MAX), 0, k, ret) + if (k.k->type == KEY_TYPE_dirent) { +@@ -549,7 +549,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) + bch2_bkey_buf_init(&sk); + + int ret = bch2_trans_run(c, +- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents, ++ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, + POS(inum.inum, ctx->pos), + POS(inum.inum, U64_MAX), + inum.subvol, 0, k, ({ +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +index 53ad99666022..a633f83c1ac7 100644 +--- a/fs/bcachefs/dirent.h ++++ b/fs/bcachefs/dirent.h +@@ -4,10 +4,10 @@ + + #include "str_hash.h" + +-enum bch_validate_flags; + extern const struct bch_hash_desc bch2_dirent_hash_desc; + +-int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); ++int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); + void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ +@@ -31,6 +31,11 @@ static inline 
unsigned dirent_val_u64s(unsigned len) + sizeof(u64)); + } + ++static inline unsigned int dirent_occupied_size(const struct qstr *name) ++{ ++ return (BKEY_U64s + dirent_val_u64s(name->len)) * sizeof(u64); ++} ++ + int bch2_dirent_read_target(struct btree_trans *, subvol_inum, + struct bkey_s_c_dirent, subvol_inum *); + +diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c +index 07eb8fa1b026..b32e91ba8be8 100644 +--- a/fs/bcachefs/disk_accounting.c ++++ b/fs/bcachefs/disk_accounting.c +@@ -79,6 +79,8 @@ static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_ + memcpy_u64s_small(acc->v.d, d, nr); + } + ++static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); ++ + int bch2_disk_accounting_mod(struct btree_trans *trans, + struct disk_accounting_pos *k, + s64 *d, unsigned nr, bool gc) +@@ -96,9 +98,16 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, + + accounting_key_init(&k_i.k, k, d, nr); + +- return likely(!gc) +- ? 
bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k) +- : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); ++ if (unlikely(gc)) { ++ int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); ++ if (ret == -BCH_ERR_btree_insert_need_mark_replicas) ++ ret = drop_locks_do(trans, ++ bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: ++ bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); ++ return ret; ++ } else { ++ return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); ++ } + } + + int bch2_mod_dev_cached_sectors(struct btree_trans *trans, +@@ -127,14 +136,15 @@ static inline bool is_zero(char *start, char *end) + #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) + + int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + void *end = &acc_k + 1; + int ret = 0; + +- bkey_fsck_err_on(bversion_zero(k.k->bversion), ++ bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && ++ bversion_zero(k.k->bversion), + c, accounting_key_version_0, + "accounting key with version=0"); + +@@ -217,7 +227,8 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po + prt_printf(out, "id=%u", k->snapshot.id); + break; + case BCH_DISK_ACCOUNTING_btree: +- prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id)); ++ prt_str(out, "btree="); ++ bch2_btree_id_to_text(out, k->btree.id); + break; + } + } +@@ -243,10 +254,10 @@ void bch2_accounting_swab(struct bkey_s k) + } + + static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, +- struct disk_accounting_pos acc) ++ struct disk_accounting_pos *acc) + { +- unsafe_memcpy(r, &acc.replicas, +- replicas_entry_bytes(&acc.replicas), ++ unsafe_memcpy(r, &acc->replicas, ++ 
replicas_entry_bytes(&acc->replicas), + "variable length struct"); + } + +@@ -257,7 +268,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc + + switch (acc_k.type) { + case BCH_DISK_ACCOUNTING_replicas: +- __accounting_to_replicas(r, acc_k); ++ __accounting_to_replicas(r, &acc_k); + return true; + default: + return false; +@@ -322,6 +333,14 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun + + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); ++ ++ if (trace_accounting_mem_insert_enabled()) { ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_accounting_to_text(&buf, c, a.s_c); ++ trace_accounting_mem_insert(c, buf.buf); ++ printbuf_exit(&buf); ++ } + return 0; + err: + free_percpu(n.v[1]); +@@ -461,32 +480,6 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc + return ret; + } + +-void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) +-{ +- struct bch_accounting_mem *acc = &c->accounting; +- +- percpu_down_read(&c->mark_lock); +- out->atomic++; +- +- eytzinger0_for_each(i, acc->k.nr) { +- struct disk_accounting_pos acc_k; +- bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos); +- +- bch2_accounting_key_to_text(out, &acc_k); +- +- u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; +- bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); +- +- prt_str(out, ":"); +- for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) +- prt_printf(out, " %llu", v[j]); +- prt_newline(out); +- } +- +- --out->atomic; +- percpu_up_read(&c->mark_lock); +-} +- + static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) + { + darray_for_each(acc->k, e) { +@@ -625,7 +618,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, + switch (acc.type) { + case BCH_DISK_ACCOUNTING_replicas: { + struct bch_replicas_padded r; +- __accounting_to_replicas(&r.e, acc); ++ 
__accounting_to_replicas(&r.e, &acc); + + for (unsigned i = 0; i < r.e.nr_devs; i++) + if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && +@@ -699,11 +692,45 @@ int bch2_accounting_read(struct bch_fs *c) + struct btree_trans *trans = bch2_trans_get(c); + struct printbuf buf = PRINTBUF; + +- int ret = for_each_btree_key(trans, iter, +- BTREE_ID_accounting, POS_MIN, ++ /* ++ * We might run more than once if we rewind to start topology repair or ++ * btree node scan - and those might cause us to get different results, ++ * so we can't just skip if we've already run. ++ * ++ * Instead, zero out any accounting we have: ++ */ ++ percpu_down_write(&c->mark_lock); ++ darray_for_each(acc->k, e) ++ percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); ++ for_each_member_device(c, ca) ++ percpu_memset(ca->usage, 0, sizeof(*ca->usage)); ++ percpu_memset(c->usage, 0, sizeof(*c->usage)); ++ percpu_up_write(&c->mark_lock); ++ ++ struct btree_iter iter; ++ bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, ++ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); ++ iter.flags &= ~BTREE_ITER_with_journal; ++ int ret = for_each_btree_key_continue(trans, iter, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); ++ ++ if (k.k->type != KEY_TYPE_accounting) ++ continue; ++ ++ struct disk_accounting_pos acc_k; ++ bpos_to_disk_accounting_pos(&acc_k, k.k->p); ++ ++ if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) ++ break; ++ ++ if (!bch2_accounting_is_mem(acc_k)) { ++ struct disk_accounting_pos next = { .type = acc_k.type + 1 }; ++ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); ++ continue; ++ } ++ + accounting_read_key(trans, k); + })); + if (ret) +@@ -715,6 +742,12 @@ int bch2_accounting_read(struct bch_fs *c) + + darray_for_each(*keys, i) { + if (i->k->k.type == KEY_TYPE_accounting) { ++ struct disk_accounting_pos acc_k; ++ 
bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); ++ ++ if (!bch2_accounting_is_mem(acc_k)) ++ continue; ++ + struct bkey_s_c k = bkey_i_to_s_c(i->k); + unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, + sizeof(acc->k.data[0]), +@@ -748,15 +781,16 @@ int bch2_accounting_read(struct bch_fs *c) + keys->gap = keys->nr = dst - keys->data; + + percpu_down_write(&c->mark_lock); +- unsigned i = 0; +- while (i < acc->k.nr) { +- unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); + ++ darray_for_each_reverse(acc->k, i) { + struct disk_accounting_pos acc_k; +- bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); ++ bpos_to_disk_accounting_pos(&acc_k, i->pos); + + u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; +- bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); ++ memset(v, 0, sizeof(v)); ++ ++ for (unsigned j = 0; j < i->nr_counters; j++) ++ v[j] = percpu_u64_get(i->v[0] + j); + + /* + * If the entry counters are zeroed, it should be treated as +@@ -765,26 +799,25 @@ int bch2_accounting_read(struct bch_fs *c) + * Remove it, so that if it's re-added it gets re-marked in the + * superblock: + */ +- ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) ++ ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) + ? 
-BCH_ERR_remove_disk_accounting_entry +- : bch2_disk_accounting_validate_late(trans, acc_k, +- v, acc->k.data[idx].nr_counters); ++ : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); + + if (ret == -BCH_ERR_remove_disk_accounting_entry) { +- free_percpu(acc->k.data[idx].v[0]); +- free_percpu(acc->k.data[idx].v[1]); +- darray_remove_item(&acc->k, &acc->k.data[idx]); +- eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), +- accounting_pos_cmp, NULL); ++ free_percpu(i->v[0]); ++ free_percpu(i->v[1]); ++ darray_remove_item(&acc->k, i); + ret = 0; + continue; + } + + if (ret) + goto fsck_err; +- i++; + } + ++ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), ++ accounting_pos_cmp, NULL); ++ + preempt_disable(); + struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); + +@@ -804,7 +837,7 @@ int bch2_accounting_read(struct bch_fs *c) + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + rcu_read_lock(); +- struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev); ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); + if (ca) { + struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; + percpu_u64_set(&d->buckets, v[0]); +@@ -881,10 +914,13 @@ void bch2_verify_accounting_clean(struct bch_fs *c) + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) +- continue; ++ break; + +- if (acc_k.type == BCH_DISK_ACCOUNTING_inum) ++ if (!bch2_accounting_is_mem(acc_k)) { ++ struct disk_accounting_pos next = { .type = acc_k.type + 1 }; ++ bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + continue; ++ } + + bch2_accounting_mem_read(c, k.k->p, v, nr); + +@@ -910,7 +946,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) + break; + case BCH_DISK_ACCOUNTING_dev_data_type: { + rcu_read_lock(); +- struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); + 
if (!ca) { + rcu_read_unlock(); + continue; +diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h +index 4ea6c8a092bc..5360cbb3ec29 100644 +--- a/fs/bcachefs/disk_accounting.h ++++ b/fs/bcachefs/disk_accounting.h +@@ -2,6 +2,7 @@ + #ifndef _BCACHEFS_DISK_ACCOUNTING_H + #define _BCACHEFS_DISK_ACCOUNTING_H + ++#include "btree_update.h" + #include "eytzinger.h" + #include "sb-members.h" + +@@ -62,27 +63,32 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage + + static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) + { +- acc->_pad = p; ++ BUILD_BUG_ON(sizeof(*acc) != sizeof(p)); ++ + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +- bch2_bpos_swab(&acc->_pad); ++ acc->_pad = p; ++#else ++ memcpy_swab(acc, &p, sizeof(p)); + #endif + } + +-static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k) ++static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc) + { +- struct bpos ret = k->_pad; +- ++ struct bpos p; + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +- bch2_bpos_swab(&ret); ++ p = acc->_pad; ++#else ++ memcpy_swab(&p, acc, sizeof(p)); + #endif +- return ret; ++ return p; + } + + int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, + s64 *, unsigned, bool); + int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); + +-int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); ++int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); + void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); + void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + void bch2_accounting_swab(struct bkey_s); +@@ -112,6 +118,12 @@ enum bch_accounting_mode { + int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); + void 
bch2_accounting_mem_gc(struct bch_fs *); + ++static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) ++{ ++ return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && ++ acc.type != BCH_DISK_ACCOUNTING_inum; ++} ++ + /* + * Update in memory counters so they match the btree update we're doing; called + * from transaction commit path +@@ -126,9 +138,10 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, + bpos_to_disk_accounting_pos(&acc_k, a.k->p); + bool gc = mode == BCH_ACCOUNTING_gc; + +- EBUG_ON(gc && !acc->gc_running); ++ if (gc && !acc->gc_running) ++ return 0; + +- if (acc_k.type == BCH_DISK_ACCOUNTING_inum) ++ if (!bch2_accounting_is_mem(acc_k)) + return 0; + + if (mode == BCH_ACCOUNTING_normal) { +@@ -141,7 +154,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, + break; + case BCH_DISK_ACCOUNTING_dev_data_type: + rcu_read_lock(); +- struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); + if (ca) { + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); + this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); +@@ -204,9 +217,45 @@ static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, + bch2_accounting_mem_read_counters(acc, idx, v, nr, false); + } + ++static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) ++{ ++ EBUG_ON(!res->ref); ++ ++ return (struct bversion) { ++ .hi = res->seq >> 32, ++ .lo = (res->seq << 32) | (res->offset + offset), ++ }; ++} ++ ++static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, ++ struct bkey_i_accounting *a, ++ unsigned commit_flags) ++{ ++ a->k.bversion = journal_pos_to_bversion(&trans->journal_res, ++ (u64 *) a - (u64 *) trans->journal_entries); ++ ++ EBUG_ON(bversion_zero(a->k.bversion)); ++ ++ return likely(!(commit_flags & 
BCH_TRANS_COMMIT_skip_accounting_apply)) ++ ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal) ++ : 0; ++} ++ ++static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans, ++ struct bkey_i_accounting *a_i, ++ unsigned commit_flags) ++{ ++ if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { ++ struct bkey_s_accounting a = accounting_i_to_s(a_i); ++ ++ bch2_accounting_neg(a); ++ bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); ++ bch2_accounting_neg(a); ++ } ++} ++ + int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); + int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); +-void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); + + int bch2_gc_accounting_start(struct bch_fs *); + int bch2_gc_accounting_done(struct bch_fs *); +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +index 749dcf368841..b211e90ac54e 100644 +--- a/fs/bcachefs/ec.c ++++ b/fs/bcachefs/ec.c +@@ -26,6 +26,7 @@ + #include "util.h" + + #include ++#include + + #ifdef __KERNEL__ + +@@ -109,7 +110,7 @@ struct ec_bio { + /* Stripes btree keys: */ + + int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + int ret = 0; +@@ -129,7 +130,7 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, + "invalid csum granularity (%u >= 64)", + s->csum_granularity_bits); + +- ret = bch2_bkey_ptrs_validate(c, k, flags); ++ ret = bch2_bkey_ptrs_validate(c, k, from); + fsck_err: + return ret; + } +@@ -304,13 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans, + } + + if (flags & BTREE_TRIGGER_gc) { +- percpu_down_read(&c->mark_lock); + struct bucket *g = gc_bucket(ca, bucket.offset); + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + ptr->dev, + (bch2_bkey_val_to_text(&buf, c, 
s.s_c), buf.buf))) { + ret = -BCH_ERR_mark_stripe; +- goto err_unlock; ++ goto err; + } + + bucket_lock(g); +@@ -318,8 +318,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, + ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); + alloc_to_bucket(g, new); + bucket_unlock(g); +-err_unlock: +- percpu_up_read(&c->mark_lock); ++ + if (!ret) + ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); + } +@@ -732,7 +731,7 @@ static void ec_block_endio(struct bio *bio) + ? BCH_MEMBER_ERROR_write + : BCH_MEMBER_ERROR_read, + "erasure coding %s error: %s", +- bio_data_dir(bio) ? "write" : "read", ++ str_write_read(bio_data_dir(bio)), + bch2_blk_status_to_str(bio->bi_status))) + clear_bit(ec_bio->idx, ec_bio->buf->valid); + +@@ -909,7 +908,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + bch2_bkey_val_to_text(&msgbuf, c, orig_k); + bch_err_ratelimited(c, + "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); +- printbuf_exit(&msgbuf);; ++ printbuf_exit(&msgbuf); + ret = -BCH_ERR_stripe_reconstruct; + goto out; + } +@@ -1275,11 +1274,11 @@ static int ec_stripe_update_extent(struct btree_trans *trans, + struct bch_dev *ca, + struct bpos bucket, u8 gen, + struct ec_stripe_buf *s, +- struct bpos *bp_pos) ++ struct bkey_s_c_backpointer bp, ++ struct bkey_buf *last_flushed) + { + struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + struct bch_fs *c = trans->c; +- struct bch_backpointer bp; + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_extent_ptr *ptr_c; +@@ -1288,33 +1287,26 @@ static int ec_stripe_update_extent(struct btree_trans *trans, + struct bkey_i *n; + int ret, dev, block; + +- ret = bch2_get_next_backpointer(trans, ca, bucket, gen, +- bp_pos, &bp, BTREE_ITER_cached); +- if (ret) +- return ret; +- if (bpos_eq(*bp_pos, SPOS_MAX)) +- return 0; +- +- if (bp.level) { ++ if (bp.v->level) { + struct printbuf buf = PRINTBUF; + struct btree_iter node_iter; + struct 
btree *b; + +- b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); ++ b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); + bch2_trans_iter_exit(trans, &node_iter); + + if (!b) + return 0; + + prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); +- bch2_backpointer_to_text(&buf, &bp); ++ bch2_bkey_val_to_text(&buf, c, bp.s_c); + + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); + return -EIO; + } + +- k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent); ++ k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); + ret = bkey_err(k); + if (ret) + return ret; +@@ -1373,7 +1365,6 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b + struct bch_fs *c = trans->c; + struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; + struct bch_extent_ptr ptr = v->ptrs[block]; +- struct bpos bp_pos = POS_MIN; + int ret = 0; + + struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); +@@ -1382,19 +1373,27 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b + + struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); + +- while (1) { +- ret = commit_do(trans, NULL, NULL, +- BCH_TRANS_COMMIT_no_check_rw| +- BCH_TRANS_COMMIT_no_enospc, +- ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos)); +- if (ret) +- break; +- if (bkey_eq(bp_pos, POS_MAX)) ++ struct bkey_buf last_flushed; ++ bch2_bkey_buf_init(&last_flushed); ++ bkey_init(&last_flushed.k->k); ++ ++ ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, ++ bucket_pos_to_bp_start(ca, bucket_pos), ++ bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, ++ NULL, NULL, ++ BCH_TRANS_COMMIT_no_check_rw| ++ BCH_TRANS_COMMIT_no_enospc, ({ ++ if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0))) + break; + +- bp_pos = bpos_nosnap_successor(bp_pos); +- } ++ if (bp_k.k->type != KEY_TYPE_backpointer) ++ continue; + ++ 
ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, ++ bkey_s_c_to_backpointer(bp_k), &last_flushed); ++ })); ++ ++ bch2_bkey_buf_exit(&last_flushed, c); + bch2_dev_put(ca); + return ret; + } +@@ -1716,7 +1715,7 @@ static void ec_stripe_key_init(struct bch_fs *c, + set_bkey_val_u64s(&s->k, u64s); + } + +-static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) ++static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) + { + struct ec_stripe_new *s; + +@@ -1724,7 +1723,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) +- return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; ++ return NULL; + + mutex_init(&s->lock); + closure_init(&s->iodone, NULL); +@@ -1739,10 +1738,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) + ec_stripe_key_init(c, &s->new_stripe.key, + s->nr_data, s->nr_parity, + h->blocksize, h->disk_label); +- +- h->s = s; +- h->nr_created++; +- return 0; ++ return s; + } + + static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) +@@ -1887,25 +1883,26 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, + return h; + } + +-static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, ++static int new_stripe_alloc_buckets(struct btree_trans *trans, ++ struct ec_stripe_head *h, struct ec_stripe_new *s, + enum bch_watermark watermark, struct closure *cl) + { + struct bch_fs *c = trans->c; + struct bch_devs_mask devs = h->devs; + struct open_bucket *ob; + struct open_buckets buckets; +- struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; ++ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; + bool have_cache = true; + int ret = 0; + +- BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); +- BUG_ON(v->nr_redundant != h->s->nr_parity); ++ BUG_ON(v->nr_blocks 
!= s->nr_data + s->nr_parity); ++ BUG_ON(v->nr_redundant != s->nr_parity); + + /* * We bypass the sector allocator which normally does this: */ + bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + +- for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { ++ for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { + /* + * Note: we don't yet repair invalid blocks (failed/removed + * devices) when reusing stripes - we still need a codepath to +@@ -1915,21 +1912,21 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ + if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) + __clear_bit(v->ptrs[i].dev, devs.d); + +- if (i < h->s->nr_data) ++ if (i < s->nr_data) + nr_have_data++; + else + nr_have_parity++; + } + +- BUG_ON(nr_have_data > h->s->nr_data); +- BUG_ON(nr_have_parity > h->s->nr_parity); ++ BUG_ON(nr_have_data > s->nr_data); ++ BUG_ON(nr_have_parity > s->nr_parity); + + buckets.nr = 0; +- if (nr_have_parity < h->s->nr_parity) { ++ if (nr_have_parity < s->nr_parity) { + ret = bch2_bucket_alloc_set_trans(trans, &buckets, + &h->parity_stripe, + &devs, +- h->s->nr_parity, ++ s->nr_parity, + &nr_have_parity, + &have_cache, 0, + BCH_DATA_parity, +@@ -1937,14 +1934,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ + cl); + + open_bucket_for_each(c, &buckets, ob, i) { +- j = find_next_zero_bit(h->s->blocks_gotten, +- h->s->nr_data + h->s->nr_parity, +- h->s->nr_data); +- BUG_ON(j >= h->s->nr_data + h->s->nr_parity); ++ j = find_next_zero_bit(s->blocks_gotten, ++ s->nr_data + s->nr_parity, ++ s->nr_data); ++ BUG_ON(j >= s->nr_data + s->nr_parity); + +- h->s->blocks[j] = buckets.v[i]; ++ s->blocks[j] = buckets.v[i]; + v->ptrs[j] = bch2_ob_ptr(c, ob); +- __set_bit(j, h->s->blocks_gotten); ++ __set_bit(j, s->blocks_gotten); + } + + if (ret) +@@ -1952,11 +1949,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ + } + + buckets.nr = 0; +- if (nr_have_data 
< h->s->nr_data) { ++ if (nr_have_data < s->nr_data) { + ret = bch2_bucket_alloc_set_trans(trans, &buckets, + &h->block_stripe, + &devs, +- h->s->nr_data, ++ s->nr_data, + &nr_have_data, + &have_cache, 0, + BCH_DATA_user, +@@ -1964,13 +1961,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ + cl); + + open_bucket_for_each(c, &buckets, ob, i) { +- j = find_next_zero_bit(h->s->blocks_gotten, +- h->s->nr_data, 0); +- BUG_ON(j >= h->s->nr_data); ++ j = find_next_zero_bit(s->blocks_gotten, ++ s->nr_data, 0); ++ BUG_ON(j >= s->nr_data); + +- h->s->blocks[j] = buckets.v[i]; ++ s->blocks[j] = buckets.v[i]; + v->ptrs[j] = bch2_ob_ptr(c, ob); +- __set_bit(j, h->s->blocks_gotten); ++ __set_bit(j, s->blocks_gotten); + } + + if (ret) +@@ -2016,73 +2013,78 @@ static s64 get_existing_stripe(struct bch_fs *c, + return ret; + } + +-static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) ++static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s) + { +- struct bch_fs *c = trans->c; +- struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; +- struct bch_stripe *existing_v; ++ struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v; ++ struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v; + unsigned i; +- s64 idx; +- int ret; +- +- /* +- * If we can't allocate a new stripe, and there's no stripes with empty +- * blocks for us to reuse, that means we have to wait on copygc: +- */ +- idx = get_existing_stripe(c, h); +- if (idx < 0) +- return -BCH_ERR_stripe_alloc_blocked; +- +- ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); +- bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, +- "reading stripe key: %s", bch2_err_str(ret)); +- if (ret) { +- bch2_stripe_close(c, h->s); +- return ret; +- } + +- existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; +- +- 
BUG_ON(existing_v->nr_redundant != h->s->nr_parity); +- h->s->nr_data = existing_v->nr_blocks - ++ BUG_ON(existing_v->nr_redundant != s->nr_parity); ++ s->nr_data = existing_v->nr_blocks - + existing_v->nr_redundant; + +- ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); ++ int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); + if (ret) { +- bch2_stripe_close(c, h->s); ++ bch2_stripe_close(c, s); + return ret; + } + +- BUG_ON(h->s->existing_stripe.size != h->blocksize); +- BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); ++ BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); + + /* + * Free buckets we initially allocated - they might conflict with + * blocks from the stripe we're reusing: + */ +- for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { +- bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); +- h->s->blocks[i] = 0; ++ for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) { ++ bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]); ++ s->blocks[i] = 0; + } +- memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); +- memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); ++ memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten)); ++ memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated)); + +- for (i = 0; i < existing_v->nr_blocks; i++) { ++ for (unsigned i = 0; i < existing_v->nr_blocks; i++) { + if (stripe_blockcount_get(existing_v, i)) { +- __set_bit(i, h->s->blocks_gotten); +- __set_bit(i, h->s->blocks_allocated); ++ __set_bit(i, s->blocks_gotten); ++ __set_bit(i, s->blocks_allocated); + } + +- ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); ++ ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone); + } + +- bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); +- h->s->have_existing_stripe = true; ++ bkey_copy(&s->new_stripe.key, &s->existing_stripe.key); ++ s->have_existing_stripe = true; + + 
return 0; + } + +-static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) ++static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h, ++ struct ec_stripe_new *s) ++{ ++ struct bch_fs *c = trans->c; ++ s64 idx; ++ int ret; ++ ++ /* ++ * If we can't allocate a new stripe, and there's no stripes with empty ++ * blocks for us to reuse, that means we have to wait on copygc: ++ */ ++ idx = get_existing_stripe(c, h); ++ if (idx < 0) ++ return -BCH_ERR_stripe_alloc_blocked; ++ ++ ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); ++ bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, ++ "reading stripe key: %s", bch2_err_str(ret)); ++ if (ret) { ++ bch2_stripe_close(c, s); ++ return ret; ++ } ++ ++ return init_new_stripe_from_existing(c, s); ++} ++ ++static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h, ++ struct ec_stripe_new *s) + { + struct bch_fs *c = trans->c; + struct btree_iter iter; +@@ -2091,15 +2093,19 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); + int ret; + +- if (!h->s->res.sectors) { +- ret = bch2_disk_reservation_get(c, &h->s->res, ++ if (!s->res.sectors) { ++ ret = bch2_disk_reservation_get(c, &s->res, + h->blocksize, +- h->s->nr_parity, ++ s->nr_parity, + BCH_DISK_RESERVATION_NOFAIL); + if (ret) + return ret; + } + ++ /* ++ * Allocate stripe slot ++ * XXX: we're going to need a bitrange btree of free stripes ++ */ + for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { + if (bkey_gt(k.k->p, POS(0, U32_MAX))) { +@@ -2114,7 +2120,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st + } + + if (bkey_deleted(k.k) && +- bch2_try_open_stripe(c, h->s, k.k->p.offset)) ++ bch2_try_open_stripe(c, s, 
k.k->p.offset)) + break; + } + +@@ -2125,16 +2131,16 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st + + ret = ec_stripe_mem_alloc(trans, &iter); + if (ret) { +- bch2_stripe_close(c, h->s); ++ bch2_stripe_close(c, s); + goto err; + } + +- h->s->new_stripe.key.k.p = iter.pos; ++ s->new_stripe.key.k.p = iter.pos; + out: + bch2_trans_iter_exit(trans, &iter); + return ret; + err: +- bch2_disk_reservation_put(c, &h->s->res); ++ bch2_disk_reservation_put(c, &s->res); + goto out; + } + +@@ -2165,22 +2171,27 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, + return h; + + if (!h->s) { +- ret = ec_new_stripe_alloc(c, h); +- if (ret) { ++ h->s = ec_new_stripe_alloc(c, h); ++ if (!h->s) { ++ ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc; + bch_err(c, "failed to allocate new stripe"); + goto err; + } ++ ++ h->nr_created++; + } + +- if (h->s->allocated) ++ struct ec_stripe_new *s = h->s; ++ ++ if (s->allocated) + goto allocated; + +- if (h->s->have_existing_stripe) ++ if (s->have_existing_stripe) + goto alloc_existing; + + /* First, try to allocate a full stripe: */ +- ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: +- __bch2_ec_stripe_head_reserve(trans, h); ++ ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: ++ __bch2_ec_stripe_head_reserve(trans, h, s); + if (!ret) + goto allocate_buf; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || +@@ -2192,15 +2203,15 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, + * existing stripe: + */ + while (1) { +- ret = __bch2_ec_stripe_head_reuse(trans, h); ++ ret = __bch2_ec_stripe_head_reuse(trans, h, s); + if (!ret) + break; + if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) + goto err; + + if (watermark == BCH_WATERMARK_copygc) { +- ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?: +- __bch2_ec_stripe_head_reserve(trans, h); ++ ret = new_stripe_alloc_buckets(trans, h, s, 
watermark, NULL) ?: ++ __bch2_ec_stripe_head_reserve(trans, h, s); + if (ret) + goto err; + goto allocate_buf; +@@ -2218,19 +2229,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, + * Retry allocating buckets, with the watermark for this + * particular write: + */ +- ret = new_stripe_alloc_buckets(trans, h, watermark, cl); ++ ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); + if (ret) + goto err; + + allocate_buf: +- ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize); ++ ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize); + if (ret) + goto err; + +- h->s->allocated = true; ++ s->allocated = true; + allocated: +- BUG_ON(!h->s->idx); +- BUG_ON(!h->s->new_stripe.data[0]); ++ BUG_ON(!s->idx); ++ BUG_ON(!s->new_stripe.data[0]); + BUG_ON(trans->restarted); + return h; + err: +@@ -2295,7 +2306,7 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ + int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) + { + return bch2_trans_run(c, +- for_each_btree_key_upto_commit(trans, iter, ++ for_each_btree_key_max_commit(trans, iter, + BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), + BTREE_ITER_intent, k, + NULL, NULL, 0, ({ +@@ -2458,11 +2469,9 @@ void bch2_fs_ec_exit(struct bch_fs *c) + + while (1) { + mutex_lock(&c->ec_stripe_head_lock); +- h = list_first_entry_or_null(&c->ec_stripe_head_list, +- struct ec_stripe_head, list); +- if (h) +- list_del(&h->list); ++ h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); + mutex_unlock(&c->ec_stripe_head_lock); ++ + if (!h) + break; + +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +index 43326370b410..583ca6a226da 100644 +--- a/fs/bcachefs/ec.h ++++ b/fs/bcachefs/ec.h +@@ -6,9 +6,8 @@ + #include "buckets_types.h" + #include "extents_types.h" + +-enum bch_validate_flags; +- +-int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); ++int bch2_stripe_validate(struct bch_fs *, 
struct bkey_s_c, ++ struct bkey_validate_context); + void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, +diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h +index 64ef52e00078..b9770f24f213 100644 +--- a/fs/bcachefs/ec_format.h ++++ b/fs/bcachefs/ec_format.h +@@ -20,6 +20,23 @@ struct bch_stripe { + */ + __u8 disk_label; + ++ /* ++ * Variable length sections: ++ * - Pointers ++ * - Checksums ++ * 2D array of [stripe block/device][csum block], with checksum block ++ * size given by csum_granularity_bits ++ * - Block sector counts: per-block array of u16s ++ * ++ * XXX: ++ * Either checksums should have come last, or we should have included a ++ * checksum_size field (the size in bytes of the checksum itself, not ++ * the blocksize the checksum covers). ++ * ++ * Currently we aren't able to access the block sector counts if the ++ * checksum type is unknown. ++ */ ++ + struct bch_extent_ptr ptrs[]; + } __packed __aligned(8); + +diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h +index 9c4fe5cdbfb7..d65a75e7216e 100644 +--- a/fs/bcachefs/errcode.h ++++ b/fs/bcachefs/errcode.h +@@ -54,7 +54,8 @@ + x(ENOMEM, ENOMEM_compression_bounce_read_init) \ + x(ENOMEM, ENOMEM_compression_bounce_write_init) \ + x(ENOMEM, ENOMEM_compression_workspace_init) \ +- x(ENOMEM, ENOMEM_decompression_workspace_init) \ ++ x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \ ++ x(EIO, compression_workspace_not_initialized) \ + x(ENOMEM, ENOMEM_bucket_gens) \ + x(ENOMEM, ENOMEM_buckets_nouse) \ + x(ENOMEM, ENOMEM_usage_init) \ +@@ -116,6 +117,8 @@ + x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ + x(ENOENT, ENOENT_dev_not_found) \ + x(ENOENT, ENOENT_dev_idx_not_found) \ ++ x(ENOENT, ENOENT_inode_no_backpointer) \ ++ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ + x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ + x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ + x(EEXIST, EEXIST_str_hash_set) 
\ +@@ -148,6 +151,7 @@ + x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ + x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ + x(BCH_ERR_transaction_restart, transaction_restart_nested) \ ++ x(BCH_ERR_transaction_restart, transaction_restart_commit) \ + x(0, no_btree_node) \ + x(BCH_ERR_no_btree_node, no_btree_node_relock) \ + x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ +@@ -164,7 +168,6 @@ + x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ + x(0, backpointer_to_overwritten_btree_node) \ +- x(0, lock_fail_root_changed) \ + x(0, journal_reclaim_would_deadlock) \ + x(EINVAL, fsck) \ + x(BCH_ERR_fsck, fsck_fix) \ +@@ -173,8 +176,15 @@ + x(BCH_ERR_fsck, fsck_errors_not_fixed) \ + x(BCH_ERR_fsck, fsck_repair_unimplemented) \ + x(BCH_ERR_fsck, fsck_repair_impossible) \ +- x(0, restart_recovery) \ ++ x(EINVAL, restart_recovery) \ ++ x(EINVAL, not_in_recovery) \ ++ x(EINVAL, cannot_rewind_recovery) \ + x(0, data_update_done) \ ++ x(BCH_ERR_data_update_done, data_update_done_would_block) \ ++ x(BCH_ERR_data_update_done, data_update_done_unwritten) \ ++ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ ++ x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ ++ x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ + x(EINVAL, device_state_not_allowed) \ + x(EINVAL, member_info_missing) \ + x(EINVAL, mismatched_block_size) \ +@@ -192,7 +202,9 @@ + x(EINVAL, opt_parse_error) \ + x(EINVAL, remove_with_metadata_missing_unimplemented)\ + x(EINVAL, remove_would_lose_data) \ +- x(EINVAL, btree_iter_with_journal_not_supported) \ ++ x(EINVAL, no_resize_with_buckets_nouse) \ ++ x(EINVAL, inode_unpack_error) \ ++ x(EINVAL, varint_decode_error) \ + x(EROFS, erofs_trans_commit) \ + x(EROFS, erofs_no_writes) \ + x(EROFS, erofs_journal_err) \ +@@ -241,7 +253,10 @@ + x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ + x(BCH_ERR_invalid, 
invalid_bkey) \ + x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ ++ x(EIO, journal_shutdown) \ ++ x(EIO, journal_flush_err) \ + x(EIO, btree_node_read_err) \ ++ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ + x(EIO, sb_not_downgraded) \ + x(EIO, btree_node_write_all_failed) \ + x(EIO, btree_node_read_error) \ +@@ -257,6 +272,8 @@ + x(EIO, no_device_to_read_from) \ + x(EIO, missing_indirect_extent) \ + x(EIO, invalidate_stripe_to_dev) \ ++ x(EIO, no_encryption_key) \ ++ x(EIO, insufficient_journal_devices) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ +@@ -305,6 +322,7 @@ static inline long bch2_err_class(long err) + + #define BLK_STS_REMOVED ((__force blk_status_t)128) + ++#include + const char *bch2_blk_status_to_str(blk_status_t); + + #endif /* _BCACHFES_ERRCODE_H */ +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +index b679def8fb98..038da6a61f6b 100644 +--- a/fs/bcachefs/error.c ++++ b/fs/bcachefs/error.c +@@ -1,7 +1,9 @@ + // SPDX-License-Identifier: GPL-2.0 + #include "bcachefs.h" ++#include "btree_cache.h" + #include "btree_iter.h" + #include "error.h" ++#include "fs-common.h" + #include "journal.h" + #include "recovery_passes.h" + #include "super.h" +@@ -33,7 +35,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) + int bch2_topology_error(struct bch_fs *c) + { + set_bit(BCH_FS_topology_error, &c->flags); +- if (!test_bit(BCH_FS_fsck_running, &c->flags)) { ++ if (!test_bit(BCH_FS_recovery_running, &c->flags)) { + bch2_inconsistent_error(c); + return -BCH_ERR_btree_need_topology_repair; + } else { +@@ -218,6 +220,30 @@ static const u8 fsck_flags_extra[] = { + #undef x + }; + ++static int do_fsck_ask_yn(struct bch_fs *c, ++ struct btree_trans *trans, ++ struct printbuf *question, ++ const char *action) ++{ ++ prt_str(question, ", "); ++ prt_str(question, action); ++ ++ if 
(bch2_fs_stdio_redirect(c)) ++ bch2_print(c, "%s", question->buf); ++ else ++ bch2_print_string_as_lines(KERN_ERR, question->buf); ++ ++ int ask = bch2_fsck_ask_yn(c, trans); ++ ++ if (trans) { ++ int ret = bch2_trans_relock(trans); ++ if (ret) ++ return ret; ++ } ++ ++ return ask; ++} ++ + int __bch2_fsck_err(struct bch_fs *c, + struct btree_trans *trans, + enum bch_fsck_flags flags, +@@ -226,7 +252,7 @@ int __bch2_fsck_err(struct bch_fs *c, + { + struct fsck_err_state *s = NULL; + va_list args; +- bool print = true, suppressing = false, inconsistent = false; ++ bool print = true, suppressing = false, inconsistent = false, exiting = false; + struct printbuf buf = PRINTBUF, *out = &buf; + int ret = -BCH_ERR_fsck_ignore; + const char *action_orig = "fix?", *action = action_orig; +@@ -256,9 +282,10 @@ int __bch2_fsck_err(struct bch_fs *c, + !trans && + bch2_current_has_btree_trans(c)); + +- if ((flags & FSCK_CAN_FIX) && +- test_bit(err, c->sb.errors_silent)) +- return -BCH_ERR_fsck_fix; ++ if (test_bit(err, c->sb.errors_silent)) ++ return flags & FSCK_CAN_FIX ++ ? 
-BCH_ERR_fsck_fix ++ : -BCH_ERR_fsck_ignore; + + bch2_sb_error_count(c, err); + +@@ -289,16 +316,14 @@ int __bch2_fsck_err(struct bch_fs *c, + */ + if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { + ret = s->ret; +- mutex_unlock(&c->fsck_error_msgs_lock); +- goto err; ++ goto err_unlock; + } + + kfree(s->last_msg); + s->last_msg = kstrdup(buf.buf, GFP_KERNEL); + if (!s->last_msg) { +- mutex_unlock(&c->fsck_error_msgs_lock); + ret = -ENOMEM; +- goto err; ++ goto err_unlock; + } + + if (c->opts.ratelimit_errors && +@@ -318,13 +343,19 @@ int __bch2_fsck_err(struct bch_fs *c, + prt_printf(out, bch2_log_msg(c, "")); + #endif + +- if ((flags & FSCK_CAN_FIX) && +- (flags & FSCK_AUTOFIX) && ++ if ((flags & FSCK_AUTOFIX) && + (c->opts.errors == BCH_ON_ERROR_continue || + c->opts.errors == BCH_ON_ERROR_fix_safe)) { + prt_str(out, ", "); +- prt_actioning(out, action); +- ret = -BCH_ERR_fsck_fix; ++ if (flags & FSCK_CAN_FIX) { ++ prt_actioning(out, action); ++ ret = -BCH_ERR_fsck_fix; ++ } else { ++ prt_str(out, ", continuing"); ++ ret = -BCH_ERR_fsck_ignore; ++ } ++ ++ goto print; + } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + if (c->opts.errors != BCH_ON_ERROR_continue || + !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { +@@ -348,31 +379,18 @@ int __bch2_fsck_err(struct bch_fs *c, + : c->opts.fix_errors; + + if (fix == FSCK_FIX_ask) { +- prt_str(out, ", "); +- prt_str(out, action); +- +- if (bch2_fs_stdio_redirect(c)) +- bch2_print(c, "%s", out->buf); +- else +- bch2_print_string_as_lines(KERN_ERR, out->buf); + print = false; + +- int ask = bch2_fsck_ask_yn(c, trans); +- +- if (trans) { +- ret = bch2_trans_relock(trans); +- if (ret) { +- mutex_unlock(&c->fsck_error_msgs_lock); +- goto err; +- } +- } ++ ret = do_fsck_ask_yn(c, trans, out, action); ++ if (ret < 0) ++ goto err_unlock; + +- if (ask >= YN_ALLNO && s) +- s->fix = ask == YN_ALLNO ++ if (ret >= YN_ALLNO && s) ++ s->fix = ret == YN_ALLNO + ? 
FSCK_FIX_no + : FSCK_FIX_yes; + +- ret = ask & 1 ++ ret = ret & 1 + ? -BCH_ERR_fsck_fix + : -BCH_ERR_fsck_ignore; + } else if (fix == FSCK_FIX_yes || +@@ -385,9 +403,7 @@ int __bch2_fsck_err(struct bch_fs *c, + prt_str(out, ", not "); + prt_actioning(out, action); + } +- } else if (flags & FSCK_NEED_FSCK) { +- prt_str(out, " (run fsck to correct)"); +- } else { ++ } else if (!(flags & FSCK_CAN_IGNORE)) { + prt_str(out, " (repair unimplemented)"); + } + +@@ -396,14 +412,13 @@ int __bch2_fsck_err(struct bch_fs *c, + !(flags & FSCK_CAN_IGNORE))) + ret = -BCH_ERR_fsck_errors_not_fixed; + +- bool exiting = +- test_bit(BCH_FS_fsck_running, &c->flags) && +- (ret != -BCH_ERR_fsck_fix && +- ret != -BCH_ERR_fsck_ignore); +- +- if (exiting) ++ if (test_bit(BCH_FS_fsck_running, &c->flags) && ++ (ret != -BCH_ERR_fsck_fix && ++ ret != -BCH_ERR_fsck_ignore)) { ++ exiting = true; + print = true; +- ++ } ++print: + if (print) { + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s\n", out->buf); +@@ -419,17 +434,24 @@ int __bch2_fsck_err(struct bch_fs *c, + if (s) + s->ret = ret; + +- mutex_unlock(&c->fsck_error_msgs_lock); +- + if (inconsistent) + bch2_inconsistent_error(c); + +- if (ret == -BCH_ERR_fsck_fix) { +- set_bit(BCH_FS_errors_fixed, &c->flags); +- } else { +- set_bit(BCH_FS_errors_not_fixed, &c->flags); +- set_bit(BCH_FS_error, &c->flags); ++ /* ++ * We don't yet track whether the filesystem currently has errors, for ++ * log_fsck_err()s: that would require us to track for every error type ++ * which recovery pass corrects it, to get the fsck exit status correct: ++ */ ++ if (flags & FSCK_CAN_FIX) { ++ if (ret == -BCH_ERR_fsck_fix) { ++ set_bit(BCH_FS_errors_fixed, &c->flags); ++ } else { ++ set_bit(BCH_FS_errors_not_fixed, &c->flags); ++ set_bit(BCH_FS_error, &c->flags); ++ } + } ++err_unlock: ++ mutex_unlock(&c->fsck_error_msgs_lock); + err: + if (action != action_orig) + kfree(action); +@@ -437,28 +459,52 @@ int __bch2_fsck_err(struct bch_fs *c, + return ret; + } + 
++static const char * const bch2_bkey_validate_contexts[] = { ++#define x(n) #n, ++ BKEY_VALIDATE_CONTEXTS() ++#undef x ++ NULL ++}; ++ + int __bch2_bkey_fsck_err(struct bch_fs *c, + struct bkey_s_c k, +- enum bch_validate_flags validate_flags, ++ struct bkey_validate_context from, + enum bch_sb_error_id err, + const char *fmt, ...) + { +- if (validate_flags & BCH_VALIDATE_silent) ++ if (from.flags & BCH_VALIDATE_silent) + return -BCH_ERR_fsck_delete_bkey; + + unsigned fsck_flags = 0; +- if (!(validate_flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) ++ if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { ++ if (test_bit(err, c->sb.errors_silent)) ++ return -BCH_ERR_fsck_delete_bkey; ++ + fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; ++ } ++ if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) ++ fsck_flags |= fsck_flags_extra[err]; + + struct printbuf buf = PRINTBUF; +- va_list args; ++ prt_printf(&buf, "invalid bkey in %s", ++ bch2_bkey_validate_contexts[from.from]); ++ ++ if (from.from == BKEY_VALIDATE_journal) ++ prt_printf(&buf, " journal seq=%llu offset=%u", ++ from.journal_seq, from.journal_offset); ++ ++ prt_str(&buf, " btree="); ++ bch2_btree_id_to_text(&buf, from.btree); ++ prt_printf(&buf, " level=%u: ", from.level); + +- prt_str(&buf, "invalid bkey "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\n "); ++ ++ va_list args; + va_start(args, fmt); + prt_vprintf(&buf, fmt, args); + va_end(args); ++ + prt_str(&buf, ": delete?"); + + int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf); +@@ -483,3 +529,36 @@ void bch2_flush_fsck_errs(struct bch_fs *c) + + mutex_unlock(&c->fsck_error_msgs_lock); + } ++ ++int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) ++{ ++ u32 restart_count = trans->restart_count; ++ int ret = 0; ++ ++ /* XXX: we don't yet attempt to print paths when we don't know the subvol */ ++ if (inum.subvol) ++ ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); 
++ if (!inum.subvol || ret) ++ prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); ++ ++ return trans_was_restarted(trans, restart_count); ++} ++ ++int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, ++ subvol_inum inum, u64 offset) ++{ ++ int ret = bch2_inum_err_msg_trans(trans, out, inum); ++ prt_printf(out, " offset %llu: ", offset); ++ return ret; ++} ++ ++void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) ++{ ++ bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); ++} ++ ++void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, ++ subvol_inum inum, u64 offset) ++{ ++ bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); ++} +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +index 6551ada926b6..7acf2a27ca28 100644 +--- a/fs/bcachefs/error.h ++++ b/fs/bcachefs/error.h +@@ -45,32 +45,11 @@ int bch2_topology_error(struct bch_fs *); + bch2_inconsistent_error(c); \ + }) + +-#define bch2_fs_inconsistent_on(cond, c, ...) \ ++#define bch2_fs_inconsistent_on(cond, ...) \ + ({ \ + bool _ret = unlikely(!!(cond)); \ +- \ +- if (_ret) \ +- bch2_fs_inconsistent(c, __VA_ARGS__); \ +- _ret; \ +-}) +- +-/* +- * Later we might want to mark only the particular device inconsistent, not the +- * entire filesystem: +- */ +- +-#define bch2_dev_inconsistent(ca, ...) \ +-do { \ +- bch_err(ca, __VA_ARGS__); \ +- bch2_inconsistent_error((ca)->fs); \ +-} while (0) +- +-#define bch2_dev_inconsistent_on(cond, ca, ...) \ +-({ \ +- bool _ret = unlikely(!!(cond)); \ +- \ + if (_ret) \ +- bch2_dev_inconsistent(ca, __VA_ARGS__); \ ++ bch2_fs_inconsistent(__VA_ARGS__); \ + _ret; \ + }) + +@@ -123,9 +102,9 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, + + void bch2_flush_fsck_errs(struct bch_fs *); + +-#define __fsck_err(c, _flags, _err_type, ...) 
\ ++#define fsck_err_wrap(_do) \ + ({ \ +- int _ret = bch2_fsck_err(c, _flags, _err_type, __VA_ARGS__); \ ++ int _ret = _do; \ + if (_ret != -BCH_ERR_fsck_fix && \ + _ret != -BCH_ERR_fsck_ignore) { \ + ret = _ret; \ +@@ -135,6 +114,8 @@ void bch2_flush_fsck_errs(struct bch_fs *); + _ret == -BCH_ERR_fsck_fix; \ + }) + ++#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) ++ + /* These macros return true if error should be fixed: */ + + /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ +@@ -149,12 +130,6 @@ void bch2_flush_fsck_errs(struct bch_fs *); + (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\ + }) + +-#define need_fsck_err_on(cond, c, _err_type, ...) \ +- __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) +- +-#define need_fsck_err(c, _err_type, ...) \ +- __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) +- + #define mustfix_fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) + +@@ -167,11 +142,22 @@ void bch2_flush_fsck_errs(struct bch_fs *); + #define fsck_err_on(cond, c, _err_type, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) + ++#define log_fsck_err(c, _err_type, ...) \ ++ __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) ++ ++#define log_fsck_err_on(cond, ...) \ ++({ \ ++ bool _ret = unlikely(!!(cond)); \ ++ if (_ret) \ ++ log_fsck_err(__VA_ARGS__); \ ++ _ret; \ ++}) ++ + enum bch_validate_flags; + __printf(5, 6) + int __bch2_bkey_fsck_err(struct bch_fs *, + struct bkey_s_c, +- enum bch_validate_flags, ++ struct bkey_validate_context from, + enum bch_sb_error_id, + const char *, ...); + +@@ -181,7 +167,7 @@ int __bch2_bkey_fsck_err(struct bch_fs *, + */ + #define bkey_fsck_err(c, _err_type, _err_msg, ...) 
\ + do { \ +- int _ret = __bch2_bkey_fsck_err(c, k, flags, \ ++ int _ret = __bch2_bkey_fsck_err(c, k, from, \ + BCH_FSCK_ERR_##_err_type, \ + _err_msg, ##__VA_ARGS__); \ + if (_ret != -BCH_ERR_fsck_fix && \ +@@ -252,4 +238,10 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); + _ret; \ + }) + ++int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); ++int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); ++ ++void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); ++void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); ++ + #endif /* _BCACHEFS_ERROR_H */ +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +index 5f4fecb358da..6aac579a692a 100644 +--- a/fs/bcachefs/extent_update.c ++++ b/fs/bcachefs/extent_update.c +@@ -64,7 +64,7 @@ static int count_iters_for_insert(struct btree_trans *trans, + break; + case KEY_TYPE_reflink_p: { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); +- u64 idx = le64_to_cpu(p.v->idx); ++ u64 idx = REFLINK_P_IDX(p.v); + unsigned sectors = bpos_min(*end, p.k->p).offset - + bkey_start_offset(p.k); + struct btree_iter iter; +@@ -128,7 +128,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, + + bch2_trans_copy_iter(&copy, iter); + +- for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) { ++ for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) { + unsigned offset = 0; + + if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +index 37e3d69bec06..05d5f71a7ca9 100644 +--- a/fs/bcachefs/extents.c ++++ b/fs/bcachefs/extents.c +@@ -21,6 +21,7 @@ + #include "extents.h" + #include "inode.h" + #include "journal.h" ++#include "rebalance.h" + #include "replicas.h" + #include "super.h" + #include "super-io.h" +@@ -88,6 +89,14 @@ static inline bool ptr_better(struct bch_fs *c, + u64 l1
= dev_latency(c, p1.ptr.dev); + u64 l2 = dev_latency(c, p2.ptr.dev); + ++ /* ++ * Square the latencies, to bias more in favor of the faster ++ * device - we never want to stop issuing reads to the slower ++ * device altogether, so that we can update our latency numbers: ++ */ ++ l1 *= l1; ++ l2 *= l2; ++ + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; +@@ -169,7 +178,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + /* KEY_TYPE_btree_ptr: */ + + int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + int ret = 0; + +@@ -177,7 +186,7 @@ int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, + c, btree_ptr_val_too_big, + "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); + +- ret = bch2_bkey_ptrs_validate(c, k, flags); ++ ret = bch2_bkey_ptrs_validate(c, k, from); + fsck_err: + return ret; + } +@@ -189,7 +198,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + } + + int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + int ret = 0; +@@ -203,12 +212,13 @@ int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, + c, btree_ptr_v2_min_key_bad, + "min_key > key"); + +- if (flags & BCH_VALIDATE_write) ++ if ((from.flags & BCH_VALIDATE_write) && ++ c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) + bkey_fsck_err_on(!bp.v->sectors_written, + c, btree_ptr_v2_written_0, + "sectors_written == 0"); + +- ret = bch2_bkey_ptrs_validate(c, k, flags); ++ ret = bch2_bkey_ptrs_validate(c, k, from); + fsck_err: + return ret; + } +@@ -395,7 +405,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) + /* KEY_TYPE_reservation: */ + + int 
bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + int ret = 0; +@@ -1120,6 +1130,57 @@ void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_cr + bch2_prt_compression_type(out, crc->compression_type); + } + ++static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, ++ const struct bch_extent_rebalance *r) ++{ ++ prt_str(out, "rebalance:"); ++ ++ prt_printf(out, " replicas=%u", r->data_replicas); ++ if (r->data_replicas_from_inode) ++ prt_str(out, " (inode)"); ++ ++ prt_str(out, " checksum="); ++ bch2_prt_csum_opt(out, r->data_checksum); ++ if (r->data_checksum_from_inode) ++ prt_str(out, " (inode)"); ++ ++ if (r->background_compression || r->background_compression_from_inode) { ++ prt_str(out, " background_compression="); ++ bch2_compression_opt_to_text(out, r->background_compression); ++ ++ if (r->background_compression_from_inode) ++ prt_str(out, " (inode)"); ++ } ++ ++ if (r->background_target || r->background_target_from_inode) { ++ prt_str(out, " background_target="); ++ if (c) ++ bch2_target_to_text(out, c, r->background_target); ++ else ++ prt_printf(out, "%u", r->background_target); ++ ++ if (r->background_target_from_inode) ++ prt_str(out, " (inode)"); ++ } ++ ++ if (r->promote_target || r->promote_target_from_inode) { ++ prt_str(out, " promote_target="); ++ if (c) ++ bch2_target_to_text(out, c, r->promote_target); ++ else ++ prt_printf(out, "%u", r->promote_target); ++ ++ if (r->promote_target_from_inode) ++ prt_str(out, " (inode)"); ++ } ++ ++ if (r->erasure_code || r->erasure_code_from_inode) { ++ prt_printf(out, " ec=%u", r->erasure_code); ++ if (r->erasure_code_from_inode) ++ prt_str(out, " (inode)"); ++ } ++} ++ + void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) + { +@@ -1155,18 +1216,10 @@ void 
bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + (u64) ec->idx, ec->block); + break; + } +- case BCH_EXTENT_ENTRY_rebalance: { +- const struct bch_extent_rebalance *r = &entry->rebalance; +- +- prt_str(out, "rebalance: target "); +- if (c) +- bch2_target_to_text(out, c, r->target); +- else +- prt_printf(out, "%u", r->target); +- prt_str(out, " compression "); +- bch2_compression_opt_to_text(out, r->compression); ++ case BCH_EXTENT_ENTRY_rebalance: ++ bch2_extent_rebalance_to_text(out, c, &entry->rebalance); + break; +- } ++ + default: + prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + return; +@@ -1178,13 +1231,19 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + + static int extent_ptr_validate(struct bch_fs *c, + struct bkey_s_c k, +- enum bch_validate_flags flags, ++ struct bkey_validate_context from, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata) + { + int ret = 0; + ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ bkey_for_each_ptr(ptrs, ptr2) ++ bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, ++ c, ptr_to_duplicate_device, ++ "multiple pointers to same device (%u)", ptr->dev); ++ + /* bad pointers are repaired by check_fix_ptrs(): */ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); +@@ -1199,13 +1258,6 @@ static int extent_ptr_validate(struct bch_fs *c, + unsigned bucket_size = ca->mi.bucket_size; + rcu_read_unlock(); + +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- bkey_for_each_ptr(ptrs, ptr2) +- bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, +- c, ptr_to_duplicate_device, +- "multiple pointers to same device (%u)", ptr->dev); +- +- + bkey_fsck_err_on(bucket >= nbuckets, + c, ptr_after_last_bucket, + "pointer past last bucket (%llu > %llu)", bucket, nbuckets); +@@ -1221,7 +1273,7 @@ static int extent_ptr_validate(struct bch_fs *c, + } + + int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, +- enum 
bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; +@@ -1248,7 +1300,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, + + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: +- ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false); ++ ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false); + if (ret) + return ret; + +@@ -1270,9 +1322,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + +- bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, +- c, ptr_crc_uncompressed_size_too_small, +- "checksum offset + key size > uncompressed size"); + bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), + c, ptr_crc_csum_type_unknown, + "invalid checksum type"); +@@ -1280,6 +1329,19 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, + c, ptr_crc_compression_type_unknown, + "invalid compression type"); + ++ bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, ++ c, ptr_crc_uncompressed_size_too_small, ++ "checksum offset + key size > uncompressed size"); ++ bkey_fsck_err_on(crc_is_encoded(crc) && ++ (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && ++ (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), ++ c, ptr_crc_uncompressed_size_too_big, ++ "too large encoded extent"); ++ bkey_fsck_err_on(!crc_is_compressed(crc) && ++ crc.compressed_size != crc.uncompressed_size, ++ c, ptr_crc_uncompressed_size_mismatch, ++ "not compressed but compressed != uncompressed size"); ++ + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; +@@ -1293,12 +1355,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, + "redundant crc entry"); + crc_since_last_ptr = true; + +- 
bkey_fsck_err_on(crc_is_encoded(crc) && +- (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && +- (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), +- c, ptr_crc_uncompressed_size_too_big, +- "too large encoded extent"); +- + size_ondisk = crc.compressed_size; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: +@@ -1391,166 +1447,6 @@ void bch2_ptr_swab(struct bkey_s k) + } + } + +-const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +-{ +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *entry; +- +- bkey_extent_entry_for_each(ptrs, entry) +- if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) +- return &entry->rebalance; +- +- return NULL; +-} +- +-unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, +- unsigned target, unsigned compression) +-{ +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- unsigned rewrite_ptrs = 0; +- +- if (compression) { +- unsigned compression_type = bch2_compression_opt_to_type(compression); +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- unsigned i = 0; +- +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { +- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || +- p.ptr.unwritten) { +- rewrite_ptrs = 0; +- goto incompressible; +- } +- +- if (!p.ptr.cached && p.crc.compression_type != compression_type) +- rewrite_ptrs |= 1U << i; +- i++; +- } +- } +-incompressible: +- if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { +- unsigned i = 0; +- +- bkey_for_each_ptr(ptrs, ptr) { +- if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) +- rewrite_ptrs |= 1U << i; +- i++; +- } +- } +- +- return rewrite_ptrs; +-} +- +-bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) +-{ +- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); +- +- /* +- * If it's an indirect extent, we don't delete the rebalance entry when +- * done so that we know what options were 
applied - check if it still +- * needs work done: +- */ +- if (r && +- k.k->type == KEY_TYPE_reflink_v && +- !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression)) +- r = NULL; +- +- return r != NULL; +-} +- +-static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k, +- unsigned target, unsigned compression) +-{ +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- u64 sectors = 0; +- +- if (compression) { +- unsigned compression_type = bch2_compression_opt_to_type(compression); +- +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { +- if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || +- p.ptr.unwritten) { +- sectors = 0; +- goto incompressible; +- } +- +- if (!p.ptr.cached && p.crc.compression_type != compression_type) +- sectors += p.crc.compressed_size; +- } +- } +-incompressible: +- if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target)) +- sectors += p.crc.compressed_size; +- } +- +- return sectors; +-} +- +-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +-{ +- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); +- +- return r ? 
__bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0; +-} +- +-int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, +- struct bch_io_opts *opts) +-{ +- struct bkey_s k = bkey_i_to_s(_k); +- struct bch_extent_rebalance *r; +- unsigned target = opts->background_target; +- unsigned compression = background_compression(*opts); +- bool needs_rebalance; +- +- if (!bkey_extent_is_direct_data(k.k)) +- return 0; +- +- /* get existing rebalance entry: */ +- r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); +- if (r) { +- if (k.k->type == KEY_TYPE_reflink_v) { +- /* +- * indirect extents: existing options take precedence, +- * so that we don't move extents back and forth if +- * they're referenced by different inodes with different +- * options: +- */ +- if (r->target) +- target = r->target; +- if (r->compression) +- compression = r->compression; +- } +- +- r->target = target; +- r->compression = compression; +- } +- +- needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); +- +- if (needs_rebalance && !r) { +- union bch_extent_entry *new = bkey_val_end(k); +- +- new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; +- new->rebalance.compression = compression; +- new->rebalance.target = target; +- new->rebalance.unused = 0; +- k.k->u64s += extent_entry_u64s(new); +- } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { +- /* +- * For indirect extents, don't delete the rebalance entry when +- * we're finished so that we know we specifically moved it or +- * compressed it to its current location/compression type +- */ +- extent_entry_drop(k, (union bch_extent_entry *) r); +- } +- +- return 0; +-} +- + /* Generic extent code: */ + + int bch2_cut_front_s(struct bpos where, struct bkey_s k) +@@ -1610,7 +1506,7 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) + case KEY_TYPE_reflink_p: { + struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); + +- le64_add_cpu(&p.v->idx, 
sub); ++ SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub); + break; + } + case KEY_TYPE_inline_data: +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +index bcffcf60aaaf..620b284aa34f 100644 +--- a/fs/bcachefs/extents.h ++++ b/fs/bcachefs/extents.h +@@ -8,7 +8,6 @@ + + struct bch_fs; + struct btree_trans; +-enum bch_validate_flags; + + /* extent entries: */ + +@@ -410,12 +409,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, + /* KEY_TYPE_btree_ptr: */ + + int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, +- enum bch_validate_flags); ++ struct bkey_validate_context); + void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + + int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, +- enum bch_validate_flags); ++ struct bkey_validate_context); + void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + int, struct bkey_s); +@@ -452,7 +451,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + /* KEY_TYPE_reservation: */ + + int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, +- enum bch_validate_flags); ++ struct bkey_validate_context); + void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + +@@ -696,7 +695,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct + void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, +- enum bch_validate_flags); ++ struct bkey_validate_context); + + static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, + struct bch_extent_ptr ptr2) +@@ -710,15 +709,6 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, + + void bch2_ptr_swab(struct bkey_s); + +-const struct 
bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); +-unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, +- unsigned, unsigned); +-bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); +-u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); +- +-int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, +- struct bch_io_opts *); +- + /* Generic extent code: */ + + enum bch_extent_overlap { +diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h +index 3bd2fdbb0817..c198dfc376d6 100644 +--- a/fs/bcachefs/extents_format.h ++++ b/fs/bcachefs/extents_format.h +@@ -201,19 +201,8 @@ struct bch_extent_stripe_ptr { + #endif + }; + +-struct bch_extent_rebalance { +-#if defined(__LITTLE_ENDIAN_BITFIELD) +- __u64 type:6, +- unused:34, +- compression:8, /* enum bch_compression_opt */ +- target:16; +-#elif defined (__BIG_ENDIAN_BITFIELD) +- __u64 target:16, +- compression:8, +- unused:34, +- type:6; +-#endif +-}; ++/* bch_extent_rebalance: */ ++#include "rebalance_format.h" + + union bch_extent_entry { + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +index 7e10a9ddcfd9..d70d9f634cea 100644 +--- a/fs/bcachefs/fs-common.c ++++ b/fs/bcachefs/fs-common.c +@@ -69,9 +69,7 @@ int bch2_create_trans(struct btree_trans *trans, + if (!snapshot_src.inum) { + /* Inode wasn't specified, just snapshot: */ + struct bch_subvolume s; +- +- ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, +- BTREE_ITER_cached, &s); ++ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s); + if (ret) + goto err; + +@@ -154,6 +152,7 @@ int bch2_create_trans(struct btree_trans *trans, + if (is_subdir_for_nlink(new_inode)) + dir_u->bi_nlink++; + dir_u->bi_mtime = dir_u->bi_ctime = now; ++ dir_u->bi_size += dirent_occupied_size(name); + + ret = bch2_inode_write(trans, &dir_iter, dir_u); + if (ret) +@@ -172,6 +171,10 @@ 
int bch2_create_trans(struct btree_trans *trans, + new_inode->bi_dir_offset = dir_offset; + } + ++ if (S_ISDIR(mode) && ++ !new_inode->bi_subvol) ++ new_inode->bi_depth = dir_u->bi_depth + 1; ++ + inode_iter.flags &= ~BTREE_ITER_all_snapshots; + bch2_btree_iter_set_snapshot(&inode_iter, snapshot); + +@@ -218,6 +221,7 @@ int bch2_link_trans(struct btree_trans *trans, + } + + dir_u->bi_mtime = dir_u->bi_ctime = now; ++ dir_u->bi_size += dirent_occupied_size(name); + + dir_hash = bch2_hash_info_init(c, dir_u); + +@@ -320,6 +324,7 @@ int bch2_unlink_trans(struct btree_trans *trans, + + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; + dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); ++ dir_u->bi_size -= dirent_occupied_size(name); + + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash, &dirent_iter, +@@ -458,6 +463,14 @@ int bch2_rename_trans(struct btree_trans *trans, + goto err; + } + ++ if (mode == BCH_RENAME) { ++ src_dir_u->bi_size -= dirent_occupied_size(src_name); ++ dst_dir_u->bi_size += dirent_occupied_size(dst_name); ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) ++ src_dir_u->bi_size -= dirent_occupied_size(src_name); ++ + if (src_inode_u->bi_parent_subvol) + src_inode_u->bi_parent_subvol = dst_dir.subvol; + +@@ -512,6 +525,15 @@ int bch2_rename_trans(struct btree_trans *trans, + dst_dir_u->bi_nlink++; + } + ++ if (S_ISDIR(src_inode_u->bi_mode) && ++ !src_inode_u->bi_subvol) ++ src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ S_ISDIR(dst_inode_u->bi_mode) && ++ !dst_inode_u->bi_subvol) ++ dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; ++ + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { + dst_dir_u->bi_nlink--; + src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; +@@ -548,3 +570,94 @@ int bch2_rename_trans(struct btree_trans *trans, + bch2_trans_iter_exit(trans, &src_dir_iter); + return ret; + } ++ ++static inline void prt_bytes_reversed(struct printbuf *out, const void *b, 
unsigned n) ++{ ++ bch2_printbuf_make_room(out, n); ++ ++ unsigned can_print = min(n, printbuf_remaining(out)); ++ ++ b += n; ++ ++ for (unsigned i = 0; i < can_print; i++) ++ out->buf[out->pos++] = *((char *) --b); ++ ++ printbuf_nul_terminate(out); ++} ++ ++static inline void prt_str_reversed(struct printbuf *out, const char *s) ++{ ++ prt_bytes_reversed(out, s, strlen(s)); ++} ++ ++static inline void reverse_bytes(void *b, size_t n) ++{ ++ char *e = b + n, *s = b; ++ ++ while (s < e) { ++ --e; ++ swap(*s, *e); ++ s++; ++ } ++} ++ ++/* XXX: we don't yet attempt to print paths when we don't know the subvol */ ++int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path) ++{ ++ unsigned orig_pos = path->pos; ++ int ret = 0; ++ ++ while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL && ++ inum.inum == BCACHEFS_ROOT_INO)) { ++ struct bch_inode_unpacked inode; ++ ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); ++ if (ret) ++ goto disconnected; ++ ++ if (!inode.bi_dir && !inode.bi_dir_offset) { ++ ret = -BCH_ERR_ENOENT_inode_no_backpointer; ++ goto disconnected; ++ } ++ ++ inum.subvol = inode.bi_parent_subvol ?: inum.subvol; ++ inum.inum = inode.bi_dir; ++ ++ u32 snapshot; ++ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); ++ if (ret) ++ goto disconnected; ++ ++ struct btree_iter d_iter; ++ struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, ++ BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), ++ 0, dirent); ++ ret = bkey_err(d.s_c); ++ if (ret) ++ goto disconnected; ++ ++ struct qstr dirent_name = bch2_dirent_get_name(d); ++ prt_bytes_reversed(path, dirent_name.name, dirent_name.len); ++ ++ prt_char(path, '/'); ++ ++ bch2_trans_iter_exit(trans, &d_iter); ++ } ++ ++ if (orig_pos == path->pos) ++ prt_char(path, '/'); ++out: ++ ret = path->allocation_failure ? 
-ENOMEM : 0; ++ if (ret) ++ goto err; ++ ++ reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); ++ return 0; ++err: ++ return ret; ++disconnected: ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ goto err; ++ ++ prt_str_reversed(path, "(disconnected)"); ++ goto out; ++} +diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h +index c934e807b380..2b59210bb5e8 100644 +--- a/fs/bcachefs/fs-common.h ++++ b/fs/bcachefs/fs-common.h +@@ -42,4 +42,6 @@ int bch2_rename_trans(struct btree_trans *, + bool bch2_reinherit_attrs(struct bch_inode_unpacked *, + struct bch_inode_unpacked *); + ++int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); ++ + #endif /* _BCACHEFS_FS_COMMON_H */ +diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c +index 95972809e76d..83e15908250d 100644 +--- a/fs/bcachefs/fs-io-buffered.c ++++ b/fs/bcachefs/fs-io-buffered.c +@@ -149,12 +149,10 @@ static void bchfs_read(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_buf sk; +- int flags = BCH_READ_RETRY_IF_STALE| +- BCH_READ_MAY_PROMOTE; ++ int flags = BCH_READ_retry_if_stale| ++ BCH_READ_may_promote; + int ret = 0; + +- rbio->c = c; +- rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + bch2_bkey_buf_init(&sk); +@@ -164,7 +162,8 @@ static void bchfs_read(struct btree_trans *trans, + BTREE_ITER_slots); + while (1) { + struct bkey_s_c k; +- unsigned bytes, sectors, offset_into_extent; ++ unsigned bytes, sectors; ++ s64 offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + bch2_trans_begin(trans); +@@ -197,7 +196,7 @@ static void bchfs_read(struct btree_trans *trans, + + k = bkey_i_to_s_c(sk.k); + +- sectors = min(sectors, k.k->size - offset_into_extent); ++ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); + + if (readpages_iter) { + ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, +@@ -210,14 +209,14 @@ static void 
bchfs_read(struct btree_trans *trans, + swap(rbio->bio.bi_iter.bi_size, bytes); + + if (rbio->bio.bi_iter.bi_size == bytes) +- flags |= BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_last_fragment; + + bch2_bio_page_state_set(&rbio->bio, k); + + bch2_read_extent(trans, rbio, iter.pos, + data_btree, k, offset_into_extent, flags); + +- if (flags & BCH_READ_LAST_FRAGMENT) ++ if (flags & BCH_READ_last_fragment) + break; + + swap(rbio->bio.bi_iter.bi_size, bytes); +@@ -230,10 +229,12 @@ static void bchfs_read(struct btree_trans *trans, + bch2_trans_iter_exit(trans, &iter); + + if (ret) { +- bch_err_inum_offset_ratelimited(c, +- iter.pos.inode, +- iter.pos.offset << 9, +- "read error %i from btree lookup", ret); ++ struct printbuf buf = PRINTBUF; ++ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); ++ prt_printf(&buf, "read error %i from btree lookup", ret); ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ + rbio->bio.bi_status = BLK_STS_IOERR; + bio_endio(&rbio->bio); + } +@@ -248,6 +249,7 @@ void bch2_readahead(struct readahead_control *ractl) + struct bch_io_opts opts; + struct folio *folio; + struct readpages_iter readpages_iter; ++ struct blk_plug plug; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + +@@ -255,6 +257,16 @@ void bch2_readahead(struct readahead_control *ractl) + if (ret) + return; + ++ /* ++ * Besides being a general performance optimization, plugging helps with ++ * avoiding btree transaction srcu warnings - submitting a bio can ++ * block, and we don't want todo that with the transaction locked. ++ * ++ * However, plugged bios are submitted when we schedule; we ideally ++ * would have our own scheduler hook to call unlock_long() before ++ * scheduling. 
++ */ ++ blk_start_plug(&plug); + bch2_pagecache_add_get(inode); + + struct btree_trans *trans = bch2_trans_get(c); +@@ -266,12 +278,13 @@ void bch2_readahead(struct readahead_control *ractl) + struct bch_read_bio *rbio = + rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, + GFP_KERNEL, &c->bio_read), +- opts); ++ c, ++ opts, ++ bch2_readpages_end_io); + + readpage_iter_advance(&readpages_iter); + + rbio->bio.bi_iter.bi_sector = folio_sector(folio); +- rbio->bio.bi_end_io = bch2_readpages_end_io; + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bchfs_read(trans, rbio, inode_inum(inode), +@@ -281,7 +294,7 @@ void bch2_readahead(struct readahead_control *ractl) + bch2_trans_put(trans); + + bch2_pagecache_add_put(inode); +- ++ blk_finish_plug(&plug); + darray_exit(&readpages_iter.folios); + } + +@@ -296,24 +309,30 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_read_bio *rbio; + struct bch_io_opts opts; ++ struct blk_plug plug; + int ret; + DECLARE_COMPLETION_ONSTACK(done); + ++ BUG_ON(folio_test_uptodate(folio)); ++ BUG_ON(folio_test_dirty(folio)); ++ + if (!bch2_folio_create(folio, GFP_KERNEL)) + return -ENOMEM; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), +- opts); ++ c, ++ opts, ++ bch2_read_single_folio_end_io); + rbio->bio.bi_private = &done; +- rbio->bio.bi_end_io = bch2_read_single_folio_end_io; +- + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + ++ blk_start_plug(&plug); + bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); ++ blk_finish_plug(&plug); + wait_for_completion(&done); + + ret = blk_status_to_errno(rbio->bio.bi_status); +@@ -400,7 +419,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) + } + } 
+ +- if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { ++ if (io->op.flags & BCH_WRITE_wrote_data_inline) { + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + +@@ -605,15 +624,6 @@ static int __bch2_writepage(struct folio *folio, + BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, + sectors << 9, offset << 9)); + +- /* Check for writing past i_size: */ +- WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > +- round_up(i_size, block_bytes(c)) && +- !test_bit(BCH_FS_emergency_ro, &c->flags), +- "writing past i_size: %llu > %llu (unrounded %llu)\n", +- bio_end_sector(&w->io->op.wbio.bio) << 9, +- round_up(i_size, block_bytes(c)), +- i_size); +- + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; + w->io->op.new_i_size = i_size; +@@ -669,7 +679,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, + FGP_WRITEBEGIN | fgf_set_order(len), + mapping_gfp_mask(mapping)); +- if (IS_ERR_OR_NULL(folio)) ++ if (IS_ERR(folio)) + goto err_unlock; + + offset = pos - folio_pos(folio); +diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c +index 6d3a05ae5da8..535bc5fcbcc0 100644 +--- a/fs/bcachefs/fs-io-direct.c ++++ b/fs/bcachefs/fs-io-direct.c +@@ -70,8 +70,10 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + struct bch_io_opts opts; + struct dio_read *dio; + struct bio *bio; ++ struct blk_plug plug; + loff_t offset = req->ki_pos; + bool sync = is_sync_kiocb(req); ++ bool split = false; + size_t shorten; + ssize_t ret; + +@@ -98,8 +100,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + GFP_KERNEL, + &c->dio_read_bioset); + +- bio->bi_end_io = bch2_direct_IO_read_endio; +- + dio = container_of(bio, struct dio_read, rbio.bio); + closure_init(&dio->cl, NULL); + +@@ -128,14 +128,17 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + */ + dio->should_dirty = 
iter_is_iovec(iter); + ++ blk_start_plug(&plug); ++ + goto start; + while (iter->count) { ++ split = true; ++ + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->bio_read); +- bio->bi_end_io = bch2_direct_IO_read_split_endio; + start: + bio->bi_opf = REQ_OP_READ|REQ_SYNC; + bio->bi_iter.bi_sector = offset >> 9; +@@ -157,9 +160,19 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) + if (iter->count) + closure_get(&dio->cl); + +- bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); ++ struct bch_read_bio *rbio = ++ rbio_init(bio, ++ c, ++ opts, ++ split ++ ? bch2_direct_IO_read_split_endio ++ : bch2_direct_IO_read_endio); ++ ++ bch2_read(c, rbio, inode_inum(inode)); + } + ++ blk_finish_plug(&plug); ++ + iter->count += shorten; + + if (sync) { +@@ -506,8 +519,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) + dio->op.devs_need_flush = &inode->ei_devs_need_flush; + + if (sync) +- dio->op.flags |= BCH_WRITE_SYNC; +- dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; ++ dio->op.flags |= BCH_WRITE_sync; ++ dio->op.flags |= BCH_WRITE_check_enospc; + + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + bio_sectors(bio), true); +diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c +index 1d4910ea0f1d..e072900e6a5b 100644 +--- a/fs/bcachefs/fs-io-pagecache.c ++++ b/fs/bcachefs/fs-io-pagecache.c +@@ -29,7 +29,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, + break; + + f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); +- if (IS_ERR_OR_NULL(f)) ++ if (IS_ERR(f)) + break; + + BUG_ON(fs->nr && folio_pos(f) != pos); +@@ -199,7 +199,7 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, + unsigned folio_idx = 0; + + return bch2_trans_run(c, +- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, ++ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, + POS(inum.inum, 
offset), + POS(inum.inum, U64_MAX), + inum.subvol, BTREE_ITER_slots, k, ({ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +index 2456c41b215e..94bf34b9b65f 100644 +--- a/fs/bcachefs/fs-io.c ++++ b/fs/bcachefs/fs-io.c +@@ -167,6 +167,34 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + + /* fsync: */ + ++static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, ++ u64 *seq) ++{ ++ struct printbuf buf = PRINTBUF; ++ struct bch_inode_unpacked u; ++ struct btree_iter iter; ++ int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); ++ if (ret) ++ return ret; ++ ++ u64 cur_seq = journal_cur_seq(&trans->c->journal); ++ *seq = min(cur_seq, u.bi_journal_seq); ++ ++ if (fsck_err_on(u.bi_journal_seq > cur_seq, ++ trans, inode_journal_seq_in_future, ++ "inode journal seq in future (currently at %llu)\n%s", ++ cur_seq, ++ (bch2_inode_unpacked_to_text(&buf, &u), ++ buf.buf))) { ++ u.bi_journal_seq = cur_seq; ++ ret = bch2_inode_write(trans, &iter, &u); ++ } ++fsck_err: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++} ++ + /* + * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an + * insert trigger: look up the btree inode instead +@@ -180,9 +208,10 @@ static int bch2_flush_inode(struct bch_fs *c, + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) + return -EROFS; + +- struct bch_inode_unpacked u; +- int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: +- bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?: ++ u64 seq; ++ int ret = bch2_trans_commit_do(c, NULL, NULL, 0, ++ bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: ++ bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: + bch2_inode_flush_nocow_writes(c, inode); + bch2_write_ref_put(c, BCH_WRITE_REF_fsync); + return ret; +@@ -222,7 +251,7 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, + struct bpos end) + { + return 
bch2_trans_run(c, +- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end, ++ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, + subvol, 0, k, ({ + bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); + }))); +@@ -256,7 +285,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, + + folio = __filemap_get_folio(mapping, index, + FGP_LOCK|FGP_CREAT, GFP_KERNEL); +- if (IS_ERR_OR_NULL(folio)) { ++ if (IS_ERR(folio)) { + ret = -ENOMEM; + goto out; + } +@@ -806,7 +835,7 @@ static int quota_reserve_range(struct bch_inode_info *inode, + u64 sectors = end - start; + + int ret = bch2_trans_run(c, +- for_each_btree_key_in_subvolume_upto(trans, iter, ++ for_each_btree_key_in_subvolume_max(trans, iter, + BTREE_ID_extents, + POS(inode->v.i_ino, start), + POS(inode->v.i_ino, end - 1), +@@ -877,11 +906,18 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + bch2_mark_pagecache_unallocated(src, pos_src >> 9, + (pos_src + aligned_len) >> 9); + ++ /* ++ * XXX: we'd like to be telling bch2_remap_range() if we have ++ * permission to write to the source file, and thus if io path option ++ * changes should be propagated through the copy, but we need mnt_idmap ++ * from the pathwalk, awkward ++ */ + ret = bch2_remap_range(c, + inode_inum(dst), pos_dst >> 9, + inode_inum(src), pos_src >> 9, + aligned_len >> 9, +- pos_dst + len, &i_sectors_delta); ++ pos_dst + len, &i_sectors_delta, ++ false); + if (ret < 0) + goto err; + +@@ -922,7 +958,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) + return -ENXIO; + + int ret = bch2_trans_run(c, +- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, ++ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), + POS(inode->v.i_ino, U64_MAX), + inum.subvol, 0, k, ({ +@@ -958,7 +994,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) + return -ENXIO; + + int ret = 
bch2_trans_run(c, +- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, ++ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, + POS(inode->v.i_ino, offset >> 9), + POS(inode->v.i_ino, U64_MAX), + inum.subvol, BTREE_ITER_slots, k, ({ +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +index 405cf08bda34..15725b4ce393 100644 +--- a/fs/bcachefs/fs-ioctl.c ++++ b/fs/bcachefs/fs-ioctl.c +@@ -406,7 +406,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + sync_inodes_sb(c->vfs_sb); + up_read(&c->vfs_sb->s_umount); + } +-retry: ++ + if (arg.src_ptr) { + error = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.src_ptr, +@@ -486,11 +486,6 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + err2: + if (arg.src_ptr) + path_put(&src_path); +- +- if (retry_estale(error, lookup_flags)) { +- lookup_flags |= LOOKUP_REVAL; +- goto retry; +- } + err1: + return error; + } +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +index a41d0d8a2f7b..90ade8f648d9 100644 +--- a/fs/bcachefs/fs.c ++++ b/fs/bcachefs/fs.c +@@ -23,6 +23,7 @@ + #include "journal.h" + #include "keylist.h" + #include "quota.h" ++#include "rebalance.h" + #include "snapshot.h" + #include "super.h" + #include "xattr.h" +@@ -38,6 +39,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -65,6 +67,9 @@ void bch2_inode_update_after_write(struct btree_trans *trans, + i_gid_write(&inode->v, bi->bi_gid); + inode->v.i_mode = bi->bi_mode; + ++ if (fields & ATTR_SIZE) ++ i_size_write(&inode->v, bi->bi_size); ++ + if (fields & ATTR_ATIME) + inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); + if (fields & ATTR_MTIME) +@@ -89,10 +94,25 @@ int __must_check bch2_write_inode(struct bch_fs *c, + retry: + bch2_trans_begin(trans); + +- ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), +- BTREE_ITER_intent) ?: +- (set ? 
set(trans, inode, &inode_u, p) : 0) ?: +- bch2_inode_write(trans, &iter, &inode_u) ?: ++ ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); ++ if (ret) ++ goto err; ++ ++ struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); ++ ++ ret = (set ? set(trans, inode, &inode_u, p) : 0); ++ if (ret) ++ goto err; ++ ++ struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); ++ ++ if (memcmp(&old_r, &new_r, sizeof(new_r))) { ++ ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_inode_write(trans, &iter, &inode_u) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + + /* +@@ -101,7 +121,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, + */ + if (!ret) + bch2_inode_update_after_write(trans, inode, &inode_u, fields); +- ++err: + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +@@ -160,8 +180,9 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) + static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) + { + const subvol_inum *inum = data; ++ siphash_key_t k = { .key[0] = seed }; + +- return jhash(&inum->inum, sizeof(inum->inum), seed); ++ return siphash_2u64(inum->subvol, inum->inum, &k); + } + + static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) +@@ -190,11 +211,18 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { + .automatic_shrinking = true, + }; + ++static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { ++ .head_offset = offsetof(struct bch_inode_info, by_inum_hash), ++ .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), ++ .key_len = sizeof(u64), ++ .automatic_shrinking = true, ++}; ++ + int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) + { + struct bch_fs *c = trans->c; +- struct rhashtable *ht = &c->vfs_inodes_table; +- subvol_inum 
inum = (subvol_inum) { .inum = p.offset }; ++ struct rhltable *ht = &c->vfs_inodes_by_inum_table; ++ u64 inum = p.offset; + DARRAY(u32) subvols; + int ret = 0; + +@@ -219,15 +247,15 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) + struct rhash_lock_head __rcu *const *bkt; + struct rhash_head *he; + unsigned int hash; +- struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); ++ struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); + restart: +- hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); ++ hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); + bkt = rht_bucket(tbl, hash); + do { + struct bch_inode_info *inode; + + rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { +- if (inode->ei_inum.inum == inum.inum) { ++ if (inode->ei_inum.inum == inum) { + ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, + GFP_NOWAIT|__GFP_NOWARN); + if (ret) { +@@ -248,7 +276,7 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) + /* Ensure we see any new tables. 
*/ + smp_rmb(); + +- tbl = rht_dereference_rcu(tbl->future_tbl, ht); ++ tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); + if (unlikely(tbl)) + goto restart; + rcu_read_unlock(); +@@ -327,7 +355,11 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod + spin_unlock(&inode->v.i_lock); + + if (remove) { +- int ret = rhashtable_remove_fast(&c->vfs_inodes_table, ++ int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, ++ &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); ++ BUG_ON(ret); ++ ++ ret = rhashtable_remove_fast(&c->vfs_inodes_table, + &inode->hash, bch2_vfs_inodes_params); + BUG_ON(ret); + inode->v.i_hash.pprev = NULL; +@@ -372,6 +404,11 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, + discard_new_inode(&inode->v); + return old; + } else { ++ int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, ++ &inode->by_inum_hash, ++ bch2_vfs_inodes_by_inum_params); ++ BUG_ON(ret); ++ + inode_fake_hash(&inode->v); + + inode_sb_list_add(&inode->v); +@@ -465,7 +502,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) + struct bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + int ret = lockrestart_do(trans, +- bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: ++ bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: + PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); + bch2_trans_put(trans); +@@ -535,8 +572,7 @@ __bch2_create(struct mnt_idmap *idmap, + inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; + inum.inum = inode_u.bi_inum; + +- ret = bch2_subvolume_get(trans, inum.subvol, true, +- BTREE_ITER_with_updates, &subvol) ?: ++ ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: + bch2_trans_commit(trans, NULL, &journal_seq, 0); + if (unlikely(ret)) { + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, +@@ -549,7 +585,7 @@ __bch2_create(struct 
mnt_idmap *idmap, + + if (!(flags & BCH_CREATE_TMPFILE)) { + bch2_inode_update_after_write(trans, dir, &dir_u, +- ATTR_MTIME|ATTR_CTIME); ++ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); + mutex_unlock(&dir->ei_update_lock); + } + +@@ -617,7 +653,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, + + struct bch_subvolume subvol; + struct bch_inode_unpacked inode_u; +- ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: ++ ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: + bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); + +@@ -628,7 +664,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, + goto err; + + /* regular files may have hardlinks: */ +- if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) && ++ if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && + !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), + c, + "dirent points to inode that does not point back:\n %s", +@@ -706,7 +742,7 @@ static int __bch2_link(struct bch_fs *c, + + if (likely(!ret)) { + bch2_inode_update_after_write(trans, dir, &dir_u, +- ATTR_MTIME|ATTR_CTIME); ++ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); + bch2_inode_update_after_write(trans, inode, &inode_u, ATTR_CTIME); + } + +@@ -759,7 +795,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + goto err; + + bch2_inode_update_after_write(trans, dir, &dir_u, +- ATTR_MTIME|ATTR_CTIME); ++ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); + bch2_inode_update_after_write(trans, inode, &inode_u, + ATTR_MTIME); + +@@ -937,11 +973,11 @@ static int bch2_rename2(struct mnt_idmap *idmap, + dst_inode->v.i_ino != dst_inode_u.bi_inum); + + bch2_inode_update_after_write(trans, src_dir, &src_dir_u, +- ATTR_MTIME|ATTR_CTIME); ++ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); + + if (src_dir != dst_dir) + bch2_inode_update_after_write(trans, dst_dir, &dst_dir_u, +- 
ATTR_MTIME|ATTR_CTIME); ++ ATTR_MTIME|ATTR_CTIME|ATTR_SIZE); + + bch2_inode_update_after_write(trans, src_inode, &src_inode_u, + ATTR_CTIME); +@@ -1245,7 +1281,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_buf cur, prev; +- unsigned offset_into_extent, sectors; + bool have_extent = false; + int ret = 0; + +@@ -1278,7 +1313,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + + bch2_btree_iter_set_snapshot(&iter, snapshot); + +- k = bch2_btree_iter_peek_upto(&iter, end); ++ k = bch2_btree_iter_peek_max(&iter, end); + ret = bkey_err(k); + if (ret) + continue; +@@ -1292,9 +1327,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + continue; + } + +- offset_into_extent = iter.pos.offset - +- bkey_start_offset(k.k); +- sectors = k.k->size - offset_into_extent; ++ s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); ++ unsigned sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&cur, c, k); + +@@ -1306,7 +1340,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + k = bkey_i_to_s_c(cur.k); + bch2_bkey_buf_realloc(&prev, c, k.k->u64s); + +- sectors = min(sectors, k.k->size - offset_into_extent); ++ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); + + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + +@@ -1736,7 +1770,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, + bch2_inode_update_after_write(trans, inode, bi, ~0); + + inode->v.i_blocks = bi->bi_sectors; +- inode->v.i_ino = bi->bi_inum; + inode->v.i_rdev = bi->bi_dev; + inode->v.i_generation = bi->bi_generation; + inode->v.i_size = bi->bi_size; +@@ -2200,7 +2233,8 @@ static int bch2_fs_get_tree(struct fs_context *fc) + sb->s_time_gran = c->sb.nsec_per_time_unit; + sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; + sb->s_time_max = 
div_s64(S64_MAX, c->sb.time_units_per_sec); +- sb->s_uuid = c->sb.user_uuid; ++ super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); ++ super_set_sysfs_name_uuid(sb); + sb->s_shrink->seeks = 0; + c->vfs_sb = sb; + strscpy(sb->s_id, c->name, sizeof(sb->s_id)); +@@ -2345,13 +2379,16 @@ static int bch2_init_fs_context(struct fs_context *fc) + + void bch2_fs_vfs_exit(struct bch_fs *c) + { ++ if (c->vfs_inodes_by_inum_table.ht.tbl) ++ rhltable_destroy(&c->vfs_inodes_by_inum_table); + if (c->vfs_inodes_table.tbl) + rhashtable_destroy(&c->vfs_inodes_table); + } + + int bch2_fs_vfs_init(struct bch_fs *c) + { +- return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); ++ return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: ++ rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); + } + + static struct file_system_type bcache_fs_type = { +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +index 59f9f7ae728d..dd2198541455 100644 +--- a/fs/bcachefs/fs.h ++++ b/fs/bcachefs/fs.h +@@ -14,6 +14,7 @@ + struct bch_inode_info { + struct inode v; + struct rhash_head hash; ++ struct rhlist_head by_inum_hash; + subvol_inum ei_inum; + + struct list_head ei_vfs_inode_list; +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +index 75c8a97a6954..8fcf7c8e5ede 100644 +--- a/fs/bcachefs/fsck.c ++++ b/fs/bcachefs/fsck.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include "bcachefs.h" ++#include "bcachefs_ioctl.h" + #include "bkey_buf.h" + #include "btree_cache.h" + #include "btree_update.h" +@@ -16,6 +17,7 @@ + #include "recovery_passes.h" + #include "snapshot.h" + #include "super.h" ++#include "thread_with_file.h" + #include "xattr.h" + + #include +@@ -73,7 +75,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, + { + u64 sectors = 0; + +- int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, ++ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, + SPOS(inum, 
0, snapshot), + POS(inum, U64_MAX), + 0, k, ({ +@@ -90,7 +92,7 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, + { + u64 subdirs = 0; + +- int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, ++ int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ({ +@@ -107,7 +109,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) + { + struct bch_subvolume s; +- int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); ++ int ret = bch2_subvolume_get(trans, subvol, false, &s); + + *snapshot = le32_to_cpu(s.snapshot); + *inum = le64_to_cpu(s.inode); +@@ -170,7 +172,7 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, + if (ret) + return ret; + +- struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + *target = le64_to_cpu(d.v->d_inum); + *type = d.v->d_type; + bch2_trans_iter_exit(trans, &iter); +@@ -203,6 +205,36 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) + return ret; + } + ++/* ++ * Find any subvolume associated with a tree of snapshots ++ * We can't rely on master_subvol - it might have been deleted. 
++ */ ++static int find_snapshot_tree_subvol(struct btree_trans *trans, ++ u32 tree_id, u32 *subvol) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_snapshot) ++ continue; ++ ++ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); ++ if (le32_to_cpu(s.v->tree) != tree_id) ++ continue; ++ ++ if (s.v->subvol) { ++ *subvol = le32_to_cpu(s.v->subvol); ++ goto found; ++ } ++ } ++ ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol; ++found: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ + /* Get lost+found, create if it doesn't exist: */ + static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + struct bch_inode_unpacked *lostfound, +@@ -210,6 +242,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + { + struct bch_fs *c = trans->c; + struct qstr lostfound_str = QSTR("lost+found"); ++ struct btree_iter lostfound_iter = { NULL }; + u64 inum = 0; + unsigned d_type = 0; + int ret; +@@ -220,20 +253,24 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + if (ret) + return ret; + +- subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; ++ u32 subvolid; ++ ret = find_snapshot_tree_subvol(trans, ++ bch2_snapshot_tree(c, snapshot), &subvolid); ++ bch_err_msg(c, ret, "finding subvol associated with snapshot tree %u", ++ bch2_snapshot_tree(c, snapshot)); ++ if (ret) ++ return ret; + + struct bch_subvolume subvol; +- ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), +- false, 0, &subvol); +- bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", +- le32_to_cpu(st.master_subvol), snapshot); ++ ret = bch2_subvolume_get(trans, subvolid, false, &subvol); ++ bch_err_msg(c, ret, "looking up subvol %u for snapshot %u", subvolid, snapshot); + if (ret) + return ret; + + if (!subvol.inode) { + struct btree_iter iter; + struct bkey_i_subvolume *subvol = 
bch2_bkey_get_mut_typed(trans, &iter, +- BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)), ++ BTREE_ID_subvolumes, POS(0, subvolid), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(subvol); + if (ret) +@@ -243,13 +280,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + bch2_trans_iter_exit(trans, &iter); + } + +- root_inum.inum = le64_to_cpu(subvol.inode); ++ subvol_inum root_inum = { ++ .subvol = subvolid, ++ .inum = le64_to_cpu(subvol.inode) ++ }; + + struct bch_inode_unpacked root_inode; + struct bch_hash_info root_hash_info; + ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); + bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", +- root_inum.inum, le32_to_cpu(st.master_subvol)); ++ root_inum.inum, subvolid); + if (ret) + return ret; + +@@ -288,11 +328,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, + * XXX: we could have a nicer log message here if we had a nice way to + * walk backpointers to print a path + */ +- bch_notice(c, "creating lost+found in subvol %llu snapshot %u", +- root_inum.subvol, le32_to_cpu(st.root_snapshot)); ++ struct printbuf path = PRINTBUF; ++ ret = bch2_inum_to_path(trans, root_inum, &path); ++ if (ret) ++ goto err; ++ ++ bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u", ++ path.buf, root_inum.subvol, snapshot); ++ printbuf_exit(&path); + + u64 now = bch2_current_time(c); +- struct btree_iter lostfound_iter = { NULL }; + u64 cpu = raw_smp_processor_id(); + + bch2_inode_init_early(c, lostfound); +@@ -451,7 +496,9 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * + continue; + + struct bch_inode_unpacked child_inode; +- bch2_inode_unpack(k, &child_inode); ++ ret = bch2_inode_unpack(k, &child_inode); ++ if (ret) ++ break; + + if (!inode_should_reattach(&child_inode)) { + ret = maybe_delete_dirent(trans, +@@ -482,6 +529,13 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * + 
return ret; + } + ++static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos pos) ++{ ++ return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); ++} ++ + static int remove_backpointer(struct btree_trans *trans, + struct bch_inode_unpacked *inode) + { +@@ -490,13 +544,11 @@ static int remove_backpointer(struct btree_trans *trans, + + struct bch_fs *c = trans->c; + struct btree_iter iter; +- struct bkey_s_c_dirent d = +- bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, +- SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0, +- dirent); +- int ret = bkey_err(d) ?: +- dirent_points_to_inode(c, d, inode) ?: +- __remove_dirent(trans, d.k->p); ++ struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, ++ SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); ++ int ret = bkey_err(d) ?: ++ dirent_points_to_inode(c, d, inode) ?: ++ __remove_dirent(trans, d.k->p); + bch2_trans_iter_exit(trans, &iter); + return ret; + } +@@ -613,7 +665,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 + struct btree_iter iter = {}; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); +- struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); ++ struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0)); + bch2_trans_iter_exit(trans, &iter); + int ret = bkey_err(k); + if (ret) +@@ -780,11 +832,13 @@ struct inode_walker { + struct bpos last_pos; + + DARRAY(struct inode_walker_entry) inodes; ++ snapshot_id_list deletes; + }; + + static void inode_walker_exit(struct inode_walker *w) + { + darray_exit(&w->inodes); ++ darray_exit(&w->deletes); + } + + static struct inode_walker inode_walker_init(void) +@@ -797,9 +851,8 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, + { + struct bch_inode_unpacked u; + +- BUG_ON(bch2_inode_unpack(inode, &u)); +- +- return darray_push(&w->inodes, 
((struct inode_walker_entry) { ++ return bch2_inode_unpack(inode, &u) ?: ++ darray_push(&w->inodes, ((struct inode_walker_entry) { + .inode = u, + .snapshot = inode.k->p.snapshot, + })); +@@ -909,8 +962,9 @@ static int get_visible_inodes(struct btree_trans *trans, + int ret; + + w->inodes.nr = 0; ++ w->deletes.nr = 0; + +- for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), ++ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; +@@ -918,10 +972,13 @@ static int get_visible_inodes(struct btree_trans *trans, + if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) + continue; + +- if (bkey_is_inode(k.k)) +- add_inode(c, w, k); ++ if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) ++ continue; + +- if (k.k->p.snapshot >= s->pos.snapshot) ++ ret = bkey_is_inode(k.k) ++ ? add_inode(c, w, k) ++ : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); ++ if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); +@@ -929,69 +986,16 @@ static int get_visible_inodes(struct btree_trans *trans, + return ret; + } + +-static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) +-{ +- if (d.v->d_type == DT_SUBVOL) { +- u32 snap; +- u64 inum; +- int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum); +- if (ret && !bch2_err_matches(ret, ENOENT)) +- return ret; +- return !ret; +- } else { +- struct btree_iter iter; +- struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, +- SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); +- int ret = bkey_err(k); +- if (ret) +- return ret; +- +- ret = bkey_is_inode(k.k); +- bch2_trans_iter_exit(trans, &iter); +- return ret; +- } +-} +- + /* + * Prefer to delete the first one, since that will be the one at the wrong + * offset: + * return value: 0 -> delete k1, 1 -> delete k2 + */ +-static int hash_pick_winner(struct 
btree_trans *trans, +- const struct bch_hash_desc desc, +- struct bch_hash_info *hash_info, +- struct bkey_s_c k1, +- struct bkey_s_c k2) +-{ +- if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && +- !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) +- return 0; +- +- switch (desc.btree_id) { +- case BTREE_ID_dirents: { +- int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1)); +- if (ret < 0) +- return ret; +- if (!ret) +- return 0; +- +- ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2)); +- if (ret < 0) +- return ret; +- if (!ret) +- return 1; +- return 2; +- } +- default: +- return 0; +- } +-} +- +-static int fsck_update_backpointers(struct btree_trans *trans, +- struct snapshots_seen *s, +- const struct bch_hash_desc desc, +- struct bch_hash_info *hash_info, +- struct bkey_i *new) ++int bch2_fsck_update_backpointers(struct btree_trans *trans, ++ struct snapshots_seen *s, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *hash_info, ++ struct bkey_i *new) + { + if (new->k.type != KEY_TYPE_dirent) + return 0; +@@ -1019,160 +1023,6 @@ static int fsck_update_backpointers(struct btree_trans *trans, + return ret; + } + +-static int fsck_rename_dirent(struct btree_trans *trans, +- struct snapshots_seen *s, +- const struct bch_hash_desc desc, +- struct bch_hash_info *hash_info, +- struct bkey_s_c_dirent old) +-{ +- struct qstr old_name = bch2_dirent_get_name(old); +- struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); +- int ret = PTR_ERR_OR_ZERO(new); +- if (ret) +- return ret; +- +- bkey_dirent_init(&new->k_i); +- dirent_copy_target(new, old); +- new->k.p = old.k->p; +- +- for (unsigned i = 0; i < 1000; i++) { +- unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", +- old_name.len, old_name.name, i); +- unsigned u64s = BKEY_U64s + dirent_val_u64s(len); +- +- if (u64s > U8_MAX) +- return -EINVAL; +- +- new->k.u64s = u64s; +- +- ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, +- (subvol_inum) { 0, 
old.k->p.inode }, +- old.k->p.snapshot, &new->k_i, +- BTREE_UPDATE_internal_snapshot_node); +- if (!bch2_err_matches(ret, EEXIST)) +- break; +- } +- +- if (ret) +- return ret; +- +- return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); +-} +- +-static int hash_check_key(struct btree_trans *trans, +- struct snapshots_seen *s, +- const struct bch_hash_desc desc, +- struct bch_hash_info *hash_info, +- struct btree_iter *k_iter, struct bkey_s_c hash_k) +-{ +- struct bch_fs *c = trans->c; +- struct btree_iter iter = { NULL }; +- struct printbuf buf = PRINTBUF; +- struct bkey_s_c k; +- u64 hash; +- int ret = 0; +- +- if (hash_k.k->type != desc.key_type) +- return 0; +- +- hash = desc.hash_bkey(hash_info, hash_k); +- +- if (likely(hash == hash_k.k->p.offset)) +- return 0; +- +- if (hash_k.k->p.offset < hash) +- goto bad_hash; +- +- for_each_btree_key_norestart(trans, iter, desc.btree_id, +- SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), +- BTREE_ITER_slots, k, ret) { +- if (bkey_eq(k.k->p, hash_k.k->p)) +- break; +- +- if (k.k->type == desc.key_type && +- !desc.cmp_bkey(k, hash_k)) +- goto duplicate_entries; +- +- if (bkey_deleted(k.k)) { +- bch2_trans_iter_exit(trans, &iter); +- goto bad_hash; +- } +- } +-out: +- bch2_trans_iter_exit(trans, &iter); +- printbuf_exit(&buf); +- return ret; +-bad_hash: +- if (fsck_err(trans, hash_table_key_wrong_offset, +- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", +- bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, +- (printbuf_reset(&buf), +- bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { +- struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); +- if (IS_ERR(new)) +- return PTR_ERR(new); +- +- k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, +- (subvol_inum) { 0, hash_k.k->p.inode }, +- hash_k.k->p.snapshot, new, +- STR_HASH_must_create| +- BTREE_ITER_with_updates| +- BTREE_UPDATE_internal_snapshot_node); +- 
ret = bkey_err(k); +- if (ret) +- goto out; +- if (k.k) +- goto duplicate_entries; +- +- ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, +- BTREE_UPDATE_internal_snapshot_node) ?: +- fsck_update_backpointers(trans, s, desc, hash_info, new) ?: +- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: +- -BCH_ERR_transaction_restart_nested; +- goto out; +- } +-fsck_err: +- goto out; +-duplicate_entries: +- ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); +- if (ret < 0) +- goto out; +- +- if (!fsck_err(trans, hash_table_key_duplicate, +- "duplicate hash table keys%s:\n%s", +- ret != 2 ? "" : ", both point to valid inodes", +- (printbuf_reset(&buf), +- bch2_bkey_val_to_text(&buf, c, hash_k), +- prt_newline(&buf), +- bch2_bkey_val_to_text(&buf, c, k), +- buf.buf))) +- goto out; +- +- switch (ret) { +- case 0: +- ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); +- break; +- case 1: +- ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); +- break; +- case 2: +- ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: +- bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); +- goto out; +- } +- +- ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: +- -BCH_ERR_transaction_restart_nested; +- goto out; +-} +- +-static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bpos pos) +-{ +- return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); +-} +- + static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, +@@ -1260,12 +1110,43 @@ static int get_snapshot_root_inode(struct btree_trans *trans, + goto err; + BUG(); + found_root: +- BUG_ON(bch2_inode_unpack(k, root)); ++ ret = bch2_inode_unpack(k, root); + err: + bch2_trans_iter_exit(trans, &iter); + return ret; + } + ++static int check_directory_size(struct btree_trans *trans, ++ struct 
bch_inode_unpacked *inode_u, ++ struct bkey_s_c inode_k, bool *write_inode) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ u64 new_size = 0; ++ int ret; ++ ++ for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, ++ SPOS(inode_k.k->p.offset, 0, inode_k.k->p.snapshot), ++ POS(inode_k.k->p.offset, U64_MAX), ++ 0, k, ret) { ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ struct bkey_s_c_dirent dirent = bkey_s_c_to_dirent(k); ++ struct qstr name = bch2_dirent_get_name(dirent); ++ ++ new_size += dirent_occupied_size(&name); ++ } ++ bch2_trans_iter_exit(trans, &iter); ++ ++ if (!ret && inode_u->bi_size != new_size) { ++ inode_u->bi_size = new_size; ++ *write_inode = true; ++ } ++ ++ return ret; ++} ++ + static int check_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, +@@ -1291,7 +1172,9 @@ static int check_inode(struct btree_trans *trans, + if (!bkey_is_inode(k.k)) + return 0; + +- BUG_ON(bch2_inode_unpack(k, &u)); ++ ret = bch2_inode_unpack(k, &u); ++ if (ret) ++ goto err; + + if (snapshot_root->bi_inum != u.bi_inum) { + ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); +@@ -1302,7 +1185,7 @@ static int check_inode(struct btree_trans *trans, + if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), + trans, inode_snapshot_mismatch, +- "inodes in different snapshots don't match")) { ++ "inode hash info in different snapshots don't match")) { + u.bi_hash_seed = snapshot_root->bi_hash_seed; + SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); + do_update = true; +@@ -1392,7 +1275,7 @@ static int check_inode(struct btree_trans *trans, + + if (fsck_err_on(!ret, + trans, inode_unlinked_and_not_open, +- "inode %llu%u unlinked and not open", ++ "inode %llu:%u unlinked and not open", + u.bi_inum, u.bi_snapshot)) { + ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck deleting inode"); +@@ 
-1415,7 +1298,7 @@ static int check_inode(struct btree_trans *trans, + if (u.bi_subvol) { + struct bch_subvolume s; + +- ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); ++ ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + +@@ -1441,6 +1324,27 @@ static int check_inode(struct btree_trans *trans, + do_update = true; + } + } ++ ++ if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), ++ trans, inode_journal_seq_in_future, ++ "inode journal seq in future (currently at %llu)\n%s", ++ journal_cur_seq(&c->journal), ++ (printbuf_reset(&buf), ++ bch2_inode_unpacked_to_text(&buf, &u), ++ buf.buf))) { ++ u.bi_journal_seq = journal_cur_seq(&c->journal); ++ do_update = true; ++ } ++ ++ if (S_ISDIR(u.bi_mode)) { ++ ret = check_directory_size(trans, &u, k, &do_update); ++ ++ fsck_err_on(ret, ++ trans, directory_size_mismatch, ++ "directory inode %llu:%u with the mismatch directory size", ++ u.bi_inum, k.k->p.snapshot); ++ ret = 0; ++ } + do_update: + if (do_update) { + ret = __bch2_fsck_write_inode(trans, &u); +@@ -1502,7 +1406,9 @@ static int find_oldest_inode_needs_reattach(struct btree_trans *trans, + break; + + struct bch_inode_unpacked parent_inode; +- bch2_inode_unpack(k, &parent_inode); ++ ret = bch2_inode_unpack(k, &parent_inode); ++ if (ret) ++ break; + + if (!inode_should_reattach(&parent_inode)) + break; +@@ -1525,7 +1431,9 @@ static int check_unreachable_inode(struct btree_trans *trans, + return 0; + + struct bch_inode_unpacked inode; +- BUG_ON(bch2_inode_unpack(k, &inode)); ++ ret = bch2_inode_unpack(k, &inode); ++ if (ret) ++ return ret; + + if (!inode_should_reattach(&inode)) + return 0; +@@ -1649,7 +1557,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal + if (i->count != count2) { + bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", + w->last_pos.inode, i->snapshot, i->count, count2); +- return 
-BCH_ERR_internal_fsck_err; ++ i->count = count2; + } + + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), +@@ -1753,7 +1661,7 @@ static int overlapping_extents_found(struct btree_trans *trans, + bch2_trans_iter_init(trans, &iter1, btree, pos1, + BTREE_ITER_all_snapshots| + BTREE_ITER_not_extents); +- k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); ++ k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); + ret = bkey_err(k1); + if (ret) + goto err; +@@ -1778,7 +1686,7 @@ static int overlapping_extents_found(struct btree_trans *trans, + while (1) { + bch2_btree_iter_advance(&iter2); + +- k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); ++ k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); + ret = bkey_err(k2); + if (ret) + goto err; +@@ -2156,7 +2064,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, + return __bch2_fsck_write_inode(trans, target); + } + +- if (bch2_inode_should_have_bp(target) && ++ if (bch2_inode_should_have_single_bp(target) && + !fsck_err(trans, inode_wrong_backpointer, + "dirent points to inode that does not point back:\n %s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), +@@ -2480,7 +2388,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + *hash_info = bch2_hash_info_init(c, &i->inode); + dir->first_this_inode = false; + +- ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); ++ ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); + if (ret < 0) + goto err; + if (ret) { +@@ -2519,6 +2427,30 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + if (ret) + goto err; + } ++ ++ darray_for_each(target->deletes, i) ++ if (fsck_err_on(!snapshot_list_has_id(&s->ids, *i), ++ trans, dirent_to_overwritten_inode, ++ "dirent points to inode overwritten in snapshot %u:\n%s", ++ *i, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), ++ buf.buf))) 
{ ++ struct btree_iter delete_iter; ++ bch2_trans_iter_init(trans, &delete_iter, ++ BTREE_ID_dirents, ++ SPOS(k.k->p.inode, k.k->p.offset, *i), ++ BTREE_ITER_intent); ++ ret = bch2_btree_iter_traverse(&delete_iter) ?: ++ bch2_hash_delete_at(trans, bch2_dirent_hash_desc, ++ hash_info, ++ &delete_iter, ++ BTREE_UPDATE_internal_snapshot_node); ++ bch2_trans_iter_exit(trans, &delete_iter); ++ if (ret) ++ goto err; ++ ++ } + } + + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); +@@ -2594,7 +2526,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, + *hash_info = bch2_hash_info_init(c, &i->inode); + inode->first_this_inode = false; + +- ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); ++ ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); + bch_err_fn(c, ret); + return ret; + } +@@ -2774,6 +2706,48 @@ struct pathbuf_entry { + + typedef DARRAY(struct pathbuf_entry) pathbuf; + ++static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p, ++ u32 new_depth) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, p->inum, p->snapshot), 0); ++ ++ struct bch_inode_unpacked inode; ++ int ret = bkey_err(k) ?: ++ !bkey_is_inode(k.k) ? 
-BCH_ERR_ENOENT_inode ++ : bch2_inode_unpack(k, &inode); ++ if (ret) ++ goto err; ++ ++ if (inode.bi_depth != new_depth) { ++ inode.bi_depth = new_depth; ++ ret = __bch2_fsck_write_inode(trans, &inode) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth) ++{ ++ u32 restart_count = trans->restart_count; ++ int ret = 0; ++ ++ darray_for_each_reverse(*path, i) { ++ ret = nested_lockrestart_do(trans, ++ bch2_bi_depth_renumber_one(trans, i, new_bi_depth)); ++ bch_err_fn(trans->c, ret); ++ if (ret) ++ break; ++ ++ new_bi_depth++; ++ } ++ ++ return ret ?: trans_was_restarted(trans, restart_count); ++} ++ + static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) + { + darray_for_each(*p, i) +@@ -2783,21 +2757,21 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) + return false; + } + +-static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) ++static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) + { + struct bch_fs *c = trans->c; + struct btree_iter inode_iter = {}; +- struct bch_inode_unpacked inode; ++ pathbuf path = {}; + struct printbuf buf = PRINTBUF; + u32 snapshot = inode_k.k->p.snapshot; ++ bool redo_bi_depth = false; ++ u32 min_bi_depth = U32_MAX; + int ret = 0; + +- p->nr = 0; +- +- BUG_ON(bch2_inode_unpack(inode_k, &inode)); +- +- if (!S_ISDIR(inode.bi_mode)) +- return 0; ++ struct bch_inode_unpacked inode; ++ ret = bch2_inode_unpack(inode_k, &inode); ++ if (ret) ++ return ret; + + while (!inode.bi_subvol) { + struct btree_iter dirent_iter; +@@ -2807,7 +2781,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino + d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); + ret = bkey_err(d.s_c); + if (ret && !bch2_err_matches(ret, ENOENT)) +- break; ++ goto out; + + if (!ret && (ret = 
dirent_points_to_inode(c, d, &inode))) + bch2_trans_iter_exit(trans, &dirent_iter); +@@ -2822,7 +2796,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino + + bch2_trans_iter_exit(trans, &dirent_iter); + +- ret = darray_push(p, ((struct pathbuf_entry) { ++ ret = darray_push(&path, ((struct pathbuf_entry) { + .inum = inode.bi_inum, + .snapshot = snapshot, + })); +@@ -2834,22 +2808,32 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino + bch2_trans_iter_exit(trans, &inode_iter); + inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, + SPOS(0, inode.bi_dir, snapshot), 0); ++ ++ struct bch_inode_unpacked parent_inode; + ret = bkey_err(inode_k) ?: + !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode +- : bch2_inode_unpack(inode_k, &inode); ++ : bch2_inode_unpack(inode_k, &parent_inode); + if (ret) { + /* Should have been caught in dirents pass */ + bch_err_msg(c, ret, "error looking up parent directory"); +- break; ++ goto out; + } + ++ min_bi_depth = parent_inode.bi_depth; ++ ++ if (parent_inode.bi_depth < inode.bi_depth && ++ min_bi_depth < U16_MAX) ++ break; ++ ++ inode = parent_inode; + snapshot = inode_k.k->p.snapshot; ++ redo_bi_depth = true; + +- if (path_is_dup(p, inode.bi_inum, snapshot)) { ++ if (path_is_dup(&path, inode.bi_inum, snapshot)) { + /* XXX print path */ + bch_err(c, "directory structure loop"); + +- darray_for_each(*p, i) ++ darray_for_each(path, i) + pr_err("%llu:%u", i->inum, i->snapshot); + pr_err("%llu:%u", inode.bi_inum, snapshot); + +@@ -2862,12 +2846,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino + ret = reattach_inode(trans, &inode); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); + } +- break; ++ ++ goto out; + } + } ++ ++ if (inode.bi_subvol) ++ min_bi_depth = 0; ++ ++ if (redo_bi_depth) ++ ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth); + out: + fsck_err: + bch2_trans_iter_exit(trans, 
&inode_iter); ++ darray_exit(&path); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +@@ -2879,24 +2871,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino + */ + int bch2_check_directory_structure(struct bch_fs *c) + { +- pathbuf path = { 0, }; +- int ret; +- +- ret = bch2_trans_run(c, ++ int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_intent| + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ +- if (!bkey_is_inode(k.k)) ++ if (!S_ISDIR(bkey_inode_mode(k))) + continue; + + if (bch2_inode_flags(k) & BCH_INODE_unlinked) + continue; + +- check_path(trans, &path, k); ++ check_path_loop(trans, k); + }))); +- darray_exit(&path); + + bch_err_fn(c, ret); + return ret; +@@ -2994,7 +2982,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, + + /* Should never fail, checked by bch2_inode_invalid: */ + struct bch_inode_unpacked u; +- BUG_ON(bch2_inode_unpack(k, &u)); ++ _ret3 = bch2_inode_unpack(k, &u); ++ if (_ret3) ++ break; + + /* + * Backpointer and directory structure checks are sufficient for +@@ -3072,7 +3062,9 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite + if (!bkey_is_inode(k.k)) + return 0; + +- BUG_ON(bch2_inode_unpack(k, &u)); ++ ret = bch2_inode_unpack(k, &u); ++ if (ret) ++ return ret; + + if (S_ISDIR(u.bi_mode)) + return 0; +@@ -3194,3 +3186,223 @@ int bch2_fix_reflink_p(struct bch_fs *c) + bch_err_fn(c, ret); + return ret; + } ++ ++#ifndef NO_BCACHEFS_CHARDEV ++ ++struct fsck_thread { ++ struct thread_with_stdio thr; ++ struct bch_fs *c; ++ struct bch_opts opts; ++}; ++ ++static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) ++{ ++ struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); ++ kfree(thr); ++} ++ ++static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) ++{ ++ struct fsck_thread *thr = container_of(stdio, 
struct fsck_thread, thr); ++ struct bch_fs *c = thr->c; ++ ++ int ret = PTR_ERR_OR_ZERO(c); ++ if (ret) ++ return ret; ++ ++ ret = bch2_fs_start(thr->c); ++ if (ret) ++ goto err; ++ ++ if (test_bit(BCH_FS_errors_fixed, &c->flags)) { ++ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); ++ ret |= 1; ++ } ++ if (test_bit(BCH_FS_error, &c->flags)) { ++ bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); ++ ret |= 4; ++ } ++err: ++ bch2_fs_stop(c); ++ return ret; ++} ++ ++static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { ++ .exit = bch2_fsck_thread_exit, ++ .fn = bch2_fsck_offline_thread_fn, ++}; ++ ++long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) ++{ ++ struct bch_ioctl_fsck_offline arg; ++ struct fsck_thread *thr = NULL; ++ darray_str(devs) = {}; ++ long ret = 0; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ for (size_t i = 0; i < arg.nr_devs; i++) { ++ u64 dev_u64; ++ ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); ++ if (ret) ++ goto err; ++ ++ char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); ++ ret = PTR_ERR_OR_ZERO(dev_str); ++ if (ret) ++ goto err; ++ ++ ret = darray_push(&devs, dev_str); ++ if (ret) { ++ kfree(dev_str); ++ goto err; ++ } ++ } ++ ++ thr = kzalloc(sizeof(*thr), GFP_KERNEL); ++ if (!thr) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ thr->opts = bch2_opts_empty(); ++ ++ if (arg.opts) { ++ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); ++ ret = PTR_ERR_OR_ZERO(optstr) ?: ++ bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); ++ if (!IS_ERR(optstr)) ++ kfree(optstr); ++ ++ if (ret) ++ goto err; ++ } ++ ++ opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); ++ opt_set(thr->opts, read_only, 1); ++ opt_set(thr->opts, 
ratelimit_errors, 0); ++ ++ /* We need request_key() to be called before we punt to kthread: */ ++ opt_set(thr->opts, nostart, true); ++ ++ bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); ++ ++ thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); ++ ++ if (!IS_ERR(thr->c) && ++ thr->c->opts.errors == BCH_ON_ERROR_panic) ++ thr->c->opts.errors = BCH_ON_ERROR_ro; ++ ++ ret = __bch2_run_thread_with_stdio(&thr->thr); ++out: ++ darray_for_each(devs, i) ++ kfree(*i); ++ darray_exit(&devs); ++ return ret; ++err: ++ if (thr) ++ bch2_fsck_thread_exit(&thr->thr); ++ pr_err("ret %s", bch2_err_str(ret)); ++ goto out; ++} ++ ++static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) ++{ ++ struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); ++ struct bch_fs *c = thr->c; ++ ++ c->stdio_filter = current; ++ c->stdio = &thr->thr.stdio; ++ ++ /* ++ * XXX: can we figure out a way to do this without mucking with c->opts? ++ */ ++ unsigned old_fix_errors = c->opts.fix_errors; ++ if (opt_defined(thr->opts, fix_errors)) ++ c->opts.fix_errors = thr->opts.fix_errors; ++ else ++ c->opts.fix_errors = FSCK_FIX_ask; ++ ++ c->opts.fsck = true; ++ set_bit(BCH_FS_fsck_running, &c->flags); ++ ++ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; ++ int ret = bch2_run_online_recovery_passes(c); ++ ++ clear_bit(BCH_FS_fsck_running, &c->flags); ++ bch_err_fn(c, ret); ++ ++ c->stdio = NULL; ++ c->stdio_filter = NULL; ++ c->opts.fix_errors = old_fix_errors; ++ ++ up(&c->online_fsck_mutex); ++ bch2_ro_ref_put(c); ++ return ret; ++} ++ ++static const struct thread_with_stdio_ops bch2_online_fsck_ops = { ++ .exit = bch2_fsck_thread_exit, ++ .fn = bch2_fsck_online_thread_fn, ++}; ++ ++long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) ++{ ++ struct fsck_thread *thr = NULL; ++ long ret = 0; ++ ++ if (arg.flags) ++ return -EINVAL; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (!bch2_ro_ref_tryget(c)) 
++ return -EROFS; ++ ++ if (down_trylock(&c->online_fsck_mutex)) { ++ bch2_ro_ref_put(c); ++ return -EAGAIN; ++ } ++ ++ thr = kzalloc(sizeof(*thr), GFP_KERNEL); ++ if (!thr) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ thr->c = c; ++ thr->opts = bch2_opts_empty(); ++ ++ if (arg.opts) { ++ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); ++ ++ ret = PTR_ERR_OR_ZERO(optstr) ?: ++ bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); ++ if (!IS_ERR(optstr)) ++ kfree(optstr); ++ ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); ++err: ++ if (ret < 0) { ++ bch_err_fn(c, ret); ++ if (thr) ++ bch2_fsck_thread_exit(&thr->thr); ++ up(&c->online_fsck_mutex); ++ bch2_ro_ref_put(c); ++ } ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_CHARDEV */ +diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h +index 1cca31011530..574948278cd4 100644 +--- a/fs/bcachefs/fsck.h ++++ b/fs/bcachefs/fsck.h +@@ -2,6 +2,14 @@ + #ifndef _BCACHEFS_FSCK_H + #define _BCACHEFS_FSCK_H + ++#include "str_hash.h" ++ ++int bch2_fsck_update_backpointers(struct btree_trans *, ++ struct snapshots_seen *, ++ const struct bch_hash_desc, ++ struct bch_hash_info *, ++ struct bkey_i *); ++ + int bch2_check_inodes(struct bch_fs *); + int bch2_check_extents(struct bch_fs *); + int bch2_check_indirect_extents(struct bch_fs *); +@@ -14,4 +22,7 @@ int bch2_check_directory_structure(struct bch_fs *); + int bch2_check_nlinks(struct bch_fs *); + int bch2_fix_reflink_p(struct bch_fs *); + ++long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *); ++long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online); ++ + #endif /* _BCACHEFS_FSCK_H */ +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +index 039cb7a22244..04ec05206f8c 100644 +--- a/fs/bcachefs/inode.c ++++ b/fs/bcachefs/inode.c +@@ -14,6 +14,7 @@ + #include "extent_update.h" + #include "fs.h" + #include "inode.h" ++#include "opts.h" + #include 
"str_hash.h" + #include "snapshot.h" + #include "subvolume.h" +@@ -47,10 +48,10 @@ static int inode_decode_field(const u8 *in, const u8 *end, + u8 *p; + + if (in >= end) +- return -1; ++ return -BCH_ERR_inode_unpack_error; + + if (!*in) +- return -1; ++ return -BCH_ERR_inode_unpack_error; + + /* + * position of highest set bit indicates number of bytes: +@@ -60,7 +61,7 @@ static int inode_decode_field(const u8 *in, const u8 *end, + bytes = byte_table[shift - 1]; + + if (in + bytes > end) +- return -1; ++ return -BCH_ERR_inode_unpack_error; + + p = (u8 *) be + 16 - bytes; + memcpy(p, in, bytes); +@@ -176,7 +177,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, + return ret; \ + \ + if (field_bits > sizeof(unpacked->_name) * 8) \ +- return -1; \ ++ return -BCH_ERR_inode_unpack_error; \ + \ + unpacked->_name = field[1]; \ + in += ret; +@@ -217,7 +218,7 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, + \ + unpacked->_name = v[0]; \ + if (v[1] || v[0] != unpacked->_name) \ +- return -1; \ ++ return -BCH_ERR_inode_unpack_error; \ + fieldnr++; + + BCH_INODE_FIELDS_v2() +@@ -268,7 +269,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, + \ + unpacked->_name = v[0]; \ + if (v[1] || v[0] != unpacked->_name) \ +- return -1; \ ++ return -BCH_ERR_inode_unpack_error; \ + fieldnr++; + + BCH_INODE_FIELDS_v3() +@@ -428,7 +429,7 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) + } + + static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bch_inode_unpacked unpacked; + int ret = 0; +@@ -468,7 +469,7 @@ static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, + } + + int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + int ret = 0; +@@ -478,13 +479,13 
@@ int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, + "invalid str hash type (%llu >= %u)", + INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); + +- ret = __bch2_inode_validate(c, k, flags); ++ ret = __bch2_inode_validate(c, k, from); + fsck_err: + return ret; + } + + int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + int ret = 0; +@@ -494,13 +495,13 @@ int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, + "invalid str hash type (%llu >= %u)", + INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); + +- ret = __bch2_inode_validate(c, k, flags); ++ ret = __bch2_inode_validate(c, k, from); + fsck_err: + return ret; + } + + int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); + int ret = 0; +@@ -518,7 +519,7 @@ int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, + "invalid str hash type (%llu >= %u)", + INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); + +- ret = __bch2_inode_validate(c, k, flags); ++ ret = __bch2_inode_validate(c, k, from); + fsck_err: + return ret; + } +@@ -617,7 +618,7 @@ bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter + struct bkey_s_c k; + int ret = 0; + +- for_each_btree_key_upto_norestart(trans, *iter, btree, ++ for_each_btree_key_max_norestart(trans, *iter, btree, + bpos_successor(pos), + SPOS(pos.inode, pos.offset, U32_MAX), + flags|BTREE_ITER_all_snapshots, k, ret) +@@ -652,7 +653,7 @@ int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) + struct bkey_s_c k; + int ret = 0; + +- for_each_btree_key_upto_norestart(trans, iter, ++ for_each_btree_key_max_norestart(trans, iter, + BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), + BTREE_ITER_all_snapshots| + 
BTREE_ITER_with_updates, k, ret) +@@ -779,7 +780,7 @@ int bch2_trigger_inode(struct btree_trans *trans, + } + + int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + int ret = 0; + +@@ -798,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, + prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); + } + ++int bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k, ++ struct bkey_validate_context from) ++{ ++ int ret = 0; ++ ++ bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors, ++ c, inode_alloc_cursor_inode_bad, ++ "k.p.inode bad"); ++fsck_err: ++ return ret; ++} ++ ++void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k); ++ ++ prt_printf(out, "idx %llu generation %llu", ++ le64_to_cpu(i.v->idx), ++ le64_to_cpu(i.v->gen)); ++} ++ + void bch2_inode_init_early(struct bch_fs *c, + struct bch_inode_unpacked *inode_u) + { +@@ -858,43 +881,78 @@ static inline u32 bkey_generation(struct bkey_s_c k) + } + } + +-/* +- * This just finds an empty slot: +- */ +-int bch2_inode_create(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bch_inode_unpacked *inode_u, +- u32 snapshot, u64 cpu) ++static struct bkey_i_inode_alloc_cursor * ++bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) + { + struct bch_fs *c = trans->c; +- struct bkey_s_c k; +- u64 min, max, start, pos, *hint; +- int ret = 0; +- unsigned bits = (c->opts.inodes_32bit ? 31 : 63); + +- if (c->opts.shard_inode_numbers) { +- bits -= c->inode_shard_bits; ++ u64 cursor_idx = c->opts.inodes_32bit ? 
0 : cpu + 1; + +- min = (cpu << bits); +- max = (cpu << bits) | ~(ULLONG_MAX << bits); ++ cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); + +- min = max_t(u64, min, BLOCKDEV_INODE_MAX); +- hint = c->unused_inode_hints + cpu; ++ struct btree_iter iter; ++ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, ++ BTREE_ID_logged_ops, ++ POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), ++ BTREE_ITER_cached); ++ int ret = bkey_err(k); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ struct bkey_i_inode_alloc_cursor *cursor = ++ k.k->type == KEY_TYPE_inode_alloc_cursor ++ ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) ++ : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); ++ ret = PTR_ERR_OR_ZERO(cursor); ++ if (ret) ++ goto err; ++ ++ if (c->opts.inodes_32bit) { ++ *min = BLOCKDEV_INODE_MAX; ++ *max = INT_MAX; + } else { +- min = BLOCKDEV_INODE_MAX; +- max = ~(ULLONG_MAX << bits); +- hint = c->unused_inode_hints; ++ cursor->v.bits = c->opts.shard_inode_numbers_bits; ++ ++ unsigned bits = 63 - c->opts.shard_inode_numbers_bits; ++ ++ *min = max(cpu << bits, (u64) INT_MAX + 1); ++ *max = (cpu << bits) | ~(ULLONG_MAX << bits); + } + +- start = READ_ONCE(*hint); ++ if (le64_to_cpu(cursor->v.idx) < *min) ++ cursor->v.idx = cpu_to_le64(*min); + +- if (start >= max || start < min) +- start = min; ++ if (le64_to_cpu(cursor->v.idx) >= *max) { ++ cursor->v.idx = cpu_to_le64(*min); ++ le32_add_cpu(&cursor->v.gen, 1); ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret ? 
ERR_PTR(ret) : cursor; ++} ++ ++/* ++ * This just finds an empty slot: ++ */ ++int bch2_inode_create(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode_u, ++ u32 snapshot, u64 cpu) ++{ ++ u64 min, max; ++ struct bkey_i_inode_alloc_cursor *cursor = ++ bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); ++ int ret = PTR_ERR_OR_ZERO(cursor); ++ if (ret) ++ return ret; ++ ++ u64 start = le64_to_cpu(cursor->v.idx); ++ u64 pos = start; + +- pos = start; + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), + BTREE_ITER_all_snapshots| + BTREE_ITER_intent); ++ struct bkey_s_c k; + again: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && +@@ -924,6 +982,7 @@ int bch2_inode_create(struct btree_trans *trans, + /* Retry from start */ + pos = start = min; + bch2_btree_iter_set_pos(iter, POS(0, pos)); ++ le32_add_cpu(&cursor->v.gen, 1); + goto again; + found_slot: + bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); +@@ -934,9 +993,9 @@ int bch2_inode_create(struct btree_trans *trans, + return ret; + } + +- *hint = k.k->p.offset; + inode_u->bi_inum = k.k->p.offset; +- inode_u->bi_generation = bkey_generation(k); ++ inode_u->bi_generation = le64_to_cpu(cursor->v.gen); ++ cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); + return 0; + } + +@@ -966,7 +1025,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, + + bch2_btree_iter_set_snapshot(&iter, snapshot); + +- k = bch2_btree_iter_peek_upto(&iter, end); ++ k = bch2_btree_iter_peek_max(&iter, end); + ret = bkey_err(k); + if (ret) + goto err; +@@ -998,8 +1057,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) + { + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter iter = { NULL }; +- struct bkey_i_inode_generation delete; +- struct bch_inode_unpacked inode_u; + struct bkey_s_c k; + u32 snapshot; + int ret; +@@ -1039,13 +1096,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) + goto err; + } + +- 
bch2_inode_unpack(k, &inode_u); +- +- bkey_inode_generation_init(&delete.k_i); +- delete.k.p = iter.pos; +- delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); +- +- ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: ++ ret = bch2_btree_delete_at(trans, &iter, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc); + err: +@@ -1141,12 +1192,17 @@ struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) + void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, + struct bch_inode_unpacked *inode) + { +-#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); ++#define x(_name, _bits) \ ++ if ((inode)->bi_##_name) { \ ++ opts->_name = inode->bi_##_name - 1; \ ++ opts->_name##_from_inode = true; \ ++ } else { \ ++ opts->_name = c->opts._name; \ ++ } + BCH_INODE_OPTS() + #undef x + +- if (opts->nocow) +- opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; ++ bch2_io_opts_fixups(opts); + } + + int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts) +@@ -1380,7 +1436,8 @@ int bch2_delete_dead_inodes(struct bch_fs *c) + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); + if (ret > 0) { +- bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); ++ bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", ++ k.k->p.offset, k.k->p.snapshot); + + ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); + /* +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +index eab82b5eb897..d2e134528f0e 100644 +--- a/fs/bcachefs/inode.h ++++ b/fs/bcachefs/inode.h +@@ -7,15 +7,14 @@ + #include "opts.h" + #include "snapshot.h" + +-enum bch_validate_flags; + extern const char * const bch2_inode_opts[]; + + int bch2_inode_validate(struct bch_fs *, struct bkey_s_c, +- enum bch_validate_flags); ++ struct 
bkey_validate_context); + int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, +- enum bch_validate_flags); ++ struct bkey_validate_context); + int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, +- enum bch_validate_flags); ++ struct bkey_validate_context); + void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); +@@ -60,7 +59,7 @@ static inline bool bkey_is_inode(const struct bkey *k) + } + + int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, +- enum bch_validate_flags); ++ struct bkey_validate_context); + void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ +@@ -69,6 +68,16 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bk + .min_val_size = 8, \ + }) + ++int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); ++void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \ ++ .key_validate = bch2_inode_alloc_cursor_validate, \ ++ .val_to_text = bch2_inode_alloc_cursor_to_text, \ ++ .min_val_size = 16, \ ++}) ++ + #if 0 + typedef struct { + u64 lo; +@@ -220,6 +229,20 @@ static inline u32 bch2_inode_flags(struct bkey_s_c k) + } + } + ++static inline unsigned bkey_inode_mode(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode); ++ case KEY_TYPE_inode_v2: ++ return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode); ++ case KEY_TYPE_inode_v3: ++ return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v); ++ default: ++ return 0; ++ } ++} ++ + /* i_nlink: */ + + static inline unsigned nlink_bias(umode_t mode) +@@ -249,7 +272,7 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, + 
int bch2_inode_nlink_inc(struct bch_inode_unpacked *); + void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); + +-static inline bool bch2_inode_should_have_bp(struct bch_inode_unpacked *inode) ++static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode) + { + bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; + +@@ -262,6 +285,14 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, + struct bch_inode_unpacked *); + int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); + ++static inline struct bch_extent_rebalance ++bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts io_opts; ++ bch2_inode_opts_get(&io_opts, c, inode); ++ return io_opts_to_rebalance_opts(&io_opts); ++} ++ + int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); + int bch2_delete_dead_inodes(struct bch_fs *); + +diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h +index 7928d0c6954f..b99a5bf1a75e 100644 +--- a/fs/bcachefs/inode_format.h ++++ b/fs/bcachefs/inode_format.h +@@ -101,7 +101,9 @@ struct bch_inode_generation { + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) \ +- x(bi_nocow, 8) ++ x(bi_nocow, 8) \ ++ x(bi_depth, 32) \ ++ x(bi_inodes_32bit, 8) + + /* subset of BCH_INODE_FIELDS */ + #define BCH_INODE_OPTS() \ +@@ -114,7 +116,8 @@ struct bch_inode_generation { + x(foreground_target, 16) \ + x(background_target, 16) \ + x(erasure_code, 16) \ +- x(nocow, 8) ++ x(nocow, 8) \ ++ x(inodes_32bit, 8) + + enum inode_opt_id { + #define x(name, ...) 
\ +@@ -164,4 +167,12 @@ LE64_BITMASK(INODEv3_FIELDS_START, + struct bch_inode_v3, bi_flags, 31, 36); + LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); + ++struct bch_inode_alloc_cursor { ++ struct bch_val v; ++ __u8 bits; ++ __u8 pad; ++ __le32 gen; ++ __le64 idx; ++}; ++ + #endif /* _BCACHEFS_INODE_FORMAT_H */ +diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c +index f283051758d6..5353979117b0 100644 +--- a/fs/bcachefs/io_misc.c ++++ b/fs/bcachefs/io_misc.c +@@ -113,11 +113,13 @@ int bch2_extent_fallocate(struct btree_trans *trans, + err: + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); +- if (should_print_err(ret)) +- bch_err_inum_offset_ratelimited(c, +- inum.inum, +- iter->pos.offset << 9, +- "%s(): error: %s", __func__, bch2_err_str(ret)); ++ if (should_print_err(ret)) { ++ struct printbuf buf = PRINTBUF; ++ bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); ++ prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ } + err_noprint: + bch2_open_buckets_put(c, &open_buckets); + bch2_disk_reservation_put(c, &disk_res); +@@ -164,9 +166,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + bch2_btree_iter_set_snapshot(iter, snapshot); + + /* +- * peek_upto() doesn't have ideal semantics for extents: ++ * peek_max() doesn't have ideal semantics for extents: + */ +- k = bch2_btree_iter_peek_upto(iter, end_pos); ++ k = bch2_btree_iter_peek_max(iter, end_pos); + if (!k.k) + break; + +@@ -426,8 +428,8 @@ case LOGGED_OP_FINSERT_shift_extents: + bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); + + k = insert +- ? bch2_btree_iter_peek_prev(&iter) +- : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); ++ ? 
bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0)) ++ : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); + if ((ret = bkey_err(k))) + goto btree_err; + +@@ -461,7 +463,7 @@ case LOGGED_OP_FINSERT_shift_extents: + + op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset); + +- ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?: ++ ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?: + bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: + bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: + bch2_logged_op_update(trans, &op->k_i) ?: +diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c +index b3b934a87c6d..6276f375dbc9 100644 +--- a/fs/bcachefs/io_read.c ++++ b/fs/bcachefs/io_read.c +@@ -21,6 +21,7 @@ + #include "io_read.h" + #include "io_misc.h" + #include "io_write.h" ++#include "reflink.h" + #include "subvolume.h" + #include "trace.h" + +@@ -79,6 +80,7 @@ struct promote_op { + struct rhash_head hash; + struct bpos pos; + ++ struct work_struct work; + struct data_update write; + struct bio_vec bi_inline_vecs[]; /* must be last */ + }; +@@ -90,16 +92,41 @@ static const struct rhashtable_params bch_promote_params = { + .automatic_shrinking = true, + }; + ++static inline bool have_io_error(struct bch_io_failures *failed) ++{ ++ return failed && failed->nr; ++} ++ ++static bool ptr_being_rewritten(struct bch_read_bio *orig, ++ unsigned dev, ++ unsigned flags) ++{ ++ if (!(flags & BCH_READ_data_update)) ++ return false; ++ ++ struct data_update *u = container_of(orig, struct data_update, rbio); ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); ++ unsigned i = 0; ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (ptr->dev == dev && ++ u->data_opts.rewrite_ptrs & BIT(i)) ++ return true; ++ i++; ++ } ++ ++ return false; ++} ++ + static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, + unsigned flags, + struct bch_io_failures 
*failed) + { +- if (!failed) { ++ if (!have_io_error(failed)) { + BUG_ON(!opts.promote_target); + +- if (!(flags & BCH_READ_MAY_PROMOTE)) ++ if (!(flags & BCH_READ_may_promote)) + return -BCH_ERR_nopromote_may_not; + + if (bch2_bkey_has_target(c, k, opts.promote_target)) +@@ -119,98 +146,94 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, + return 0; + } + +-static void promote_free(struct bch_fs *c, struct promote_op *op) ++static noinline void promote_free(struct bch_read_bio *rbio) + { +- int ret; ++ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); ++ struct bch_fs *c = rbio->c; ++ ++ int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, ++ bch_promote_params); ++ BUG_ON(ret); + + bch2_data_update_exit(&op->write); + +- ret = rhashtable_remove_fast(&c->promote_table, &op->hash, +- bch_promote_params); +- BUG_ON(ret); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + kfree_rcu(op, rcu); + } + + static void promote_done(struct bch_write_op *wop) + { +- struct promote_op *op = +- container_of(wop, struct promote_op, write.op); +- struct bch_fs *c = op->write.op.c; ++ struct promote_op *op = container_of(wop, struct promote_op, write.op); ++ struct bch_fs *c = op->write.rbio.c; + +- bch2_time_stats_update(&c->times[BCH_TIME_data_promote], +- op->start_time); +- promote_free(c, op); ++ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); ++ promote_free(&op->write.rbio); + } + +-static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) ++static void promote_start_work(struct work_struct *work) + { +- struct bio *bio = &op->write.op.wbio.bio; ++ struct promote_op *op = container_of(work, struct promote_op, work); + +- trace_and_count(op->write.op.c, read_promote, &rbio->bio); ++ bch2_data_update_read_done(&op->write); ++} + +- /* we now own pages: */ +- BUG_ON(!rbio->bounce); +- BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); ++static noinline void promote_start(struct 
bch_read_bio *rbio) ++{ ++ struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); + +- memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, +- sizeof(struct bio_vec) * rbio->bio.bi_vcnt); +- swap(bio->bi_vcnt, rbio->bio.bi_vcnt); ++ trace_and_count(op->write.op.c, read_promote, &rbio->bio); + +- bch2_data_update_read_done(&op->write, rbio->pick.crc); ++ INIT_WORK(&op->work, promote_start_work); ++ queue_work(rbio->c->write_ref_wq, &op->work); + } + +-static struct promote_op *__promote_alloc(struct btree_trans *trans, +- enum btree_id btree_id, +- struct bkey_s_c k, +- struct bpos pos, +- struct extent_ptr_decoded *pick, +- struct bch_io_opts opts, +- unsigned sectors, +- struct bch_read_bio **rbio, +- struct bch_io_failures *failed) ++static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ struct bpos pos, ++ struct extent_ptr_decoded *pick, ++ unsigned sectors, ++ unsigned flags, ++ struct bch_read_bio *orig, ++ struct bch_io_failures *failed) + { + struct bch_fs *c = trans->c; +- struct promote_op *op = NULL; +- struct bio *bio; +- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + int ret; + +- if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) +- return ERR_PTR(-BCH_ERR_nopromote_no_writes); ++ struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; + +- op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); +- if (!op) { +- ret = -BCH_ERR_nopromote_enomem; +- goto err; +- } ++ if (!have_io_error(failed)) { ++ update_opts.target = orig->opts.promote_target; ++ update_opts.extra_replicas = 1; ++ update_opts.write_flags |= BCH_WRITE_cached; ++ update_opts.write_flags |= BCH_WRITE_only_specified_devs; ++ } else { ++ update_opts.target = orig->opts.foreground_target; + +- op->start_time = local_clock(); +- op->pos = pos; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ unsigned ptr_bit = 1; ++ bkey_for_each_ptr(ptrs, ptr) { ++ if 
(bch2_dev_io_failures(failed, ptr->dev) && ++ !ptr_being_rewritten(orig, ptr->dev, flags)) ++ update_opts.rewrite_ptrs |= ptr_bit; ++ ptr_bit <<= 1; ++ } + +- /* +- * We don't use the mempool here because extents that aren't +- * checksummed or compressed can be too big for the mempool: +- */ +- *rbio = kzalloc(sizeof(struct bch_read_bio) + +- sizeof(struct bio_vec) * pages, +- GFP_KERNEL); +- if (!*rbio) { +- ret = -BCH_ERR_nopromote_enomem; +- goto err; ++ if (!update_opts.rewrite_ptrs) ++ return NULL; + } + +- rbio_init(&(*rbio)->bio, opts); +- bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); ++ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) ++ return ERR_PTR(-BCH_ERR_nopromote_no_writes); + +- if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { ++ struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); ++ if (!op) { + ret = -BCH_ERR_nopromote_enomem; +- goto err; ++ goto err_put; + } + +- (*rbio)->bounce = true; +- (*rbio)->split = true; +- (*rbio)->kmalloc = true; ++ op->start_time = local_clock(); ++ op->pos = pos; + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) { +@@ -218,64 +241,43 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, + goto err; + } + +- bio = &op->write.op.wbio.bio; +- bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); +- +- struct data_update_opts update_opts = {}; +- +- if (!failed) { +- update_opts.target = opts.promote_target; +- update_opts.extra_replicas = 1; +- update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; +- } else { +- update_opts.target = opts.foreground_target; +- +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- unsigned i = 0; +- bkey_for_each_ptr(ptrs, ptr) { +- if (bch2_dev_io_failures(failed, ptr->dev)) +- update_opts.rewrite_ptrs |= BIT(i); +- i++; +- } +- } +- + ret = bch2_data_update_init(trans, NULL, NULL, &op->write, + writepoint_hashed((unsigned long) current), +- opts, ++ orig->opts, + 
update_opts, + btree_id, k); + /* + * possible errors: -BCH_ERR_nocow_lock_blocked, + * -BCH_ERR_ENOSPC_disk_reservation: + */ +- if (ret) { +- BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, +- bch_promote_params)); +- goto err; +- } ++ if (ret) ++ goto err_remove_hash; + ++ rbio_init_fragment(&op->write.rbio.bio, orig); ++ op->write.rbio.bounce = true; ++ op->write.rbio.promote = true; + op->write.op.end_io = promote_done; + +- return op; ++ return &op->write.rbio; ++err_remove_hash: ++ BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, ++ bch_promote_params)); + err: +- if (*rbio) +- bio_free_pages(&(*rbio)->bio); +- kfree(*rbio); +- *rbio = NULL; ++ bio_free_pages(&op->write.op.wbio.bio); + /* We may have added to the rhashtable and thus need rcu freeing: */ + kfree_rcu(op, rcu); ++err_put: + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + return ERR_PTR(ret); + } + + noinline +-static struct promote_op *promote_alloc(struct btree_trans *trans, ++static struct bch_read_bio *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, +- struct bch_io_opts opts, + unsigned flags, +- struct bch_read_bio **rbio, ++ struct bch_read_bio *orig, + bool *bounce, + bool *read_full, + struct bch_io_failures *failed) +@@ -285,7 +287,7 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, + * if failed != NULL we're not actually doing a promote, we're + * recovering from an io/checksum error + */ +- bool promote_full = (failed || ++ bool promote_full = (have_io_error(failed) || + *read_full || + READ_ONCE(c->opts.promote_whole_extents)); + /* data might have to be decompressed in the write path: */ +@@ -295,18 +297,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bpos pos = promote_full + ? 
bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); +- struct promote_op *promote; + int ret; + +- ret = should_promote(c, k, pos, opts, flags, failed); ++ ret = should_promote(c, k, pos, orig->opts, flags, failed); + if (ret) + goto nopromote; + +- promote = __promote_alloc(trans, +- k.k->type == KEY_TYPE_reflink_v +- ? BTREE_ID_reflink +- : BTREE_ID_extents, +- k, pos, pick, opts, sectors, rbio, failed); ++ struct bch_read_bio *promote = ++ __promote_alloc(trans, ++ k.k->type == KEY_TYPE_reflink_v ++ ? BTREE_ID_reflink ++ : BTREE_ID_extents, ++ k, pos, pick, sectors, flags, orig, failed); ++ if (!promote) ++ return NULL; ++ + ret = PTR_ERR_OR_ZERO(promote); + if (ret) + goto nopromote; +@@ -321,6 +326,20 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, + + /* Read */ + ++static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, ++ struct bch_read_bio *rbio, struct bpos read_pos) ++{ ++ return bch2_inum_offset_err_msg_trans(trans, out, ++ (subvol_inum) { rbio->subvol, read_pos.inode }, ++ read_pos.offset << 9); ++} ++ ++static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, ++ struct bch_read_bio *rbio, struct bpos read_pos) ++{ ++ bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); ++} ++ + #define READ_RETRY_AVOID 1 + #define READ_RETRY 2 + #define READ_ERR 3 +@@ -355,20 +374,20 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) + { + BUG_ON(rbio->bounce && !rbio->split); + +- if (rbio->promote) +- promote_free(rbio->c, rbio->promote); +- rbio->promote = NULL; +- +- if (rbio->bounce) +- bch2_bio_free_pages_pool(rbio->c, &rbio->bio); +- + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + +- if (rbio->kmalloc) +- kfree(rbio); +- else ++ if (unlikely(rbio->promote)) { ++ if (!rbio->bio.bi_status) ++ promote_start(rbio); ++ else ++ promote_free(rbio); ++ } else { ++ if (rbio->bounce) ++ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); 
++ + bio_put(&rbio->bio); ++ } + + rbio = parent; + } +@@ -388,61 +407,47 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) + bio_endio(&rbio->bio); + } + +-static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, ++static noinline void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) + { ++ struct data_update *u = container_of(rbio, struct data_update, rbio); + struct btree_trans *trans = bch2_trans_get(c); +- struct btree_iter iter; +- struct bkey_buf sk; +- struct bkey_s_c k; +- int ret; +- +- flags &= ~BCH_READ_LAST_FRAGMENT; +- flags |= BCH_READ_MUST_CLONE; +- +- bch2_bkey_buf_init(&sk); +- +- bch2_trans_iter_init(trans, &iter, rbio->data_btree, +- rbio->read_pos, BTREE_ITER_slots); + retry: + bch2_trans_begin(trans); +- rbio->bio.bi_status = 0; + +- ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = lockrestart_do(trans, ++ bkey_err(k = bch2_bkey_get_iter(trans, &iter, ++ u->btree_id, bkey_start_pos(&u->k.k->k), ++ 0))); + if (ret) + goto err; + +- bch2_bkey_buf_reassemble(&sk, c, k); +- k = bkey_i_to_s_c(sk.k); +- +- if (!bch2_bkey_matches_ptr(c, k, +- rbio->pick.ptr, +- rbio->data_pos.offset - +- rbio->pick.crc.offset)) { ++ if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; +- goto out; ++ goto err; + } + + ret = __bch2_read_extent(trans, rbio, bvec_iter, +- rbio->read_pos, +- rbio->data_btree, +- k, 0, failed, flags); ++ bkey_start_pos(&u->k.k->k), ++ u->btree_id, ++ bkey_i_to_s_c(u->k.k), ++ 0, failed, flags); ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ + if (ret == READ_RETRY) + goto retry; + if (ret) +- goto err; +-out: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ ++ BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); + bch2_rbio_done(rbio); +- 
bch2_trans_iter_exit(trans, &iter); + bch2_trans_put(trans); +- bch2_bkey_buf_exit(&sk, c); +- return; +-err: +- rbio->bio.bi_status = BLK_STS_IOERR; +- goto out; + } + + static void bch2_rbio_retry(struct work_struct *work) +@@ -463,21 +468,20 @@ static void bch2_rbio_retry(struct work_struct *work) + if (rbio->retry == READ_RETRY_AVOID) + bch2_mark_io_failure(&failed, &rbio->pick); + +- rbio->bio.bi_status = 0; ++ if (!rbio->split) ++ rbio->bio.bi_status = 0; + + rbio = bch2_rbio_free(rbio); + +- flags |= BCH_READ_IN_RETRY; +- flags &= ~BCH_READ_MAY_PROMOTE; ++ flags |= BCH_READ_in_retry; ++ flags &= ~BCH_READ_may_promote; ++ flags &= ~BCH_READ_last_fragment; ++ flags |= BCH_READ_must_clone; + +- if (flags & BCH_READ_NODECODE) { ++ if (flags & BCH_READ_data_update) + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); +- } else { +- flags &= ~BCH_READ_LAST_FRAGMENT; +- flags |= BCH_READ_MUST_CLONE; +- ++ else + __bch2_read(c, rbio, iter, inum, &failed, flags); +- } + } + + static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, +@@ -485,7 +489,7 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + { + rbio->retry = retry; + +- if (rbio->flags & BCH_READ_IN_RETRY) ++ if (rbio->flags & BCH_READ_in_retry) + return; + + if (retry == READ_ERR) { +@@ -499,6 +503,29 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + } + } + ++static void bch2_read_io_err(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bio *bio = &rbio->bio; ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); ++ prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); ++ ++ if (ca) { ++ bch2_io_error(ca, BCH_MEMBER_ERROR_read); ++ bch_err_ratelimited(ca, "%s", buf.buf); ++ } else { ++ bch_err_ratelimited(c, "%s", buf.buf); ++ } ++ ++ printbuf_exit(&buf); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); ++} ++ + static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) + { +@@ -562,6 +589,73 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) + __bch2_rbio_narrow_crcs(trans, rbio)); + } + ++static void bch2_read_csum_err(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bio *src = &rbio->bio; ++ struct bch_extent_crc_unpacked crc = rbio->pick.crc; ++ struct nonce nonce = extent_nonce(rbio->version, crc); ++ struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); ++ prt_str(&buf, "data "); ++ bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); ++ ++ struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; ++ if (ca) { ++ bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); ++ bch_err_ratelimited(ca, "%s", buf.buf); ++ } else { ++ bch_err_ratelimited(c, "%s", buf.buf); ++ } ++ ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ printbuf_exit(&buf); ++} ++ ++static void bch2_read_decompress_err(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); ++ prt_str(&buf, "decompression error"); ++ ++ struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; ++ if (ca) ++ bch_err_ratelimited(ca, "%s", buf.buf); ++ else ++ bch_err_ratelimited(c, "%s", buf.buf); ++ ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ printbuf_exit(&buf); ++} ++ ++static void bch2_read_decrypt_err(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct printbuf buf = PRINTBUF; ++ ++ bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); ++ prt_str(&buf, "decrypt error"); ++ ++ struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; ++ if (ca) ++ bch_err_ratelimited(ca, "%s", buf.buf); ++ else ++ bch_err_ratelimited(c, "%s", buf.buf); ++ ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ printbuf_exit(&buf); ++} ++ + /* Inner part that may run in process context */ + static void __bch2_read_endio(struct work_struct *work) + { +@@ -602,32 +696,40 @@ static void __bch2_read_endio(struct work_struct *work) + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); + +- if (rbio->flags & BCH_READ_NODECODE) +- goto nodecode; ++ if (likely(!(rbio->flags & BCH_READ_data_update))) { ++ /* Adjust crc to point to subset of data we want: */ ++ crc.offset += rbio->offset_into_extent; ++ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + +- /* Adjust crc to point to subset of data we want: */ +- crc.offset += rbio->offset_into_extent; +- crc.live_size = bvec_iter_sectors(rbio->bvec_iter); ++ if (crc_is_compressed(crc)) { ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; + +- if (crc_is_compressed(crc)) { +- ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); +- if (ret) +- goto decrypt_err; ++ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && ++ !c->opts.no_data_io) ++ goto decompression_err; ++ } else { ++ /* don't need to decrypt the entire bio: */ ++ nonce = nonce_add(nonce, crc.offset << 9); ++ bio_advance(src, crc.offset << 9); + +- if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && +- !c->opts.no_data_io) +- goto decompression_err; +- } else { +- /* don't need to decrypt the entire bio: */ +- nonce = nonce_add(nonce, crc.offset << 9); +- bio_advance(src, crc.offset << 9); ++ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); ++ src->bi_iter.bi_size = dst_iter.bi_size; + +- BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); +- src->bi_iter.bi_size = dst_iter.bi_size; ++ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (ret) ++ goto decrypt_err; + +- ret = bch2_encrypt_bio(c, 
crc.csum_type, nonce, src); +- if (ret) +- goto decrypt_err; ++ if (rbio->bounce) { ++ struct bvec_iter src_iter = src->bi_iter; ++ ++ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); ++ } ++ } ++ } else { ++ if (rbio->split) ++ rbio->parent->pick = rbio->pick; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; +@@ -644,12 +746,9 @@ static void __bch2_read_endio(struct work_struct *work) + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; +- +- promote_start(rbio->promote, rbio); +- rbio->promote = NULL; + } +-nodecode: +- if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { ++ ++ if (likely(!(rbio->flags & BCH_READ_in_retry))) { + rbio = bch2_rbio_free(rbio); + bch2_rbio_done(rbio); + } +@@ -662,39 +761,19 @@ static void __bch2_read_endio(struct work_struct *work) + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ +- if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { +- rbio->flags |= BCH_READ_MUST_BOUNCE; ++ if (!rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { ++ rbio->flags |= BCH_READ_must_bounce; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + goto out; + } + +- struct printbuf buf = PRINTBUF; +- buf.atomic++; +- prt_str(&buf, "data "); +- bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); +- +- struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; +- if (ca) { +- bch_err_inum_offset_ratelimited(ca, +- rbio->read_pos.inode, +- rbio->read_pos.offset << 9, +- "data %s", buf.buf); +- bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); +- } +- printbuf_exit(&buf); +- bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + goto out; + decompression_err: +- bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, +- rbio->read_pos.offset << 9, +- "decompression error"); +- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + goto out; + decrypt_err: +- bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, +- rbio->read_pos.offset << 9, +- "decrypt error"); +- bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + goto out; + } + +@@ -715,24 +794,16 @@ static void bch2_read_endio(struct bio *bio) + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + +- if (bio->bi_status) { +- if (ca) { +- bch_err_inum_offset_ratelimited(ca, +- rbio->read_pos.inode, +- rbio->read_pos.offset, +- "data read error: %s", +- bch2_blk_status_to_str(bio->bi_status)); +- bch2_io_error(ca, BCH_MEMBER_ERROR_read); +- } +- bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); ++ if (unlikely(bio->bi_status)) { ++ bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + return; + } + +- if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ++ if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || + (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { + trace_and_count(c, read_reuse_race, &rbio->bio); + +- if (rbio->flags & BCH_READ_RETRY_IF_STALE) ++ if (rbio->flags & BCH_READ_retry_if_stale) + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + else + bch2_rbio_error(rbio, READ_ERR, 
BLK_STS_AGAIN); +@@ -750,45 +821,6 @@ static void bch2_read_endio(struct bio *bio) + bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); + } + +-int __bch2_read_indirect_extent(struct btree_trans *trans, +- unsigned *offset_into_extent, +- struct bkey_buf *orig_k) +-{ +- struct btree_iter iter; +- struct bkey_s_c k; +- u64 reflink_offset; +- int ret; +- +- reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + +- *offset_into_extent; +- +- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, +- POS(0, reflink_offset), 0); +- ret = bkey_err(k); +- if (ret) +- goto err; +- +- if (k.k->type != KEY_TYPE_reflink_v && +- k.k->type != KEY_TYPE_indirect_inline_data) { +- bch_err_inum_offset_ratelimited(trans->c, +- orig_k->k->k.p.inode, +- orig_k->k->k.p.offset << 9, +- "%llu len %u points to nonexistent indirect extent %llu", +- orig_k->k->k.p.offset, +- orig_k->k->k.size, +- reflink_offset); +- bch2_inconsistent_error(trans->c); +- ret = -BCH_ERR_missing_indirect_extent; +- goto err; +- } +- +- *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); +- bch2_bkey_buf_reassemble(orig_k, trans->c, k); +-err: +- bch2_trans_iter_exit(trans, &iter); +- return ret; +-} +- + static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bch_dev *ca, + struct bkey_s_c k, +@@ -845,7 +877,6 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; +- struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); + int pick_ret; +@@ -868,15 +899,24 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + if (!pick_ret) + goto hole; + +- if (pick_ret < 0) { ++ if (unlikely(pick_ret < 0)) { ++ struct printbuf buf = PRINTBUF; ++ bch2_read_err_msg_trans(trans, &buf, orig, read_pos); ++ prt_printf(&buf, "no 
device to read from: %s\n ", bch2_err_str(pick_ret)); ++ bch2_bkey_val_to_text(&buf, c, k); ++ ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ goto err; ++ } ++ ++ if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { + struct printbuf buf = PRINTBUF; ++ bch2_read_err_msg_trans(trans, &buf, orig, read_pos); ++ prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); + bch2_bkey_val_to_text(&buf, c, k); + +- bch_err_inum_offset_ratelimited(c, +- read_pos.inode, read_pos.offset << 9, +- "no device to read from: %s\n %s", +- bch2_err_str(pick_ret), +- buf.buf); ++ bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + goto err; + } +@@ -889,7 +929,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + * retry path, don't check here, it'll be caught in bch2_read_endio() + * and we'll end up in the retry path: + */ +- if ((flags & BCH_READ_IN_RETRY) && ++ if ((flags & BCH_READ_in_retry) && + !pick.ptr.cached && + ca && + unlikely(dev_ptr_stale(ca, &pick.ptr))) { +@@ -903,48 +943,52 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ +- bch2_trans_unlock(trans); ++ if (!(flags & BCH_READ_in_retry)) ++ bch2_trans_unlock(trans); ++ else ++ bch2_trans_unlock_long(trans); ++ ++ if (!(flags & BCH_READ_data_update)) { ++ if (!(flags & BCH_READ_last_fragment) || ++ bio_flagged(&orig->bio, BIO_CHAIN)) ++ flags |= BCH_READ_must_clone; ++ ++ narrow_crcs = !(flags & BCH_READ_in_retry) && ++ bch2_can_narrow_extent_crcs(k, pick.crc); ++ ++ if (narrow_crcs && (flags & BCH_READ_user_mapped)) ++ flags |= BCH_READ_must_bounce; + +- if (flags & BCH_READ_NODECODE) { ++ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); ++ ++ if (crc_is_compressed(pick.crc) || ++ (pick.crc.csum_type != BCH_CSUM_none && ++ (bvec_iter_sectors(iter) != 
pick.crc.uncompressed_size || ++ (bch2_csum_type_is_encryption(pick.crc.csum_type) && ++ (flags & BCH_READ_user_mapped)) || ++ (flags & BCH_READ_must_bounce)))) { ++ read_full = true; ++ bounce = true; ++ } ++ } else { ++ read_full = true; + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ +- if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { ++ struct data_update *u = container_of(orig, struct data_update, rbio); ++ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { + if (ca) + percpu_ref_put(&ca->io_ref); + goto hole; + } + + iter.bi_size = pick.crc.compressed_size << 9; +- goto get_bio; +- } +- +- if (!(flags & BCH_READ_LAST_FRAGMENT) || +- bio_flagged(&orig->bio, BIO_CHAIN)) +- flags |= BCH_READ_MUST_CLONE; +- +- narrow_crcs = !(flags & BCH_READ_IN_RETRY) && +- bch2_can_narrow_extent_crcs(k, pick.crc); +- +- if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) +- flags |= BCH_READ_MUST_BOUNCE; +- +- EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); +- +- if (crc_is_compressed(pick.crc) || +- (pick.crc.csum_type != BCH_CSUM_none && +- (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || +- (bch2_csum_type_is_encryption(pick.crc.csum_type) && +- (flags & BCH_READ_USER_MAPPED)) || +- (flags & BCH_READ_MUST_BOUNCE)))) { +- read_full = true; +- bounce = true; + } + +- if (orig->opts.promote_target)// || failed) +- promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, +- &rbio, &bounce, &read_full, failed); ++ if (orig->opts.promote_target || have_io_error(failed)) ++ rbio = promote_alloc(trans, iter, k, &pick, flags, orig, ++ &bounce, &read_full, failed); + + if (!read_full) { + EBUG_ON(crc_is_compressed(pick.crc)); +@@ -963,7 +1007,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); + } +-get_bio: ++ + if (rbio) { + /* + * promote already 
allocated bounce rbio: +@@ -978,17 +1022,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; + +- rbio = rbio_init(bio_alloc_bioset(NULL, ++ rbio = rbio_init_fragment(bio_alloc_bioset(NULL, + DIV_ROUND_UP(sectors, PAGE_SECTORS), + 0, + GFP_NOFS, + &c->bio_read_split), +- orig->opts); ++ orig); + + bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); + rbio->bounce = true; +- rbio->split = true; +- } else if (flags & BCH_READ_MUST_CLONE) { ++ } else if (flags & BCH_READ_must_clone) { + /* + * Have to clone if there were any splits, due to error + * reporting issues (if a split errored, and retrying didn't +@@ -997,11 +1040,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + * from the whole bio, in which case we don't want to retry and + * lose the error) + */ +- rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, ++ rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + &c->bio_read_split), +- orig->opts); ++ orig); + rbio->bio.bi_iter = iter; +- rbio->split = true; + } else { + rbio = orig; + rbio->bio.bi_iter = iter; +@@ -1010,11 +1052,8 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + +- rbio->c = c; + rbio->submit_time = local_clock(); +- if (rbio->split) +- rbio->parent = orig; +- else ++ if (!rbio->split) + rbio->end_io = orig->bio.bi_end_io; + rbio->bvec_iter = iter; + rbio->offset_into_extent= offset_into_extent; +@@ -1024,20 +1063,14 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + rbio->hole = 0; + rbio->retry = 0; + rbio->context = 0; +- /* XXX: only initialize this if needed */ +- rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; + rbio->subvol = orig->subvol; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; + 
rbio->version = k.k->bversion; +- rbio->promote = promote; + INIT_WORK(&rbio->work, NULL); + +- if (flags & BCH_READ_NODECODE) +- orig->pick = pick; +- + rbio->bio.bi_opf = orig->bio.bi_opf; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; + rbio->bio.bi_end_io = bch2_read_endio; +@@ -1052,21 +1085,25 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ +- if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) ++ if (ca && pick.ptr.cached && !(flags & BCH_READ_data_update)) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); + +- if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { ++ if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { + bio_inc_remaining(&orig->bio); + trace_and_count(c, read_split, &orig->bio); + } + + if (!rbio->pick.idx) { +- if (!rbio->have_ioref) { +- bch_err_inum_offset_ratelimited(c, +- read_pos.inode, +- read_pos.offset << 9, +- "no device to read from"); ++ if (unlikely(!rbio->have_ioref)) { ++ struct printbuf buf = PRINTBUF; ++ bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); ++ prt_printf(&buf, "no device to read from:\n "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } +@@ -1076,10 +1113,10 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + + if (unlikely(c->opts.no_data_io)) { +- if (likely(!(flags & BCH_READ_IN_RETRY))) ++ if (likely(!(flags & BCH_READ_in_retry))) + bio_endio(&rbio->bio); + } else { +- if (likely(!(flags & BCH_READ_IN_RETRY))) ++ if (likely(!(flags & BCH_READ_in_retry))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); +@@ -1097,11 +1134,11 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + 
goto out; + } + +- if (likely(!(flags & BCH_READ_IN_RETRY))) ++ if (likely(!(flags & BCH_READ_in_retry))) + bio_endio(&rbio->bio); + } + out: +- if (likely(!(flags & BCH_READ_IN_RETRY))) { ++ if (likely(!(flags & BCH_READ_in_retry))) { + return 0; + } else { + int ret; +@@ -1124,7 +1161,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + } + + err: +- if (flags & BCH_READ_IN_RETRY) ++ if (flags & BCH_READ_in_retry) + return READ_ERR; + + orig->bio.bi_status = BLK_STS_IOERR; +@@ -1132,16 +1169,16 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + + hole: + /* +- * won't normally happen in the BCH_READ_NODECODE ++ * won't normally happen in the BCH_READ_data_update + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: + */ +- if (flags & BCH_READ_NODECODE) ++ if (flags & BCH_READ_data_update) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); + out_read_done: +- if (flags & BCH_READ_LAST_FRAGMENT) ++ if (flags & BCH_READ_last_fragment) + bch2_rbio_done(orig); + return 0; + } +@@ -1156,7 +1193,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bkey_s_c k; + int ret; + +- BUG_ON(flags & BCH_READ_NODECODE); ++ BUG_ON(flags & BCH_READ_data_update); + + bch2_bkey_buf_init(&sk); + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, +@@ -1164,7 +1201,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + BTREE_ITER_slots); + + while (1) { +- unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + bch2_trans_begin(trans); +@@ -1184,9 +1220,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + if (ret) + goto err; + +- offset_into_extent = iter.pos.offset - ++ s64 offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); +- sectors = k.k->size - offset_into_extent; ++ unsigned sectors = k.k->size - offset_into_extent; + + 
bch2_bkey_buf_reassemble(&sk, c, k); + +@@ -1201,13 +1237,13 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: + */ +- sectors = min(sectors, k.k->size - offset_into_extent); ++ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); + +- bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; ++ unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); + + if (bvec_iter.bi_size == bytes) +- flags |= BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_last_fragment; + + ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, + data_btree, k, +@@ -1215,7 +1251,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + if (ret) + goto err; + +- if (flags & BCH_READ_LAST_FRAGMENT) ++ if (flags & BCH_READ_last_fragment) + break; + + swap(bvec_iter.bi_size, bytes); +@@ -1229,16 +1265,20 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + } + + bch2_trans_iter_exit(trans, &iter); +- bch2_trans_put(trans); +- bch2_bkey_buf_exit(&sk, c); + + if (ret) { +- bch_err_inum_offset_ratelimited(c, inum.inum, +- bvec_iter.bi_sector << 9, +- "read error %i from btree lookup", ret); ++ struct printbuf buf = PRINTBUF; ++ bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); ++ prt_printf(&buf, "read error %i from btree lookup", ret); ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); + } ++ ++ bch2_trans_put(trans); ++ bch2_bkey_buf_exit(&sk, c); + } + + void bch2_fs_io_read_exit(struct bch_fs *c) +diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h +index d9c18bb7d403..f54c9943e34a 100644 +--- a/fs/bcachefs/io_read.h ++++ b/fs/bcachefs/io_read.h +@@ -3,6 +3,7 @@ + #define _BCACHEFS_IO_READ_H + + #include "bkey_buf.h" ++#include "reflink.h" + + struct bch_read_bio { + 
struct bch_fs *c; +@@ -34,9 +35,9 @@ struct bch_read_bio { + u16 flags; + union { + struct { +- u16 bounce:1, ++ u16 promote:1, ++ bounce:1, + split:1, +- kmalloc:1, + have_ioref:1, + narrow_crcs:1, + hole:1, +@@ -46,8 +47,6 @@ struct bch_read_bio { + u16 _state; + }; + +- struct bch_devs_list devs_have; +- + struct extent_ptr_decoded pick; + + /* +@@ -64,8 +63,6 @@ struct bch_read_bio { + struct bpos data_pos; + struct bversion version; + +- struct promote_op *promote; +- + struct bch_io_opts opts; + + struct work_struct work; +@@ -79,32 +76,54 @@ struct bch_devs_mask; + struct cache_promote_op; + struct extent_ptr_decoded; + +-int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, +- struct bkey_buf *); +- + static inline int bch2_read_indirect_extent(struct btree_trans *trans, + enum btree_id *data_btree, +- unsigned *offset_into_extent, +- struct bkey_buf *k) ++ s64 *offset_into_extent, ++ struct bkey_buf *extent) + { +- if (k->k->k.type != KEY_TYPE_reflink_p) ++ if (extent->k->k.type != KEY_TYPE_reflink_p) + return 0; + + *data_btree = BTREE_ID_reflink; +- return __bch2_read_indirect_extent(trans, offset_into_extent, k); ++ struct btree_iter iter; ++ struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, ++ offset_into_extent, ++ bkey_i_to_s_c_reflink_p(extent->k), ++ true, 0); ++ int ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ if (bkey_deleted(k.k)) { ++ bch2_trans_iter_exit(trans, &iter); ++ return -BCH_ERR_missing_indirect_extent; ++ } ++ ++ bch2_bkey_buf_reassemble(extent, trans->c, k); ++ bch2_trans_iter_exit(trans, &iter); ++ return 0; + } + ++#define BCH_READ_FLAGS() \ ++ x(retry_if_stale) \ ++ x(may_promote) \ ++ x(user_mapped) \ ++ x(data_update) \ ++ x(last_fragment) \ ++ x(must_bounce) \ ++ x(must_clone) \ ++ x(in_retry) ++ ++enum __bch_read_flags { ++#define x(n) __BCH_READ_##n, ++ BCH_READ_FLAGS() ++#undef x ++}; ++ + enum bch_read_flags { +- BCH_READ_RETRY_IF_STALE = 1 << 0, +- BCH_READ_MAY_PROMOTE = 1 << 1, +- 
BCH_READ_USER_MAPPED = 1 << 2, +- BCH_READ_NODECODE = 1 << 3, +- BCH_READ_LAST_FRAGMENT = 1 << 4, +- +- /* internal: */ +- BCH_READ_MUST_BOUNCE = 1 << 5, +- BCH_READ_MUST_CLONE = 1 << 6, +- BCH_READ_IN_RETRY = 1 << 7, ++#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), ++ BCH_READ_FLAGS() ++#undef x + }; + + int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, +@@ -131,24 +150,39 @@ static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + + BUG_ON(rbio->_state); + +- rbio->c = c; +- rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, +- BCH_READ_RETRY_IF_STALE| +- BCH_READ_MAY_PROMOTE| +- BCH_READ_USER_MAPPED); ++ BCH_READ_retry_if_stale| ++ BCH_READ_may_promote| ++ BCH_READ_user_mapped); + } + +-static inline struct bch_read_bio *rbio_init(struct bio *bio, +- struct bch_io_opts opts) ++static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, ++ struct bch_read_bio *orig) + { + struct bch_read_bio *rbio = to_rbio(bio); + ++ rbio->c = orig->c; + rbio->_state = 0; +- rbio->promote = NULL; +- rbio->opts = opts; ++ rbio->split = true; ++ rbio->parent = orig; ++ rbio->opts = orig->opts; ++ return rbio; ++} ++ ++static inline struct bch_read_bio *rbio_init(struct bio *bio, ++ struct bch_fs *c, ++ struct bch_io_opts opts, ++ bio_end_io_t end_io) ++{ ++ struct bch_read_bio *rbio = to_rbio(bio); ++ ++ rbio->start_time = local_clock(); ++ rbio->c = c; ++ rbio->_state = 0; ++ rbio->opts = opts; ++ rbio->bio.bi_end_io = end_io; + return rbio; + } + +diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c +index 96720adcfee0..92abc239599d 100644 +--- a/fs/bcachefs/io_write.c ++++ b/fs/bcachefs/io_write.c +@@ -164,7 +164,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, + + bch2_trans_copy_iter(&iter, extent_iter); + +- for_each_btree_key_upto_continue_norestart(iter, ++ for_each_btree_key_max_continue_norestart(iter, + new->k.p, 
BTREE_ITER_slots, old, ret) { + s64 sectors = min(new->k.p.offset, old.k->p.offset) - + max(bkey_start_offset(&new->k), +@@ -216,6 +216,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + SPOS(0, + extent_iter->pos.inode, + extent_iter->snapshot), ++ BTREE_ITER_intent| + BTREE_ITER_cached); + int ret = bkey_err(k); + if (unlikely(ret)) +@@ -369,11 +370,11 @@ static int bch2_write_index_default(struct bch_write_op *op) + bkey_start_pos(&sk.k->k), + BTREE_ITER_slots|BTREE_ITER_intent); + +- ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: ++ ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?: + bch2_extent_update(trans, inum, &iter, sk.k, + &op->res, + op->new_i_size, &op->i_sectors_delta, +- op->flags & BCH_WRITE_CHECK_ENOSPC); ++ op->flags & BCH_WRITE_check_enospc); + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) +@@ -395,6 +396,21 @@ static int bch2_write_index_default(struct bch_write_op *op) + + /* Writes */ + ++static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, ++ u64 offset) ++{ ++ bch2_inum_offset_err_msg(op->c, out, ++ (subvol_inum) { op->subvol, op->pos.inode, }, ++ offset << 9); ++ prt_printf(out, "write error%s: ", ++ op->flags & BCH_WRITE_move ? 
"(internal move)" : ""); ++} ++ ++void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) ++{ ++ __bch2_write_op_error(out, op, op->pos.offset); ++} ++ + void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, + enum bch_data_type type, + const struct bkey_i *k, +@@ -467,7 +483,7 @@ static void bch2_write_done(struct closure *cl) + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + bch2_disk_reservation_put(c, &op->res); + +- if (!(op->flags & BCH_WRITE_MOVE)) ++ if (!(op->flags & BCH_WRITE_move)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + +@@ -513,7 +529,7 @@ static void __bch2_write_index(struct bch_write_op *op) + unsigned dev; + int ret = 0; + +- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { ++ if (unlikely(op->flags & BCH_WRITE_io_error)) { + ret = bch2_write_drop_io_error_ptrs(op); + if (ret) + goto err; +@@ -522,7 +538,7 @@ static void __bch2_write_index(struct bch_write_op *op) + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = keylist_sectors(keys); + +- ret = !(op->flags & BCH_WRITE_MOVE) ++ ret = !(op->flags & BCH_WRITE_move) + ? bch2_write_index_default(op) + : bch2_data_update_index_update(op); + +@@ -531,14 +547,14 @@ static void __bch2_write_index(struct bch_write_op *op) + + op->written += sectors_start - keylist_sectors(keys); + +- if (ret && !bch2_err_matches(ret, EROFS)) { ++ if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); + +- bch_err_inum_offset_ratelimited(c, +- insert->k.p.inode, insert->k.p.offset << 9, +- "%s write error while doing btree update: %s", +- op->flags & BCH_WRITE_MOVE ? 
"move" : "user", +- bch2_err_str(ret)); ++ struct printbuf buf = PRINTBUF; ++ __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); ++ prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); + } + + if (ret) +@@ -554,7 +570,7 @@ static void __bch2_write_index(struct bch_write_op *op) + err: + keys->top = keys->keys; + op->error = ret; +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_submitted; + goto out; + } + +@@ -589,8 +605,8 @@ static CLOSURE_CALLBACK(bch2_write_index) + struct workqueue_struct *wq = index_update_wq(op); + unsigned long flags; + +- if ((op->flags & BCH_WRITE_SUBMITTED) && +- (op->flags & BCH_WRITE_MOVE)) ++ if ((op->flags & BCH_WRITE_submitted) && ++ (op->flags & BCH_WRITE_move)) + bch2_bio_free_pages_pool(op->c, &op->wbio.bio); + + spin_lock_irqsave(&wp->writes_lock, flags); +@@ -621,20 +637,18 @@ void bch2_write_point_do_index_updates(struct work_struct *work) + + while (1) { + spin_lock_irq(&wp->writes_lock); +- op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); +- if (op) +- list_del(&op->wp_list); ++ op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list); + wp_update_state(wp, op != NULL); + spin_unlock_irq(&wp->writes_lock); + + if (!op) + break; + +- op->flags |= BCH_WRITE_IN_WORKER; ++ op->flags |= BCH_WRITE_in_worker; + + __bch2_write_index(op); + +- if (!(op->flags & BCH_WRITE_SUBMITTED)) ++ if (!(op->flags & BCH_WRITE_submitted)) + __bch2_write(op); + else + bch2_write_done(&op->cl); +@@ -658,7 +672,7 @@ static void bch2_write_endio(struct bio *bio) + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + set_bit(wbio->dev, op->failed.d); +- op->flags |= BCH_WRITE_IO_ERROR; ++ op->flags |= BCH_WRITE_io_error; + } + + if (wbio->nocow) { +@@ -705,7 +719,7 @@ static void init_append_extent(struct bch_write_op *op, + bch2_extent_crc_append(&e->k_i, crc); + + 
bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, +- op->flags & BCH_WRITE_CACHED); ++ op->flags & BCH_WRITE_cached); + + bch2_keylist_push(&op->insert_keys); + } +@@ -822,7 +836,7 @@ static enum prep_encoded_ret { + struct bch_fs *c = op->c; + struct bio *bio = &op->wbio.bio; + +- if (!(op->flags & BCH_WRITE_DATA_ENCODED)) ++ if (!(op->flags & BCH_WRITE_data_encoded)) + return PREP_ENCODED_OK; + + BUG_ON(bio_sectors(bio) != op->crc.compressed_size); +@@ -859,7 +873,7 @@ static enum prep_encoded_ret { + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) + return PREP_ENCODED_CHECKSUM_ERR; + +- if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) ++ if (bch2_bio_uncompress_inplace(op, bio)) + return PREP_ENCODED_ERR; + } + +@@ -930,9 +944,9 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + if (ec_buf || + op->compression_opt || + (op->csum_type && +- !(op->flags & BCH_WRITE_PAGES_STABLE)) || ++ !(op->flags & BCH_WRITE_pages_stable)) || + (bch2_csum_type_is_encryption(op->csum_type) && +- !(op->flags & BCH_WRITE_PAGES_OWNED))) { ++ !(op->flags & BCH_WRITE_pages_owned))) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); +@@ -952,7 +966,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + break; + + BUG_ON(op->compression_opt && +- (op->flags & BCH_WRITE_DATA_ENCODED) && ++ (op->flags & BCH_WRITE_data_encoded) && + bch2_csum_type_is_encryption(op->crc.csum_type)); + BUG_ON(op->compression_opt && !bounce); + +@@ -990,7 +1004,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + } + } + +- if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ if ((op->flags & BCH_WRITE_data_encoded) && + !crc_is_compressed(crc) && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { +@@ -1022,7 +1036,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + 
crc.compression_type = compression_type; + crc.nonce = nonce; + } else { +- if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ if ((op->flags & BCH_WRITE_data_encoded) && + bch2_rechecksum_bio(c, src, version, op->crc, + NULL, &op->crc, + src_len >> 9, +@@ -1080,11 +1094,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + *_dst = dst; + return more; + csum_err: +- bch_err_inum_offset_ratelimited(c, +- op->pos.inode, +- op->pos.offset << 9, +- "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)", +- op->flags & BCH_WRITE_MOVE ? "move" : "user"); ++ { ++ struct printbuf buf = PRINTBUF; ++ bch2_write_op_error(&buf, op); ++ prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ } ++ + ret = -EIO; + err: + if (to_wbio(dst)->bounce) +@@ -1165,7 +1182,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) + struct btree_trans *trans = bch2_trans_get(c); + + for_each_keylist_key(&op->insert_keys, orig) { +- int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, ++ int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, + bkey_start_pos(&orig->k), orig->k.p, + BTREE_ITER_intent, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ +@@ -1175,11 +1192,11 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); + +- bch_err_inum_offset_ratelimited(c, +- insert->k.p.inode, insert->k.p.offset << 9, +- "%s write error while doing btree update: %s", +- op->flags & BCH_WRITE_MOVE ? 
"move" : "user", +- bch2_err_str(ret)); ++ struct printbuf buf = PRINTBUF; ++ __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); ++ prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); + } + + if (ret) { +@@ -1193,9 +1210,9 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) + + static void __bch2_nocow_write_done(struct bch_write_op *op) + { +- if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { ++ if (unlikely(op->flags & BCH_WRITE_io_error)) { + op->error = -EIO; +- } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) ++ } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) + bch2_nocow_write_convert_unwritten(op); + } + +@@ -1224,7 +1241,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + struct bucket_to_lock *stale_at; + int stale, ret; + +- if (op->flags & BCH_WRITE_MOVE) ++ if (op->flags & BCH_WRITE_move) + return; + + darray_init(&buckets); +@@ -1282,7 +1299,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + }), GFP_KERNEL|__GFP_NOFAIL); + + if (ptr->unwritten) +- op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; ++ op->flags |= BCH_WRITE_convert_unwritten; + } + + /* Unlock before taking nocow locks, doing IO: */ +@@ -1290,7 +1307,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + bch2_trans_unlock(trans); + + bch2_cut_front(op->pos, op->insert_keys.top); +- if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) ++ if (op->flags & BCH_WRITE_convert_unwritten) + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + + darray_for_each(buckets, i) { +@@ -1315,7 +1332,7 @@ static void bch2_nocow_write(struct bch_write_op *op) + wbio_init(bio)->put_bio = true; + bio->bi_opf = op->wbio.bio.bi_opf; + } else { +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_submitted; + } + + op->pos.offset += bio_sectors(bio); +@@ -1329,7 +1346,7 @@ static void bch2_nocow_write(struct 
bch_write_op *op) + op->insert_keys.top, true); + + bch2_keylist_push(&op->insert_keys); +- if (op->flags & BCH_WRITE_SUBMITTED) ++ if (op->flags & BCH_WRITE_submitted) + break; + bch2_btree_iter_advance(&iter); + } +@@ -1339,23 +1356,25 @@ static void bch2_nocow_write(struct bch_write_op *op) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + ++ bch2_trans_put(trans); ++ darray_exit(&buckets); ++ + if (ret) { +- bch_err_inum_offset_ratelimited(c, +- op->pos.inode, op->pos.offset << 9, +- "%s: btree lookup error %s", __func__, bch2_err_str(ret)); ++ struct printbuf buf = PRINTBUF; ++ bch2_write_op_error(&buf, op); ++ prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); + op->error = ret; +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_submitted; + } + +- bch2_trans_put(trans); +- darray_exit(&buckets); +- + /* fallback to cow write path? */ +- if (!(op->flags & BCH_WRITE_SUBMITTED)) { ++ if (!(op->flags & BCH_WRITE_submitted)) { + closure_sync(&op->cl); + __bch2_nocow_write_done(op); + op->insert_keys.top = op->insert_keys.keys; +- } else if (op->flags & BCH_WRITE_SYNC) { ++ } else if (op->flags & BCH_WRITE_sync) { + closure_sync(&op->cl); + bch2_nocow_write_done(&op->cl.work); + } else { +@@ -1407,7 +1426,7 @@ static void __bch2_write(struct bch_write_op *op) + + if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { + bch2_nocow_write(op); +- if (op->flags & BCH_WRITE_SUBMITTED) ++ if (op->flags & BCH_WRITE_submitted) + goto out_nofs_restore; + } + again: +@@ -1437,7 +1456,7 @@ static void __bch2_write(struct bch_write_op *op) + ret = bch2_trans_run(c, lockrestart_do(trans, + bch2_alloc_sectors_start_trans(trans, + op->target, +- op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), ++ op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), + op->write_point, + &op->devs_have, + op->nr_replicas, +@@ -1460,16 +1479,16 @@ static 
void __bch2_write(struct bch_write_op *op) + bch2_alloc_sectors_done_inlined(c, wp); + err: + if (ret <= 0) { +- op->flags |= BCH_WRITE_SUBMITTED; +- +- if (ret < 0) { +- if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) +- bch_err_inum_offset_ratelimited(c, +- op->pos.inode, +- op->pos.offset << 9, +- "%s(): %s error: %s", __func__, +- op->flags & BCH_WRITE_MOVE ? "move" : "user", +- bch2_err_str(ret)); ++ op->flags |= BCH_WRITE_submitted; ++ ++ if (unlikely(ret < 0)) { ++ if (!(op->flags & BCH_WRITE_alloc_nowait)) { ++ struct printbuf buf = PRINTBUF; ++ bch2_write_op_error(&buf, op); ++ prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); ++ bch_err_ratelimited(c, "%s", buf.buf); ++ printbuf_exit(&buf); ++ } + op->error = ret; + break; + } +@@ -1495,14 +1514,14 @@ static void __bch2_write(struct bch_write_op *op) + * synchronously here if we weren't able to submit all of the IO at + * once, as that signals backpressure to the caller. + */ +- if ((op->flags & BCH_WRITE_SYNC) || +- (!(op->flags & BCH_WRITE_SUBMITTED) && +- !(op->flags & BCH_WRITE_IN_WORKER))) { ++ if ((op->flags & BCH_WRITE_sync) || ++ (!(op->flags & BCH_WRITE_submitted) && ++ !(op->flags & BCH_WRITE_in_worker))) { + bch2_wait_on_allocator(c, &op->cl); + + __bch2_write_index(op); + +- if (!(op->flags & BCH_WRITE_SUBMITTED)) ++ if (!(op->flags & BCH_WRITE_submitted)) + goto again; + bch2_write_done(&op->cl); + } else { +@@ -1523,8 +1542,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) + + memset(&op->failed, 0, sizeof(op->failed)); + +- op->flags |= BCH_WRITE_WROTE_DATA_INLINE; +- op->flags |= BCH_WRITE_SUBMITTED; ++ op->flags |= BCH_WRITE_wrote_data_inline; ++ op->flags |= BCH_WRITE_submitted; + + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); + +@@ -1587,20 +1606,19 @@ CLOSURE_CALLBACK(bch2_write) + BUG_ON(!op->write_point.v); + BUG_ON(bkey_eq(op->pos, POS_MAX)); + +- if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) +- op->flags |= BCH_WRITE_ALLOC_NOWAIT; 
++ if (op->flags & BCH_WRITE_only_specified_devs) ++ op->flags |= BCH_WRITE_alloc_nowait; + + op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); + op->start_time = local_clock(); + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(bio)->put_bio = false; + +- if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { +- bch_err_inum_offset_ratelimited(c, +- op->pos.inode, +- op->pos.offset << 9, +- "%s write error: misaligned write", +- op->flags & BCH_WRITE_MOVE ? "move" : "user"); ++ if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { ++ struct printbuf buf = PRINTBUF; ++ bch2_write_op_error(&buf, op); ++ prt_printf(&buf, "misaligned write"); ++ printbuf_exit(&buf); + op->error = -EIO; + goto err; + } +@@ -1610,7 +1628,7 @@ CLOSURE_CALLBACK(bch2_write) + goto err; + } + +- if (!(op->flags & BCH_WRITE_MOVE) && ++ if (!(op->flags & BCH_WRITE_move) && + !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; +diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h +index 5400ce94ee57..02cca52be0bd 100644 +--- a/fs/bcachefs/io_write.h ++++ b/fs/bcachefs/io_write.h +@@ -20,22 +20,23 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw + void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + enum bch_data_type, const struct bkey_i *, bool); + ++void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); ++ + #define BCH_WRITE_FLAGS() \ +- x(ALLOC_NOWAIT) \ +- x(CACHED) \ +- x(DATA_ENCODED) \ +- x(PAGES_STABLE) \ +- x(PAGES_OWNED) \ +- x(ONLY_SPECIFIED_DEVS) \ +- x(WROTE_DATA_INLINE) \ +- x(FROM_INTERNAL) \ +- x(CHECK_ENOSPC) \ +- x(SYNC) \ +- x(MOVE) \ +- x(IN_WORKER) \ +- x(SUBMITTED) \ +- x(IO_ERROR) \ +- x(CONVERT_UNWRITTEN) ++ x(alloc_nowait) \ ++ x(cached) \ ++ x(data_encoded) \ ++ x(pages_stable) \ ++ x(pages_owned) \ ++ x(only_specified_devs) \ ++ x(wrote_data_inline) \ ++ x(check_enospc) \ 
++ x(sync) \ ++ x(move) \ ++ x(in_worker) \ ++ x(submitted) \ ++ x(io_error) \ ++ x(convert_unwritten) + + enum __bch_write_flags { + #define x(f) __BCH_WRITE_##f, +diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h +index 6e878a6f2f0b..3ef6df9145ef 100644 +--- a/fs/bcachefs/io_write_types.h ++++ b/fs/bcachefs/io_write_types.h +@@ -64,7 +64,7 @@ struct bch_write_op { + struct bpos pos; + struct bversion version; + +- /* For BCH_WRITE_DATA_ENCODED: */ ++ /* For BCH_WRITE_data_encoded: */ + struct bch_extent_crc_unpacked crc; + + struct write_point_specifier write_point; +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +index 2dc0d60c1745..cb2c3722f674 100644 +--- a/fs/bcachefs/journal.c ++++ b/fs/bcachefs/journal.c +@@ -113,11 +113,10 @@ journal_seq_to_buf(struct journal *j, u64 seq) + + static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) + { +- unsigned i; +- +- for (i = 0; i < ARRAY_SIZE(p->list); i++) +- INIT_LIST_HEAD(&p->list[i]); +- INIT_LIST_HEAD(&p->flushed); ++ for (unsigned i = 0; i < ARRAY_SIZE(p->unflushed); i++) ++ INIT_LIST_HEAD(&p->unflushed[i]); ++ for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++) ++ INIT_LIST_HEAD(&p->flushed[i]); + atomic_set(&p->count, count); + p->devs.nr = 0; + } +@@ -217,6 +216,12 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq) + if (__bch2_journal_pin_put(j, seq)) + bch2_journal_reclaim_fast(j); + bch2_journal_do_writes(j); ++ ++ /* ++ * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an ++ * open journal entry ++ */ ++ wake_up(&j->wait); + } + + /* +@@ -251,6 +256,9 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t + if (!__journal_entry_is_open(old)) + return; + ++ if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) ++ old.cur_entry_offset = j->cur_entry_offset_if_blocked; ++ + /* Close out old buffer: */ + buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + +@@ -373,6 +381,10 @@ static int 
journal_entry_open(struct journal *j) + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) + return JOURNAL_ERR_max_in_flight; + ++ if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX, ++ c, "cannot start: journal seq overflow")) ++ return JOURNAL_ERR_insufficient_devices; /* -EROFS */ ++ + BUG_ON(!j->cur_entry_sectors); + + buf->expires = +@@ -588,6 +600,16 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, + : -BCH_ERR_journal_res_get_blocked; + } + ++static unsigned max_dev_latency(struct bch_fs *c) ++{ ++ u64 nsecs = 0; ++ ++ for_each_rw_member(c, ca) ++ nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); ++ ++ return nsecs_to_jiffies(nsecs); ++} ++ + /* + * Essentially the entry function to the journaling code. When bcachefs is doing + * a btree insert, it calls this function to get the current journal write. +@@ -599,17 +621,31 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, + * btree node write locks. + */ + int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, +- unsigned flags) ++ unsigned flags, ++ struct btree_trans *trans) + { + int ret; + + if (closure_wait_event_timeout(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + (flags & JOURNAL_RES_GET_NONBLOCK), +- HZ * 10)) ++ HZ)) + return ret; + ++ if (trans) ++ bch2_trans_unlock_long(trans); ++ + struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ int remaining_wait = max(max_dev_latency(c) * 2, HZ * 10); ++ ++ remaining_wait = max(0, remaining_wait - HZ); ++ ++ if (closure_wait_event_timeout(&j->async_wait, ++ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || ++ (flags & JOURNAL_RES_GET_NONBLOCK), ++ remaining_wait)) ++ return ret; ++ + struct printbuf buf = PRINTBUF; + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "Journal stuck? 
Waited for 10 seconds...\n%s", +@@ -664,7 +700,7 @@ void bch2_journal_entry_res_resize(struct journal *j, + * @seq: seq to flush + * @parent: closure object to wait with + * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, +- * -EIO if @seq will never be flushed ++ * -BCH_ERR_journal_flush_err if @seq will never be flushed + * + * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * necessary +@@ -687,7 +723,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + + /* Recheck under lock: */ + if (j->err_seq && seq >= j->err_seq) { +- ret = -EIO; ++ ret = -BCH_ERR_journal_flush_err; + goto out; + } + +@@ -714,7 +750,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + * livelock: + */ + sched_annotate_sleep(); +- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); + if (ret) + return ret; + +@@ -794,10 +830,11 @@ int bch2_journal_flush(struct journal *j) + } + + /* +- * bch2_journal_noflush_seq - tell the journal not to issue any flushes before ++ * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the ++ * range [start, end) + * @seq + */ +-bool bch2_journal_noflush_seq(struct journal *j, u64 seq) ++bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + u64 unwritten_seq; +@@ -806,15 +843,15 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) + if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) + return false; + +- if (seq <= c->journal.flushed_seq_ondisk) ++ if (c->journal.flushed_seq_ondisk >= start) + return false; + + spin_lock(&j->lock); +- if (seq <= c->journal.flushed_seq_ondisk) ++ if (c->journal.flushed_seq_ondisk >= start) + goto out; + + for (unwritten_seq = journal_last_unwritten_seq(j); +- unwritten_seq < seq; ++ unwritten_seq < end; + unwritten_seq++) { + struct journal_buf *buf = 
journal_seq_to_buf(j, unwritten_seq); + +@@ -831,19 +868,14 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) + return ret; + } + +-int bch2_journal_meta(struct journal *j) ++static int __bch2_journal_meta(struct journal *j) + { +- struct journal_buf *buf; +- struct journal_res res; +- int ret; +- +- memset(&res, 0, sizeof(res)); +- +- ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ struct journal_res res = {}; ++ int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0, NULL); + if (ret) + return ret; + +- buf = j->buf + (res.seq & JOURNAL_BUF_MASK); ++ struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + + if (!buf->flush_time) { +@@ -856,27 +888,70 @@ int bch2_journal_meta(struct journal *j) + return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); + } + ++int bch2_journal_meta(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ ++ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) ++ return -EROFS; ++ ++ int ret = __bch2_journal_meta(j); ++ bch2_write_ref_put(c, BCH_WRITE_REF_journal); ++ return ret; ++} ++ + /* block/unlock the journal: */ + + void bch2_journal_unblock(struct journal *j) + { + spin_lock(&j->lock); +- j->blocked--; ++ if (!--j->blocked && ++ j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && ++ j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { ++ union journal_res_state old, new; ++ ++ old.v = atomic64_read(&j->reservations.counter); ++ do { ++ new.v = old.v; ++ new.cur_entry_offset = j->cur_entry_offset_if_blocked; ++ } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); ++ } + spin_unlock(&j->lock); + + journal_wake(j); + } + ++static void __bch2_journal_block(struct journal *j) ++{ ++ if (!j->blocked++) { ++ union journal_res_state old, new; ++ ++ old.v = atomic64_read(&j->reservations.counter); ++ do { ++ j->cur_entry_offset_if_blocked = old.cur_entry_offset; ++ ++ if 
(j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL) ++ break; ++ ++ new.v = old.v; ++ new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; ++ } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); ++ ++ journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); ++ } ++} ++ + void bch2_journal_block(struct journal *j) + { + spin_lock(&j->lock); +- j->blocked++; ++ __bch2_journal_block(j); + spin_unlock(&j->lock); + + journal_quiesce(j); + } + +-static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) ++static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, ++ u64 max_seq, bool *blocked) + { + struct journal_buf *ret = NULL; + +@@ -893,13 +968,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou + struct journal_buf *buf = j->buf + idx; + + if (buf->need_flush_to_write_buffer) { +- if (seq == journal_cur_seq(j)) +- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); +- + union journal_res_state s; + s.v = atomic64_read_acquire(&j->reservations.counter); + +- ret = journal_state_count(s, idx) ++ unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s); ++ ++ if (open && !*blocked) { ++ __bch2_journal_block(j); ++ *blocked = true; ++ } ++ ++ ret = journal_state_count(s, idx) > open + ? 
ERR_PTR(-EAGAIN) + : buf; + break; +@@ -912,11 +991,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou + return ret; + } + +-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) ++struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, ++ u64 max_seq, bool *blocked) + { + struct journal_buf *ret; ++ *blocked = false; ++ ++ wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, ++ max_seq, blocked)) != ERR_PTR(-EAGAIN)); ++ if (IS_ERR_OR_NULL(ret) && *blocked) ++ bch2_journal_unblock(j); + +- wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); + return ret; + } + +@@ -945,19 +1030,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + } + + for (nr_got = 0; nr_got < nr_want; nr_got++) { +- if (new_fs) { +- bu[nr_got] = bch2_bucket_alloc_new_fs(ca); +- if (bu[nr_got] < 0) { +- ret = -BCH_ERR_ENOSPC_bucket_alloc; +- break; +- } +- } else { +- ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, +- BCH_DATA_journal, cl); +- ret = PTR_ERR_OR_ZERO(ob[nr_got]); +- if (ret) +- break; ++ enum bch_watermark watermark = new_fs ++ ? 
BCH_WATERMARK_btree ++ : BCH_WATERMARK_normal; ++ ++ ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, ++ BCH_DATA_journal, cl); ++ ret = PTR_ERR_OR_ZERO(ob[nr_got]); ++ if (ret) ++ break; + ++ if (!new_fs) { + ret = bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(trans, ca, + ob[nr_got]->bucket, BCH_DATA_journal, +@@ -967,9 +1050,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + bch_err_msg(c, ret, "marking new journal buckets"); + break; + } +- +- bu[nr_got] = ob[nr_got]->bucket; + } ++ ++ bu[nr_got] = ob[nr_got]->bucket; + } + + if (!nr_got) +@@ -1009,8 +1092,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + if (ret) + goto err_unblock; + +- if (!new_fs) +- bch2_write_super(c); ++ bch2_write_super(c); + + /* Commit: */ + if (c) +@@ -1044,9 +1126,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + bu[i], BCH_DATA_free, 0, + BTREE_TRIGGER_transactional)); + err_free: +- if (!new_fs) +- for (i = 0; i < nr_got; i++) +- bch2_open_bucket_put(c, ob[i]); ++ for (i = 0; i < nr_got; i++) ++ bch2_open_bucket_put(c, ob[i]); + + kfree(new_bucket_seq); + kfree(new_buckets); +@@ -1193,7 +1274,7 @@ void bch2_fs_journal_stop(struct journal *j) + * Always write a new journal entry, to make sure the clock hands are up + * to date (and match the superblock) + */ +- bch2_journal_meta(j); ++ __bch2_journal_meta(j); + + journal_quiesce(j); + cancel_delayed_work_sync(&j->write_work); +@@ -1217,6 +1298,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) + bool had_entries = false; + u64 last_seq = cur_seq, nr, seq; + ++ if (cur_seq >= JOURNAL_SEQ_MAX) { ++ bch_err(c, "cannot start: journal seq overflow"); ++ return -EINVAL; ++ } ++ + genradix_for_each_reverse(&c->journal_entries, iter, _i) { + i = *_i; + +@@ -1474,6 +1560,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + case JOURNAL_ENTRY_CLOSED_VAL: + prt_printf(out, "closed\n"); + break; ++ case 
JOURNAL_ENTRY_BLOCKED_VAL: ++ prt_printf(out, "blocked\n"); ++ break; + default: + prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); + break; +@@ -1499,6 +1588,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + printbuf_indent_sub(out, 2); + + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { ++ if (!ca->mi.durability) ++ continue; ++ + struct journal_device *ja = &ca->journal; + + if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) +@@ -1508,6 +1600,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + continue; + + prt_printf(out, "dev %u:\n", ca->dev_idx); ++ prt_printf(out, "durability %u:\n", ca->mi.durability); + printbuf_indent_add(out, 2); + prt_printf(out, "nr\t%u\n", ja->nr); + prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); +@@ -1519,6 +1612,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + printbuf_indent_sub(out, 2); + } + ++ prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); ++ + rcu_read_unlock(); + + --out->atomic; +@@ -1530,54 +1625,3 @@ void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) + __bch2_journal_debug_to_text(out, j); + spin_unlock(&j->lock); + } +- +-bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) +-{ +- struct journal_entry_pin_list *pin_list; +- struct journal_entry_pin *pin; +- +- spin_lock(&j->lock); +- if (!test_bit(JOURNAL_running, &j->flags)) { +- spin_unlock(&j->lock); +- return true; +- } +- +- *seq = max(*seq, j->pin.front); +- +- if (*seq >= j->pin.back) { +- spin_unlock(&j->lock); +- return true; +- } +- +- out->atomic++; +- +- pin_list = journal_seq_pin(j, *seq); +- +- prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); +- printbuf_indent_add(out, 2); +- +- for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) +- list_for_each_entry(pin, 
&pin_list->list[i], list) +- prt_printf(out, "\t%px %ps\n", pin, pin->flush); +- +- if (!list_empty(&pin_list->flushed)) +- prt_printf(out, "flushed:\n"); +- +- list_for_each_entry(pin, &pin_list->flushed, list) +- prt_printf(out, "\t%px %ps\n", pin, pin->flush); +- +- printbuf_indent_sub(out, 2); +- +- --out->atomic; +- spin_unlock(&j->lock); +- +- return false; +-} +- +-void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +-{ +- u64 seq = 0; +- +- while (!bch2_journal_seq_pins_to_text(out, j, &seq)) +- seq++; +-} +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +index 2762be6f9814..dccddd5420ad 100644 +--- a/fs/bcachefs/journal.h ++++ b/fs/bcachefs/journal.h +@@ -285,7 +285,8 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq + spin_lock(&j->lock); + bch2_journal_buf_put_final(j, seq); + spin_unlock(&j->lock); +- } ++ } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) ++ wake_up(&j->wait); + } + + /* +@@ -311,7 +312,7 @@ static inline void bch2_journal_res_put(struct journal *j, + } + + int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, +- unsigned); ++ unsigned, struct btree_trans *); + + /* First bits for BCH_WATERMARK: */ + enum journal_res_flags { +@@ -367,7 +368,8 @@ static inline int journal_res_get_fast(struct journal *j, + } + + static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, +- unsigned u64s, unsigned flags) ++ unsigned u64s, unsigned flags, ++ struct btree_trans *trans) + { + int ret; + +@@ -379,7 +381,7 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re + if (journal_res_get_fast(j, res, flags)) + goto out; + +- ret = bch2_journal_res_get_slowpath(j, res, flags); ++ ret = bch2_journal_res_get_slowpath(j, res, flags, trans); + if (ret) + return ret; + out: +@@ -403,7 +405,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *); + + int bch2_journal_flush_seq(struct 
journal *, u64, unsigned); + int bch2_journal_flush(struct journal *); +-bool bch2_journal_noflush_seq(struct journal *, u64); ++bool bch2_journal_noflush_seq(struct journal *, u64, u64); + int bch2_journal_meta(struct journal *); + + void bch2_journal_halt(struct journal *); +@@ -411,7 +413,7 @@ void bch2_journal_halt(struct journal *); + static inline int bch2_journal_error(struct journal *j) + { + return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL +- ? -EIO : 0; ++ ? -BCH_ERR_journal_shutdown : 0; + } + + struct bch_dev; +@@ -424,12 +426,10 @@ static inline void bch2_journal_set_replay_done(struct journal *j) + + void bch2_journal_unblock(struct journal *); + void bch2_journal_block(struct journal *); +-struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); ++struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); + + void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); + void bch2_journal_debug_to_text(struct printbuf *, struct journal *); +-void bch2_journal_pins_to_text(struct printbuf *, struct journal *); +-bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); + + int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, + unsigned nr); +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +index fb35dd336331..11c39e0c34f4 100644 +--- a/fs/bcachefs/journal_io.c ++++ b/fs/bcachefs/journal_io.c +@@ -17,6 +17,9 @@ + #include "sb-clean.h" + #include "trace.h" + ++#include ++#include ++ + void bch2_journal_pos_from_member_info_set(struct bch_fs *c) + { + lockdep_assert_held(&c->sb_lock); +@@ -299,7 +302,7 @@ static void journal_entry_err_msg(struct printbuf *out, + journal_entry_err_msg(&_buf, version, jset, entry); \ + prt_printf(&_buf, msg, ##__VA_ARGS__); \ + \ +- switch (flags & BCH_VALIDATE_write) { \ ++ switch (from.flags & BCH_VALIDATE_write) { \ + case READ: \ + mustfix_fsck_err(c, _err, "%s", 
_buf.buf); \ + break; \ +@@ -325,11 +328,11 @@ static void journal_entry_err_msg(struct printbuf *out, + static int journal_validate_key(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, +- unsigned level, enum btree_id btree_id, + struct bkey_i *k, +- unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from, ++ unsigned version, int big_endian) + { ++ enum bch_validate_flags flags = from.flags; + int write = flags & BCH_VALIDATE_write; + void *next = vstruct_next(entry); + int ret = 0; +@@ -364,11 +367,10 @@ static int journal_validate_key(struct bch_fs *c, + } + + if (!write) +- bch2_bkey_compat(level, btree_id, version, big_endian, ++ bch2_bkey_compat(from.level, from.btree, version, big_endian, + write, NULL, bkey_to_packed(k)); + +- ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), +- __btree_node_type(level, btree_id), write); ++ ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); + if (ret == -BCH_ERR_fsck_delete_bkey) { + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); +@@ -379,7 +381,7 @@ static int journal_validate_key(struct bch_fs *c, + goto fsck_err; + + if (write) +- bch2_bkey_compat(level, btree_id, version, big_endian, ++ bch2_bkey_compat(from.level, from.btree, version, big_endian, + write, NULL, bkey_to_packed(k)); + fsck_err: + return ret; +@@ -389,16 +391,15 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_i *k = entry->start; + ++ from.level = entry->level; ++ from.btree = entry->btree_id; ++ + while (k != vstruct_last(entry)) { +- int ret = journal_validate_key(c, jset, entry, +- entry->level, +- entry->btree_id, +- k, version, big_endian, +- flags|BCH_VALIDATE_journal); ++ int ret = journal_validate_key(c, jset, entry, k, from, version, 
big_endian); + if (ret == FSCK_DELETED_KEY) + continue; + else if (ret) +@@ -421,7 +422,8 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs + bch2_prt_jset_entry_type(out, entry->type); + prt_str(out, ": "); + } +- prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); ++ bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); ++ prt_char(out, ' '); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); + first = false; + } +@@ -431,11 +433,15 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_i *k = entry->start; + int ret = 0; + ++ from.root = true; ++ from.level = entry->level + 1; ++ from.btree = entry->btree_id; ++ + if (journal_entry_err_on(!entry->u64s || + le16_to_cpu(entry->u64s) != k->k.u64s, + c, version, jset, entry, +@@ -452,8 +458,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, + return 0; + } + +- ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, +- version, big_endian, flags); ++ ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); + if (ret == FSCK_DELETED_KEY) + ret = 0; + fsck_err: +@@ -470,7 +475,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + /* obsolete, don't care: */ + return 0; +@@ -485,7 +490,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + int ret = 0; + +@@ -512,7 +517,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, + struct jset *jset, + struct 
jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct jset_entry_blacklist_v2 *bl_entry; + int ret = 0; +@@ -554,7 +559,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); +@@ -588,7 +593,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); +@@ -632,7 +637,7 @@ static int journal_entry_clock_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); +@@ -665,14 +670,14 @@ static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + +- prt_printf(out, "%s=%llu", clock->rw ? 
"write" : "read", le64_to_cpu(clock->time)); ++ prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time)); + } + + static int journal_entry_dev_usage_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); +@@ -729,7 +734,7 @@ static int journal_entry_log_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + return 0; + } +@@ -738,19 +743,19 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) + { + struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); +- unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); + +- prt_printf(out, "%.*s", bytes, l->d); ++ prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); + } + + static int journal_entry_overwrite_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { ++ from.flags = 0; + return journal_entry_btree_keys_validate(c, jset, entry, +- version, big_endian, READ); ++ version, big_endian, from); + } + + static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, +@@ -763,10 +768,10 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + return journal_entry_btree_keys_validate(c, jset, entry, +- version, big_endian, READ); ++ version, big_endian, from); + } + + static void 
journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, +@@ -779,7 +784,7 @@ static int journal_entry_datetime_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + unsigned bytes = vstruct_bytes(entry); + unsigned expected = 16; +@@ -809,7 +814,7 @@ static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs * + struct jset_entry_ops { + int (*validate)(struct bch_fs *, struct jset *, + struct jset_entry *, unsigned, int, +- enum bch_validate_flags); ++ struct bkey_validate_context); + void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); + }; + +@@ -827,11 +832,11 @@ int bch2_journal_entry_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + return entry->type < BCH_JSET_ENTRY_NR + ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, entry, +- version, big_endian, flags) ++ version, big_endian, from) + : 0; + } + +@@ -849,10 +854,18 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, + static int jset_validate_entries(struct bch_fs *c, struct jset *jset, + enum bch_validate_flags flags) + { ++ struct bkey_validate_context from = { ++ .flags = flags, ++ .from = BKEY_VALIDATE_journal, ++ .journal_seq = le64_to_cpu(jset->seq), ++ }; ++ + unsigned version = le32_to_cpu(jset->version); + int ret = 0; + + vstruct_for_each(jset, entry) { ++ from.journal_offset = (u64 *) entry - jset->_data; ++ + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), + c, version, jset, entry, + journal_entry_past_jset_end, +@@ -861,8 +874,8 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, + break; + } + +- ret = bch2_journal_entry_validate(c, jset, entry, +- version, JSET_BIG_ENDIAN(jset), flags); ++ ret = bch2_journal_entry_validate(c, jset, entry, version, ++ JSET_BIG_ENDIAN(jset), from); + if (ret) + break; + } +@@ -875,13 +888,17 @@ static int jset_validate(struct bch_fs *c, + struct jset *jset, u64 sector, + enum bch_validate_flags flags) + { +- unsigned version; ++ struct bkey_validate_context from = { ++ .flags = flags, ++ .from = BKEY_VALIDATE_journal, ++ .journal_seq = le64_to_cpu(jset->seq), ++ }; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + +- version = le32_to_cpu(jset->version); ++ unsigned version = le32_to_cpu(jset->version); + if (journal_entry_err_on(!bch2_version_compatible(version), + c, version, jset, NULL, + jset_unsupported_version, +@@ -926,15 +943,16 @@ static int jset_validate_early(struct bch_fs *c, + unsigned bucket_sectors_left, + unsigned sectors_read) + { +- size_t bytes = vstruct_bytes(jset); +- unsigned version; +- enum bch_validate_flags flags = BCH_VALIDATE_journal; ++ struct bkey_validate_context from = { ++ .from = 
BKEY_VALIDATE_journal, ++ .journal_seq = le64_to_cpu(jset->seq), ++ }; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + +- version = le32_to_cpu(jset->version); ++ unsigned version = le32_to_cpu(jset->version); + if (journal_entry_err_on(!bch2_version_compatible(version), + c, version, jset, NULL, + jset_unsupported_version, +@@ -947,6 +965,7 @@ static int jset_validate_early(struct bch_fs *c, + return -EINVAL; + } + ++ size_t bytes = vstruct_bytes(jset); + if (bytes > (sectors_read << 9) && + sectors_read < bucket_sectors_left) + return JOURNAL_ENTRY_REREAD; +@@ -1231,8 +1250,6 @@ int bch2_journal_read(struct bch_fs *c, + * those entries will be blacklisted: + */ + genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { +- enum bch_validate_flags flags = BCH_VALIDATE_journal; +- + i = *_i; + + if (journal_replay_ignore(i)) +@@ -1252,6 +1269,10 @@ int bch2_journal_read(struct bch_fs *c, + continue; + } + ++ struct bkey_validate_context from = { ++ .from = BKEY_VALIDATE_journal, ++ .journal_seq = le64_to_cpu(i->j.seq), ++ }; + if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), + c, le32_to_cpu(i->j.version), &i->j, NULL, + jset_last_seq_newer_than_seq, +@@ -1411,27 +1432,50 @@ int bch2_journal_read(struct bch_fs *c, + + /* journal write: */ + ++static void journal_advance_devs_to_next_bucket(struct journal *j, ++ struct dev_alloc_list *devs, ++ unsigned sectors, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ ++ darray_for_each(*devs, i) { ++ struct bch_dev *ca = rcu_dereference(c->devs[*i]); ++ if (!ca) ++ continue; ++ ++ struct journal_device *ja = &ca->journal; ++ ++ if (sectors > ja->sectors_free && ++ sectors <= ca->mi.bucket_size && ++ bch2_journal_dev_buckets_available(j, ja, ++ journal_space_discarded)) { ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ja->sectors_free = ca->mi.bucket_size; ++ ++ /* ++ * ja->bucket_seq[ja->cur_idx] must always 
have ++ * something sensible: ++ */ ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); ++ } ++ } ++} ++ + static void __journal_write_alloc(struct journal *j, + struct journal_buf *w, +- struct dev_alloc_list *devs_sorted, ++ struct dev_alloc_list *devs, + unsigned sectors, + unsigned *replicas, + unsigned replicas_want) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); +- struct journal_device *ja; +- struct bch_dev *ca; +- unsigned i; + +- if (*replicas >= replicas_want) +- return; +- +- for (i = 0; i < devs_sorted->nr; i++) { +- ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); ++ darray_for_each(*devs, i) { ++ struct bch_dev *ca = rcu_dereference(c->devs[*i]); + if (!ca) + continue; + +- ja = &ca->journal; ++ struct journal_device *ja = &ca->journal; + + /* + * Check that we can use this device, and aren't already using +@@ -1477,65 +1521,53 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) + { + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_devs_mask devs; +- struct journal_device *ja; +- struct bch_dev *ca; + struct dev_alloc_list devs_sorted; + unsigned sectors = vstruct_sectors(w->data, c->block_bits); + unsigned target = c->opts.metadata_target ?: + c->opts.foreground_target; +- unsigned i, replicas = 0, replicas_want = ++ unsigned replicas = 0, replicas_want = + READ_ONCE(c->opts.metadata_replicas); + unsigned replicas_need = min_t(unsigned, replicas_want, + READ_ONCE(c->opts.metadata_replicas_required)); ++ bool advance_done = false; + + rcu_read_lock(); +-retry: +- devs = target_rw_devs(c, BCH_DATA_journal, target); + +- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); ++ /* We might run more than once if we have to stop and do discards: */ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); ++ bkey_for_each_ptr(ptrs, p) { ++ struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); ++ if (ca) ++ replicas += ca->mi.durability; ++ } + +- 
__journal_write_alloc(j, w, &devs_sorted, +- sectors, &replicas, replicas_want); ++retry_target: ++ devs = target_rw_devs(c, BCH_DATA_journal, target); ++ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); ++retry_alloc: ++ __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); + +- if (replicas >= replicas_want) ++ if (likely(replicas >= replicas_want)) + goto done; + +- for (i = 0; i < devs_sorted.nr; i++) { +- ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); +- if (!ca) +- continue; +- +- ja = &ca->journal; +- +- if (sectors > ja->sectors_free && +- sectors <= ca->mi.bucket_size && +- bch2_journal_dev_buckets_available(j, ja, +- journal_space_discarded)) { +- ja->cur_idx = (ja->cur_idx + 1) % ja->nr; +- ja->sectors_free = ca->mi.bucket_size; +- +- /* +- * ja->bucket_seq[ja->cur_idx] must always have +- * something sensible: +- */ +- ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); +- } ++ if (!advance_done) { ++ journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); ++ advance_done = true; ++ goto retry_alloc; + } + +- __journal_write_alloc(j, w, &devs_sorted, +- sectors, &replicas, replicas_want); +- + if (replicas < replicas_want && target) { + /* Retry from all devices: */ + target = 0; +- goto retry; ++ advance_done = false; ++ goto retry_target; + } + done: + rcu_read_unlock(); + + BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); + +- return replicas >= replicas_need ? 0 : -EROFS; ++ return replicas >= replicas_need ? 
0 : -BCH_ERR_insufficient_journal_devices; + } + + static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) +@@ -1732,6 +1764,7 @@ static CLOSURE_CALLBACK(journal_write_submit) + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; ++ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0); + + BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); + ca->prev_journal_sector = bio->bi_iter.bi_sector; +@@ -2023,19 +2056,21 @@ CLOSURE_CALLBACK(bch2_journal_write) + bch2_journal_do_discards(j); + } + +- if (ret) { ++ if (ret && !bch2_journal_error(j)) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + +- prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), ++ prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), + le64_to_cpu(w->data->seq), ++ vstruct_sectors(w->data, c->block_bits), + bch2_err_str(ret)); + __bch2_journal_debug_to_text(&buf, j); + spin_unlock(&j->lock); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +- goto err; + } ++ if (ret) ++ goto err; + + /* + * write is allocated, no longer need to account for it in +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +index 2ca9cde30ea8..12b39fcb4424 100644 +--- a/fs/bcachefs/journal_io.h ++++ b/fs/bcachefs/journal_io.h +@@ -63,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, + + int bch2_journal_entry_validate(struct bch_fs *, struct jset *, + struct jset_entry *, unsigned, int, +- enum bch_validate_flags); ++ struct bkey_validate_context); + void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, + struct jset_entry *); + +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +index ace291f175dd..6a9cefb635d6 100644 +--- a/fs/bcachefs/journal_reclaim.c ++++ b/fs/bcachefs/journal_reclaim.c +@@ -38,6 +38,9 @@ unsigned bch2_journal_dev_buckets_available(struct 
journal *j, + struct journal_device *ja, + enum journal_space_from from) + { ++ if (!ja->nr) ++ return 0; ++ + unsigned available = (journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr; + +@@ -137,14 +140,18 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne + struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned pos, nr_devs = 0; + struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; ++ unsigned min_bucket_size = U32_MAX; + + BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); + + rcu_read_lock(); + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { +- if (!ca->journal.nr) ++ if (!ca->journal.nr || ++ !ca->mi.durability) + continue; + ++ min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); ++ + space = journal_dev_space_available(j, ca, from); + if (!space.next_entry) + continue; +@@ -164,7 +171,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne + * We sorted largest to smallest, and we want the smallest out of the + * @nr_devs_want largest devices: + */ +- return dev_space[nr_devs_want - 1]; ++ space = dev_space[nr_devs_want - 1]; ++ space.next_entry = min(space.next_entry, min_bucket_size); ++ return space; + } + + void bch2_journal_space_available(struct journal *j) +@@ -318,8 +327,10 @@ void bch2_journal_reclaim_fast(struct journal *j) + popped = true; + } + +- if (popped) ++ if (popped) { + bch2_journal_space_available(j); ++ __closure_wake_up(&j->reclaim_flush_wait); ++ } + } + + bool __bch2_journal_pin_put(struct journal *j, u64 seq) +@@ -353,6 +364,9 @@ static inline bool __journal_pin_drop(struct journal *j, + pin->seq = 0; + list_del_init(&pin->list); + ++ if (j->reclaim_flush_wait.list.first) ++ __closure_wake_up(&j->reclaim_flush_wait); ++ + /* + * Unpinning a journal entry may make journal_next_bucket() succeed, if + * writing a new last_seq will now make another bucket available: +@@ -374,11 +388,11 @@ static enum 
journal_pin_type journal_pin_type(journal_pin_flush_fn fn) + { + if (fn == bch2_btree_node_flush0 || + fn == bch2_btree_node_flush1) +- return JOURNAL_PIN_btree; ++ return JOURNAL_PIN_TYPE_btree; + else if (fn == bch2_btree_key_cache_journal_flush) +- return JOURNAL_PIN_key_cache; ++ return JOURNAL_PIN_TYPE_key_cache; + else +- return JOURNAL_PIN_other; ++ return JOURNAL_PIN_TYPE_other; + } + + static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, +@@ -397,7 +411,12 @@ static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq, + atomic_inc(&pin_list->count); + pin->seq = seq; + pin->flush = flush_fn; +- list_add(&pin->list, &pin_list->list[type]); ++ ++ if (list_empty(&pin_list->unflushed[type]) && ++ j->reclaim_flush_wait.list.first) ++ __closure_wake_up(&j->reclaim_flush_wait); ++ ++ list_add(&pin->list, &pin_list->unflushed[type]); + } + + void bch2_journal_pin_copy(struct journal *j, +@@ -490,16 +509,15 @@ journal_get_next_pin(struct journal *j, + { + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret = NULL; +- unsigned i; + + fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { + if (*seq > seq_to_flush && !allowed_above_seq) + break; + +- for (i = 0; i < JOURNAL_PIN_NR; i++) +- if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || +- ((1U << i) & allowed_above_seq)) { +- ret = list_first_entry_or_null(&pin_list->list[i], ++ for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) ++ if (((BIT(i) & allowed_below_seq) && *seq <= seq_to_flush) || ++ (BIT(i) & allowed_above_seq)) { ++ ret = list_first_entry_or_null(&pin_list->unflushed[i], + struct journal_entry_pin, list); + if (ret) + return ret; +@@ -535,8 +553,8 @@ static size_t journal_flush_pins(struct journal *j, + } + + if (min_key_cache) { +- allowed_above |= 1U << JOURNAL_PIN_key_cache; +- allowed_below |= 1U << JOURNAL_PIN_key_cache; ++ allowed_above |= BIT(JOURNAL_PIN_TYPE_key_cache); ++ allowed_below |= BIT(JOURNAL_PIN_TYPE_key_cache); + 
} + + cond_resched(); +@@ -544,7 +562,9 @@ static size_t journal_flush_pins(struct journal *j, + j->last_flushed = jiffies; + + spin_lock(&j->lock); +- pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); ++ pin = journal_get_next_pin(j, seq_to_flush, ++ allowed_below, ++ allowed_above, &seq); + if (pin) { + BUG_ON(j->flush_in_progress); + j->flush_in_progress = pin; +@@ -567,7 +587,7 @@ static size_t journal_flush_pins(struct journal *j, + spin_lock(&j->lock); + /* Pin might have been dropped or rearmed: */ + if (likely(!err && !j->flush_in_progress_dropped)) +- list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); ++ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]); + j->flush_in_progress = NULL; + j->flush_in_progress_dropped = false; + spin_unlock(&j->lock); +@@ -758,10 +778,12 @@ static int bch2_journal_reclaim_thread(void *arg) + journal_empty = fifo_empty(&j->pin); + spin_unlock(&j->lock); + ++ long timeout = j->next_reclaim - jiffies; ++ + if (journal_empty) + schedule(); +- else if (time_after(j->next_reclaim, jiffies)) +- schedule_timeout(j->next_reclaim - jiffies); ++ else if (timeout > 0) ++ schedule_timeout(timeout); + else + break; + } +@@ -805,10 +827,41 @@ int bch2_journal_reclaim_start(struct journal *j) + return 0; + } + ++static bool journal_pins_still_flushing(struct journal *j, u64 seq_to_flush, ++ unsigned types) ++{ ++ struct journal_entry_pin_list *pin_list; ++ u64 seq; ++ ++ spin_lock(&j->lock); ++ fifo_for_each_entry_ptr(pin_list, &j->pin, seq) { ++ if (seq > seq_to_flush) ++ break; ++ ++ for (unsigned i = 0; i < JOURNAL_PIN_TYPE_NR; i++) ++ if ((BIT(i) & types) && ++ (!list_empty(&pin_list->unflushed[i]) || ++ !list_empty(&pin_list->flushed[i]))) { ++ spin_unlock(&j->lock); ++ return true; ++ } ++ } ++ spin_unlock(&j->lock); ++ ++ return false; ++} ++ ++static bool journal_flush_pins_or_still_flushing(struct journal *j, u64 seq_to_flush, ++ unsigned types) ++{ ++ return 
journal_flush_pins(j, seq_to_flush, types, 0, 0, 0) || ++ journal_pins_still_flushing(j, seq_to_flush, types); ++} ++ + static int journal_flush_done(struct journal *j, u64 seq_to_flush, + bool *did_work) + { +- int ret; ++ int ret = 0; + + ret = bch2_journal_error(j); + if (ret) +@@ -816,12 +869,18 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, + + mutex_lock(&j->reclaim_lock); + +- if (journal_flush_pins(j, seq_to_flush, +- (1U << JOURNAL_PIN_key_cache)| +- (1U << JOURNAL_PIN_other), 0, 0, 0) || +- journal_flush_pins(j, seq_to_flush, +- (1U << JOURNAL_PIN_btree), 0, 0, 0)) ++ if (journal_flush_pins_or_still_flushing(j, seq_to_flush, ++ BIT(JOURNAL_PIN_TYPE_key_cache)| ++ BIT(JOURNAL_PIN_TYPE_other))) { + *did_work = true; ++ goto unlock; ++ } ++ ++ if (journal_flush_pins_or_still_flushing(j, seq_to_flush, ++ BIT(JOURNAL_PIN_TYPE_btree))) { ++ *did_work = true; ++ goto unlock; ++ } + + if (seq_to_flush > journal_cur_seq(j)) + bch2_journal_entry_close(j); +@@ -836,6 +895,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, + !fifo_used(&j->pin); + + spin_unlock(&j->lock); ++unlock: + mutex_unlock(&j->reclaim_lock); + + return ret; +@@ -849,7 +909,7 @@ bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) + if (!test_bit(JOURNAL_running, &j->flags)) + return false; + +- closure_wait_event(&j->async_wait, ++ closure_wait_event(&j->reclaim_flush_wait, + journal_flush_done(j, seq_to_flush, &did_work)); + + return did_work; +@@ -915,3 +975,54 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) + + return ret; + } ++ ++bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) ++{ ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *pin; ++ ++ spin_lock(&j->lock); ++ if (!test_bit(JOURNAL_running, &j->flags)) { ++ spin_unlock(&j->lock); ++ return true; ++ } ++ ++ *seq = max(*seq, j->pin.front); ++ ++ if (*seq >= j->pin.back) { ++ spin_unlock(&j->lock); 
++ return true; ++ } ++ ++ out->atomic++; ++ ++ pin_list = journal_seq_pin(j, *seq); ++ ++ prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count)); ++ printbuf_indent_add(out, 2); ++ ++ prt_printf(out, "unflushed:\n"); ++ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++) ++ list_for_each_entry(pin, &pin_list->unflushed[i], list) ++ prt_printf(out, "\t%px %ps\n", pin, pin->flush); ++ ++ prt_printf(out, "flushed:\n"); ++ for (unsigned i = 0; i < ARRAY_SIZE(pin_list->flushed); i++) ++ list_for_each_entry(pin, &pin_list->flushed[i], list) ++ prt_printf(out, "\t%px %ps\n", pin, pin->flush); ++ ++ printbuf_indent_sub(out, 2); ++ ++ --out->atomic; ++ spin_unlock(&j->lock); ++ ++ return false; ++} ++ ++void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) ++{ ++ u64 seq = 0; ++ ++ while (!bch2_journal_seq_pins_to_text(out, j, &seq)) ++ seq++; ++} +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +index ec84c3345281..0a73d7134e1c 100644 +--- a/fs/bcachefs/journal_reclaim.h ++++ b/fs/bcachefs/journal_reclaim.h +@@ -78,4 +78,7 @@ static inline bool bch2_journal_flush_all_pins(struct journal *j) + + int bch2_journal_flush_device_pins(struct journal *, int); + ++void bch2_journal_pins_to_text(struct printbuf *, struct journal *); ++bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); ++ + #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +index 19183fcf7ad7..3ba433a48eb8 100644 +--- a/fs/bcachefs/journal_types.h ++++ b/fs/bcachefs/journal_types.h +@@ -9,6 +9,9 @@ + #include "super_types.h" + #include "fifo.h" + ++/* btree write buffer steals 8 bits for its own purposes: */ ++#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) ++ + #define JOURNAL_BUF_BITS 2 + #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) + #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) +@@ -50,15 +53,15 @@ struct journal_buf { + */ + + enum journal_pin_type 
{ +- JOURNAL_PIN_btree, +- JOURNAL_PIN_key_cache, +- JOURNAL_PIN_other, +- JOURNAL_PIN_NR, ++ JOURNAL_PIN_TYPE_btree, ++ JOURNAL_PIN_TYPE_key_cache, ++ JOURNAL_PIN_TYPE_other, ++ JOURNAL_PIN_TYPE_NR, + }; + + struct journal_entry_pin_list { +- struct list_head list[JOURNAL_PIN_NR]; +- struct list_head flushed; ++ struct list_head unflushed[JOURNAL_PIN_TYPE_NR]; ++ struct list_head flushed[JOURNAL_PIN_TYPE_NR]; + atomic_t count; + struct bch_devs_list devs; + }; +@@ -112,6 +115,7 @@ union journal_res_state { + */ + #define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) + ++#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) + #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) + #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) + +@@ -193,6 +197,7 @@ struct journal { + * insufficient devices: + */ + enum journal_errors cur_entry_error; ++ unsigned cur_entry_offset_if_blocked; + + unsigned buf_size_want; + /* +@@ -221,6 +226,7 @@ struct journal { + /* Used when waiting because the journal was full */ + wait_queue_head_t wait; + struct closure_waitlist async_wait; ++ struct closure_waitlist reclaim_flush_wait; + + struct delayed_work write_work; + struct workqueue_struct *wq; +diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c +index 60e00702d1a4..75f27ec26f85 100644 +--- a/fs/bcachefs/logged_ops.c ++++ b/fs/bcachefs/logged_ops.c +@@ -63,8 +63,10 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, + int bch2_resume_logged_ops(struct bch_fs *c) + { + int ret = bch2_trans_run(c, +- for_each_btree_key(trans, iter, +- BTREE_ID_logged_ops, POS_MIN, ++ for_each_btree_key_max(trans, iter, ++ BTREE_ID_logged_ops, ++ POS(LOGGED_OPS_INUM_logged_ops, 0), ++ POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), + BTREE_ITER_prefetch, k, + resume_logged_op(trans, &iter, k))); + bch_err_fn(c, ret); +@@ -74,9 +76,8 @@ int bch2_resume_logged_ops(struct bch_fs *c) + static int __bch2_logged_op_start(struct btree_trans 
*trans, struct bkey_i *k) + { + struct btree_iter iter; +- int ret; +- +- ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX); ++ int ret = bch2_bkey_get_empty_slot(trans, &iter, ++ BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX)); + if (ret) + return ret; + +diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h +index 6a4bf7129dba..cfb67c95d4c8 100644 +--- a/fs/bcachefs/logged_ops_format.h ++++ b/fs/bcachefs/logged_ops_format.h +@@ -2,6 +2,11 @@ + #ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H + #define _BCACHEFS_LOGGED_OPS_FORMAT_H + ++enum logged_ops_inums { ++ LOGGED_OPS_INUM_logged_ops, ++ LOGGED_OPS_INUM_inode_cursors, ++}; ++ + struct bch_logged_op_truncate { + struct bch_val v; + __le32 subvol; +diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c +index 10857eccdeaf..ce794d55818f 100644 +--- a/fs/bcachefs/lru.c ++++ b/fs/bcachefs/lru.c +@@ -12,7 +12,7 @@ + + /* KEY_TYPE_lru is obsolete: */ + int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + int ret = 0; + +@@ -192,7 +192,7 @@ int bch2_check_lrus(struct bch_fs *c) + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, +- NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, ++ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_check_lru_key(trans, &iter, k, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); +diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h +index e6a7d8241bb8..f31a6cf1514c 100644 +--- a/fs/bcachefs/lru.h ++++ b/fs/bcachefs/lru.h +@@ -33,7 +33,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) + return BCH_LRU_read; + } + +-int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); ++int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); + void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct 
bkey_s_c); + + void bch2_lru_pos_to_text(struct printbuf *, struct bpos); +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +index 0ef4a86850bb..ff787d3d50d2 100644 +--- a/fs/bcachefs/move.c ++++ b/fs/bcachefs/move.c +@@ -21,6 +21,8 @@ + #include "journal_reclaim.h" + #include "keylist.h" + #include "move.h" ++#include "rebalance.h" ++#include "reflink.h" + #include "replicas.h" + #include "snapshot.h" + #include "super-io.h" +@@ -72,11 +74,7 @@ struct moving_io { + unsigned read_sectors; + unsigned write_sectors; + +- struct bch_read_bio rbio; +- + struct data_update write; +- /* Must be last since it is variable size */ +- struct bio_vec bi_inline_vecs[]; + }; + + static void move_free(struct moving_io *io) +@@ -86,13 +84,12 @@ static void move_free(struct moving_io *io) + if (io->b) + atomic_dec(&io->b->count); + +- bch2_data_update_exit(&io->write); +- + mutex_lock(&ctxt->lock); + list_del(&io->io_list); + wake_up(&ctxt->wait); + mutex_unlock(&ctxt->lock); + ++ bch2_data_update_exit(&io->write); + kfree(io); + } + +@@ -112,7 +109,7 @@ static void move_write_done(struct bch_write_op *op) + + static void move_write(struct moving_io *io) + { +- if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { ++ if (unlikely(io->write.rbio.bio.bi_status || io->write.rbio.hole)) { + move_free(io); + return; + } +@@ -130,7 +127,7 @@ static void move_write(struct moving_io *io) + atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_inc(&io->write.ctxt->write_ios); + +- bch2_data_update_read_done(&io->write, io->rbio.pick.crc); ++ bch2_data_update_read_done(&io->write); + } + + struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) +@@ -143,7 +140,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx + + static void move_read_endio(struct bio *bio) + { +- struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); ++ struct moving_io *io = container_of(bio, struct moving_io, 
write.rbio.bio); + struct moving_context *ctxt = io->write.ctxt; + + atomic_sub(io->read_sectors, &ctxt->read_sectors); +@@ -196,6 +193,13 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) + list_del(&ctxt->list); + mutex_unlock(&c->moving_context_lock); + ++ /* ++ * Generally, releasing a transaction within a transaction restart means ++ * an unhandled transaction restart: but this can happen legitimately ++ * within the move code, e.g. when bch2_move_ratelimit() tells us to ++ * exit before we've retried ++ */ ++ bch2_trans_begin(ctxt->trans); + bch2_trans_put(ctxt->trans); + memset(ctxt, 0, sizeof(*ctxt)); + } +@@ -249,11 +253,6 @@ int bch2_move_extent(struct moving_context *ctxt, + { + struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; +- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); +- struct moving_io *io; +- const union bch_extent_entry *entry; +- struct extent_ptr_decoded p; +- unsigned sectors = k.k->size, pages; + int ret = -ENOMEM; + + trace_move_extent2(c, k, &io_opts, &data_opts); +@@ -276,13 +275,7 @@ int bch2_move_extent(struct moving_context *ctxt, + */ + bch2_trans_unlock(trans); + +- /* write path might have to decompress data: */ +- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) +- sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); +- +- pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); +- io = kzalloc(sizeof(struct moving_io) + +- sizeof(struct bio_vec) * pages, GFP_KERNEL); ++ struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); + if (!io) + goto err; + +@@ -291,29 +284,13 @@ int bch2_move_extent(struct moving_context *ctxt, + io->read_sectors = k.k->size; + io->write_sectors = k.k->size; + +- bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); +- bio_set_prio(&io->write.op.wbio.bio, +- IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); +- +- if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, +- GFP_KERNEL)) +- goto err_free; +- +- io->rbio.c = c; +- io->rbio.opts = 
io_opts; +- bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); +- io->rbio.bio.bi_vcnt = pages; +- bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); +- io->rbio.bio.bi_iter.bi_size = sectors << 9; +- +- io->rbio.bio.bi_opf = REQ_OP_READ; +- io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); +- io->rbio.bio.bi_end_io = move_read_endio; +- + ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, + io_opts, data_opts, iter->btree_id, k); + if (ret) +- goto err_free_pages; ++ goto err_free; ++ ++ io->write.rbio.bio.bi_end_io = move_read_endio; ++ bio_set_prio(&io->write.rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + io->write.op.end_io = move_write_done; + +@@ -347,18 +324,16 @@ int bch2_move_extent(struct moving_context *ctxt, + * ctxt when doing wakeup + */ + closure_get(&ctxt->cl); +- bch2_read_extent(trans, &io->rbio, ++ bch2_read_extent(trans, &io->write.rbio, + bkey_start_pos(k.k), + iter->btree_id, k, 0, +- BCH_READ_NODECODE| +- BCH_READ_LAST_FRAGMENT); ++ BCH_READ_data_update| ++ BCH_READ_last_fragment); + return 0; +-err_free_pages: +- bio_free_pages(&io->write.op.wbio.bio); + err_free: + kfree(io); + err: +- if (ret == -BCH_ERR_data_update_done) ++ if (bch2_err_matches(ret, BCH_ERR_data_update_done)) + return 0; + + if (bch2_err_matches(ret, EROFS) || +@@ -379,34 +354,42 @@ int bch2_move_extent(struct moving_context *ctxt, + return ret; + } + +-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, ++static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, + struct per_snapshot_io_opts *io_opts, ++ struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ ++ struct btree_iter *extent_iter, + struct bkey_s_c extent_k) + { + struct bch_fs *c = trans->c; + u32 restart_count = trans->restart_count; ++ struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; + int ret = 0; + +- if (io_opts->cur_inum != extent_k.k->p.inode) { ++ if (extent_k.k->type == 
KEY_TYPE_reflink_v) ++ goto out; ++ ++ if (io_opts->cur_inum != extent_pos.inode) { + io_opts->d.nr = 0; + +- ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), ++ ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), + BTREE_ITER_all_snapshots, k, ({ +- if (k.k->p.offset != extent_k.k->p.inode) ++ if (k.k->p.offset != extent_pos.inode) + break; + + if (!bkey_is_inode(k.k)) + continue; + + struct bch_inode_unpacked inode; +- BUG_ON(bch2_inode_unpack(k, &inode)); ++ _ret3 = bch2_inode_unpack(k, &inode); ++ if (_ret3) ++ break; + + struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; + bch2_inode_opts_get(&e.io_opts, trans->c, &inode); + + darray_push(&io_opts->d, e); + })); +- io_opts->cur_inum = extent_k.k->p.inode; ++ io_opts->cur_inum = extent_pos.inode; + } + + ret = ret ?: trans_was_restarted(trans, restart_count); +@@ -415,43 +398,46 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, + + if (extent_k.k->p.snapshot) + darray_for_each(io_opts->d, i) +- if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) +- return &i->io_opts; +- +- return &io_opts->fs_io_opts; ++ if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) { ++ opts_ret = &i->io_opts; ++ break; ++ } ++out: ++ ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); ++ if (ret) ++ return ERR_PTR(ret); ++ return opts_ret; + } + + int bch2_move_get_io_opts_one(struct btree_trans *trans, + struct bch_io_opts *io_opts, ++ struct btree_iter *extent_iter, + struct bkey_s_c extent_k) + { +- struct btree_iter iter; +- struct bkey_s_c k; +- int ret; ++ struct bch_fs *c = trans->c; ++ ++ *io_opts = bch2_opts_to_inode_opts(c->opts); + + /* reflink btree? 
*/ +- if (!extent_k.k->p.inode) { +- *io_opts = bch2_opts_to_inode_opts(trans->c->opts); +- return 0; +- } ++ if (!extent_k.k->p.inode) ++ goto out; + +- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, ++ struct btree_iter inode_iter; ++ struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, + SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), + BTREE_ITER_cached); +- ret = bkey_err(k); ++ int ret = bkey_err(inode_k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + +- if (!ret && bkey_is_inode(k.k)) { ++ if (!ret && bkey_is_inode(inode_k.k)) { + struct bch_inode_unpacked inode; +- bch2_inode_unpack(k, &inode); +- bch2_inode_opts_get(io_opts, trans->c, &inode); +- } else { +- *io_opts = bch2_opts_to_inode_opts(trans->c->opts); ++ bch2_inode_unpack(inode_k, &inode); ++ bch2_inode_opts_get(io_opts, c, &inode); + } +- +- bch2_trans_iter_exit(trans, &iter); +- return 0; ++ bch2_trans_iter_exit(trans, &inode_iter); ++out: ++ return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); + } + + int bch2_move_ratelimit(struct moving_context *ctxt) +@@ -509,9 +495,15 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + struct per_snapshot_io_opts snapshot_io_opts; + struct bch_io_opts *io_opts; + struct bkey_buf sk; +- struct btree_iter iter; ++ struct btree_iter iter, reflink_iter = {}; + struct bkey_s_c k; + struct data_update_opts data_opts; ++ /* ++ * If we're moving a single file, also process reflinked data it points ++ * to (this includes propagating changed io_opts from the inode to the ++ * extent): ++ */ ++ bool walk_indirect = start.inode == end.inode; + int ret = 0, ret2; + + per_snapshot_io_opts_init(&snapshot_io_opts, c); +@@ -531,6 +523,8 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + bch2_ratelimit_reset(ctxt->rate); + + while (!bch2_move_ratelimit(ctxt)) { ++ struct btree_iter *extent_iter = &iter; ++ + bch2_trans_begin(trans); + + k = 
bch2_btree_iter_peek(&iter); +@@ -549,10 +543,36 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + if (ctxt->stats) + ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + ++ if (walk_indirect && ++ k.k->type == KEY_TYPE_reflink_p && ++ REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); ++ ++ bch2_trans_iter_exit(trans, &reflink_iter); ++ k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); ++ ret = bkey_err(k); ++ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ++ continue; ++ if (ret) ++ break; ++ ++ if (bkey_deleted(k.k)) ++ goto next_nondata; ++ ++ /* ++ * XXX: reflink pointers may point to multiple indirect ++ * extents, so don't advance past the entire reflink ++ * pointer - need to fixup iter->k ++ */ ++ extent_iter = &reflink_iter; ++ } ++ + if (!bkey_extent_is_direct_data(k.k)) + goto next_nondata; + +- io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k); ++ io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, ++ iter.pos, extent_iter, k); + ret = PTR_ERR_OR_ZERO(io_opts); + if (ret) + continue; +@@ -568,12 +588,12 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + +- ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); ++ ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + if (ret2) { + if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) + continue; + +- if (ret2 == -ENOMEM) { ++ if (bch2_err_matches(ret2, ENOMEM)) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + continue; +@@ -589,6 +609,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, + bch2_btree_iter_advance(&iter); + } + ++ bch2_trans_iter_exit(trans, &reflink_iter); + bch2_trans_iter_exit(trans, 
&iter); + bch2_bkey_buf_exit(&sk, c); + per_snapshot_io_opts_exit(&snapshot_io_opts); +@@ -654,16 +675,12 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, + struct bch_fs *c = trans->c; + bool is_kthread = current->flags & PF_KTHREAD; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); +- struct btree_iter iter; ++ struct btree_iter iter = {}, bp_iter = {}; + struct bkey_buf sk; +- struct bch_backpointer bp; +- struct bch_alloc_v4 a_convert; +- const struct bch_alloc_v4 *a; + struct bkey_s_c k; + struct data_update_opts data_opts; +- unsigned dirty_sectors, bucket_size; +- u64 fragmentation; +- struct bpos bp_pos = POS_MIN; ++ unsigned sectors_moved = 0; ++ struct bkey_buf last_flushed; + int ret = 0; + + struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); +@@ -672,6 +689,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, + + trace_bucket_evacuate(c, &bucket); + ++ bch2_bkey_buf_init(&last_flushed); ++ bkey_init(&last_flushed.k->k); + bch2_bkey_buf_init(&sk); + + /* +@@ -679,21 +698,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, + */ + bch2_trans_begin(trans); + +- bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, +- bucket, BTREE_ITER_cached); +- ret = lockrestart_do(trans, +- bkey_err(k = bch2_btree_iter_peek_slot(&iter))); +- bch2_trans_iter_exit(trans, &iter); ++ bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, ++ bucket_pos_to_bp_start(ca, bucket), 0); + + bch_err_msg(c, ret, "looking up alloc key"); + if (ret) + goto err; + +- a = bch2_alloc_to_v4(k, &a_convert); +- dirty_sectors = bch2_bucket_sectors_dirty(*a); +- bucket_size = ca->mi.bucket_size; +- fragmentation = alloc_lru_idx_fragmentation(*a, ca); +- + ret = bch2_btree_write_buffer_tryflush(trans); + bch_err_msg(c, ret, "flushing btree write buffer"); + if (ret) +@@ -705,18 +716,23 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, + + bch2_trans_begin(trans); + +- ret = bch2_get_next_backpointer(trans, ca, bucket, gen, +- &bp_pos, &bp, +- 
BTREE_ITER_cached); ++ k = bch2_btree_iter_peek(&bp_iter); ++ ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; +- if (bkey_eq(bp_pos, POS_MAX)) ++ ++ if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) + break; + +- if (!bp.level) { +- k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); ++ if (k.k->type != KEY_TYPE_backpointer) ++ goto next; ++ ++ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); ++ ++ if (!bp.v->level) { ++ k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; +@@ -728,7 +744,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + +- ret = bch2_move_get_io_opts_one(trans, &io_opts, k); ++ ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); + if (ret) { + bch2_trans_iter_exit(trans, &iter); + continue; +@@ -738,14 +754,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, + data_opts.target = io_opts.background_target; + data_opts.rewrite_ptrs = 0; + ++ unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ + unsigned i = 0; +- bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { +- if (ptr->dev == bucket.inode) { +- data_opts.rewrite_ptrs |= 1U << i; +- if (ptr->cached) { ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { ++ if (p.ptr.dev == bucket.inode) { ++ if (p.ptr.cached) { + bch2_trans_iter_exit(trans, &iter); + goto next; + } ++ data_opts.rewrite_ptrs |= 1U << i; ++ break; + } + i++; + } +@@ -765,14 +785,15 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, + goto err; + + if (ctxt->stats) +- atomic64_add(k.k->size, &ctxt->stats->sectors_seen); ++ atomic64_add(sectors, &ctxt->stats->sectors_seen); ++ sectors_moved += sectors; + } else { + struct btree *b; + +- b = 
bch2_backpointer_get_node(trans, &iter, bp_pos, bp); ++ b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); + ret = PTR_ERR_OR_ZERO(b); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) +- continue; ++ goto next; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) +@@ -796,15 +817,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, + atomic64_add(sectors, &ctxt->stats->sectors_seen); + atomic64_add(sectors, &ctxt->stats->sectors_moved); + } ++ sectors_moved += btree_sectors(c); + } + next: +- bp_pos = bpos_nosnap_successor(bp_pos); ++ bch2_btree_iter_advance(&bp_iter); + } + +- trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); ++ trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); + err: ++ bch2_trans_iter_exit(trans, &bp_iter); + bch2_dev_put(ca); + bch2_bkey_buf_exit(&sk, c); ++ bch2_bkey_buf_exit(&last_flushed, c); + return ret; + } + +@@ -1158,7 +1182,7 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str + + mutex_lock(&ctxt->lock); + list_for_each_entry(io, &ctxt->ios, io_list) +- bch2_write_op_to_text(out, &io->write.op); ++ bch2_data_update_inflight_to_text(out, &io->write); + mutex_unlock(&ctxt->lock); + + printbuf_indent_sub(out, 4); +diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h +index 9baf3093a678..51e0505a8156 100644 +--- a/fs/bcachefs/move.h ++++ b/fs/bcachefs/move.h +@@ -110,9 +110,8 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt + darray_exit(&io_opts->d); + } + +-struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, +- struct per_snapshot_io_opts *, struct bkey_s_c); +-int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c); ++int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, ++ struct btree_iter *, struct bkey_s_c); + + int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); 
+ +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +index d658be90f737..21805509ab9e 100644 +--- a/fs/bcachefs/movinggc.c ++++ b/fs/bcachefs/movinggc.c +@@ -167,7 +167,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, + + bch2_trans_begin(trans); + +- ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, ++ ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, + lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), + lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), + 0, k, ({ +@@ -215,7 +215,8 @@ static int bch2_copygc(struct moving_context *ctxt, + }; + move_buckets buckets = { 0 }; + struct move_bucket_in_flight *f; +- u64 moved = atomic64_read(&ctxt->stats->sectors_moved); ++ u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); ++ u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); + int ret = 0; + + ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); +@@ -245,7 +246,6 @@ static int bch2_copygc(struct moving_context *ctxt, + *did_work = true; + } + err: +- darray_exit(&buckets); + + /* no entries in LRU btree found, or got to end: */ + if (bch2_err_matches(ret, ENOENT)) +@@ -254,8 +254,11 @@ static int bch2_copygc(struct moving_context *ctxt, + if (ret < 0 && !bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "from bch2_move_data()"); + +- moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; +- trace_and_count(c, copygc, c, moved, 0, 0, 0); ++ sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; ++ sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; ++ trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); ++ ++ darray_exit(&buckets); + return ret; + } + +@@ -350,9 +353,9 @@ static int bch2_copygc_thread(void *arg) + bch2_trans_unlock_long(ctxt.trans); + cond_resched(); + +- if (!c->copy_gc_enabled) { ++ if (!c->opts.copygc_enabled) { + move_buckets_wait(&ctxt, buckets, true); +- kthread_wait_freezable(c->copy_gc_enabled 
|| ++ kthread_wait_freezable(c->opts.copygc_enabled || + kthread_should_stop()); + } + +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +index 0e2ee262fbd4..6772faf385a5 100644 +--- a/fs/bcachefs/opts.c ++++ b/fs/bcachefs/opts.c +@@ -1,6 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0 + + #include ++#include + + #include "bcachefs.h" + #include "compress.h" +@@ -48,12 +49,12 @@ static const char * const __bch2_csum_types[] = { + NULL + }; + +-const char * const bch2_csum_opts[] = { ++const char * const __bch2_csum_opts[] = { + BCH_CSUM_OPTS() + NULL + }; + +-static const char * const __bch2_compression_types[] = { ++const char * const __bch2_compression_types[] = { + BCH_COMPRESSION_TYPES() + NULL + }; +@@ -113,6 +114,7 @@ void bch2_prt_##name(struct printbuf *out, type t) \ + PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); + PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); + PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); ++PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); + PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); + PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); + PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); +@@ -333,17 +335,18 @@ int bch2_opt_parse(struct bch_fs *c, + switch (opt->type) { + case BCH_OPT_BOOL: + if (val) { +- ret = kstrtou64(val, 10, res); ++ ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); ++ if (ret != -BCH_ERR_option_not_bool) { ++ *res = ret; ++ } else { ++ if (err) ++ prt_printf(err, "%s: must be bool", opt->attr.name); ++ return ret; ++ } + } else { +- ret = 0; + *res = 1; + } + +- if (ret < 0 || (*res != 0 && *res != 1)) { +- if (err) +- prt_printf(err, "%s: must be bool", opt->attr.name); +- return ret < 0 ? 
ret : -BCH_ERR_option_not_bool; +- } + break; + case BCH_OPT_UINT: + if (!val) { +@@ -710,11 +713,14 @@ void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, + + struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) + { +- return (struct bch_io_opts) { ++ struct bch_io_opts opts = { + #define x(_name, _bits) ._name = src._name, + BCH_INODE_OPTS() + #undef x + }; ++ ++ bch2_io_opts_fixups(&opts); ++ return opts; + } + + bool bch2_opt_is_inode_opt(enum bch_opt_id id) +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +index 23dda014e331..a182b5d454ba 100644 +--- a/fs/bcachefs/opts.h ++++ b/fs/bcachefs/opts.h +@@ -16,7 +16,8 @@ extern const char * const bch2_version_upgrade_opts[]; + extern const char * const bch2_sb_features[]; + extern const char * const bch2_sb_compat[]; + extern const char * const __bch2_btree_ids[]; +-extern const char * const bch2_csum_opts[]; ++extern const char * const __bch2_csum_opts[]; ++extern const char * const __bch2_compression_types[]; + extern const char * const bch2_compression_opts[]; + extern const char * const __bch2_str_hash_types[]; + extern const char * const bch2_str_hash_opts[]; +@@ -27,6 +28,7 @@ extern const char * const bch2_d_types[]; + void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); + void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); + void bch2_prt_data_type(struct printbuf *, enum bch_data_type); ++void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); + void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); + void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); + void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); +@@ -171,12 +173,12 @@ enum fsck_err_opts { + "size", "Maximum size of checksummed/compressed extents")\ + x(metadata_checksum, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +- OPT_STR(bch2_csum_opts), \ ++ OPT_STR(__bch2_csum_opts), \ + BCH_SB_META_CSUM_TYPE, 
BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(data_checksum, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +- OPT_STR(bch2_csum_opts), \ ++ OPT_STR(__bch2_csum_opts), \ + BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(compression, u8, \ +@@ -220,14 +222,14 @@ enum fsck_err_opts { + BCH_SB_ERASURE_CODE, false, \ + NULL, "Enable erasure coding (DO NOT USE YET)") \ + x(inodes_32bit, u8, \ +- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_INODE_32BIT, true, \ + NULL, "Constrain inode numbers to 32 bits") \ +- x(shard_inode_numbers, u8, \ +- OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ +- OPT_BOOL(), \ +- BCH_SB_SHARD_INUMS, true, \ ++ x(shard_inode_numbers_bits, u8, \ ++ OPT_FS|OPT_FORMAT, \ ++ OPT_UINT(0, 8), \ ++ BCH_SB_SHARD_INUMS_NBITS, 0, \ + NULL, "Shard new inode numbers by CPU id") \ + x(inodes_use_key_cache, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ +@@ -473,6 +475,18 @@ enum fsck_err_opts { + BCH2_NO_SB_OPT, true, \ + NULL, "Enable nocow mode: enables runtime locking in\n"\ + "data move path needed if nocow will ever be in use\n")\ ++ x(copygc_enabled, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Enable copygc: disable for debugging, or to\n"\ ++ "quiet the system when doing performance testing\n")\ ++ x(rebalance_enabled, u8, \ ++ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH2_NO_SB_OPT, true, \ ++ NULL, "Enable rebalance: disable for debugging, or to\n"\ ++ "quiet the system when doing performance testing\n")\ + x(no_data_io, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ +@@ -488,7 +502,7 @@ enum fsck_err_opts { + OPT_DEVICE, \ + OPT_UINT(0, S64_MAX), \ + BCH2_NO_SB_OPT, 0, \ +- "size", "Size of filesystem on device") \ ++ "size", "Specifies the bucket size; must be greater than the btree node size")\ + x(durability, u8, \ + OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ + OPT_UINT(0, BCH_REPLICAS_MAX), \ +@@ -624,14 
+638,39 @@ struct bch_io_opts { + #define x(_name, _bits) u##_bits _name; + BCH_INODE_OPTS() + #undef x ++#define x(_name, _bits) u64 _name##_from_inode:1; ++ BCH_INODE_OPTS() ++#undef x + }; + +-static inline unsigned background_compression(struct bch_io_opts opts) ++static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) + { +- return opts.background_compression ?: opts.compression; ++ if (!opts->background_target) ++ opts->background_target = opts->foreground_target; ++ if (!opts->background_compression) ++ opts->background_compression = opts->compression; ++ if (opts->nocow) { ++ opts->compression = opts->background_compression = 0; ++ opts->data_checksum = 0; ++ opts->erasure_code = 0; ++ } + } + + struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); + bool bch2_opt_is_inode_opt(enum bch_opt_id); + ++/* rebalance opts: */ ++ ++static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts) ++{ ++ return (struct bch_extent_rebalance) { ++ .type = BIT(BCH_EXTENT_ENTRY_rebalance), ++#define x(_name) \ ++ ._name = opts->_name, \ ++ ._name##_from_inode = opts->_name##_from_inode, ++ BCH_REBALANCE_OPTS() ++#undef x ++ }; ++}; ++ + #endif /* _BCACHEFS_OPTS_H */ +diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h +index 1d570387b77f..d0dd398baa2b 100644 +--- a/fs/bcachefs/printbuf.h ++++ b/fs/bcachefs/printbuf.h +@@ -251,16 +251,23 @@ static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) + printbuf_nul_terminate_reserved(out); + } + ++static inline void printbuf_reset_keep_tabstops(struct printbuf *buf) ++{ ++ buf->pos = 0; ++ buf->allocation_failure = 0; ++ buf->last_newline = 0; ++ buf->last_field = 0; ++ buf->indent = 0; ++ buf->cur_tabstop = 0; ++} ++ + /** + * printbuf_reset - re-use a printbuf without freeing and re-initializing it: + */ + static inline void printbuf_reset(struct printbuf *buf) + { +- buf->pos = 0; +- buf->allocation_failure = 0; +- buf->indent = 0; ++ 
printbuf_reset_keep_tabstops(buf); + buf->nr_tabstops = 0; +- buf->cur_tabstop = 0; + } + + /** +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +index 74f45a8162ad..8b857fc33244 100644 +--- a/fs/bcachefs/quota.c ++++ b/fs/bcachefs/quota.c +@@ -60,7 +60,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { + }; + + int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + int ret = 0; + +diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h +index a62abcc5332a..1551800ff44c 100644 +--- a/fs/bcachefs/quota.h ++++ b/fs/bcachefs/quota.h +@@ -5,10 +5,10 @@ + #include "inode.h" + #include "quota_types.h" + +-enum bch_validate_flags; + extern const struct bch_sb_field_ops bch_sb_field_ops_quota; + +-int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); ++int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); + void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_quota ((struct bkey_ops) { \ +diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c +index 40a20192eee8..bef2aa1b8bcd 100644 +--- a/fs/bcachefs/rcu_pending.c ++++ b/fs/bcachefs/rcu_pending.c +@@ -25,21 +25,37 @@ enum rcu_pending_special { + #define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) + #define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) + +-static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp) ++#ifdef __KERNEL__ ++typedef unsigned long rcu_gp_poll_state_t; ++ ++static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) ++{ ++ return l == r; ++} ++#else ++typedef struct urcu_gp_poll_state rcu_gp_poll_state_t; ++ ++static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) ++{ ++ return l.grace_period_id == r.grace_period_id; ++} 
++#endif ++ ++static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp) + { + return ssp + ? get_state_synchronize_srcu(ssp) + : get_state_synchronize_rcu(); + } + +-static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp) ++static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp) + { + return ssp + ? start_poll_synchronize_srcu(ssp) + : start_poll_synchronize_rcu(); + } + +-static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie) ++static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie) + { + return ssp + ? poll_state_synchronize_srcu(ssp, cookie) +@@ -71,13 +87,13 @@ struct rcu_pending_seq { + GENRADIX(struct rcu_head *) objs; + size_t nr; + struct rcu_head **cursor; +- unsigned long seq; ++ rcu_gp_poll_state_t seq; + }; + + struct rcu_pending_list { + struct rcu_head *head; + struct rcu_head *tail; +- unsigned long seq; ++ rcu_gp_poll_state_t seq; + }; + + struct rcu_pending_pcpu { +@@ -316,10 +332,10 @@ static void rcu_pending_rcu_cb(struct rcu_head *rcu) + } + + static __always_inline struct rcu_pending_seq * +-get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) ++get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq) + { + darray_for_each_reverse(p->objs, objs) +- if (objs->seq == seq) ++ if (rcu_gp_poll_cookie_eq(objs->seq, seq)) + return objs; + + if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC)) +@@ -329,7 +345,7 @@ get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) + } + + static noinline bool +-rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, ++rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq, + struct rcu_head *head, void *ptr, + unsigned long *flags) + { +@@ -364,7 +380,7 @@ rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, + again: + for 
(struct rcu_pending_list *i = p->lists; + i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { +- if (i->seq == seq) { ++ if (rcu_gp_poll_cookie_eq(i->seq, seq)) { + rcu_pending_list_add(i, head); + return false; + } +@@ -408,7 +424,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, + struct rcu_pending_pcpu *p; + struct rcu_pending_seq *objs; + struct genradix_node *new_node = NULL; +- unsigned long seq, flags; ++ unsigned long flags; + bool start_gp = false; + + BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); +@@ -416,7 +432,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, + local_irq_save(flags); + p = this_cpu_ptr(pending->p); + spin_lock(&p->lock); +- seq = __get_state_synchronize_rcu(pending->srcu); ++ rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); + restart: + if (may_sleep && + unlikely(process_finished_items(pending, p, flags))) +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +index cd6647374353..90dbf04c07a1 100644 +--- a/fs/bcachefs/rebalance.c ++++ b/fs/bcachefs/rebalance.c +@@ -24,6 +24,192 @@ + #include + #include + ++/* bch_extent_rebalance: */ ++ ++static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ ++ bkey_extent_entry_for_each(ptrs, entry) ++ if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) ++ return &entry->rebalance; ++ ++ return NULL; ++} ++ ++static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, ++ struct bch_io_opts *opts, ++ struct bkey_s_c k, ++ struct bkey_ptrs_c ptrs) ++{ ++ if (!opts->background_compression) ++ return 0; ++ ++ unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned ptr_bit = 1; ++ unsigned rewrite_ptrs = 0; ++ ++ 
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || ++ p.ptr.unwritten) ++ return 0; ++ ++ if (!p.ptr.cached && p.crc.compression_type != compression_type) ++ rewrite_ptrs |= ptr_bit; ++ ptr_bit <<= 1; ++ } ++ ++ return rewrite_ptrs; ++} ++ ++static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, ++ struct bch_io_opts *opts, ++ struct bkey_ptrs_c ptrs) ++{ ++ if (!opts->background_target || ++ !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) ++ return 0; ++ ++ unsigned ptr_bit = 1; ++ unsigned rewrite_ptrs = 0; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) ++ rewrite_ptrs |= ptr_bit; ++ ptr_bit <<= 1; ++ } ++ ++ return rewrite_ptrs; ++} ++ ++static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, ++ struct bch_io_opts *opts, ++ struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ ++ return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | ++ bch2_bkey_ptrs_need_move(c, opts, ptrs); ++} ++ ++u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); ++ if (!opts) ++ return 0; ++ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ u64 sectors = 0; ++ ++ if (opts->background_compression) { ++ unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || ++ p.ptr.unwritten) { ++ sectors = 0; ++ goto incompressible; ++ } ++ ++ if (!p.ptr.cached && p.crc.compression_type != compression_type) ++ sectors += p.crc.compressed_size; ++ } ++ } ++incompressible: ++ if (opts->background_target && ++ bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) 
{ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) ++ sectors += p.crc.compressed_size; ++ } ++ ++ return sectors; ++} ++ ++static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, ++ struct bkey_s_c k) ++{ ++ if (!bkey_extent_is_direct_data(k.k)) ++ return 0; ++ ++ const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); ++ ++ if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { ++ struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); ++ return old == NULL || memcmp(old, &new, sizeof(new)); ++ } else { ++ return old != NULL; ++ } ++} ++ ++int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, ++ struct bkey_i *_k) ++{ ++ if (!bkey_extent_is_direct_data(&_k->k)) ++ return 0; ++ ++ struct bkey_s k = bkey_i_to_s(_k); ++ struct bch_extent_rebalance *old = ++ (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); ++ ++ if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { ++ if (!old) { ++ old = bkey_val_end(k); ++ k.k->u64s += sizeof(*old) / sizeof(u64); ++ } ++ ++ *old = io_opts_to_rebalance_opts(opts); ++ } else { ++ if (old) ++ extent_entry_drop(k, (union bch_extent_entry *) old); ++ } ++ ++ return 0; ++} ++ ++int bch2_get_update_rebalance_opts(struct btree_trans *trans, ++ struct bch_io_opts *io_opts, ++ struct btree_iter *iter, ++ struct bkey_s_c k) ++{ ++ BUG_ON(iter->flags & BTREE_ITER_is_extents); ++ BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); ++ ++ const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v ++ ? 
bch2_bkey_rebalance_opts(k) : NULL; ++ if (r) { ++#define x(_name) \ ++ if (r->_name##_from_inode) { \ ++ io_opts->_name = r->_name; \ ++ io_opts->_name##_from_inode = true; \ ++ } ++ BCH_REBALANCE_OPTS() ++#undef x ++ } ++ ++ if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k)) ++ return 0; ++ ++ struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); ++ int ret = PTR_ERR_OR_ZERO(n); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(n, k); ++ ++ /* On successfull transaction commit, @k was invalidated: */ ++ ++ return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: ++ bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0) ?: ++ -BCH_ERR_transaction_restart_nested; ++} ++ + #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) + + static const char * const bch2_rebalance_state_strs[] = { +@@ -33,7 +219,7 @@ static const char * const bch2_rebalance_state_strs[] = { + #undef x + }; + +-static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) ++int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) + { + struct btree_iter iter; + struct bkey_s_c k; +@@ -71,9 +257,8 @@ static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) + int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) + { + int ret = bch2_trans_commit_do(c, NULL, NULL, +- BCH_TRANS_COMMIT_no_enospc| +- BCH_TRANS_COMMIT_lazy_rw, +- __bch2_set_rebalance_needs_scan(trans, inum)); ++ BCH_TRANS_COMMIT_no_enospc, ++ bch2_set_rebalance_needs_scan_trans(trans, inum)); + rebalance_wakeup(c); + return ret; + } +@@ -121,6 +306,9 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) + { ++ if (!bch2_bkey_rebalance_opts(k)) ++ return 0; ++ + struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) +@@ -134,32 +322,28 @@ static int 
bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, + static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, + struct bpos work_pos, + struct btree_iter *extent_iter, ++ struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) + { + struct bch_fs *c = trans->c; +- struct bkey_s_c k; + + bch2_trans_iter_exit(trans, extent_iter); + bch2_trans_iter_init(trans, extent_iter, + work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, + work_pos, + BTREE_ITER_all_snapshots); +- k = bch2_btree_iter_peek_slot(extent_iter); ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter); + if (bkey_err(k)) + return k; + +- const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL; +- if (!r) { +- /* raced due to btree write buffer, nothing to do */ +- return bkey_s_c_null; +- } ++ int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k); ++ if (ret) ++ return bkey_s_c_err(ret); + + memset(data_opts, 0, sizeof(*data_opts)); +- +- data_opts->rewrite_ptrs = +- bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); +- data_opts->target = r->target; +- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; ++ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); ++ data_opts->target = io_opts->background_target; ++ data_opts->write_flags |= BCH_WRITE_only_specified_devs; + + if (!data_opts->rewrite_ptrs) { + /* +@@ -178,12 +362,28 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, + if (trace_rebalance_extent_enabled()) { + struct printbuf buf = PRINTBUF; + +- prt_str(&buf, "target="); +- bch2_target_to_text(&buf, c, r->target); +- prt_str(&buf, " compression="); +- bch2_compression_opt_to_text(&buf, r->compression); +- prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, k); ++ prt_newline(&buf); ++ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ ++ unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs); ++ if (p) { ++ prt_str(&buf, "compression="); ++ 
bch2_compression_opt_to_text(&buf, io_opts->background_compression); ++ prt_str(&buf, " "); ++ bch2_prt_u64_base2(&buf, p); ++ prt_newline(&buf); ++ } ++ ++ p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs); ++ if (p) { ++ prt_str(&buf, "move="); ++ bch2_target_to_text(&buf, c, io_opts->background_target); ++ prt_str(&buf, " "); ++ bch2_prt_u64_base2(&buf, p); ++ prt_newline(&buf); ++ } + + trace_rebalance_extent(c, buf.buf); + printbuf_exit(&buf); +@@ -212,14 +412,10 @@ static int do_rebalance_extent(struct moving_context *ctxt, + bch2_bkey_buf_init(&sk); + + ret = bkey_err(k = next_rebalance_extent(trans, work_pos, +- extent_iter, &data_opts)); ++ extent_iter, &io_opts, &data_opts)); + if (ret || !k.k) + goto out; + +- ret = bch2_move_get_io_opts_one(trans, &io_opts, k); +- if (ret) +- goto out; +- + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + + /* +@@ -253,21 +449,9 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) + { +- unsigned target, compression; +- +- if (k.k->p.inode) { +- target = io_opts->background_target; +- compression = background_compression(*io_opts); +- } else { +- const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); +- +- target = r ? r->target : io_opts->background_target; +- compression = r ? 
r->compression : background_compression(*io_opts); +- } +- +- data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); +- data_opts->target = target; +- data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; ++ data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); ++ data_opts->target = io_opts->background_target; ++ data_opts->write_flags |= BCH_WRITE_only_specified_devs; + return data_opts->rewrite_ptrs != 0; + } + +@@ -338,9 +522,9 @@ static int do_rebalance(struct moving_context *ctxt) + BTREE_ITER_all_snapshots); + + while (!bch2_move_ratelimit(ctxt)) { +- if (!r->enabled) { ++ if (!c->opts.rebalance_enabled) { + bch2_moving_ctxt_flush_all(ctxt); +- kthread_wait_freezable(r->enabled || ++ kthread_wait_freezable(c->opts.rebalance_enabled || + kthread_should_stop()); + } + +diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h +index 28a52638f16c..0a0821ab895d 100644 +--- a/fs/bcachefs/rebalance.h ++++ b/fs/bcachefs/rebalance.h +@@ -2,8 +2,18 @@ + #ifndef _BCACHEFS_REBALANCE_H + #define _BCACHEFS_REBALANCE_H + ++#include "compress.h" ++#include "disk_groups.h" + #include "rebalance_types.h" + ++u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); ++int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); ++int bch2_get_update_rebalance_opts(struct btree_trans *, ++ struct bch_io_opts *, ++ struct btree_iter *, ++ struct bkey_s_c); ++ ++int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); + int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); + int bch2_set_fs_needs_rebalance(struct bch_fs *); + +diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h +new file mode 100644 +index 000000000000..ff9a1342a22b +--- /dev/null ++++ b/fs/bcachefs/rebalance_format.h +@@ -0,0 +1,53 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_FORMAT_H ++#define _BCACHEFS_REBALANCE_FORMAT_H ++ ++struct 
bch_extent_rebalance { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:6, ++ unused:3, ++ ++ promote_target_from_inode:1, ++ erasure_code_from_inode:1, ++ data_checksum_from_inode:1, ++ background_compression_from_inode:1, ++ data_replicas_from_inode:1, ++ background_target_from_inode:1, ++ ++ promote_target:16, ++ erasure_code:1, ++ data_checksum:4, ++ data_replicas:4, ++ background_compression:8, /* enum bch_compression_opt */ ++ background_target:16; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 background_target:16, ++ background_compression:8, ++ data_replicas:4, ++ data_checksum:4, ++ erasure_code:1, ++ promote_target:16, ++ ++ background_target_from_inode:1, ++ data_replicas_from_inode:1, ++ background_compression_from_inode:1, ++ data_checksum_from_inode:1, ++ erasure_code_from_inode:1, ++ promote_target_from_inode:1, ++ ++ unused:3, ++ type:6; ++#endif ++}; ++ ++/* subset of BCH_INODE_OPTS */ ++#define BCH_REBALANCE_OPTS() \ ++ x(data_checksum) \ ++ x(background_compression) \ ++ x(data_replicas) \ ++ x(promote_target) \ ++ x(background_target) \ ++ x(erasure_code) ++ ++#endif /* _BCACHEFS_REBALANCE_FORMAT_H */ ++ +diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h +index 0fffb536c1d0..fe5098c17dfc 100644 +--- a/fs/bcachefs/rebalance_types.h ++++ b/fs/bcachefs/rebalance_types.h +@@ -30,8 +30,6 @@ struct bch_fs_rebalance { + struct bbpos scan_start; + struct bbpos scan_end; + struct bch_move_stats scan_stats; +- +- unsigned enabled:1; + }; + + #endif /* _BCACHEFS_REBALANCE_TYPES_H */ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +index 3c7f941dde39..98825437381c 100644 +--- a/fs/bcachefs/recovery.c ++++ b/fs/bcachefs/recovery.c +@@ -34,21 +34,83 @@ + + #define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +-void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) ++int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) + { +- if (btree >= BTREE_ID_NR_MAX) +- return; +- + u64 b = 
BIT_ULL(btree); ++ int ret = 0; ++ ++ mutex_lock(&c->sb_lock); ++ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + if (!(c->sb.btrees_lost_data & b)) { +- bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree)); ++ struct printbuf buf = PRINTBUF; ++ bch2_btree_id_to_text(&buf, btree); ++ bch_err(c, "flagging btree %s lost data", buf.buf); ++ printbuf_exit(&buf); ++ ext->btrees_lost_data |= cpu_to_le64(b); ++ } + +- mutex_lock(&c->sb_lock); +- bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); +- bch2_write_super(c); +- mutex_unlock(&c->sb_lock); ++ /* Once we have runtime self healing for topology errors we won't need this: */ ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; ++ ++ /* Btree node accounting will be off: */ ++ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ /* ++ * These are much more minor, and don't need to be corrected right away, ++ * but in debug mode we want the next fsck run to be clean: ++ */ ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; ++#endif ++ ++ switch (btree) { ++ case BTREE_ID_alloc: ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; ++ ++ __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); ++ __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); ++ __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); ++ __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); ++ __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); ++ 
__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); ++ goto out; ++ case BTREE_ID_backpointers: ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; ++ goto out; ++ case BTREE_ID_need_discard: ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; ++ goto out; ++ case BTREE_ID_freespace: ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; ++ goto out; ++ case BTREE_ID_bucket_gens: ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; ++ goto out; ++ case BTREE_ID_lru: ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; ++ goto out; ++ case BTREE_ID_accounting: ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; ++ goto out; ++ default: ++ ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; ++ goto out; + } ++out: ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++static void kill_btree(struct bch_fs *c, enum btree_id btree) ++{ ++ bch2_btree_id_root(c, btree)->alive = false; ++ bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + } + + /* for -o reconstruct_alloc: */ +@@ -79,6 +141,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) + __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); + ++ __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent); ++ + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); + 
__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); +@@ -99,16 +163,9 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +- bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, +- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); +- bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers, +- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); +- bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard, +- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); +- bch2_shoot_down_journal_keys(c, BTREE_ID_freespace, +- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); +- bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens, +- 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); ++ for (unsigned i = 0; i < btree_id_nr_alive(c); i++) ++ if (btree_id_is_alloc(i)) ++ kill_btree(c, i); + } + + /* +@@ -354,10 +411,13 @@ int bch2_journal_replay(struct bch_fs *c) + ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim + : 0), + bch2_journal_replay_key(trans, k)); +- bch_err_msg(c, ret, "while replaying key at btree %s level %u:", +- bch2_btree_id_str(k->btree_id), k->level); +- if (ret) ++ if (ret) { ++ struct printbuf buf = PRINTBUF; ++ bch2_btree_id_level_to_text(&buf, k->btree_id, k->level); ++ bch_err_msg(c, ret, "while replaying key at %s:", buf.buf); ++ printbuf_exit(&buf); + goto err; ++ } + + BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); + } +@@ -403,7 +463,9 @@ static int journal_replay_entry_early(struct bch_fs *c, + + switch (entry->type) { + case BCH_JSET_ENTRY_btree_root: { +- struct btree_root *r; ++ ++ if (unlikely(!entry->u64s)) ++ return 0; + + if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, + c, invalid_btree_id, +@@ -417,15 +479,11 @@ static int journal_replay_entry_early(struct bch_fs *c, + return ret; + } + +- r = bch2_btree_id_root(c, entry->btree_id); ++ struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); + +- if (entry->u64s) { +- 
r->level = entry->level; +- bkey_copy(&r->key, (struct bkey_i *) entry->start); +- r->error = 0; +- } else { +- r->error = -BCH_ERR_btree_node_read_error; +- } ++ r->level = entry->level; ++ bkey_copy(&r->key, (struct bkey_i *) entry->start); ++ r->error = 0; + r->alive = true; + break; + } +@@ -505,6 +563,7 @@ static int journal_replay_early(struct bch_fs *c, + + static int read_btree_roots(struct bch_fs *c) + { ++ struct printbuf buf = PRINTBUF; + int ret = 0; + + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { +@@ -513,33 +572,22 @@ static int read_btree_roots(struct bch_fs *c) + if (!r->alive) + continue; + +- if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) +- continue; ++ printbuf_reset(&buf); ++ bch2_btree_id_level_to_text(&buf, i, r->level); + + if (mustfix_fsck_err_on((ret = r->error), + c, btree_root_bkey_invalid, + "invalid btree root %s", +- bch2_btree_id_str(i)) || ++ buf.buf) || + mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), + c, btree_root_read_error, +- "error reading btree root %s l=%u: %s", +- bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { +- if (btree_id_is_alloc(i)) { +- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); +- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); +- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); +- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); +- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); +- c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); ++ "error reading btree root %s: %s", ++ buf.buf, bch2_err_str(ret))) { ++ if (btree_id_is_alloc(i)) + r->error = 0; +- } else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { +- bch_info(c, "will run btree node scan"); +- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); +- c->opts.recovery_passes |= 
BIT_ULL(BCH_RECOVERY_PASS_check_topology); +- } + +- ret = 0; +- bch2_btree_lost_data(c, i); ++ ret = bch2_btree_lost_data(c, i); ++ BUG_ON(ret); + } + } + +@@ -553,6 +601,7 @@ static int read_btree_roots(struct bch_fs *c) + } + } + fsck_err: ++ printbuf_exit(&buf); + return ret; + } + +@@ -563,6 +612,7 @@ static bool check_version_upgrade(struct bch_fs *c) + bch2_latest_compatible_version(c->sb.version)); + unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; + unsigned new_version = 0; ++ bool ret = false; + + if (old_version < bcachefs_metadata_required_upgrade_below) { + if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || +@@ -618,14 +668,32 @@ static bool check_version_upgrade(struct bch_fs *c) + } + + bch_info(c, "%s", buf.buf); ++ printbuf_exit(&buf); + +- bch2_sb_upgrade(c, new_version); ++ ret = true; ++ } + ++ if (new_version > c->sb.version_incompat && ++ c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { ++ struct printbuf buf = PRINTBUF; ++ ++ prt_str(&buf, "Now allowing incompatible features up to "); ++ bch2_version_to_text(&buf, new_version); ++ prt_str(&buf, ", previously allowed up to "); ++ bch2_version_to_text(&buf, c->sb.version_incompat_allowed); ++ prt_newline(&buf); ++ ++ bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); +- return true; ++ ++ ret = true; + } + +- return false; ++ if (ret) ++ bch2_sb_upgrade(c, new_version, ++ c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible); ++ ++ return ret; + } + + int bch2_fs_recovery(struct bch_fs *c) +@@ -660,8 +728,13 @@ int bch2_fs_recovery(struct bch_fs *c) + goto err; + } + +- if (c->opts.norecovery) +- c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; ++ if (c->opts.norecovery) { ++ c->opts.recovery_pass_last = c->opts.recovery_pass_last ++ ? 
min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) ++ : BCH_RECOVERY_PASS_snapshots_read; ++ c->opts.nochanges = true; ++ c->opts.read_only = true; ++ } + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); +@@ -708,17 +781,20 @@ int bch2_fs_recovery(struct bch_fs *c) + + c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + ++ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) { ++ SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); ++ write_sb = true; ++ } ++ + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +- if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) +- c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); +- + if (c->opts.fsck) + set_bit(BCH_FS_fsck_running, &c->flags); + if (c->sb.clean) + set_bit(BCH_FS_clean_recovery, &c->flags); ++ set_bit(BCH_FS_recovery_running, &c->flags); + + ret = bch2_blacklist_table_initialize(c); + if (ret) { +@@ -807,15 +883,15 @@ int bch2_fs_recovery(struct bch_fs *c) + c->journal_replay_seq_start = last_seq; + c->journal_replay_seq_end = blacklist_seq - 1; + +- if (c->opts.reconstruct_alloc) +- bch2_reconstruct_alloc(c); +- + zero_out_btree_mem_ptr(&c->journal_keys); + + ret = journal_replay_early(c, clean); + if (ret) + goto err; + ++ if (c->opts.reconstruct_alloc) ++ bch2_reconstruct_alloc(c); ++ + /* + * After an unclean shutdown, skip then next few journal sequence + * numbers as they may have been referenced by btree writes that +@@ -870,16 +946,17 @@ int bch2_fs_recovery(struct bch_fs *c) + */ + set_bit(BCH_FS_may_go_rw, &c->flags); + clear_bit(BCH_FS_fsck_running, &c->flags); ++ clear_bit(BCH_FS_recovery_running, &c->flags); + + /* in case we don't run journal replay, i.e. 
norecovery mode */ + set_bit(BCH_FS_accounting_replay_done, &c->flags); + ++ bch2_async_btree_node_rewrites_flush(c); ++ + /* fsync if we fixed errors */ +- if (test_bit(BCH_FS_errors_fixed, &c->flags) && +- bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) { ++ if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_meta(&c->journal); +- bch2_write_ref_put(c, BCH_WRITE_REF_fsync); + } + + /* If we fixed errors, verify that fs is actually clean now: */ +@@ -1021,7 +1098,7 @@ int bch2_fs_initialize(struct bch_fs *c) + bch2_check_version_downgrade(c); + + if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { +- bch2_sb_upgrade(c, bcachefs_metadata_version_current); ++ bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); + bch2_write_super(c); + } +@@ -1035,7 +1112,6 @@ int bch2_fs_initialize(struct bch_fs *c) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + +- c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; + set_bit(BCH_FS_btree_running, &c->flags); + set_bit(BCH_FS_may_go_rw, &c->flags); + +@@ -1076,9 +1152,6 @@ int bch2_fs_initialize(struct bch_fs *c) + if (ret) + goto err; + +- for_each_online_member(c, ca) +- ca->new_fs_bucket_idx = 0; +- + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; +@@ -1137,6 +1210,7 @@ int bch2_fs_initialize(struct bch_fs *c) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + ++ c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; + return 0; + err: + bch_err_fn(c, ret); +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +index 4bf818de1f2f..b0d55754b21b 100644 +--- a/fs/bcachefs/recovery.h ++++ b/fs/bcachefs/recovery.h +@@ -2,7 +2,7 @@ + #ifndef _BCACHEFS_RECOVERY_H + #define _BCACHEFS_RECOVERY_H + +-void bch2_btree_lost_data(struct bch_fs *, enum btree_id); ++int bch2_btree_lost_data(struct bch_fs *, enum btree_id); + + int bch2_journal_replay(struct bch_fs *); + +diff 
--git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c +index dff589ddc984..0b3c951c32da 100644 +--- a/fs/bcachefs/recovery_passes.c ++++ b/fs/bcachefs/recovery_passes.c +@@ -46,7 +46,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) + + set_bit(BCH_FS_may_go_rw, &c->flags); + +- if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) ++ if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) + return bch2_fs_read_write_early(c); + return 0; + } +@@ -100,20 +100,34 @@ u64 bch2_recovery_passes_from_stable(u64 v) + /* + * For when we need to rewind recovery passes and run a pass we skipped: + */ +-int bch2_run_explicit_recovery_pass(struct bch_fs *c, +- enum bch_recovery_pass pass) ++static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, ++ enum bch_recovery_pass pass) + { +- if (c->opts.recovery_passes & BIT_ULL(pass)) ++ if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) ++ return -BCH_ERR_not_in_recovery; ++ ++ if (c->recovery_passes_complete & BIT_ULL(pass)) + return 0; + +- bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", +- bch2_recovery_passes[pass], pass, +- bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); ++ bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); ++ ++ if (pass < BCH_RECOVERY_PASS_set_may_go_rw && ++ c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { ++ if (print) ++ bch_info(c, "need recovery pass %s (%u), but already rw", ++ bch2_recovery_passes[pass], pass); ++ return -BCH_ERR_cannot_rewind_recovery; ++ } ++ ++ if (print) ++ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", ++ bch2_recovery_passes[pass], pass, ++ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + + c->opts.recovery_passes |= BIT_ULL(pass); + +- if (c->curr_recovery_pass >= pass) { +- c->curr_recovery_pass = pass; ++ if (c->curr_recovery_pass > pass) { ++ c->next_recovery_pass = 
pass; + c->recovery_passes_complete &= (1ULL << pass) >> 1; + return -BCH_ERR_restart_recovery; + } else { +@@ -121,6 +135,27 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, + } + } + ++int bch2_run_explicit_recovery_pass(struct bch_fs *c, ++ enum bch_recovery_pass pass) ++{ ++ unsigned long flags; ++ spin_lock_irqsave(&c->recovery_pass_lock, flags); ++ int ret = __bch2_run_explicit_recovery_pass(c, pass); ++ spin_unlock_irqrestore(&c->recovery_pass_lock, flags); ++ return ret; ++} ++ ++int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, ++ enum bch_recovery_pass pass) ++{ ++ lockdep_assert_held(&c->sb_lock); ++ ++ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); ++ __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); ++ ++ return bch2_run_explicit_recovery_pass(c, pass); ++} ++ + int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, + enum bch_recovery_pass pass) + { +@@ -233,31 +268,48 @@ int bch2_run_recovery_passes(struct bch_fs *c) + */ + c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; + +- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { +- if (c->opts.recovery_pass_last && +- c->curr_recovery_pass > c->opts.recovery_pass_last) +- break; +- +- if (should_run_recovery_pass(c, c->curr_recovery_pass)) { +- unsigned pass = c->curr_recovery_pass; ++ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { ++ c->next_recovery_pass = c->curr_recovery_pass + 1; + +- ret = bch2_run_recovery_pass(c, c->curr_recovery_pass) ?: +- bch2_journal_flush(&c->journal); +- if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || +- (ret && c->curr_recovery_pass < pass)) +- continue; +- if (ret) +- break; ++ spin_lock_irq(&c->recovery_pass_lock); ++ unsigned pass = c->curr_recovery_pass; + +- c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); ++ if (c->opts.recovery_pass_last && ++ c->curr_recovery_pass > 
c->opts.recovery_pass_last) { ++ spin_unlock_irq(&c->recovery_pass_lock); ++ break; + } + +- c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); +- +- if (!test_bit(BCH_FS_error, &c->flags)) +- bch2_clear_recovery_pass_required(c, c->curr_recovery_pass); +- +- c->curr_recovery_pass++; ++ if (!should_run_recovery_pass(c, pass)) { ++ c->curr_recovery_pass++; ++ c->recovery_pass_done = max(c->recovery_pass_done, pass); ++ spin_unlock_irq(&c->recovery_pass_lock); ++ continue; ++ } ++ spin_unlock_irq(&c->recovery_pass_lock); ++ ++ ret = bch2_run_recovery_pass(c, pass) ?: ++ bch2_journal_flush(&c->journal); ++ ++ if (!ret && !test_bit(BCH_FS_error, &c->flags)) ++ bch2_clear_recovery_pass_required(c, pass); ++ ++ spin_lock_irq(&c->recovery_pass_lock); ++ if (c->next_recovery_pass < c->curr_recovery_pass) { ++ /* ++ * bch2_run_explicit_recovery_pass() was called: we ++ * can't always catch -BCH_ERR_restart_recovery because ++ * it may have been called from another thread (btree ++ * node read completion) ++ */ ++ ret = 0; ++ c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); ++ } else { ++ c->recovery_passes_complete |= BIT_ULL(pass); ++ c->recovery_pass_done = max(c->recovery_pass_done, pass); ++ } ++ c->curr_recovery_pass = c->next_recovery_pass; ++ spin_unlock_irq(&c->recovery_pass_lock); + } + + return ret; +diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h +index 99b464e127b8..7d7339c8fa29 100644 +--- a/fs/bcachefs/recovery_passes.h ++++ b/fs/bcachefs/recovery_passes.h +@@ -9,6 +9,7 @@ u64 bch2_recovery_passes_from_stable(u64 v); + u64 bch2_fsck_recovery_passes(void); + + int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); ++int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass); + int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); + + int bch2_run_online_recovery_passes(struct bch_fs *); +diff --git 
a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h +index 94dc20ca2065..418557960ed6 100644 +--- a/fs/bcachefs/recovery_passes_types.h ++++ b/fs/bcachefs/recovery_passes_types.h +@@ -8,53 +8,59 @@ + #define PASS_ALWAYS BIT(3) + #define PASS_ONLINE BIT(4) + ++#ifdef CONFIG_BCACHEFS_DEBUG ++#define PASS_FSCK_DEBUG BIT(1) ++#else ++#define PASS_FSCK_DEBUG 0 ++#endif ++ + /* + * Passes may be reordered, but the second field is a persistent identifier and + * must never change: + */ +-#define BCH_RECOVERY_PASSES() \ +- x(recovery_pass_empty, 41, PASS_SILENT) \ +- x(scan_for_btree_nodes, 37, 0) \ +- x(check_topology, 4, 0) \ +- x(accounting_read, 39, PASS_ALWAYS) \ +- x(alloc_read, 0, PASS_ALWAYS) \ +- x(stripes_read, 1, PASS_ALWAYS) \ +- x(initialize_subvolumes, 2, 0) \ +- x(snapshots_read, 3, PASS_ALWAYS) \ +- x(check_allocations, 5, PASS_FSCK) \ +- x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ +- x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ +- x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ +- x(journal_replay, 9, PASS_ALWAYS) \ +- x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ +- x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ +- x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ +- x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \ +- x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ +- x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ +- x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ +- x(bucket_gens_init, 17, 0) \ +- x(reconstruct_snapshots, 38, 0) \ +- x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ +- x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ +- x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ +- x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ +- x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ +- x(fs_upgrade_for_subvolumes, 22, 0) \ +- x(check_inodes, 24, PASS_FSCK) \ +- x(check_extents, 25, PASS_FSCK) \ +- x(check_indirect_extents, 26, PASS_FSCK) \ +- x(check_dirents, 
27, PASS_FSCK) \ +- x(check_xattrs, 28, PASS_FSCK) \ +- x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ +- x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ +- x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ +- x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ +- x(check_nlinks, 31, PASS_FSCK) \ +- x(resume_logged_ops, 23, PASS_ALWAYS) \ +- x(delete_dead_inodes, 32, PASS_ALWAYS) \ +- x(fix_reflink_p, 33, 0) \ +- x(set_fs_needs_rebalance, 34, 0) \ ++#define BCH_RECOVERY_PASSES() \ ++ x(recovery_pass_empty, 41, PASS_SILENT) \ ++ x(scan_for_btree_nodes, 37, 0) \ ++ x(check_topology, 4, 0) \ ++ x(accounting_read, 39, PASS_ALWAYS) \ ++ x(alloc_read, 0, PASS_ALWAYS) \ ++ x(stripes_read, 1, PASS_ALWAYS) \ ++ x(initialize_subvolumes, 2, 0) \ ++ x(snapshots_read, 3, PASS_ALWAYS) \ ++ x(check_allocations, 5, PASS_FSCK) \ ++ x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ ++ x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ ++ x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ ++ x(journal_replay, 9, PASS_ALWAYS) \ ++ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ ++ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ ++ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ ++ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ ++ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ ++ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ ++ x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ ++ x(bucket_gens_init, 17, 0) \ ++ x(reconstruct_snapshots, 38, 0) \ ++ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ ++ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ ++ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ ++ x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ ++ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ ++ x(fs_upgrade_for_subvolumes, 22, 0) \ ++ x(check_inodes, 24, PASS_FSCK) \ ++ x(check_extents, 25, PASS_FSCK) \ ++ x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ ++ x(check_dirents, 27, PASS_FSCK) \ ++ 
x(check_xattrs, 28, PASS_FSCK) \ ++ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ ++ x(check_unreachable_inodes, 40, PASS_FSCK) \ ++ x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ ++ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ ++ x(check_nlinks, 31, PASS_FSCK) \ ++ x(resume_logged_ops, 23, PASS_ALWAYS) \ ++ x(delete_dead_inodes, 32, PASS_ALWAYS) \ ++ x(fix_reflink_p, 33, 0) \ ++ x(set_fs_needs_rebalance, 34, 0) + + /* We normally enumerate recovery passes in the order we run them: */ + enum bch_recovery_pass { +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +index f457925fa362..93ba4f4e47ca 100644 +--- a/fs/bcachefs/reflink.c ++++ b/fs/bcachefs/reflink.c +@@ -15,6 +15,17 @@ + + #include + ++static inline bool bkey_extent_is_reflink_data(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_reflink_v: ++ case KEY_TYPE_indirect_inline_data: ++ return true; ++ default: ++ return false; ++ } ++} ++ + static inline unsigned bkey_type_to_indirect(const struct bkey *k) + { + switch (k->type) { +@@ -30,15 +41,15 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) + /* reflink pointers */ + + int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + int ret = 0; + +- bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), ++ bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad), + c, reflink_p_front_pad_bad, + "idx < front_pad (%llu < %u)", +- le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); ++ REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad)); + fsck_err: + return ret; + } +@@ -49,7 +60,7 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + prt_printf(out, "idx %llu front_pad %u back_pad %u", +- le64_to_cpu(p.v->idx), ++ REFLINK_P_IDX(p.v), + 
le32_to_cpu(p.v->front_pad), + le32_to_cpu(p.v->back_pad)); + } +@@ -65,49 +76,250 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r + */ + return false; + +- if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) ++ if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v)) ++ return false; ++ ++ if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v)) + return false; + + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; + } + ++/* indirect extents */ ++ ++int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, ++ struct bkey_validate_context from) ++{ ++ int ret = 0; ++ ++ bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), ++ c, reflink_v_pos_bad, ++ "indirect extent above maximum position 0:%llu", ++ REFLINK_P_IDX_MAX); ++ ++ ret = bch2_bkey_ptrs_validate(c, k, from); ++fsck_err: ++ return ret; ++} ++ ++void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); ++ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++#if 0 ++Currently disabled, needs to be debugged: ++ ++bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); ++ ++ return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); ++} ++#endif ++ ++/* indirect inline data */ ++ ++int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, ++ struct bkey_validate_context from) ++{ ++ return 0; ++} ++ ++void bch2_indirect_inline_data_to_text(struct printbuf *out, ++ struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); ++ unsigned datalen = bkey_inline_data_bytes(k.k); ++ ++ prt_printf(out, "refcount %llu datalen %u: %*phN", ++ 
le64_to_cpu(d.v->refcount), datalen, ++ min(datalen, 32U), d.v->data); ++} ++ ++/* lookup */ ++ ++static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p, ++ bool should_commit) ++{ ++ struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); ++ int ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ return ret; ++ ++ SET_REFLINK_P_ERROR(&new->v, false); ++ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); ++ if (ret) ++ return ret; ++ ++ if (!should_commit) ++ return 0; ++ ++ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: ++ -BCH_ERR_transaction_restart_nested; ++} ++ ++static int bch2_indirect_extent_missing_error(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, ++ u64 missing_start, u64 missing_end, ++ bool should_commit) ++{ ++ if (REFLINK_P_ERROR(p.v)) ++ return -BCH_ERR_missing_indirect_extent; ++ ++ struct bch_fs *c = trans->c; ++ u64 live_start = REFLINK_P_IDX(p.v); ++ u64 live_end = REFLINK_P_IDX(p.v) + p.k->size; ++ u64 refd_start = live_start - le32_to_cpu(p.v->front_pad); ++ u64 refd_end = live_end + le32_to_cpu(p.v->back_pad); ++ struct printbuf buf = PRINTBUF; ++ int ret = 0; ++ ++ BUG_ON(missing_start < refd_start); ++ BUG_ON(missing_end > refd_end); ++ ++ if (fsck_err(trans, reflink_p_to_missing_reflink_v, ++ "pointer to missing indirect extent\n" ++ " %s\n" ++ " missing range %llu-%llu", ++ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), ++ missing_start, missing_end)) { ++ struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ goto err; ++ ++ /* ++ * Is the missing range not actually needed? ++ * ++ * p.v->idx refers to the data that we actually want, but if the ++ * indirect extent we point to was bigger, front_pad and back_pad ++ * indicate the range we took a reference on. 
++ */ ++ ++ if (missing_end <= live_start) { ++ new->v.front_pad = cpu_to_le32(live_start - missing_end); ++ } else if (missing_start >= live_end) { ++ new->v.back_pad = cpu_to_le32(missing_start - live_end); ++ } else { ++ struct bpos new_start = bkey_start_pos(&new->k); ++ struct bpos new_end = new->k.p; ++ ++ if (missing_start > live_start) ++ new_start.offset += missing_start - live_start; ++ if (missing_end < live_end) ++ new_end.offset -= live_end - missing_end; ++ ++ bch2_cut_front(new_start, &new->k_i); ++ bch2_cut_back(new_end, &new->k_i); ++ ++ SET_REFLINK_P_ERROR(&new->v, true); ++ } ++ ++ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); ++ if (ret) ++ goto err; ++ ++ if (should_commit) ++ ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: ++ -BCH_ERR_transaction_restart_nested; ++ } ++err: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++/* ++ * This is used from the read path, which doesn't expect to have to do a ++ * transaction commit, and from triggers, which should not be doing a commit: ++ */ ++struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, ++ struct btree_iter *iter, ++ s64 *offset_into_extent, ++ struct bkey_s_c_reflink_p p, ++ bool should_commit, ++ unsigned iter_flags) ++{ ++ BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad))); ++ BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad)); ++ ++ u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent; ++ ++ struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink, ++ POS(0, reflink_offset), iter_flags); ++ if (bkey_err(k)) ++ return k; ++ ++ if (unlikely(!bkey_extent_is_reflink_data(k.k))) { ++ bch2_trans_iter_exit(trans, iter); ++ ++ unsigned size = min((u64) k.k->size, ++ REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - ++ reflink_offset); ++ bch2_key_resize(&iter->k, size); ++ ++ int ret = bch2_indirect_extent_missing_error(trans, p, 
reflink_offset, ++ k.k->p.offset, should_commit); ++ if (ret) ++ return bkey_s_c_err(ret); ++ } else if (unlikely(REFLINK_P_ERROR(p.v))) { ++ bch2_trans_iter_exit(trans, iter); ++ ++ int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); ++ if (ret) ++ return bkey_s_c_err(ret); ++ } ++ ++ *offset_into_extent = reflink_offset - bkey_start_offset(k.k); ++ return k; ++} ++ ++/* reflink pointer trigger */ ++ + static int trans_trigger_reflink_p_segment(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, u64 *idx, + enum btree_iter_update_trigger_flags flags) + { + struct bch_fs *c = trans->c; +- struct btree_iter iter; +- struct bkey_i *k; +- __le64 *refcount; +- int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; + struct printbuf buf = PRINTBUF; +- int ret; + +- k = bch2_bkey_get_mut_noupdate(trans, &iter, +- BTREE_ID_reflink, POS(0, *idx), +- BTREE_ITER_with_updates); +- ret = PTR_ERR_OR_ZERO(k); ++ s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v); ++ struct btree_iter iter; ++ struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false, ++ BTREE_ITER_intent| ++ BTREE_ITER_with_updates); ++ int ret = bkey_err(k); + if (ret) +- goto err; ++ return ret; + +- refcount = bkey_refcount(bkey_i_to_s(k)); +- if (!refcount) { +- bch2_bkey_val_to_text(&buf, c, p.s_c); +- bch2_trans_inconsistent(trans, +- "nonexistent indirect extent at %llu while marking\n %s", +- *idx, buf.buf); +- ret = -EIO; +- goto err; ++ if (bkey_deleted(k.k)) { ++ if (!(flags & BTREE_TRIGGER_overwrite)) ++ ret = -BCH_ERR_missing_indirect_extent; ++ goto next; + } + ++ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ goto err; ++ ++ __le64 *refcount = bkey_refcount(bkey_i_to_s(new)); + if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { + bch2_bkey_val_to_text(&buf, c, p.s_c); +- bch2_trans_inconsistent(trans, +- "indirect extent refcount underflow at %llu while marking\n %s", +- *idx, 
buf.buf); +- ret = -EIO; +- goto err; ++ prt_printf(&buf, "\n "); ++ bch2_bkey_val_to_text(&buf, c, k); ++ log_fsck_err(trans, reflink_refcount_underflow, ++ "indirect extent refcount underflow while marking\n %s", ++ buf.buf); ++ goto next; + } + + if (flags & BTREE_TRIGGER_insert) { +@@ -115,25 +327,26 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, + u64 pad; + + pad = max_t(s64, le32_to_cpu(v->front_pad), +- le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); ++ REFLINK_P_IDX(v) - bkey_start_offset(&new->k)); + BUG_ON(pad > U32_MAX); + v->front_pad = cpu_to_le32(pad); + + pad = max_t(s64, le32_to_cpu(v->back_pad), +- k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); ++ new->k.p.offset - p.k->size - REFLINK_P_IDX(v)); + BUG_ON(pad > U32_MAX); + v->back_pad = cpu_to_le32(pad); + } + +- le64_add_cpu(refcount, add); ++ le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1); + + bch2_btree_iter_set_pos_to_extent_start(&iter); +- ret = bch2_trans_update(trans, &iter, k, 0); ++ ret = bch2_trans_update(trans, &iter, new, 0); + if (ret) + goto err; +- +- *idx = k->k.p.offset; ++next: ++ *idx = k.k->p.offset; + err: ++fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +@@ -147,9 +360,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, + struct bch_fs *c = trans->c; + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_overwrite) ? 
1 : -1; +- u64 start = le64_to_cpu(p.v->idx); +- u64 end = le64_to_cpu(p.v->idx) + p.k->size; +- u64 next_idx = end + le32_to_cpu(p.v->back_pad); ++ u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); + s64 ret = 0; + struct printbuf buf = PRINTBUF; + +@@ -168,36 +379,14 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, + *idx = r->offset; + return 0; + not_found: +- BUG_ON(!(flags & BTREE_TRIGGER_check_repair)); +- +- if (fsck_err(trans, reflink_p_to_missing_reflink_v, +- "pointer to missing indirect extent\n" +- " %s\n" +- " missing range %llu-%llu", +- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), +- *idx, next_idx)) { +- struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c); +- ret = PTR_ERR_OR_ZERO(update); ++ if (flags & BTREE_TRIGGER_check_repair) { ++ ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); + if (ret) + goto err; +- +- if (next_idx <= start) { +- bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx); +- } else if (*idx >= end) { +- bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end); +- } else { +- bkey_error_init(update); +- update->k.p = p.k->p; +- update->k.size = p.k->size; +- set_bkey_val_u64s(&update->k, 0); +- } +- +- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_norun); + } + + *idx = next_idx; + err: +-fsck_err: + printbuf_exit(&buf); + return ret; + } +@@ -210,8 +399,8 @@ static int __trigger_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + int ret = 0; + +- u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); +- u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); ++ u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); ++ u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); + + if (flags & BTREE_TRIGGER_transactional) { + while (idx < end && !ret) +@@ -253,35 +442,7 @@ int 
bch2_trigger_reflink_p(struct btree_trans *trans, + return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags); + } + +-/* indirect extents */ +- +-int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) +-{ +- return bch2_bkey_ptrs_validate(c, k, flags); +-} +- +-void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, +- struct bkey_s_c k) +-{ +- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); +- +- prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); +- +- bch2_bkey_ptrs_to_text(out, c, k); +-} +- +-#if 0 +-Currently disabled, needs to be debugged: +- +-bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +-{ +- struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); +- struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); +- +- return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +-} +-#endif ++/* indirect extent trigger */ + + static inline void + check_indirect_extent_deleting(struct bkey_s new, +@@ -307,25 +468,6 @@ int bch2_trigger_reflink_v(struct btree_trans *trans, + return bch2_trigger_extent(trans, btree_id, level, old, new, flags); + } + +-/* indirect inline data */ +- +-int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) +-{ +- return 0; +-} +- +-void bch2_indirect_inline_data_to_text(struct printbuf *out, +- struct bch_fs *c, struct bkey_s_c k) +-{ +- struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); +- unsigned datalen = bkey_inline_data_bytes(k.k); +- +- prt_printf(out, "refcount %llu datalen %u: %*phN", +- le64_to_cpu(d.v->refcount), datalen, +- min(datalen, 32U), d.v->data); +-} +- + int bch2_trigger_indirect_inline_data(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, +@@ -336,9 +478,12 @@ int bch2_trigger_indirect_inline_data(struct 
btree_trans *trans, + return 0; + } + ++/* create */ ++ + static int bch2_make_extent_indirect(struct btree_trans *trans, + struct btree_iter *extent_iter, +- struct bkey_i *orig) ++ struct bkey_i *orig, ++ bool reflink_p_may_update_opts_field) + { + struct bch_fs *c = trans->c; + struct btree_iter reflink_iter = { NULL }; +@@ -358,6 +503,14 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + if (ret) + goto err; + ++ /* ++ * XXX: we're assuming that 56 bits will be enough for the life of the ++ * filesystem: we need to implement wraparound, with a cursor in the ++ * logged ops btree: ++ */ ++ if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size))) ++ return -ENOSPC; ++ + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); + ret = PTR_ERR_OR_ZERO(r_v); + if (ret) +@@ -394,7 +547,10 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, + memset(&r_p->v, 0, sizeof(r_p->v)); + #endif + +- r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ++ SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k)); ++ ++ if (reflink_p_may_update_opts_field) ++ SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true); + + ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, + BTREE_UPDATE_internal_snapshot_node); +@@ -409,7 +565,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) + struct bkey_s_c k; + int ret; + +- for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) { ++ for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) { + if (bkey_extent_is_unwritten(k)) + continue; + +@@ -426,7 +582,8 @@ s64 bch2_remap_range(struct bch_fs *c, + subvol_inum dst_inum, u64 dst_offset, + subvol_inum src_inum, u64 src_offset, + u64 remap_sectors, +- u64 new_i_size, s64 *i_sectors_delta) ++ u64 new_i_size, s64 *i_sectors_delta, ++ bool may_change_src_io_path_opts) + { + struct btree_trans *trans; + struct btree_iter dst_iter, src_iter; +@@ -439,6 +596,8 @@ s64 bch2_remap_range(struct 
bch_fs *c, + struct bpos src_want; + u64 dst_done = 0; + u32 dst_snapshot, src_snapshot; ++ bool reflink_p_may_update_opts_field = ++ bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); + int ret = 0, ret2 = 0; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) +@@ -520,7 +679,8 @@ s64 bch2_remap_range(struct bch_fs *c, + src_k = bkey_i_to_s_c(new_src.k); + + ret = bch2_make_extent_indirect(trans, &src_iter, +- new_src.k); ++ new_src.k, ++ reflink_p_may_update_opts_field); + if (ret) + continue; + +@@ -533,11 +693,15 @@ s64 bch2_remap_range(struct bch_fs *c, + struct bkey_i_reflink_p *dst_p = + bkey_reflink_p_init(new_dst.k); + +- u64 offset = le64_to_cpu(src_p.v->idx) + ++ u64 offset = REFLINK_P_IDX(src_p.v) + + (src_want.offset - + bkey_start_offset(src_k.k)); + +- dst_p->v.idx = cpu_to_le64(offset); ++ SET_REFLINK_P_IDX(&dst_p->v, offset); ++ ++ if (reflink_p_may_update_opts_field && ++ may_change_src_io_path_opts) ++ SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); + } else { + BUG(); + } +@@ -547,7 +711,7 @@ s64 bch2_remap_range(struct bch_fs *c, + min(src_k.k->p.offset - src_want.offset, + dst_end.offset - dst_iter.pos.offset)); + +- ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?: ++ ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?: + bch2_extent_update(trans, dst_inum, &dst_iter, + new_dst.k, &disk_res, + new_i_size, i_sectors_delta, +@@ -591,3 +755,97 @@ s64 bch2_remap_range(struct bch_fs *c, + + return dst_done ?: ret ?: ret2; + } ++ ++/* fsck */ ++ ++static int bch2_gc_write_reflink_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ size_t *idx) ++{ ++ struct bch_fs *c = trans->c; ++ const __le64 *refcount = bkey_refcount_c(k); ++ struct printbuf buf = PRINTBUF; ++ struct reflink_gc *r; ++ int ret = 0; ++ ++ if (!refcount) ++ return 0; ++ ++ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && ++ r->offset < k.k->p.offset) ++ ++*idx; ++ ++ if (!r || ++ 
r->offset != k.k->p.offset || ++ r->size != k.k->size) { ++ bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); ++ return -EINVAL; ++ } ++ ++ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), ++ trans, reflink_v_refcount_wrong, ++ "reflink key has wrong refcount:\n" ++ " %s\n" ++ " should be %u", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), ++ r->refcount)) { ++ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ++ ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ goto out; ++ ++ if (!r->refcount) ++ new->k.type = KEY_TYPE_deleted; ++ else ++ *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); ++ ret = bch2_trans_update(trans, iter, new, 0); ++ } ++out: ++fsck_err: ++ printbuf_exit(&buf); ++ return ret; ++} ++ ++int bch2_gc_reflink_done(struct bch_fs *c) ++{ ++ size_t idx = 0; ++ ++ int ret = bch2_trans_run(c, ++ for_each_btree_key_commit(trans, iter, ++ BTREE_ID_reflink, POS_MIN, ++ BTREE_ITER_prefetch, k, ++ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ++ bch2_gc_write_reflink_key(trans, &iter, k, &idx))); ++ c->reflink_gc_nr = 0; ++ return ret; ++} ++ ++int bch2_gc_reflink_start(struct bch_fs *c) ++{ ++ c->reflink_gc_nr = 0; ++ ++ int ret = bch2_trans_run(c, ++ for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, ++ BTREE_ITER_prefetch, k, ({ ++ const __le64 *refcount = bkey_refcount_c(k); ++ ++ if (!refcount) ++ continue; ++ ++ struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, ++ c->reflink_gc_nr++, GFP_KERNEL); ++ if (!r) { ++ ret = -BCH_ERR_ENOMEM_gc_reflink_start; ++ break; ++ } ++ ++ r->offset = k.k->p.offset; ++ r->size = k.k->size; ++ r->refcount = 0; ++ 0; ++ }))); ++ ++ bch_err_fn(c, ret); ++ return ret; ++} +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +index 51afe11d8ed6..1632780bdf18 100644 +--- a/fs/bcachefs/reflink.h ++++ b/fs/bcachefs/reflink.h +@@ -2,9 +2,8 @@ + #ifndef _BCACHEFS_REFLINK_H + #define _BCACHEFS_REFLINK_H + +-enum bch_validate_flags; +- +-int 
bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); ++int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); + void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, +@@ -19,7 +18,8 @@ int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, + .min_val_size = 16, \ + }) + +-int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); ++int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); + void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, +@@ -34,7 +34,7 @@ int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, + }) + + int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, +- enum bch_validate_flags); ++ struct bkey_validate_context); + void bch2_indirect_inline_data_to_text(struct printbuf *, + struct bch_fs *, struct bkey_s_c); + int bch2_trigger_indirect_inline_data(struct btree_trans *, +@@ -73,7 +73,15 @@ static inline __le64 *bkey_refcount(struct bkey_s k) + } + } + ++struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *, ++ s64 *, struct bkey_s_c_reflink_p, ++ bool, unsigned); ++ + s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, +- subvol_inum, u64, u64, u64, s64 *); ++ subvol_inum, u64, u64, u64, s64 *, ++ bool); ++ ++int bch2_gc_reflink_done(struct bch_fs *); ++int bch2_gc_reflink_start(struct bch_fs *); + + #endif /* _BCACHEFS_REFLINK_H */ +diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h +index 6772eebb1fc6..92995e4f898e 100644 +--- a/fs/bcachefs/reflink_format.h ++++ 
b/fs/bcachefs/reflink_format.h +@@ -4,7 +4,7 @@ + + struct bch_reflink_p { + struct bch_val v; +- __le64 idx; ++ __le64 idx_flags; + /* + * A reflink pointer might point to an indirect extent which is then + * later split (by copygc or rebalance). If we only pointed to part of +@@ -17,6 +17,11 @@ struct bch_reflink_p { + __le32 back_pad; + } __packed __aligned(8); + ++LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); ++LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57); ++LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS, ++ struct bch_reflink_p, idx_flags, 57, 58); ++ + struct bch_reflink_v { + struct bch_val v; + __le64 refcount; +diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c +index 005275281804..59c8770e4a0e 100644 +--- a/fs/bcachefs/sb-clean.c ++++ b/fs/bcachefs/sb-clean.c +@@ -23,6 +23,10 @@ + int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, + int write) + { ++ struct bkey_validate_context from = { ++ .flags = write, ++ .from = BKEY_VALIDATE_superblock, ++ }; + struct jset_entry *entry; + int ret; + +@@ -40,7 +44,7 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle + ret = bch2_journal_entry_validate(c, NULL, entry, + le16_to_cpu(c->disk_sb.sb->version), + BCH_SB_BIG_ENDIAN(c->disk_sb.sb), +- write); ++ from); + if (ret) + return ret; + } +diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h +index 62ea478215d0..fdcf598f08b1 100644 +--- a/fs/bcachefs/sb-counters_format.h ++++ b/fs/bcachefs/sb-counters_format.h +@@ -2,86 +2,91 @@ + #ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H + #define _BCACHEFS_SB_COUNTERS_FORMAT_H + +-#define BCH_PERSISTENT_COUNTERS() \ +- x(io_read, 0) \ +- x(io_write, 1) \ +- x(io_move, 2) \ +- x(bucket_invalidate, 3) \ +- x(bucket_discard, 4) \ +- x(bucket_alloc, 5) \ +- x(bucket_alloc_fail, 6) \ +- x(btree_cache_scan, 7) \ +- x(btree_cache_reap, 8) \ +- x(btree_cache_cannibalize, 9) \ +- 
x(btree_cache_cannibalize_lock, 10) \ +- x(btree_cache_cannibalize_lock_fail, 11) \ +- x(btree_cache_cannibalize_unlock, 12) \ +- x(btree_node_write, 13) \ +- x(btree_node_read, 14) \ +- x(btree_node_compact, 15) \ +- x(btree_node_merge, 16) \ +- x(btree_node_split, 17) \ +- x(btree_node_rewrite, 18) \ +- x(btree_node_alloc, 19) \ +- x(btree_node_free, 20) \ +- x(btree_node_set_root, 21) \ +- x(btree_path_relock_fail, 22) \ +- x(btree_path_upgrade_fail, 23) \ +- x(btree_reserve_get_fail, 24) \ +- x(journal_entry_full, 25) \ +- x(journal_full, 26) \ +- x(journal_reclaim_finish, 27) \ +- x(journal_reclaim_start, 28) \ +- x(journal_write, 29) \ +- x(read_promote, 30) \ +- x(read_bounce, 31) \ +- x(read_split, 33) \ +- x(read_retry, 32) \ +- x(read_reuse_race, 34) \ +- x(move_extent_read, 35) \ +- x(move_extent_write, 36) \ +- x(move_extent_finish, 37) \ +- x(move_extent_fail, 38) \ +- x(move_extent_start_fail, 39) \ +- x(copygc, 40) \ +- x(copygc_wait, 41) \ +- x(gc_gens_end, 42) \ +- x(gc_gens_start, 43) \ +- x(trans_blocked_journal_reclaim, 44) \ +- x(trans_restart_btree_node_reused, 45) \ +- x(trans_restart_btree_node_split, 46) \ +- x(trans_restart_fault_inject, 47) \ +- x(trans_restart_iter_upgrade, 48) \ +- x(trans_restart_journal_preres_get, 49) \ +- x(trans_restart_journal_reclaim, 50) \ +- x(trans_restart_journal_res_get, 51) \ +- x(trans_restart_key_cache_key_realloced, 52) \ +- x(trans_restart_key_cache_raced, 53) \ +- x(trans_restart_mark_replicas, 54) \ +- x(trans_restart_mem_realloced, 55) \ +- x(trans_restart_memory_allocation_failure, 56) \ +- x(trans_restart_relock, 57) \ +- x(trans_restart_relock_after_fill, 58) \ +- x(trans_restart_relock_key_cache_fill, 59) \ +- x(trans_restart_relock_next_node, 60) \ +- x(trans_restart_relock_parent_for_fill, 61) \ +- x(trans_restart_relock_path, 62) \ +- x(trans_restart_relock_path_intent, 63) \ +- x(trans_restart_too_many_iters, 64) \ +- x(trans_restart_traverse, 65) \ +- x(trans_restart_upgrade, 66) \ +- 
x(trans_restart_would_deadlock, 67) \ +- x(trans_restart_would_deadlock_write, 68) \ +- x(trans_restart_injected, 69) \ +- x(trans_restart_key_cache_upgrade, 70) \ +- x(trans_traverse_all, 71) \ +- x(transaction_commit, 72) \ +- x(write_super, 73) \ +- x(trans_restart_would_deadlock_recursion_limit, 74) \ +- x(trans_restart_write_buffer_flush, 75) \ +- x(trans_restart_split_race, 76) \ +- x(write_buffer_flush_slowpath, 77) \ +- x(write_buffer_flush_sync, 78) ++enum counters_flags { ++ TYPE_COUNTER = BIT(0), /* event counters */ ++ TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */ ++}; ++ ++#define BCH_PERSISTENT_COUNTERS() \ ++ x(io_read, 0, TYPE_SECTORS) \ ++ x(io_write, 1, TYPE_SECTORS) \ ++ x(io_move, 2, TYPE_SECTORS) \ ++ x(bucket_invalidate, 3, TYPE_COUNTER) \ ++ x(bucket_discard, 4, TYPE_COUNTER) \ ++ x(bucket_alloc, 5, TYPE_COUNTER) \ ++ x(bucket_alloc_fail, 6, TYPE_COUNTER) \ ++ x(btree_cache_scan, 7, TYPE_COUNTER) \ ++ x(btree_cache_reap, 8, TYPE_COUNTER) \ ++ x(btree_cache_cannibalize, 9, TYPE_COUNTER) \ ++ x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \ ++ x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \ ++ x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \ ++ x(btree_node_write, 13, TYPE_COUNTER) \ ++ x(btree_node_read, 14, TYPE_COUNTER) \ ++ x(btree_node_compact, 15, TYPE_COUNTER) \ ++ x(btree_node_merge, 16, TYPE_COUNTER) \ ++ x(btree_node_split, 17, TYPE_COUNTER) \ ++ x(btree_node_rewrite, 18, TYPE_COUNTER) \ ++ x(btree_node_alloc, 19, TYPE_COUNTER) \ ++ x(btree_node_free, 20, TYPE_COUNTER) \ ++ x(btree_node_set_root, 21, TYPE_COUNTER) \ ++ x(btree_path_relock_fail, 22, TYPE_COUNTER) \ ++ x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \ ++ x(btree_reserve_get_fail, 24, TYPE_COUNTER) \ ++ x(journal_entry_full, 25, TYPE_COUNTER) \ ++ x(journal_full, 26, TYPE_COUNTER) \ ++ x(journal_reclaim_finish, 27, TYPE_COUNTER) \ ++ x(journal_reclaim_start, 28, TYPE_COUNTER) \ ++ x(journal_write, 29, TYPE_COUNTER) \ ++ x(read_promote, 
30, TYPE_COUNTER) \ ++ x(read_bounce, 31, TYPE_COUNTER) \ ++ x(read_split, 33, TYPE_COUNTER) \ ++ x(read_retry, 32, TYPE_COUNTER) \ ++ x(read_reuse_race, 34, TYPE_COUNTER) \ ++ x(move_extent_read, 35, TYPE_SECTORS) \ ++ x(move_extent_write, 36, TYPE_SECTORS) \ ++ x(move_extent_finish, 37, TYPE_SECTORS) \ ++ x(move_extent_fail, 38, TYPE_COUNTER) \ ++ x(move_extent_start_fail, 39, TYPE_COUNTER) \ ++ x(copygc, 40, TYPE_COUNTER) \ ++ x(copygc_wait, 41, TYPE_COUNTER) \ ++ x(gc_gens_end, 42, TYPE_COUNTER) \ ++ x(gc_gens_start, 43, TYPE_COUNTER) \ ++ x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \ ++ x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \ ++ x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \ ++ x(trans_restart_fault_inject, 47, TYPE_COUNTER) \ ++ x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \ ++ x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \ ++ x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \ ++ x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \ ++ x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \ ++ x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \ ++ x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \ ++ x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \ ++ x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \ ++ x(trans_restart_relock, 57, TYPE_COUNTER) \ ++ x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \ ++ x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \ ++ x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \ ++ x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \ ++ x(trans_restart_relock_path, 62, TYPE_COUNTER) \ ++ x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \ ++ x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \ ++ x(trans_restart_traverse, 65, TYPE_COUNTER) \ ++ x(trans_restart_upgrade, 66, TYPE_COUNTER) \ ++ x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \ ++ x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \ ++ x(trans_restart_injected, 69, 
TYPE_COUNTER) \ ++ x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \ ++ x(trans_traverse_all, 71, TYPE_COUNTER) \ ++ x(transaction_commit, 72, TYPE_COUNTER) \ ++ x(write_super, 73, TYPE_COUNTER) \ ++ x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \ ++ x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \ ++ x(trans_restart_split_race, 76, TYPE_COUNTER) \ ++ x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \ ++ x(write_buffer_flush_sync, 78, TYPE_COUNTER) + + enum bch_persistent_counters { + #define x(t, n, ...) BCH_COUNTER_##t, +diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c +index 8767c33c2b51..14f6b6a5fb38 100644 +--- a/fs/bcachefs/sb-downgrade.c ++++ b/fs/bcachefs/sb-downgrade.c +@@ -81,7 +81,19 @@ + BCH_FSCK_ERR_accounting_mismatch) \ + x(inode_has_child_snapshots, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ +- BCH_FSCK_ERR_inode_has_child_snapshots_wrong) ++ BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \ ++ x(backpointer_bucket_gen, \ ++ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ ++ BCH_FSCK_ERR_backpointer_to_missing_ptr, \ ++ BCH_FSCK_ERR_ptr_to_missing_backpointer) \ ++ x(disk_accounting_big_endian, \ ++ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ ++ BCH_FSCK_ERR_accounting_mismatch, \ ++ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ ++ BCH_FSCK_ERR_accounting_key_junk_at_end) \ ++ x(directory_size, \ ++ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ ++ BCH_FSCK_ERR_directory_size_mismatch) \ + + #define DOWNGRADE_TABLE() \ + x(bucket_stripe_sectors, \ +@@ -117,7 +129,19 @@ + BCH_FSCK_ERR_bkey_version_in_future) \ + x(rebalance_work_acct_fix, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ +- BCH_FSCK_ERR_accounting_mismatch) ++ BCH_FSCK_ERR_accounting_mismatch, \ ++ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ ++ BCH_FSCK_ERR_accounting_key_junk_at_end) \ ++ x(backpointer_bucket_gen, \ ++ BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ ++ 
BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \ ++ BCH_FSCK_ERR_backpointer_to_missing_ptr, \ ++ BCH_FSCK_ERR_ptr_to_missing_backpointer) \ ++ x(disk_accounting_big_endian, \ ++ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ ++ BCH_FSCK_ERR_accounting_mismatch, \ ++ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ ++ BCH_FSCK_ERR_accounting_key_junk_at_end) + + struct upgrade_downgrade_entry { + u64 recovery_passes; +diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h +index 9feb6739f77a..ea0a18364751 100644 +--- a/fs/bcachefs/sb-errors_format.h ++++ b/fs/bcachefs/sb-errors_format.h +@@ -5,9 +5,8 @@ + enum bch_fsck_flags { + FSCK_CAN_FIX = 1 << 0, + FSCK_CAN_IGNORE = 1 << 1, +- FSCK_NEED_FSCK = 1 << 2, +- FSCK_NO_RATELIMIT = 1 << 3, +- FSCK_AUTOFIX = 1 << 4, ++ FSCK_NO_RATELIMIT = 1 << 2, ++ FSCK_AUTOFIX = 1 << 3, + }; + + #define BCH_SB_ERRS() \ +@@ -58,8 +57,8 @@ enum bch_fsck_flags { + x(bset_wrong_sector_offset, 44, 0) \ + x(bset_empty, 45, 0) \ + x(bset_bad_seq, 46, 0) \ +- x(bset_blacklisted_journal_seq, 47, 0) \ +- x(first_bset_blacklisted_journal_seq, 48, 0) \ ++ x(bset_blacklisted_journal_seq, 47, FSCK_AUTOFIX) \ ++ x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ + x(btree_node_bad_btree, 49, 0) \ + x(btree_node_bad_level, 50, 0) \ + x(btree_node_bad_min_key, 51, 0) \ +@@ -68,17 +67,17 @@ enum bch_fsck_flags { + x(btree_node_bkey_past_bset_end, 54, 0) \ + x(btree_node_bkey_bad_format, 55, 0) \ + x(btree_node_bad_bkey, 56, 0) \ +- x(btree_node_bkey_out_of_order, 57, 0) \ +- x(btree_root_bkey_invalid, 58, 0) \ +- x(btree_root_read_error, 59, 0) \ ++ x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \ ++ x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \ ++ x(btree_root_read_error, 59, FSCK_AUTOFIX) \ + x(btree_root_bad_min_key, 60, 0) \ + x(btree_root_bad_max_key, 61, 0) \ +- x(btree_node_read_error, 62, 0) \ +- x(btree_node_topology_bad_min_key, 63, 0) \ +- x(btree_node_topology_bad_max_key, 64, 0) \ +- 
x(btree_node_topology_overwritten_by_prev_node, 65, 0) \ +- x(btree_node_topology_overwritten_by_next_node, 66, 0) \ +- x(btree_node_topology_interior_node_empty, 67, 0) \ ++ x(btree_node_read_error, 62, FSCK_AUTOFIX) \ ++ x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \ ++ x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \ ++ x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \ ++ x(btree_node_topology_overwritten_by_next_node, 66, FSCK_AUTOFIX) \ ++ x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \ + x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ + x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ + x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ +@@ -123,11 +122,12 @@ enum bch_fsck_flags { + x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ + x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ + x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ ++ x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \ + x(bucket_sector_count_overflow, 112, 0) \ + x(bucket_metadata_type_mismatch, 113, 0) \ +- x(need_discard_key_wrong, 114, 0) \ +- x(freespace_key_wrong, 115, 0) \ +- x(freespace_hole_missing, 116, 0) \ ++ x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \ ++ x(freespace_key_wrong, 115, FSCK_AUTOFIX) \ ++ x(freespace_hole_missing, 116, FSCK_AUTOFIX) \ + x(bucket_gens_val_size_bad, 117, 0) \ + x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \ + x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \ +@@ -139,9 +139,10 @@ enum bch_fsck_flags { + x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ + x(backpointer_bucket_offset_wrong, 125, 0) \ + x(backpointer_level_bad, 294, 0) \ +- x(backpointer_to_missing_device, 126, 0) \ +- x(backpointer_to_missing_alloc, 127, 0) \ +- x(backpointer_to_missing_ptr, 128, 0) \ ++ x(backpointer_dev_bad, 297, 0) \ ++ x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \ ++ x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \ ++ x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ + x(lru_entry_at_time_0, 129, 
FSCK_AUTOFIX) \ + x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ + x(lru_entry_bad, 131, FSCK_AUTOFIX) \ +@@ -167,14 +168,15 @@ enum bch_fsck_flags { + x(ptr_to_incorrect_stripe, 151, 0) \ + x(ptr_gen_newer_than_bucket_gen, 152, 0) \ + x(ptr_too_stale, 153, 0) \ +- x(stale_dirty_ptr, 154, 0) \ ++ x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ + x(ptr_bucket_data_type_mismatch, 155, 0) \ + x(ptr_cached_and_erasure_coded, 156, 0) \ + x(ptr_crc_uncompressed_size_too_small, 157, 0) \ ++ x(ptr_crc_uncompressed_size_too_big, 161, 0) \ ++ x(ptr_crc_uncompressed_size_mismatch, 300, 0) \ + x(ptr_crc_csum_type_unknown, 158, 0) \ + x(ptr_crc_compression_type_unknown, 159, 0) \ + x(ptr_crc_redundant, 160, 0) \ +- x(ptr_crc_uncompressed_size_too_big, 161, 0) \ + x(ptr_crc_nonce_mismatch, 162, 0) \ + x(ptr_stripe_redundant, 163, 0) \ + x(reservation_key_nr_replicas_invalid, 164, 0) \ +@@ -209,6 +211,7 @@ enum bch_fsck_flags { + x(bkey_in_missing_snapshot, 190, 0) \ + x(inode_pos_inode_nonzero, 191, 0) \ + x(inode_pos_blockdev_range, 192, 0) \ ++ x(inode_alloc_cursor_inode_bad, 301, 0) \ + x(inode_unpack_error, 193, 0) \ + x(inode_str_hash_invalid, 194, 0) \ + x(inode_v3_fields_start_bad, 195, 0) \ +@@ -232,6 +235,7 @@ enum bch_fsck_flags { + x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ + x(inode_has_child_snapshots_wrong, 287, 0) \ + x(inode_unreachable, 210, FSCK_AUTOFIX) \ ++ x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ + x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ + x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ + x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ +@@ -252,6 +256,7 @@ enum bch_fsck_flags { + x(dirent_in_missing_dir_inode, 227, 0) \ + x(dirent_in_non_dir_inode, 228, 0) \ + x(dirent_to_missing_inode, 229, 0) \ ++ x(dirent_to_overwritten_inode, 302, 0) \ + x(dirent_to_missing_subvol, 230, 0) \ + x(dirent_to_itself, 231, 0) \ + x(quota_type_invalid, 232, 0) \ +@@ -288,7 +293,7 @@ enum bch_fsck_flags { + x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ + 
x(snapshot_node_missing, 264, 0) \ + x(dup_backpointer_to_bad_csum_extent, 265, 0) \ +- x(btree_bitmap_not_marked, 266, 0) \ ++ x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ + x(sb_clean_entry_overrun, 267, 0) \ + x(btree_ptr_v2_written_0, 268, 0) \ + x(subvol_snapshot_bad, 269, 0) \ +@@ -306,7 +311,10 @@ enum bch_fsck_flags { + x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ + x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ + x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ +- x(MAX, 295, 0) ++ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ ++ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ ++ x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ ++ x(MAX, 304, 0) + + enum bch_sb_error_id { + #define x(t, n, ...) BCH_FSCK_ERR_##t = n, +diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c +index 617d07e53b20..7e7c66a1e1a6 100644 +--- a/fs/bcachefs/six.c ++++ b/fs/bcachefs/six.c +@@ -491,8 +491,12 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, + list_del(&wait->list); + raw_spin_unlock(&lock->wait_lock); + +- if (unlikely(acquired)) ++ if (unlikely(acquired)) { + do_six_unlock_type(lock, type); ++ } else if (type == SIX_LOCK_write) { ++ six_clear_bitmask(lock, SIX_LOCK_HELD_write); ++ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); ++ } + break; + } + +@@ -501,10 +505,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, + + __set_current_state(TASK_RUNNING); + out: +- if (ret && type == SIX_LOCK_write) { +- six_clear_bitmask(lock, SIX_LOCK_HELD_write); +- six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); +- } + trace_contention_end(lock, 0); + + return ret; +@@ -616,8 +616,6 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long + + if (type != SIX_LOCK_write) + six_release(&lock->dep_map, ip); +- else +- lock->seq++; + + if (type == SIX_LOCK_intent && + lock->intent_lock_recurse) { +@@ -625,6 +623,15 @@ void 
six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long + return; + } + ++ if (type == SIX_LOCK_write && ++ lock->write_lock_recurse) { ++ --lock->write_lock_recurse; ++ return; ++ } ++ ++ if (type == SIX_LOCK_write) ++ lock->seq++; ++ + do_six_unlock_type(lock, type); + } + EXPORT_SYMBOL_GPL(six_unlock_ip); +@@ -735,13 +742,13 @@ void six_lock_increment(struct six_lock *lock, enum six_lock_type type) + atomic_add(l[type].lock_val, &lock->state); + } + break; ++ case SIX_LOCK_write: ++ lock->write_lock_recurse++; ++ fallthrough; + case SIX_LOCK_intent: + EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); + lock->intent_lock_recurse++; + break; +- case SIX_LOCK_write: +- BUG(); +- break; + } + } + EXPORT_SYMBOL_GPL(six_lock_increment); +diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h +index 68d46fd7f391..c142e06b7a3a 100644 +--- a/fs/bcachefs/six.h ++++ b/fs/bcachefs/six.h +@@ -137,6 +137,7 @@ struct six_lock { + atomic_t state; + u32 seq; + unsigned intent_lock_recurse; ++ unsigned write_lock_recurse; + struct task_struct *owner; + unsigned __percpu *readers; + raw_spinlock_t wait_lock; +diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c +index ae57638506c3..c54091a28909 100644 +--- a/fs/bcachefs/snapshot.c ++++ b/fs/bcachefs/snapshot.c +@@ -2,6 +2,7 @@ + + #include "bcachefs.h" + #include "bkey_buf.h" ++#include "btree_cache.h" + #include "btree_key_cache.h" + #include "btree_update.h" + #include "buckets.h" +@@ -32,7 +33,7 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, + } + + int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + int ret = 0; + +@@ -225,7 +226,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + } + + int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct 
bkey_s_c_snapshot s; + u32 i, id; +@@ -279,23 +280,6 @@ int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, + return ret; + } + +-static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) +-{ +- struct snapshot_t *t = snapshot_t_mut(c, id); +- u32 parent = id; +- +- while ((parent = bch2_snapshot_parent_early(c, parent)) && +- parent - id - 1 < IS_ANCESTOR_BITMAP) +- __set_bit(parent - id - 1, t->is_ancestor); +-} +- +-static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) +-{ +- mutex_lock(&c->snapshot_table_lock); +- __set_is_ancestor_bitmap(c, id); +- mutex_unlock(&c->snapshot_table_lock); +-} +- + static int __bch2_mark_snapshot(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, +@@ -317,6 +301,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, + if (new.k->type == KEY_TYPE_snapshot) { + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + ++ t->live = true; + t->parent = le32_to_cpu(s.v->parent); + t->children[0] = le32_to_cpu(s.v->children[0]); + t->children[1] = le32_to_cpu(s.v->children[1]); +@@ -335,7 +320,11 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, + t->skip[2] = 0; + } + +- __set_is_ancestor_bitmap(c, id); ++ u32 parent = id; ++ ++ while ((parent = bch2_snapshot_parent_early(c, parent)) && ++ parent - id - 1 < IS_ANCESTOR_BITMAP) ++ __set_bit(parent - id - 1, t->is_ancestor); + + if (BCH_SNAPSHOT_DELETED(s.v)) { + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); +@@ -365,70 +354,6 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, + BTREE_ITER_with_updates, snapshot, s); + } + +-static int bch2_snapshot_live(struct btree_trans *trans, u32 id) +-{ +- struct bch_snapshot v; +- int ret; +- +- if (!id) +- return 0; +- +- ret = bch2_snapshot_lookup(trans, id, &v); +- if (bch2_err_matches(ret, ENOENT)) +- bch_err(trans->c, "snapshot node %u not found", id); +- if (ret) +- return ret; +- +- return !BCH_SNAPSHOT_DELETED(&v); 
+-} +- +-/* +- * If @k is a snapshot with just one live child, it's part of a linear chain, +- * which we consider to be an equivalence class: and then after snapshot +- * deletion cleanup, there should only be a single key at a given position in +- * this equivalence class. +- * +- * This sets the equivalence class of @k to be the child's equivalence class, if +- * it's part of such a linear chain: this correctly sets equivalence classes on +- * startup if we run leaf to root (i.e. in natural key order). +- */ +-static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) +-{ +- struct bch_fs *c = trans->c; +- unsigned i, nr_live = 0, live_idx = 0; +- struct bkey_s_c_snapshot snap; +- u32 id = k.k->p.offset, child[2]; +- +- if (k.k->type != KEY_TYPE_snapshot) +- return 0; +- +- snap = bkey_s_c_to_snapshot(k); +- +- child[0] = le32_to_cpu(snap.v->children[0]); +- child[1] = le32_to_cpu(snap.v->children[1]); +- +- for (i = 0; i < 2; i++) { +- int ret = bch2_snapshot_live(trans, child[i]); +- +- if (ret < 0) +- return ret; +- +- if (ret) +- live_idx = i; +- nr_live += ret; +- } +- +- mutex_lock(&c->snapshot_table_lock); +- +- snapshot_t_mut(c, id)->equiv = nr_live == 1 +- ? 
snapshot_t_mut(c, child[live_idx])->equiv +- : id; +- +- mutex_unlock(&c->snapshot_table_lock); +- +- return 0; +-} +- + /* fsck: */ + + static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) +@@ -506,7 +431,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, + break; + } + } +- + bch2_trans_iter_exit(trans, &iter); + + if (!ret && !found) { +@@ -536,6 +460,7 @@ static int check_snapshot_tree(struct btree_trans *trans, + struct bch_snapshot s; + struct bch_subvolume subvol; + struct printbuf buf = PRINTBUF; ++ struct btree_iter snapshot_iter = {}; + u32 root_id; + int ret; + +@@ -545,22 +470,35 @@ static int check_snapshot_tree(struct btree_trans *trans, + st = bkey_s_c_to_snapshot_tree(k); + root_id = le32_to_cpu(st.v->root_snapshot); + +- ret = bch2_snapshot_lookup(trans, root_id, &s); ++ struct bkey_s_c_snapshot snapshot_k = ++ bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, ++ POS(0, root_id), 0, snapshot); ++ ret = bkey_err(snapshot_k); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + ++ if (!ret) ++ bkey_val_copy(&s, snapshot_k); ++ + if (fsck_err_on(ret || + root_id != bch2_snapshot_root(c, root_id) || + st.k->p.offset != le32_to_cpu(s.tree), + trans, snapshot_tree_to_missing_snapshot, + "snapshot tree points to missing/incorrect snapshot:\n %s", +- (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { ++ (bch2_bkey_val_to_text(&buf, c, st.s_c), ++ prt_newline(&buf), ++ ret ++ ? 
prt_printf(&buf, "(%s)", bch2_err_str(ret)) ++ : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c), ++ buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto err; + } + +- ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), +- false, 0, &subvol); ++ if (!st.v->master_subvol) ++ goto out; ++ ++ ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + +@@ -603,8 +541,10 @@ static int check_snapshot_tree(struct btree_trans *trans, + u->v.master_subvol = cpu_to_le32(subvol_id); + st = snapshot_tree_i_to_s_c(u); + } ++out: + err: + fsck_err: ++ bch2_trans_iter_exit(trans, &snapshot_iter); + printbuf_exit(&buf); + return ret; + } +@@ -799,7 +739,7 @@ static int check_snapshot(struct btree_trans *trans, + + if (should_have_subvol) { + id = le32_to_cpu(s.subvol); +- ret = bch2_subvolume_get(trans, id, 0, false, &subvol); ++ ret = bch2_subvolume_get(trans, id, false, &subvol); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot points to nonexistent subvolume:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); +@@ -902,7 +842,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) + { + struct bch_fs *c = trans->c; + +- if (bch2_snapshot_equiv(c, id)) ++ if (bch2_snapshot_exists(c, id)) + return 0; + + /* Do we need to reconstruct the snapshot_tree entry as well? 
*/ +@@ -951,8 +891,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) + + return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: + bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, +- bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: +- bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i)); ++ bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); + } + + /* Figure out which snapshot nodes belong in the same tree: */ +@@ -1050,7 +989,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) + snapshot_id_list_to_text(&buf, t); + + darray_for_each(*t, id) { +- if (fsck_err_on(!bch2_snapshot_equiv(c, *id), ++ if (fsck_err_on(!bch2_snapshot_exists(c, *id), + trans, snapshot_node_missing, + "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { + if (t->nr > 1) { +@@ -1083,10 +1022,12 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, + struct printbuf buf = PRINTBUF; + int ret = 0; + +- if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), ++ if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), + trans, bkey_in_missing_snapshot, + "key in missing snapshot %s, delete?", +- (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ++ (bch2_btree_id_to_text(&buf, iter->btree_id), ++ prt_char(&buf, ' '), ++ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; + fsck_err: +@@ -1100,13 +1041,11 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, + int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) + { + struct btree_iter iter; +- struct bkey_i_snapshot *s; +- int ret = 0; +- +- s = bch2_bkey_get_mut_typed(trans, &iter, ++ struct bkey_i_snapshot *s = ++ bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, id), + 0, snapshot); +- ret = PTR_ERR_OR_ZERO(s); ++ int ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + trans->c, "missing 
snapshot %u", id); +@@ -1294,10 +1233,6 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, + goto err; + + new_snapids[i] = iter.pos.offset; +- +- mutex_lock(&c->snapshot_table_lock); +- snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i]; +- mutex_unlock(&c->snapshot_table_lock); + } + err: + bch2_trans_iter_exit(trans, &iter); +@@ -1403,129 +1338,153 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + * that key to snapshot leaf nodes, where we can mutate it + */ + +-static int delete_dead_snapshots_process_key(struct btree_trans *trans, +- struct btree_iter *iter, +- struct bkey_s_c k, +- snapshot_id_list *deleted, +- snapshot_id_list *equiv_seen, +- struct bpos *last_pos) ++struct snapshot_interior_delete { ++ u32 id; ++ u32 live_child; ++}; ++typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; ++ ++static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) + { +- int ret = bch2_check_key_has_snapshot(trans, iter, k); +- if (ret) +- return ret < 0 ? 
ret : 0; ++ darray_for_each(*l, i) ++ if (i->id == id) ++ return i->live_child; ++ return 0; ++} + +- struct bch_fs *c = trans->c; +- u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); +- if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ ++static unsigned __live_child(struct snapshot_table *t, u32 id, ++ snapshot_id_list *delete_leaves, ++ interior_delete_list *delete_interior) ++{ ++ struct snapshot_t *s = __snapshot_t(t, id); ++ if (!s) + return 0; + +- if (!bkey_eq(k.k->p, *last_pos)) +- equiv_seen->nr = 0; ++ for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) ++ if (s->children[i] && ++ !snapshot_list_has_id(delete_leaves, s->children[i]) && ++ !interior_delete_has_id(delete_interior, s->children[i])) ++ return s->children[i]; + +- if (snapshot_list_has_id(deleted, k.k->p.snapshot)) +- return bch2_btree_delete_at(trans, iter, +- BTREE_UPDATE_internal_snapshot_node); ++ for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) { ++ u32 live_child = s->children[i] ++ ? 
__live_child(t, s->children[i], delete_leaves, delete_interior) ++ : 0; ++ if (live_child) ++ return live_child; ++ } + +- if (!bpos_eq(*last_pos, k.k->p) && +- snapshot_list_has_id(equiv_seen, equiv)) +- return bch2_btree_delete_at(trans, iter, +- BTREE_UPDATE_internal_snapshot_node); ++ return 0; ++} + +- *last_pos = k.k->p; ++static unsigned live_child(struct bch_fs *c, u32 id, ++ snapshot_id_list *delete_leaves, ++ interior_delete_list *delete_interior) ++{ ++ rcu_read_lock(); ++ u32 ret = __live_child(rcu_dereference(c->snapshots), id, ++ delete_leaves, delete_interior); ++ rcu_read_unlock(); ++ return ret; ++} + +- ret = snapshot_list_add_nodup(c, equiv_seen, equiv); +- if (ret) +- return ret; ++static int delete_dead_snapshots_process_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, ++ snapshot_id_list *delete_leaves, ++ interior_delete_list *delete_interior) ++{ ++ if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) ++ return bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_internal_snapshot_node); + +- /* +- * When we have a linear chain of snapshot nodes, we consider +- * those to form an equivalence class: we're going to collapse +- * them all down to a single node, and keep the leaf-most node - +- * which has the same id as the equivalence class id. +- * +- * If there are multiple keys in different snapshots at the same +- * position, we're only going to keep the one in the newest +- * snapshot (we delete the others above) - the rest have been +- * overwritten and are redundant, and for the key we're going to keep we +- * need to move it to the equivalance class ID if it's not there +- * already. 
+- */ +- if (equiv != k.k->p.snapshot) { ++ u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); ++ if (live_child) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + +- new->k.p.snapshot = equiv; +- +- struct btree_iter new_iter; +- bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, +- BTREE_ITER_all_snapshots| +- BTREE_ITER_cached| +- BTREE_ITER_intent); ++ new->k.p.snapshot = live_child; + +- ret = bch2_btree_iter_traverse(&new_iter) ?: +- bch2_trans_update(trans, &new_iter, new, +- BTREE_UPDATE_internal_snapshot_node) ?: +- bch2_btree_delete_at(trans, iter, +- BTREE_UPDATE_internal_snapshot_node); +- bch2_trans_iter_exit(trans, &new_iter); ++ struct btree_iter dst_iter; ++ struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, ++ iter->btree_id, new->k.p, ++ BTREE_ITER_all_snapshots| ++ BTREE_ITER_intent); ++ ret = bkey_err(dst_k); + if (ret) + return ret; ++ ++ ret = (bkey_deleted(dst_k.k) ++ ? bch2_trans_update(trans, &dst_iter, new, ++ BTREE_UPDATE_internal_snapshot_node) ++ : 0) ?: ++ bch2_btree_delete_at(trans, iter, ++ BTREE_UPDATE_internal_snapshot_node); ++ bch2_trans_iter_exit(trans, &dst_iter); ++ return ret; + } + + return 0; + } + +-static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k) ++/* ++ * For a given snapshot, if it doesn't have a subvolume that points to it, and ++ * it doesn't have child snapshot nodes - it's now redundant and we can mark it ++ * as deleted. 
++ */ ++static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, ++ snapshot_id_list *delete_leaves, ++ interior_delete_list *delete_interior) + { +- struct bkey_s_c_snapshot snap; +- u32 children[2]; +- int ret; +- + if (k.k->type != KEY_TYPE_snapshot) + return 0; + +- snap = bkey_s_c_to_snapshot(k); +- if (BCH_SNAPSHOT_DELETED(snap.v) || +- BCH_SNAPSHOT_SUBVOL(snap.v)) ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); ++ unsigned live_children = 0; ++ ++ if (BCH_SNAPSHOT_SUBVOL(s.v)) + return 0; + +- children[0] = le32_to_cpu(snap.v->children[0]); +- children[1] = le32_to_cpu(snap.v->children[1]); ++ for (unsigned i = 0; i < 2; i++) { ++ u32 child = le32_to_cpu(s.v->children[i]); + +- ret = bch2_snapshot_live(trans, children[0]) ?: +- bch2_snapshot_live(trans, children[1]); +- if (ret < 0) +- return ret; +- return !ret; +-} ++ live_children += child && ++ !snapshot_list_has_id(delete_leaves, child); ++ } + +-/* +- * For a given snapshot, if it doesn't have a subvolume that points to it, and +- * it doesn't have child snapshot nodes - it's now redundant and we can mark it +- * as deleted. +- */ +-static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k) +-{ +- int ret = bch2_snapshot_needs_delete(trans, k); ++ if (live_children == 0) { ++ return snapshot_list_add(c, delete_leaves, s.k->p.offset); ++ } else if (live_children == 1) { ++ struct snapshot_interior_delete d = { ++ .id = s.k->p.offset, ++ .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), ++ }; ++ ++ if (!d.live_child) { ++ bch_err(c, "error finding live child of snapshot %u", d.id); ++ return -EINVAL; ++ } + +- return ret <= 0 +- ? 
ret +- : bch2_snapshot_node_set_deleted(trans, k.k->p.offset); ++ return darray_push(delete_interior, d); ++ } else { ++ return 0; ++ } + } + + static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, +- snapshot_id_list *skip) ++ interior_delete_list *skip) + { + rcu_read_lock(); +- while (snapshot_list_has_id(skip, id)) ++ while (interior_delete_has_id(skip, id)) + id = __bch2_snapshot_parent(c, id); + + while (n--) { + do { + id = __bch2_snapshot_parent(c, id); +- } while (snapshot_list_has_id(skip, id)); ++ } while (interior_delete_has_id(skip, id)); + } + rcu_read_unlock(); + +@@ -1534,7 +1493,7 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, + + static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k, +- snapshot_id_list *deleted) ++ interior_delete_list *deleted) + { + struct bch_fs *c = trans->c; + u32 nr_deleted_ancestors = 0; +@@ -1544,7 +1503,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, + if (k.k->type != KEY_TYPE_snapshot) + return 0; + +- if (snapshot_list_has_id(deleted, k.k->p.offset)) ++ if (interior_delete_has_id(deleted, k.k->p.offset)) + return 0; + + s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); +@@ -1553,7 +1512,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, + return ret; + + darray_for_each(*deleted, i) +- nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); ++ nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id); + + if (!nr_deleted_ancestors) + return 0; +@@ -1571,7 +1530,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, + for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { + u32 id = le32_to_cpu(s->v.skip[j]); + +- if (snapshot_list_has_id(deleted, id)) { ++ if (interior_delete_has_id(deleted, id)) { + id = bch2_snapshot_nth_parent_skip(c, + parent, + depth > 1 +@@ -1590,51 
+1549,45 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, + + int bch2_delete_dead_snapshots(struct bch_fs *c) + { +- struct btree_trans *trans; +- snapshot_id_list deleted = { 0 }; +- snapshot_id_list deleted_interior = { 0 }; +- int ret = 0; +- + if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) + return 0; + +- trans = bch2_trans_get(c); ++ struct btree_trans *trans = bch2_trans_get(c); ++ snapshot_id_list delete_leaves = {}; ++ interior_delete_list delete_interior = {}; ++ int ret = 0; + + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ +- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, +- POS_MIN, 0, k, +- NULL, NULL, 0, +- bch2_delete_redundant_snapshot(trans, k)); +- bch_err_msg(c, ret, "deleting redundant snapshots"); ++ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, ++ check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); ++ if (!bch2_err_matches(ret, EROFS)) ++ bch_err_msg(c, ret, "walking snapshots"); + if (ret) + goto err; + +- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, +- POS_MIN, 0, k, +- bch2_snapshot_set_equiv(trans, k)); +- bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); +- if (ret) ++ if (!delete_leaves.nr && !delete_interior.nr) + goto err; + +- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, +- POS_MIN, 0, k, ({ +- if (k.k->type != KEY_TYPE_snapshot) +- continue; ++ { ++ struct printbuf buf = PRINTBUF; ++ prt_printf(&buf, "deleting leaves"); ++ darray_for_each(delete_leaves, i) ++ prt_printf(&buf, " %u", *i); + +- BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v) +- ? 
snapshot_list_add(c, &deleted, k.k->p.offset) +- : 0; +- })); +- bch_err_msg(c, ret, "walking snapshots"); +- if (ret) +- goto err; ++ prt_printf(&buf, " interior"); ++ darray_for_each(delete_interior, i) ++ prt_printf(&buf, " %u->%u", i->id, i->live_child); ++ ++ ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); ++ printbuf_exit(&buf); ++ if (ret) ++ goto err; ++ } + + for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { +- struct bpos last_pos = POS_MIN; +- snapshot_id_list equiv_seen = { 0 }; + struct disk_reservation res = { 0 }; + + if (!btree_type_has_snapshots(btree)) +@@ -1644,33 +1597,26 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) + btree, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, +- delete_dead_snapshots_process_key(trans, &iter, k, &deleted, +- &equiv_seen, &last_pos)); ++ delete_dead_snapshots_process_key(trans, &iter, k, ++ &delete_leaves, ++ &delete_interior)); + + bch2_disk_reservation_put(c, &res); +- darray_exit(&equiv_seen); + +- bch_err_msg(c, ret, "deleting keys from dying snapshots"); ++ if (!bch2_err_matches(ret, EROFS)) ++ bch_err_msg(c, ret, "deleting keys from dying snapshots"); + if (ret) + goto err; + } + +- bch2_trans_unlock(trans); +- down_write(&c->snapshot_create_lock); +- +- ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, +- POS_MIN, 0, k, ({ +- u32 snapshot = k.k->p.offset; +- u32 equiv = bch2_snapshot_equiv(c, snapshot); +- +- equiv != snapshot +- ? 
snapshot_list_add(c, &deleted_interior, snapshot) +- : 0; +- })); +- +- bch_err_msg(c, ret, "walking snapshots"); +- if (ret) +- goto err_create_lock; ++ darray_for_each(delete_leaves, i) { ++ ret = commit_do(trans, NULL, NULL, 0, ++ bch2_snapshot_node_delete(trans, *i)); ++ if (!bch2_err_matches(ret, EROFS)) ++ bch_err_msg(c, ret, "deleting snapshot %u", *i); ++ if (ret) ++ goto err; ++ } + + /* + * Fixing children of deleted snapshots can't be done completely +@@ -1680,32 +1626,24 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) + ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, + BTREE_ITER_intent, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, +- bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); ++ bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); + if (ret) +- goto err_create_lock; +- +- darray_for_each(deleted, i) { +- ret = commit_do(trans, NULL, NULL, 0, +- bch2_snapshot_node_delete(trans, *i)); +- bch_err_msg(c, ret, "deleting snapshot %u", *i); +- if (ret) +- goto err_create_lock; +- } ++ goto err; + +- darray_for_each(deleted_interior, i) { ++ darray_for_each(delete_interior, i) { + ret = commit_do(trans, NULL, NULL, 0, +- bch2_snapshot_node_delete(trans, *i)); +- bch_err_msg(c, ret, "deleting snapshot %u", *i); ++ bch2_snapshot_node_delete(trans, i->id)); ++ if (!bch2_err_matches(ret, EROFS)) ++ bch_err_msg(c, ret, "deleting snapshot %u", i->id); + if (ret) +- goto err_create_lock; ++ goto err; + } +-err_create_lock: +- up_write(&c->snapshot_create_lock); + err: +- darray_exit(&deleted_interior); +- darray_exit(&deleted); ++ darray_exit(&delete_interior); ++ darray_exit(&delete_leaves); + bch2_trans_put(trans); +- bch_err_fn(c, ret); ++ if (!bch2_err_matches(ret, EROFS)) ++ bch_err_fn(c, ret); + return ret; + } + +@@ -1721,8 +1659,12 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) + + void bch2_delete_dead_snapshots_async(struct bch_fs *c) + { +- if 
(bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && +- !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) ++ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots)) ++ return; ++ ++ BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); ++ ++ if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); + } + +@@ -1735,18 +1677,10 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + +- bch2_trans_iter_init(trans, &iter, id, pos, +- BTREE_ITER_not_extents| +- BTREE_ITER_all_snapshots); +- while (1) { +- k = bch2_btree_iter_prev(&iter); +- ret = bkey_err(k); +- if (ret) +- break; +- +- if (!k.k) +- break; +- ++ for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos), ++ BTREE_ITER_not_extents| ++ BTREE_ITER_all_snapshots, ++ k, ret) { + if (!bkey_eq(pos, k.k->p)) + break; + +@@ -1760,37 +1694,36 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + return ret; + } + +-static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) ++static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap) + { +- struct bch_fs *c = trans->c; +- struct bkey_s_c_snapshot snap; +- int ret = 0; ++ /* If there's one child, it's redundant and keys will be moved to the child */ ++ return !!snap.v->children[0] + !!snap.v->children[1] == 1; ++} + ++static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) ++{ + if (k.k->type != KEY_TYPE_snapshot) + return 0; + +- snap = bkey_s_c_to_snapshot(k); ++ struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || +- bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || +- (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { +- set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); +- return 0; +- } ++ interior_snapshot_needs_delete(snap)) ++ 
set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); + +- return ret; ++ return 0; + } + + int bch2_snapshots_read(struct bch_fs *c) + { ++ /* ++ * Initializing the is_ancestor bitmaps requires ancestors to already be ++ * initialized - so mark in reverse: ++ */ + int ret = bch2_trans_run(c, +- for_each_btree_key(trans, iter, BTREE_ID_snapshots, +- POS_MIN, 0, k, ++ for_each_btree_key_reverse(trans, iter, BTREE_ID_snapshots, ++ POS_MAX, 0, k, + __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: +- bch2_snapshot_set_equiv(trans, k) ?: +- bch2_check_snapshot_needs_deletion(trans, k)) ?: +- for_each_btree_key(trans, iter, BTREE_ID_snapshots, +- POS_MIN, 0, k, +- (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); ++ bch2_check_snapshot_needs_deletion(trans, k))); + bch_err_fn(c, ret); + + /* +diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h +index 29c94716293e..00373cf32e7b 100644 +--- a/fs/bcachefs/snapshot.h ++++ b/fs/bcachefs/snapshot.h +@@ -2,11 +2,9 @@ + #ifndef _BCACHEFS_SNAPSHOT_H + #define _BCACHEFS_SNAPSHOT_H + +-enum bch_validate_flags; +- + void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, +- enum bch_validate_flags); ++ struct bkey_validate_context); + + #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ + .key_validate = bch2_snapshot_tree_validate, \ +@@ -19,7 +17,8 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); + int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); + + void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +-int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); ++int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); + int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, 
+ enum btree_iter_update_trigger_flags); +@@ -120,19 +119,19 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) + return id; + } + +-static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) ++static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) + { + const struct snapshot_t *s = snapshot_t(c, id); +- return s ? s->equiv : 0; ++ return s ? s->live : 0; + } + +-static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) ++static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) + { + rcu_read_lock(); +- id = __bch2_snapshot_equiv(c, id); ++ bool ret = __bch2_snapshot_exists(c, id); + rcu_read_unlock(); + +- return id; ++ return ret; + } + + static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) +diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c +new file mode 100644 +index 000000000000..d78451c2a0c6 +--- /dev/null ++++ b/fs/bcachefs/str_hash.c +@@ -0,0 +1,295 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "btree_update.h" ++#include "dirent.h" ++#include "fsck.h" ++#include "str_hash.h" ++#include "subvolume.h" ++ ++static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) ++{ ++ if (d.v->d_type == DT_SUBVOL) { ++ struct bch_subvolume subvol; ++ int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol), ++ false, &subvol); ++ if (ret && !bch2_err_matches(ret, ENOENT)) ++ return ret; ++ return !ret; ++ } else { ++ struct btree_iter iter; ++ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, ++ SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); ++ int ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ ret = bkey_is_inode(k.k); ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++ } ++} ++ ++static noinline int fsck_rename_dirent(struct btree_trans *trans, ++ struct snapshots_seen *s, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *hash_info, ++ 
struct bkey_s_c_dirent old) ++{ ++ struct qstr old_name = bch2_dirent_get_name(old); ++ struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); ++ int ret = PTR_ERR_OR_ZERO(new); ++ if (ret) ++ return ret; ++ ++ bkey_dirent_init(&new->k_i); ++ dirent_copy_target(new, old); ++ new->k.p = old.k->p; ++ ++ for (unsigned i = 0; i < 1000; i++) { ++ unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", ++ old_name.len, old_name.name, i); ++ unsigned u64s = BKEY_U64s + dirent_val_u64s(len); ++ ++ if (u64s > U8_MAX) ++ return -EINVAL; ++ ++ new->k.u64s = u64s; ++ ++ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, ++ (subvol_inum) { 0, old.k->p.inode }, ++ old.k->p.snapshot, &new->k_i, ++ BTREE_UPDATE_internal_snapshot_node); ++ if (!bch2_err_matches(ret, EEXIST)) ++ break; ++ } ++ ++ if (ret) ++ return ret; ++ ++ return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); ++} ++ ++static noinline int hash_pick_winner(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *hash_info, ++ struct bkey_s_c k1, ++ struct bkey_s_c k2) ++{ ++ if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && ++ !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) ++ return 0; ++ ++ switch (desc.btree_id) { ++ case BTREE_ID_dirents: { ++ int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1)); ++ if (ret < 0) ++ return ret; ++ if (!ret) ++ return 0; ++ ++ ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k2)); ++ if (ret < 0) ++ return ret; ++ if (!ret) ++ return 1; ++ return 2; ++ } ++ default: ++ return 0; ++ } ++} ++ ++static int repair_inode_hash_info(struct btree_trans *trans, ++ struct bch_inode_unpacked *snapshot_root) ++{ ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, ++ SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), ++ BTREE_ITER_all_snapshots, k, ret) { ++ if (k.k->p.offset 
!= snapshot_root->bi_inum) ++ break; ++ if (!bkey_is_inode(k.k)) ++ continue; ++ ++ struct bch_inode_unpacked inode; ++ ret = bch2_inode_unpack(k, &inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || ++ INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), ++ trans, inode_snapshot_mismatch, ++ "inode hash info in different snapshots don't match")) { ++ inode.bi_hash_seed = snapshot_root->bi_hash_seed; ++ SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); ++ ret = __bch2_fsck_write_inode(trans, &inode) ?: ++ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: ++ -BCH_ERR_transaction_restart_nested; ++ break; ++ } ++ } ++fsck_err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++/* ++ * All versions of the same inode in different snapshots must have the same hash ++ * seed/type: verify that the hash info we're using matches the root ++ */ ++static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, ++ struct bch_hash_info *hash_info) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), ++ BTREE_ITER_all_snapshots, k, ret) { ++ if (k.k->p.offset != inum) ++ break; ++ if (bkey_is_inode(k.k)) ++ goto found; ++ } ++ bch_err(c, "%s(): inum %llu not found", __func__, inum); ++ ret = -BCH_ERR_fsck_repair_unimplemented; ++ goto err; ++found:; ++ struct bch_inode_unpacked inode; ++ ret = bch2_inode_unpack(k, &inode); ++ if (ret) ++ goto err; ++ ++ struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); ++ if (hash_info->type != hash2.type || ++ memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) { ++ ret = repair_inode_hash_info(trans, &inode); ++ if (!ret) { ++ bch_err(c, "inode hash info mismatch with root, but mismatch not found\n" ++ "%u %llx %llx\n" ++ "%u %llx %llx", ++ 
hash_info->type, ++ hash_info->siphash_key.k0, ++ hash_info->siphash_key.k1, ++ hash2.type, ++ hash2.siphash_key.k0, ++ hash2.siphash_key.k1); ++ ret = -BCH_ERR_fsck_repair_unimplemented; ++ } ++ } ++err: ++ bch2_trans_iter_exit(trans, &iter); ++ return ret; ++} ++ ++int __bch2_str_hash_check_key(struct btree_trans *trans, ++ struct snapshots_seen *s, ++ const struct bch_hash_desc *desc, ++ struct bch_hash_info *hash_info, ++ struct btree_iter *k_iter, struct bkey_s_c hash_k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter iter = { NULL }; ++ struct printbuf buf = PRINTBUF; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ u64 hash = desc->hash_bkey(hash_info, hash_k); ++ if (hash_k.k->p.offset < hash) ++ goto bad_hash; ++ ++ for_each_btree_key_norestart(trans, iter, desc->btree_id, ++ SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), ++ BTREE_ITER_slots, k, ret) { ++ if (bkey_eq(k.k->p, hash_k.k->p)) ++ break; ++ ++ if (k.k->type == desc->key_type && ++ !desc->cmp_bkey(k, hash_k)) ++ goto duplicate_entries; ++ ++ if (bkey_deleted(k.k)) { ++ bch2_trans_iter_exit(trans, &iter); ++ goto bad_hash; ++ } ++ } ++out: ++ bch2_trans_iter_exit(trans, &iter); ++ printbuf_exit(&buf); ++ return ret; ++bad_hash: ++ /* ++ * Before doing any repair, check hash_info itself: ++ */ ++ ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info); ++ if (ret) ++ goto out; ++ ++ if (fsck_err(trans, hash_table_key_wrong_offset, ++ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", ++ bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { ++ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); ++ if (IS_ERR(new)) ++ return PTR_ERR(new); ++ ++ k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info, ++ (subvol_inum) { 0, hash_k.k->p.inode }, ++ hash_k.k->p.snapshot, new, ++ STR_HASH_must_create| ++ 
BTREE_ITER_with_updates| ++ BTREE_UPDATE_internal_snapshot_node); ++ ret = bkey_err(k); ++ if (ret) ++ goto out; ++ if (k.k) ++ goto duplicate_entries; ++ ++ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, ++ BTREE_UPDATE_internal_snapshot_node) ?: ++ bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: ++ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: ++ -BCH_ERR_transaction_restart_nested; ++ goto out; ++ } ++fsck_err: ++ goto out; ++duplicate_entries: ++ ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k); ++ if (ret < 0) ++ goto out; ++ ++ if (!fsck_err(trans, hash_table_key_duplicate, ++ "duplicate hash table keys%s:\n%s", ++ ret != 2 ? "" : ", both point to valid inodes", ++ (printbuf_reset(&buf), ++ bch2_bkey_val_to_text(&buf, c, hash_k), ++ prt_newline(&buf), ++ bch2_bkey_val_to_text(&buf, c, k), ++ buf.buf))) ++ goto out; ++ ++ switch (ret) { ++ case 0: ++ ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); ++ break; ++ case 1: ++ ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0); ++ break; ++ case 2: ++ ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: ++ bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); ++ goto out; ++ } ++ ++ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: ++ -BCH_ERR_transaction_restart_nested; ++ goto out; ++} +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +index ec2b1feea520..55a4ac7bf220 100644 +--- a/fs/bcachefs/str_hash.h ++++ b/fs/bcachefs/str_hash.h +@@ -160,7 +160,7 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, + struct bkey_s_c k; + int ret; + +- for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, ++ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), + POS(inum.inum, U64_MAX), + BTREE_ITER_slots|flags, k, ret) { +@@ -210,7 +210,7 @@ bch2_hash_hole(struct btree_trans *trans, + if (ret) + return ret; + +- 
for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, ++ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), + POS(inum.inum, U64_MAX), + BTREE_ITER_slots|BTREE_ITER_intent, k, ret) +@@ -265,7 +265,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, + bool found = false; + int ret; + +- for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, ++ for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, + SPOS(insert->k.p.inode, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), +@@ -393,4 +393,26 @@ int bch2_hash_delete(struct btree_trans *trans, + return ret; + } + ++struct snapshots_seen; ++int __bch2_str_hash_check_key(struct btree_trans *, ++ struct snapshots_seen *, ++ const struct bch_hash_desc *, ++ struct bch_hash_info *, ++ struct btree_iter *, struct bkey_s_c); ++ ++static inline int bch2_str_hash_check_key(struct btree_trans *trans, ++ struct snapshots_seen *s, ++ const struct bch_hash_desc *desc, ++ struct bch_hash_info *hash_info, ++ struct btree_iter *k_iter, struct bkey_s_c hash_k) ++{ ++ if (hash_k.k->type != desc->key_type) ++ return 0; ++ ++ if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) ++ return 0; ++ ++ return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k); ++} ++ + #endif /* _BCACHEFS_STR_HASH_H */ +diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c +index 80e5efaff524..e3d0475232e5 100644 +--- a/fs/bcachefs/subvolume.c ++++ b/fs/bcachefs/subvolume.c +@@ -207,7 +207,7 @@ int bch2_check_subvol_children(struct bch_fs *c) + /* Subvolumes: */ + + int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); + int ret = 0; +@@ -286,11 +286,11 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) + static 
__always_inline int + bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, + bool inconsistent_if_not_found, +- int iter_flags, + struct bch_subvolume *s) + { + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), +- iter_flags, subvolume, s); ++ BTREE_ITER_cached| ++ BTREE_ITER_with_updates, subvolume, s); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && + inconsistent_if_not_found, + trans->c, "missing subvolume %u", subvol); +@@ -299,16 +299,15 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, + + int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, + bool inconsistent_if_not_found, +- int iter_flags, + struct bch_subvolume *s) + { +- return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); ++ return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s); + } + + int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) + { + struct bch_subvolume s; +- int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s); ++ int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s); + if (ret) + return ret; + +@@ -328,7 +327,7 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, + struct bch_snapshot snap; + + return bch2_snapshot_lookup(trans, snapshot, &snap) ?: +- bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); ++ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol); + } + + int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, +@@ -396,8 +395,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d + struct bch_subvolume s; + + return lockrestart_do(trans, +- bch2_subvolume_get(trans, subvolid_to_delete, true, +- BTREE_ITER_cached, &s)) ?: ++ bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?: + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, + NULL, 
NULL, BCH_TRANS_COMMIT_no_enospc, +@@ -411,26 +409,56 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d + */ + static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) + { +- struct btree_iter iter; +- struct bkey_s_c_subvolume subvol; +- u32 snapid; +- int ret = 0; ++ struct btree_iter subvol_iter = {}, snapshot_iter = {}, snapshot_tree_iter = {}; + +- subvol = bch2_bkey_get_iter_typed(trans, &iter, ++ struct bkey_s_c_subvolume subvol = ++ bch2_bkey_get_iter_typed(trans, &subvol_iter, + BTREE_ID_subvolumes, POS(0, subvolid), + BTREE_ITER_cached|BTREE_ITER_intent, + subvolume); +- ret = bkey_err(subvol); ++ int ret = bkey_err(subvol); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing subvolume %u", subvolid); + if (ret) +- return ret; ++ goto err; + +- snapid = le32_to_cpu(subvol.v->snapshot); ++ u32 snapid = le32_to_cpu(subvol.v->snapshot); ++ ++ struct bkey_s_c_snapshot snapshot = ++ bch2_bkey_get_iter_typed(trans, &snapshot_iter, ++ BTREE_ID_snapshots, POS(0, snapid), ++ 0, snapshot); ++ ret = bkey_err(subvol); ++ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, ++ "missing snapshot %u", snapid); ++ if (ret) ++ goto err; ++ ++ u32 treeid = le32_to_cpu(snapshot.v->tree); + +- ret = bch2_btree_delete_at(trans, &iter, 0) ?: ++ struct bkey_s_c_snapshot_tree snapshot_tree = ++ bch2_bkey_get_iter_typed(trans, &snapshot_tree_iter, ++ BTREE_ID_snapshot_trees, POS(0, treeid), ++ 0, snapshot_tree); ++ ++ if (le32_to_cpu(snapshot_tree.v->master_subvol) == subvolid) { ++ struct bkey_i_snapshot_tree *snapshot_tree_mut = ++ bch2_bkey_make_mut_typed(trans, &snapshot_tree_iter, ++ &snapshot_tree.s_c, ++ 0, snapshot_tree); ++ ret = PTR_ERR_OR_ZERO(snapshot_tree_mut); ++ if (ret) ++ goto err; ++ ++ snapshot_tree_mut->v.master_subvol = 0; ++ } ++ ++ ret = bch2_btree_delete_at(trans, &subvol_iter, 0) ?: + bch2_snapshot_node_set_deleted(trans, snapid); +- bch2_trans_iter_exit(trans, 
&iter); ++err: ++ bch2_trans_iter_exit(trans, &snapshot_tree_iter); ++ bch2_trans_iter_exit(trans, &snapshot_iter); ++ bch2_trans_iter_exit(trans, &subvol_iter); + return ret; + } + +@@ -675,7 +703,7 @@ static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) + /* set bi_subvol on root inode */ + int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) + { +- int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, ++ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_fs_upgrade_for_subvolumes(trans)); + bch_err_fn(c, ret); + return ret; +diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h +index f897d106e142..910f6196700e 100644 +--- a/fs/bcachefs/subvolume.h ++++ b/fs/bcachefs/subvolume.h +@@ -5,12 +5,11 @@ + #include "darray.h" + #include "subvolume_types.h" + +-enum bch_validate_flags; +- + int bch2_check_subvols(struct bch_fs *); + int bch2_check_subvol_children(struct bch_fs *); + +-int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); ++int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); + void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, +@@ -25,7 +24,7 @@ int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, + + int bch2_subvol_has_children(struct btree_trans *, u32); + int bch2_subvolume_get(struct btree_trans *, unsigned, +- bool, int, struct bch_subvolume *); ++ bool, struct bch_subvolume *); + int __bch2_subvolume_get_snapshot(struct btree_trans *, u32, + u32 *, bool); + int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); +@@ -34,7 +33,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32); + int bch2_subvol_is_ro(struct bch_fs *, u32); + + static inline struct bkey_s_c +-bch2_btree_iter_peek_in_subvolume_upto_type(struct 
btree_iter *iter, struct bpos end, ++bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end, + u32 subvolid, unsigned flags) + { + u32 snapshot; +@@ -43,10 +42,10 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos + return bkey_s_c_err(ret); + + bch2_btree_iter_set_snapshot(iter, snapshot); +- return bch2_btree_iter_peek_upto_type(iter, end, flags); ++ return bch2_btree_iter_peek_max_type(iter, end, flags); + } + +-#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ ++#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ + _end, _subvolid, _flags, _k, _do) \ + ({ \ + struct bkey_s_c _k; \ +@@ -54,7 +53,7 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos + \ + do { \ + _ret3 = lockrestart_do(_trans, ({ \ +- (_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \ ++ (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \ + _end, _subvolid, (_flags)); \ + if (!(_k).k) \ + break; \ +@@ -67,14 +66,14 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos + _ret3; \ + }) + +-#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \ ++#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \ + _start, _end, _subvolid, _flags, _k, _do) \ + ({ \ + struct btree_iter _iter; \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ +- for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ ++ for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ + _end, _subvolid, _flags, _k, _do); \ + }) + +diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h +index f2ec4277c2a5..1549d6daf7af 100644 +--- a/fs/bcachefs/subvolume_types.h ++++ b/fs/bcachefs/subvolume_types.h +@@ -9,13 +9,13 @@ typedef DARRAY(u32) snapshot_id_list; + #define IS_ANCESTOR_BITMAP 128 + + struct snapshot_t { ++ bool live; + u32 parent; + u32 skip[3]; + 
u32 depth; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 tree; +- u32 equiv; + unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; + }; + +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +index 7c71594f6a8b..8037ccbacf6a 100644 +--- a/fs/bcachefs/super-io.c ++++ b/fs/bcachefs/super-io.c +@@ -23,6 +23,7 @@ + + #include + #include ++#include + + static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { + }; +@@ -41,7 +42,7 @@ static const struct bch2_metadata_version bch2_metadata_versions[] = { + #undef x + }; + +-void bch2_version_to_text(struct printbuf *out, unsigned v) ++void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v) + { + const char *str = "(unknown version)"; + +@@ -54,7 +55,7 @@ void bch2_version_to_text(struct printbuf *out, unsigned v) + prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); + } + +-unsigned bch2_latest_compatible_version(unsigned v) ++enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v) + { + if (!BCH_VERSION_MAJOR(v)) + return v; +@@ -68,6 +69,16 @@ unsigned bch2_latest_compatible_version(unsigned v) + return v; + } + ++void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) ++{ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, ++ max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++} ++ + const char * const bch2_sb_fields[] = { + #define x(name, nr) #name, + BCH_SB_FIELDS() +@@ -368,6 +379,12 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, + return -BCH_ERR_invalid_sb_features; + } + ++ if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || ++ BCH_SB_VERSION_INCOMPAT(sb) > 
bcachefs_metadata_version_current) { ++ prt_printf(out, "Filesystem has incompatible version"); ++ return -BCH_ERR_invalid_sb_features; ++ } ++ + block_size = le16_to_cpu(sb->block_size); + + if (block_size > PAGE_SECTORS) { +@@ -406,6 +423,21 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, + return -BCH_ERR_invalid_sb_time_precision; + } + ++ /* old versions didn't know to downgrade this field */ ++ if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version)) ++ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version)); ++ ++ if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) { ++ prt_printf(out, "Invalid version_incompat "); ++ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); ++ prt_str(out, " > incompat_allowed "); ++ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); ++ if (flags & BCH_VALIDATE_write) ++ return -BCH_ERR_invalid_sb_version; ++ else ++ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); ++ } ++ + if (!flags) { + /* + * Been seeing a bug where these are getting inexplicably +@@ -428,6 +460,11 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, + SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); + } + ++#ifdef __KERNEL__ ++ if (!BCH_SB_SHARD_INUMS_NBITS(sb)) ++ SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus()))); ++#endif ++ + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { + const struct bch_option *opt = bch2_opt_table + opt_id; + +@@ -519,6 +556,9 @@ static void bch2_sb_update(struct bch_fs *c) + c->sb.uuid = src->uuid; + c->sb.user_uuid = src->user_uuid; + c->sb.version = le16_to_cpu(src->version); ++ c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src); ++ c->sb.version_incompat_allowed ++ = BCH_SB_VERSION_INCOMPAT_ALLOWED(src); + c->sb.version_min = le16_to_cpu(src->version_min); + c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); + c->sb.nr_devices = src->nr_devices; +@@ -676,7 +716,8 @@ static int 
read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf + } + + enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb); +- if (csum_type >= BCH_CSUM_NR) { ++ if (csum_type >= BCH_CSUM_NR || ++ bch2_csum_type_is_encryption(csum_type)) { + prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); + return -BCH_ERR_invalid_sb_csum_type; + } +@@ -878,7 +919,7 @@ static void write_super_endio(struct bio *bio) + ? BCH_MEMBER_ERROR_write + : BCH_MEMBER_ERROR_read, + "superblock %s error: %s", +- bio_data_dir(bio) ? "write" : "read", ++ str_write_read(bio_data_dir(bio)), + bch2_blk_status_to_str(bio->bi_status))) + ca->sb_write_error = 1; + +@@ -891,14 +932,15 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) + struct bch_sb *sb = ca->disk_sb.sb; + struct bio *bio = ca->disk_sb.bio; + ++ memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE); ++ + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; +- bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); ++ bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE); + +- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], +- bio_sectors(bio)); ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); + + percpu_ref_get(&ca->io_ref); + closure_bio_submit(bio, &c->sb_write); +@@ -1042,9 +1084,16 @@ int bch2_write_super(struct bch_fs *c) + ": Superblock write was silently dropped! 
(seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); +- bch2_fs_fatal_error(c, "%s", buf.buf); ++ ++ if (c->opts.errors != BCH_ON_ERROR_continue && ++ c->opts.errors != BCH_ON_ERROR_fix_safe) { ++ ret = -BCH_ERR_erofs_sb_err; ++ bch2_fs_fatal_error(c, "%s", buf.buf); ++ } else { ++ bch_err(c, "%s", buf.buf); ++ } ++ + printbuf_exit(&buf); +- ret = -BCH_ERR_erofs_sb_err; + } + + if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { +@@ -1149,6 +1198,8 @@ bool bch2_check_version_downgrade(struct bch_fs *c) + */ + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); ++ if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current) ++ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current); + if (c->sb.version > bcachefs_metadata_version_current) + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + if (c->sb.version_min > bcachefs_metadata_version_current) +@@ -1157,7 +1208,7 @@ bool bch2_check_version_downgrade(struct bch_fs *c) + return ret; + } + +-void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) ++void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) + { + lockdep_assert_held(&c->sb_lock); + +@@ -1167,6 +1218,10 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) + + c->disk_sb.sb->version = cpu_to_le16(new_version); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); ++ ++ if (incompat) ++ SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, ++ max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); + } + + static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, +@@ -1331,6 +1386,14 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + bch2_version_to_text(out, le16_to_cpu(sb->version)); + prt_newline(out); + ++ 
prt_printf(out, "Incompatible features allowed:\t"); ++ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); ++ prt_newline(out); ++ ++ prt_printf(out, "Incompatible features in use:\t"); ++ bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); ++ prt_newline(out); ++ + prt_printf(out, "Version upgrade complete:\t"); + bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); + prt_newline(out); +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +index fadd364e2802..f1ab4f943720 100644 +--- a/fs/bcachefs/super-io.h ++++ b/fs/bcachefs/super-io.h +@@ -10,14 +10,29 @@ + + #include + ++#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096 ++ + static inline bool bch2_version_compatible(u16 version) + { + return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && + version >= bcachefs_metadata_version_min; + } + +-void bch2_version_to_text(struct printbuf *, unsigned); +-unsigned bch2_latest_compatible_version(unsigned); ++void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); ++enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); ++ ++void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); ++ ++static inline bool bch2_request_incompat_feature(struct bch_fs *c, ++ enum bcachefs_metadata_version version) ++{ ++ if (unlikely(version > c->sb.version_incompat)) { ++ if (version > c->sb.version_incompat_allowed) ++ return false; ++ bch2_set_version_incompat(c, version); ++ } ++ return true; ++} + + static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) + { +@@ -92,7 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) + } + + bool bch2_check_version_downgrade(struct bch_fs *); +-void bch2_sb_upgrade(struct bch_fs *, unsigned); ++void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); + + void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); +diff --git 
a/fs/bcachefs/super.c b/fs/bcachefs/super.c +index a6ed9a0bf1c7..d97ea7bd1171 100644 +--- a/fs/bcachefs/super.c ++++ b/fs/bcachefs/super.c +@@ -290,7 +290,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) + + bch2_fs_journal_stop(&c->journal); + +- bch_info(c, "%sshutdown complete, journal seq %llu", ++ bch_info(c, "%sclean shutdown complete, journal seq %llu", + test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", + c->journal.seq_ondisk); + +@@ -441,6 +441,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) + { + int ret; + ++ BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); ++ + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { + bch_err(c, "cannot go rw, unfixed btree errors"); + return -BCH_ERR_erofs_unfixed_errors; +@@ -561,6 +563,7 @@ static void __bch2_fs_free(struct bch_fs *c) + bch2_io_clock_exit(&c->io_clock[WRITE]); + bch2_io_clock_exit(&c->io_clock[READ]); + bch2_fs_compress_exit(c); ++ bch2_fs_btree_gc_exit(c); + bch2_journal_keys_put_initial(c); + bch2_find_btree_nodes_exit(&c->found_btree_nodes); + BUG_ON(atomic_read(&c->journal_keys.ref)); +@@ -584,7 +587,6 @@ static void __bch2_fs_free(struct bch_fs *c) + #endif + kfree(rcu_dereference_protected(c->disk_groups, 1)); + kfree(c->journal_seq_blacklist_table); +- kfree(c->unused_inode_hints); + + if (c->write_ref_wq) + destroy_workqueue(c->write_ref_wq); +@@ -766,21 +768,17 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + + refcount_set(&c->ro_ref, 1); + init_waitqueue_head(&c->ro_ref_wait); ++ spin_lock_init(&c->recovery_pass_lock); + sema_init(&c->online_fsck_mutex, 1); + +- init_rwsem(&c->gc_lock); +- mutex_init(&c->gc_gens_lock); +- atomic_set(&c->journal_keys.ref, 1); +- c->journal_keys.initial_ref_held = true; +- + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_init(&c->times[i]); + +- bch2_fs_gc_init(c); + bch2_fs_copygc_init(c); + bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_btree_iter_init_early(c); + 
bch2_fs_btree_interior_update_init_early(c); ++ bch2_fs_journal_keys_init(c); + bch2_fs_allocator_background_init(c); + bch2_fs_allocator_foreground_init(c); + bch2_fs_rebalance_init(c); +@@ -809,9 +807,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + INIT_LIST_HEAD(&c->vfs_inodes_list); + mutex_init(&c->vfs_inodes_lock); + +- c->copy_gc_enabled = 1; +- c->rebalance.enabled = 1; +- + c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; + c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; + c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; +@@ -873,8 +868,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + (btree_blocks(c) + 1) * 2 * + sizeof(struct sort_iter_set); + +- c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); +- + if (!(c->btree_update_wq = alloc_workqueue("bcachefs", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || + !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", +@@ -901,9 +894,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + !(c->online_reserved = alloc_percpu(u64)) || + mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, + c->opts.btree_node_size) || +- mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || +- !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, +- sizeof(u64), GFP_KERNEL))) { ++ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { + ret = -BCH_ERR_ENOMEM_fs_other_alloc; + goto err; + } +@@ -917,6 +908,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) + bch2_fs_btree_cache_init(c) ?: + bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: + bch2_fs_btree_interior_update_init(c) ?: ++ bch2_fs_btree_gc_init(c) ?: + bch2_fs_buckets_waiting_for_journal_init(c) ?: + bch2_fs_btree_write_buffer_init(c) ?: + bch2_fs_subvolumes_init(c) ?: +@@ -1033,9 +1025,12 @@ int bch2_fs_start(struct 
bch_fs *c) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + ++ c->recovery_task = current; + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) + ? bch2_fs_recovery(c) + : bch2_fs_initialize(c); ++ c->recovery_task = NULL; ++ + if (ret) + goto err; + +@@ -1120,12 +1115,12 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, + + prt_bdevname(&buf, fs->bdev); + prt_char(&buf, ' '); +- bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));; ++ bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time)); + prt_newline(&buf); + + prt_bdevname(&buf, sb->bdev); + prt_char(&buf, ' '); +- bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; ++ bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time)); + prt_newline(&buf); + + if (!opts->no_splitbrain_check) +@@ -1198,7 +1193,7 @@ static void bch2_dev_free(struct bch_dev *ca) + + free_percpu(ca->io_done); + bch2_dev_buckets_free(ca); +- free_page((unsigned long) ca->sb_read_scratch); ++ kfree(ca->sb_read_scratch); + + bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); + bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); +@@ -1309,8 +1304,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + init_completion(&ca->ref_completion); + init_completion(&ca->io_ref_completion); + +- init_rwsem(&ca->bucket_lock); +- + INIT_WORK(&ca->io_error_work, bch2_io_error_work); + + bch2_time_stats_quantiles_init(&ca->io_latency[READ]); +@@ -1337,7 +1330,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + + if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || +- !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || ++ !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || + bch2_dev_buckets_alloc(c, ca) || + !(ca->io_done = alloc_percpu(*ca->io_done))) + goto err; +@@ -1366,7 +1359,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) + { + struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); + 
struct bch_dev *ca = NULL; +- int ret = 0; + + if (bch2_fs_init_fault("dev_alloc")) + goto err; +@@ -1378,10 +1370,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) + ca->fs = c; + + bch2_dev_attach(c, ca, dev_idx); +- return ret; ++ return 0; + err: +- if (ca) +- bch2_dev_free(ca); + return -BCH_ERR_ENOMEM_dev_alloc; + } + +@@ -1751,11 +1741,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + if (ret) + goto err; + +- ret = bch2_dev_journal_alloc(ca, true); +- bch_err_msg(c, ret, "allocating journal"); +- if (ret) +- goto err; +- + down_write(&c->state_lock); + mutex_lock(&c->sb_lock); + +@@ -1806,11 +1791,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path) + if (ret) + goto err_late; + +- ca->new_fs_bucket_idx = 0; +- + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); + ++ ret = bch2_dev_journal_alloc(ca, false); ++ bch_err_msg(c, ret, "allocating journal"); ++ if (ret) ++ goto err_late; ++ + up_write(&c->state_lock); + return 0; + +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +index dada09331d2e..fa6d52216510 100644 +--- a/fs/bcachefs/super.h ++++ b/fs/bcachefs/super.h +@@ -34,16 +34,6 @@ void bch2_fs_read_only(struct bch_fs *); + int bch2_fs_read_write(struct bch_fs *); + int bch2_fs_read_write_early(struct bch_fs *); + +-/* +- * Only for use in the recovery/fsck path: +- */ +-static inline void bch2_fs_lazy_rw(struct bch_fs *c) +-{ +- if (!test_bit(BCH_FS_rw, &c->flags) && +- !test_bit(BCH_FS_was_rw, &c->flags)) +- bch2_fs_read_write_early(c); +-} +- + void __bch2_fs_stop(struct bch_fs *); + void bch2_fs_free(struct bch_fs *); + void bch2_fs_stop(struct bch_fs *); +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +index 03e59f86f360..a7eb1f511484 100644 +--- a/fs/bcachefs/sysfs.c ++++ b/fs/bcachefs/sysfs.c +@@ -146,7 +146,7 @@ write_attribute(trigger_journal_writes); + write_attribute(trigger_btree_cache_shrink); + write_attribute(trigger_btree_key_cache_shrink); + 
write_attribute(trigger_freelist_wakeup); +-rw_attribute(gc_gens_pos); ++read_attribute(gc_gens_pos); + + read_attribute(uuid); + read_attribute(minor); +@@ -203,7 +203,6 @@ read_attribute(disk_groups); + + read_attribute(has_data); + read_attribute(alloc_debug); +-read_attribute(accounting); + read_attribute(usage_base); + + #define x(t, n, ...) read_attribute(t); +@@ -211,12 +210,11 @@ BCH_PERSISTENT_COUNTERS() + #undef x + + rw_attribute(discard); ++read_attribute(state); + rw_attribute(label); + +-rw_attribute(copy_gc_enabled); + read_attribute(copy_gc_wait); + +-rw_attribute(rebalance_enabled); + sysfs_pd_controller_attribute(rebalance); + read_attribute(rebalance_status); + +@@ -237,11 +235,6 @@ write_attribute(perf_test); + BCH_TIME_STATS() + #undef x + +-static struct attribute sysfs_state_rw = { +- .name = "state", +- .mode = 0444, +-}; +- + static size_t bch2_btree_cache_size(struct bch_fs *c) + { + struct btree_cache *bc = &c->btree_cache; +@@ -302,7 +295,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c + + static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) + { +- prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree)); ++ bch2_btree_id_to_text(out, c->gc_gens_btree); ++ prt_printf(out, ": "); + bch2_bpos_to_text(out, c->gc_gens_pos); + prt_printf(out, "\n"); + } +@@ -339,9 +333,6 @@ SHOW(bch2_fs) + if (attr == &sysfs_gc_gens_pos) + bch2_gc_gens_pos_to_text(out, c); + +- sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); +- +- sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); + sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ + + if (attr == &sysfs_copy_gc_wait) +@@ -405,9 +396,6 @@ SHOW(bch2_fs) + if (attr == &sysfs_alloc_debug) + bch2_fs_alloc_debug_to_text(out, c); + +- if (attr == &sysfs_accounting) +- bch2_fs_accounting_to_text(out, c); +- + if (attr == &sysfs_usage_base) + bch2_fs_usage_base_to_text(out, c); + +@@ -418,23 +406,6 @@ STORE(bch2_fs) + 
{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + +- if (attr == &sysfs_copy_gc_enabled) { +- ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) +- ?: (ssize_t) size; +- +- if (c->copygc_thread) +- wake_up_process(c->copygc_thread); +- return ret; +- } +- +- if (attr == &sysfs_rebalance_enabled) { +- ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) +- ?: (ssize_t) size; +- +- rebalance_wakeup(c); +- return ret; +- } +- + sysfs_pd_controller_store(rebalance, &c->rebalance.pd); + + /* Debugging: */ +@@ -534,15 +505,22 @@ SHOW(bch2_fs_counters) + + printbuf_tabstop_push(out, 32); + +- #define x(t, ...) \ ++ #define x(t, n, f, ...) \ + if (attr == &sysfs_##t) { \ + counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ + counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ ++ if (f & TYPE_SECTORS) { \ ++ counter <<= 9; \ ++ counter_since_mount <<= 9; \ ++ } \ ++ \ + prt_printf(out, "since mount:\t"); \ ++ (f & TYPE_COUNTER) ? prt_u64(out, counter_since_mount) :\ + prt_human_readable_u64(out, counter_since_mount); \ + prt_newline(out); \ + \ + prt_printf(out, "since filesystem creation:\t"); \ ++ (f & TYPE_COUNTER) ? 
prt_u64(out, counter) : \ + prt_human_readable_u64(out, counter); \ + prt_newline(out); \ + } +@@ -610,10 +588,8 @@ struct attribute *bch2_fs_internal_files[] = { + + &sysfs_gc_gens_pos, + +- &sysfs_copy_gc_enabled, + &sysfs_copy_gc_wait, + +- &sysfs_rebalance_enabled, + sysfs_pd_controller_files(rebalance), + + &sysfs_moving_ctxts, +@@ -622,7 +598,6 @@ struct attribute *bch2_fs_internal_files[] = { + + &sysfs_disk_groups, + &sysfs_alloc_debug, +- &sysfs_accounting, + &sysfs_usage_base, + NULL + }; +@@ -682,6 +657,13 @@ STORE(bch2_fs_opts_dir) + (id == Opt_compression && !c->opts.background_compression))) + bch2_set_rebalance_needs_scan(c, 0); + ++ if (v && id == Opt_rebalance_enabled) ++ rebalance_wakeup(c); ++ ++ if (v && id == Opt_copygc_enabled && ++ c->copygc_thread) ++ wake_up_process(c->copygc_thread); ++ + ret = size; + err: + bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); +@@ -790,7 +772,7 @@ SHOW(bch2_dev) + prt_char(out, '\n'); + } + +- if (attr == &sysfs_state_rw) { ++ if (attr == &sysfs_state) { + prt_string_option(out, bch2_member_states, ca->mi.state); + prt_char(out, '\n'); + } +@@ -870,7 +852,7 @@ struct attribute *bch2_dev_files[] = { + + /* settings: */ + &sysfs_discard, +- &sysfs_state_rw, ++ &sysfs_state, + &sysfs_label, + + &sysfs_has_data, +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +index fb5c1543e52f..6c6469814637 100644 +--- a/fs/bcachefs/tests.c ++++ b/fs/bcachefs/tests.c +@@ -131,7 +131,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) + i = 0; + + ret = bch2_trans_run(c, +- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, ++ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(k.k->p.offset != i++); +@@ -186,7 +186,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) + i = 0; + + ret = bch2_trans_run(c, +- for_each_btree_key_upto(trans, iter, BTREE_ID_extents, ++ for_each_btree_key_max(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, 
U64_MAX), + 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i); +@@ -242,7 +242,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + i = 0; + + ret = bch2_trans_run(c, +- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, ++ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(k.k->p.offset != i); +@@ -259,7 +259,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) + i = 0; + + ret = bch2_trans_run(c, +- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, ++ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_slots, k, ({ + if (i >= nr * 2) +@@ -302,7 +302,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + i = 0; + + ret = bch2_trans_run(c, +- for_each_btree_key_upto(trans, iter, BTREE_ID_extents, ++ for_each_btree_key_max(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i + 8); +@@ -320,7 +320,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) + i = 0; + + ret = bch2_trans_run(c, +- for_each_btree_key_upto(trans, iter, BTREE_ID_extents, ++ for_each_btree_key_max(trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_slots, k, ({ + if (i == nr) +@@ -349,10 +349,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); + +- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + +- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + + bch2_trans_iter_exit(trans, &iter); +@@ -369,10 +369,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 
nr) + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0); + +- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + +- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + + bch2_trans_iter_exit(trans, &iter); +@@ -488,7 +488,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) + trans = bch2_trans_get(c); + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, snapid_lo), 0); +- lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); ++ lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + + BUG_ON(k.k->p.snapshot != U32_MAX); + +@@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) + + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, + BTREE_ITER_intent); +- k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)); ++ k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)); + ret = bkey_err(k); + if (ret) + goto err; +@@ -726,7 +726,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) + static int seq_lookup(struct bch_fs *c, u64 nr) + { + return bch2_trans_run(c, +- for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, ++ for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, + 0)); +diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h +index 5597b9d6297f..56a5a7fbc0fd 100644 +--- a/fs/bcachefs/trace.h ++++ b/fs/bcachefs/trace.h +@@ -199,6 +199,30 @@ DECLARE_EVENT_CLASS(bio, + (unsigned long long)__entry->sector, __entry->nr_sector) + ); + ++/* disk_accounting.c */ ++ ++TRACE_EVENT(accounting_mem_insert, ++ TP_PROTO(struct bch_fs *c, const char *acc), ++ TP_ARGS(c, 
acc), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(unsigned, new_nr ) ++ __string(acc, acc ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = c->dev; ++ __entry->new_nr = c->accounting.k.nr; ++ __assign_str(acc); ++ ), ++ ++ TP_printk("%d,%d entries %u added %s", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->new_nr, ++ __get_str(acc)) ++); ++ + /* fs.c: */ + TRACE_EVENT(bch2_sync_fs, + TP_PROTO(struct super_block *sb, int wait), +@@ -848,8 +872,8 @@ TRACE_EVENT(move_data, + TRACE_EVENT(evacuate_bucket, + TP_PROTO(struct bch_fs *c, struct bpos *bucket, + unsigned sectors, unsigned bucket_size, +- u64 fragmentation, int ret), +- TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret), ++ int ret), ++ TP_ARGS(c, bucket, sectors, bucket_size, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) +@@ -857,7 +881,6 @@ TRACE_EVENT(evacuate_bucket, + __field(u64, bucket ) + __field(u32, sectors ) + __field(u32, bucket_size ) +- __field(u64, fragmentation ) + __field(int, ret ) + ), + +@@ -867,45 +890,42 @@ TRACE_EVENT(evacuate_bucket, + __entry->bucket = bucket->offset; + __entry->sectors = sectors; + __entry->bucket_size = bucket_size; +- __entry->fragmentation = fragmentation; + __entry->ret = ret; + ), + +- TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i", ++ TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->member, __entry->bucket, + __entry->sectors, __entry->bucket_size, +- __entry->fragmentation, __entry->ret) ++ __entry->ret) + ); + + TRACE_EVENT(copygc, + TP_PROTO(struct bch_fs *c, +- u64 sectors_moved, u64 sectors_not_moved, +- u64 buckets_moved, u64 buckets_not_moved), +- TP_ARGS(c, +- sectors_moved, sectors_not_moved, +- buckets_moved, buckets_not_moved), ++ u64 buckets, ++ u64 sectors_seen, ++ u64 sectors_moved), ++ TP_ARGS(c, buckets, sectors_seen, sectors_moved), + + TP_STRUCT__entry( + __field(dev_t, dev ) ++ __field(u64, buckets ) ++ __field(u64, sectors_seen 
) + __field(u64, sectors_moved ) +- __field(u64, sectors_not_moved ) +- __field(u64, buckets_moved ) +- __field(u64, buckets_not_moved ) + ), + + TP_fast_assign( + __entry->dev = c->dev; ++ __entry->buckets = buckets; ++ __entry->sectors_seen = sectors_seen; + __entry->sectors_moved = sectors_moved; +- __entry->sectors_not_moved = sectors_not_moved; +- __entry->buckets_moved = buckets_moved; +- __entry->buckets_not_moved = buckets_moved; + ), + +- TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", ++ TP_printk("%d,%d buckets %llu sectors seen %llu moved %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), +- __entry->sectors_moved, __entry->sectors_not_moved, +- __entry->buckets_moved, __entry->buckets_not_moved) ++ __entry->buckets, ++ __entry->sectors_seen, ++ __entry->sectors_moved) + ); + + TRACE_EVENT(copygc_wait, +@@ -1316,6 +1336,12 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, + __entry->new_u64s) + ); + ++DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, ++ TP_PROTO(struct btree_trans *trans, ++ unsigned long caller_ip), ++ TP_ARGS(trans, caller_ip) ++); ++ + TRACE_EVENT(path_downgrade, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, +@@ -1352,10 +1378,21 @@ TRACE_EVENT(path_downgrade, + __entry->pos_snapshot) + ); + +-DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, +- TP_PROTO(struct btree_trans *trans, +- unsigned long caller_ip), +- TP_ARGS(trans, caller_ip) ++TRACE_EVENT(key_cache_fill, ++ TP_PROTO(struct btree_trans *trans, const char *key), ++ TP_ARGS(trans, key), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 32 ) ++ __string(key, key ) ++ ), ++ ++ TP_fast_assign( ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ __assign_str(key); ++ ), ++ ++ TP_printk("%s %s", __entry->trans_fn, __get_str(key)) + ); + + TRACE_EVENT(write_buffer_flush, +@@ -1414,6 +1451,24 @@ TRACE_EVENT(write_buffer_flush_slowpath, + TP_printk("%zu/%zu", 
__entry->slowpath, __entry->total) + ); + ++TRACE_EVENT(write_buffer_maybe_flush, ++ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key), ++ TP_ARGS(trans, caller_ip, key), ++ ++ TP_STRUCT__entry( ++ __array(char, trans_fn, 32 ) ++ __field(unsigned long, caller_ip ) ++ __string(key, key ) ++ ), ++ ++ TP_fast_assign( ++ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); ++ __assign_str(key); ++ ), ++ ++ TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key)) ++); ++ + DEFINE_EVENT(fs_str, rebalance_extent, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +index fb02c1c36004..1a1720116071 100644 +--- a/fs/bcachefs/util.h ++++ b/fs/bcachefs/util.h +@@ -55,6 +55,16 @@ static inline size_t buf_pages(void *p, size_t len) + PAGE_SIZE); + } + ++static inline void *bch2_kvmalloc(size_t n, gfp_t flags) ++{ ++ void *p = unlikely(n >= INT_MAX) ++ ? vmalloc(n) ++ : kvmalloc(n, flags & ~__GFP_ZERO); ++ if (p && (flags & __GFP_ZERO)) ++ memset(p, 0, n); ++ return p; ++} ++ + #define init_heap(heap, _size, gfp) \ + ({ \ + (heap)->nr = 0; \ +@@ -317,6 +327,19 @@ do { \ + _ptr ? 
container_of(_ptr, type, member) : NULL; \ + }) + ++static inline struct list_head *list_pop(struct list_head *head) ++{ ++ if (list_empty(head)) ++ return NULL; ++ ++ struct list_head *ret = head->next; ++ list_del_init(ret); ++ return ret; ++} ++ ++#define list_pop_entry(head, type, member) \ ++ container_of_or_null(list_pop(head), type, member) ++ + /* Does linear interpolation between powers of two */ + static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) + { +@@ -696,4 +719,13 @@ static inline bool test_bit_le64(size_t bit, __le64 *addr) + return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; + } + ++static inline void memcpy_swab(void *_dst, void *_src, size_t len) ++{ ++ u8 *dst = _dst + len; ++ u8 *src = _src; ++ ++ while (len--) ++ *--dst = *src++; ++} ++ + #endif /* _BCACHEFS_UTIL_H */ +diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c +index 6a78553d9b0c..6620ecae26af 100644 +--- a/fs/bcachefs/varint.c ++++ b/fs/bcachefs/varint.c +@@ -9,6 +9,7 @@ + #include + #endif + ++#include "errcode.h" + #include "varint.h" + + /** +@@ -53,7 +54,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) + u64 v; + + if (unlikely(in + bytes > end)) +- return -1; ++ return -BCH_ERR_varint_decode_error; + + if (likely(bytes < 9)) { + __le64 v_le = 0; +@@ -115,7 +116,7 @@ int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) + unsigned bytes = ffz(*in) + 1; + + if (unlikely(in + bytes > end)) +- return -1; ++ return -BCH_ERR_varint_decode_error; + + if (likely(bytes < 9)) { + v >>= bytes; +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +index 952aca400faf..aed7c6984173 100644 +--- a/fs/bcachefs/xattr.c ++++ b/fs/bcachefs/xattr.c +@@ -71,7 +71,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { + }; + + int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, +- enum bch_validate_flags flags) ++ struct bkey_validate_context from) + { + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); + unsigned 
val_u64s = xattr_val_u64s(xattr.v->x_name_len, +@@ -309,7 +309,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) + u64 offset = 0, inum = inode->ei_inode.bi_inum; + + int ret = bch2_trans_run(c, +- for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs, ++ for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs, + POS(inum, offset), + POS(inum, U64_MAX), + inode->ei_inum.subvol, 0, k, ({ +@@ -565,13 +565,6 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, + ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); + err: + mutex_unlock(&inode->ei_update_lock); +- +- if (value && +- (opt_id == Opt_background_target || +- opt_id == Opt_background_compression || +- (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) +- bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); +- + err_class_exit: + return bch2_err_class(ret); + } +@@ -609,7 +602,7 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { + + #endif /* NO_BCACHEFS_FS */ + +-const struct xattr_handler *bch2_xattr_handlers[] = { ++const struct xattr_handler * const bch2_xattr_handlers[] = { + &bch_xattr_user_handler, + &bch_xattr_trusted_handler, + &bch_xattr_security_handler, +diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h +index c188a5ad64ce..132fbbd15a66 100644 +--- a/fs/bcachefs/xattr.h ++++ b/fs/bcachefs/xattr.h +@@ -6,7 +6,8 @@ + + extern const struct bch_hash_desc bch2_xattr_hash_desc; + +-int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); ++int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, ++ struct bkey_validate_context); + void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + + #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ +@@ -44,6 +45,6 @@ int bch2_xattr_set(struct btree_trans *, subvol_inum, + + ssize_t bch2_xattr_list(struct dentry *, char *, size_t); + +-extern const 
struct xattr_handler *bch2_xattr_handlers[]; ++extern const struct xattr_handler * const bch2_xattr_handlers[]; + + #endif /* _BCACHEFS_XATTR_H */ +diff --git a/fs/fs_parser.c b/fs/fs_parser.c +index 24727ec34e5a..6521e9a9d6ef 100644 +--- a/fs/fs_parser.c ++++ b/fs/fs_parser.c +@@ -13,7 +13,7 @@ + #include + #include "internal.h" + +-static const struct constant_table bool_names[] = { ++const struct constant_table bool_names[] = { + { "0", false }, + { "1", true }, + { "false", false }, +@@ -22,6 +22,7 @@ static const struct constant_table bool_names[] = { + { "yes", true }, + { }, + }; ++EXPORT_SYMBOL(bool_names); + + static const struct constant_table * + __lookup_constant(const struct constant_table *tbl, const char *name) +diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h +index 6cf713a7e6c6..0974cd33bcba 100644 +--- a/include/linux/fs_parser.h ++++ b/include/linux/fs_parser.h +@@ -83,6 +83,8 @@ extern int fs_lookup_param(struct fs_context *fc, + + extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); + ++extern const struct constant_table bool_names[]; ++ + #ifdef CONFIG_VALIDATE_FS_PARSER + extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, + int low, int high, int special); +diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h +index 43a7b9dcf15e..fe17b4828171 100644 +--- a/include/linux/min_heap.h ++++ b/include/linux/min_heap.h +@@ -15,8 +15,8 @@ + */ + #define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \ + struct _name { \ +- int nr; \ +- int size; \ ++ size_t nr; \ ++ size_t size; \ + _type *data; \ + _type preallocated[_nr]; \ + } +-- +2.45.3 +