From f6e4cd2ef7c6ef7c3cf5a730f7ded9c48ccf689d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Sat, 5 Oct 2024 17:49:58 -0400
Subject: [PATCH] Update bcachefs sources to 895d5e67afdc bcachefs: Check if
 stuck in journal_res_get()

---
 .bcachefs_revision                    |   2 +-
 Makefile.compiler                     |  15 +
 include/linux/closure.h               |  35 ++
 libbcachefs/alloc_background.c        |  30 +-
 libbcachefs/alloc_background_format.h |   2 +-
 libbcachefs/bcachefs_format.h         |   3 +-
 libbcachefs/btree_gc.c                |   3 -
 libbcachefs/btree_iter.c              |   6 +-
 libbcachefs/btree_iter.h              |   8 +
 libbcachefs/btree_node_scan.c         |   3 +
 libbcachefs/btree_trans_commit.c      |   3 +-
 libbcachefs/btree_update.c            |   2 +-
 libbcachefs/chardev.c                 |   1 -
 libbcachefs/ec.c                      |   2 +-
 libbcachefs/error.c                   |  23 +-
 libbcachefs/error.h                   |   9 +-
 libbcachefs/fs-io-buffered.c          |   8 +-
 libbcachefs/fs-io-buffered.h          |   6 +-
 libbcachefs/fs.c                      |  66 ++-
 libbcachefs/fs.h                      |   9 +-
 libbcachefs/fsck.c                    | 559 ++++++++++++++++----------
 libbcachefs/fsck.h                    |   1 +
 libbcachefs/inode.c                   | 318 ++++++++++++---
 libbcachefs/inode.h                   |  38 +-
 libbcachefs/inode_format.h            |   3 +-
 libbcachefs/io_misc.c                 |  63 ++-
 libbcachefs/journal.c                 |  13 +
 libbcachefs/logged_ops.c              |  16 +-
 libbcachefs/logged_ops.h              |   2 +-
 libbcachefs/lru.c                     |  34 +-
 libbcachefs/move.c                    |   2 +-
 libbcachefs/movinggc.c                |  12 +-
 libbcachefs/rcu_pending.c             |   2 -
 libbcachefs/recovery_passes_types.h   |   1 +
 libbcachefs/replicas.c                |  34 +-
 libbcachefs/sb-downgrade.c            |   5 +-
 libbcachefs/sb-errors_format.h        |  35 +-
 libbcachefs/six.c                     |   2 +-
 libbcachefs/snapshot.c                |  97 -----
 libbcachefs/snapshot.h                |   3 -
 libbcachefs/subvolume.c               |  16 +-
 libbcachefs/subvolume.h               |   2 +
 libbcachefs/super.c                   |   4 +-
 libbcachefs/thread_with_file.c        |   2 -
 libbcachefs/util.c                    |   2 +-
 linux/closure.c                       |   2 +-
 46 files changed, 970 insertions(+), 534 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 8a684e22..417d5b15 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-438696e03da7c2d1678bdcce98939ba642cbb467
+895d5e67afdccd7c9262630f5e7972721885284b
diff --git a/Makefile.compiler b/Makefile.compiler
index 92be0c9a..057305ea 100644
--- a/Makefile.compiler
+++ b/Makefile.compiler
@@ -72,3 +72,18 @@ clang-min-version = $(call test-ge, $(CONFIG_CLANG_VERSION), $1)
 # ld-option
 # Usage: KBUILD_LDFLAGS += $(call ld-option, -X, -Y)
 ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3))
+
+# __rustc-option
+# Usage: MY_RUSTFLAGS += $(call __rustc-option,$(RUSTC),$(MY_RUSTFLAGS),-Cinstrument-coverage,-Zinstrument-coverage)
+__rustc-option = $(call try-run,\
+	$(1) $(2) $(3) --crate-type=rlib /dev/null --out-dir=$$TMPOUT -o "$$TMP",$(3),$(4))
+
+# rustc-option
+# Usage: rustflags-y += $(call rustc-option,-Cinstrument-coverage,-Zinstrument-coverage)
+rustc-option = $(call __rustc-option, $(RUSTC),\
+	$(KBUILD_RUSTFLAGS),$(1),$(2))
+
+# rustc-option-yn
+# Usage: flag := $(call rustc-option-yn,-Cinstrument-coverage)
+rustc-option-yn = $(call try-run,\
+	$(RUSTC) $(KBUILD_RUSTFLAGS) $(1) --crate-type=rlib /dev/null --out-dir=$$TMPOUT -o "$$TMP",y,n)
diff --git a/include/linux/closure.h b/include/linux/closure.h
index 2af44427..c24d4aaf 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -454,4 +454,39 @@ do {									\
 		__closure_wait_event(waitlist, _cond);			\
 } while (0)
 
+#define __closure_wait_event_timeout(waitlist, _cond, _until)		\
+({									\
+	struct closure cl;						\
+	long _t;							\
+									\
+	closure_init_stack(&cl);					\
+									\
+	while (1) {							\
+		closure_wait(waitlist, &cl);				\
+		if (_cond) {						\
+			_t = max(1, _until - jiffies);			\
+			break;						\
+		}							\
+		_t = max(0, _until - jiffies);				\
+		if (!_t)						\
+			break;						\
+		closure_sync_timeout(&cl, _t);				\
+	}								\
+	closure_wake_up(waitlist);					\
+	closure_sync(&cl);						\
+	_t;								\
+})
+
+/*
+ * Returns 0 if timeout expired, remaining time in jiffies (at least 1) if
+ * condition became true
+ */
+#define closure_wait_event_timeout(waitlist, _cond, _timeout)		\
+({									\
+	unsigned long _until = jiffies + _timeout;			\
+	(_cond)								\
+		? max(1, _until - jiffies)				\
+		: __closure_wait_event_timeout(waitlist, _cond, _until);\
+})
+
 #endif /* _LINUX_CLOSURE_H */
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 645b5ed4..4e4a448f 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -332,7 +332,6 @@ void bch2_alloc_v4_swab(struct bkey_s k)
 	a->io_time[1]		= swab64(a->io_time[1]);
 	a->stripe		= swab32(a->stripe);
 	a->nr_external_backpointers = swab32(a->nr_external_backpointers);
-	a->fragmentation_lru	= swab64(a->fragmentation_lru);
 	a->stripe_sectors	= swab32(a->stripe_sectors);
 
 	bps = alloc_v4_backpointers(a);
@@ -347,6 +346,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
 {
 	struct bch_alloc_v4 _a;
 	const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
+	struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL;
 
 	prt_newline(out);
 	printbuf_indent_add(out, 2);
@@ -364,9 +364,13 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
 	prt_printf(out, "stripe_redundancy %u\n",	a->stripe_redundancy);
 	prt_printf(out, "io_time[READ]     %llu\n",	a->io_time[READ]);
 	prt_printf(out, "io_time[WRITE]    %llu\n",	a->io_time[WRITE]);
-	prt_printf(out, "fragmentation     %llu\n",	a->fragmentation_lru);
+
+	if (ca)
+		prt_printf(out, "fragmentation     %llu\n",	alloc_lru_idx_fragmentation(*a, ca));
 	prt_printf(out, "bp_start          %llu\n", BCH_ALLOC_V4_BACKPOINTERS_START(a));
 	printbuf_indent_sub(out, 2);
+
+	bch2_dev_put(ca);
 }
 
 void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out)
@@ -882,12 +886,13 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 				goto err;
 		}
 
-		new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, ca);
-		if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
+		old_lru = alloc_lru_idx_fragmentation(*old_a, ca);
+		new_lru = alloc_lru_idx_fragmentation(*new_a, ca);
+		if (old_lru != new_lru) {
 			ret = bch2_lru_change(trans,
 					BCH_LRU_FRAGMENTATION_START,
 					bucket_to_u64(new.k->p),
-					old_a->fragmentation_lru, new_a->fragmentation_lru);
+					old_lru, new_lru);
 			if (ret)
 				goto err;
 		}
@@ -1629,18 +1634,22 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
 	if (ret)
 		return ret;
 
+	struct bch_dev *ca = bch2_dev_tryget_noerror(c, alloc_k.k->p.inode);
+	if (!ca)
+		return 0;
+
 	a = bch2_alloc_to_v4(alloc_k, &a_convert);
 
-	if (a->fragmentation_lru) {
+	u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca);
+	if (lru_idx) {
 		ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START,
-					 a->fragmentation_lru,
-					 alloc_k, last_flushed);
+					 lru_idx, alloc_k, last_flushed);
 		if (ret)
-			return ret;
+			goto err;
 	}
 
 	if (a->data_type != BCH_DATA_cached)
-		return 0;
+		goto err;
 
 	if (fsck_err_on(!a->io_time[READ],
 			trans, alloc_key_cached_but_read_time_zero,
@@ -1669,6 +1678,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
 		goto err;
 err:
 fsck_err:
+	bch2_dev_put(ca);
 	printbuf_exit(&buf);
 	return ret;
 }
diff --git a/libbcachefs/alloc_background_format.h b/libbcachefs/alloc_background_format.h
index f754a295..befdaa95 100644
--- a/libbcachefs/alloc_background_format.h
+++ b/libbcachefs/alloc_background_format.h
@@ -70,7 +70,7 @@ struct bch_alloc_v4 {
 	__u32			stripe;
 	__u32			nr_external_backpointers;
 	/* end of fields in original version of alloc_v4 */
-	__u64			fragmentation_lru;
+	__u64			_fragmentation_lru; /* obsolete */
 	__u32			stripe_sectors;
 	__u32			pad;
 } __packed __aligned(8);
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 84832c2d..5004f6ba 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -678,7 +678,8 @@ struct bch_sb_field_ext {
 	x(disk_accounting_v2,		BCH_VERSION(1,  9))		\
 	x(disk_accounting_v3,		BCH_VERSION(1, 10))		\
 	x(disk_accounting_inum,		BCH_VERSION(1, 11))		\
-	x(rebalance_work_acct_fix,	BCH_VERSION(1, 12))
+	x(rebalance_work_acct_fix,	BCH_VERSION(1, 12))		\
+	x(inode_has_child_snapshots,	BCH_VERSION(1, 13))
 
 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 660d2fa0..771154e3 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -828,8 +828,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 			return ret;
 	}
 
-	gc.fragmentation_lru = alloc_lru_idx_fragmentation(gc, ca);
-
 	if (fsck_err_on(new.data_type != gc.data_type,
 			trans, alloc_key_data_type_wrong,
 			"bucket %llu:%llu gen %u has wrong data_type"
@@ -857,7 +855,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 	copy_bucket_field(alloc_key_cached_sectors_wrong,	cached_sectors);
 	copy_bucket_field(alloc_key_stripe_wrong,		stripe);
 	copy_bucket_field(alloc_key_stripe_redundancy_wrong,	stripe_redundancy);
-	copy_bucket_field(alloc_key_fragmentation_lru_wrong,	fragmentation_lru);
 #undef copy_bucket_field
 
 	if (!bch2_alloc_v4_cmp(*old, new))
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index bfe9f0c1..0883cf6e 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -2381,9 +2381,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
 		else
 			iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
 
-		if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
-			     ? bkey_gt(iter_pos, end)
-			     : bkey_ge(iter_pos, end)))
+		if (unlikely(iter->flags & BTREE_ITER_all_snapshots	? bpos_gt(iter_pos, end) :
+			     iter->flags & BTREE_ITER_is_extents	? bkey_ge(iter_pos, end) :
+									  bkey_gt(iter_pos, end)))
 			goto end;
 
 		break;
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 78e63ad7..31a58bf4 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -857,6 +857,14 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
 	for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
 					  SPOS_MAX, _flags, _k, _ret)
 
+#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id,	\
+					     _start, _flags, _k, _ret)	\
+	for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id),	\
+				  (_start), (_flags));			\
+	     (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags),	\
+	     !((_ret) = bkey_err(_k)) && (_k).k;			\
+	     bch2_btree_iter_rewind(&(_iter)))
+
 #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret)	\
 	for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
 
diff --git a/libbcachefs/btree_node_scan.c b/libbcachefs/btree_node_scan.c
index 1e694fed..a7aedb13 100644
--- a/libbcachefs/btree_node_scan.c
+++ b/libbcachefs/btree_node_scan.c
@@ -171,6 +171,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
 	if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
 		return;
 
+	if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
+		return;
+
 	rcu_read_lock();
 	struct found_btree_node n = {
 		.btree_id	= BTREE_NODE_ID(bn),
diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c
index 1a74a1a2..9bf471fa 100644
--- a/libbcachefs/btree_trans_commit.c
+++ b/libbcachefs/btree_trans_commit.c
@@ -832,7 +832,8 @@ revert_fs_usage:
 	for (struct jset_entry *entry2 = trans->journal_entries;
 	     entry2 != entry;
 	     entry2 = vstruct_next(entry2))
-		if (jset_entry_is_key(entry2) && entry2->start->k.type == KEY_TYPE_accounting) {
+		if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys &&
+		    entry2->start->k.type == KEY_TYPE_accounting) {
 			struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start);
 
 			bch2_accounting_neg(a);
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c
index 514df618..39fc7778 100644
--- a/libbcachefs/btree_update.c
+++ b/libbcachefs/btree_update.c
@@ -144,7 +144,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans,
 	       !(ret = bkey_err(old_k)) &&
 	       bkey_eq(old_pos, old_k.k->p)) {
 		struct bpos whiteout_pos =
-			SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);;
+			SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);
 
 		if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) ||
 		    snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot))
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index ef1f7486..cbfd88f9 100644
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -471,7 +471,6 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
 static const struct file_operations bcachefs_data_ops = {
 	.release	= bch2_data_job_release,
 	.read		= bch2_data_job_read,
-	.llseek		= no_llseek,
 };
 
 static long bch2_ioctl_data(struct bch_fs *c,
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index 1587c6e1..f4fc4f08 100644
--- a/libbcachefs/ec.c
+++ b/libbcachefs/ec.c
@@ -900,7 +900,7 @@ err:
 	bch2_bkey_val_to_text(&msgbuf, c, orig_k);
 	bch_err_ratelimited(c,
 			    "error doing reconstruct read: %s\n  %s", msg, msgbuf.buf);
-	printbuf_exit(&msgbuf);;
+	printbuf_exit(&msgbuf);
 	ret = -BCH_ERR_stripe_reconstruct;
 	goto out;
 }
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index 3a16b535..7a79f695 100644
--- a/libbcachefs/error.c
+++ b/libbcachefs/error.c
@@ -393,6 +393,14 @@ int __bch2_fsck_err(struct bch_fs *c,
 	     !(flags & FSCK_CAN_IGNORE)))
 		ret = -BCH_ERR_fsck_errors_not_fixed;
 
+	bool exiting =
+		test_bit(BCH_FS_fsck_running, &c->flags) &&
+		(ret != -BCH_ERR_fsck_fix &&
+		 ret != -BCH_ERR_fsck_ignore);
+
+	if (exiting)
+		print = true;
+
 	if (print) {
 		if (bch2_fs_stdio_redirect(c))
 			bch2_print(c, "%s\n", out->buf);
@@ -400,9 +408,7 @@ int __bch2_fsck_err(struct bch_fs *c,
 			bch2_print_string_as_lines(KERN_ERR, out->buf);
 	}
 
-	if (test_bit(BCH_FS_fsck_running, &c->flags) &&
-	    (ret != -BCH_ERR_fsck_fix &&
-	     ret != -BCH_ERR_fsck_ignore))
+	if (exiting)
 		bch_err(c, "Unable to continue, halting");
 	else if (suppressing)
 		bch_err(c, "Ratelimiting new instances of previous error");
@@ -430,10 +436,17 @@ err:
 
 int __bch2_bkey_fsck_err(struct bch_fs *c,
 			 struct bkey_s_c k,
-			 enum bch_fsck_flags flags,
+			 enum bch_validate_flags validate_flags,
 			 enum bch_sb_error_id err,
 			 const char *fmt, ...)
 {
+	if (validate_flags & BCH_VALIDATE_silent)
+		return -BCH_ERR_fsck_delete_bkey;
+
+	unsigned fsck_flags = 0;
+	if (!(validate_flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)))
+		fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX;
+
 	struct printbuf buf = PRINTBUF;
 	va_list args;
 
@@ -445,7 +458,7 @@ int __bch2_bkey_fsck_err(struct bch_fs *c,
 	va_end(args);
 	prt_str(&buf, ": delete?");
 
-	int ret = __bch2_fsck_err(c, NULL, flags, err, "%s", buf.buf);
+	int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf);
 	printbuf_exit(&buf);
 	return ret;
 }
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index 21ee7211..6551ada9 100644
--- a/libbcachefs/error.h
+++ b/libbcachefs/error.h
@@ -167,10 +167,11 @@ void bch2_flush_fsck_errs(struct bch_fs *);
 #define fsck_err_on(cond, c, _err_type, ...)				\
 	__fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__)
 
+enum bch_validate_flags;
 __printf(5, 6)
 int __bch2_bkey_fsck_err(struct bch_fs *,
 			 struct bkey_s_c,
-			 enum bch_fsck_flags,
+			 enum bch_validate_flags,
 			 enum bch_sb_error_id,
 			 const char *, ...);
 
@@ -180,11 +181,7 @@ int __bch2_bkey_fsck_err(struct bch_fs *,
  */
 #define bkey_fsck_err(c, _err_type, _err_msg, ...)			\
 do {									\
-	if ((flags & BCH_VALIDATE_silent)) {				\
-		ret = -BCH_ERR_fsck_delete_bkey;			\
-		goto fsck_err;						\
-	}								\
-	int _ret = __bch2_bkey_fsck_err(c, k, FSCK_CAN_FIX|FSCK_AUTOFIX,\
+	int _ret = __bch2_bkey_fsck_err(c, k, flags,			\
 				BCH_FSCK_ERR_##_err_type,		\
 				_err_msg, ##__VA_ARGS__);		\
 	if (_ret != -BCH_ERR_fsck_fix &&				\
diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c
index 99fef934..48a1ab9a 100644
--- a/libbcachefs/fs-io-buffered.c
+++ b/libbcachefs/fs-io-buffered.c
@@ -648,7 +648,7 @@ int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc
 
 int bch2_write_begin(struct file *file, struct address_space *mapping,
 		     loff_t pos, unsigned len,
-		     struct page **pagep, void **fsdata)
+		     struct folio **foliop, void **fsdata)
 {
 	struct bch_inode_info *inode = to_bch_ei(mapping->host);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
@@ -717,12 +717,11 @@ out:
 		goto err;
 	}
 
-	*pagep = &folio->page;
+	*foliop = folio;
 	return 0;
 err:
 	folio_unlock(folio);
 	folio_put(folio);
-	*pagep = NULL;
 err_unlock:
 	bch2_pagecache_add_put(inode);
 	kfree(res);
@@ -732,12 +731,11 @@ err_unlock:
 
 int bch2_write_end(struct file *file, struct address_space *mapping,
 		   loff_t pos, unsigned len, unsigned copied,
-		   struct page *page, void *fsdata)
+		   struct folio *folio, void *fsdata)
 {
 	struct bch_inode_info *inode = to_bch_ei(mapping->host);
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch2_folio_reservation *res = fsdata;
-	struct folio *folio = page_folio(page);
 	unsigned offset = pos - folio_pos(folio);
 
 	lockdep_assert_held(&inode->v.i_rwsem);
diff --git a/libbcachefs/fs-io-buffered.h b/libbcachefs/fs-io-buffered.h
index a6126ff7..3207ebbb 100644
--- a/libbcachefs/fs-io-buffered.h
+++ b/libbcachefs/fs-io-buffered.h
@@ -10,10 +10,10 @@ int bch2_read_folio(struct file *, struct folio *);
 int bch2_writepages(struct address_space *, struct writeback_control *);
 void bch2_readahead(struct readahead_control *);
 
-int bch2_write_begin(struct file *, struct address_space *, loff_t,
-		     unsigned, struct page **, void **);
+int bch2_write_begin(struct file *, struct address_space *, loff_t pos,
+		     unsigned len, struct folio **, void **);
 int bch2_write_end(struct file *, struct address_space *, loff_t,
-		   unsigned, unsigned, struct page *, void *);
+		   unsigned len, unsigned copied, struct folio *, void *);
 
 ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *);
 
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 1aee5baf..23cae92d 100644
--- a/libbcachefs/fs.c
+++ b/libbcachefs/fs.c
@@ -174,22 +174,45 @@ static const struct rhashtable_params bch2_vfs_inodes_params = {
 	.automatic_shrinking	= true,
 };
 
-static void __wait_on_freeing_inode(struct inode *inode)
-{
-	wait_queue_head_t *wq;
-	DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW);
-	wq = bit_waitqueue(&inode->i_state, __I_NEW);
-	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
-	spin_unlock(&inode->i_lock);
-	schedule();
-	finish_wait(wq, &wait.wq_entry);
-}
-
-struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
 {
 	return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
 }
 
+bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
+{
+	if (!test_bit(BCH_FS_started, &c->flags))
+		return false;
+
+	subvol_inum inum = {
+		.subvol = snapshot_t(c, p.snapshot)->subvol,
+		.inum	= p.offset,
+	};
+
+	/* snapshot tree interior node, can't safely delete while online (yet) */
+	if (!inum.subvol) {
+		bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
+		return true;
+	}
+
+	return __bch2_inode_hash_find(c, inum) != NULL;
+}
+
+static void __wait_on_freeing_inode(struct bch_fs *c,
+				    struct bch_inode_info *inode,
+				    subvol_inum inum)
+{
+	wait_queue_head_t *wq;
+	DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+	wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
+	prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+	spin_unlock(&inode->v.i_lock);
+
+	if (__bch2_inode_hash_find(c, inum) == inode)
+		schedule_timeout(HZ * 10);
+	finish_wait(wq, &wait.wq_entry);
+}
+
 static struct bch_inode_info *bch2_inode_hash_find(struct bch_fs *c, struct btree_trans *trans,
 						   subvol_inum inum)
 {
@@ -204,10 +227,10 @@ repeat:
 		}
 		if ((inode->v.i_state & (I_FREEING|I_WILL_FREE))) {
 			if (!trans) {
-				__wait_on_freeing_inode(&inode->v);
+				__wait_on_freeing_inode(c, inode, inum);
 			} else {
 				bch2_trans_unlock(trans);
-				__wait_on_freeing_inode(&inode->v);
+				__wait_on_freeing_inode(c, inode, inum);
 				int ret = bch2_trans_relock(trans);
 				if (ret)
 					return ERR_PTR(ret);
@@ -232,6 +255,11 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod
 					&inode->hash, bch2_vfs_inodes_params);
 		BUG_ON(ret);
 		inode->v.i_hash.pprev = NULL;
+		/*
+		 * This pairs with the bch2_inode_hash_find() ->
+		 * __wait_on_freeing_inode() path
+		 */
+		inode_wake_up_bit(&inode->v, __I_NEW);
 	}
 }
 
@@ -1783,14 +1811,16 @@ again:
 				break;
 			}
 		} else if (clean_pass && this_pass_clean) {
-			wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW);
-			DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+			struct wait_bit_queue_entry wqe;
+			struct wait_queue_head *wq_head;
 
-			prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+			wq_head = inode_bit_waitqueue(&wqe, &inode->v, __I_NEW);
+			prepare_to_wait_event(wq_head, &wqe.wq_entry,
+					      TASK_UNINTERRUPTIBLE);
 			mutex_unlock(&c->vfs_inodes_lock);
 
 			schedule();
-			finish_wait(wq, &wait.wq_entry);
+			finish_wait(wq_head, &wqe.wq_entry);
 			goto again;
 		}
 	}
diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h
index d03f7881..40dbd577 100644
--- a/libbcachefs/fs.h
+++ b/libbcachefs/fs.h
@@ -146,8 +146,6 @@ struct bch_inode_info *
 __bch2_create(struct mnt_idmap *, struct bch_inode_info *,
 	      struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
 
-struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
-
 int bch2_fs_quota_transfer(struct bch_fs *,
 			   struct bch_inode_info *,
 			   struct bch_qid,
@@ -181,6 +179,8 @@ void bch2_inode_update_after_write(struct btree_trans *,
 int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *,
 				  inode_set_fn, void *, unsigned);
 
+bool bch2_inode_is_open(struct bch_fs *c, struct bpos p);
+
 int bch2_setattr_nonsize(struct mnt_idmap *,
 			 struct bch_inode_info *,
 			 struct iattr *);
@@ -198,10 +198,7 @@ int bch2_vfs_init(void);
 
 #define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields)	({ do {} while (0); })
 
-static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
-{
-	return NULL;
-}
+static inline bool bch2_inode_is_open(struct bch_fs *c, struct bpos p) { return false; }
 
 static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
 					       snapshot_id_list *s) {}
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 0d8b782b..f0eb17e6 100644
--- a/libbcachefs/fsck.c
+++ b/libbcachefs/fsck.c
@@ -28,8 +28,8 @@ static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
 		inode->bi_dir_offset	== d.k->p.offset;
 }
 
-static bool dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
-				   struct bch_inode_unpacked *inode)
+static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d,
+					 struct bch_inode_unpacked *inode)
 {
 	if (d.v->d_type == DT_SUBVOL
 	    ? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
@@ -137,16 +137,15 @@ found:
 	return ret;
 }
 
-static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
-			struct bch_inode_unpacked *inode,
-			u32 *snapshot)
+static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot,
+			struct bch_inode_unpacked *inode)
 {
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	int ret;
 
 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-			       SPOS(0, inode_nr, *snapshot), 0);
+			       SPOS(0, inode_nr, snapshot), 0);
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
@@ -154,8 +153,6 @@ static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
 	ret = bkey_is_inode(k.k)
 		? bch2_inode_unpack(k, inode)
 		: -BCH_ERR_ENOENT_inode;
-	if (!ret)
-		*snapshot = iter.pos.snapshot;
 err:
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
@@ -250,8 +247,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
 
 	struct bch_inode_unpacked root_inode;
 	struct bch_hash_info root_hash_info;
-	u32 root_inode_snapshot = snapshot;
-	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
+	ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode);
 	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
 		    root_inum.inum, le32_to_cpu(st.master_subvol));
 	if (ret)
@@ -277,17 +273,23 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
 	 * The bch2_check_dirents pass has already run, dangling dirents
 	 * shouldn't exist here:
 	 */
-	ret = lookup_inode(trans, inum, lostfound, &snapshot);
+	ret = lookup_inode(trans, inum, snapshot, lostfound);
 	bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
 		    inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
 	return ret;
 
 create_lostfound:
+	/*
+	 * we always create lost+found in the root snapshot; we don't want
+	 * different branches of the snapshot tree to have different lost+found
+	 */
+	snapshot = le32_to_cpu(st.root_snapshot);
 	/*
 	 * XXX: we could have a nicer log message here  if we had a nice way to
 	 * walk backpointers to print a path
 	 */
-	bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
+	bch_notice(c, "creating lost+found in subvol %llu snapshot %u",
+		   root_inum.subvol, le32_to_cpu(st.root_snapshot));
 
 	u64 now = bch2_current_time(c);
 	struct btree_iter lostfound_iter = { NULL };
@@ -296,6 +298,7 @@ create_lostfound:
 	bch2_inode_init_early(c, lostfound);
 	bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
 	lostfound->bi_dir = root_inode.bi_inum;
+	lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot);
 
 	root_inode.bi_nlink++;
 
@@ -323,19 +326,54 @@ err:
 	return ret;
 }
 
-static int reattach_inode(struct btree_trans *trans,
-			  struct bch_inode_unpacked *inode,
-			  u32 inode_snapshot)
+static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
+{
+	if (inode->bi_inum == BCACHEFS_ROOT_INO &&
+	    inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
+		return false;
+
+	return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
+}
+
+static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
+					SPOS(d_pos.inode, d_pos.offset, snapshot),
+					BTREE_ITER_intent|
+					BTREE_ITER_with_updates);
+	int ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	if (bpos_eq(k.k->p, d_pos)) {
+		/*
+		 * delet_at() doesn't work because the update path doesn't
+		 * internally use BTREE_ITER_with_updates yet
+		 */
+		struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+		ret = PTR_ERR_OR_ZERO(k);
+		if (ret)
+			goto err;
+
+		bkey_init(&k->k);
+		k->k.type = KEY_TYPE_whiteout;
+		k->k.p = iter.pos;
+		ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
+	}
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
 {
 	struct bch_fs *c = trans->c;
-	struct bch_hash_info dir_hash;
 	struct bch_inode_unpacked lostfound;
 	char name_buf[20];
-	struct qstr name;
-	u64 dir_offset = 0;
-	u32 dirent_snapshot = inode_snapshot;
 	int ret;
 
+	u32 dirent_snapshot = inode->bi_snapshot;
 	if (inode->bi_subvol) {
 		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
 
@@ -354,17 +392,22 @@ static int reattach_inode(struct btree_trans *trans,
 	if (ret)
 		return ret;
 
-	if (S_ISDIR(inode->bi_mode)) {
-		lostfound.bi_nlink++;
+	lostfound.bi_nlink += S_ISDIR(inode->bi_mode);
 
-		ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
-		if (ret)
-			return ret;
+	/* ensure lost+found inode is also present in inode snapshot */
+	if (!inode->bi_subvol) {
+		BUG_ON(!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, lostfound.bi_snapshot));
+		lostfound.bi_snapshot = inode->bi_snapshot;
 	}
 
-	dir_hash = bch2_hash_info_init(c, &lostfound);
+	ret = __bch2_fsck_write_inode(trans, &lostfound);
+	if (ret)
+		return ret;
 
-	name = (struct qstr) QSTR(name_buf);
+	struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
+	struct qstr name = (struct qstr) QSTR(name_buf);
+
+	inode->bi_dir = lostfound.bi_inum;
 
 	ret = bch2_dirent_create_snapshot(trans,
 				inode->bi_parent_subvol, lostfound.bi_inum,
@@ -373,17 +416,70 @@ static int reattach_inode(struct btree_trans *trans,
 				inode_d_type(inode),
 				&name,
 				inode->bi_subvol ?: inode->bi_inum,
-				&dir_offset,
+				&inode->bi_dir_offset,
 				STR_HASH_must_create);
 	if (ret) {
 		bch_err_msg(c, ret, "error creating dirent");
 		return ret;
 	}
 
-	inode->bi_dir		= lostfound.bi_inum;
-	inode->bi_dir_offset	= dir_offset;
+	ret = __bch2_fsck_write_inode(trans, inode);
+	if (ret)
+		return ret;
 
-	return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
+	/*
+	 * Fix up inodes in child snapshots: if they should also be reattached
+	 * update the backpointer field, if they should not be we need to emit
+	 * whiteouts for the dirent we just created.
+	 */
+	if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
+		snapshot_id_list whiteouts_done;
+		struct btree_iter iter;
+		struct bkey_s_c k;
+
+		darray_init(&whiteouts_done);
+
+		for_each_btree_key_reverse_norestart(trans, iter,
+				BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
+				BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
+			if (k.k->p.offset != inode->bi_inum)
+				break;
+
+			if (!bkey_is_inode(k.k) ||
+			    !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
+			    snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
+				continue;
+
+			struct bch_inode_unpacked child_inode;
+			bch2_inode_unpack(k, &child_inode);
+
+			if (!inode_should_reattach(&child_inode)) {
+				ret = maybe_delete_dirent(trans,
+							  SPOS(lostfound.bi_inum, inode->bi_dir_offset,
+							       dirent_snapshot),
+							  k.k->p.snapshot);
+				if (ret)
+					break;
+
+				ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
+				if (ret)
+					break;
+			} else {
+				iter.snapshot = k.k->p.snapshot;
+				child_inode.bi_dir = inode->bi_dir;
+				child_inode.bi_dir_offset = inode->bi_dir_offset;
+
+				ret = bch2_inode_write_flags(trans, &iter, &child_inode,
+							     BTREE_UPDATE_internal_snapshot_node);
+				if (ret)
+					break;
+			}
+		}
+		darray_exit(&whiteouts_done);
+		bch2_trans_iter_exit(trans, &iter);
+	}
+
+	return ret;
 }
 
 static int remove_backpointer(struct btree_trans *trans,
@@ -422,7 +518,7 @@ static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume
 	if (ret)
 		return ret;
 
-	ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
+	ret = reattach_inode(trans, &inode);
 	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
 	return ret;
 }
@@ -540,8 +636,9 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32
 	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL);
 	new_inode.bi_size = i_size;
 	new_inode.bi_inum = inum;
+	new_inode.bi_snapshot = snapshot;
 
-	return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
+	return __bch2_fsck_write_inode(trans, &new_inode);
 }
 
 struct snapshots_seen {
@@ -988,7 +1085,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
 		 */
 		inode->bi_dir = 0;
 		inode->bi_dir_offset = 0;
-		inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
 		*write_inode = true;
 	}
 
@@ -1000,30 +1096,14 @@ fsck_err:
 	return ret;
 }
 
-static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
-{
-	subvol_inum inum = {
-		.subvol = snapshot_t(c, p.snapshot)->subvol,
-		.inum	= p.offset,
-	};
-
-	/* snapshot tree corruption, can't safely delete */
-	if (!inum.subvol) {
-		bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
-		return true;
-	}
-
-	return __bch2_inode_hash_find(c, inum) != NULL;
-}
-
 static int check_inode(struct btree_trans *trans,
 		       struct btree_iter *iter,
 		       struct bkey_s_c k,
 		       struct bch_inode_unpacked *prev,
-		       struct snapshots_seen *s,
-		       bool full)
+		       struct snapshots_seen *s)
 {
 	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
 	struct bch_inode_unpacked u;
 	bool do_update = false;
 	int ret;
@@ -1043,12 +1123,6 @@ static int check_inode(struct btree_trans *trans,
 
 	BUG_ON(bch2_inode_unpack(k, &u));
 
-	if (!full &&
-	    !(u.bi_flags & (BCH_INODE_i_size_dirty|
-			    BCH_INODE_i_sectors_dirty|
-			    BCH_INODE_unlinked)))
-		return 0;
-
 	if (prev->bi_inum != u.bi_inum)
 		*prev = u;
 
@@ -1057,31 +1131,64 @@ static int check_inode(struct btree_trans *trans,
 			trans, inode_snapshot_mismatch,
 			"inodes in different snapshots don't match")) {
 		bch_err(c, "repair not implemented yet");
-		return -BCH_ERR_fsck_repair_unimplemented;
+		ret = -BCH_ERR_fsck_repair_unimplemented;
+		goto err_noprint;
 	}
 
-	if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
-	    bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
-		struct bpos new_min_pos;
-
-		ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
+	if (u.bi_dir || u.bi_dir_offset) {
+		ret = check_inode_dirent_inode(trans, &u, &do_update);
 		if (ret)
 			goto err;
-
-		u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
-
-		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
-
-		bch_err_msg(c, ret, "in fsck updating inode");
-		if (ret)
-			return ret;
-
-		if (!bpos_eq(new_min_pos, POS_MIN))
-			bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
-		return 0;
 	}
 
-	if (u.bi_flags & BCH_INODE_unlinked) {
+	if (fsck_err_on(u.bi_dir && (u.bi_flags & BCH_INODE_unlinked),
+			trans, inode_unlinked_but_has_dirent,
+			"inode unlinked but has dirent\n%s",
+			(printbuf_reset(&buf),
+			 bch2_inode_unpacked_to_text(&buf, &u),
+			 buf.buf))) {
+		u.bi_flags &= ~BCH_INODE_unlinked;
+		do_update = true;
+	}
+
+	if (S_ISDIR(u.bi_mode) && (u.bi_flags & BCH_INODE_unlinked)) {
+		/* Check for this early so that check_unreachable_inode() will reattach it */
+
+		ret = bch2_empty_dir_snapshot(trans, k.k->p.offset, 0, k.k->p.snapshot);
+		if (ret && ret != -BCH_ERR_ENOTEMPTY_dir_not_empty)
+			goto err;
+
+		fsck_err_on(ret, trans, inode_dir_unlinked_but_not_empty,
+			    "dir unlinked but not empty\n%s",
+			    (printbuf_reset(&buf),
+			     bch2_inode_unpacked_to_text(&buf, &u),
+			     buf.buf));
+		u.bi_flags &= ~BCH_INODE_unlinked;
+		do_update = true;
+		ret = 0;
+	}
+
+	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+	if (ret < 0)
+		goto err;
+
+	if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
+			trans, inode_has_child_snapshots_wrong,
+			"inode has_child_snapshots flag wrong (should be %u)\n%s",
+			ret,
+			(printbuf_reset(&buf),
+			 bch2_inode_unpacked_to_text(&buf, &u),
+			 buf.buf))) {
+		if (ret)
+			u.bi_flags |= BCH_INODE_has_child_snapshot;
+		else
+			u.bi_flags &= !BCH_INODE_has_child_snapshot;
+		do_update = true;
+	}
+	ret = 0;
+
+	if ((u.bi_flags & BCH_INODE_unlinked) &&
+	    !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
 		if (!test_bit(BCH_FS_started, &c->flags)) {
 			/*
 			 * If we're not in online fsck, don't delete unlinked
@@ -1095,7 +1202,7 @@ static int check_inode(struct btree_trans *trans,
 			 */
 			ret = check_inode_deleted_list(trans, k.k->p);
 			if (ret < 0)
-				return ret;
+				goto err_noprint;
 
 			fsck_err_on(!ret,
 				    trans, unlinked_inode_not_on_deleted_list,
@@ -1106,83 +1213,17 @@ static int check_inode(struct btree_trans *trans,
 			if (ret)
 				goto err;
 		} else {
-			if (fsck_err_on(bch2_inode_is_open(c, k.k->p),
+			if (fsck_err_on(!bch2_inode_is_open(c, k.k->p),
 					trans, inode_unlinked_and_not_open,
 				      "inode %llu%u unlinked and not open",
 				      u.bi_inum, u.bi_snapshot)) {
 				ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
 				bch_err_msg(c, ret, "in fsck deleting inode");
-				return ret;
+				goto err_noprint;
 			}
 		}
 	}
 
-	/* i_size_dirty is vestigal, since we now have logged ops for truncate * */
-	if (u.bi_flags & BCH_INODE_i_size_dirty &&
-	    (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
-	     fsck_err(trans, inode_i_size_dirty_but_clean,
-		      "filesystem marked clean, but inode %llu has i_size dirty",
-		      u.bi_inum))) {
-		bch_verbose(c, "truncating inode %llu", u.bi_inum);
-
-		/*
-		 * XXX: need to truncate partial blocks too here - or ideally
-		 * just switch units to bytes and that issue goes away
-		 */
-		ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
-				SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
-				     iter->pos.snapshot),
-				POS(u.bi_inum, U64_MAX),
-				0, NULL);
-		bch_err_msg(c, ret, "in fsck truncating inode");
-		if (ret)
-			return ret;
-
-		/*
-		 * We truncated without our normal sector accounting hook, just
-		 * make sure we recalculate it:
-		 */
-		u.bi_flags |= BCH_INODE_i_sectors_dirty;
-
-		u.bi_flags &= ~BCH_INODE_i_size_dirty;
-		do_update = true;
-	}
-
-	/* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */
-	if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
-	    (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
-	     fsck_err(trans, inode_i_sectors_dirty_but_clean,
-		      "filesystem marked clean, but inode %llu has i_sectors dirty",
-		      u.bi_inum))) {
-		s64 sectors;
-
-		bch_verbose(c, "recounting sectors for inode %llu",
-			    u.bi_inum);
-
-		sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
-		if (sectors < 0) {
-			bch_err_msg(c, sectors, "in fsck recounting inode sectors");
-			return sectors;
-		}
-
-		u.bi_sectors = sectors;
-		u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
-		do_update = true;
-	}
-
-	if (u.bi_flags & BCH_INODE_backptr_untrusted) {
-		u.bi_dir = 0;
-		u.bi_dir_offset = 0;
-		u.bi_flags &= ~BCH_INODE_backptr_untrusted;
-		do_update = true;
-	}
-
-	if (u.bi_dir || u.bi_dir_offset) {
-		ret = check_inode_dirent_inode(trans, &u, &do_update);
-		if (ret)
-			goto err;
-	}
-
 	if (fsck_err_on(u.bi_parent_subvol &&
 			(u.bi_subvol == 0 ||
 			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
@@ -1224,20 +1265,21 @@ static int check_inode(struct btree_trans *trans,
 	}
 do_update:
 	if (do_update) {
-		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
+		ret = __bch2_fsck_write_inode(trans, &u);
 		bch_err_msg(c, ret, "in fsck updating inode");
 		if (ret)
-			return ret;
+			goto err_noprint;
 	}
 err:
 fsck_err:
 	bch_err_fn(c, ret);
+err_noprint:
+	printbuf_exit(&buf);
 	return ret;
 }
 
 int bch2_check_inodes(struct bch_fs *c)
 {
-	bool full = c->opts.fsck;
 	struct bch_inode_unpacked prev = { 0 };
 	struct snapshots_seen s;
 
@@ -1248,13 +1290,104 @@ int bch2_check_inodes(struct bch_fs *c)
 				POS_MIN,
 				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			check_inode(trans, &iter, k, &prev, &s, full)));
+			check_inode(trans, &iter, k, &prev, &s)));
 
 	snapshots_seen_exit(&s);
 	bch_err_fn(c, ret);
 	return ret;
 }
 
+static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
+					    struct bch_inode_unpacked *inode)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	/*
+	 * We look for inodes to reattach in natural key order, leaves first,
+	 * but we should do the reattach at the oldest version that needs to be
+	 * reattached:
+	 */
+	for_each_btree_key_norestart(trans, iter,
+				     BTREE_ID_inodes,
+				     SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
+				     BTREE_ITER_all_snapshots, k, ret) {
+		if (k.k->p.offset != inode->bi_inum)
+			break;
+
+		if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
+			continue;
+
+		if (!bkey_is_inode(k.k))
+			break;
+
+		struct bch_inode_unpacked parent_inode;
+		bch2_inode_unpack(k, &parent_inode);
+
+		if (!inode_should_reattach(&parent_inode))
+			break;
+
+		*inode = parent_inode;
+	}
+	bch2_trans_iter_exit(trans, &iter);
+
+	return ret;
+}
+
+static int check_unreachable_inode(struct btree_trans *trans,
+				   struct btree_iter *iter,
+				   struct bkey_s_c k)
+{
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	if (!bkey_is_inode(k.k))
+		return 0;
+
+	struct bch_inode_unpacked inode;
+	BUG_ON(bch2_inode_unpack(k, &inode));
+
+	if (!inode_should_reattach(&inode))
+		return 0;
+
+	ret = find_oldest_inode_needs_reattach(trans, &inode);
+	if (ret)
+		return ret;
+
+	if (fsck_err(trans, inode_unreachable,
+		     "unreachable inode:\n%s",
+		     (bch2_inode_unpacked_to_text(&buf, &inode),
+		      buf.buf)))
+		ret = reattach_inode(trans, &inode);
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+/*
+ * Reattach unreachable (but not unlinked) inodes
+ *
+ * Run after check_inodes() and check_dirents(), so we node that inode
+ * backpointer fields point to valid dirents, and every inode that has a dirent
+ * that points to it has its backpointer field set - so we're just looking for
+ * non-unlinked inodes without backpointers:
+ *
+ * XXX: this is racy w.r.t. hardlink removal in online fsck
+ */
+int bch2_check_unreachable_inodes(struct bch_fs *c)
+{
+	int ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+				POS_MIN,
+				BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+			check_unreachable_inode(trans, &iter, k)));
+	bch_err_fn(c, ret);
+	return ret;
+}
+
 static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
 {
 	switch (btree) {
@@ -1347,7 +1480,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal
 				w->last_pos.inode, i->snapshot,
 				i->inode.bi_sectors, i->count)) {
 			i->inode.bi_sectors = i->count;
-			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
+			ret = bch2_fsck_write_inode(trans, &i->inode);
 			if (ret)
 				break;
 		}
@@ -1657,8 +1790,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
 			    !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
 				continue;
 
-			if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
-					k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
+			if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
 					!bkey_extent_is_reservation(k),
 					trans, extent_past_end_of_inode,
 					"extent type past end of inode %llu:%u, i_size %llu\n  %s",
@@ -1789,7 +1921,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_
 				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
 				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
 			i->inode.bi_nlink = i->count;
-			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
+			ret = bch2_fsck_write_inode(trans, &i->inode);
 			if (ret)
 				break;
 		}
@@ -1810,8 +1942,7 @@ noinline_for_stack
 static int check_dirent_inode_dirent(struct btree_trans *trans,
 				   struct btree_iter *iter,
 				   struct bkey_s_c_dirent d,
-				   struct bch_inode_unpacked *target,
-				   u32 target_snapshot)
+				   struct bch_inode_unpacked *target)
 {
 	struct bch_fs *c = trans->c;
 	struct printbuf buf = PRINTBUF;
@@ -1821,6 +1952,32 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
 	if (inode_points_to_dirent(target, d))
 		return 0;
 
+	if (!target->bi_dir &&
+	    !target->bi_dir_offset) {
+		fsck_err_on(S_ISDIR(target->bi_mode),
+			    trans, inode_dir_missing_backpointer,
+			    "directory with missing backpointer\n%s",
+			    (printbuf_reset(&buf),
+			     bch2_bkey_val_to_text(&buf, c, d.s_c),
+			     prt_printf(&buf, "\n"),
+			     bch2_inode_unpacked_to_text(&buf, target),
+			     buf.buf));
+
+		fsck_err_on(target->bi_flags & BCH_INODE_unlinked,
+			    trans, inode_unlinked_but_has_dirent,
+			    "inode unlinked but has dirent\n%s",
+			    (printbuf_reset(&buf),
+			     bch2_bkey_val_to_text(&buf, c, d.s_c),
+			     prt_printf(&buf, "\n"),
+			     bch2_inode_unpacked_to_text(&buf, target),
+			     buf.buf));
+
+		target->bi_flags &= ~BCH_INODE_unlinked;
+		target->bi_dir		= d.k->p.inode;
+		target->bi_dir_offset	= d.k->p.offset;
+		return __bch2_fsck_write_inode(trans, target);
+	}
+
 	if (bch2_inode_should_have_bp(target) &&
 	    !fsck_err(trans, inode_wrong_backpointer,
 		      "dirent points to inode that does not point back:\n  %s",
@@ -1830,15 +1987,8 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
 		       buf.buf)))
 		goto err;
 
-	if (!target->bi_dir &&
-	    !target->bi_dir_offset) {
-		target->bi_dir		= d.k->p.inode;
-		target->bi_dir_offset	= d.k->p.offset;
-		return __bch2_fsck_write_inode(trans, target, target_snapshot);
-	}
-
 	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
-			      SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
+			      SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot));
 	ret = bkey_err(bp_dirent);
 	if (ret && !bch2_err_matches(ret, ENOENT))
 		goto err;
@@ -1851,14 +2001,14 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
 			"inode %llu:%u has wrong backpointer:\n"
 			"got       %llu:%llu\n"
 			"should be %llu:%llu",
-			target->bi_inum, target_snapshot,
+			target->bi_inum, target->bi_snapshot,
 			target->bi_dir,
 			target->bi_dir_offset,
 			d.k->p.inode,
 			d.k->p.offset)) {
 		target->bi_dir		= d.k->p.inode;
 		target->bi_dir_offset	= d.k->p.offset;
-		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
+		ret = __bch2_fsck_write_inode(trans, target);
 		goto out;
 	}
 
@@ -1873,7 +2023,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
 			trans, inode_dir_multiple_links,
 			"%s %llu:%u with multiple links\n%s",
 			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
-			target->bi_inum, target_snapshot, buf.buf)) {
+			target->bi_inum, target->bi_snapshot, buf.buf)) {
 		ret = __remove_dirent(trans, d.k->p);
 		goto out;
 	}
@@ -1886,10 +2036,10 @@ static int check_dirent_inode_dirent(struct btree_trans *trans,
 	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
 			trans, inode_multiple_links_but_nlink_0,
 			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
-			target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
+			target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
 		target->bi_nlink++;
 		target->bi_flags &= ~BCH_INODE_unlinked;
-		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
+		ret = __bch2_fsck_write_inode(trans, target);
 		if (ret)
 			goto err;
 	}
@@ -1906,15 +2056,14 @@ noinline_for_stack
 static int check_dirent_target(struct btree_trans *trans,
 			       struct btree_iter *iter,
 			       struct bkey_s_c_dirent d,
-			       struct bch_inode_unpacked *target,
-			       u32 target_snapshot)
+			       struct bch_inode_unpacked *target)
 {
 	struct bch_fs *c = trans->c;
 	struct bkey_i_dirent *n;
 	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
-	ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
+	ret = check_dirent_inode_dirent(trans, iter, d, target);
 	if (ret)
 		goto err;
 
@@ -2073,7 +2222,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
 	u64 target_inum = le64_to_cpu(s.v->inode);
 	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
 
-	ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
+	ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root);
 	if (ret && !bch2_err_matches(ret, ENOENT))
 		goto err;
 
@@ -2089,13 +2238,13 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *
 			target_inum,
 			subvol_root.bi_parent_subvol, parent_subvol)) {
 		subvol_root.bi_parent_subvol = parent_subvol;
-		ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
+		subvol_root.bi_snapshot = le32_to_cpu(s.v->snapshot);
+		ret = __bch2_fsck_write_inode(trans, &subvol_root);
 		if (ret)
 			goto err;
 	}
 
-	ret = check_dirent_target(trans, iter, d, &subvol_root,
-				  target_snapshot);
+	ret = check_dirent_target(trans, iter, d, &subvol_root);
 	if (ret)
 		goto err;
 out:
@@ -2188,8 +2337,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 		}
 
 		darray_for_each(target->inodes, i) {
-			ret = check_dirent_target(trans, iter, d,
-						  &i->inode, i->snapshot);
+			ret = check_dirent_target(trans, iter, d, &i->inode);
 			if (ret)
 				goto err;
 		}
@@ -2330,7 +2478,7 @@ static int check_root_trans(struct btree_trans *trans)
 			goto err;
 	}
 
-	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+	ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode);
 	if (ret && !bch2_err_matches(ret, ENOENT))
 		return ret;
 
@@ -2343,8 +2491,9 @@ static int check_root_trans(struct btree_trans *trans)
 		bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755,
 				0, NULL);
 		root_inode.bi_inum = inum;
+		root_inode.bi_snapshot = snapshot;
 
-		ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
+		ret = __bch2_fsck_write_inode(trans, &root_inode);
 		bch_err_msg(c, ret, "writing root inode");
 	}
 err:
@@ -2396,22 +2545,6 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
 		if (ret)
 			break;
 
-		/*
-		 * We've checked that inode backpointers point to valid dirents;
-		 * here, it's sufficient to check that the subvolume root has a
-		 * dirent:
-		 */
-		if (fsck_err_on(!subvol_root.bi_dir,
-				trans, subvol_unreachable,
-				"unreachable subvolume %s",
-				(bch2_bkey_val_to_text(&buf, c, s.s_c),
-				 prt_newline(&buf),
-				 bch2_inode_unpacked_to_text(&buf, &subvol_root),
-				 buf.buf))) {
-			ret = reattach_subvol(trans, s);
-			break;
-		}
-
 		u32 parent = le32_to_cpu(s.v->fs_path_parent);
 
 		if (darray_u32_has(&subvol_path, parent)) {
@@ -2472,12 +2605,6 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
 	return false;
 }
 
-/*
- * Check that a given inode is reachable from its subvolume root - we already
- * verified subvolume connectivity:
- *
- * XXX: we should also be verifying that inodes are in the right subvolumes
- */
 static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
 {
 	struct bch_fs *c = trans->c;
@@ -2491,6 +2618,9 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
 
 	BUG_ON(bch2_inode_unpack(inode_k, &inode));
 
+	if (!S_ISDIR(inode.bi_mode))
+		return 0;
+
 	while (!inode.bi_subvol) {
 		struct btree_iter dirent_iter;
 		struct bkey_s_c_dirent d;
@@ -2505,21 +2635,15 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
 			bch2_trans_iter_exit(trans, &dirent_iter);
 
 		if (bch2_err_matches(ret, ENOENT)) {
-			ret = 0;
-			if (fsck_err(trans, inode_unreachable,
-				     "unreachable inode\n%s",
-				     (printbuf_reset(&buf),
-				      bch2_bkey_val_to_text(&buf, c, inode_k),
-				      buf.buf)))
-				ret = reattach_inode(trans, &inode, snapshot);
+			printbuf_reset(&buf);
+			bch2_bkey_val_to_text(&buf, c, inode_k);
+			bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
+				bch2_err_str(ret), buf.buf);
 			goto out;
 		}
 
 		bch2_trans_iter_exit(trans, &dirent_iter);
 
-		if (!S_ISDIR(inode.bi_mode))
-			break;
-
 		ret = darray_push(p, ((struct pathbuf_entry) {
 			.inum		= inode.bi_inum,
 			.snapshot	= snapshot,
@@ -2557,7 +2681,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
 				if (ret)
 					break;
 
-				ret = reattach_inode(trans, &inode, snapshot);
+				ret = reattach_inode(trans, &inode);
 				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
 			}
 			break;
@@ -2572,9 +2696,8 @@ fsck_err:
 }
 
 /*
- * Check for unreachable inodes, as well as loops in the directory structure:
- * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
- * unreachable:
+ * Check for loops in the directory structure: all other connectivity issues
+ * have been fixed by prior passes
  */
 int bch2_check_directory_structure(struct bch_fs *c)
 {
@@ -2702,6 +2825,10 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
 			if (S_ISDIR(u.bi_mode))
 				continue;
 
+			/*
+			 * Previous passes ensured that bi_nlink is nonzero if
+			 * it had multiple hardlinks:
+			 */
 			if (!u.bi_nlink)
 				continue;
 
@@ -2787,7 +2914,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
 			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
 			bch2_inode_nlink_get(&u), link->count)) {
 		bch2_inode_nlink_set(&u, link->count);
-		ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
+		ret = __bch2_fsck_write_inode(trans, &u);
 	}
 fsck_err:
 	return ret;
diff --git a/libbcachefs/fsck.h b/libbcachefs/fsck.h
index a4ef9427..1cca3101 100644
--- a/libbcachefs/fsck.h
+++ b/libbcachefs/fsck.h
@@ -9,6 +9,7 @@ int bch2_check_dirents(struct bch_fs *);
 int bch2_check_xattrs(struct bch_fs *);
 int bch2_check_root(struct bch_fs *);
 int bch2_check_subvolume_structure(struct bch_fs *);
+int bch2_check_unreachable_inodes(struct bch_fs *);
 int bch2_check_directory_structure(struct bch_fs *);
 int bch2_check_nlinks(struct bch_fs *);
 int bch2_fix_reflink_p(struct bch_fs *);
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index 753c2088..9d6040d4 100644
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -12,6 +12,7 @@
 #include "error.h"
 #include "extents.h"
 #include "extent_update.h"
+#include "fs.h"
 #include "inode.h"
 #include "str_hash.h"
 #include "snapshot.h"
@@ -34,6 +35,8 @@ static const char * const bch2_inode_flag_strs[] = {
 };
 #undef  x
 
+static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
+
 static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
 
 static int inode_decode_field(const u8 *in, const u8 *end,
@@ -327,22 +330,20 @@ int bch2_inode_unpack(struct bkey_s_c k,
 		: bch2_inode_unpack_slowpath(k, unpacked);
 }
 
-int bch2_inode_peek_nowarn(struct btree_trans *trans,
-		    struct btree_iter *iter,
-		    struct bch_inode_unpacked *inode,
-		    subvol_inum inum, unsigned flags)
+int __bch2_inode_peek(struct btree_trans *trans,
+		      struct btree_iter *iter,
+		      struct bch_inode_unpacked *inode,
+		      subvol_inum inum, unsigned flags,
+		      bool warn)
 {
-	struct bkey_s_c k;
 	u32 snapshot;
-	int ret;
-
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+	int ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn);
 	if (ret)
 		return ret;
 
-	k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
-			       SPOS(0, inum.inum, snapshot),
-			       flags|BTREE_ITER_cached);
+	struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes,
+					       SPOS(0, inum.inum, snapshot),
+					       flags|BTREE_ITER_cached);
 	ret = bkey_err(k);
 	if (ret)
 		return ret;
@@ -357,20 +358,12 @@ int bch2_inode_peek_nowarn(struct btree_trans *trans,
 
 	return 0;
 err:
+	if (warn)
+		bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
 	bch2_trans_iter_exit(trans, iter);
 	return ret;
 }
 
-int bch2_inode_peek(struct btree_trans *trans,
-		    struct btree_iter *iter,
-		    struct bch_inode_unpacked *inode,
-		    subvol_inum inum, unsigned flags)
-{
-	int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
-	bch_err_msg(trans->c, ret, "looking up inum %llu:%llu:", inum.subvol, inum.inum);
-	return ret;
-}
-
 int bch2_inode_write_flags(struct btree_trans *trans,
 		     struct btree_iter *iter,
 		     struct bch_inode_unpacked *inode,
@@ -387,9 +380,7 @@ int bch2_inode_write_flags(struct btree_trans *trans,
 	return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
 }
 
-int __bch2_fsck_write_inode(struct btree_trans *trans,
-			 struct bch_inode_unpacked *inode,
-			 u32 snapshot)
+int __bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
 {
 	struct bkey_inode_buf *inode_p =
 		bch2_trans_kmalloc(trans, sizeof(*inode_p));
@@ -398,19 +389,17 @@ int __bch2_fsck_write_inode(struct btree_trans *trans,
 		return PTR_ERR(inode_p);
 
 	bch2_inode_pack(inode_p, inode);
-	inode_p->inode.k.p.snapshot = snapshot;
+	inode_p->inode.k.p.snapshot = inode->bi_snapshot;
 
 	return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
 				&inode_p->inode.k_i,
 				BTREE_UPDATE_internal_snapshot_node);
 }
 
-int bch2_fsck_write_inode(struct btree_trans *trans,
-			    struct bch_inode_unpacked *inode,
-			    u32 snapshot)
+int bch2_fsck_write_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
 {
 	int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			    __bch2_fsck_write_inode(trans, inode, snapshot));
+			    __bch2_fsck_write_inode(trans, inode));
 	bch_err_fn(trans->c, ret);
 	return ret;
 }
@@ -589,9 +578,137 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k)
 	}
 }
 
-static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
 {
-	return bkey_inode_flags(k) & BCH_INODE_unlinked;
+	switch (k.k->type) {
+	case KEY_TYPE_inode:
+		bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
+		return;
+	case KEY_TYPE_inode_v2:
+		bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
+		return;
+	case KEY_TYPE_inode_v3:
+		bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
+		return;
+	default:
+		BUG();
+	}
+}
+
+static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
+{
+	unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked;
+
+	return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
+}
+
+static struct bkey_s_c
+bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
+				   enum btree_id btree, struct bpos pos,
+				   unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	for_each_btree_key_upto_norestart(trans, *iter, btree,
+					  bpos_successor(pos),
+					  SPOS(pos.inode, pos.offset, U32_MAX),
+					  flags|BTREE_ITER_all_snapshots, k, ret)
+		if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
+			return k;
+
+	bch2_trans_iter_exit(trans, iter);
+	return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
+}
+
+static struct bkey_s_c
+bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
+				    struct bpos pos, unsigned flags)
+{
+	struct bkey_s_c k;
+again:
+	k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
+	if (!k.k ||
+	    bkey_err(k) ||
+	    bkey_is_inode(k.k))
+		return k;
+
+	bch2_trans_iter_exit(trans, iter);
+	pos = k.k->p;
+	goto again;
+}
+
+int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret = 0;
+
+	for_each_btree_key_upto_norestart(trans, iter,
+			BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
+			BTREE_ITER_all_snapshots|
+			BTREE_ITER_with_updates, k, ret)
+		if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
+		    bkey_is_inode(k.k)) {
+			ret = 1;
+			break;
+		}
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int update_inode_has_children(struct btree_trans *trans,
+				     struct bkey_s k,
+				     bool have_child)
+{
+	if (!have_child) {
+		int ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+		if (ret)
+			return ret < 0 ? ret : 0;
+	}
+
+	u64 f = bkey_inode_flags(k.s_c);
+	if (have_child != !!(f & BCH_INODE_has_child_snapshot))
+		bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot);
+
+	return 0;
+}
+
+static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
+					    bool have_child)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
+						&iter, pos, BTREE_ITER_with_updates);
+	int ret = bkey_err(k);
+	if (ret)
+		return ret;
+	if (!k.k)
+		return 0;
+
+	if (!have_child) {
+		ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+		if (ret) {
+			ret = ret < 0 ? ret : 0;
+			goto err;
+		}
+	}
+
+	u64 f = bkey_inode_flags(k);
+	if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
+		struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
+					     BTREE_UPDATE_internal_snapshot_node);
+		ret = PTR_ERR_OR_ZERO(update);
+		if (ret)
+			goto err;
+
+		bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
+	}
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
 }
 
 int bch2_trigger_inode(struct btree_trans *trans,
@@ -600,6 +717,8 @@ int bch2_trigger_inode(struct btree_trans *trans,
 		       struct bkey_s new,
 		       enum btree_iter_update_trigger_flags flags)
 {
+	struct bch_fs *c = trans->c;
+
 	if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
 		BUG_ON(!trans->journal_res.seq);
 		bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
@@ -613,13 +732,41 @@ int bch2_trigger_inode(struct btree_trans *trans,
 			return ret;
 	}
 
-	int deleted_delta =	(int) bkey_is_deleted_inode(new.s_c) -
-				(int) bkey_is_deleted_inode(old);
-	if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) {
-		int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
-						      new.k->p, deleted_delta > 0);
-		if (ret)
-			return ret;
+	if (flags & BTREE_TRIGGER_transactional) {
+		int unlinked_delta =	(int) bkey_is_unlinked_inode(new.s_c) -
+					(int) bkey_is_unlinked_inode(old);
+		if (unlinked_delta) {
+			int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
+							      new.k->p, unlinked_delta > 0);
+			if (ret)
+				return ret;
+		}
+
+		/*
+		 * If we're creating or deleting an inode at this snapshot ID,
+		 * and there might be an inode in a parent snapshot ID, we might
+		 * need to set or clear the has_child_snapshot flag on the
+		 * parent.
+		 */
+		int deleted_delta = (int) bkey_is_inode(new.k) -
+				    (int) bkey_is_inode(old.k);
+		if (deleted_delta &&
+		    bch2_snapshot_parent(c, new.k->p.snapshot)) {
+			int ret = update_parent_inode_has_children(trans, new.k->p,
+								   deleted_delta > 0);
+			if (ret)
+				return ret;
+		}
+
+		/*
+		 * When an inode is first updated in a new snapshot, we may need
+		 * to clear has_child_snapshot
+		 */
+		if (deleted_delta > 0) {
+			int ret = update_inode_has_children(trans, new, false);
+			if (ret)
+				return ret;
+		}
 	}
 
 	return 0;
@@ -902,6 +1049,11 @@ err:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		goto retry;
 
+	if (ret)
+		goto err2;
+
+	ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
+err2:
 	bch2_trans_put(trans);
 	return ret;
 }
@@ -1006,7 +1158,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i
 	return 0;
 }
 
-int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter = { NULL };
@@ -1069,6 +1221,44 @@ err:
 	return ret ?: -BCH_ERR_transaction_restart_nested;
 }
 
+/*
+ * After deleting an inode, there may be versions in older snapshots that should
+ * also be deleted - if they're not referenced by sibling snapshots and not open
+ * in other subvolumes:
+ */
+static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+next_parent:
+	ret = lockrestart_do(trans,
+		bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
+	if (ret || !k.k)
+		return ret;
+
+	bool unlinked = bkey_is_unlinked_inode(k);
+	pos = k.k->p;
+	bch2_trans_iter_exit(trans, &iter);
+
+	if (!unlinked)
+		return 0;
+
+	if (bch2_inode_is_open(trans->c, pos))
+		return 0;
+
+	ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
+	if (ret)
+		return ret;
+	goto next_parent;
+}
+
+int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+	return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
+		delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
+}
+
 static int may_delete_deleted_inode(struct btree_trans *trans,
 				    struct btree_iter *iter,
 				    struct bpos pos,
@@ -1078,6 +1268,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
 	struct btree_iter inode_iter;
 	struct bkey_s_c k;
 	struct bch_inode_unpacked inode;
+	struct printbuf buf = PRINTBUF;
 	int ret;
 
 	k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
@@ -1113,6 +1304,31 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
 			pos.offset, pos.snapshot))
 		goto delete;
 
+	if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
+			trans, deleted_inode_has_child_snapshots,
+			"inode with child snapshots %llu:%u in deleted_inodes btree",
+			pos.offset, pos.snapshot))
+		goto delete;
+
+	ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+	if (ret < 0)
+		goto out;
+
+	if (ret) {
+		if (fsck_err(trans, inode_has_child_snapshots_wrong,
+			     "inode has_child_snapshots flag wrong (should be set)\n%s",
+			     (printbuf_reset(&buf),
+			      bch2_inode_unpacked_to_text(&buf, &inode),
+			      buf.buf))) {
+			inode.bi_flags |= BCH_INODE_has_child_snapshot;
+			ret = __bch2_fsck_write_inode(trans, &inode);
+			if (ret)
+				goto out;
+		}
+		goto delete;
+
+	}
+
 	if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
 	    !fsck_err(trans, deleted_inode_but_clean,
 		      "filesystem marked as clean but have deleted inode %llu:%u",
@@ -1121,33 +1337,11 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
 		goto out;
 	}
 
-	if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
-		struct bpos new_min_pos;
-
-		ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
-		if (ret)
-			goto out;
-
-		inode.bi_flags &= ~BCH_INODE_unlinked;
-
-		ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
-					     BTREE_UPDATE_internal_snapshot_node);
-		bch_err_msg(c, ret, "clearing inode unlinked flag");
-		if (ret)
-			goto out;
-
-		/*
-		 * We'll need another write buffer flush to pick up the new
-		 * unlinked inodes in the snapshot leaves:
-		 */
-		*need_another_pass = true;
-		goto out;
-	}
-
 	ret = 1;
 out:
 fsck_err:
 	bch2_trans_iter_exit(trans, &inode_iter);
+	printbuf_exit(&buf);
 	return ret;
 delete:
 	ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index 695abd70..c8e98443 100644
--- a/libbcachefs/inode.h
+++ b/libbcachefs/inode.h
@@ -5,6 +5,7 @@
 #include "bkey.h"
 #include "bkey_methods.h"
 #include "opts.h"
+#include "snapshot.h"
 
 enum bch_validate_flags;
 extern const char * const bch2_inode_opts[];
@@ -17,6 +18,15 @@ int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
 			  enum bch_validate_flags);
 void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
+int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
+
+static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
+{
+	return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0
+		? __bch2_inode_has_child_snapshots(trans, pos)
+		: 0;
+}
+
 int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
 		       struct bkey_s_c, struct bkey_s,
 		       enum btree_iter_update_trigger_flags);
@@ -97,10 +107,26 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
 
 void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
 
-int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *,
-		    struct bch_inode_unpacked *, subvol_inum, unsigned);
-int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
-		    struct bch_inode_unpacked *, subvol_inum, unsigned);
+int __bch2_inode_peek(struct btree_trans *, struct btree_iter *,
+		      struct bch_inode_unpacked *, subvol_inum, unsigned, bool);
+
+static inline int bch2_inode_peek_nowarn(struct btree_trans *trans,
+					 struct btree_iter *iter,
+					 struct bch_inode_unpacked *inode,
+					 subvol_inum inum, unsigned flags)
+{
+	return __bch2_inode_peek(trans, iter, inode, inum, flags, false);
+}
+
+static inline int bch2_inode_peek(struct btree_trans *trans,
+				  struct btree_iter *iter,
+				  struct bch_inode_unpacked *inode,
+				  subvol_inum inum, unsigned flags)
+{
+	return __bch2_inode_peek(trans, iter, inode, inum, flags, true);
+	int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags);
+	return ret;
+}
 
 int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *,
 		     struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags);
@@ -112,8 +138,8 @@ static inline int bch2_inode_write(struct btree_trans *trans,
 	return bch2_inode_write_flags(trans, iter, inode, 0);
 }
 
-int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
-int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
+int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *);
 
 void bch2_inode_init_early(struct bch_fs *,
 			   struct bch_inode_unpacked *);
diff --git a/libbcachefs/inode_format.h b/libbcachefs/inode_format.h
index 83d10733..a204e46b 100644
--- a/libbcachefs/inode_format.h
+++ b/libbcachefs/inode_format.h
@@ -133,7 +133,8 @@ enum inode_opt_id {
 	x(i_size_dirty,			5)	\
 	x(i_sectors_dirty,		6)	\
 	x(unlinked,			7)	\
-	x(backptr_untrusted,		8)
+	x(backptr_untrusted,		8)	\
+	x(has_child_snapshot,		9)
 
 /* bits 20+ reserved for packed fields below: */
 
diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c
index 177ed331..307ed0a4 100644
--- a/libbcachefs/io_misc.c
+++ b/libbcachefs/io_misc.c
@@ -224,13 +224,14 @@ void bch2_logged_op_truncate_to_text(struct printbuf *out, struct bch_fs *c, str
 
 static int truncate_set_isize(struct btree_trans *trans,
 			      subvol_inum inum,
-			      u64 new_i_size)
+			      u64 new_i_size,
+			      bool warn)
 {
 	struct btree_iter iter = { NULL };
 	struct bch_inode_unpacked inode_u;
 	int ret;
 
-	ret   = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent) ?:
+	ret   = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn) ?:
 		(inode_u.bi_size = new_i_size, 0) ?:
 		bch2_inode_write(trans, &iter, &inode_u);
 
@@ -247,10 +248,11 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
 	struct bkey_i_logged_op_truncate *op = bkey_i_to_logged_op_truncate(op_k);
 	subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
 	u64 new_i_size = le64_to_cpu(op->v.new_i_size);
+	bool warn_errors = i_sectors_delta != NULL;
 	int ret;
 
 	ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			truncate_set_isize(trans, inum, new_i_size));
+			truncate_set_isize(trans, inum, new_i_size, i_sectors_delta != NULL));
 	if (ret)
 		goto err;
 
@@ -263,8 +265,8 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
 		ret = 0;
 err:
-	bch2_logged_op_finish(trans, op_k);
-	bch_err_fn(c, ret);
+	if (warn_errors)
+		bch_err_fn(c, ret);
 	return ret;
 }
 
@@ -288,9 +290,14 @@ int bch2_truncate(struct bch_fs *c, subvol_inum inum, u64 new_i_size, u64 *i_sec
 	 * resume only proceeding in one of the snapshots
 	 */
 	down_read(&c->snapshot_create_lock);
-	int ret = bch2_trans_run(c,
-		bch2_logged_op_start(trans, &op.k_i) ?:
-		__bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta));
+	struct btree_trans *trans = bch2_trans_get(c);
+	int ret = bch2_logged_op_start(trans, &op.k_i);
+	if (ret)
+		goto out;
+	ret = __bch2_resume_logged_op_truncate(trans, &op.k_i, i_sectors_delta);
+	ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
+out:
+	bch2_trans_put(trans);
 	up_read(&c->snapshot_create_lock);
 
 	return ret;
@@ -308,7 +315,8 @@ void bch2_logged_op_finsert_to_text(struct printbuf *out, struct bch_fs *c, stru
 	prt_printf(out, " src_offset=%llu",	le64_to_cpu(op.v->src_offset));
 }
 
-static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset, s64 len)
+static int adjust_i_size(struct btree_trans *trans, subvol_inum inum,
+			 u64 offset, s64 len, bool warn)
 {
 	struct btree_iter iter;
 	struct bch_inode_unpacked inode_u;
@@ -317,7 +325,7 @@ static int adjust_i_size(struct btree_trans *trans, subvol_inum inum, u64 offset
 	offset	<<= 9;
 	len	<<= 9;
 
-	ret = bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent);
+	ret = __bch2_inode_peek(trans, &iter, &inode_u, inum, BTREE_ITER_intent, warn);
 	if (ret)
 		return ret;
 
@@ -357,12 +365,22 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
 	u64 len = abs(shift);
 	u64 pos = le64_to_cpu(op->v.pos);
 	bool insert = shift > 0;
+	u32 snapshot;
+	bool warn_errors = i_sectors_delta != NULL;
 	int ret = 0;
 
 	ret = bch2_inum_opts_get(trans, inum, &opts);
 	if (ret)
 		return ret;
 
+	/*
+	 * check for missing subvolume before fpunch, as in resume we don't want
+	 * it to be a fatal error
+	 */
+	ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors);
+	if (ret)
+		return ret;
+
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
 			     POS(inum.inum, 0),
 			     BTREE_ITER_intent);
@@ -373,7 +391,7 @@ case LOGGED_OP_FINSERT_start:
 
 	if (insert) {
 		ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-				adjust_i_size(trans, inum, src_offset, len) ?:
+				adjust_i_size(trans, inum, src_offset, len, warn_errors) ?:
 				bch2_logged_op_update(trans, &op->k_i));
 		if (ret)
 			goto err;
@@ -396,11 +414,11 @@ case LOGGED_OP_FINSERT_shift_extents:
 		struct bkey_i delete, *copy;
 		struct bkey_s_c k;
 		struct bpos src_pos = POS(inum.inum, src_offset);
-		u32 snapshot;
 
 		bch2_trans_begin(trans);
 
-		ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+		ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot,
+						    warn_errors);
 		if (ret)
 			goto btree_err;
 
@@ -463,12 +481,12 @@ btree_err:
 
 	if (!insert) {
 		ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-				adjust_i_size(trans, inum, src_offset, shift) ?:
+				adjust_i_size(trans, inum, src_offset, shift, warn_errors) ?:
 				bch2_logged_op_update(trans, &op->k_i));
 	} else {
 		/* We need an inode update to update bi_journal_seq for fsync: */
 		ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-				adjust_i_size(trans, inum, 0, 0) ?:
+				adjust_i_size(trans, inum, 0, 0, warn_errors) ?:
 				bch2_logged_op_update(trans, &op->k_i));
 	}
 
@@ -477,9 +495,9 @@ case LOGGED_OP_FINSERT_finish:
 	break;
 	}
 err:
-	bch_err_fn(c, ret);
-	bch2_logged_op_finish(trans, op_k);
 	bch2_trans_iter_exit(trans, &iter);
+	if (warn_errors)
+		bch_err_fn(c, ret);
 	return ret;
 }
 
@@ -508,9 +526,14 @@ int bch2_fcollapse_finsert(struct bch_fs *c, subvol_inum inum,
 	 * resume only proceeding in one of the snapshots
 	 */
 	down_read(&c->snapshot_create_lock);
-	int ret = bch2_trans_run(c,
-		bch2_logged_op_start(trans, &op.k_i) ?:
-		__bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta));
+	struct btree_trans *trans = bch2_trans_get(c);
+	int ret = bch2_logged_op_start(trans, &op.k_i);
+	if (ret)
+		goto out;
+	ret = __bch2_resume_logged_op_finsert(trans, &op.k_i, i_sectors_delta);
+	ret = bch2_logged_op_finish(trans, &op.k_i) ?: ret;
+out:
+	bch2_trans_put(trans);
 	up_read(&c->snapshot_create_lock);
 
 	return ret;
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index f5f7db50..dc099f06 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -603,6 +603,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
 {
 	int ret;
 
+	if (closure_wait_event_timeout(&j->async_wait,
+		   (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+		   (flags & JOURNAL_RES_GET_NONBLOCK),
+		   HZ * 10))
+		return ret;
+
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
+	struct printbuf buf = PRINTBUF;
+	bch2_journal_debug_to_text(&buf, j);
+	bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
+		buf.buf);
+	printbuf_exit(&buf);
+
 	closure_wait_event(&j->async_wait,
 		   (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
 		   (flags & JOURNAL_RES_GET_NONBLOCK));
diff --git a/libbcachefs/logged_ops.c b/libbcachefs/logged_ops.c
index 6f4a4e10..60e00702 100644
--- a/libbcachefs/logged_ops.c
+++ b/libbcachefs/logged_ops.c
@@ -34,8 +34,6 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
 			    struct bkey_s_c k)
 {
 	struct bch_fs *c = trans->c;
-	const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
-	struct bkey_buf sk;
 	u32 restart_count = trans->restart_count;
 	struct printbuf buf = PRINTBUF;
 	int ret = 0;
@@ -46,13 +44,15 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
 		    (bch2_bkey_val_to_text(&buf, c, k),
 		     buf.buf));
 
-	if (!fn)
-		return 0;
-
+	struct bkey_buf sk;
 	bch2_bkey_buf_init(&sk);
 	bch2_bkey_buf_reassemble(&sk, c, k);
 
-	fn->resume(trans, sk.k);
+	const struct bch_logged_op_fn *fn = logged_op_fn(sk.k->k.type);
+	if (fn)
+		fn->resume(trans, sk.k);
+
+	ret = bch2_logged_op_finish(trans, sk.k);
 
 	bch2_bkey_buf_exit(&sk, c);
 fsck_err:
@@ -93,7 +93,7 @@ int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
 			 __bch2_logged_op_start(trans, k));
 }
 
-void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
+int bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
 {
 	int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
 			    bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
@@ -113,4 +113,6 @@ void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
 				    buf.buf, bch2_err_str(ret));
 		printbuf_exit(&buf);
 	}
+
+	return ret;
 }
diff --git a/libbcachefs/logged_ops.h b/libbcachefs/logged_ops.h
index 4d1e786a..30ae9ef7 100644
--- a/libbcachefs/logged_ops.h
+++ b/libbcachefs/logged_ops.h
@@ -15,6 +15,6 @@ static inline int bch2_logged_op_update(struct btree_trans *trans, struct bkey_i
 
 int bch2_resume_logged_ops(struct bch_fs *);
 int bch2_logged_op_start(struct btree_trans *, struct bkey_i *);
-void bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
+int bch2_logged_op_finish(struct btree_trans *, struct bkey_i *);
 
 #endif /* _BCACHEFS_LOGGED_OPS_H */
diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c
index 96f2f4f8..10857ecc 100644
--- a/libbcachefs/lru.c
+++ b/libbcachefs/lru.c
@@ -2,6 +2,7 @@
 
 #include "bcachefs.h"
 #include "alloc_background.h"
+#include "bkey_buf.h"
 #include "btree_iter.h"
 #include "btree_update.h"
 #include "btree_write_buffer.h"
@@ -118,7 +119,7 @@ fsck_err:
 static int bch2_check_lru_key(struct btree_trans *trans,
 			      struct btree_iter *lru_iter,
 			      struct bkey_s_c lru_k,
-			      struct bpos *last_flushed_pos)
+			      struct bkey_buf *last_flushed)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
@@ -132,11 +133,13 @@ static int bch2_check_lru_key(struct btree_trans *trans,
 	u64 idx;
 	int ret;
 
-	if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos),
+	struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos);
+
+	if (fsck_err_on(!ca,
 			trans, lru_entry_to_invalid_bucket,
 			"lru key points to nonexistent device:bucket %llu:%llu",
 			alloc_pos.inode, alloc_pos.offset))
-		return bch2_btree_delete_at(trans, lru_iter, 0);
+		return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
 
 	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0);
 	ret = bkey_err(k);
@@ -150,18 +153,15 @@ static int bch2_check_lru_key(struct btree_trans *trans,
 		idx = alloc_lru_idx_read(*a);
 		break;
 	case BCH_LRU_fragmentation:
-		idx = a->fragmentation_lru;
+		idx = alloc_lru_idx_fragmentation(*a, ca);
 		break;
 	}
 
 	if (lru_k.k->type != KEY_TYPE_set ||
 	    lru_pos_time(lru_k.k->p) != idx) {
-		if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) {
-			*last_flushed_pos = lru_k.k->p;
-			ret = bch2_btree_write_buffer_flush_sync(trans) ?:
-				-BCH_ERR_transaction_restart_write_buffer_flush;
-			goto out;
-		}
+		ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed);
+		if (ret)
+			goto err;
 
 		if (fsck_err(trans, lru_entry_bad,
 			     "incorrect lru entry: lru %s time %llu\n"
@@ -171,12 +171,12 @@ static int bch2_check_lru_key(struct btree_trans *trans,
 			     lru_pos_time(lru_k.k->p),
 			     (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
 			     (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf)))
-			ret = bch2_btree_delete_at(trans, lru_iter, 0);
+			ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false);
 	}
-out:
 err:
 fsck_err:
 	bch2_trans_iter_exit(trans, &iter);
+	bch2_dev_put(ca);
 	printbuf_exit(&buf2);
 	printbuf_exit(&buf1);
 	return ret;
@@ -184,12 +184,18 @@ fsck_err:
 
 int bch2_check_lrus(struct bch_fs *c)
 {
-	struct bpos last_flushed_pos = POS_MIN;
+	struct bkey_buf last_flushed;
+
+	bch2_bkey_buf_init(&last_flushed);
+	bkey_init(&last_flushed.k->k);
+
 	int ret = bch2_trans_run(c,
 		for_each_btree_key_commit(trans, iter,
 				BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k,
 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
-			bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
+			bch2_check_lru_key(trans, &iter, k, &last_flushed)));
+
+	bch2_bkey_buf_exit(&last_flushed, c);
 	bch_err_fn(c, ret);
 	return ret;
 
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 7d3920e0..8c456d8b 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -692,7 +692,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt,
 	a = bch2_alloc_to_v4(k, &a_convert);
 	dirty_sectors = bch2_bucket_sectors_dirty(*a);
 	bucket_size = ca->mi.bucket_size;
-	fragmentation = a->fragmentation_lru;
+	fragmentation = alloc_lru_idx_fragmentation(*a, ca);
 
 	ret = bch2_btree_write_buffer_tryflush(trans);
 	bch_err_msg(c, ret, "flushing btree write buffer");
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index d86565bf..d658be90 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -73,6 +73,7 @@ move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b)
 static int bch2_bucket_is_movable(struct btree_trans *trans,
 				  struct move_bucket *b, u64 time)
 {
+	struct bch_fs *c = trans->c;
 	struct btree_iter iter;
 	struct bkey_s_c k;
 	struct bch_alloc_v4 _a;
@@ -90,14 +91,19 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
 	if (ret)
 		return ret;
 
+	struct bch_dev *ca = bch2_dev_tryget(c, k.k->p.inode);
+	if (!ca)
+		goto out;
+
 	a = bch2_alloc_to_v4(k, &_a);
 	b->k.gen	= a->gen;
 	b->sectors	= bch2_bucket_sectors_dirty(*a);
+	u64 lru_idx	= alloc_lru_idx_fragmentation(*a, ca);
 
-	ret = data_type_movable(a->data_type) &&
-		a->fragmentation_lru &&
-		a->fragmentation_lru <= time;
+	ret = lru_idx && lru_idx <= time;
 
+	bch2_dev_put(ca);
+out:
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
diff --git a/libbcachefs/rcu_pending.c b/libbcachefs/rcu_pending.c
index 67522aa3..40a20192 100644
--- a/libbcachefs/rcu_pending.c
+++ b/libbcachefs/rcu_pending.c
@@ -478,9 +478,7 @@ start_gp:
 		 */
 		if (!p->cb_armed) {
 			p->cb_armed = true;
-			spin_unlock_irqrestore(&p->lock, flags);
 			__call_rcu(pending->srcu, &p->cb, rcu_pending_rcu_cb);
-			goto free_node;
 		} else {
 			__start_poll_synchronize_rcu(pending->srcu);
 		}
diff --git a/libbcachefs/recovery_passes_types.h b/libbcachefs/recovery_passes_types.h
index 50406ce0..9d96c06e 100644
--- a/libbcachefs/recovery_passes_types.h
+++ b/libbcachefs/recovery_passes_types.h
@@ -46,6 +46,7 @@
 	x(check_dirents,			27, PASS_FSCK)			\
 	x(check_xattrs,				28, PASS_FSCK)			\
 	x(check_root,				29, PASS_ONLINE|PASS_FSCK)	\
+	x(check_unreachable_inodes,		40, PASS_ONLINE|PASS_FSCK)	\
 	x(check_subvolume_structure,		36, PASS_ONLINE|PASS_FSCK)	\
 	x(check_directory_structure,		30, PASS_ONLINE|PASS_FSCK)	\
 	x(check_nlinks,				31, PASS_FSCK)			\
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index bcb32767..797da103 100644
--- a/libbcachefs/replicas.c
+++ b/libbcachefs/replicas.c
@@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
 	prt_printf(out, "]");
 }
 
-static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r,
-					       struct bch_sb *sb,
-					       struct printbuf *err)
+static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
+					   struct bch_sb *sb,
+					   struct printbuf *err)
 {
 	if (!r->nr_devs) {
 		prt_printf(err, "no devices in entry ");
@@ -98,10 +98,28 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
 				 struct bch_fs *c,
 				 struct printbuf *err)
 {
-	mutex_lock(&c->sb_lock);
-	int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err);
-	mutex_unlock(&c->sb_lock);
-	return ret;
+	if (!r->nr_devs) {
+		prt_printf(err, "no devices in entry ");
+		goto bad;
+	}
+
+	if (r->nr_required > 1 &&
+	    r->nr_required >= r->nr_devs) {
+		prt_printf(err, "bad nr_required in entry ");
+		goto bad;
+	}
+
+	for (unsigned i = 0; i < r->nr_devs; i++)
+		if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
+		    !bch2_dev_exists(c, r->devs[i])) {
+			prt_printf(err, "invalid device %u in entry ", r->devs[i]);
+			goto bad;
+		}
+
+	return 0;
+bad:
+	bch2_replicas_entry_to_text(err, r);
+	return -BCH_ERR_invalid_replicas_entry;
 }
 
 void bch2_cpu_replicas_to_text(struct printbuf *out,
@@ -686,7 +704,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
 		struct bch_replicas_entry_v1 *e =
 			cpu_replicas_entry(cpu_r, i);
 
-		int ret = bch2_replicas_entry_validate_locked(e, sb, err);
+		int ret = bch2_replicas_entry_sb_validate(e, sb, err);
 		if (ret)
 			return ret;
 
diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c
index 5102059a..ae715ff6 100644
--- a/libbcachefs/sb-downgrade.c
+++ b/libbcachefs/sb-downgrade.c
@@ -78,7 +78,10 @@
 	  BCH_FSCK_ERR_accounting_mismatch)			\
 	x(rebalance_work_acct_fix,				\
 	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
-	  BCH_FSCK_ERR_accounting_mismatch)
+	  BCH_FSCK_ERR_accounting_mismatch)			\
+	x(inode_has_child_snapshots,				\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_inodes),		\
+	  BCH_FSCK_ERR_inode_has_child_snapshots_wrong)
 
 #define DOWNGRADE_TABLE()					\
 	x(bucket_stripe_sectors,				\
diff --git a/libbcachefs/sb-errors_format.h b/libbcachefs/sb-errors_format.h
index ed5dca5e..4cdddf15 100644
--- a/libbcachefs/sb-errors_format.h
+++ b/libbcachefs/sb-errors_format.h
@@ -115,8 +115,8 @@ enum bch_fsck_flags {
 	x(alloc_key_data_type_inconsistency,			101,	0)		\
 	x(alloc_key_to_missing_dev_bucket,			102,	0)		\
 	x(alloc_key_cached_inconsistency,			103,	0)		\
-	x(alloc_key_cached_but_read_time_zero,			104,	0)		\
-	x(alloc_key_to_missing_lru_entry,			105,	0)		\
+	x(alloc_key_cached_but_read_time_zero,			104,	FSCK_AUTOFIX)	\
+	x(alloc_key_to_missing_lru_entry,			105,	FSCK_AUTOFIX)	\
 	x(alloc_key_data_type_wrong,				106,	FSCK_AUTOFIX)	\
 	x(alloc_key_gen_wrong,					107,	FSCK_AUTOFIX)	\
 	x(alloc_key_dirty_sectors_wrong,			108,	FSCK_AUTOFIX)	\
@@ -129,20 +129,20 @@ enum bch_fsck_flags {
 	x(freespace_key_wrong,					115,	0)		\
 	x(freespace_hole_missing,				116,	0)		\
 	x(bucket_gens_val_size_bad,				117,	0)		\
-	x(bucket_gens_key_wrong,				118,	0)		\
-	x(bucket_gens_hole_wrong,				119,	0)		\
-	x(bucket_gens_to_invalid_dev,				120,	0)		\
-	x(bucket_gens_to_invalid_buckets,			121,	0)		\
-	x(bucket_gens_nonzero_for_invalid_buckets,		122,	0)		\
+	x(bucket_gens_key_wrong,				118,	FSCK_AUTOFIX)	\
+	x(bucket_gens_hole_wrong,				119,	FSCK_AUTOFIX)	\
+	x(bucket_gens_to_invalid_dev,				120,	FSCK_AUTOFIX)	\
+	x(bucket_gens_to_invalid_buckets,			121,	FSCK_AUTOFIX)	\
+	x(bucket_gens_nonzero_for_invalid_buckets,		122,	FSCK_AUTOFIX)	\
 	x(need_discard_freespace_key_to_invalid_dev_bucket,	123,	0)		\
 	x(need_discard_freespace_key_bad,			124,	0)		\
 	x(backpointer_bucket_offset_wrong,			125,	0)		\
 	x(backpointer_to_missing_device,			126,	0)		\
 	x(backpointer_to_missing_alloc,				127,	0)		\
 	x(backpointer_to_missing_ptr,				128,	0)		\
-	x(lru_entry_at_time_0,					129,	0)		\
-	x(lru_entry_to_invalid_bucket,				130,	0)		\
-	x(lru_entry_bad,					131,	0)		\
+	x(lru_entry_at_time_0,					129,	FSCK_AUTOFIX)	\
+	x(lru_entry_to_invalid_bucket,				130,	FSCK_AUTOFIX)	\
+	x(lru_entry_bad,					131,	FSCK_AUTOFIX)	\
 	x(btree_ptr_val_too_big,				132,	0)		\
 	x(btree_ptr_v2_val_too_big,				133,	0)		\
 	x(btree_ptr_has_non_ptr,				134,	0)		\
@@ -158,9 +158,9 @@ enum bch_fsck_flags {
 	x(ptr_after_last_bucket,				144,	0)		\
 	x(ptr_before_first_bucket,				145,	0)		\
 	x(ptr_spans_multiple_buckets,				146,	0)		\
-	x(ptr_to_missing_backpointer,				147,	0)		\
-	x(ptr_to_missing_alloc_key,				148,	0)		\
-	x(ptr_to_missing_replicas_entry,			149,	0)		\
+	x(ptr_to_missing_backpointer,				147,	FSCK_AUTOFIX)	\
+	x(ptr_to_missing_alloc_key,				148,	FSCK_AUTOFIX)	\
+	x(ptr_to_missing_replicas_entry,			149,	FSCK_AUTOFIX)	\
 	x(ptr_to_missing_stripe,				150,	0)		\
 	x(ptr_to_incorrect_stripe,				151,	0)		\
 	x(ptr_gen_newer_than_bucket_gen,			152,	0)		\
@@ -194,7 +194,7 @@ enum bch_fsck_flags {
 	x(snapshot_skiplist_not_normalized,			180,	0)		\
 	x(snapshot_skiplist_bad,				181,	0)		\
 	x(snapshot_should_not_have_subvol,			182,	0)		\
-	x(snapshot_to_bad_snapshot_tree,			183,	0)		\
+	x(snapshot_to_bad_snapshot_tree,			183,	FSCK_AUTOFIX)	\
 	x(snapshot_bad_depth,					184,	0)		\
 	x(snapshot_bad_skiplist,				185,	0)		\
 	x(subvol_pos_bad,					186,	0)		\
@@ -211,6 +211,7 @@ enum bch_fsck_flags {
 	x(inode_unlinked_but_clean,				197,	0)		\
 	x(inode_unlinked_but_nlink_nonzero,			198,	0)		\
 	x(inode_unlinked_and_not_open,				281,	0)		\
+	x(inode_unlinked_but_has_dirent,			285,	0)		\
 	x(inode_checksum_type_invalid,				199,	0)		\
 	x(inode_compression_type_invalid,			200,	0)		\
 	x(inode_subvol_root_but_not_dir,			201,	0)		\
@@ -219,14 +220,18 @@ enum bch_fsck_flags {
 	x(inode_i_sectors_wrong,				204,	FSCK_AUTOFIX)	\
 	x(inode_dir_wrong_nlink,				205,	FSCK_AUTOFIX)	\
 	x(inode_dir_multiple_links,				206,	FSCK_AUTOFIX)	\
+	x(inode_dir_missing_backpointer,			284,	FSCK_AUTOFIX)	\
+	x(inode_dir_unlinked_but_not_empty,			286,	FSCK_AUTOFIX)	\
 	x(inode_multiple_links_but_nlink_0,			207,	FSCK_AUTOFIX)	\
 	x(inode_wrong_backpointer,				208,	FSCK_AUTOFIX)	\
 	x(inode_wrong_nlink,					209,	FSCK_AUTOFIX)	\
+	x(inode_has_child_snapshots_wrong,			287,	0)		\
 	x(inode_unreachable,					210,	FSCK_AUTOFIX)	\
 	x(deleted_inode_but_clean,				211,	FSCK_AUTOFIX)	\
 	x(deleted_inode_missing,				212,	FSCK_AUTOFIX)	\
 	x(deleted_inode_is_dir,					213,	FSCK_AUTOFIX)	\
 	x(deleted_inode_not_unlinked,				214,	FSCK_AUTOFIX)	\
+	x(deleted_inode_has_child_snapshots,			288,	FSCK_AUTOFIX)	\
 	x(extent_overlapping,					215,	0)		\
 	x(key_in_missing_inode,					216,	0)		\
 	x(key_in_wrong_inode_type,				217,	0)		\
@@ -295,7 +300,7 @@ enum bch_fsck_flags {
 	x(accounting_key_replicas_devs_unsorted,		280,	FSCK_AUTOFIX)	\
 	x(accounting_key_version_0,				282,	FSCK_AUTOFIX)	\
 	x(logged_op_but_clean,					283,	FSCK_AUTOFIX)	\
-	x(MAX,							284,	0)
+	x(MAX,							289,	0)
 
 enum bch_sb_error_id {
 #define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/libbcachefs/six.c b/libbcachefs/six.c
index b3583c50..617d07e5 100644
--- a/libbcachefs/six.c
+++ b/libbcachefs/six.c
@@ -341,7 +341,7 @@ static inline bool six_owner_running(struct six_lock *lock)
 	 */
 	rcu_read_lock();
 	struct task_struct *owner = READ_ONCE(lock->owner);
-	bool ret = owner ? owner_on_cpu(owner) : !rt_task(current);
+	bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current);
 	rcu_read_unlock();
 
 	return ret;
diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c
index 1809442b..9f4d13fa 100644
--- a/libbcachefs/snapshot.c
+++ b/libbcachefs/snapshot.c
@@ -1732,103 +1732,6 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
 	return ret;
 }
 
-static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
-{
-	const struct snapshot_t *s = snapshot_t(c, id);
-
-	return s->children[1] ?: s->children[0];
-}
-
-static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
-{
-	u32 child;
-
-	while ((child = bch2_snapshot_smallest_child(c, id)))
-		id = child;
-	return id;
-}
-
-static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
-					       enum btree_id btree,
-					       struct bkey_s_c interior_k,
-					       u32 leaf_id, struct bpos *new_min_pos)
-{
-	struct btree_iter iter;
-	struct bpos pos = interior_k.k->p;
-	struct bkey_s_c k;
-	struct bkey_i *new;
-	int ret;
-
-	pos.snapshot = leaf_id;
-
-	bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
-	k = bch2_btree_iter_peek_slot(&iter);
-	ret = bkey_err(k);
-	if (ret)
-		goto out;
-
-	/* key already overwritten in this snapshot? */
-	if (k.k->p.snapshot != interior_k.k->p.snapshot)
-		goto out;
-
-	if (bpos_eq(*new_min_pos, POS_MIN)) {
-		*new_min_pos = k.k->p;
-		new_min_pos->snapshot = leaf_id;
-	}
-
-	new = bch2_bkey_make_mut_noupdate(trans, interior_k);
-	ret = PTR_ERR_OR_ZERO(new);
-	if (ret)
-		goto out;
-
-	new->k.p.snapshot = leaf_id;
-	ret = bch2_trans_update(trans, &iter, new, 0);
-out:
-	bch2_set_btree_iter_dontneed(&iter);
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
-					  enum btree_id btree,
-					  struct bkey_s_c k,
-					  struct bpos *new_min_pos)
-{
-	struct bch_fs *c = trans->c;
-	struct bkey_buf sk;
-	u32 restart_count = trans->restart_count;
-	int ret = 0;
-
-	bch2_bkey_buf_init(&sk);
-	bch2_bkey_buf_reassemble(&sk, c, k);
-	k = bkey_i_to_s_c(sk.k);
-
-	*new_min_pos = POS_MIN;
-
-	for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
-	     id < k.k->p.snapshot;
-	     id++) {
-		if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
-		    !bch2_snapshot_is_leaf(c, id))
-			continue;
-again:
-		ret =   btree_trans_too_many_iters(trans) ?:
-			bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
-			bch2_trans_commit(trans, NULL, NULL, 0);
-		if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
-			bch2_trans_begin(trans);
-			goto again;
-		}
-
-		if (ret)
-			break;
-	}
-
-	bch2_bkey_buf_exit(&sk, c);
-
-	return ret ?: trans_was_restarted(trans, restart_count);
-}
-
 static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
 {
 	struct bch_fs *c = trans->c;
diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h
index eb5ef642..29c94716 100644
--- a/libbcachefs/snapshot.h
+++ b/libbcachefs/snapshot.h
@@ -259,9 +259,6 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
 	return __bch2_key_has_snapshot_overwrites(trans, id, pos);
 }
 
-int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id,
-					  struct bkey_s_c, struct bpos *);
-
 int bch2_snapshots_read(struct bch_fs *);
 void bch2_fs_snapshots_exit(struct bch_fs *);
 
diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c
index 6845dde1..91d8187e 100644
--- a/libbcachefs/subvolume.c
+++ b/libbcachefs/subvolume.c
@@ -102,7 +102,8 @@ static int check_subvol(struct btree_trans *trans,
 				inode.bi_inum, inode.bi_snapshot,
 				inode.bi_subvol, subvol.k->p.offset)) {
 			inode.bi_subvol = subvol.k->p.offset;
-			ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
+			inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot);
+			ret = __bch2_fsck_write_inode(trans, &inode);
 			if (ret)
 				goto err;
 		}
@@ -331,8 +332,8 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
 		bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol);
 }
 
-int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
-				u32 *snapid)
+int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
+				  u32 *snapid, bool warn)
 {
 	struct btree_iter iter;
 	struct bkey_s_c_subvolume subvol;
@@ -343,7 +344,8 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
 					  BTREE_ITER_cached|BTREE_ITER_with_updates,
 					  subvolume);
 	ret = bkey_err(subvol);
-	bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c,
+
+	bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c,
 				"missing subvolume %u", subvolid);
 
 	if (likely(!ret))
@@ -352,6 +354,12 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
 	return ret;
 }
 
+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid,
+				u32 *snapid)
+{
+	return __bch2_subvolume_get_snapshot(trans, subvolid, snapid, true);
+}
+
 static int bch2_subvolume_reparent(struct btree_trans *trans,
 				   struct btree_iter *iter,
 				   struct bkey_s_c k,
diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h
index e62f8765..f897d106 100644
--- a/libbcachefs/subvolume.h
+++ b/libbcachefs/subvolume.h
@@ -26,6 +26,8 @@ int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
 int bch2_subvol_has_children(struct btree_trans *, u32);
 int bch2_subvolume_get(struct btree_trans *, unsigned,
 		       bool, int, struct bch_subvolume *);
+int __bch2_subvolume_get_snapshot(struct btree_trans *, u32,
+				  u32 *, bool);
 int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
 
 int bch2_subvol_is_ro_trans(struct btree_trans *, u32);
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 873e4be7..32cc7a4b 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -1120,12 +1120,12 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
 
 		prt_bdevname(&buf, fs->bdev);
 		prt_char(&buf, ' ');
-		bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));;
+		bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
 		prt_newline(&buf);
 
 		prt_bdevname(&buf, sb->bdev);
 		prt_char(&buf, ' ');
-		bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));;
+		bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
 		prt_newline(&buf);
 
 		if (!opts->no_splitbrain_check)
diff --git a/libbcachefs/thread_with_file.c b/libbcachefs/thread_with_file.c
index fb3442a7..dea73bc1 100644
--- a/libbcachefs/thread_with_file.c
+++ b/libbcachefs/thread_with_file.c
@@ -275,7 +275,6 @@ static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigne
 }
 
 static const struct file_operations thread_with_stdio_fops = {
-	.llseek		= no_llseek,
 	.read		= thread_with_stdio_read,
 	.write		= thread_with_stdio_write,
 	.poll		= thread_with_stdio_poll,
@@ -285,7 +284,6 @@ static const struct file_operations thread_with_stdio_fops = {
 };
 
 static const struct file_operations thread_with_stdout_fops = {
-	.llseek		= no_llseek,
 	.read		= thread_with_stdio_read,
 	.poll		= thread_with_stdout_poll,
 	.flush		= thread_with_stdio_flush,
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 42f565c7..e0a876cb 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -222,7 +222,7 @@ u64 bch2_read_flag_list(const char *opt, const char * const list[])
 			break;
 		}
 
-		ret |= 1 << flag;
+		ret |= BIT_ULL(flag);
 	}
 
 	kfree(d);
diff --git a/linux/closure.c b/linux/closure.c
index 116afae2..2bfe7d2a 100644
--- a/linux/closure.c
+++ b/linux/closure.c
@@ -278,7 +278,7 @@ static int debug_show(struct seq_file *f, void *data)
 			seq_printf(f, " W %pS\n",
 				   (void *) cl->waiting_on);
 
-		seq_puts(f, "\n");
+		seq_putc(f, '\n');
 	}
 
 	spin_unlock_irq(&closure_list_lock);