Update bcachefs sources to b4927db2cdc7 bcachefs: bcachefs_metadata_version_fast_device_removal

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2025-05-04 15:38:06 -04:00
parent 6e4bda5ad5
commit 401a20ed98
21 changed files with 530 additions and 249 deletions

View File

@ -1 +1 @@
5a0455ae19afb354634b3c5c9bf55d2171005a2f
b4927db2cdc7f124f968f9eaa1d785298ae31c1a

View File

@ -1255,6 +1255,9 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
if (unlikely(ret))
return ret;
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
req->nr_replicas = nr_replicas;
req->target = target;
req->ec = erasure_code;
@ -1262,9 +1265,6 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans,
req->flags = flags;
req->devs_have = devs_have;
if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING))
erasure_code = false;
BUG_ON(!nr_replicas || !nr_replicas_required);
retry:
req->ptrs.nr = 0;
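
Side note: the two hunks above reorder the CONFIG_BCACHEFS_ERASURE_CODING
gate so that erasure_code is clamped before it is copied into req->ec.
A minimal userspace sketch of the ordering hazard being fixed (names are
illustrative, not the commit's code):

#include <assert.h>
#include <stdbool.h>

struct req { bool ec; };

static void start(struct req *req, bool erasure_code, bool ec_enabled)
{
	if (!ec_enabled)	/* must run before the copy below... */
		erasure_code = false;
	req->ec = erasure_code;	/* ...or req->ec keeps a stale true */
}

int main(void)
{
	struct req r;
	start(&r, true, false);
	assert(!r.ec);
	return 0;
}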

View File

@ -696,7 +696,8 @@ struct bch_sb_field_ext {
x(stripe_lru, BCH_VERSION(1, 23)) \
x(casefolding, BCH_VERSION(1, 24)) \
x(extent_flags, BCH_VERSION(1, 25)) \
x(snapshot_deletion_v2, BCH_VERSION(1, 26))
x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \
x(fast_device_removal, BCH_VERSION(1, 27))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
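
Side note: BCH_METADATA_VERSIONS is an x-macro list, so the new
fast_device_removal version is a single added entry that expands into the
enum below. A compilable sketch of the pattern (simplified names, assuming
BCH_VERSION packs major/minor as major << 10 | minor):

#include <stdio.h>

#define VERSION(major, minor) (((major) << 10) | (minor))

#define METADATA_VERSIONS()				\
	x(snapshot_deletion_v2, VERSION(1, 26))		\
	x(fast_device_removal,  VERSION(1, 27))

enum metadata_version {
#define x(t, n) metadata_version_##t = n,
	METADATA_VERSIONS()
#undef x
};

int main(void)
{
	printf("fast_device_removal = %d.%d\n",
	       metadata_version_fast_device_removal >> 10,
	       metadata_version_fast_device_removal & 1023);
	return 0;
}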

View File

@ -1079,6 +1079,10 @@ out:
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
closure_wake_up(&c->freelist_wait);
if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags))
bch2_sb_members_clean_deleted(c);
bch_err_fn(c, ret);
return ret;
}

View File

@ -212,17 +212,13 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target)
case TARGET_DEV:
return dev == t.dev;
case TARGET_GROUP: {
rcu_read_lock();
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
const struct bch_devs_mask *m =
g && t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
bool ret = m ? test_bit(dev, m->d) : false;
rcu_read_unlock();
return ret;
return m ? test_bit(dev, m->d) : false;
}
default:
BUG();
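
Side note: this hunk removes the rcu_read_lock()/rcu_read_unlock() pair
from bch2_dev_in_target(), making it the caller's job to hold the RCU read
lock (a later hunk in this commit adds the caller-side locking around the
rebalance loops). A stubbed userspace model of that design change, with
read_lock()/read_unlock() standing in for RCU:

#include <assert.h>
#include <stdbool.h>

static int read_lock_held;
static void read_lock(void)   { read_lock_held++; }
static void read_unlock(void) { read_lock_held--; }

/* helper no longer locks; it assumes the caller already did */
static bool dev_in_target(void)
{
	assert(read_lock_held);
	return true;
}

/* caller brackets a whole batch of lookups with one critical section */
static bool any_ptr_in_target(unsigned nr_ptrs)
{
	bool ret = false;

	read_lock();
	for (unsigned i = 0; i < nr_ptrs; i++)
		ret |= dev_in_target();
	read_unlock();
	return ret;
}

int main(void)
{
	return any_ptr_in_target(3) ? 0 : 1;
}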

View File

@ -1121,8 +1121,9 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke
static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
struct bch_extent_ptr *ptr)
{
if (!opts->promote_target ||
!bch2_dev_in_target(c, ptr->dev, opts->promote_target))
unsigned target = opts->promote_target ?: opts->foreground_target;
if (target && !bch2_dev_in_target(c, ptr->dev, target))
return false;
struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
@ -1135,33 +1136,43 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c,
struct bkey_s k,
struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
struct bkey_ptrs ptrs;
union bch_extent_entry *entry;
struct extent_ptr_decoded p;
bool have_cached_ptr;
rcu_read_lock();
if (!want_cached_ptr(c, opts, ptr)) {
bch2_bkey_drop_ptr_noerror(k, ptr);
goto out;
restart_drop_ptrs:
ptrs = bch2_bkey_ptrs(k);
have_cached_ptr = false;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
/*
* Check if it's erasure coded - stripes can't contain cached
* data. Possibly something we can fix in the future?
*/
if (&entry->ptr == ptr && p.has_ec)
goto drop;
if (p.ptr.cached) {
if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) {
bch2_bkey_drop_ptr_noerror(k, &entry->ptr);
goto restart_drop_ptrs;
}
have_cached_ptr = true;
}
}
/*
* Stripes can't contain cached data, for - reasons.
*
* Possibly something we can fix in the future?
*/
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (&entry->ptr == ptr) {
if (p.has_ec)
bch2_bkey_drop_ptr_noerror(k, ptr);
else
ptr->cached = true;
goto out;
}
if (have_cached_ptr || !want_cached_ptr(c, opts, ptr))
goto drop;
BUG();
out:
ptr->cached = true;
rcu_read_unlock();
return;
drop:
rcu_read_unlock();
bch2_bkey_drop_ptr_noerror(k, ptr);
}
/*
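
Side note: the rewritten bch2_extent_ptr_set_cached() uses a
restart-the-scan idiom: dropping a pointer shifts the array, so instead of
continuing with stale cursors it jumps back to restart_drop_ptrs and
rescans. A self-contained sketch of the same idiom (illustrative code, not
the commit's):

#include <stdbool.h>
#include <stdio.h>

struct ptr { bool cached; };

static void drop(struct ptr *v, unsigned *nr, unsigned i)
{
	for (; i + 1 < *nr; i++)
		v[i] = v[i + 1];
	--*nr;
}

static void keep_one_cached(struct ptr *v, unsigned *nr)
{
restart:
	for (unsigned i = 0, have_cached = 0; i < *nr; i++)
		if (v[i].cached) {
			if (have_cached) {
				drop(v, nr, i);
				goto restart;	/* indices are stale now */
			}
			have_cached = 1;
		}
}

int main(void)
{
	struct ptr v[] = { { true }, { false }, { true }, { true } };
	unsigned nr = 4;

	keep_one_cached(v, &nr);
	printf("%u ptrs remain\n", nr);	/* 2: one cached, one dirty */
	return 0;
}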

View File

@ -790,6 +790,7 @@ static int ref_visible2(struct bch_fs *c,
struct inode_walker_entry {
struct bch_inode_unpacked inode;
bool whiteout;
u64 count;
u64 i_size;
};
@ -818,12 +819,20 @@ static struct inode_walker inode_walker_init(void)
static int add_inode(struct bch_fs *c, struct inode_walker *w,
struct bkey_s_c inode)
{
struct bch_inode_unpacked u;
return bch2_inode_unpack(inode, &u) ?:
darray_push(&w->inodes, ((struct inode_walker_entry) {
.inode = u,
int ret = darray_push(&w->inodes, ((struct inode_walker_entry) {
.whiteout = !bkey_is_inode(inode.k),
}));
if (ret)
return ret;
struct inode_walker_entry *n = &darray_last(w->inodes);
if (!n->whiteout) {
return bch2_inode_unpack(inode, &n->inode);
} else {
n->inode.bi_inum = inode.k->p.inode;
n->inode.bi_snapshot = inode.k->p.snapshot;
return 0;
}
}
static int get_inodes_all_snapshots(struct btree_trans *trans,
@ -843,13 +852,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
w->recalculate_sums = false;
w->inodes.nr = 0;
for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
BTREE_ITER_all_snapshots, k, ret) {
if (k.k->p.offset != inum)
for_each_btree_key_max_norestart(trans, iter,
BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX),
BTREE_ITER_all_snapshots, k, ret) {
ret = add_inode(c, w, k);
if (ret)
break;
if (bkey_is_inode(k.k))
add_inode(c, w, k);
}
bch2_trans_iter_exit(trans, &iter);
@ -861,63 +869,6 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
return 0;
}
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
__darray_for_each(w->inodes, i)
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot))
goto found;
return NULL;
found:
BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot);
struct printbuf buf = PRINTBUF;
int ret = 0;
if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
trans, snapshot_key_missing_inode_snapshot,
"have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
"unexpected because we should always update the inode when we update a key in that inode\n"
"%s",
w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
(bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
struct bch_inode_unpacked new = i->inode;
new.bi_snapshot = k.k->p.snapshot;
ret = __bch2_fsck_write_inode(trans, &new) ?:
bch2_trans_commit(trans, NULL, NULL, 0) ?:
-BCH_ERR_transaction_restart_nested;
goto fsck_err;
}
printbuf_exit(&buf);
return i;
fsck_err:
printbuf_exit(&buf);
return ERR_PTR(ret);
}
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
struct inode_walker *w,
struct bkey_s_c k)
{
if (w->last_pos.inode != k.k->p.inode) {
int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
if (ret)
return ERR_PTR(ret);
}
w->last_pos = k.k->p;
return lookup_inode_for_snapshot(trans, w, k);
}
static int get_visible_inodes(struct btree_trans *trans,
struct inode_walker *w,
struct snapshots_seen *s,
@ -953,6 +904,80 @@ static int get_visible_inodes(struct btree_trans *trans,
return ret;
}
static struct inode_walker_entry *
lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct inode_walker_entry *i;
__darray_for_each(w->inodes, i)
if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot))
goto found;
return NULL;
found:
BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot);
struct printbuf buf = PRINTBUF;
int ret = 0;
if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot,
trans, snapshot_key_missing_inode_snapshot,
"have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
"unexpected because we should always update the inode when we update a key in that inode\n"
"%s",
w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot,
(bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
struct bch_inode_unpacked new = i->inode;
new.bi_snapshot = k.k->p.snapshot;
ret = __bch2_fsck_write_inode(trans, &new) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
if (ret)
goto fsck_err;
struct inode_walker_entry new_entry = *i;
new_entry.inode.bi_snapshot = k.k->p.snapshot;
new_entry.count = 0;
new_entry.i_size = 0;
while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot)
--i;
size_t pos = i - w->inodes.data;
ret = darray_insert_item(&w->inodes, pos, new_entry);
if (ret)
goto fsck_err;
ret = -BCH_ERR_transaction_restart_nested;
goto fsck_err;
}
printbuf_exit(&buf);
return i;
fsck_err:
printbuf_exit(&buf);
return ERR_PTR(ret);
}
static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
struct inode_walker *w,
struct bkey_s_c k)
{
if (w->last_pos.inode != k.k->p.inode) {
int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
if (ret)
return ERR_PTR(ret);
}
w->last_pos = k.k->p;
return lookup_inode_for_snapshot(trans, w, k);
}
/*
* Prefer to delete the first one, since that will be the one at the wrong
* offset:
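
Side note: the relocated lookup_inode_for_snapshot() above now repairs a
missing snapshot entry by synthesizing one and inserting it at the right
slot, keeping w->inodes sorted by snapshot id (the while loop slides the
insert position left past larger ids). A compilable sketch of that sorted
insert (illustrative, not the darray API):

#include <stdio.h>
#include <string.h>

struct entry { unsigned snapshot; };

static void insert_sorted(struct entry *v, unsigned *nr, struct entry n)
{
	unsigned pos = *nr;

	while (pos && v[pos - 1].snapshot > n.snapshot)
		--pos;
	memmove(&v[pos + 1], &v[pos], (*nr - pos) * sizeof(*v));
	v[pos] = n;
	++*nr;
}

int main(void)
{
	struct entry v[8] = { { 2 }, { 5 }, { 9 } };
	unsigned nr = 3;

	insert_sorted(v, &nr, (struct entry) { 4 });
	for (unsigned i = 0; i < nr; i++)
		printf("%u ", v[i].snapshot);	/* 2 4 5 9 */
	printf("\n");
	return 0;
}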

View File

@ -240,6 +240,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k,
u64 v[2];
unpacked->bi_inum = inode.k->p.offset;
unpacked->bi_snapshot = inode.k->p.snapshot;
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
@ -284,13 +285,12 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
{
memset(unpacked, 0, sizeof(*unpacked));
unpacked->bi_snapshot = k.k->p.snapshot;
switch (k.k->type) {
case KEY_TYPE_inode: {
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
unpacked->bi_inum = inode.k->p.offset;
unpacked->bi_snapshot = inode.k->p.snapshot;
unpacked->bi_journal_seq= 0;
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
@ -309,6 +309,7 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k);
unpacked->bi_inum = inode.k->p.offset;
unpacked->bi_snapshot = inode.k->p.snapshot;
unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq);
unpacked->bi_hash_seed = inode.v->bi_hash_seed;
unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags);
@ -326,8 +327,6 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
int bch2_inode_unpack(struct bkey_s_c k,
struct bch_inode_unpacked *unpacked)
{
unpacked->bi_snapshot = k.k->p.snapshot;
return likely(k.k->type == KEY_TYPE_inode_v3)
? bch2_inode_unpack_v3(k, unpacked)
: bch2_inode_unpack_slowpath(k, unpacked);
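
Side note: the hunks above move the bi_snapshot assignment out of
bch2_inode_unpack() and into each unpack path, since the slowpath
memset()s the whole struct and would wipe an assignment made by the
wrapper beforehand. A tiny sketch of that init-order bug (illustrative):

#include <assert.h>
#include <string.h>

struct unpacked { unsigned snapshot; unsigned flags; };

static void unpack_old(struct unpacked *u, unsigned snapshot)
{
	u->snapshot = snapshot;		/* set first... */
	memset(u, 0, sizeof(*u));	/* ...then wiped: the bug */
}

static void unpack_new(struct unpacked *u, unsigned snapshot)
{
	memset(u, 0, sizeof(*u));
	u->snapshot = snapshot;		/* survives */
}

int main(void)
{
	struct unpacked u;

	unpack_old(&u, 7);
	assert(u.snapshot == 0);
	unpack_new(&u, 7);
	assert(u.snapshot == 7);
	return 0;
}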

View File

@ -1465,6 +1465,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j,
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
rcu_read_lock();
darray_for_each(*devs, i) {
struct bch_dev *ca = rcu_dereference(c->devs[*i]);
if (!ca)
@ -1486,6 +1487,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j,
ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq);
}
}
rcu_read_unlock();
}
static void __journal_write_alloc(struct journal *j,
@ -1498,7 +1500,8 @@ static void __journal_write_alloc(struct journal *j,
struct bch_fs *c = container_of(j, struct bch_fs, journal);
darray_for_each(*devs, i) {
struct bch_dev *ca = rcu_dereference(c->devs[*i]);
struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE,
BCH_DEV_WRITE_REF_journal_write);
if (!ca)
continue;
@ -1512,8 +1515,10 @@ static void __journal_write_alloc(struct journal *j,
ca->mi.state != BCH_MEMBER_STATE_rw ||
!ja->nr ||
bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
sectors > ja->sectors_free)
sectors > ja->sectors_free) {
enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
continue;
}
bch2_dev_stripe_increment(ca, &j->wp.stripe);
@ -1536,15 +1541,8 @@ static void __journal_write_alloc(struct journal *j,
}
}
/**
* journal_write_alloc - decide where to write next journal entry
*
* @j: journal object
* @w: journal buf (entry to be written)
*
* Returns: 0 on success, or -BCH_ERR_insufficient_devices on failure
*/
static int journal_write_alloc(struct journal *j, struct journal_buf *w)
static int journal_write_alloc(struct journal *j, struct journal_buf *w,
unsigned *replicas)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_devs_mask devs;
@ -1552,29 +1550,18 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
unsigned target = c->opts.metadata_target ?:
c->opts.foreground_target;
unsigned replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas);
unsigned replicas_need = min_t(unsigned, replicas_want,
READ_ONCE(c->opts.metadata_replicas_required));
bool advance_done = false;
rcu_read_lock();
/* We might run more than once if we have to stop and do discards: */
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key));
bkey_for_each_ptr(ptrs, p) {
struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev);
if (ca)
replicas += ca->mi.durability;
}
retry_target:
devs = target_rw_devs(c, BCH_DATA_journal, target);
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
retry_alloc:
__journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want);
__journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want);
if (likely(replicas >= replicas_want))
if (likely(*replicas >= replicas_want))
goto done;
if (!advance_done) {
@ -1583,18 +1570,16 @@ retry_alloc:
goto retry_alloc;
}
if (replicas < replicas_want && target) {
if (*replicas < replicas_want && target) {
/* Retry from all devices: */
target = 0;
advance_done = false;
goto retry_target;
}
done:
rcu_read_unlock();
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices;
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
@ -1780,13 +1765,7 @@ static CLOSURE_CALLBACK(journal_write_submit)
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE,
BCH_DEV_WRITE_REF_journal_write);
if (!ca) {
/* XXX: fix this */
bch_err(c, "missing device %u for journal write", ptr->dev);
continue;
}
struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
sectors);
@ -2066,57 +2045,45 @@ CLOSURE_CALLBACK(bch2_journal_write)
j->write_start_time = local_clock();
mutex_lock(&j->buf_lock);
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
mutex_unlock(&j->buf_lock);
if (unlikely(ret))
goto err;
spin_lock(&j->lock);
if (nr_rw_members > 1)
w->separate_flush = true;
ret = bch2_journal_write_pick_flush(j, w);
spin_unlock(&j->lock);
if (ret)
if (unlikely(ret))
goto err;
mutex_lock(&j->buf_lock);
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
mutex_unlock(&j->buf_lock);
if (ret)
goto err;
j->entry_bytes_written += vstruct_bytes(w->data);
unsigned replicas_allocated = 0;
while (1) {
spin_lock(&j->lock);
ret = journal_write_alloc(j, w);
ret = journal_write_alloc(j, w, &replicas_allocated);
if (!ret || !j->can_discard)
break;
spin_unlock(&j->lock);
bch2_journal_do_discards(j);
}
if (ret && !bch2_journal_error(j)) {
struct printbuf buf = PRINTBUF;
buf.atomic++;
__bch2_journal_debug_to_text(&buf, j);
spin_unlock(&j->lock);
prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
le64_to_cpu(w->data->seq),
vstruct_sectors(w->data, c->block_bits),
bch2_err_str(ret));
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
if (ret)
goto err;
if (unlikely(ret))
goto err_allocate_write;
spin_lock(&j->lock);
/*
* write is allocated, no longer need to account for it in
* bch2_journal_space_available():
*/
w->sectors = 0;
w->write_allocated = true;
j->entry_bytes_written += vstruct_bytes(w->data);
/*
* journal entry has been compacted and allocated, recalculate space
@ -2128,9 +2095,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
if (c->opts.nochanges)
goto no_io;
/*
* Mark journal replicas before we submit the write to guarantee
* recovery will find the journal entries after a crash.
@ -2141,15 +2105,33 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
if (c->opts.nochanges)
goto no_io;
if (!JSET_NO_FLUSH(w->data))
continue_at(cl, journal_write_preflush, j->wq);
else
continue_at(cl, journal_write_submit, j->wq);
return;
no_io:
continue_at(cl, journal_write_done, j->wq);
return;
err_allocate_write:
if (!bch2_journal_error(j)) {
struct printbuf buf = PRINTBUF;
bch2_journal_debug_to_text(&buf, j);
prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"),
le64_to_cpu(w->data->seq),
vstruct_sectors(w->data, c->block_bits),
bch2_err_str(ret));
bch2_print_str(c, KERN_ERR, buf.buf);
printbuf_exit(&buf);
}
err:
bch2_fatal_error(c);
no_io:
extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev);
enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write);
}
continue_at(cl, journal_write_done, j->wq);
}
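
Side note: journal_write_alloc() now takes the replica count as an
out-parameter owned by bch2_journal_write(), so replicas already allocated
survive across the do-discards retry loop instead of being recounted from
w->key on every pass (the removed bkey_for_each_ptr recount). A minimal
sketch of the accumulate-across-retries shape (illustrative names):

#include <stdio.h>

/* each attempt adds whatever it managed to allocate to *replicas */
static int try_alloc(unsigned *replicas, unsigned want, unsigned got_now)
{
	*replicas += got_now;
	return *replicas >= want ? 0 : -1;
}

int main(void)
{
	unsigned replicas = 0, want = 2;

	if (try_alloc(&replicas, want, 1)) {
		/* caller discards to free space, then retries; the first
		 * replica is still accounted for in *replicas */
		int ret = try_alloc(&replicas, want, 1);
		printf("ret=%d replicas=%u\n", ret, replicas);	/* 0, 2 */
	}
	return 0;
}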

View File

@ -4,9 +4,11 @@
*/
#include "bcachefs.h"
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "errcode.h"
#include "extents.h"
@ -20,7 +22,7 @@
#include "super-io.h"
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
unsigned dev_idx, int flags, bool metadata)
unsigned dev_idx, unsigned flags, bool metadata)
{
unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas;
unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST;
@ -37,11 +39,28 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
return 0;
}
static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter,
struct btree *b, unsigned dev_idx, unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_buf k;
bch2_bkey_buf_init(&k);
bch2_bkey_buf_copy(&k, c, &b->key);
int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?:
bch2_btree_node_update_key(trans, iter, b, k.k, 0, false);
bch_err_fn(c, ret);
bch2_bkey_buf_exit(&k, c);
return ret;
}
static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
unsigned dev_idx,
int flags)
unsigned flags)
{
struct bch_fs *c = trans->c;
struct bkey_i *n;
@ -77,9 +96,27 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
return 0;
}
static int bch2_dev_btree_drop_key(struct btree_trans *trans,
struct bkey_s_c_backpointer bp,
unsigned dev_idx,
struct bkey_buf *last_flushed,
unsigned flags)
{
struct btree_iter iter;
struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed);
int ret = PTR_ERR_OR_ZERO(b);
if (ret)
return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 0 : ret;
ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
static int bch2_dev_usrdata_drop(struct bch_fs *c,
struct progress_indicator_state *progress,
unsigned dev_idx, int flags)
unsigned dev_idx, unsigned flags)
{
struct btree_trans *trans = bch2_trans_get(c);
enum btree_id id;
@ -106,7 +143,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c,
static int bch2_dev_metadata_drop(struct bch_fs *c,
struct progress_indicator_state *progress,
unsigned dev_idx, int flags)
unsigned dev_idx, unsigned flags)
{
struct btree_trans *trans;
struct btree_iter iter;
@ -137,20 +174,12 @@ retry:
if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx))
goto next;
bch2_bkey_buf_copy(&k, c, &b->key);
ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
dev_idx, flags, true);
if (ret)
break;
ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
ret = 0;
continue;
}
bch_err_msg(c, ret, "updating btree node key");
if (ret)
break;
next:
@ -176,7 +205,57 @@ err:
return ret;
}
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags)
static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx,
struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed,
unsigned flags)
{
struct btree_iter iter;
struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed);
int ret = bkey_err(k);
if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
return 0;
if (ret)
return ret;
if (!bch2_bkey_has_device_c(k, dev_idx))
goto out;
ret = bkey_is_btree_ptr(k.k)
? bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags)
: bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags);
out:
bch2_trans_iter_exit(trans, &iter);
return ret;
}
int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags)
{
struct btree_trans *trans = bch2_trans_get(c);
struct bkey_buf last_flushed;
bch2_bkey_buf_init(&last_flushed);
bkey_init(&last_flushed.k->k);
int ret = bch2_btree_write_buffer_flush_sync(trans) ?:
for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
POS(dev_idx, 0),
POS(dev_idx, U64_MAX), 0, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
if (k.k->type != KEY_TYPE_backpointer)
continue;
data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k),
&last_flushed, flags);
}));
bch2_bkey_buf_exit(&last_flushed, trans->c);
bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
}
int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags)
{
struct progress_indicator_state progress;
bch2_progress_init(&progress, c,

View File

@ -2,6 +2,7 @@
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned);
int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned);
#endif /* _BCACHEFS_MIGRATE_H */

View File

@ -80,11 +80,13 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c,
unsigned ptr_bit = 1;
unsigned rewrite_ptrs = 0;
rcu_read_lock();
bkey_for_each_ptr(ptrs, ptr) {
if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target))
rewrite_ptrs |= ptr_bit;
ptr_bit <<= 1;
}
rcu_read_unlock();
return rewrite_ptrs;
}
@ -132,10 +134,14 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k)
}
}
incompressible:
if (opts->background_target)
if (opts->background_target) {
rcu_read_lock();
bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
if (!p.ptr.cached &&
!bch2_dev_in_target(c, p.ptr.dev, opts->background_target))
sectors += p.crc.compressed_size;
rcu_read_unlock();
}
return sectors;
}

View File

@ -737,11 +737,6 @@ int bch2_fs_recovery(struct bch_fs *c)
c->opts.read_only = true;
}
if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
bch_info(c, "filesystem is an unresized image file, mounting ro");
c->opts.read_only = true;
}
mutex_lock(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
bool write_sb = false;
@ -895,6 +890,17 @@ use_clean:
if (ret)
goto err;
ret = bch2_fs_resize_on_mount(c);
if (ret) {
up_write(&c->state_lock);
goto err;
}
if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) {
bch_info(c, "filesystem is an unresized image file, mounting ro");
c->opts.read_only = true;
}
if (!c->opts.read_only &&
(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) {
bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate");

View File

@ -525,6 +525,7 @@ int bch2_sb_member_alloc(struct bch_fs *c)
unsigned u64s;
int best = -1;
u64 best_last_mount = 0;
unsigned nr_deleted = 0;
if (dev_idx < BCH_SB_MEMBERS_MAX)
goto have_slot;
@ -535,7 +536,10 @@ int bch2_sb_member_alloc(struct bch_fs *c)
continue;
struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx);
if (bch2_member_alive(&m))
nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID);
if (!bch2_is_zero(&m.uuid, sizeof(m.uuid)))
continue;
u64 last_mount = le64_to_cpu(m.last_mount);
@ -549,6 +553,10 @@ int bch2_sb_member_alloc(struct bch_fs *c)
goto have_slot;
}
if (nr_deleted)
bch_err(c, "unable to allocate new member, but have %u deleted: run fsck",
nr_deleted);
return -BCH_ERR_ENOSPC_sb_members;
have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
@ -564,3 +572,22 @@ have_slot:
c->disk_sb.sb->nr_devices = nr_devices;
return dev_idx;
}
void bch2_sb_members_clean_deleted(struct bch_fs *c)
{
mutex_lock(&c->sb_lock);
bool write_sb = false;
for (unsigned i = 0; i < c->sb.nr_devices; i++) {
struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i);
if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) {
memset(&m->uuid, 0, sizeof(m->uuid));
write_sb = true;
}
}
if (write_sb)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}

View File

@ -320,7 +320,8 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2;
static inline bool bch2_member_alive(struct bch_member *m)
{
return !bch2_is_zero(&m->uuid, sizeof(m->uuid));
return !bch2_is_zero(&m->uuid, sizeof(m->uuid)) &&
!uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID);
}
static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev)
@ -381,5 +382,6 @@ bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
int bch2_sb_member_alloc(struct bch_fs *);
void bch2_sb_members_clean_deleted(struct bch_fs *);
#endif /* _BCACHEFS_SB_MEMBERS_H */

View File

@ -13,6 +13,10 @@
*/
#define BCH_SB_MEMBER_INVALID 255
#define BCH_SB_MEMBER_DELETED_UUID \
UUID_INIT(0xffffffff, 0xffff, 0xffff, \
0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
#define BCH_MIN_NR_NBUCKETS (1 << 6)
#define BCH_IOPS_MEASUREMENTS() \
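
Side note: BCH_SB_MEMBER_DELETED_UUID is a tombstone: fast device removal
stamps the removed slot with this uuid instead of zeroing it,
bch2_member_alive() treats it as dead, and bch2_sb_members_clean_deleted()
zeroes it for reuse once gc completes cleanly. A compilable model of that
slot lifecycle (illustrative, not the kernel uuid API):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

typedef struct { unsigned char b[16]; } uuid;

static const uuid deleted_uuid = {
	{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	  0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef }
};
static const uuid zero_uuid;

static bool uuid_eq(const uuid *a, const uuid *b)
{
	return !memcmp(a, b, sizeof(*a));
}

static bool member_alive(const uuid *u)
{
	return !uuid_eq(u, &zero_uuid) && !uuid_eq(u, &deleted_uuid);
}

int main(void)
{
	uuid m = { { 1, 2, 3 } };			/* live member */

	printf("alive: %d\n", member_alive(&m));	/* 1 */
	m = deleted_uuid;				/* fast removal */
	printf("alive: %d\n", member_alive(&m));	/* 0 */
	memset(&m, 0, sizeof(m));			/* clean_deleted */
	printf("alive: %d\n", member_alive(&m));	/* 0, slot reusable */
	return 0;
}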

View File

@ -1427,6 +1427,12 @@ static unsigned live_child(struct bch_fs *c, u32 id)
return ret;
}
static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id)
{
return snapshot_list_has_id(&d->delete_leaves, id) ||
interior_delete_has_id(&d->delete_interior, id) != 0;
}
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
@ -1468,11 +1474,20 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans,
return 0;
}
static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter)
static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum)
{
struct bch_fs *c = trans->c;
struct snapshot_delete *d = &c->snapshot_delete;
u64 inum = iter->btree_id != BTREE_ID_inodes
? iter->pos.inode
: iter->pos.offset;
if (*prev_inum == inum)
return false;
*prev_inum = inum;
bool ret = !snapshot_list_has_id(&d->deleting_from_trees,
bch2_snapshot_tree(c, iter->pos.snapshot));
if (unlikely(ret)) {
@ -1486,6 +1501,129 @@ static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree
return ret;
}
static int delete_dead_snapshot_keys_v1(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct snapshot_delete *d = &c->snapshot_delete;
for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
struct disk_reservation res = { 0 };
u64 prev_inum = 0;
d->pos.pos = POS_MIN;
if (!btree_type_has_snapshots(d->pos.btree))
continue;
int ret = for_each_btree_key_commit(trans, iter,
d->pos.btree, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
d->pos.pos = iter.pos;
if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
continue;
delete_dead_snapshots_process_key(trans, &iter, k);
}));
bch2_disk_reservation_put(c, &res);
if (ret)
return ret;
}
return 0;
}
static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree,
struct bpos start, struct bpos end)
{
struct bch_fs *c = trans->c;
struct snapshot_delete *d = &c->snapshot_delete;
struct disk_reservation res = { 0 };
d->pos.btree = btree;
d->pos.pos = POS_MIN;
int ret = for_each_btree_key_max_commit(trans, iter,
btree, start, end,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
d->pos.pos = iter.pos;
delete_dead_snapshots_process_key(trans, &iter, k);
}));
bch2_disk_reservation_put(c, &res);
return ret;
}
static int delete_dead_snapshot_keys_v2(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct snapshot_delete *d = &c->snapshot_delete;
struct disk_reservation res = { 0 };
u64 prev_inum = 0;
int ret = 0;
struct btree_iter iter;
bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots);
while (1) {
struct bkey_s_c k;
ret = lockrestart_do(trans,
bkey_err(k = bch2_btree_iter_peek(trans, &iter)));
if (ret)
break;
if (!k.k)
break;
d->pos.btree = iter.btree_id;
d->pos.pos = iter.pos;
if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
continue;
if (snapshot_id_dying(d, k.k->p.snapshot)) {
struct bpos start = POS(k.k->p.offset, 0);
struct bpos end = POS(k.k->p.offset, U64_MAX);
ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?:
delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?:
delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end);
if (ret)
break;
bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1));
} else {
bch2_btree_iter_advance(trans, &iter);
}
}
bch2_trans_iter_exit(trans, &iter);
if (ret)
goto err;
prev_inum = 0;
ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_inodes, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
d->pos.btree = iter.btree_id;
d->pos.pos = iter.pos;
if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum))
continue;
delete_dead_snapshots_process_key(trans, &iter, k);
}));
err:
bch2_disk_reservation_put(c, &res);
return ret;
}
/*
* For a given snapshot, if it doesn't have a subvolume that points to it, and
* it doesn't have child snapshot nodes - it's now redundant and we can mark it
@ -1500,6 +1638,7 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s
struct snapshot_delete *d = &c->snapshot_delete;
struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
unsigned live_children = 0;
int ret = 0;
if (BCH_SNAPSHOT_SUBVOL(s.v))
return 0;
@ -1507,6 +1646,7 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s
if (BCH_SNAPSHOT_DELETED(s.v))
return 0;
mutex_lock(&d->lock);
for (unsigned i = 0; i < 2; i++) {
u32 child = le32_to_cpu(s.v->children[i]);
@ -1517,7 +1657,7 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s
u32 tree = bch2_snapshot_tree(c, s.k->p.offset);
if (live_children == 0) {
return snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
snapshot_list_add(c, &d->delete_leaves, s.k->p.offset);
} else if (live_children == 1) {
struct snapshot_interior_delete n = {
@ -1527,14 +1667,15 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s
if (!n.live_child) {
bch_err(c, "error finding live child of snapshot %u", n.id);
return -EINVAL;
ret = -EINVAL;
} else {
ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
darray_push(&d->delete_interior, n);
}
return snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?:
darray_push(&d->delete_interior, n);
} else {
return 0;
}
mutex_unlock(&d->lock);
return ret;
}
static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
@ -1641,13 +1782,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
* For every snapshot node: If we have no live children and it's not
* pointed to by a subvolume, delete it:
*/
mutex_lock(&d->lock);
d->running = true;
d->pos = BBPOS_MIN;
ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k,
check_should_delete_snapshot(trans, k));
mutex_unlock(&d->lock);
if (!bch2_err_matches(ret, EROFS))
bch_err_msg(c, ret, "walking snapshots");
if (ret)
@ -1666,33 +1805,13 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
goto err;
}
for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) {
struct disk_reservation res = { 0 };
d->pos.pos = POS_MIN;
if (!btree_type_has_snapshots(d->pos.btree))
continue;
ret = for_each_btree_key_commit(trans, iter,
d->pos.btree, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc, ({
d->pos.pos = iter.pos;
if (skip_unrelated_snapshot_tree(trans, &iter))
continue;
delete_dead_snapshots_process_key(trans, &iter, k);
}));
bch2_disk_reservation_put(c, &res);
if (!bch2_err_matches(ret, EROFS))
bch_err_msg(c, ret, "deleting keys from dying snapshots");
if (ret)
goto err;
}
ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)
? delete_dead_snapshot_keys_v2(trans)
: delete_dead_snapshot_keys_v1(trans);
if (!bch2_err_matches(ret, EROFS))
bch_err_msg(c, ret, "deleting keys from dying snapshots");
if (ret)
goto err;
darray_for_each(d->delete_leaves, i) {
ret = commit_do(trans, NULL, NULL, 0,
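
Side note: delete_dead_snapshot_keys_v2() above inverts the v1 strategy:
rather than scanning every key in every snapshot-aware btree, it walks the
inodes btree and, for each inode in a dying snapshot, deletes only the
extents/dirents/xattrs ranges belonging to that inode number. A rough
compilable model of the per-inode range deletion (illustrative):

#include <stdbool.h>
#include <stdio.h>

struct key { unsigned long inum; unsigned snapshot; };

static bool snapshot_dying(unsigned snap) { return snap == 3; }

/* stands in for a range-limited btree walk over one inode's keys */
static unsigned delete_range(const struct key *keys, unsigned nr,
			     unsigned long inum)
{
	unsigned deleted = 0;

	for (unsigned i = 0; i < nr; i++)
		deleted += keys[i].inum == inum &&
			   snapshot_dying(keys[i].snapshot);
	return deleted;
}

int main(void)
{
	struct key inodes[]  = { { 10, 1 }, { 10, 3 }, { 11, 2 } };
	struct key extents[] = { { 10, 3 }, { 10, 3 }, { 11, 2 } };
	unsigned deleted = 0;

	for (unsigned i = 0; i < 3; i++)
		if (snapshot_dying(inodes[i].snapshot))
			deleted += delete_range(extents, 3, inodes[i].inum);

	printf("deleted %u extent keys\n", deleted);	/* 2 */
	return 0;
}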

View File

@ -87,7 +87,8 @@ int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version v
struct printbuf buf = PRINTBUF;
prt_str(&buf, "requested incompat feature ");
bch2_version_to_text(&buf, version);
prt_str(&buf, " currently not enabled");
prt_str(&buf, " currently not enabled, allowed up to ");
bch2_version_to_text(&buf, c->sb.version_incompat_allowed);
prt_printf(&buf, "\n set version_upgrade=incompat to enable");
bch_notice(c, "%s", buf.buf);

View File

@ -214,7 +214,6 @@ static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
static void bch2_dev_io_ref_stop(struct bch_dev *, int);
static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
static int bch2_fs_init_rw(struct bch_fs *);
static int bch2_fs_resize_on_mount(struct bch_fs *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
@ -1150,15 +1149,11 @@ int bch2_fs_start(struct bch_fs *c)
cpu_to_le64(now);
rcu_read_unlock();
bch2_write_super(c);
/*
* Don't write superblock yet: recovery might have to downgrade
*/
mutex_unlock(&c->sb_lock);
ret = bch2_fs_resize_on_mount(c);
if (ret) {
up_write(&c->state_lock);
goto err;
}
rcu_read_lock();
for_each_online_member_rcu(c, ca)
if (ca->mi.state == BCH_MEMBER_STATE_rw)
@ -1724,6 +1719,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
{
struct bch_member *m;
unsigned dev_idx = ca->dev_idx, data;
bool fast_device_removal = !bch2_request_incompat_feature(c,
bcachefs_metadata_version_fast_device_removal);
int ret;
down_write(&c->state_lock);
@ -1742,11 +1739,24 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
__bch2_dev_read_only(c, ca);
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
bch_err_msg(ca, ret, "bch2_dev_data_drop()");
ret = fast_device_removal
? bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags)
: bch2_dev_data_drop(c, ca->dev_idx, flags);
if (ret)
goto err;
/* Check if device still has data */
struct bch_dev_usage usage = bch2_dev_usage_read(ca);
for (unsigned i = 0; i < BCH_DATA_NR; i++)
if (!data_type_is_empty(i) &&
!data_type_is_hidden(i) &&
usage.buckets[i]) {
bch_err(ca, "Remove failed: still has data (%s, %llu buckets)",
__bch2_data_types[i], usage.buckets[i]);
ret = -EBUSY;
goto err;
}
ret = bch2_dev_remove_alloc(c, ca);
bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
if (ret)
@ -1810,7 +1820,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
*/
mutex_lock(&c->sb_lock);
m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx);
memset(&m->uuid, 0, sizeof(m->uuid));
if (fast_device_removal)
m->uuid = BCH_SB_MEMBER_DELETED_UUID;
else
memset(&m->uuid, 0, sizeof(m->uuid));
bch2_write_super(c);
@ -2120,7 +2134,7 @@ err:
return ret;
}
static int bch2_fs_resize_on_mount(struct bch_fs *c)
int bch2_fs_resize_on_mount(struct bch_fs *c)
{
for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) {
u64 old_nbuckets = ca->mi.nbuckets;
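
Side note: the bch2_dev_remove() hunks earlier in this file now pick
between the backpointer-based fast path and the full scan, then verify the
device is actually empty before releasing its member slot. A compilable
sketch of that verification step (data types and names are illustrative):

#include <stdbool.h>
#include <stdio.h>

enum data_type { DT_FREE, DT_SB, DT_JOURNAL, DT_BTREE, DT_USER, DT_NR };

/* superblock/journal buckets legitimately remain until the very end */
static bool type_is_hidden(enum data_type t)
{
	return t == DT_SB || t == DT_JOURNAL;
}

static int check_empty(const unsigned long buckets[DT_NR])
{
	for (unsigned t = 0; t < DT_NR; t++)
		if (t != DT_FREE && !type_is_hidden(t) && buckets[t]) {
			fprintf(stderr, "still has data: type %u, %lu buckets\n",
				t, buckets[t]);
			return -1;	/* -EBUSY in the real code */
		}
	return 0;
}

int main(void)
{
	unsigned long buckets[DT_NR] = { [DT_FREE] = 100, [DT_USER] = 3 };

	return check_empty(buckets) ? 1 : 0;
}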

View File

@ -38,6 +38,8 @@ void bch2_fs_read_only(struct bch_fs *);
int bch2_fs_read_write(struct bch_fs *);
int bch2_fs_read_write_early(struct bch_fs *);
int bch2_fs_resize_on_mount(struct bch_fs *);
void __bch2_fs_stop(struct bch_fs *);
void bch2_fs_free(struct bch_fs *);
void bch2_fs_stop(struct bch_fs *);

View File

@ -455,8 +455,10 @@ ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocki
struct stdio_buf *buf = &stdio->output;
unsigned long flags;
ssize_t ret;
again:
if (stdio->done)
return -EPIPE;
spin_lock_irqsave(&buf->lock, flags);
ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
spin_unlock_irqrestore(&buf->lock, flags);