Update bcachefs sources to 6a25f7a00d bcachefs: fix ioctl code

Kent Overstreet 2017-06-13 17:06:05 -08:00
parent 914c4d19ed
commit 38f22164a9
49 changed files with 2927 additions and 2806 deletions

View File

@ -1 +1 @@
14e9ac5016803fc63c1216608c866bef16b4053e
6a25f7a00d08c45b35bed3d649c05286ec60f7f6

View File

@ -69,7 +69,8 @@ SRCS=bcachefs.c \
libbcachefs/btree_gc.c \
libbcachefs/btree_io.c \
libbcachefs/btree_iter.c \
libbcachefs/btree_update.c \
libbcachefs/btree_update_interior.c\
libbcachefs/btree_update_leaf.c \
libbcachefs/buckets.c \
libbcachefs/checksum.c \
libbcachefs/clock.c \

View File

@ -24,6 +24,7 @@
#include <linux/dcache.h>
#include <linux/generic-radix-tree.h>
#include <linux/xattr.h>
#include "bcachefs.h"
#include "btree_update.h"
#include "buckets.h"
#include "dirent.h"

View File

@ -38,6 +38,14 @@ static inline void set_bit(long nr, volatile unsigned long *addr)
__atomic_or_fetch(p, mask, __ATOMIC_RELAXED);
}
static inline void __clear_bit(int nr, volatile unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
*p &= ~mask;
}
static inline void clear_bit(long nr, volatile unsigned long *addr)
{
unsigned long mask = BIT_MASK(nr);
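The hunk above is cut off inside the atomic clear_bit(); as a point of reference, a minimal sketch (not from this commit) of how the atomic variant typically mirrors set_bit() in this userspace shim, assuming the shim's BIT_MASK()/BIT_WORD() helpers and GCC's __atomic builtins:

static inline void clear_bit_sketch(long nr, volatile unsigned long *addr)
{
	unsigned long mask = BIT_MASK(nr);
	unsigned long *p = ((unsigned long *) addr) + BIT_WORD(nr);

	/* atomic AND with the inverted mask, the counterpart of set_bit()'s atomic OR */
	__atomic_and_fetch(p, ~mask, __ATOMIC_RELAXED);
}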

View File

@ -90,6 +90,8 @@ do { \
__wait_event(wq, condition); \
} while (0)
#define wait_event_killable(wq, condition) ({wait_event(wq, condition); 0; })
#define __wait_event_timeout(wq, condition, timeout) \
___wait_event(wq, ___wait_cond_timeout(condition), \
TASK_UNINTERRUPTIBLE, 0, timeout, \

View File

@ -87,7 +87,7 @@ DECLARE_EVENT_CLASS(bio,
),
TP_fast_assign(
__entry->dev = bio->bi_bdev->bd_dev;
__entry->dev = bio->bi_bdev ? bio->bi_bdev->bd_dev : 0;
__entry->sector = bio->bi_iter.bi_sector;
__entry->nr_sector = bio->bi_iter.bi_size >> 9;
blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);

View File

@ -146,17 +146,17 @@ static void pd_controllers_update(struct work_struct *work)
u64 size = (ca->mi.nbuckets -
ca->mi.first_bucket) << bucket_bits;
u64 dirty = stats.buckets_dirty << bucket_bits;
u64 dirty = stats.buckets[S_DIRTY] << bucket_bits;
u64 free = __dev_buckets_free(ca, stats) << bucket_bits;
/*
* Bytes of internal fragmentation, which can be
* reclaimed by copy GC
*/
s64 fragmented = ((stats.buckets_dirty +
s64 fragmented = ((stats.buckets[S_DIRTY] +
stats.buckets_cached) <<
bucket_bits) -
((stats.sectors[S_DIRTY] +
stats.sectors[S_CACHED] ) << 9);
stats.sectors_cached) << 9);
fragmented = max(0LL, fragmented);
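The hunk above mixes the old buckets_dirty/sectors_cached fields with the new buckets[S_DIRTY]/sectors[S_CACHED] form; for clarity, a standalone sketch (parameter names are illustrative, not from the source) of the quantity being computed, i.e. bytes of internal fragmentation reclaimable by copy GC:

static inline s64 fragmented_bytes(u64 dirty_buckets, u64 cached_buckets,
				   u64 dirty_sectors, u64 cached_sectors,
				   unsigned bucket_bits)
{
	/* bytes owned by dirty/cached buckets minus bytes of live data in them */
	s64 v = (s64) ((dirty_buckets + cached_buckets) << bucket_bits) -
		(s64) ((dirty_sectors + cached_sectors) << 9);

	return v > 0 ? v : 0;
}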
@ -912,7 +912,7 @@ static int bch2_allocator_thread(void *arg)
bucket = fifo_peek(&ca->free_inc);
discard_invalidated_bucket(ca, bucket);
if (kthread_should_stop())
goto out;
return 0;
--ca->nr_invalidated;
}
@ -922,7 +922,7 @@ static int bch2_allocator_thread(void *arg)
journal_seq = 0;
ret = bch2_invalidate_free_inc(c, ca, &journal_seq);
if (ret < 0)
goto out;
return 0;
ca->nr_invalidated = ret;
@ -944,7 +944,7 @@ static int bch2_allocator_thread(void *arg)
down_read(&c->gc_lock);
if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
up_read(&c->gc_lock);
goto out;
return 0;
}
while (1) {
@ -973,7 +973,7 @@ static int bch2_allocator_thread(void *arg)
if (wait_buckets_available(c, ca)) {
up_read(&c->gc_lock);
goto out;
return 0;
}
}
up_read(&c->gc_lock);
@ -992,13 +992,6 @@ static int bch2_allocator_thread(void *arg)
* write out the new bucket gens:
*/
}
out:
/*
* Avoid a race with bch2_usage_update() trying to wake us up after
* we've exited:
*/
synchronize_rcu();
return 0;
}
/* Allocation */
@ -1892,18 +1885,20 @@ void bch2_dev_allocator_stop(struct bch_dev *ca)
struct task_struct *p = ca->alloc_thread;
ca->alloc_thread = NULL;
smp_wmb();
/*
* We need an rcu barrier between setting ca->alloc_thread = NULL and
* the thread shutting down to avoid a race with bch2_usage_update() -
* the allocator thread itself does a synchronize_rcu() on exit.
* the thread shutting down to avoid bch2_wake_allocator() racing:
*
* XXX: it would be better to have the rcu barrier be asynchronous
* instead of blocking us here
*/
if (p)
synchronize_rcu();
if (p) {
kthread_stop(p);
put_task_struct(p);
}
}
/* start allocator thread: */
@ -1917,11 +1912,13 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
if (ca->alloc_thread)
return 0;
p = kthread_run(bch2_allocator_thread, ca, "bcache_allocator");
p = kthread_create(bch2_allocator_thread, ca, "bcache_allocator");
if (IS_ERR(p))
return PTR_ERR(p);
get_task_struct(p);
ca->alloc_thread = p;
wake_up_process(p);
return 0;
}
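Since the old and new lines of this hunk are shown interleaved, here is the allocator-thread start/stop pairing the change converges on, gathered into one sketch (hypothetical wrapper names; assumes <linux/kthread.h>, error paths as in the diff):

static int dev_allocator_start_sketch(struct bch_dev *ca)
{
	struct task_struct *p;

	p = kthread_create(bch2_allocator_thread, ca, "bcache_allocator");
	if (IS_ERR(p))
		return PTR_ERR(p);

	get_task_struct(p);	/* keep the task_struct valid for kthread_stop() below */
	ca->alloc_thread = p;
	wake_up_process(p);	/* start running only once ca->alloc_thread is published */
	return 0;
}

static void dev_allocator_stop_sketch(struct bch_dev *ca)
{
	struct task_struct *p = ca->alloc_thread;

	ca->alloc_thread = NULL;
	smp_wmb();		/* order the NULL store before the thread is woken */

	if (p) {
		kthread_stop(p);	/* wakes the thread; waits until it sees kthread_should_stop() */
		put_task_struct(p);	/* drop the reference taken at start */
	}
}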

View File

@ -282,7 +282,6 @@ do { \
#include "alloc_types.h"
#include "buckets_types.h"
#include "clock_types.h"
#include "io_types.h"
#include "journal_types.h"
#include "keylist_types.h"
#include "move_types.h"
@ -365,6 +364,7 @@ struct bch_dev {
char name[BDEVNAME_SIZE];
struct bcache_superblock disk_sb;
int sb_write_error;
struct dev_group self;
@ -721,10 +721,6 @@ struct bch_fs {
atomic64_t key_version;
struct bio_list read_retry_list;
struct work_struct read_retry_work;
spinlock_t read_retry_lock;
struct bio_list btree_write_error_list;
struct work_struct btree_write_error_work;
spinlock_t btree_write_error_lock;

View File

@ -27,9 +27,18 @@ const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
if (k.k->size &&
(bkey_deleted(k.k) || !ops->is_extents))
return "nonzero size field";
if (!ops->is_extents) {
if (k.k->size)
return "nonzero size field";
} else {
if ((k.k->size == 0) != bkey_deleted(k.k))
return "bad size field";
}
if (ops->is_extents &&
!k.k->size &&
!bkey_deleted(k.k))
return "zero size field";
switch (k.k->type) {
case KEY_TYPE_DELETED:
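Taken together, the restructured checks above enforce a single invariant on the size field; as a sketch (helper name is illustrative, not from the source):

static inline bool bkey_size_field_ok(const struct bkey *k, bool is_extents)
{
	return is_extents
		? (k->size == 0) == bkey_deleted(k)	/* extents: zero size iff deleted */
		: k->size == 0;				/* non-extent key types never carry a size */
}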

View File

@ -539,12 +539,12 @@ err:
}
/* Slowpath, don't want it inlined into btree_iter_traverse() */
static noinline struct btree *bch2_btree_node_fill(struct btree_iter *iter,
static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
struct btree_iter *iter,
const struct bkey_i *k,
unsigned level,
enum six_lock_type lock_type)
{
struct bch_fs *c = iter->c;
struct btree *b;
/*
@ -603,7 +603,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_iter *iter,
* The btree node will have either a read or a write lock held, depending on
* the @write parameter.
*/
struct btree *bch2_btree_node_get(struct btree_iter *iter,
struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
const struct bkey_i *k, unsigned level,
enum six_lock_type lock_type)
{
@ -613,7 +613,7 @@ struct btree *bch2_btree_node_get(struct btree_iter *iter,
BUG_ON(level >= BTREE_MAX_DEPTH);
retry:
rcu_read_lock();
b = mca_find(iter->c, k);
b = mca_find(c, k);
rcu_read_unlock();
if (unlikely(!b)) {
@ -622,7 +622,7 @@ retry:
* else we could read in a btree node from disk that's been
* freed:
*/
b = bch2_btree_node_fill(iter, k, level, lock_type);
b = bch2_btree_node_fill(c, iter, k, level, lock_type);
/* We raced and found the btree node in the cache */
if (!b)
@ -706,10 +706,61 @@ retry:
return b;
}
void bch2_btree_node_prefetch(struct btree_iter *iter,
const struct bkey_i *k, unsigned level)
struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
struct btree_iter *iter,
struct btree *b,
enum btree_node_sibling sib)
{
struct btree *parent;
struct btree_node_iter node_iter;
struct bkey_packed *k;
BKEY_PADDED(k) tmp;
struct btree *ret;
unsigned level = b->level;
parent = iter->nodes[level + 1];
if (!parent)
return NULL;
if (!bch2_btree_node_relock(iter, level + 1)) {
bch2_btree_iter_set_locks_want(iter, level + 2);
return ERR_PTR(-EINTR);
}
node_iter = iter->node_iters[parent->level];
k = bch2_btree_node_iter_peek_all(&node_iter, parent);
BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p));
do {
k = sib == btree_prev_sib
? bch2_btree_node_iter_prev_all(&node_iter, parent)
: (bch2_btree_node_iter_advance(&node_iter, parent),
bch2_btree_node_iter_peek_all(&node_iter, parent));
if (!k)
return NULL;
} while (bkey_deleted(k));
bch2_bkey_unpack(parent, &tmp.k, k);
ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent);
if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) {
btree_node_unlock(iter, level);
ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent);
}
if (!IS_ERR(ret) && !bch2_btree_node_relock(iter, level)) {
six_unlock_intent(&ret->lock);
ret = ERR_PTR(-EINTR);
}
return ret;
}
void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k,
unsigned level, enum btree_id btree_id)
{
struct bch_fs *c = iter->c;
struct btree *b;
BUG_ON(level >= BTREE_MAX_DEPTH);
@ -726,7 +777,7 @@ void bch2_btree_node_prefetch(struct btree_iter *iter,
return;
bkey_copy(&b->key, k);
if (bch2_btree_node_hash_insert(c, b, level, iter->btree_id)) {
if (bch2_btree_node_hash_insert(c, b, level, btree_id)) {
/* raced with another fill: */
/* mark as unhashed... */

View File

@ -21,11 +21,16 @@ int bch2_btree_node_cannibalize_lock(struct bch_fs *, struct closure *);
struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_get(struct btree_iter *, const struct bkey_i *,
unsigned, enum six_lock_type);
struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
const struct bkey_i *, unsigned,
enum six_lock_type);
void bch2_btree_node_prefetch(struct btree_iter *, const struct bkey_i *,
unsigned);
struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
struct btree *,
enum btree_node_sibling);
void bch2_btree_node_prefetch(struct bch_fs *, const struct bkey_i *,
unsigned, enum btree_id);
void bch2_fs_btree_exit(struct bch_fs *);
int bch2_fs_btree_init(struct bch_fs *);

View File

@ -7,7 +7,7 @@
#include "alloc.h"
#include "bkey_methods.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_gc.h"
#include "buckets.h"
@ -112,14 +112,14 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k)
* For runtime mark and sweep:
*/
static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
struct bkey_s_c k, unsigned flags)
{
switch (type) {
case BKEY_TYPE_BTREE:
bch2_gc_mark_key(c, k, c->sb.btree_node_size, true);
bch2_gc_mark_key(c, k, c->sb.btree_node_size, true, flags);
return 0;
case BKEY_TYPE_EXTENTS:
bch2_gc_mark_key(c, k, k.k->size, false);
bch2_gc_mark_key(c, k, k.k->size, false, flags);
return bch2_btree_key_recalc_oldest_gen(c, k);
default:
BUG();
@ -151,13 +151,10 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = c->devs[ptr->dev];
struct bucket *g = PTR_BUCKET(ca, ptr);
struct bucket_mark new;
if (!g->mark.gen_valid) {
bucket_cmpxchg(g, new, ({
new.gen = ptr->gen;
new.gen_valid = 1;
}));
g->_mark.gen = ptr->gen;
g->_mark.gen_valid = 1;
ca->need_alloc_write = true;
}
@ -166,10 +163,8 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
type == BKEY_TYPE_BTREE
? "btree" : "data",
ptr->gen, g->mark.gen)) {
bucket_cmpxchg(g, new, ({
new.gen = ptr->gen;
new.gen_valid = 1;
}));
g->_mark.gen = ptr->gen;
g->_mark.gen_valid = 1;
ca->need_alloc_write = true;
set_bit(BCH_FS_FIXED_GENS, &c->flags);
}
@ -184,13 +179,14 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
max_t(u64, k.k->version.lo,
atomic64_read(&c->key_version)));
bch2_btree_mark_key(c, type, k);
bch2_btree_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC);
fsck_err:
return ret;
}
static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
{
enum bkey_type type = btree_node_type(b);
struct btree_node_iter iter;
struct bkey unpacked;
struct bkey_s_c k;
@ -201,8 +197,7 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b)
btree_node_is_extents(b),
&unpacked) {
bch2_bkey_debugcheck(c, b, k);
stale = max(stale, bch2_btree_mark_key(c,
btree_node_type(b), k));
stale = max(stale, bch2_btree_mark_key(c, type, k, 0));
}
return stale;
@ -269,7 +264,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
mutex_lock(&c->btree_root_lock);
b = c->btree_roots[btree_id].b;
bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key));
bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0);
gc_pos_set(c, gc_pos_btree_root(b->btree_id));
mutex_unlock(&c->btree_root_lock);
@ -379,7 +374,7 @@ static void bch2_mark_metadata(struct bch_fs *c)
static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
{
struct bch_fs_usage stats = { 0 };
struct btree_interior_update *as;
struct btree_update *as;
struct pending_btree_node_free *d;
mutex_lock(&c->btree_interior_update_lock);
@ -387,9 +382,10 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
for_each_pending_btree_node_free(c, as, d)
if (d->index_update_done)
__bch2_gc_mark_key(c, bkey_i_to_s_c(&d->key),
c->sb.btree_node_size, true,
&stats);
__bch2_mark_key(c, bkey_i_to_s_c(&d->key),
c->sb.btree_node_size, true,
&stats, 0,
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
/*
* Don't apply stats - pending deletes aren't tracked in
* bch_alloc_stats:
@ -430,7 +426,6 @@ void bch2_gc_start(struct bch_fs *c)
per_cpu_ptr(c->usage_percpu, cpu);
memset(p->s, 0, sizeof(p->s));
p->persistent_reserved = 0;
}
lg_global_unlock(&c->usage_lock);
@ -551,16 +546,14 @@ static void recalc_packed_keys(struct btree *b)
btree_keys_account_key_add(&b->nr, 0, k);
}
static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
struct btree_iter *iter)
static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
struct btree *old_nodes[GC_MERGE_NODES])
{
struct btree *parent = iter->nodes[old_nodes[0]->level + 1];
struct bch_fs *c = iter->c;
unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0;
unsigned blocks = btree_blocks(c) * 2 / 3;
struct btree *new_nodes[GC_MERGE_NODES];
struct btree_interior_update *as;
struct btree_reserve *res;
struct btree_update *as;
struct keylist keylist;
struct bkey_format_state format_state;
struct bkey_format new_format;
@ -580,23 +573,6 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks)
return;
res = bch2_btree_reserve_get(c, parent, nr_old_nodes,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE,
NULL);
if (IS_ERR(res)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_RESERVE_GET);
return;
}
if (bch2_keylist_realloc(&keylist, NULL, 0,
(BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
goto out;
}
/* Find a format that all keys in @old_nodes can pack into */
bch2_bkey_format_init(&format_state);
@ -610,21 +586,38 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES],
if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_FORMAT_FITS);
goto out;
return;
}
if (bch2_keylist_realloc(&keylist, NULL, 0,
(BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
return;
}
as = bch2_btree_update_start(c, iter->btree_id,
btree_update_reserve_required(c, parent) + nr_old_nodes,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE,
NULL);
if (IS_ERR(as)) {
trace_btree_gc_coalesce_fail(c,
BTREE_GC_COALESCE_FAIL_RESERVE_GET);
bch2_keylist_free(&keylist, NULL);
return;
}
trace_btree_gc_coalesce(c, parent, nr_old_nodes);
as = bch2_btree_interior_update_alloc(c);
for (i = 0; i < nr_old_nodes; i++)
bch2_btree_interior_update_will_free_node(c, as, old_nodes[i]);
bch2_btree_interior_update_will_free_node(as, old_nodes[i]);
/* Repack everything with @new_format and sort down to one bset */
for (i = 0; i < nr_old_nodes; i++)
new_nodes[i] =
__bch2_btree_node_alloc_replacement(c, old_nodes[i],
new_format, as, res);
__bch2_btree_node_alloc_replacement(as, old_nodes[i],
new_format);
/*
* Conceptually we concatenate the nodes together and slice them
@ -738,7 +731,7 @@ next:
bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
/* Insert the newly coalesced nodes */
bch2_btree_insert_node(parent, iter, &keylist, res, as);
bch2_btree_insert_node(as, parent, iter, &keylist);
BUG_ON(!bch2_keylist_empty(&keylist));
@ -751,7 +744,7 @@ next:
/* Free the old nodes and update our sliding window */
for (i = 0; i < nr_old_nodes; i++) {
bch2_btree_node_free_inmem(iter, old_nodes[i]);
bch2_btree_node_free_inmem(c, old_nodes[i], iter);
six_unlock_intent(&old_nodes[i]->lock);
/*
@ -768,9 +761,9 @@ next:
six_unlock_intent(&new_nodes[i]->lock);
}
}
out:
bch2_btree_update_done(as);
bch2_keylist_free(&keylist, NULL);
bch2_btree_reserve_put(c, res);
}
static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
@ -814,7 +807,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
}
memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0]));
bch2_coalesce_nodes(merge, &iter);
bch2_coalesce_nodes(c, &iter, merge);
for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
lock_seq[i] = merge[i]->lock.state.seq;

View File

@ -2,10 +2,11 @@
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "debug.h"
@ -872,37 +873,37 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce)
vstruct_end(i) - (void *) i->_data);
}
#define btree_node_error(c, b, ptr, msg, ...) \
#define btree_node_error(c, b, msg, ...) \
do { \
if (write == READ && \
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
mustfix_fsck_err(c, \
"btree node read error at btree %u level %u/%u\n"\
"sector %llu node offset %u bset u64s %u: " msg,\
"btree node read error at btree %u level %u/%u\n"\
"pos %llu:%llu node offset %u bset u64s %u: " msg,\
(b)->btree_id, (b)->level, \
(c)->btree_roots[(b)->btree_id].level, \
(u64) ptr->offset, (b)->written, \
le16_to_cpu((i)->u64s), ##__VA_ARGS__); \
(b)->key.k.p.inode, (b)->key.k.p.offset, \
(b)->written, le16_to_cpu((i)->u64s), \
##__VA_ARGS__); \
} else { \
bch_err(c, "%s at btree %u level %u/%u\n" \
"sector %llu node offset %u bset u64s %u: " msg,\
"pos %llu:%llu node offset %u bset u64s %u: " msg,\
write == WRITE \
? "corrupt metadata in btree node write" \
: "btree node error", \
(b)->btree_id, (b)->level, \
(c)->btree_roots[(b)->btree_id].level, \
(u64) ptr->offset, (b)->written, \
le16_to_cpu((i)->u64s), ##__VA_ARGS__); \
(b)->key.k.p.inode, (b)->key.k.p.offset, \
(b)->written, le16_to_cpu((i)->u64s), \
##__VA_ARGS__); \
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
goto fsck_err; \
} \
} while (0)
static int validate_bset(struct bch_fs *c, struct btree *b,
const struct bch_extent_ptr *ptr,
struct bset *i, unsigned sectors,
unsigned *whiteout_u64s,
int write)
unsigned *whiteout_u64s, int write)
{
struct bkey_packed *k, *prev = NULL;
struct bpos prev_pos = POS_MIN;
@ -910,19 +911,19 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
int ret = 0;
if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) {
btree_node_error(c, b, ptr, "unsupported bset version");
btree_node_error(c, b, "unsupported bset version");
i->u64s = 0;
return 0;
}
if (b->written + sectors > c->sb.btree_node_size) {
btree_node_error(c, b, ptr, "bset past end of btree node");
btree_node_error(c, b, "bset past end of btree node");
i->u64s = 0;
return 0;
}
if (b->written && !i->u64s)
btree_node_error(c, b, ptr, "empty set");
btree_node_error(c, b, "empty set");
if (!BSET_SEPARATE_WHITEOUTS(i)) {
seen_non_whiteout = true;
@ -936,7 +937,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
const char *invalid;
if (!k->u64s) {
btree_node_error(c, b, ptr,
btree_node_error(c, b,
"KEY_U64s 0: %zu bytes of metadata lost",
vstruct_end(i) - (void *) k);
@ -945,7 +946,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
}
if (bkey_next(k) > vstruct_last(i)) {
btree_node_error(c, b, ptr,
btree_node_error(c, b,
"key extends past end of bset");
i->u64s = cpu_to_le16((u64 *) k - i->_data);
@ -953,7 +954,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
}
if (k->format > KEY_FORMAT_CURRENT) {
btree_node_error(c, b, ptr,
btree_node_error(c, b,
"invalid bkey format %u", k->format);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
@ -973,7 +974,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), u);
btree_node_error(c, b, ptr,
btree_node_error(c, b,
"invalid bkey %s: %s", buf, invalid);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
@ -994,7 +995,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
*whiteout_u64s = k->_data - i->_data;
seen_non_whiteout = true;
} else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
btree_node_error(c, b, ptr,
btree_node_error(c, b,
"keys out of order: %llu:%llu > %llu:%llu",
prev_pos.inode,
prev_pos.offset,
@ -1013,32 +1014,7 @@ fsck_err:
return ret;
}
static bool extent_contains_ptr(struct bkey_s_c_extent e,
struct bch_extent_ptr match)
{
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(e, ptr)
if (!memcmp(ptr, &match, sizeof(*ptr)))
return true;
return false;
}
static void bch2_btree_node_read_complete(struct btree_read_bio *rb,
struct btree *b)
{
struct bch_dev *ca = rb->pick.ca;
bio_put(&rb->bio);
percpu_ref_put(&ca->io_ref);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
}
void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
struct bch_dev *ca,
const struct bch_extent_ptr *ptr)
int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b)
{
struct btree_node_entry *bne;
struct bset *i = &b->data->keys;
@ -1049,7 +1025,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
const char *err;
struct bch_csum csum;
struct nonce nonce;
int ret, write = READ;
int ret, should_retry = 0, write = READ;
iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
__bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
@ -1066,24 +1042,22 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
err = "bad magic";
if (le64_to_cpu(b->data->magic) != bset_magic(c))
goto err;
goto retry_err;
err = "bad btree header";
if (!b->data->keys.seq)
goto err;
goto retry_err;
err = "unknown checksum type";
if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
goto err;
/* XXX: retry checksum errors */
goto retry_err;
nonce = btree_nonce(b, i, b->written << 9);
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
err = "bad checksum";
if (bch2_crc_cmp(csum, b->data->csum))
goto err;
goto retry_err;
bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
&b->data->flags,
@ -1116,12 +1090,19 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
err = "incorrect max key";
if (bkey_cmp(b->data->max_key, b->key.k.p))
goto err;
#if 0
/*
* not correct anymore, due to btree node write error
* handling
*
* need to add b->data->seq to btree keys and verify
* against that
*/
err = "incorrect backpointer";
if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key),
b->data->ptr))
goto err;
#endif
err = bch2_bkey_format_validate(&b->data->format);
if (err)
goto err;
@ -1138,22 +1119,21 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
err = "unknown checksum type";
if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)))
goto err;
goto retry_err;
nonce = btree_nonce(b, i, b->written << 9);
csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
err = "bad checksum";
if (memcmp(&csum, &bne->csum, sizeof(csum)))
goto err;
if (bch2_crc_cmp(csum, bne->csum))
goto retry_err;
bset_encrypt(c, i, nonce);
sectors = vstruct_sectors(bne, c->block_bits);
}
ret = validate_bset(c, b, ptr, i, sectors,
&whiteout_u64s, READ);
ret = validate_bset(c, b, i, sectors, &whiteout_u64s, READ);
if (ret)
goto fsck_err;
@ -1208,40 +1188,79 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
btree_node_reset_sib_u64s(b);
out:
mempool_free(iter, &c->fill_iter);
return;
return should_retry;
err:
btree_node_error(c, b, ptr, "%s", err);
btree_node_error(c, b, "%s", err);
fsck_err:
bch2_inconsistent_error(c);
set_btree_node_read_error(b);
goto out;
retry_err:
should_retry = -1;
goto out;
}
static void btree_node_read_work(struct work_struct *work)
{
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
struct bch_dev *ca = rb->pick.ca;
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
const struct bch_extent_ptr *ptr;
struct bch_devs_mask avoid;
bch2_btree_node_read_done(rb->c, rb->bio.bi_private,
rb->pick.ca, &rb->pick.ptr);
bch2_btree_node_read_complete(rb, rb->bio.bi_private);
bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
percpu_ref_put(&rb->pick.ca->io_ref);
if (!bio->bi_error &&
!bch2_btree_node_read_done(c, b))
goto out;
goto err;
out:
bch2_time_stats_update(&c->btree_read_time, rb->start_time);
bio_put(&rb->bio);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
return;
err:
memset(&avoid, 0, sizeof(avoid));
__set_bit(ca->dev_idx, avoid.d);
extent_for_each_ptr(e, ptr) {
memset(&rb->pick, 0, sizeof(rb->pick));
bch2_get_read_device(c, e.k, ptr, NULL, &avoid, &rb->pick);
if (!rb->pick.ca)
continue;
bio_reset(bio);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_bdev = rb->pick.ca->disk_sb.bdev;
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
submit_bio_wait(bio);
bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read");
percpu_ref_put(&rb->pick.ca->io_ref);
if (!bio->bi_error &&
!bch2_btree_node_read_done(c, b))
goto out;
}
set_btree_node_read_error(b);
goto out;
}
static void btree_node_read_endio(struct bio *bio)
{
struct btree *b = bio->bi_private;
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
if (bch2_dev_fatal_io_err_on(bio->bi_error,
rb->pick.ca, "IO error reading bucket %zu",
PTR_BUCKET_NR(rb->pick.ca, &rb->pick.ptr)) ||
bch2_meta_read_fault("btree")) {
set_btree_node_read_error(b);
bch2_btree_node_read_complete(rb, rb->bio.bi_private);
return;
}
INIT_WORK(&rb->work, btree_node_read_work);
schedule_work(&rb->work);
}
@ -1249,7 +1268,6 @@ static void btree_node_read_endio(struct bio *bio)
void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
bool sync)
{
uint64_t start_time = local_clock();
struct extent_pick_ptr pick;
struct btree_read_bio *rb;
struct bio *bio;
@ -1266,6 +1284,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
rb->start_time = local_clock();
rb->pick = pick;
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_bdev = pick.ca->disk_sb.bdev;
@ -1277,19 +1296,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
if (sync) {
submit_bio_wait(bio);
if (bch2_dev_fatal_io_err_on(bio->bi_error,
pick.ca, "IO error reading bucket %zu",
PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
bch2_meta_read_fault("btree")) {
set_btree_node_read_error(b);
goto out;
}
bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr);
bch2_time_stats_update(&c->btree_read_time, start_time);
out:
bch2_btree_node_read_complete(rb, b);
bio->bi_private = b;
btree_node_read_work(&rb->work);
} else {
bio->bi_end_io = btree_node_read_endio;
bio->bi_private = b;
@ -1327,7 +1335,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
return -EIO;
}
bch2_btree_set_root_initial(c, b, NULL);
bch2_btree_set_root_for_read(c, b);
six_unlock_intent(&b->lock);
return 0;
@ -1356,7 +1364,15 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
six_lock_read(&b->lock);
bkey_copy(&tmp.k, &b->key);
six_unlock_read(&b->lock);
if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) {
/* Node has been freed: */
goto out;
}
new_key = bkey_i_to_extent(&tmp.k);
while (wbio->replicas_failed) {
@ -1371,7 +1387,7 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
set_btree_node_noevict(b);
bch2_fatal_error(c);
}
out:
bio_put(&wbio->bio);
btree_node_write_done(c, b);
if (cl)
@ -1385,9 +1401,9 @@ void bch2_btree_write_error_work(struct work_struct *work)
struct bio *bio;
while (1) {
spin_lock_irq(&c->read_retry_lock);
bio = bio_list_pop(&c->read_retry_list);
spin_unlock_irq(&c->read_retry_lock);
spin_lock_irq(&c->btree_write_error_lock);
bio = bio_list_pop(&c->btree_write_error_list);
spin_unlock_irq(&c->btree_write_error_lock);
if (!bio)
break;
@ -1406,7 +1422,7 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca, "btree write") ||
if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") ||
bch2_meta_write_fault("btree"))
set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
@ -1428,7 +1444,7 @@ static void btree_node_write_endio(struct bio *bio)
unsigned long flags;
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bio_list_add(&c->read_retry_list, &wbio->bio);
bio_list_add(&c->btree_write_error_list, &wbio->bio);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
queue_work(c->wq, &c->btree_write_error_work);
return;
@ -1450,7 +1466,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
break;
ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE);
ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE);
if (ret)
bch2_inconsistent_error(c);

View File

@ -10,6 +10,7 @@ struct btree_iter;
struct btree_read_bio {
struct bch_fs *c;
u64 start_time;
struct extent_pick_ptr pick;
struct work_struct work;
struct bio bio;
@ -71,11 +72,10 @@ void bch2_btree_build_aux_trees(struct btree *);
void bch2_btree_init_next(struct bch_fs *, struct btree *,
struct btree_iter *);
void bch2_btree_node_read_done(struct bch_fs *, struct btree *,
struct bch_dev *, const struct bch_extent_ptr *);
int bch2_btree_node_read_done(struct bch_fs *, struct btree *);
void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
const struct bkey_i *, unsigned);
void bch2_btree_complete_write(struct bch_fs *, struct btree *,
struct btree_write *);

View File

@ -247,14 +247,12 @@ fail:
return false;
}
static int __bch2_btree_iter_unlock(struct btree_iter *iter)
static void __bch2_btree_iter_unlock(struct btree_iter *iter)
{
while (iter->nodes_locked)
btree_node_unlock(iter, __ffs(iter->nodes_locked));
iter->flags &= ~BTREE_ITER_UPTODATE;
return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
}
int bch2_btree_iter_unlock(struct btree_iter *iter)
@ -263,7 +261,9 @@ int bch2_btree_iter_unlock(struct btree_iter *iter)
for_each_linked_btree_iter(iter, linked)
__bch2_btree_iter_unlock(linked);
return __bch2_btree_iter_unlock(iter);
__bch2_btree_iter_unlock(iter);
return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
}
/* Btree iterator: */
@ -617,13 +617,9 @@ bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
void bch2_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b)
{
struct btree_iter *linked;
unsigned level = b->level;
for_each_linked_btree_iter(iter, linked)
if (linked->nodes[level] == b) {
btree_node_unlock(linked, level);
linked->nodes[level] = BTREE_ITER_NOT_END;
}
bch2_btree_iter_node_drop(linked, b);
}
void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
@ -631,9 +627,9 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
unsigned level = b->level;
if (iter->nodes[level] == b) {
BUG_ON(b->lock.state.intent_lock != 1);
btree_node_unlock(iter, level);
iter->nodes[level] = BTREE_ITER_NOT_END;
iter->flags &= ~BTREE_ITER_UPTODATE;
}
}
@ -718,7 +714,8 @@ static void btree_iter_prefetch(struct btree_iter *iter)
break;
bch2_bkey_unpack(b, &tmp.k, k);
bch2_btree_node_prefetch(iter, &tmp.k, iter->level);
bch2_btree_node_prefetch(iter->c, &tmp.k,
iter->level, iter->btree_id);
}
if (!was_locked)
@ -735,7 +732,7 @@ static inline int btree_iter_down(struct btree_iter *iter)
bkey_reassemble(&tmp.k, k);
b = bch2_btree_node_get(iter, &tmp.k, level, lock_type);
b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type);
if (unlikely(IS_ERR(b)))
return PTR_ERR(b);
@ -907,6 +904,8 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
{
int ret;
iter->flags &= ~BTREE_ITER_UPTODATE;
if (unlikely(!iter->nodes[iter->level]))
return 0;
@ -1064,11 +1063,14 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
struct btree *b = iter->nodes[0];
struct bkey_packed *k =
__bch2_btree_node_iter_peek_all(&iter->node_iters[0], b);
return (struct bkey_s_c) {
struct bkey_s_c ret = {
.k = &iter->k,
.v = bkeyp_val(&b->format, k)
};
if (debug_check_bkeys(iter->c))
bch2_bkey_debugcheck(iter->c, b, ret);
return ret;
}
while (1) {

View File

@ -10,6 +10,7 @@
*/
#include "btree_iter.h"
#include "btree_io.h"
#include "six.h"
/* matches six lock types */

View File

@ -11,7 +11,7 @@
#include "six.h"
struct open_bucket;
struct btree_interior_update;
struct btree_update;
#define MAX_BSETS 3U
@ -105,7 +105,7 @@ struct btree {
* node to point to them: we update the parent in memory immediately,
* but then we must wait until the children have been written out before
* the update to the parent can be written - this is a list of the
* btree_interior_updates that are blocking this node from being
* btree_updates that are blocking this node from being
* written:
*/
struct list_head write_blocked;
@ -116,7 +116,7 @@ struct btree {
* another write - because that write also won't yet be reachable and
* marking it as completed before it's reachable would be incorrect:
*/
struct btree_interior_update *will_make_reachable;
struct btree_update *will_make_reachable;
struct open_bucket *ob;
@ -265,7 +265,7 @@ static inline bool btree_node_is_extents(struct btree *b)
struct btree_root {
struct btree *b;
struct btree_interior_update *as;
struct btree_update *as;
/* On disk root - see async splits: */
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
@ -312,6 +312,11 @@ enum btree_gc_coalesce_fail_reason {
BTREE_GC_COALESCE_FAIL_FORMAT_FITS,
};
enum btree_node_sibling {
btree_prev_sib,
btree_next_sib,
};
typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *,
struct btree *,
struct btree_node_iter *);

View File

@ -1,310 +1,24 @@
#ifndef _BCACHE_BTREE_INSERT_H
#define _BCACHE_BTREE_INSERT_H
#ifndef _BCACHE_BTREE_UPDATE_H
#define _BCACHE_BTREE_UPDATE_H
#include "btree_cache.h"
#include "btree_iter.h"
#include "buckets.h"
#include "journal.h"
#include "vstructs.h"
struct bch_fs;
struct bkey_format_state;
struct bkey_format;
struct btree;
struct btree_insert;
static inline void btree_node_reset_sib_u64s(struct btree *b)
{
b->sib_u64s[0] = b->nr.live_u64s;
b->sib_u64s[1] = b->nr.live_u64s;
}
struct btree_reserve {
struct disk_reservation disk_res;
unsigned nr;
struct btree *b[BTREE_RESERVE_MAX];
};
void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
struct bkey_format *);
/* Btree node freeing/allocation: */
/*
* Tracks a btree node that has been (or is about to be) freed in memory, but
* has _not_ yet been freed on disk (because the write that makes the new
* node(s) visible and frees the old hasn't completed yet)
*/
struct pending_btree_node_free {
bool index_update_done;
__le64 seq;
enum btree_id btree_id;
unsigned level;
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
*
* When we split/rewrite a node, we do all the updates in memory without
* waiting for any writes to complete - we allocate the new node(s) and update
* the parent node, possibly recursively up to the root.
*
* The end result is that we have one or more new nodes being written -
* possibly several, if there were multiple splits - and then a write (updating
* an interior node) which will make all these new nodes visible.
*
* Additionally, as we split/rewrite nodes we free the old nodes - but the old
* nodes can't be freed (their space on disk can't be reclaimed) until the
* update to the interior node that makes the new node visible completes -
* until then, the old nodes are still reachable on disk.
*
*/
struct btree_interior_update {
struct closure cl;
struct bch_fs *c;
struct list_head list;
/* What kind of update are we doing? */
enum {
BTREE_INTERIOR_NO_UPDATE,
BTREE_INTERIOR_UPDATING_NODE,
BTREE_INTERIOR_UPDATING_ROOT,
BTREE_INTERIOR_UPDATING_AS,
} mode;
unsigned flags;
struct btree_reserve *reserve;
/*
* BTREE_INTERIOR_UPDATING_NODE:
* The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b
* from writing by putting this btree_interior update on the
* @b->write_blocked list with @write_blocked_list:
*/
struct btree *b;
struct list_head write_blocked_list;
/*
* BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
* we're blocking another btree_interior_update
* @parent_as - btree_interior_update that's waiting on our nodes to finish
* writing, before it can make new nodes visible on disk
* @wait - list of child btree_interior_updates that are waiting on this
* btree_interior_update to make all the new nodes visible before they can free
* their old btree nodes
*/
struct btree_interior_update *parent_as;
struct closure_waitlist wait;
/*
* We may be freeing nodes that were dirty, and thus had journal entries
* pinned: we need to transfer the oldest of those pins to the
* btree_interior_update operation, and release it when the new node(s)
* are all persistent and reachable:
*/
struct journal_entry_pin journal;
u64 journal_seq;
/*
* Nodes being freed:
* Protected by c->btree_node_pending_free_lock
*/
struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
unsigned nr_pending;
/* New nodes, that will be made reachable by this update: */
struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
unsigned nr_new_nodes;
/* Only here to reduce stack usage on recursive splits: */
struct keylist parent_keys;
/*
* Enough room for btree_split's keys without realloc - btree node
* pointers never have crc/compression info, so we only need to account
* for the pointers for three keys
*/
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
#define BTREE_INTERIOR_UPDATE_MUST_REWRITE (1 << 0)
#define for_each_pending_btree_node_free(c, as, p) \
list_for_each_entry(as, &c->btree_interior_update_list, list) \
for (p = as->pending; p < as->pending + as->nr_pending; p++)
void bch2_btree_node_free_inmem(struct btree_iter *, struct btree *);
void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *);
struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *,
struct btree *,
struct bkey_format,
struct btree_interior_update *,
struct btree_reserve *);
struct btree_interior_update *
bch2_btree_interior_update_alloc(struct bch_fs *);
void bch2_btree_interior_update_will_free_node(struct bch_fs *,
struct btree_interior_update *,
struct btree *);
void bch2_btree_set_root_initial(struct bch_fs *, struct btree *,
struct btree_reserve *);
void bch2_btree_reserve_put(struct bch_fs *, struct btree_reserve *);
struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *,
struct btree *, unsigned,
unsigned, struct closure *);
int bch2_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *);
/* Inserting into a given leaf node (last stage of insert): */
void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
struct btree_iter *);
bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
struct btree_node_iter *, struct bkey_i *);
struct btree_node_iter *, struct bkey_i *);
void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
struct bkey_i *);
static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
{
return (void *) b->data + btree_bytes(c);
}
static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
struct btree *b)
{
return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
}
static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
struct btree *b)
{
return btree_data_end(c, b);
}
static inline void *write_block(struct btree *b)
{
return (void *) b->data + (b->written << 9);
}
static inline bool bset_written(struct btree *b, struct bset *i)
{
return (void *) i < write_block(b);
}
static inline bool bset_unwritten(struct btree *b, struct bset *i)
{
return (void *) i > write_block(b);
}
static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b,
struct bset *i)
{
return round_up(bset_byte_offset(b, vstruct_end(i)),
block_bytes(c)) >> 9;
}
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
b->whiteout_u64s +
b->uncompacted_whiteout_u64s;
unsigned total = c->sb.btree_node_size << 6;
EBUG_ON(used > total);
if (bset_written(b, i))
return 0;
return total - used;
}
static inline unsigned btree_write_set_buffer(struct btree *b)
{
/*
* Could buffer up larger amounts of keys for btrees with larger keys,
* pending benchmarking:
*/
return 4 << 10;
}
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
unsigned offset = max_t(unsigned, b->written << 9,
bset_byte_offset(b, vstruct_end(i)));
ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
(offset + sizeof(struct btree_node_entry) +
b->whiteout_u64s * sizeof(u64) +
b->uncompacted_whiteout_u64s * sizeof(u64));
EBUG_ON(offset > btree_bytes(c));
if ((unlikely(bset_written(b, i)) && n > 0) ||
(unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
n > btree_write_set_buffer(b)))
return (void *) b->data + offset;
return NULL;
}
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
*/
static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
struct btree *b, unsigned u64s)
{
if (btree_node_is_extents(b)) {
/* The insert key might split an existing key
* (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case:
*/
u64s += BKEY_EXTENT_U64s_MAX;
}
return u64s <= bch_btree_keys_u64s_remaining(c, b);
}
static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
struct bkey_packed *k)
{
if (bset_written(b, bset(b, t))) {
EBUG_ON(b->uncompacted_whiteout_u64s <
bkeyp_key_u64s(&b->format, k));
b->uncompacted_whiteout_u64s -=
bkeyp_key_u64s(&b->format, k);
}
}
static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
struct bkey_packed *k)
{
if (bset_written(b, bset(b, t))) {
BUG_ON(!k->needs_whiteout);
b->uncompacted_whiteout_u64s +=
bkeyp_key_u64s(&b->format, k);
}
}
void bch2_btree_insert_node(struct btree *, struct btree_iter *,
struct keylist *, struct btree_reserve *,
struct btree_interior_update *as);
struct bkey_i *);
/* Normal update interface: */
struct btree_insert {
struct bch_fs *c;
struct bch_fs *c;
struct disk_reservation *disk_res;
struct journal_res journal_res;
u64 *journal_seq;
@ -403,25 +117,6 @@ int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
struct disk_reservation *,
struct extent_insert_hook *, u64 *, unsigned);
static inline bool journal_res_insert_fits(struct btree_insert *trans,
struct btree_insert_entry *insert)
{
unsigned u64s = 0;
struct btree_insert_entry *i;
/*
* If we didn't get a journal reservation, we're in journal replay and
* we're not journalling updates:
*/
if (!trans->journal_res.ref)
return true;
for (i = insert; i < trans->entries + trans->nr; i++)
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
return u64s <= trans->journal_res.u64s;
}
int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
struct disk_reservation *,
struct extent_insert_hook *, u64 *, int flags);
@ -438,5 +133,5 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
int bch2_btree_node_update_key(struct bch_fs *, struct btree *,
struct bkey_i_extent *);
#endif /* _BCACHE_BTREE_INSERT_H */
#endif /* _BCACHE_BTREE_UPDATE_H */

View File

@ -0,0 +1,312 @@
#ifndef _BCACHE_BTREE_UPDATE_INTERIOR_H
#define _BCACHE_BTREE_UPDATE_INTERIOR_H
#include "btree_cache.h"
#include "btree_update.h"
struct btree_reserve {
struct disk_reservation disk_res;
unsigned nr;
struct btree *b[BTREE_RESERVE_MAX];
};
void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
struct bkey_format *);
/* Btree node freeing/allocation: */
/*
* Tracks a btree node that has been (or is about to be) freed in memory, but
* has _not_ yet been freed on disk (because the write that makes the new
* node(s) visible and frees the old hasn't completed yet)
*/
struct pending_btree_node_free {
bool index_update_done;
__le64 seq;
enum btree_id btree_id;
unsigned level;
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};
/*
* Tracks an in progress split/rewrite of a btree node and the update to the
* parent node:
*
* When we split/rewrite a node, we do all the updates in memory without
* waiting for any writes to complete - we allocate the new node(s) and update
* the parent node, possibly recursively up to the root.
*
* The end result is that we have one or more new nodes being written -
* possibly several, if there were multiple splits - and then a write (updating
* an interior node) which will make all these new nodes visible.
*
* Additionally, as we split/rewrite nodes we free the old nodes - but the old
* nodes can't be freed (their space on disk can't be reclaimed) until the
* update to the interior node that makes the new node visible completes -
* until then, the old nodes are still reachable on disk.
*
*/
struct btree_update {
struct closure cl;
struct bch_fs *c;
struct list_head list;
/* What kind of update are we doing? */
enum {
BTREE_INTERIOR_NO_UPDATE,
BTREE_INTERIOR_UPDATING_NODE,
BTREE_INTERIOR_UPDATING_ROOT,
BTREE_INTERIOR_UPDATING_AS,
} mode;
enum btree_id btree_id;
unsigned flags;
struct btree_reserve *reserve;
/*
* BTREE_INTERIOR_UPDATING_NODE:
* The update that made the new nodes visible was a regular update to an
* existing interior node - @b. We can't write out the update to @b
* until the new nodes we created are finished writing, so we block @b
* from writing by putting this btree_interior update on the
* @b->write_blocked list with @write_blocked_list:
*/
struct btree *b;
struct list_head write_blocked_list;
/*
* BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
* we're blocking another btree_update
* @parent_as - btree_update that's waiting on our nodes to finish
* writing, before it can make new nodes visible on disk
* @wait - list of child btree_updates that are waiting on this
* btree_update to make all the new nodes visible before they can free
* their old btree nodes
*/
struct btree_update *parent_as;
struct closure_waitlist wait;
/*
* We may be freeing nodes that were dirty, and thus had journal entries
* pinned: we need to transfer the oldest of those pins to the
* btree_update operation, and release it when the new node(s)
* are all persistent and reachable:
*/
struct journal_entry_pin journal;
u64 journal_seq;
/*
* Nodes being freed:
* Protected by c->btree_node_pending_free_lock
*/
struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
unsigned nr_pending;
/* New nodes, that will be made reachable by this update: */
struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES];
unsigned nr_new_nodes;
/* Only here to reduce stack usage on recursive splits: */
struct keylist parent_keys;
/*
* Enough room for btree_split's keys without realloc - btree node
* pointers never have crc/compression info, so we only need to account
* for the pointers for three keys
*/
u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
};
#define BTREE_INTERIOR_UPDATE_MUST_REWRITE (1 << 0)
#define for_each_pending_btree_node_free(c, as, p) \
list_for_each_entry(as, &c->btree_interior_update_list, list) \
for (p = as->pending; p < as->pending + as->nr_pending; p++)
void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *,
struct btree_iter *);
void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
void bch2_btree_open_bucket_put(struct bch_fs *, struct btree *);
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
struct btree *,
struct bkey_format);
void bch2_btree_update_done(struct btree_update *);
struct btree_update *
bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned,
unsigned, struct closure *);
void bch2_btree_interior_update_will_free_node(struct btree_update *,
struct btree *);
void bch2_btree_insert_node(struct btree_update *, struct btree *,
struct btree_iter *, struct keylist *);
int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
int bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
enum btree_node_sibling);
void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
int bch2_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *);
static inline unsigned btree_update_reserve_required(struct bch_fs *c,
struct btree *b)
{
unsigned depth = btree_node_root(c, b)->level - b->level;
return btree_reserve_required_nodes(depth);
}
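The bch2_coalesce_nodes() hunk earlier in this commit exercises most of the interface declared above; condensed into one sketch of the typical btree_update lifecycle for rewriting a node (hypothetical wrapper name; error handling and the keylist allocation, cf. bch2_keylist_realloc() in that hunk, are elided):

static void rewrite_node_sketch(struct bch_fs *c, struct btree_iter *iter,
				struct btree *parent, struct btree *old,
				struct keylist *keys)
{
	struct btree_update *as;
	struct btree *new;

	as = bch2_btree_update_start(c, iter->btree_id,
				     btree_update_reserve_required(c, parent) + 1,
				     BTREE_INSERT_NOFAIL|BTREE_INSERT_USE_RESERVE,
				     NULL);
	if (IS_ERR(as))
		return;		/* no reserve available; caller backs off and retries */

	/* old node's space can't be reclaimed until the update is visible: */
	bch2_btree_interior_update_will_free_node(as, old);

	/* allocate the replacement and queue its key for the parent update: */
	new = __bch2_btree_node_alloc_replacement(as, old, old->format);
	bch2_keylist_add_in_order(keys, &new->key);

	/* update the parent, making @new reachable: */
	bch2_btree_insert_node(as, parent, iter, keys);

	bch2_btree_node_free_inmem(c, old, iter);
	six_unlock_intent(&old->lock);
	six_unlock_intent(&new->lock);

	bch2_btree_update_done(as);
}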
static inline void btree_node_reset_sib_u64s(struct btree *b)
{
b->sib_u64s[0] = b->nr.live_u64s;
b->sib_u64s[1] = b->nr.live_u64s;
}
static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
{
return (void *) b->data + btree_bytes(c);
}
static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
struct btree *b)
{
return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
}
static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
struct btree *b)
{
return btree_data_end(c, b);
}
static inline void *write_block(struct btree *b)
{
return (void *) b->data + (b->written << 9);
}
static inline bool bset_written(struct btree *b, struct bset *i)
{
return (void *) i < write_block(b);
}
static inline bool bset_unwritten(struct btree *b, struct bset *i)
{
return (void *) i > write_block(b);
}
static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b,
struct bset *i)
{
return round_up(bset_byte_offset(b, vstruct_end(i)),
block_bytes(c)) >> 9;
}
static inline unsigned btree_write_set_buffer(struct btree *b)
{
/*
* Could buffer up larger amounts of keys for btrees with larger keys,
* pending benchmarking:
*/
return 4 << 10;
}
static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
unsigned offset = max_t(unsigned, b->written << 9,
bset_byte_offset(b, vstruct_end(i)));
ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
(offset + sizeof(struct btree_node_entry) +
b->whiteout_u64s * sizeof(u64) +
b->uncompacted_whiteout_u64s * sizeof(u64));
EBUG_ON(offset > btree_bytes(c));
if ((unlikely(bset_written(b, i)) && n > 0) ||
(unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
n > btree_write_set_buffer(b)))
return (void *) b->data + offset;
return NULL;
}
static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
struct bkey_packed *k)
{
if (bset_written(b, bset(b, t))) {
EBUG_ON(b->uncompacted_whiteout_u64s <
bkeyp_key_u64s(&b->format, k));
b->uncompacted_whiteout_u64s -=
bkeyp_key_u64s(&b->format, k);
}
}
static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
struct bkey_packed *k)
{
if (bset_written(b, bset(b, t))) {
BUG_ON(!k->needs_whiteout);
b->uncompacted_whiteout_u64s +=
bkeyp_key_u64s(&b->format, k);
}
}
static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
struct btree *b)
{
struct bset *i = btree_bset_last(b);
unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
b->whiteout_u64s +
b->uncompacted_whiteout_u64s;
unsigned total = c->sb.btree_node_size << 6;
EBUG_ON(used > total);
if (bset_written(b, i))
return 0;
return total - used;
}
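The << 6 above relies on btree_node_size being in 512-byte sectors (the same unit as the b->written + sectors > c->sb.btree_node_size check in the btree_io.c hunk); spelled out:

	/* u64s per node = nr_sectors * 512 bytes/sector / 8 bytes/u64 = nr_sectors * 64 = nr_sectors << 6 */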
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
*/
static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
struct btree *b, unsigned u64s)
{
if (btree_node_is_extents(b)) {
/* The insert key might split an existing key
* (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case:
*/
u64s += BKEY_EXTENT_U64s_MAX;
}
return u64s <= bch_btree_keys_u64s_remaining(c, b);
}
static inline bool journal_res_insert_fits(struct btree_insert *trans,
struct btree_insert_entry *insert)
{
unsigned u64s = 0;
struct btree_insert_entry *i;
/*
* If we didn't get a journal reservation, we're in journal replay and
* we're not journalling updates:
*/
if (!trans->journal_res.ref)
return true;
for (i = insert; i < trans->entries + trans->nr; i++)
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
return u64s <= trans->journal_res.u64s;
}
#endif /* _BCACHE_BTREE_UPDATE_INTERIOR_H */

View File

@ -0,0 +1,660 @@
#include "bcachefs.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "debug.h"
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
/* Inserting into a given leaf node (last stage of insert): */
/* Handle overwrites and do insert, for non extents: */
bool bch2_btree_bset_insert_key(struct btree_iter *iter,
struct btree *b,
struct btree_node_iter *node_iter,
struct bkey_i *insert)
{
const struct bkey_format *f = &b->format;
struct bkey_packed *k;
struct bset_tree *t;
unsigned clobber_u64s;
EBUG_ON(btree_node_just_written(b));
EBUG_ON(bset_written(b, btree_bset_last(b)));
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
bkey_cmp(insert->k.p, b->data->max_key) > 0);
k = bch2_btree_node_iter_peek_all(node_iter, b);
if (k && !bkey_cmp_packed(b, k, &insert->k)) {
BUG_ON(bkey_whiteout(k));
t = bch2_bkey_to_bset(b, k);
if (bset_unwritten(b, bset(b, t)) &&
bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k));
k->type = insert->k.type;
memcpy_u64s(bkeyp_val(f, k), &insert->v,
bkey_val_u64s(&insert->k));
return true;
}
insert->k.needs_whiteout = k->needs_whiteout;
btree_keys_account_key_drop(&b->nr, t - b->set, k);
if (t == bset_tree_last(b)) {
clobber_u64s = k->u64s;
/*
* If we're deleting, and the key we're deleting doesn't
* need a whiteout (it wasn't overwriting a key that had
* been written to disk) - just delete it:
*/
if (bkey_whiteout(&insert->k) && !k->needs_whiteout) {
bch2_bset_delete(b, k, clobber_u64s);
bch2_btree_node_iter_fix(iter, b, node_iter, t,
k, clobber_u64s, 0);
return true;
}
goto overwrite;
}
k->type = KEY_TYPE_DELETED;
bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
k->u64s, k->u64s);
if (bkey_whiteout(&insert->k)) {
reserve_whiteout(b, t, k);
return true;
} else {
k->needs_whiteout = false;
}
} else {
/*
* Deleting, but the key to delete wasn't found - nothing to do:
*/
if (bkey_whiteout(&insert->k))
return false;
insert->k.needs_whiteout = false;
}
t = bset_tree_last(b);
k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
clobber_u64s = 0;
overwrite:
bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k))
bch2_btree_node_iter_fix(iter, b, node_iter, t, k,
clobber_u64s, k->u64s);
return true;
}
static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
unsigned i, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write *w = container_of(pin, struct btree_write, journal);
struct btree *b = container_of(w, struct btree, writes[i]);
six_lock_read(&b->lock);
bch2_btree_node_write_dirty(c, b, NULL,
(btree_current_write(b) == w &&
w->journal.pin_list == journal_seq_pin(j, seq)));
six_unlock_read(&b->lock);
}
static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 0, seq);
}
static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
return __btree_node_flush(j, pin, 1, seq);
}
void bch2_btree_journal_key(struct btree_insert *trans,
struct btree_iter *iter,
struct bkey_i *insert)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree *b = iter->nodes[0];
struct btree_write *w = btree_current_write(b);
EBUG_ON(iter->level || b->level);
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
if (!journal_pin_active(&w->journal))
bch2_journal_pin_add(j, &trans->journal_res,
&w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
if (trans->journal_res.ref) {
u64 seq = trans->journal_res.seq;
bool needs_whiteout = insert->k.needs_whiteout;
/* ick */
insert->k.needs_whiteout = false;
bch2_journal_add_keys(j, &trans->journal_res,
b->btree_id, insert);
insert->k.needs_whiteout = needs_whiteout;
bch2_journal_set_has_inode(j, &trans->journal_res,
insert->k.p.inode);
if (trans->journal_seq)
*trans->journal_seq = seq;
btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
}
if (!btree_node_dirty(b))
set_btree_node_dirty(b);
}
static enum btree_insert_ret
bch2_insert_fixup_key(struct btree_insert *trans,
struct btree_insert_entry *insert)
{
struct btree_iter *iter = insert->iter;
BUG_ON(iter->level);
BUG_ON(insert->k->k.u64s >
bch_btree_keys_u64s_remaining(trans->c, iter->nodes[0]));
if (bch2_btree_bset_insert_key(iter, iter->nodes[0],
&iter->node_iters[0],
insert->k))
bch2_btree_journal_key(trans, iter, insert->k);
trans->did_work = true;
return BTREE_INSERT_OK;
}
static int inline foreground_maybe_merge(struct bch_fs *c,
struct btree_iter *iter,
enum btree_node_sibling sib)
{
struct btree *b;
if (!btree_node_locked(iter, iter->level))
return 0;
b = iter->nodes[iter->level];
if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
return 0;
return bch2_foreground_maybe_merge(c, iter, sib);
}
/**
* btree_insert_key - insert one key into a leaf node
*/
static enum btree_insert_ret
btree_insert_key(struct btree_insert *trans,
struct btree_insert_entry *insert)
{
struct bch_fs *c = trans->c;
struct btree_iter *iter = insert->iter;
struct btree *b = iter->nodes[0];
enum btree_insert_ret ret;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
iter->flags &= ~BTREE_ITER_UPTODATE;
ret = !btree_node_is_extents(b)
? bch2_insert_fixup_key(trans, insert)
: bch2_insert_fixup_extent(trans, insert);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);
if (u64s_added > live_u64s_added &&
bch2_maybe_compact_whiteouts(c, b))
bch2_btree_iter_reinit_node(iter, b);
trace_btree_insert_key(c, b, insert->k);
return ret;
}
static bool same_leaf_as_prev(struct btree_insert *trans,
struct btree_insert_entry *i)
{
/*
* Because we sorted the transaction entries, if multiple iterators
* point to the same leaf node they'll always be adjacent now:
*/
return i != trans->entries &&
i[0].iter->nodes[0] == i[-1].iter->nodes[0];
}
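/*
 * Illustrative note (not part of this commit): because __bch2_btree_insert_at()
 * sorts the transaction entries before inserting, three inserts whose
 * iterators point at leaves A, A, B end up adjacent - so multi_lock_write()
 * below takes the write lock on leaf A once rather than twice.
 */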
#define trans_for_each_entry(trans, i) \
for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
struct btree_iter *iter)
{
bch2_btree_node_lock_write(b, iter);
if (btree_node_just_written(b) &&
bch2_btree_post_write_cleanup(c, b))
bch2_btree_iter_reinit_node(iter, b);
/*
* If the last bset has been written, or if it's gotten too big - start
* a new bset to insert into:
*/
if (want_new_bset(c, b))
bch2_btree_init_next(c, b, iter);
}
static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans)
{
struct btree_insert_entry *i;
trans_for_each_entry(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_lock_for_insert(c, i->iter->nodes[0], i->iter);
}
static void multi_unlock_write(struct btree_insert *trans)
{
struct btree_insert_entry *i;
trans_for_each_entry(trans, i)
if (!same_leaf_as_prev(trans, i))
bch2_btree_node_unlock_write(i->iter->nodes[0], i->iter);
}
static int btree_trans_entry_cmp(const void *_l, const void *_r)
{
const struct btree_insert_entry *l = _l;
const struct btree_insert_entry *r = _r;
return btree_iter_cmp(l->iter, r->iter);
}
/* Normal update interface: */
/**
* __bch2_btree_insert_at - insert keys at given iterator positions
*
* This is the main entry point for btree updates.
*
* Return values:
* -EINTR: locking changed, this function should be called again. Only returned
* if passed BTREE_INSERT_ATOMIC.
* -EROFS: filesystem read only
* -EIO: journal or btree node IO error
*/
int __bch2_btree_insert_at(struct btree_insert *trans)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i;
struct btree_iter *split = NULL;
bool cycle_gc_lock = false;
unsigned u64s;
int ret;
trans_for_each_entry(trans, i) {
BUG_ON(i->iter->level);
BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
BUG_ON(debug_check_bkeys(c) &&
bch2_bkey_invalid(c, i->iter->btree_id,
bkey_i_to_s_c(i->k)));
}
sort(trans->entries, trans->nr, sizeof(trans->entries[0]),
btree_trans_entry_cmp, NULL);
if (unlikely(!percpu_ref_tryget(&c->writes)))
return -EROFS;
retry_locks:
ret = -EINTR;
trans_for_each_entry(trans, i)
if (!bch2_btree_iter_set_locks_want(i->iter, 1))
goto err;
retry:
trans->did_work = false;
u64s = 0;
trans_for_each_entry(trans, i)
if (!i->done)
u64s += jset_u64s(i->k->k.u64s + i->extra_res);
memset(&trans->journal_res, 0, sizeof(trans->journal_res));
ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
? bch2_journal_res_get(&c->journal,
&trans->journal_res,
u64s, u64s)
: 0;
if (ret)
goto err;
multi_lock_write(c, trans);
u64s = 0;
trans_for_each_entry(trans, i) {
/* Multiple inserts might go to same leaf: */
if (!same_leaf_as_prev(trans, i))
u64s = 0;
/*
* bch2_btree_node_insert_fits() must be called under write lock:
* with only an intent lock, another thread can still call
* bch2_btree_node_write(), converting an unwritten bset to a
* written one
*/
if (!i->done) {
u64s += i->k->k.u64s + i->extra_res;
if (!bch2_btree_node_insert_fits(c,
i->iter->nodes[0], u64s)) {
split = i->iter;
goto unlock;
}
}
}
ret = 0;
split = NULL;
cycle_gc_lock = false;
trans_for_each_entry(trans, i) {
if (i->done)
continue;
switch (btree_insert_key(trans, i)) {
case BTREE_INSERT_OK:
i->done = true;
break;
case BTREE_INSERT_JOURNAL_RES_FULL:
case BTREE_INSERT_NEED_TRAVERSE:
ret = -EINTR;
break;
case BTREE_INSERT_NEED_RESCHED:
ret = -EAGAIN;
break;
case BTREE_INSERT_BTREE_NODE_FULL:
split = i->iter;
break;
case BTREE_INSERT_ENOSPC:
ret = -ENOSPC;
break;
case BTREE_INSERT_NEED_GC_LOCK:
cycle_gc_lock = true;
ret = -EINTR;
break;
default:
BUG();
}
if (!trans->did_work && (ret || split))
break;
}
unlock:
multi_unlock_write(trans);
bch2_journal_res_put(&c->journal, &trans->journal_res);
if (split)
goto split;
if (ret)
goto err;
/*
* hack: iterators are inconsistent when they hit end of leaf, until
* traversed again
*/
trans_for_each_entry(trans, i)
if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF)
goto out;
trans_for_each_entry(trans, i)
if (!same_leaf_as_prev(trans, i)) {
foreground_maybe_merge(c, i->iter, btree_prev_sib);
foreground_maybe_merge(c, i->iter, btree_next_sib);
}
out:
/* make sure we didn't lose an error: */
if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
trans_for_each_entry(trans, i)
BUG_ON(!i->done);
percpu_ref_put(&c->writes);
return ret;
split:
/*
* have to drop journal res before splitting, because splitting means
* allocating new btree nodes, and holding a journal reservation
* potentially blocks the allocator:
*/
ret = bch2_btree_split_leaf(c, split, trans->flags);
if (ret)
goto err;
/*
* if the split didn't have to drop locks the insert will still be
* atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
* and is overwriting won't have changed)
*/
goto retry_locks;
err:
if (cycle_gc_lock) {
down_read(&c->gc_lock);
up_read(&c->gc_lock);
}
if (ret == -EINTR) {
trans_for_each_entry(trans, i) {
int ret2 = bch2_btree_iter_traverse(i->iter);
if (ret2) {
ret = ret2;
goto out;
}
}
/*
* BTREE_INSERT_ATOMIC means we have to return -EINTR if we
* dropped locks:
*/
if (!(trans->flags & BTREE_INSERT_ATOMIC))
goto retry;
}
goto out;
}
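/*
 * Illustrative sketch (not part of this commit): how a BTREE_INSERT_ATOMIC
 * caller might drive the -EINTR contract documented above - re-traverse the
 * iterator, re-check whatever it peeked, and retry.  The helper name and the
 * bare retry loop are hypothetical; only the bch2_btree_insert_at() calling
 * convention is taken from this file.
 */
static int example_insert_atomic(struct bch_fs *c, struct btree_iter *iter,
				 struct bkey_i *k, u64 *journal_seq)
{
	int ret;

	do {
		ret = bch2_btree_iter_traverse(iter);
		if (ret)
			break;

		/* a real caller would re-verify the key it peeked here */

		ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
					   BTREE_INSERT_ATOMIC,
					   BTREE_INSERT_ENTRY(iter, k));
	} while (ret == -EINTR);

	return ret;
}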
int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags)
{
struct bkey_i k;
bkey_init(&k.k);
k.k.p = iter->pos;
return bch2_btree_insert_at(iter->c, NULL, NULL, NULL,
BTREE_INSERT_NOFAIL|
BTREE_INSERT_USE_RESERVE|flags,
BTREE_INSERT_ENTRY(iter, &k));
}
int bch2_btree_insert_list_at(struct btree_iter *iter,
struct keylist *keys,
struct disk_reservation *disk_res,
struct extent_insert_hook *hook,
u64 *journal_seq, unsigned flags)
{
BUG_ON(flags & BTREE_INSERT_ATOMIC);
BUG_ON(bch2_keylist_empty(keys));
bch2_verify_keylist_sorted(keys);
while (!bch2_keylist_empty(keys)) {
/* need to traverse between each insert */
int ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
ret = bch2_btree_insert_at(iter->c, disk_res, hook,
journal_seq, flags,
BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys)));
if (ret)
return ret;
bch2_keylist_pop_front(keys);
}
return 0;
}
/**
* bch2_btree_insert - insert a single key into a given btree
* @c: pointer to struct bch_fs
* @id: btree to insert into
* @k: key to insert
* @hook: insert callback
*/
int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
struct bkey_i *k,
struct disk_reservation *disk_res,
struct extent_insert_hook *hook,
u64 *journal_seq, int flags)
{
struct btree_iter iter;
int ret, ret2;
bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
BTREE_ITER_INTENT);
ret = bch2_btree_iter_traverse(&iter);
if (unlikely(ret))
goto out;
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
BTREE_INSERT_ENTRY(&iter, k));
out: ret2 = bch2_btree_iter_unlock(&iter);
return ret ?: ret2;
}
/**
* bch2_btree_update - like bch2_btree_insert(), but only succeeds when it's
* overwriting an existing key (returns -ENOENT otherwise)
*/
int bch2_btree_update(struct bch_fs *c, enum btree_id id,
struct bkey_i *k, u64 *journal_seq)
{
struct btree_iter iter;
struct bkey_s_c u;
int ret;
EBUG_ON(id == BTREE_ID_EXTENTS);
bch2_btree_iter_init(&iter, c, id, k->k.p,
BTREE_ITER_INTENT);
u = bch2_btree_iter_peek_with_holes(&iter);
ret = btree_iter_err(u);
if (ret)
return ret;
if (bkey_deleted(u.k)) {
bch2_btree_iter_unlock(&iter);
return -ENOENT;
}
ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, 0,
BTREE_INSERT_ENTRY(&iter, k));
bch2_btree_iter_unlock(&iter);
return ret;
}
/*
* bch2_btree_delete_range - delete everything within a given range
*
* Range is a half open interval - [start, end)
*/
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
struct bpos start,
struct bpos end,
struct bversion version,
struct disk_reservation *disk_res,
struct extent_insert_hook *hook,
u64 *journal_seq)
{
struct btree_iter iter;
struct bkey_s_c k;
int ret = 0;
bch2_btree_iter_init(&iter, c, id, start,
BTREE_ITER_INTENT);
while ((k = bch2_btree_iter_peek(&iter)).k &&
!(ret = btree_iter_err(k))) {
unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
/* really shouldn't be using a bare, unpadded bkey_i */
struct bkey_i delete;
if (bkey_cmp(iter.pos, end) >= 0)
break;
bkey_init(&delete.k);
/*
* For extents, iter.pos won't necessarily be the same as
* bkey_start_pos(k.k) (for non extents they always will be the
* same). It's important that we delete starting from iter.pos
* because the range we want to delete could start in the middle
* of k.
*
* (bch2_btree_iter_peek() does guarantee that iter.pos >=
* bkey_start_pos(k.k)).
*/
delete.k.p = iter.pos;
delete.k.version = version;
if (iter.flags & BTREE_ITER_IS_EXTENTS) {
/*
* The extents btree is special - KEY_TYPE_DISCARD is
* used for deletions, not KEY_TYPE_DELETED. This is an
* internal implementation detail that probably
* shouldn't be exposed (internally, KEY_TYPE_DELETED is
* used as a proxy for k->size == 0):
*/
delete.k.type = KEY_TYPE_DISCARD;
/* create the biggest key we can */
bch2_key_resize(&delete.k, max_sectors);
bch2_cut_back(end, &delete.k);
}
ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
BTREE_INSERT_NOFAIL,
BTREE_INSERT_ENTRY(&iter, &delete));
if (ret)
break;
bch2_btree_iter_cond_resched(&iter);
}
bch2_btree_iter_unlock(&iter);
return ret;
}
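/*
 * Illustrative usage sketch (not part of this commit): drop every extent of
 * a hypothetical inode @ino from @new_size onwards, i.e. the half open
 * interval [POS(ino, new_size), POS(ino + 1, 0)).  This mirrors the
 * bch2_inode_truncate()/fpunch callers converted elsewhere in this commit.
 */
static int example_drop_extents(struct bch_fs *c, u64 ino, u64 new_size,
				u64 *journal_seq)
{
	return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
				       POS(ino, new_size),
				       POS(ino + 1, 0),
				       ZERO_VERSION,
				       NULL, NULL, journal_seq);
}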

View File

@ -80,21 +80,25 @@ static void bch2_fs_stats_verify(struct bch_fs *c)
{
struct bch_fs_usage stats =
__bch2_fs_usage_read(c);
unsigned i;
if ((s64) stats.sectors_dirty < 0)
panic("sectors_dirty underflow: %lli\n", stats.sectors_dirty);
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
if ((s64) stats.s[i].data[S_META] < 0)
panic("replicas %u meta underflow: %lli\n",
i + 1, stats.s[i].data[S_META]);
if ((s64) stats.sectors_cached < 0)
panic("sectors_cached underflow: %lli\n", stats.sectors_cached);
if ((s64) stats.s[i].data[S_DIRTY] < 0)
panic("replicas %u dirty underflow: %lli\n",
i + 1, stats.s[i].data[S_DIRTY]);
if ((s64) stats.sectors_meta < 0)
panic("sectors_meta underflow: %lli\n", stats.sectors_meta);
if ((s64) stats.s[i].persistent_reserved < 0)
panic("replicas %u reserved underflow: %lli\n",
i + 1, stats.s[i].persistent_reserved);
}
if ((s64) stats.sectors_persistent_reserved < 0)
panic("sectors_persistent_reserved underflow: %lli\n", stats.sectors_persistent_reserved);
if ((s64) stats.sectors_online_reserved < 0)
panic("sectors_online_reserved underflow: %lli\n", stats.sectors_online_reserved);
if ((s64) stats.online_reserved < 0)
panic("sectors_online_reserved underflow: %lli\n",
stats.online_reserved);
}
#else
@ -223,11 +227,8 @@ void bch2_fs_usage_apply(struct bch_fs *c,
struct disk_reservation *disk_res,
struct gc_pos gc_pos)
{
s64 added =
stats->s[S_COMPRESSED][S_META] +
stats->s[S_COMPRESSED][S_DIRTY] +
stats->persistent_reserved +
stats->online_reserved;
struct fs_usage_sum sum = __fs_usage_sum(*stats);
s64 added = sum.data + sum.reserved;
/*
* Not allowed to reduce sectors_available except by getting a
@ -255,19 +256,8 @@ void bch2_fs_usage_apply(struct bch_fs *c,
memset(stats, 0, sizeof(*stats));
}
static void bch2_fs_usage_update(struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new)
{
fs_usage->s[S_COMPRESSED][S_CACHED] +=
(int) new.cached_sectors - (int) old.cached_sectors;
fs_usage->s[S_COMPRESSED][bucket_type(old)] -=
old.dirty_sectors;
fs_usage->s[S_COMPRESSED][bucket_type(new)] +=
new.dirty_sectors;
}
static void bch2_dev_usage_update(struct bch_dev *ca,
struct bucket_mark old, struct bucket_mark new)
struct bucket_mark old, struct bucket_mark new)
{
struct bch_fs *c = ca->fs;
struct bch_dev_usage *dev_usage;
@ -280,7 +270,7 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
preempt_disable();
dev_usage = this_cpu_ptr(ca->usage_percpu);
dev_usage->sectors[S_CACHED] +=
dev_usage->sectors_cached +=
(int) new.cached_sectors - (int) old.cached_sectors;
dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors;
@ -289,9 +279,9 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
dev_usage->buckets_alloc +=
(int) new.owned_by_allocator - (int) old.owned_by_allocator;
dev_usage->buckets_meta += is_meta_bucket(new) - is_meta_bucket(old);
dev_usage->buckets[S_META] += is_meta_bucket(new) - is_meta_bucket(old);
dev_usage->buckets[S_DIRTY] += is_dirty_bucket(new) - is_dirty_bucket(old);
dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old);
dev_usage->buckets_dirty += is_dirty_bucket(new) - is_dirty_bucket(old);
preempt_enable();
if (!is_available_bucket(old) && is_available_bucket(new))
@ -309,7 +299,6 @@ static void bch2_dev_usage_update(struct bch_dev *ca,
bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
struct bucket_mark *old)
{
struct bch_fs_usage stats = { 0 };
struct bucket_mark new;
*old = bucket_data_cmpxchg(ca, g, new, ({
@ -324,12 +313,8 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g,
new.gen++;
}));
/* XXX: we're not actually updating fs usage's cached sectors... */
bch2_fs_usage_update(&stats, *old, new);
if (!old->owned_by_allocator && old->cached_sectors)
trace_invalidate(ca, g - ca->buckets,
old->cached_sectors);
trace_invalidate(ca, g - ca->buckets, old->cached_sectors);
return true;
}
@ -367,12 +352,15 @@ void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g)
void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g,
bool owned_by_allocator)
{
struct bucket_mark new;
struct bucket_mark old, new;
bucket_data_cmpxchg(ca, g, new, ({
old = bucket_data_cmpxchg(ca, g, new, ({
new.touched_this_mount = 1;
new.owned_by_allocator = owned_by_allocator;
}));
BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
ca->fs->gc_pos.phase == GC_PHASE_DONE);
}
#define saturated_add(ca, dst, src, max) \
@ -414,34 +402,14 @@ void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g,
bucket_became_unavailable(ca->fs, old, new));
}
#if 0
/* Reverting this until the copygc + compression issue is fixed: */
static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
{
return crc_compression_type(crc)
? sectors * crc_compressed_size(crc) / crc_uncompressed_size(crc)
: sectors;
return sectors * crc_compressed_size(NULL, crc) /
crc_uncompressed_size(NULL, crc);
}
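/*
 * Worked example (illustrative, not part of this commit): for an extent
 * whose crc entry reports crc_uncompressed_size() == 256 sectors and
 * crc_compressed_size() == 64 sectors (4:1 compression), marking 100
 * logical sectors charges 100 * 64 / 256 = 25 sectors of disk space.
 * bch2_mark_pointer() only takes this path when crc_compression_type()
 * is nonzero.
 */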
static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
{
return crc_compression_type(crc)
? min_t(unsigned, crc_compressed_size(crc), sectors)
: sectors;
}
#else
static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors)
{
return sectors;
}
static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors)
{
return sectors;
}
#endif
/*
* Checking against gc's position has to be done here, inside the cmpxchg()
* loop, to avoid racing with the start of gc clearing all the marks - GC does
@ -452,9 +420,8 @@ static void bch2_mark_pointer(struct bch_fs *c,
const union bch_extent_crc *crc,
const struct bch_extent_ptr *ptr,
s64 sectors, enum s_alloc type,
bool may_make_unavailable,
struct bch_fs_usage *stats,
bool gc_will_visit, u64 journal_seq)
u64 journal_seq, unsigned flags)
{
struct bucket_mark old, new;
unsigned saturated;
@ -462,23 +429,24 @@ static void bch2_mark_pointer(struct bch_fs *c,
struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr);
unsigned data_type = type == S_META
? BUCKET_BTREE : BUCKET_DATA;
unsigned old_sectors, new_sectors;
int disk_sectors, compressed_sectors;
u64 v;
if (sectors > 0) {
old_sectors = 0;
new_sectors = sectors;
} else {
old_sectors = e.k->size;
new_sectors = e.k->size + sectors;
if (crc_compression_type(crc)) {
unsigned old_sectors, new_sectors;
if (sectors > 0) {
old_sectors = 0;
new_sectors = sectors;
} else {
old_sectors = e.k->size;
new_sectors = e.k->size + sectors;
}
sectors = -__disk_sectors(crc, old_sectors)
+__disk_sectors(crc, new_sectors);
}
disk_sectors = -__disk_sectors(crc, old_sectors)
+ __disk_sectors(crc, new_sectors);
compressed_sectors = -__compressed_sectors(crc, old_sectors)
+ __compressed_sectors(crc, new_sectors);
if (gc_will_visit) {
if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
if (journal_seq)
bucket_cmpxchg(g, new, ({
new.touched_this_mount = 1;
@ -486,10 +454,12 @@ static void bch2_mark_pointer(struct bch_fs *c,
new.journal_seq = journal_seq;
}));
goto out;
return;
}
old = bucket_data_cmpxchg(ca, g, new, ({
v = READ_ONCE(g->_mark.counter);
do {
new.counter = old.counter = v;
saturated = 0;
/*
@ -498,21 +468,21 @@ static void bch2_mark_pointer(struct bch_fs *c,
* checked the gen
*/
if (gen_after(new.gen, ptr->gen)) {
EBUG_ON(type != S_CACHED &&
EBUG_ON(!ptr->cached &&
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
return;
}
if (type != S_CACHED &&
if (!ptr->cached &&
new.dirty_sectors == GC_MAX_SECTORS_USED &&
disk_sectors < 0)
saturated = -disk_sectors;
sectors < 0)
saturated = -sectors;
if (type == S_CACHED)
saturated_add(ca, new.cached_sectors, disk_sectors,
if (ptr->cached)
saturated_add(ca, new.cached_sectors, sectors,
GC_MAX_SECTORS_USED);
else
saturated_add(ca, new.dirty_sectors, disk_sectors,
saturated_add(ca, new.dirty_sectors, sectors,
GC_MAX_SECTORS_USED);
if (!new.dirty_sectors &&
@ -528,7 +498,16 @@ static void bch2_mark_pointer(struct bch_fs *c,
}
new.touched_this_mount = 1;
}));
if (flags & BCH_BUCKET_MARK_NOATOMIC) {
g->_mark = new;
break;
}
} while ((v = cmpxchg(&g->_mark.counter,
old.counter,
new.counter)) != old.counter);
bch2_dev_usage_update(ca, old, new);
if (old.data_type != data_type &&
(old.data_type ||
@ -537,7 +516,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)",
g - ca->buckets, old.data_type, new.data_type);
BUG_ON(!may_make_unavailable &&
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
if (saturated &&
@ -549,66 +528,61 @@ static void bch2_mark_pointer(struct bch_fs *c,
wake_up_process(c->gc_thread);
}
}
out:
stats->s[S_COMPRESSED][type] += compressed_sectors;
stats->s[S_UNCOMPRESSED][type] += sectors;
}
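/*
 * Illustrative sketch (not part of this commit) of the lockless
 * read-modify-write pattern bch2_mark_pointer() now uses: snapshot the
 * packed mark, compute the update on a local copy, and retry with cmpxchg()
 * until no other CPU raced with us.  Plain GCC atomics and a hypothetical
 * example_mark type stand in for the kernel helpers here.
 */
struct example_mark { unsigned long counter; };

static void example_mark_add_sectors(struct example_mark *m, unsigned sectors)
{
	unsigned long old, new;

	old = __atomic_load_n(&m->counter, __ATOMIC_RELAXED);
	do {
		/* the "update on a local copy" step: */
		new = old + sectors;
	} while (!__atomic_compare_exchange_n(&m->counter, &old, new, false,
					      __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
}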
static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e,
s64 sectors, bool metadata,
bool may_make_unavailable,
struct bch_fs_usage *stats,
bool gc_will_visit, u64 journal_seq)
u64 journal_seq, unsigned flags)
{
const struct bch_extent_ptr *ptr;
const union bch_extent_crc *crc;
enum s_alloc type = metadata ? S_META : S_DIRTY;
unsigned replicas = 0;
BUG_ON(metadata && bkey_extent_is_cached(e.k));
BUG_ON(!sectors);
extent_for_each_ptr_crc(e, ptr, crc)
bch2_mark_pointer(c, e, crc, ptr, sectors,
ptr->cached ? S_CACHED : type,
may_make_unavailable,
stats, gc_will_visit, journal_seq);
extent_for_each_ptr_crc(e, ptr, crc) {
bch2_mark_pointer(c, e, crc, ptr, sectors, type,
stats, journal_seq, flags);
replicas += !ptr->cached;
}
BUG_ON(replicas >= BCH_REPLICAS_MAX);
if (replicas)
stats->s[replicas - 1].data[type] += sectors;
}
static void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata,
bool may_make_unavailable,
struct bch_fs_usage *stats,
bool gc_will_visit, u64 journal_seq)
void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata,
struct bch_fs_usage *stats,
u64 journal_seq, unsigned flags)
{
switch (k.k->type) {
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
bch2_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata,
may_make_unavailable, stats,
gc_will_visit, journal_seq);
stats, journal_seq, flags);
break;
case BCH_RESERVATION: {
struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
stats->persistent_reserved += r.v->nr_replicas * sectors;
if (r.v->nr_replicas)
stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors;
break;
}
}
}
void __bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata,
struct bch_fs_usage *stats)
{
__bch2_mark_key(c, k, sectors, metadata, true, stats, false, 0);
}
void bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata)
s64 sectors, bool metadata, unsigned flags)
{
struct bch_fs_usage stats = { 0 };
__bch2_gc_mark_key(c, k, sectors, metadata, &stats);
__bch2_mark_key(c, k, sectors, metadata, &stats, 0,
flags|BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
preempt_disable();
bch2_usage_add(this_cpu_ptr(c->usage_percpu), &stats);
@ -619,6 +593,8 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
s64 sectors, bool metadata, struct gc_pos gc_pos,
struct bch_fs_usage *stats, u64 journal_seq)
{
unsigned flags = gc_will_visit(c, gc_pos)
? BCH_BUCKET_MARK_GC_WILL_VISIT : 0;
/*
* synchronization w.r.t. GC:
*
@ -647,9 +623,7 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
* (e.g. the btree node lock, or the relevant allocator lock).
*/
lg_local_lock(&c->usage_lock);
__bch2_mark_key(c, k, sectors, metadata, false, stats,
gc_will_visit(c, gc_pos), journal_seq);
__bch2_mark_key(c, k, sectors, metadata, stats, journal_seq, flags);
bch2_fs_stats_verify(c);
lg_local_unlock(&c->usage_lock);
}

View File

@ -124,9 +124,9 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca,
{
return max_t(s64, 0,
ca->mi.nbuckets - ca->mi.first_bucket -
stats.buckets_dirty -
stats.buckets_alloc -
stats.buckets_meta);
stats.buckets[S_META] -
stats.buckets[S_DIRTY] -
stats.buckets_alloc);
}
/*
@ -157,16 +157,31 @@ struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
struct disk_reservation *, struct gc_pos);
struct fs_usage_sum {
u64 data;
u64 reserved;
};
static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
{
struct fs_usage_sum sum = { 0 };
unsigned i;
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
sum.data += (stats.s[i].data[S_META] +
stats.s[i].data[S_DIRTY]) * (i + 1);
sum.reserved += stats.s[i].persistent_reserved * (i + 1);
}
sum.reserved += stats.online_reserved;
return sum;
}
static inline u64 __bch2_fs_sectors_used(struct bch_fs *c)
{
struct bch_fs_usage stats = __bch2_fs_usage_read(c);
u64 reserved = stats.persistent_reserved +
stats.online_reserved;
struct fs_usage_sum sum = __fs_usage_sum(__bch2_fs_usage_read(c));
return stats.s[S_COMPRESSED][S_META] +
stats.s[S_COMPRESSED][S_DIRTY] +
reserved +
(reserved >> 7);
return sum.data + sum.reserved + (sum.reserved >> 7);
}
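/*
 * Worked example (illustrative, not part of this commit): with 100 sectors
 * of 1x replicated dirty data, 50 sectors of 2x replicated dirty data and
 * 1024 sectors of online_reserved, __fs_usage_sum() gives
 * sum.data = 100 * 1 + 50 * 2 = 200 and sum.reserved = 1024, so
 * __bch2_fs_sectors_used() reports 200 + 1024 + (1024 >> 7) = 1232 sectors -
 * the (reserved >> 7) term pads the total by ~0.8% of outstanding
 * reservations.
 */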
static inline u64 bch2_fs_sectors_used(struct bch_fs *c)
@ -199,9 +214,15 @@ void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool);
void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *,
enum bucket_data_type, bool);
void __bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
struct bch_fs_usage *);
void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool);
#define BCH_BUCKET_MARK_NOATOMIC (1 << 0)
#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 1)
#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 2)
void __bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
struct bch_fs_usage *, u64, unsigned);
void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c,
s64, bool, unsigned);
void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool,
struct gc_pos, struct bch_fs_usage *, u64);

View File

@ -7,7 +7,6 @@
enum bucket_data_type {
BUCKET_DATA = 0,
BUCKET_BTREE,
BUCKET_PRIOS,
BUCKET_JOURNAL,
BUCKET_SB,
};
@ -49,32 +48,33 @@ struct bucket {
};
};
enum s_compressed {
S_COMPRESSED,
S_UNCOMPRESSED,
S_COMPRESSED_NR,
};
/* kill, switch to bucket_data_type */
enum s_alloc {
S_META,
S_DIRTY,
S_CACHED,
S_ALLOC_NR,
};
struct bch_dev_usage {
u64 buckets_dirty;
u64 buckets[S_ALLOC_NR];
u64 buckets_cached;
u64 buckets_meta;
u64 buckets_alloc;
/* _compressed_ sectors: */
u64 sectors[S_ALLOC_NR];
u64 sectors_cached;
};
struct bch_fs_usage {
/* all fields are in units of 512 byte sectors: */
u64 s[S_COMPRESSED_NR][S_ALLOC_NR];
u64 persistent_reserved;
/* _uncompressed_ sectors: */
struct {
u64 data[S_ALLOC_NR];
u64 persistent_reserved;
} s[BCH_REPLICAS_MAX];
u64 online_reserved;
u64 available_cache;
};

View File

@ -73,12 +73,12 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg)
return -EINVAL;
user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL);
if (!devs)
if (!user_devs)
return -ENOMEM;
devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL);
if (copy_from_user(user_devs, user_arg->devs,
if (copy_from_user(user_devs, arg.devs,
sizeof(u64) * arg.nr_devs))
goto err;

View File

@ -71,7 +71,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
memcpy(n_ondisk, n_sorted, btree_bytes(c));
bch2_btree_node_read_done(c, v, pick.ca, &pick.ptr);
bch2_btree_node_read_done(c, v);
n_sorted = c->verify_data->data;
percpu_ref_put(&pick.ca->io_ref);

View File

@ -26,7 +26,7 @@ void bch2_fatal_error(struct bch_fs *c)
bch_err(c, "emergency read only");
}
void bch2_nonfatal_io_error_work(struct work_struct *work)
void bch2_io_error_work(struct work_struct *work)
{
struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work);
struct bch_fs *c = ca->fs;
@ -45,9 +45,9 @@ void bch2_nonfatal_io_error_work(struct work_struct *work)
mutex_unlock(&c->state_lock);
}
void bch2_nonfatal_io_error(struct bch_dev *ca)
void bch2_io_error(struct bch_dev *ca)
{
queue_work(system_long_wq, &ca->io_error_work);
//queue_work(system_long_wq, &ca->io_error_work);
}
#ifdef __KERNEL__

View File

@ -179,63 +179,32 @@ do { \
_ret; \
})
#define bch2_dev_fatal_error(ca, ...) \
do { \
bch_err(ca, __VA_ARGS__); \
bch2_fatal_error(c); \
} while (0)
#define bch2_dev_fatal_io_error(ca, fmt, ...) \
do { \
printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \
"fatal IO error on %s for " fmt), \
(ca)->name, ##__VA_ARGS__); \
bch2_fatal_error((ca)->fs); \
} while (0)
#define bch2_dev_fatal_io_err_on(cond, ca, ...) \
({ \
int _ret = !!(cond); \
\
if (_ret) \
bch2_dev_fatal_io_error(ca, __VA_ARGS__); \
_ret; \
})
/*
* Nonfatal IO errors: either recoverable metadata IO (because we have
* replicas), or data IO - we need to log it and print out a message, but we
* don't (necessarily) want to shut down the fs:
* IO errors: either recoverable metadata IO (because we have replicas), or data
* IO - we need to log it and print out a message, but we don't (necessarily)
* want to shut down the fs:
*/
void bch2_nonfatal_io_error_work(struct work_struct *);
void bch2_io_error_work(struct work_struct *);
/* Does the error handling without logging a message */
void bch2_nonfatal_io_error(struct bch_dev *);
#if 0
#define bch2_fs_nonfatal_io_error(c, ...) \
do { \
bch_err(c, __VA_ARGS__); \
bch2_nonfatal_io_error(c); \
} while (0)
#endif
void bch2_io_error(struct bch_dev *);
/* Logs message and handles the error: */
#define bch2_dev_nonfatal_io_error(ca, fmt, ...) \
#define bch2_dev_io_error(ca, fmt, ...) \
do { \
printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \
"IO error on %s for " fmt), \
(ca)->name, ##__VA_ARGS__); \
bch2_nonfatal_io_error(ca); \
bch2_io_error(ca); \
} while (0)
#define bch2_dev_nonfatal_io_err_on(cond, ca, ...) \
#define bch2_dev_io_err_on(cond, ca, ...) \
({ \
bool _ret = (cond); \
\
if (_ret) \
bch2_dev_nonfatal_io_error(ca, __VA_ARGS__); \
bch2_dev_io_error(ca, __VA_ARGS__); \
_ret; \
})

View File

@ -9,6 +9,8 @@
#include "bkey_methods.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "debug.h"
#include "dirent.h"
@ -497,6 +499,54 @@ out:
return out - buf;
}
void bch2_get_read_device(struct bch_fs *c,
const struct bkey *k,
const struct bch_extent_ptr *ptr,
const union bch_extent_crc *crc,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *pick)
{
struct bch_dev *ca = c->devs[ptr->dev];
if (ptr->cached && ptr_stale(ca, ptr))
return;
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
return;
if (avoid && test_bit(ca->dev_idx, avoid->d))
return;
if (pick->ca && pick->ca->mi.tier < ca->mi.tier)
return;
if (!percpu_ref_tryget(&ca->io_ref))
return;
if (pick->ca)
percpu_ref_put(&pick->ca->io_ref);
*pick = (struct extent_pick_ptr) {
.ptr = *ptr,
.ca = ca,
};
if (k->size)
pick->crc = crc_to_128(k, crc);
}
static void extent_pick_read_device(struct bch_fs *c,
struct bkey_s_c_extent e,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *pick)
{
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
extent_for_each_ptr_crc(e, ptr, crc)
bch2_get_read_device(c, e.k, ptr, crc, avoid, pick);
}
/* Btree ptrs */
static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
@ -615,36 +665,10 @@ static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
struct extent_pick_ptr
bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b)
{
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
struct extent_pick_ptr pick = { .ca = NULL };
extent_for_each_ptr_crc(e, ptr, crc) {
struct bch_dev *ca = c->devs[ptr->dev];
struct btree *root = btree_node_root(c, b);
if (bch2_fs_inconsistent_on(crc, c,
"btree node pointer with crc at btree %u level %u/%u bucket %zu",
b->btree_id, b->level, root ? root->level : -1,
PTR_BUCKET_NR(ca, ptr)))
break;
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
continue;
if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
continue;
if (!percpu_ref_tryget(&ca->io_ref))
continue;
if (pick.ca)
percpu_ref_put(&pick.ca->io_ref);
pick.ca = ca;
pick.ptr = *ptr;
}
extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
NULL, &pick);
return pick;
}
@ -2029,13 +2053,11 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
* as the pointers are sorted by tier, hence preferring pointers to tier 0
* rather than pointers to tier 1.
*/
void bch2_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
struct bch_dev *avoid,
struct extent_pick_ptr *ret)
void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *ret)
{
struct bkey_s_c_extent e;
const union bch_extent_crc *crc;
const struct bch_extent_ptr *ptr;
switch (k.k->type) {
case KEY_TYPE_DELETED:
@ -2053,32 +2075,7 @@ void bch2_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
e = bkey_s_c_to_extent(k);
ret->ca = NULL;
extent_for_each_ptr_crc(e, ptr, crc) {
struct bch_dev *ca = c->devs[ptr->dev];
if (ptr->cached && ptr_stale(ca, ptr))
continue;
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
continue;
if (ret->ca &&
(ca == avoid ||
ret->ca->mi.tier < ca->mi.tier))
continue;
if (!percpu_ref_tryget(&ca->io_ref))
continue;
if (ret->ca)
percpu_ref_put(&ret->ca->io_ref);
*ret = (struct extent_pick_ptr) {
.crc = crc_to_128(e.k, crc),
.ptr = *ptr,
.ca = ca,
};
}
extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret);
if (!ret->ca && !bkey_extent_is_cached(e.k))
ret->ca = ERR_PTR(-EIO);

View File

@ -3,11 +3,16 @@
#include "bcachefs.h"
#include "bkey.h"
#include "io_types.h"
struct bch_fs;
struct journal_res;
struct btree_node_iter;
struct btree_insert;
struct btree_insert_entry;
struct extent_insert_hook;
struct bch_devs_mask;
union bch_extent_crc;
struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *,
struct btree *,
@ -20,27 +25,18 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
extern const struct bkey_ops bch2_bkey_btree_ops;
extern const struct bkey_ops bch2_bkey_extent_ops;
struct bch_fs;
struct journal_res;
struct extent_pick_ptr {
struct bch_extent_crc128 crc;
struct bch_extent_ptr ptr;
struct bch_dev *ca;
};
void bch2_get_read_device(struct bch_fs *,
const struct bkey *,
const struct bch_extent_ptr *,
const union bch_extent_crc *,
struct bch_devs_mask *,
struct extent_pick_ptr *);
struct extent_pick_ptr
bch2_btree_pick_ptr(struct bch_fs *, const struct btree *);
void bch2_extent_pick_ptr_avoiding(struct bch_fs *, struct bkey_s_c,
struct bch_dev *, struct extent_pick_ptr *);
static inline void
bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
struct extent_pick_ptr *ret)
{
bch2_extent_pick_ptr_avoiding(c, k, NULL, ret);
}
void bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
struct bch_devs_mask *,
struct extent_pick_ptr *);
enum btree_insert_ret
bch2_insert_fixup_extent(struct btree_insert *,
@ -558,6 +554,12 @@ void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
struct bch_extent_ptr *
bch2_extent_find_ptr(struct bch_fs *, struct bkey_s_extent,
struct bch_extent_ptr);
struct bch_extent_ptr *
bch2_extent_find_matching_ptr(struct bch_fs *, struct bkey_s_extent,
struct bkey_s_c_extent);
bool bch2_cut_front(struct bpos, struct bkey_i *);
bool bch2_cut_back(struct bpos, struct bkey *);

View File

@ -21,6 +21,8 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/uio.h>
#include <linux/writeback.h>
#include <trace/events/bcachefs.h>
#include <trace/events/writeback.h>
struct bio_set *bch2_writepage_bioset;
@ -700,8 +702,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
{
struct bio *bio = &rbio->bio;
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_PROMOTE|
BCH_READ_MAY_REUSE_BIO;
BCH_READ_MAY_PROMOTE;
while (1) {
struct extent_pick_ptr pick;
@ -727,7 +728,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_iter_unlock(iter);
k = bkey_i_to_s_c(&tmp.k);
bch2_extent_pick_ptr(c, k, &pick);
bch2_extent_pick_ptr(c, k, NULL, &pick);
if (IS_ERR(pick.ca)) {
bcache_io_error(c, bio, "no device to read from");
bio_endio(bio);
@ -753,15 +754,14 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
bkey_extent_is_compressed(k))
bch2_mark_pages_unalloc(bio);
if (is_last)
flags |= BCH_READ_IS_LAST;
if (pick.ca) {
PTR_BUCKET(pick.ca, &pick.ptr)->prio[READ] =
c->prio_clock[READ].hand;
if (!is_last) {
bio_inc_remaining(&rbio->bio);
flags |= BCH_READ_MUST_CLONE;
trace_read_split(&rbio->bio);
}
bch2_read_extent(c, rbio, k, &pick, flags);
flags &= ~BCH_READ_MAY_REUSE_BIO;
} else {
zero_fill_bio(bio);
@ -803,9 +803,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping,
BCH_ENCODED_EXTENT_MAX >> PAGE_SECTOR_SHIFT);
struct bch_read_bio *rbio =
container_of(bio_alloc_bioset(GFP_NOFS, n,
&c->bio_read),
struct bch_read_bio, bio);
to_rbio(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read));
rbio->bio.bi_end_io = bch2_readpages_end_io;
bio_add_page_contig(&rbio->bio, page);
@ -854,9 +852,7 @@ int bch2_readpage(struct file *file, struct page *page)
struct bch_fs *c = inode->i_sb->s_fs_info;
struct bch_read_bio *rbio;
rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1,
&c->bio_read),
struct bch_read_bio, bio);
rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read));
rbio->bio.bi_end_io = bch2_readpages_end_io;
__bchfs_readpage(c, rbio, inode->i_ino, page);
@ -1240,9 +1236,7 @@ static int bch2_read_single_page(struct page *page,
int ret;
DECLARE_COMPLETION_ONSTACK(done);
rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1,
&c->bio_read),
struct bch_read_bio, bio);
rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read));
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_page_end_io;
@ -1464,9 +1458,7 @@ start:
if (iter->count)
closure_get(&dio->cl);
bch2_read(c, container_of(bio,
struct bch_read_bio, bio),
inode->i_ino);
bch2_read(c, to_rbio(bio), inode->i_ino);
}
if (sync) {
@ -2088,13 +2080,14 @@ static long bch2_fpunch(struct inode *inode, loff_t offset, loff_t len)
if (unlikely(ret))
goto out;
ret = bch2_discard(c,
POS(ino, discard_start),
POS(ino, discard_end),
ZERO_VERSION,
&disk_res,
&i_sectors_hook.hook,
&ei->journal_seq);
ret = bch2_btree_delete_range(c,
BTREE_ID_EXTENTS,
POS(ino, discard_start),
POS(ino, discard_end),
ZERO_VERSION,
&disk_res,
&i_sectors_hook.hook,
&ei->journal_seq);
i_sectors_dirty_put(ei, &i_sectors_hook);
bch2_disk_reservation_put(c, &disk_res);

View File

@ -328,8 +328,11 @@ again:
int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size,
struct extent_insert_hook *hook, u64 *journal_seq)
{
return bch2_discard(c, POS(inode_nr, new_size), POS(inode_nr + 1, 0),
ZERO_VERSION, NULL, hook, journal_seq);
return bch2_btree_delete_range(c, BTREE_ID_EXTENTS,
POS(inode_nr, new_size),
POS(inode_nr + 1, 0),
ZERO_VERSION, NULL, hook,
journal_seq);
}
int bch2_inode_rm(struct bch_fs *c, u64 inode_nr)

File diff suppressed because it is too large

View File

@ -13,18 +13,20 @@
void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
const struct bkey_i *);
enum bch_write_flags {
BCH_WRITE_ALLOC_NOWAIT = (1 << 0),
BCH_WRITE_DISCARD = (1 << 1),
BCH_WRITE_CACHED = (1 << 2),
BCH_WRITE_FLUSH = (1 << 3),
BCH_WRITE_DISCARD_ON_ERROR = (1 << 4),
BCH_WRITE_DATA_COMPRESSED = (1 << 5),
BCH_WRITE_CACHED = (1 << 1),
BCH_WRITE_FLUSH = (1 << 2),
BCH_WRITE_DATA_COMPRESSED = (1 << 3),
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6),
BCH_WRITE_DONE = (1 << 7),
BCH_WRITE_LOOPED = (1 << 8),
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 4),
BCH_WRITE_DONE = (1 << 5),
BCH_WRITE_LOOPED = (1 << 6),
__BCH_WRITE_KEYLIST_LOCKED = 8,
};
static inline u64 *op_journal_seq(struct bch_write_op *op)
@ -53,43 +55,54 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio)
return wbio;
}
struct cache_promote_op;
void bch2_wake_delayed_writes(unsigned long data);
struct bch_devs_mask;
struct cache_promote_op;
struct extent_pick_ptr;
void bch2_read_extent_iter(struct bch_fs *, struct bch_read_bio *,
struct bvec_iter, struct bkey_s_c k,
struct extent_pick_ptr *, unsigned);
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
struct bkey_s_c k, struct extent_pick_ptr *, unsigned);
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
u64, struct bch_devs_mask *, unsigned);
enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
BCH_READ_MAY_PROMOTE = 1 << 1,
BCH_READ_USER_MAPPED = 1 << 2,
/* internal: */
BCH_READ_MUST_BOUNCE = 1 << 3,
BCH_READ_MUST_CLONE = 1 << 4,
BCH_READ_IN_RETRY = 1 << 5,
};
static inline void bch2_read_extent(struct bch_fs *c,
struct bch_read_bio *orig,
struct bch_read_bio *rbio,
struct bkey_s_c k,
struct extent_pick_ptr *pick,
unsigned flags)
{
bch2_read_extent_iter(c, orig, orig->bio.bi_iter,
k, pick, flags);
rbio->_state = 0;
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, pick, flags);
}
enum bch_read_flags {
BCH_READ_FORCE_BOUNCE = 1 << 0,
BCH_READ_RETRY_IF_STALE = 1 << 1,
BCH_READ_PROMOTE = 1 << 2,
BCH_READ_IS_LAST = 1 << 3,
BCH_READ_MAY_REUSE_BIO = 1 << 4,
BCH_READ_USER_MAPPED = 1 << 5,
};
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
u64 inode)
{
rbio->_state = 0;
__bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);
}
void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
static inline struct bch_read_bio *rbio_init(struct bio *bio)
{
struct bch_read_bio *rbio = to_rbio(bio);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
const struct bkey_i *);
int bch2_discard(struct bch_fs *, struct bpos, struct bpos,
struct bversion, struct disk_reservation *,
struct extent_insert_hook *, u64 *);
void bch2_read_retry_work(struct work_struct *);
void bch2_wake_delayed_writes(unsigned long data);
rbio->_state = 0;
return rbio;
}
#endif /* _BCACHE_IO_H */

View File

@ -4,11 +4,20 @@
#include "btree_types.h"
#include "buckets_types.h"
#include "keylist_types.h"
#include "super_types.h"
#include <linux/llist.h>
#include <linux/workqueue.h>
struct extent_pick_ptr {
struct bch_extent_crc128 crc;
struct bch_extent_ptr ptr;
struct bch_dev *ca;
};
struct bch_read_bio {
struct bch_fs *c;
/*
* Reads will often have to be split, and if the extent being read from
* was checksummed or compressed we'll also have to allocate bounce
@ -19,33 +28,37 @@ struct bch_read_bio {
*/
union {
struct bch_read_bio *parent;
bio_end_io_t *orig_bi_end_io;
bio_end_io_t *end_io;
};
/*
* Saved copy of parent->bi_iter, from submission time - allows us to
* Saved copy of bio->bi_iter, from submission time - allows us to
* resubmit on IO error, and also to copy data back to the original bio
* when we're bouncing:
*/
struct bvec_iter parent_iter;
struct bvec_iter bvec_iter;
unsigned submit_time_us;
u16 flags;
u8 flags;
union {
struct {
u8 bounce:1,
split:1;
split:1,
process_context:1,
retry:2;
};
u8 _state;
};
struct bch_fs *c;
struct bch_dev *ca;
struct bch_extent_ptr ptr;
struct bch_extent_crc128 crc;
struct extent_pick_ptr pick;
struct bversion version;
struct cache_promote_op *promote;
struct promote_op *promote;
/*
* If we have to retry the read (IO error, checksum failure, read stale
* data (raced with the allocator)), we retry the portion of the parent bio
* that failed (i.e. this bio's portion, parent_iter).
* that failed (i.e. this bio's portion, bvec_iter).
*
* But we need to stash the inode somewhere:
*/
@ -56,12 +69,6 @@ struct bch_read_bio {
struct bio bio;
};
static inline struct bch_read_bio *
bch2_rbio_parent(struct bch_read_bio *rbio)
{
return rbio->split ? rbio->parent : rbio;
}
struct bch_write_bio {
struct bch_fs *c;
struct bch_dev *ca;
@ -132,6 +139,8 @@ struct bch_write_op {
int (*index_update_fn)(struct bch_write_op *);
struct bch_devs_mask failed;
struct keylist insert_keys;
u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2];

View File

@ -10,6 +10,7 @@
#include "buckets.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "checksum.h"
#include "debug.h"
@ -150,7 +151,7 @@ static void journal_seq_blacklist_flush(struct journal *j,
}
for (i = 0;; i++) {
struct btree_interior_update *as;
struct btree_update *as;
struct pending_btree_node_free *d;
mutex_lock(&j->blacklist_lock);
@ -673,9 +674,9 @@ reread: sectors_read = min_t(unsigned,
ret = submit_bio_wait(bio);
if (bch2_dev_fatal_io_err_on(ret, ca,
"journal read from sector %llu",
offset) ||
if (bch2_dev_io_err_on(ret, ca,
"journal read from sector %llu",
offset) ||
bch2_meta_read_fault("journal"))
return -EIO;
@ -1086,7 +1087,6 @@ static bool journal_entry_is_open(struct journal *j)
void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_prev_buf(j);
atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count);
@ -1096,10 +1096,10 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
__bch2_time_stats_update(j->delay_time,
j->need_write_time);
#if 0
closure_call(&j->io, journal_write, NULL, &c->cl);
closure_call(&j->io, journal_write, NULL, NULL);
#else
/* Shut sparse up: */
closure_init(&j->io, &c->cl);
closure_init(&j->io, NULL);
set_closure_fn(&j->io, journal_write, NULL);
journal_write(&j->io);
#endif
@ -1734,13 +1734,11 @@ void bch2_journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
unsigned long flags;
bool wakeup;
if (!journal_pin_active(pin))
return;
bool wakeup = false;
spin_lock_irqsave(&j->pin_lock, flags);
wakeup = __journal_pin_drop(j, pin);
if (journal_pin_active(pin))
wakeup = __journal_pin_drop(j, pin);
spin_unlock_irqrestore(&j->pin_lock, flags);
/*
@ -2099,60 +2097,6 @@ static void journal_write_compact(struct jset *jset)
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
}
static void journal_write_endio(struct bio *bio)
{
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
if (bch2_dev_fatal_io_err_on(bio->bi_error, ca, "journal write") ||
bch2_meta_write_fault("journal"))
bch2_journal_halt(j);
closure_put(&j->io);
percpu_ref_put(&ca->io_ref);
}
static void journal_write_done(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct journal_buf *w = journal_prev_buf(j);
__bch2_time_stats_update(j->write_time, j->write_start_time);
j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
/*
* Updating last_seq_ondisk may let journal_reclaim_work() discard more
* buckets:
*
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
BUG_ON(!j->reservations.prev_buf_unwritten);
atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
&j->reservations.counter);
/*
* XXX: this is racy, we could technically end up doing the wake up
* after the journal_buf struct has been reused for the next write
* (because we're clearing JOURNAL_IO_IN_FLIGHT) and wake up things that
* are waiting on the _next_ write, not this one.
*
* The wake up can't come before, because journal_flush_seq_async() is
* looking at JOURNAL_IO_IN_FLIGHT when it has to wait on a journal
* write that was already in flight.
*
* The right fix is to use a lock here, but using j.lock here means it
* has to be a spin_lock_irqsave() lock which then requires propagating
* the irq()ness to other locks and it's all kinds of nastiness.
*/
closure_wake_up(&w->wait);
wake_up(&j->wait);
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
/* we aren't holding j->lock: */
@ -2172,6 +2116,89 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
buf->size = new_size;
}
static void journal_write_done(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct journal_buf *w = journal_prev_buf(j);
__bch2_time_stats_update(j->write_time, j->write_start_time);
spin_lock(&j->lock);
j->last_seq_ondisk = le64_to_cpu(w->data->last_seq);
/*
* Updating last_seq_ondisk may let journal_reclaim_work() discard more
* buckets:
*
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
BUG_ON(!j->reservations.prev_buf_unwritten);
atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v,
&j->reservations.counter);
closure_wake_up(&w->wait);
wake_up(&j->wait);
if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
mod_delayed_work(system_freezable_wq, &j->write_work, 0);
spin_unlock(&j->lock);
}
static void journal_write_error(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
while (j->replicas_failed) {
unsigned idx = __fls(j->replicas_failed);
bch2_extent_drop_ptr_idx(e, idx);
j->replicas_failed ^= 1 << idx;
}
if (!bch2_extent_nr_ptrs(e.c)) {
bch_err(c, "unable to write journal to sufficient devices");
goto err;
}
if (bch2_check_mark_super(c, e.c, BCH_DATA_JOURNAL))
goto err;
out:
journal_write_done(cl);
return;
err:
bch2_fatal_error(c);
bch2_journal_halt(j);
goto out;
}
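/*
 * Illustrative note (not part of this commit): replicas_failed is consumed
 * highest bit first via __fls() - e.g. a mask of 0b101 drops pointer 2 and
 * then pointer 0 - presumably so that dropping one pointer never shifts the
 * index of another pointer that still has to be dropped.
 */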
static void journal_write_endio(struct bio *bio)
{
struct bch_dev *ca = bio->bi_private;
struct journal *j = &ca->fs->journal;
if (bch2_dev_io_err_on(bio->bi_error, ca, "journal write") ||
bch2_meta_write_fault("journal")) {
/* Was this a flush or an actual journal write? */
if (ca->journal.ptr_idx != U8_MAX) {
set_bit(ca->journal.ptr_idx, &j->replicas_failed);
set_closure_fn(&j->io, journal_write_error,
system_highpri_wq);
}
}
closure_put(&j->io);
percpu_ref_put(&ca->io_ref);
}
static void journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
@ -2181,7 +2208,7 @@ static void journal_write(struct closure *cl)
struct jset *jset;
struct bio *bio;
struct bch_extent_ptr *ptr;
unsigned i, sectors, bytes;
unsigned i, sectors, bytes, ptr_idx = 0;
journal_buf_realloc(j, w);
jset = w->data;
@ -2231,7 +2258,7 @@ static void journal_write(struct closure *cl)
bch2_journal_halt(j);
bch_err(c, "Unable to allocate journal write");
bch2_fatal_error(c);
closure_return_with_destructor(cl, journal_write_done);
continue_at(cl, journal_write_done, system_highpri_wq);
}
if (bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key),
@ -2255,6 +2282,7 @@ static void journal_write(struct closure *cl)
atomic64_add(sectors, &ca->meta_sectors_written);
ca->journal.ptr_idx = ptr_idx++;
bio = ca->journal.bio;
bio_reset(bio);
bio->bi_iter.bi_sector = ptr->offset;
@ -2277,6 +2305,7 @@ static void journal_write(struct closure *cl)
!bch2_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) {
percpu_ref_get(&ca->io_ref);
ca->journal.ptr_idx = U8_MAX;
bio = ca->journal.bio;
bio_reset(bio);
bio->bi_bdev = ca->disk_sb.bdev;
@ -2290,10 +2319,10 @@ no_io:
extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
ptr->offset += sectors;
closure_return_with_destructor(cl, journal_write_done);
continue_at(cl, journal_write_done, system_highpri_wq);
err:
bch2_inconsistent_error(c);
closure_return_with_destructor(cl, journal_write_done);
continue_at(cl, journal_write_done, system_highpri_wq);
}
static void journal_write_work(struct work_struct *work)
@ -2524,18 +2553,61 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
spin_unlock(&j->lock);
}
static int journal_seq_flushed(struct journal *j, u64 seq)
{
struct journal_buf *buf;
int ret = 1;
spin_lock(&j->lock);
BUG_ON(seq > atomic64_read(&j->seq));
if (seq == atomic64_read(&j->seq)) {
bool set_need_write = false;
ret = 0;
buf = journal_cur_buf(j);
if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) {
j->need_write_time = local_clock();
set_need_write = true;
}
switch (journal_buf_switch(j, set_need_write)) {
case JOURNAL_ENTRY_ERROR:
ret = -EIO;
break;
case JOURNAL_ENTRY_CLOSED:
/*
* Journal entry hasn't been opened yet, but caller
* claims it has something (seq == j->seq):
*/
BUG();
case JOURNAL_ENTRY_INUSE:
break;
case JOURNAL_UNLOCKED:
return 0;
}
} else if (seq + 1 == atomic64_read(&j->seq) &&
j->reservations.prev_buf_unwritten) {
ret = bch2_journal_error(j);
}
spin_unlock(&j->lock);
return ret;
}
int bch2_journal_flush_seq(struct journal *j, u64 seq)
{
struct closure cl;
u64 start_time = local_clock();
int ret, ret2;
closure_init_stack(&cl);
bch2_journal_flush_seq_async(j, seq, &cl);
closure_sync(&cl);
ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq)));
bch2_time_stats_update(j->flush_seq_time, start_time);
return bch2_journal_error(j);
return ret ?: ret2 < 0 ? ret2 : 0;
}
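/*
 * Illustrative sketch (not part of this commit) of the wait idiom used
 * above: the condition expression itself assigns the helper's return value,
 * so the sleep ends as soon as journal_seq_flushed() reports either
 * "flushed" (positive) or an error (negative), and that value is still
 * available afterwards.  example_state and example_poll() below are
 * hypothetical stand-ins.
 */
struct example_state {
	wait_queue_head_t	wait;
	atomic_t		status;	/* 0 = pending, 1 = done, <0 = -errno */
};

static int example_poll(struct example_state *s)
{
	return atomic_read(&s->status);
}

static int example_wait(struct example_state *s)
{
	int killed, status;

	killed = wait_event_killable(s->wait, (status = example_poll(s)));
	if (killed)
		return killed;		/* fatal signal: -ERESTARTSYS */

	return status < 0 ? status : 0;
}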
void bch2_journal_meta_async(struct journal *j, struct closure *parent)

View File

@ -139,6 +139,7 @@ struct journal {
struct closure io;
struct delayed_work write_work;
unsigned long replicas_failed;
/* Sequence number of most recent journal entry (last entry in @pin) */
atomic64_t seq;
@ -227,6 +228,7 @@ struct journal_device {
/* Bio for journal reads/writes to this device */
struct bio *bio;
u8 ptr_idx;
/* for bch_journal_read_device */
struct closure read;

View File

@ -53,3 +53,14 @@ void bch2_keylist_pop_front(struct keylist *l)
bkey_next(l->keys),
bch_keylist_u64s(l));
}
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *l)
{
struct bkey_i *k;
for_each_keylist_key(l, k)
BUG_ON(bkey_next(k) != l->top &&
bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
}
#endif

View File

@ -59,4 +59,10 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
#define keylist_single(k) \
((struct keylist) { .keys = k, .top = bkey_next(k) })
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *);
#else
static inline void bch2_verify_keylist_sorted(struct keylist *l) {}
#endif
#endif /* _BCACHE_KEYLIST_H */

View File

@ -72,7 +72,7 @@ int bch2_move_data_off_device(struct bch_dev *ca)
bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE);
ctxt.avoid = ca;
__set_bit(ca->dev_idx, ctxt.avoid.d);
/*
* In theory, only one pass should be necessary as we've

View File

@ -30,7 +30,7 @@ static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c,
}
static struct bch_extent_ptr *bch2_migrate_matching_ptr(struct migrate_write *m,
struct bkey_s_extent e)
struct bkey_s_extent e)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_ptr *ret;
@ -138,11 +138,11 @@ out:
}
void bch2_migrate_write_init(struct bch_fs *c,
struct migrate_write *m,
struct write_point *wp,
struct bkey_s_c k,
const struct bch_extent_ptr *move_ptr,
unsigned flags)
struct migrate_write *m,
struct write_point *wp,
struct bkey_s_c k,
const struct bch_extent_ptr *move_ptr,
unsigned flags)
{
bkey_reassemble(&m->key, k);
@ -178,23 +178,18 @@ static void migrate_bio_init(struct moving_io *io, struct bio *bio,
bch2_bio_map(bio, NULL);
}
static void moving_io_destructor(struct closure *cl)
static void moving_io_free(struct moving_io *io)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->ctxt;
struct bio_vec *bv;
int i;
//if (io->replace.failures)
// trace_copy_collision(q, &io->key.k);
atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight);
wake_up(&ctxt->wait);
bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
if (bv->bv_page)
__free_page(bv->bv_page);
kfree(io);
}
@ -204,27 +199,26 @@ static void moving_error(struct moving_context *ctxt, unsigned flag)
//atomic_or(flag, &ctxt->error_flags);
}
static void moving_io_after_write(struct closure *cl)
static void moving_write_done(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct moving_context *ctxt = io->ctxt;
if (io->write.op.error)
moving_error(ctxt, MOVING_FLAG_WRITE);
moving_error(io->ctxt, MOVING_FLAG_WRITE);
moving_io_destructor(cl);
//if (io->replace.failures)
// trace_copy_collision(q, &io->key.k);
moving_io_free(io);
}
static void write_moving(struct moving_io *io)
static void write_moving(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct bch_write_op *op = &io->write.op;
if (op->error) {
closure_return_with_destructor(&io->cl, moving_io_destructor);
} else {
closure_call(&op->cl, bch2_write, NULL, &io->cl);
closure_return_with_destructor(&io->cl, moving_io_after_write);
}
closure_call(&op->cl, bch2_write, NULL, &io->cl);
closure_return_with_destructor(&io->cl, moving_write_done);
}
static inline struct moving_io *next_pending_write(struct moving_context *ctxt)
@ -243,10 +237,8 @@ static void read_moving_endio(struct bio *bio)
trace_move_read_done(&io->write.key.k);
if (bio->bi_error) {
io->write.op.error = bio->bi_error;
if (bio->bi_error)
moving_error(io->ctxt, MOVING_FLAG_READ);
}
io->read_completed = true;
if (next_pending_write(ctxt))
@ -255,43 +247,21 @@ static void read_moving_endio(struct bio *bio)
closure_put(&ctxt->cl);
}
static void __bch2_data_move(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
struct bch_fs *c = io->write.op.c;
struct extent_pick_ptr pick;
bch2_extent_pick_ptr_avoiding(c, bkey_i_to_s_c(&io->write.key),
io->ctxt->avoid, &pick);
if (IS_ERR_OR_NULL(pick.ca))
closure_return_with_destructor(cl, moving_io_destructor);
bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&io->write.key.k);
io->rbio.bio.bi_end_io = read_moving_endio;
/*
* dropped by read_moving_endio() - guards against use after free of
* ctxt when doing wakeup
*/
closure_get(&io->ctxt->cl);
bch2_read_extent(c, &io->rbio,
bkey_i_to_s_c(&io->write.key),
&pick, BCH_READ_IS_LAST);
}
int bch2_data_move(struct bch_fs *c,
struct moving_context *ctxt,
struct write_point *wp,
struct bkey_s_c k,
const struct bch_extent_ptr *move_ptr)
struct moving_context *ctxt,
struct write_point *wp,
struct bkey_s_c k,
const struct bch_extent_ptr *move_ptr)
{
struct extent_pick_ptr pick;
struct moving_io *io;
bch2_extent_pick_ptr(c, k, &ctxt->avoid, &pick);
if (IS_ERR_OR_NULL(pick.ca))
return pick.ca ? PTR_ERR(pick.ca) : 0;
io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) *
DIV_ROUND_UP(k.k->size, PAGE_SECTORS),
GFP_KERNEL);
DIV_ROUND_UP(k.k->size, PAGE_SECTORS), GFP_KERNEL);
if (!io)
return -ENOMEM;
@ -299,6 +269,10 @@ int bch2_data_move(struct bch_fs *c,
migrate_bio_init(io, &io->rbio.bio, k.k->size);
bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k);
io->rbio.bio.bi_end_io = read_moving_endio;
if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) {
kfree(io);
return -ENOMEM;
@ -318,7 +292,12 @@ int bch2_data_move(struct bch_fs *c,
atomic_add(k.k->size, &ctxt->sectors_in_flight);
list_add_tail(&io->list, &ctxt->reads);
closure_call(&io->cl, __bch2_data_move, NULL, &ctxt->cl);
/*
* dropped by read_moving_endio() - guards against use after free of
* ctxt when doing wakeup
*/
closure_get(&io->ctxt->cl);
bch2_read_extent(c, &io->rbio, k, &pick, 0);
return 0;
}
@ -328,8 +307,14 @@ static void do_pending_writes(struct moving_context *ctxt)
while ((io = next_pending_write(ctxt))) {
list_del(&io->list);
if (io->rbio.bio.bi_error) {
moving_io_free(io);
continue;
}
trace_move_write(&io->write.key.k);
write_moving(io);
closure_call(&io->cl, write_moving, NULL, &ctxt->cl);
}
}
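
The move.c changes above fold __bch2_data_move() into bch2_data_move() and make do_pending_writes() drop any io whose read failed instead of issuing its write. A rough standalone sketch of that drain pattern (not bcachefs code; the io struct and queue below are simplified stand-ins for struct moving_io and the moving_context lists):

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for struct moving_io. */
struct io {
	struct io	*next;
	int		read_error;
	unsigned	sectors;
};

static void issue_write(struct io *io)
{
	printf("writing %u sectors\n", io->sectors);
}

/* Drain reads in submission order; drop the ones whose read failed. */
static void drain_pending(struct io **head)
{
	struct io *io;

	while ((io = *head)) {
		*head = io->next;

		if (io->read_error) {
			free(io);
			continue;
		}

		issue_write(io);
		free(io);
	}
}

int main(void)
{
	struct io *head = NULL, **tail = &head;
	int errors[] = { 0, -5, 0 };	/* the middle read fails */

	for (unsigned i = 0; i < 3; i++) {
		struct io *io = calloc(1, sizeof(*io));

		if (!io)
			return 1;
		io->read_error = errors[i];
		io->sectors = 8 << i;
		*tail = io;
		tail = &io->next;
	}

	drain_pending(&head);
	return 0;
}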

View File

@ -46,7 +46,7 @@ struct moving_context {
struct bch_ratelimit *rate;
/* Try to avoid reading the following device */
struct bch_dev *avoid;
struct bch_devs_mask avoid;
struct list_head reads;

View File

@ -181,7 +181,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
if (val) {
id = bch2_opt_lookup(name);
if (id < 0)
return -EINVAL;
continue;
ret = parse_one_opt(id, val, &v);
if (ret < 0)
@ -196,8 +196,9 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
v = 0;
}
if (bch2_opt_table[id].type != BCH_OPT_BOOL)
return -EINVAL;
if (id < 0 ||
bch2_opt_table[id].type != BCH_OPT_BOOL)
continue;
}
bch2_opt_set(opts, id, v);
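
The opts.c hunks above make mount-option parsing tolerant: an unrecognized option name is now skipped rather than failing the whole parse with -EINVAL. A minimal standalone sketch of that behavior, with a made-up option table standing in for bch2_opt_table/bch2_opt_lookup():

#include <stdio.h>
#include <string.h>

/* Made-up option names; stand-in for bch2_opt_lookup(). */
static int opt_lookup(const char *name)
{
	static const char *known[] = { "errors", "metadata_checksum", "degraded" };

	for (unsigned i = 0; i < sizeof(known) / sizeof(known[0]); i++)
		if (!strcmp(name, known[i]))
			return (int) i;
	return -1;	/* unknown option */
}

int main(void)
{
	char options[] = "errors=remount-ro,not_an_option=1,degraded";
	char *opt;

	for (opt = strtok(options, ","); opt; opt = strtok(NULL, ",")) {
		char *val = strchr(opt, '=');

		if (val)
			*val++ = '\0';

		if (opt_lookup(opt) < 0)
			continue;	/* previously: return -EINVAL */

		printf("parsed %s = %s\n", opt, val ? val : "1");
	}
	return 0;
}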

View File

@ -700,23 +700,18 @@ static void write_super_endio(struct bio *bio)
/* XXX: return errors directly */
bch2_dev_fatal_io_err_on(bio->bi_error, ca, "superblock write");
if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write"))
ca->sb_write_error = 1;
closure_put(&ca->fs->sb_write);
percpu_ref_put(&ca->io_ref);
}
static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
{
struct bch_sb *sb = ca->disk_sb.sb;
struct bio *bio = ca->disk_sb.bio;
if (idx >= sb->layout.nr_superblocks)
return false;
if (!percpu_ref_tryget(&ca->io_ref))
return false;
sb->offset = sb->layout.sb_offset[idx];
SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
@ -734,21 +729,23 @@ static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
bch2_bio_map(bio, sb);
percpu_ref_get(&ca->io_ref);
closure_bio_submit(bio, &c->sb_write);
return true;
}
void bch2_write_super(struct bch_fs *c)
{
struct closure *cl = &c->sb_write;
struct bch_dev *ca;
unsigned i, super_idx = 0;
unsigned i, sb = 0, nr_wrote;
const char *err;
bool wrote;
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
lockdep_assert_held(&c->sb_lock);
closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
le64_add_cpu(&c->disk_sb->seq, 1);
@ -767,15 +764,53 @@ void bch2_write_super(struct bch_fs *c)
test_bit(BCH_FS_ERROR, &c->flags))
goto out;
for_each_online_member(ca, c, i) {
__set_bit(ca->dev_idx, sb_written.d);
ca->sb_write_error = 0;
}
do {
wrote = false;
for_each_online_member(ca, c, i)
if (write_one_super(c, ca, super_idx))
if (sb < ca->disk_sb.sb->layout.nr_superblocks) {
write_one_super(c, ca, sb);
wrote = true;
}
closure_sync(cl);
super_idx++;
sb++;
} while (wrote);
for_each_online_member(ca, c, i)
if (ca->sb_write_error)
__clear_bit(ca->dev_idx, sb_written.d);
nr_wrote = bitmap_weight(sb_written.d, BCH_SB_MEMBERS_MAX);
can_mount_with_written =
bch2_have_enough_devs(c,
__bch2_replicas_status(c, sb_written),
BCH_FORCE_IF_DEGRADED);
for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
bch2_have_enough_devs(c,
__bch2_replicas_status(c, sb_written),
BCH_FORCE_IF_DEGRADED);
/*
* If we would be able to mount _without_ the devices we successfully
* wrote superblocks to, we weren't able to write to enough devices:
*
* Exception: if we can mount without the successes because we haven't
* written anything (new filesystem), we continue if we'd be able to
* mount with the devices we did successfully write to:
*/
bch2_fs_fatal_err_on(!nr_wrote ||
(can_mount_without_written &&
!can_mount_with_written), c,
"Unable to write superblock to sufficient devices");
out:
/* Make new options visible after they're persistent: */
bch2_sb_update(c);
@ -1087,7 +1122,7 @@ int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_dev *dev_to_offline)
struct bch_devs_mask online_devs)
{
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
@ -1114,8 +1149,7 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
if (!replicas_test_dev(e, dev))
continue;
if (bch2_dev_is_online(c->devs[dev]) &&
c->devs[dev] != dev_to_offline)
if (test_bit(dev, online_devs.d))
nr_online++;
else
nr_offline++;
@ -1137,7 +1171,32 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
return __bch2_replicas_status(c, NULL);
return __bch2_replicas_status(c, bch2_online_devs(c));
}
bool bch2_have_enough_devs(struct bch_fs *c,
struct replicas_status s,
unsigned flags)
{
if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
s.replicas[BCH_DATA_BTREE].nr_offline) &&
!(flags & BCH_FORCE_IF_METADATA_DEGRADED))
return false;
if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
!s.replicas[BCH_DATA_BTREE].nr_online) &&
!(flags & BCH_FORCE_IF_METADATA_LOST))
return false;
if (s.replicas[BCH_DATA_USER].nr_offline &&
!(flags & BCH_FORCE_IF_DATA_DEGRADED))
return false;
if (!s.replicas[BCH_DATA_USER].nr_online &&
!(flags & BCH_FORCE_IF_DATA_LOST))
return false;
return true;
}
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
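
The bch2_write_super() changes above track which devices the superblock actually reached (sb_written) and only treat the write as fatal when the successful set would not be enough to mount while its complement would be. A standalone sketch of just that decision, with the two booleans standing in for the __bch2_replicas_status()/bch2_have_enough_devs() results:

#include <stdio.h>

/*
 * nr_wrote:                   superblocks successfully written
 * can_mount_with_written:     could mount using only the devices we wrote to
 * can_mount_without_written:  could mount using only the devices we missed
 */
static int sb_write_failed(unsigned nr_wrote,
			   int can_mount_with_written,
			   int can_mount_without_written)
{
	return !nr_wrote ||
		(can_mount_without_written && !can_mount_with_written);
}

int main(void)
{
	/* wrote to the devices that matter: ok */
	printf("%d\n", sb_write_failed(2, 1, 0));	/* 0 */
	/* new filesystem, nothing written anywhere yet: ok */
	printf("%d\n", sb_write_failed(1, 1, 1));	/* 0 */
	/* only reached devices the fs could mount without: fatal */
	printf("%d\n", sb_write_failed(1, 0, 1));	/* 1 */
	return 0;
}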

View File

@ -4,6 +4,7 @@
#include "extents.h"
#include "eytzinger.h"
#include "super_types.h"
#include "super.h"
#include <asm/byteorder.h>
@ -134,8 +135,9 @@ struct replicas_status {
};
struct replicas_status __bch2_replicas_status(struct bch_fs *,
struct bch_dev *);
struct bch_devs_mask);
struct replicas_status bch2_replicas_status(struct bch_fs *);
bool bch2_have_enough_devs(struct bch_fs *, struct replicas_status, unsigned);
unsigned bch2_replicas_online(struct bch_fs *, bool);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);

View File

@ -11,6 +11,7 @@
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "chardev.h"
#include "checksum.h"
@ -416,7 +417,6 @@ static void bch2_fs_exit(struct bch_fs *c)
del_timer_sync(&c->foreground_write_wakeup);
cancel_delayed_work_sync(&c->pd_controllers_update);
cancel_work_sync(&c->read_only_work);
cancel_work_sync(&c->read_retry_work);
for (i = 0; i < c->sb.nr_devices; i++)
if (c->devs[i])
@ -519,10 +519,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->bio_bounce_pages_lock);
mutex_init(&c->zlib_workspace_lock);
bio_list_init(&c->read_retry_list);
spin_lock_init(&c->read_retry_lock);
INIT_WORK(&c->read_retry_work, bch2_read_retry_work);
bio_list_init(&c->btree_write_error_list);
spin_lock_init(&c->btree_write_error_lock);
INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work);
@ -584,7 +580,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1,
sizeof(struct btree_reserve)) ||
mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
sizeof(struct btree_interior_update)) ||
sizeof(struct btree_update)) ||
mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
bioset_init(&c->btree_read_bio, 1,
offsetof(struct btree_read_bio, bio)) ||
@ -1120,7 +1116,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
spin_lock_init(&ca->freelist_lock);
bch2_dev_moving_gc_init(ca);
INIT_WORK(&ca->io_error_work, bch2_nonfatal_io_error_work);
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
if (bch2_fs_init_fault("dev_alloc"))
goto err;
@ -1262,31 +1258,6 @@ static int __bch2_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
/* Device management: */
static bool have_enough_devs(struct bch_fs *c,
struct replicas_status s,
unsigned flags)
{
if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
s.replicas[BCH_DATA_BTREE].nr_offline) &&
!(flags & BCH_FORCE_IF_METADATA_DEGRADED))
return false;
if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
!s.replicas[BCH_DATA_BTREE].nr_online) &&
!(flags & BCH_FORCE_IF_METADATA_LOST))
return false;
if (s.replicas[BCH_DATA_USER].nr_offline &&
!(flags & BCH_FORCE_IF_DATA_DEGRADED))
return false;
if (!s.replicas[BCH_DATA_USER].nr_online &&
!(flags & BCH_FORCE_IF_DATA_LOST))
return false;
return true;
}
/*
* Note: this function is also used by the error paths - when a particular
* device sees an error, we call it to determine whether we can just set the
@ -1299,6 +1270,7 @@ static bool have_enough_devs(struct bch_fs *c,
bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags)
{
struct bch_devs_mask new_online_devs;
struct replicas_status s;
struct bch_dev *ca2;
int i, nr_rw = 0, required;
@ -1331,19 +1303,12 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
return true;
/* do we have enough devices to read from? */
s = __bch2_replicas_status(c, ca);
new_online_devs = bch2_online_devs(c);
__clear_bit(ca->dev_idx, new_online_devs.d);
pr_info("replicas: j %u %u b %u %u d %u %u",
s.replicas[BCH_DATA_JOURNAL].nr_online,
s.replicas[BCH_DATA_JOURNAL].nr_offline,
s = __bch2_replicas_status(c, new_online_devs);
s.replicas[BCH_DATA_BTREE].nr_online,
s.replicas[BCH_DATA_BTREE].nr_offline,
s.replicas[BCH_DATA_USER].nr_online,
s.replicas[BCH_DATA_USER].nr_offline);
return have_enough_devs(c, s, flags);
return bch2_have_enough_devs(c, s, flags);
default:
BUG();
}
@ -1374,7 +1339,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
s = bch2_replicas_status(c);
return have_enough_devs(c, s, flags);
return bch2_have_enough_devs(c, s, flags);
}
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)

View File

@ -94,6 +94,18 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
__for_each_online_member(ca, c, iter, \
(1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
struct bch_devs_mask devs;
struct bch_dev *ca;
unsigned i;
memset(&devs, 0, sizeof(devs));
for_each_online_member(ca, c, i)
__set_bit(ca->dev_idx, devs.d);
return devs;
}
struct bch_fs *bch2_bdev_to_fs(struct block_device *);
struct bch_fs *bch2_uuid_to_fs(uuid_le);
int bch2_congested(struct bch_fs *, int);

View File

@ -9,4 +9,8 @@ struct bcache_superblock {
fmode_t mode;
};
struct bch_devs_mask {
unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
};
#endif /* _BCACHE_SUPER_TYPES_H */
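
struct bch_devs_mask above is a fixed-size bitmap keyed by device index, built by helpers like bch2_online_devs() with __set_bit() and summed with bitmap_weight(). A standalone userspace sketch of the same pattern (MAX_DEVS and the helpers below are simplified stand-ins for BCH_SB_MEMBERS_MAX and the kernel bitmap API):

#include <stdbool.h>
#include <stdio.h>

#define MAX_DEVS	64	/* stand-in for BCH_SB_MEMBERS_MAX */
#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct devs_mask {
	unsigned long d[BITS_TO_LONGS(MAX_DEVS)];
};

static void mask_set(struct devs_mask *m, unsigned idx)
{
	m->d[idx / BITS_PER_LONG] |= 1UL << (idx % BITS_PER_LONG);
}

static void mask_clear(struct devs_mask *m, unsigned idx)
{
	m->d[idx / BITS_PER_LONG] &= ~(1UL << (idx % BITS_PER_LONG));
}

static bool mask_test(const struct devs_mask *m, unsigned idx)
{
	return m->d[idx / BITS_PER_LONG] & (1UL << (idx % BITS_PER_LONG));
}

/* Userspace stand-in for bitmap_weight(). */
static unsigned mask_weight(const struct devs_mask *m)
{
	unsigned i, nr = 0;

	for (i = 0; i < MAX_DEVS; i++)
		nr += mask_test(m, i);
	return nr;
}

int main(void)
{
	struct devs_mask sb_written = { { 0 } };

	/* devices 0 and 2 were written; device 2 then reports a write error */
	mask_set(&sb_written, 0);
	mask_set(&sb_written, 2);
	mask_clear(&sb_written, 2);

	printf("nr_wrote = %u\n", mask_weight(&sb_written));
	return 0;
}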

View File

@ -232,24 +232,36 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
return scnprintf(buf, PAGE_SIZE,
"capacity:\t\t%llu\n"
"compressed:\n"
"1 replicas:\n"
"\tmeta:\t\t%llu\n"
"\tdirty:\t\t%llu\n"
"\tcached:\t\t%llu\n"
"uncompressed:\n"
"\treserved:\t%llu\n"
"2 replicas:\n"
"\tmeta:\t\t%llu\n"
"\tdirty:\t\t%llu\n"
"\tcached:\t\t%llu\n"
"persistent reserved sectors:\t%llu\n"
"online reserved sectors:\t%llu\n",
"\treserved:\t%llu\n"
"3 replicas:\n"
"\tmeta:\t\t%llu\n"
"\tdirty:\t\t%llu\n"
"\treserved:\t%llu\n"
"4 replicas:\n"
"\tmeta:\t\t%llu\n"
"\tdirty:\t\t%llu\n"
"\treserved:\t%llu\n"
"online reserved:\t%llu\n",
c->capacity,
stats.s[S_COMPRESSED][S_META],
stats.s[S_COMPRESSED][S_DIRTY],
stats.s[S_COMPRESSED][S_CACHED],
stats.s[S_UNCOMPRESSED][S_META],
stats.s[S_UNCOMPRESSED][S_DIRTY],
stats.s[S_UNCOMPRESSED][S_CACHED],
stats.persistent_reserved,
stats.s[0].data[S_META],
stats.s[0].data[S_DIRTY],
stats.s[0].persistent_reserved,
stats.s[1].data[S_META],
stats.s[1].data[S_DIRTY],
stats.s[1].persistent_reserved,
stats.s[2].data[S_META],
stats.s[2].data[S_DIRTY],
stats.s[2].persistent_reserved,
stats.s[3].data[S_META],
stats.s[3].data[S_DIRTY],
stats.s[3].persistent_reserved,
stats.online_reserved);
}
@ -708,8 +720,8 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets_meta, ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets_dirty, ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets[S_META], ca->mi.nbuckets - ca->mi.first_bucket,
stats.buckets[S_DIRTY], ca->mi.nbuckets - ca->mi.first_bucket,
__dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket,
c->freelist_wait.list.first ? "waiting" : "empty",
c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
@ -749,11 +761,11 @@ SHOW(bch2_dev)
sysfs_hprint(dirty_data, stats.sectors[S_DIRTY] << 9);
sysfs_print(dirty_bytes, stats.sectors[S_DIRTY] << 9);
sysfs_print(dirty_buckets, stats.buckets_dirty);
sysfs_hprint(cached_data, stats.sectors[S_CACHED] << 9);
sysfs_print(cached_bytes, stats.sectors[S_CACHED] << 9);
sysfs_print(dirty_buckets, stats.buckets[S_DIRTY]);
sysfs_hprint(cached_data, stats.sectors_cached << 9);
sysfs_print(cached_bytes, stats.sectors_cached << 9);
sysfs_print(cached_buckets, stats.buckets_cached);
sysfs_print(meta_buckets, stats.buckets_meta);
sysfs_print(meta_buckets, stats.buckets[S_META]);
sysfs_print(alloc_buckets, stats.buckets_alloc);
sysfs_print(available_buckets, dev_buckets_available(ca));
sysfs_print(free_buckets, dev_buckets_free(ca));