Update bcachefs sources to 783085c3cc44 kbuild: Allow gcov to be enabled on the command line

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2023-11-20 19:33:52 -05:00
parent e6b578917f
commit 06611a71a3
71 changed files with 1309 additions and 632 deletions

View File

@ -1 +1 @@
938f680845d1be28979e23aee972dba010c464ba
783085c3cc440183ba5e987b1aa7791cc1ca42ba

View File

@ -5,6 +5,7 @@
#include "libbcachefs/bcachefs_ioctl.h"
#include "libbcachefs/btree_cache.h"
#include "libbcachefs/move.h"
#include "cmds.h"
#include "libbcachefs.h"
@ -55,7 +56,7 @@ int cmd_data_rereplicate(int argc, char *argv[])
die("too many arguments");
return bchu_data(bcache_fs_open(fs_path), (struct bch_ioctl_data) {
.op = BCH_DATA_OP_REREPLICATE,
.op = BCH_DATA_OP_rereplicate,
.start_btree = 0,
.start_pos = POS_MIN,
.end_btree = BTREE_ID_NR,
@ -70,7 +71,7 @@ static void data_job_usage(void)
"\n"
"Kick off a data job and report progress\n"
"\n"
"job: one of scrub, rereplicate, migrate, or rewrite_old_nodes\n"
"job: one of scrub, rereplicate, migrate, rewrite_old_nodes, or drop_extra_replicas\n"
"\n"
"Options:\n"
" -b btree btree to operate on\n"
@ -81,14 +82,6 @@ static void data_job_usage(void)
exit(EXIT_SUCCESS);
}
const char * const data_jobs[] = {
"scrub",
"rereplicate",
"migrate",
"rewrite_old_nodes",
NULL
};
int cmd_data_job(int argc, char *argv[])
{
struct bch_ioctl_data op = {
@ -121,10 +114,7 @@ int cmd_data_job(int argc, char *argv[])
if (!job)
die("please specify which type of job");
op.op = read_string_list_or_die(job, data_jobs, "bad job type");
if (op.op == BCH_DATA_OP_SCRUB)
die("scrub not implemented yet");
op.op = read_string_list_or_die(job, bch2_data_ops_strs, "bad job type");
char *fs_path = arg_pop();
if (!fs_path)

View File

@ -332,7 +332,7 @@ int cmd_device_evacuate(int argc, char *argv[])
}
return bchu_data(fs, (struct bch_ioctl_data) {
.op = BCH_DATA_OP_MIGRATE,
.op = BCH_DATA_OP_migrate,
.start_btree = 0,
.start_pos = POS_MIN,
.end_btree = BTREE_ID_NR,

View File

@ -161,6 +161,13 @@ static inline i_type a_type##_read(const a_type##_t *v) \
return __ATOMIC_READ(&v->counter); \
} \
\
static inline i_type a_type##_read_acquire(const a_type##_t *v) \
{ \
i_type ret = __ATOMIC_READ(&v->counter); \
smp_mb__after_atomic(); \
return ret; \
} \
\
static inline void a_type##_set(a_type##_t *v, i_type i) \
{ \
return __ATOMIC_SET(&v->counter, i); \
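
The new a_type##_read_acquire() helper above is a plain counter read followed by a barrier. As a rough stand-alone analogue in C11 atomics (names invented for illustration; this is not the macro's actual expansion):

#include <stdatomic.h>

/* Illustrative only: a relaxed load followed by an acquire fence,
 * approximating the shim's __ATOMIC_READ() + smp_mb__after_atomic()
 * sequence above. */
static inline int example_atomic_read_acquire(atomic_int *v)
{
	int ret = atomic_load_explicit(v, memory_order_relaxed);
	atomic_thread_fence(memory_order_acquire);
	return ret;
}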

View File

@ -104,7 +104,7 @@
struct closure;
struct closure_syncer;
typedef void (closure_fn) (struct closure *);
typedef void (closure_fn) (struct work_struct *);
extern struct dentry *bcache_debug;
struct closure_waitlist {
@ -254,7 +254,7 @@ static inline void closure_queue(struct closure *cl)
INIT_WORK(&cl->work, cl->work.func);
BUG_ON(!queue_work(wq, &cl->work));
} else
cl->fn(cl);
cl->fn(&cl->work);
}
/**
@ -309,6 +309,11 @@ static inline void closure_wake_up(struct closure_waitlist *list)
__closure_wake_up(list);
}
#define CLOSURE_CALLBACK(name) void name(struct work_struct *ws)
#define closure_type(name, type, member) \
struct closure *cl = container_of(ws, struct closure, work); \
type *name = container_of(cl, type, member)
/**
* continue_at - jump to another function with barrier
*
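
The CLOSURE_CALLBACK()/closure_type() macros added above hide the container_of() dance that each converted callback in this commit (btree_node_read_all_replicas_done, btree_update_set_nodes_written, the dio completions) would otherwise open-code now that closure functions take a work_struct. A stand-alone sketch of what a converted callback boils down to, using simplified stand-in types rather than the real ones:

#include <stddef.h>

/* Simplified stand-ins, not the real closure/work_struct definitions. */
struct work_struct { void (*func)(struct work_struct *); };
struct closure     { struct work_struct work; };

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

struct my_read { struct closure cl; int ret; };

/* What CLOSURE_CALLBACK(my_read_done) plus
 * closure_type(op, struct my_read, cl) boil down to: */
static void my_read_done(struct work_struct *ws)
{
	struct closure *cl = container_of(ws, struct closure, work);
	struct my_read *op = container_of(cl, struct my_read, cl);

	op->ret = 0;	/* ...completion work on op... */
}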

View File

@ -22,10 +22,18 @@ struct shrinker {
int seeks; /* seeks to recreate an obj */
long batch; /* reclaim batch size, 0 = default */
struct list_head list;
void *private_data;
};
int register_shrinker(struct shrinker *, const char *, ...);
void unregister_shrinker(struct shrinker *);
static inline void shrinker_free(struct shrinker *s)
{
free(s);
}
struct shrinker *shrinker_alloc(unsigned int, const char *, ...);
int shrinker_register(struct shrinker *);
void shrinker_unregister(struct shrinker *);
void run_shrinkers(gfp_t gfp_mask, bool);
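
register_shrinker()/unregister_shrinker() above give way to an allocate-then-register flow, with private_data replacing the container_of() lookups in the callbacks, and shrinker_free() for teardown. A condensed usage sketch of the new declarations (my_cache and its callbacks are hypothetical stand-ins; the btree_cache.c and btree_key_cache.c hunks later in this commit follow the same shape):

struct my_cache {
	struct shrinker	*shrink;
	/* ... cached objects ... */
};

static unsigned long my_cache_count(struct shrinker *s,
				    struct shrink_control *sc)
{
	struct my_cache *mc = s->private_data;	/* was container_of(s, ...) */

	return 0;	/* ... object count for mc ... */
}

static int my_cache_shrinker_init(struct my_cache *mc, const char *name)
{
	struct shrinker *s = shrinker_alloc(0, "%s-my_cache", name);

	if (!s)
		return -ENOMEM;

	s->count_objects = my_cache_count;	/* scan_objects set likewise */
	s->seeks	 = 1;
	s->private_data	 = mc;
	shrinker_register(s);
	mc->shrink = s;
	return 0;
}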

View File

@ -847,6 +847,19 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
return ret;
}
/*
* need to know if we're getting called from the invalidate path or
* not:
*/
if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
old_a->cached_sectors) {
ret = bch2_update_cached_sectors_list(trans, new->k.p.inode,
-((s64) old_a->cached_sectors));
if (ret)
return ret;
}
return 0;
}
@ -1212,7 +1225,7 @@ fsck_err:
return ret;
}
static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
struct btree_iter *iter)
{
struct bch_fs *c = trans->c;
@ -1271,24 +1284,6 @@ delete:
goto out;
}
static int bch2_check_discard_freespace_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos end)
{
if (!btree_id_is_extents(iter->btree_id)) {
return __bch2_check_discard_freespace_key(trans, iter);
} else {
int ret = 0;
while (!bkey_eq(iter->pos, end) &&
!(ret = btree_trans_too_many_iters(trans) ?:
__bch2_check_discard_freespace_key(trans, iter)))
bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
return ret;
}
}
/*
* We've already checked that generation numbers in the bucket_gens btree are
* valid for buckets that exist; this just checks for keys for nonexistent
@ -1445,12 +1440,40 @@ bkey_err:
ret = for_each_btree_key2(trans, iter,
BTREE_ID_need_discard, POS_MIN,
BTREE_ITER_PREFETCH, k,
bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
for_each_btree_key2(trans, iter,
BTREE_ID_freespace, POS_MIN,
BTREE_ITER_PREFETCH, k,
bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
for_each_btree_key_commit(trans, iter,
bch2_check_discard_freespace_key(trans, &iter));
if (ret)
goto err;
bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
BTREE_ITER_PREFETCH);
while (1) {
bch2_trans_begin(trans);
k = bch2_btree_iter_peek(&iter);
if (!k.k)
break;
ret = bkey_err(k) ?:
bch2_check_discard_freespace_key(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
ret = 0;
continue;
}
if (ret) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
bch_err(c, "while checking %s", buf.buf);
printbuf_exit(&buf);
break;
}
bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
}
bch2_trans_iter_exit(trans, &iter);
if (ret)
goto err;
ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_PREFETCH, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
@ -1802,7 +1825,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
unsigned i;
int ret = 0;
ret = bch2_btree_write_buffer_flush(trans);
ret = bch2_btree_write_buffer_tryflush(trans);
if (ret)
goto err;

View File

@ -1297,6 +1297,30 @@ out:
return wp;
}
static noinline void
deallocate_extra_replicas(struct bch_fs *c,
struct open_buckets *ptrs,
struct open_buckets *ptrs_no_use,
unsigned extra_replicas)
{
struct open_buckets ptrs2 = { 0 };
struct open_bucket *ob;
unsigned i;
open_bucket_for_each(c, ptrs, ob, i) {
unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
if (d && d <= extra_replicas) {
extra_replicas -= d;
ob_push(c, ptrs_no_use, ob);
} else {
ob_push(c, &ptrs2, ob);
}
}
*ptrs = ptrs2;
}
/*
* Get us an open_bucket we can allocate from, return with it locked:
*/
@ -1382,6 +1406,9 @@ alloc_done:
if (ret)
goto err;
if (nr_effective > nr_replicas)
deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
/* Free buckets we didn't use: */
open_bucket_for_each(c, &wp->ptrs, ob, i)
open_bucket_free_unused(c, ob);

View File

@ -406,6 +406,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(blocked_journal_max_in_flight) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
x(blocked_write_buffer_full) \
x(nocow_lock_contended)
enum bch_time_stats {
@ -640,6 +641,8 @@ struct journal_keys {
size_t gap;
size_t nr;
size_t size;
atomic_t ref;
bool initial_ref_held;
};
struct btree_trans_buf {
@ -664,7 +667,8 @@ struct btree_trans_buf {
x(invalidate) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
x(sysfs)
x(sysfs) \
x(btree_write_buffer)
enum bch_write_ref {
#define x(n) BCH_WRITE_REF_##n,
@ -1064,6 +1068,16 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
#endif
}
static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
{
#ifdef BCH_WRITE_REF_DEBUG
return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
atomic_long_inc_not_zero(&c->writes[ref]);
#else
return percpu_ref_tryget(&c->writes);
#endif
}
static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
{
#ifdef BCH_WRITE_REF_DEBUG

View File

@ -303,6 +303,13 @@ struct bkey_i {
struct bch_val v;
};
#define POS_KEY(_pos) \
((struct bkey) { \
.u64s = BKEY_U64s, \
.format = KEY_FORMAT_CURRENT, \
.p = _pos, \
})
#define KEY(_inode, _offset, _size) \
((struct bkey) { \
.u64s = BKEY_U64s, \
@ -1436,7 +1443,7 @@ struct bch_sb_field_replicas_v0 {
struct bch_replicas_entry_v0 entries[];
} __packed __aligned(8);
struct bch_replicas_entry {
struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
@ -1448,7 +1455,7 @@ struct bch_replicas_entry {
struct bch_sb_field_replicas {
struct bch_sb_field field;
struct bch_replicas_entry entries[];
struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);
/* BCH_SB_FIELD_quota: */
@ -2124,7 +2131,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(clock, 7) \
x(dev_usage, 8) \
x(log, 9) \
x(overwrite, 10)
x(overwrite, 10) \
x(write_buffer_keys, 11)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@ -2174,7 +2182,7 @@ struct jset_entry_usage {
struct jset_entry_data_usage {
struct jset_entry entry;
__le64 v;
struct bch_replicas_entry r;
struct bch_replicas_entry_v1 r;
} __packed;
struct jset_entry_clock {

View File

@ -173,12 +173,18 @@ struct bch_ioctl_disk_set_state {
__u64 dev;
};
#define BCH_DATA_OPS() \
x(scrub, 0) \
x(rereplicate, 1) \
x(migrate, 2) \
x(rewrite_old_nodes, 3) \
x(drop_extra_replicas, 4)
enum bch_data_ops {
BCH_DATA_OP_SCRUB = 0,
BCH_DATA_OP_REREPLICATE = 1,
BCH_DATA_OP_MIGRATE = 2,
BCH_DATA_OP_REWRITE_OLD_NODES = 3,
BCH_DATA_OP_NR = 4,
#define x(t, n) BCH_DATA_OP_##t = n,
BCH_DATA_OPS()
#undef x
BCH_DATA_OP_NR
};
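
Defining the op list as an x-macro means it is written once and expanded wherever needed; the enum above is one expansion. The bch2_data_ops_strs[] table that cmd_data_job() now feeds to read_string_list_or_die() is presumably generated the same way, roughly like this sketch (not copied from the tree):

const char * const bch2_data_ops_strs[] = {
#define x(t, n)	[n] = #t,
	BCH_DATA_OPS()
#undef x
	NULL
};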
/*
@ -237,7 +243,7 @@ struct bch_ioctl_data_event {
struct bch_replicas_usage {
__u64 sectors;
struct bch_replicas_entry r;
struct bch_replicas_entry_v1 r;
} __packed;
static inline struct bch_replicas_usage *

View File

@ -318,8 +318,7 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_cache.shrink);
struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
struct btree *b, *t;
unsigned long nr = sc->nr_to_scan;
@ -420,8 +419,7 @@ out_nounlock:
static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_cache.shrink);
struct bch_fs *c = shrink->private_data;
struct btree_cache *bc = &c->btree_cache;
if (bch2_btree_shrinker_disabled)
@ -432,8 +430,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_cache.shrink);
struct bch_fs *c = shrink->private_data;
char *cbuf;
size_t buflen = seq_buf_get_buf(s, &cbuf);
struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
@ -448,7 +445,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
struct btree *b;
unsigned i, flags;
unregister_shrinker(&bc->shrink);
shrinker_free(bc->shrink);
/* vfree() can allocate memory: */
flags = memalloc_nofs_save();
@ -502,6 +499,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
int bch2_fs_btree_cache_init(struct bch_fs *c)
{
struct btree_cache *bc = &c->btree_cache;
struct shrinker *shrink;
unsigned i;
int ret = 0;
@ -521,13 +519,16 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
mutex_init(&c->verify_lock);
bc->shrink.count_objects = bch2_btree_cache_count;
bc->shrink.scan_objects = bch2_btree_cache_scan;
bc->shrink.to_text = bch2_btree_cache_shrinker_to_text;
bc->shrink.seeks = 4;
ret = register_shrinker(&bc->shrink, "%s-btree_cache", c->name);
if (ret)
shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
if (!shrink)
goto err;
bc->shrink = shrink;
shrink->count_objects = bch2_btree_cache_count;
shrink->scan_objects = bch2_btree_cache_scan;
shrink->to_text = bch2_btree_cache_shrinker_to_text;
shrink->seeks = 4;
shrink->private_data = c;
shrinker_register(shrink);
return 0;
err:

View File

@ -1287,7 +1287,7 @@ static int bch2_gc_done(struct bch_fs *c,
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
if (metadata_only &&

View File

@ -1358,10 +1358,9 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *
return offset;
}
static void btree_node_read_all_replicas_done(struct closure *cl)
static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
{
struct btree_node_read_all *ra =
container_of(cl, struct btree_node_read_all, cl);
closure_type(ra, struct btree_node_read_all, cl);
struct bch_fs *c = ra->c;
struct btree *b = ra->b;
struct printbuf buf = PRINTBUF;
@ -1567,7 +1566,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
if (sync) {
closure_sync(&ra->cl);
btree_node_read_all_replicas_done(&ra->cl);
btree_node_read_all_replicas_done(&ra->cl.work);
} else {
continue_at(&ra->cl, btree_node_read_all_replicas_done,
c->io_complete_wq);

View File

@ -1854,19 +1854,11 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos end_pos)
{
struct bkey_i *k;
if (bpos_lt(iter->path->pos, iter->journal_pos))
iter->journal_idx = 0;
k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
iter->path->level,
iter->path->pos,
end_pos,
&iter->journal_idx);
iter->journal_pos = k ? k->k.p : end_pos;
return k;
}
static noinline
@ -2874,7 +2866,8 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
trans->fn_idx = fn_idx;
trans->locking_wait.task = current;
trans->journal_replay_not_finished =
!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
atomic_inc_not_zero(&c->journal_keys.ref);
closure_init_stack(&trans->ref);
s = btree_trans_stats(trans);
@ -2991,6 +2984,9 @@ void bch2_trans_put(struct btree_trans *trans)
kfree(trans->fs_usage_deltas);
}
if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c);
if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
else

View File

@ -445,14 +445,16 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
unsigned flags,
unsigned long ip)
{
memset(iter, 0, sizeof(*iter));
iter->trans = trans;
iter->update_path = NULL;
iter->key_cache_path = NULL;
iter->btree_id = btree_id;
iter->min_depth = 0;
iter->flags = flags;
iter->snapshot = pos.snapshot;
iter->pos = pos;
iter->k.p = pos;
iter->k = POS_KEY(pos);
iter->journal_idx = 0;
#ifdef CONFIG_BCACHEFS_DEBUG
iter->ip_allocated = ip;
#endif

View File

@ -73,6 +73,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}
/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
unsigned level, struct bpos pos,
struct bpos end_pos, size_t *idx)
@ -80,16 +81,32 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree
struct journal_keys *keys = &c->journal_keys;
unsigned iters = 0;
struct journal_key *k;
BUG_ON(*idx > keys->nr);
search:
if (!*idx)
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
while (*idx &&
__journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
--(*idx);
iters++;
if (iters == 10) {
*idx = 0;
goto search;
}
}
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
return NULL;
if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
!k->overwritten)
if (k->overwritten) {
(*idx)++;
continue;
}
if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
return k->k;
(*idx)++;
@ -189,7 +206,9 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
/* Since @keys was full, there was no gap: */
memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
kvfree(keys->d);
*keys = new_keys;
keys->d = new_keys.d;
keys->nr = new_keys.nr;
keys->size = new_keys.size;
/* And now the gap is at the end: */
keys->gap = keys->nr;
@ -415,10 +434,16 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
cmp_int(l->journal_offset, r->journal_offset);
}
void bch2_journal_keys_free(struct journal_keys *keys)
void bch2_journal_keys_put(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
struct journal_key *i;
BUG_ON(atomic_read(&keys->ref) <= 0);
if (!atomic_dec_and_test(&keys->ref))
return;
move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
keys->gap = keys->nr;
@ -429,6 +454,8 @@ void bch2_journal_keys_free(struct journal_keys *keys)
kvfree(keys->d);
keys->d = NULL;
keys->nr = keys->gap = keys->size = 0;
bch2_journal_entries_free(c);
}
static void __journal_keys_sort(struct journal_keys *keys)

View File

@ -49,7 +49,15 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *,
struct btree *);
void bch2_journal_keys_free(struct journal_keys *);
void bch2_journal_keys_put(struct bch_fs *);
static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
{
if (c->journal_keys.initial_ref_held)
bch2_journal_keys_put(c);
c->journal_keys.initial_ref_held = false;
}
void bch2_journal_entries_free(struct bch_fs *);
int bch2_journal_keys_sort(struct bch_fs *);

View File

@ -646,11 +646,19 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
if (journal_seq && ck->journal.seq != journal_seq)
goto out;
trans->journal_res.seq = ck->journal.seq;
/*
* Since journal reclaim depends on us making progress here, and the
* allocator/copygc depend on journal reclaim making progress, we need
* to be using alloc reserves:
* If we're at the end of the journal, we really want to free up space
* in the journal right away - we don't want to pin that old journal
* sequence number with a new btree node write, we want to re-journal
* the update
*/
if (ck->journal.seq == journal_last_seq(j))
commit_flags |= BCH_WATERMARK_reclaim;
else
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
ret = bch2_btree_iter_traverse(&b_iter) ?:
bch2_trans_update(trans, &b_iter, ck->k,
BTREE_UPDATE_KEY_CACHE_RECLAIM|
@ -659,9 +667,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
(ck->journal.seq == journal_last_seq(j)
? BCH_WATERMARK_reclaim
: 0)|
commit_flags);
bch2_fs_fatal_err_on(ret &&
@ -830,8 +835,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_key_cache.shrink);
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
struct bucket_table *tbl;
struct bkey_cached *ck, *t;
@ -932,8 +936,7 @@ out:
static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct bch_fs *c = container_of(shrink, struct bch_fs,
btree_key_cache.shrink);
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
long nr = atomic_long_read(&bc->nr_keys) -
atomic_long_read(&bc->nr_dirty);
@ -953,7 +956,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
int cpu;
#endif
unregister_shrinker(&bc->shrink);
shrinker_free(bc->shrink);
mutex_lock(&bc->lock);
@ -1028,8 +1031,8 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
{
struct btree_key_cache *bc =
container_of(shrink, struct btree_key_cache, shrink);
struct bch_fs *c = shrink->private_data;
struct btree_key_cache *bc = &c->btree_key_cache;
char *cbuf;
size_t buflen = seq_buf_get_buf(s, &cbuf);
struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
@ -1041,6 +1044,7 @@ static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shri
int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
{
struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
struct shrinker *shrink;
#ifdef __KERNEL__
bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
@ -1053,12 +1057,16 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
bc->table_init_done = true;
bc->shrink.seeks = 0;
bc->shrink.count_objects = bch2_btree_key_cache_count;
bc->shrink.scan_objects = bch2_btree_key_cache_scan;
bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text;
if (register_shrinker(&bc->shrink, "%s-btree_key_cache", c->name))
shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
if (!shrink)
return -BCH_ERR_ENOMEM_fs_btree_cache_init;
bc->shrink = shrink;
shrink->seeks = 0;
shrink->count_objects = bch2_btree_key_cache_count;
shrink->scan_objects = bch2_btree_key_cache_scan;
shrink->to_text = bch2_btree_key_cache_shrinker_to_text;
shrink->private_data = c;
shrinker_register(shrink);
return 0;
}

View File

@ -17,7 +17,7 @@ struct btree_key_cache {
struct list_head freed_nonpcpu;
size_t nr_freed_nonpcpu;
struct shrinker shrink;
struct shrinker *shrink;
unsigned shrink_iter;
struct btree_key_cache_freelist __percpu *pcpu_freed;

View File

@ -660,10 +660,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
i->k->k.needs_whiteout = false;
}
if (trans->nr_wb_updates &&
trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
return -BCH_ERR_btree_insert_need_flush_buffer;
/*
* Don't get journal reservation until after we know insert will
* succeed:
@ -698,14 +694,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
return -BCH_ERR_btree_insert_need_mark_replicas;
if (trans->nr_wb_updates) {
EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res);
ret = bch2_btree_insert_keys_write_buffer(trans);
if (ret)
goto revert_fs_usage;
}
h = trans->hooks;
while (h) {
ret = h->fn(trans, h);
@ -767,7 +755,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
trans_for_each_wb_update(trans, wb) {
entry = bch2_journal_add_entry(j, &trans->journal_res,
BCH_JSET_ENTRY_btree_keys,
BCH_JSET_ENTRY_write_buffer_keys,
wb->btree, 0,
wb->k.k.u64s);
bkey_copy((struct bkey_i *) entry->start, &wb->k);
@ -951,30 +939,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
ret = bch2_trans_relock(trans);
break;
case -BCH_ERR_btree_insert_need_flush_buffer: {
struct btree_write_buffer *wb = &c->btree_write_buffer;
ret = 0;
if (wb->state.nr > wb->size * 3 / 4) {
bch2_trans_unlock(trans);
mutex_lock(&wb->flush_lock);
if (wb->state.nr > wb->size * 3 / 4) {
bch2_trans_begin(trans);
ret = __bch2_btree_write_buffer_flush(trans,
flags|BCH_TRANS_COMMIT_no_check_rw, true);
if (!ret) {
trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
}
} else {
mutex_unlock(&wb->flush_lock);
ret = bch2_trans_relock(trans);
}
}
break;
}
default:
BUG_ON(ret >= 0);
break;
@ -1073,20 +1037,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
goto out_reset;
}
if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
mutex_trylock(&c->btree_write_buffer.flush_lock)) {
bch2_trans_begin(trans);
bch2_trans_unlock(trans);
ret = __bch2_btree_write_buffer_flush(trans,
flags|BCH_TRANS_COMMIT_no_check_rw, true);
if (!ret) {
trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
}
goto out;
}
EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
trans->journal_u64s = trans->extra_journal_entries.nr;

View File

@ -173,7 +173,7 @@ struct btree_cache {
unsigned not_freed_will_make_reachable;
unsigned not_freed_access_bit;
atomic_t dirty;
struct shrinker shrink;
struct shrinker *shrink;
/*
* If we need to allocate memory for a new btree node and that
@ -297,8 +297,7 @@ struct btree_iter {
struct btree_path *key_cache_path;
enum btree_id btree_id:8;
unsigned min_depth:3;
unsigned advanced:1;
u8 min_depth;
/* btree_iter_copy starts here: */
u16 flags;
@ -315,7 +314,6 @@ struct btree_iter {
/* BTREE_ITER_WITH_JOURNAL: */
size_t journal_idx;
struct bpos journal_pos;
#ifdef TRACK_PATH_ALLOCATED
unsigned long ip_allocated;
#endif

View File

@ -774,9 +774,9 @@ static void btree_interior_update_work(struct work_struct *work)
}
}
static void btree_update_set_nodes_written(struct closure *cl)
static CLOSURE_CALLBACK(btree_update_set_nodes_written)
{
struct btree_update *as = container_of(cl, struct btree_update, cl);
closure_type(as, struct btree_update, cl);
struct bch_fs *c = as->c;
mutex_lock(&c->btree_interior_update_lock);

View File

@ -7,43 +7,132 @@
#include "btree_write_buffer.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include <linux/sort.h>
#include <linux/prefetch.h>
static int bch2_btree_write_buffer_journal_flush(struct journal *,
struct journal_entry_pin *, u64);
static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
{
const struct btree_write_buffered_key *l = _l;
const struct btree_write_buffered_key *r = _r;
static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);
return cmp_int(l->btree, r->btree) ?:
bpos_cmp(l->k.k.p, r->k.k.p) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
static inline bool __wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
{
return (cmp_int(l->hi, r->hi) ?:
cmp_int(l->mi, r->mi) ?:
cmp_int(l->lo, r->lo)) >= 0;
}
static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
static inline bool wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
{
const struct btree_write_buffered_key *l = _l;
const struct btree_write_buffered_key *r = _r;
#ifdef CONFIG_X86_64
int cmp;
return cmp_int(l->journal_seq, r->journal_seq);
asm(".intel_syntax noprefix;"
"mov rax, [%[l]];"
"sub rax, [%[r]];"
"mov rax, [%[l] + 8];"
"sbb rax, [%[r] + 8];"
"mov rax, [%[l] + 16];"
"sbb rax, [%[r] + 16];"
".att_syntax prefix;"
: "=@ccae" (cmp)
: [l] "r" (l), [r] "r" (r)
: "rax", "cc");
EBUG_ON(cmp != __wb_key_cmp(l, r));
return cmp;
#else
return __wb_key_cmp(l, r);
#endif
}
static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
/* Compare excluding idx, the low 24 bits: */
static inline bool wb_key_eq(const void *_l, const void *_r)
{
const struct wb_key_ref *l = _l;
const struct wb_key_ref *r = _r;
return !((l->hi ^ r->hi)|
(l->mi ^ r->mi)|
((l->lo >> 24) ^ (r->lo >> 24)));
}
static noinline void wb_sort(struct wb_key_ref *base, size_t num)
{
size_t n = num, a = num / 2;
if (!a) /* num < 2 || size == 0 */
return;
for (;;) {
size_t b, c, d;
if (a) /* Building heap: sift down --a */
--a;
else if (--n) /* Sorting: Extract root to --n */
swap(base[0], base[n]);
else /* Sort complete */
break;
/*
* Sift element at "a" down into heap. This is the
* "bottom-up" variant, which significantly reduces
* calls to cmp_func(): we find the sift-down path all
* the way to the leaves (one compare per level), then
* backtrack to find where to insert the target element.
*
* Because elements tend to sift down close to the leaves,
* this uses fewer compares than doing two per level
* on the way down. (A bit more than half as many on
* average, 3/4 worst-case.)
*/
for (b = a; c = 2*b + 1, (d = c + 1) < n;)
b = wb_key_cmp(base + c, base + d) ? c : d;
if (d == n) /* Special case last leaf with no sibling */
b = c;
/* Now backtrack from "b" to the correct location for "a" */
while (b != a && wb_key_cmp(base + a, base + b))
b = (b - 1) / 2;
c = b; /* Where "a" belongs */
while (b != a) { /* Shift it into place */
b = (b - 1) / 2;
swap(base[b], base[c]);
}
}
}
static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
struct btree_iter *iter,
struct btree_write_buffered_key *wb)
{
bch2_btree_node_unlock_write(trans, iter->path, iter->path->l[0].b);
trans->journal_res.seq = wb->journal_seq;
return bch2_trans_update(trans, iter, &wb->k,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_journal_res|
BCH_TRANS_COMMIT_journal_reclaim);
}
static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
struct btree_write_buffered_key *wb,
unsigned commit_flags,
bool *write_locked,
size_t *fast)
bool *write_locked, size_t *fast)
{
struct bch_fs *c = trans->c;
struct btree_path *path;
int ret;
EBUG_ON(!wb->journal_seq);
EBUG_ON(!c->btree_write_buffer.flushing.pin.seq);
EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
@ -66,46 +155,14 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
*write_locked = true;
}
if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
bch2_btree_node_unlock_write(trans, path, path->l[0].b);
if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) {
*write_locked = false;
goto trans_commit;
return wb_flush_one_slowpath(trans, iter, wb);
}
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++;
return 0;
trans_commit:
trans->journal_res.seq = wb->journal_seq;
return bch2_trans_update(trans, iter, &wb->k,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, NULL, NULL,
commit_flags|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_journal_res|
BCH_TRANS_COMMIT_journal_reclaim);
}
static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
{
union btree_write_buffer_state old, new;
u64 v = READ_ONCE(wb->state.v);
do {
old.v = new.v = v;
new.nr = 0;
new.idx++;
} while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
cpu_relax();
smp_mb();
return old;
}
/*
@ -137,35 +194,79 @@ btree_write_buffered_insert(struct btree_trans *trans,
return ret;
}
int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
bool locked)
static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
{
struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
struct journal *j = &c->journal;
if (!wb->inc.keys.nr)
return;
bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
bch2_btree_write_buffer_journal_flush);
darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
darray_resize(&wb->sorted, wb->flushing.keys.size);
if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
swap(wb->flushing.keys, wb->inc.keys);
goto out;
}
size_t nr = min(darray_room(wb->flushing.keys),
wb->sorted.size - wb->flushing.keys.nr);
nr = min(nr, wb->inc.keys.nr);
memcpy(&darray_top(wb->flushing.keys),
wb->inc.keys.data,
sizeof(wb->inc.keys.data[0]) * nr);
memmove(wb->inc.keys.data,
wb->inc.keys.data + nr,
sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
wb->flushing.keys.nr += nr;
wb->inc.keys.nr -= nr;
out:
if (!wb->inc.keys.nr)
bch2_journal_pin_drop(j, &wb->inc.pin);
else
bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
bch2_btree_write_buffer_journal_flush);
if (j->watermark) {
spin_lock(&j->lock);
bch2_journal_set_watermark(j);
spin_unlock(&j->lock);
}
BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
}
static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct journal_entry_pin pin;
struct btree_write_buffered_key *i, *keys;
struct wb_key_ref *i;
struct btree_iter iter = { NULL };
size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
size_t skipped = 0, fast = 0, slowpath = 0;
bool write_locked = false;
union btree_write_buffer_state s;
int ret = 0;
memset(&pin, 0, sizeof(pin));
bch2_trans_unlock(trans);
bch2_trans_begin(trans);
if (!locked && !mutex_trylock(&wb->flush_lock))
return 0;
mutex_lock(&wb->inc.lock);
move_keys_from_inc_to_flushing(wb);
mutex_unlock(&wb->inc.lock);
bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
bch2_btree_write_buffer_journal_flush);
bch2_journal_pin_drop(j, &wb->journal_pin);
s = btree_write_buffer_switch(wb);
keys = wb->keys[s.idx];
nr = s.nr;
if (race_fault())
goto slowpath;
for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
wb->sorted.data[i].idx = i;
wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
}
wb->sorted.nr = wb->flushing.keys.nr;
/*
* We first sort so that we can detect and skip redundant updates, and
@ -181,110 +282,178 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
* If that happens, simply skip the key so we can optimistically insert
* as many keys as possible in the fast path.
*/
sort(keys, nr, sizeof(keys[0]),
btree_write_buffered_key_cmp, NULL);
wb_sort(wb->sorted.data, wb->sorted.nr);
darray_for_each(wb->sorted, i) {
struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
prefetch(&wb->flushing.keys.data[n->idx]);
BUG_ON(!k->journal_seq);
if (i + 1 < &darray_top(wb->sorted) &&
wb_key_eq(i, i + 1)) {
struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
for (i = keys; i < keys + nr; i++) {
if (i + 1 < keys + nr &&
i[0].btree == i[1].btree &&
bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
skipped++;
i->journal_seq = 0;
n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
k->journal_seq = 0;
continue;
}
if (write_locked &&
(iter.path->btree_id != i->btree ||
bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
(iter.path->btree_id != k->btree ||
bpos_gt(k->k.k.p, iter.path->l[0].b->key.k.p))) {
bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
write_locked = false;
}
if (!iter.path || iter.path->btree_id != i->btree) {
if (!iter.path || iter.path->btree_id != k->btree) {
bch2_trans_iter_exit(trans, &iter);
bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
}
bch2_btree_iter_set_pos(&iter, i->k.k.p);
bch2_btree_iter_set_pos(&iter, k->k.k.p);
iter.path->preserve = false;
do {
ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
commit_flags, &write_locked, &fast);
if (race_fault()) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
}
ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
if (!write_locked)
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
if (!ret) {
k->journal_seq = 0;
} else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
slowpath++;
continue;
}
if (ret)
ret = 0;
} else
break;
i->journal_seq = 0;
}
if (write_locked)
bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
bch2_trans_iter_exit(trans, &iter);
trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
if (slowpath)
goto slowpath;
bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
out:
bch2_journal_pin_drop(j, &pin);
mutex_unlock(&wb->flush_lock);
return ret;
slowpath:
trace_write_buffer_flush_slowpath(trans, i - keys, nr);
if (ret)
goto err;
if (slowpath) {
/*
* Now sort the rest by journal seq and bump the journal pin as we go.
* The slowpath zapped the seq of keys that were successfully flushed so
* Flush in the order they were present in the journal, so that
* we can release journal pins:
* The fastpath zapped the seq of keys that were successfully flushed so
* we can skip those here.
*/
sort(keys, nr, sizeof(keys[0]),
btree_write_buffered_journal_cmp,
NULL);
trace_write_buffer_flush_slowpath(trans, slowpath, wb->flushing.keys.nr);
commit_flags &= ~BCH_WATERMARK_MASK;
commit_flags |= BCH_WATERMARK_reclaim;
for (i = keys; i < keys + nr; i++) {
struct btree_write_buffered_key *i;
darray_for_each(wb->flushing.keys, i) {
if (!i->journal_seq)
continue;
bch2_journal_pin_update(j, i->journal_seq, &pin,
bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
bch2_btree_write_buffer_journal_flush);
bch2_trans_begin(trans);
ret = commit_do(trans, NULL, NULL,
commit_flags|
BCH_WATERMARK_reclaim|
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_no_journal_res|
BCH_TRANS_COMMIT_journal_reclaim,
btree_write_buffered_insert(trans, i));
if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
if (ret)
goto err;
}
}
err:
bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
bch2_journal_pin_drop(j, &wb->flushing.pin);
wb->flushing.keys.nr = 0;
return ret;
}
static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
{
struct journal *j = &c->journal;
struct journal_buf *buf;
int ret = 0;
mutex_lock(&j->buf_lock);
while ((buf = bch2_next_write_buffer_flush_journal_buf(j, seq)))
if (bch2_journal_keys_to_write_buffer(c, buf)) {
ret = -ENOMEM;
break;
}
mutex_unlock(&j->buf_lock);
goto out;
return ret;
}
int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
int ret = 0, fetch_from_journal_err;
trace_write_buffer_flush_sync(trans, _RET_IP_);
retry:
bch2_trans_unlock(trans);
mutex_lock(&trans->c->btree_write_buffer.flush_lock);
return __bch2_btree_write_buffer_flush(trans, 0, true);
bch2_journal_block_reservations(&c->journal);
fetch_from_journal_err = fetch_wb_keys_from_journal(c, U64_MAX);
bch2_journal_unblock(&c->journal);
/*
* On memory allocation failure, bch2_btree_write_buffer_flush_locked()
* is not guaranteed to empty wb->inc:
*/
mutex_lock(&wb->flushing.lock);
while (!ret &&
(wb->flushing.keys.nr || wb->inc.keys.nr))
ret = bch2_btree_write_buffer_flush_locked(trans);
mutex_unlock(&wb->flushing.lock);
if (!ret && fetch_from_journal_err)
goto retry;
return ret;
}
int bch2_btree_write_buffer_flush(struct btree_trans *trans)
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
{
return __bch2_btree_write_buffer_flush(trans, 0, false);
struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
int ret = 0;
if (mutex_trylock(&wb->flushing.lock)) {
ret = bch2_btree_write_buffer_flush_locked(trans);
mutex_unlock(&wb->flushing.lock);
}
return ret;
}
int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
return -BCH_ERR_erofs_no_writes;
int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
return ret;
}
static int bch2_btree_write_buffer_journal_flush(struct journal *j,
@ -292,84 +461,195 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct btree_write_buffer *wb = &c->btree_write_buffer;
int ret, fetch_from_journal_err;
mutex_lock(&wb->flush_lock);
do {
fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
return bch2_trans_run(c,
__bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
mutex_lock(&wb->flushing.lock);
ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
mutex_unlock(&wb->flushing.lock);
} while (!ret &&
(fetch_from_journal_err ||
(wb->flushing.pin.seq && wb->flushing.pin.seq <= seq) ||
(wb->inc.pin.seq && wb->inc.pin.seq <= seq)));
return ret;
}
static inline u64 btree_write_buffer_ref(int idx)
static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
{
return ((union btree_write_buffer_state) {
.ref0 = idx == 0,
.ref1 = idx == 1,
}).v;
}
int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
struct btree_write_buffer *wb = &c->btree_write_buffer;
struct btree_write_buffered_key *i;
union btree_write_buffer_state old, new;
int ret = 0;
u64 v;
int ret;
trans_for_each_wb_update(trans, i) {
EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
mutex_lock(&wb->flushing.lock);
do {
ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
} while (!ret && bch2_btree_write_buffer_should_flush(c));
mutex_unlock(&wb->flushing.lock);
i->journal_seq = trans->journal_res.seq;
i->journal_offset = trans->journal_res.offset;
bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
}
int __bch2_journal_key_to_wb(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
int ret;
retry:
ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
if (!ret && dst->wb == &wb->flushing)
ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
if (unlikely(ret)) {
if (dst->wb == &c->btree_write_buffer.flushing) {
mutex_unlock(&dst->wb->lock);
dst->wb = &c->btree_write_buffer.inc;
bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
bch2_btree_write_buffer_journal_flush);
goto retry;
}
preempt_disable();
v = READ_ONCE(wb->state.v);
do {
old.v = new.v = v;
return ret;
}
new.v += btree_write_buffer_ref(new.idx);
new.nr += trans->nr_wb_updates;
if (new.nr > wb->size) {
ret = -BCH_ERR_btree_insert_need_flush_buffer;
dst->room = darray_room(dst->wb->keys);
if (dst->wb == &wb->flushing)
dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
BUG_ON(!dst->room);
BUG_ON(!dst->seq);
struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
wb_k->journal_seq = dst->seq;
wb_k->btree = btree;
bkey_copy(&wb_k->k, k);
dst->wb->keys.nr++;
dst->room--;
return 0;
}
void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
if (mutex_trylock(&wb->flushing.lock)) {
mutex_lock(&wb->inc.lock);
move_keys_from_inc_to_flushing(wb);
/*
* Attempt to skip wb->inc, and add keys directly to
* wb->flushing, saving us a copy later:
*/
if (!wb->inc.keys.nr) {
dst->wb = &wb->flushing;
} else {
mutex_unlock(&wb->flushing.lock);
dst->wb = &wb->inc;
}
} else {
mutex_lock(&wb->inc.lock);
dst->wb = &wb->inc;
}
dst->room = darray_room(dst->wb->keys);
if (dst->wb == &wb->flushing)
dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
dst->seq = seq;
bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
bch2_btree_write_buffer_journal_flush);
}
void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
if (!dst->wb->keys.nr)
bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
if (bch2_btree_write_buffer_should_flush(c) &&
__bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
!queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
if (dst->wb == &wb->flushing)
mutex_unlock(&wb->flushing.lock);
mutex_unlock(&wb->inc.lock);
}
static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
{
struct journal_keys_to_wb dst;
struct jset_entry *entry;
struct bkey_i *k;
int ret = 0;
bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
jset_entry_for_each_key(entry, k) {
ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
if (ret)
goto out;
}
} while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
memcpy(wb->keys[new.idx] + old.nr,
trans->wb_updates,
sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
entry->type = BCH_JSET_ENTRY_btree_keys;
}
bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
bch2_btree_write_buffer_journal_flush);
atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
buf->need_flush_to_write_buffer = false;
out:
preempt_enable();
bch2_journal_keys_to_write_buffer_end(c, &dst);
return ret;
}
static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
{
if (wb->keys.size >= new_size)
return 0;
if (!mutex_trylock(&wb->lock))
return -EINTR;
int ret = darray_resize(&wb->keys, new_size);
mutex_unlock(&wb->lock);
return ret;
}
int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
return wb_keys_resize(&wb->flushing, new_size) ?:
wb_keys_resize(&wb->inc, new_size);
}
void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
!bch2_journal_error(&c->journal));
kvfree(wb->keys[1]);
kvfree(wb->keys[0]);
darray_exit(&wb->sorted);
darray_exit(&wb->flushing.keys);
darray_exit(&wb->inc.keys);
}
int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
mutex_init(&wb->flush_lock);
wb->size = c->opts.btree_write_buffer_size;
mutex_init(&wb->inc.lock);
mutex_init(&wb->flushing.lock);
INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
if (!wb->keys[0] || !wb->keys[1])
return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
/* Will be resized by journal as needed: */
unsigned initial_size = 1 << 16;
return 0;
return darray_make_room(&wb->inc.keys, initial_size) ?:
darray_make_room(&wb->flushing.keys, initial_size) ?:
darray_make_room(&wb->sorted, initial_size);
}

View File

@ -2,12 +2,59 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_H
int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
#include "bkey.h"
static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
}
static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
}
struct btree_trans;
int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
int bch2_btree_write_buffer_flush(struct btree_trans *);
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
int bch2_btree_write_buffer_tryflush(struct btree_trans *);
int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
struct journal_keys_to_wb {
struct btree_write_buffer_keys *wb;
size_t room;
u64 seq;
};
int __bch2_journal_key_to_wb(struct bch_fs *,
struct journal_keys_to_wb *,
enum btree_id, struct bkey_i *);
static inline int bch2_journal_key_to_wb(struct bch_fs *c,
struct journal_keys_to_wb *dst,
enum btree_id btree, struct bkey_i *k)
{
EBUG_ON(!dst->seq);
if (unlikely(!dst->room))
return __bch2_journal_key_to_wb(c, dst, btree, k);
struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
wb_k->journal_seq = dst->seq;
wb_k->btree = btree;
bkey_copy(&wb_k->k, k);
dst->wb->keys.nr++;
dst->room--;
return 0;
}
void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
int bch2_fs_btree_write_buffer_init(struct bch_fs *);

View File

@ -2,43 +2,56 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#include "darray.h"
#include "journal_types.h"
#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
struct wb_key_ref {
union {
struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
unsigned idx:24;
u8 pos[sizeof(struct bpos)];
enum btree_id btree:8;
#else
enum btree_id btree:8;
u8 pos[sizeof(struct bpos)];
unsigned idx:24;
#endif
} __packed;
struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
u64 lo;
u64 mi;
u64 hi;
#else
u64 hi;
u64 mi;
u64 lo;
#endif
};
};
};
struct btree_write_buffered_key {
u64 journal_seq;
unsigned journal_offset;
enum btree_id btree;
enum btree_id btree:8;
u64 journal_seq:56;
__BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
};
union btree_write_buffer_state {
struct {
atomic64_t counter;
};
struct {
u64 v;
};
struct {
u64 nr:23;
u64 idx:1;
u64 ref0:20;
u64 ref1:20;
};
struct btree_write_buffer_keys {
DARRAY(struct btree_write_buffered_key) keys;
struct journal_entry_pin pin;
struct mutex lock;
};
struct btree_write_buffer {
struct mutex flush_lock;
struct journal_entry_pin journal_pin;
union btree_write_buffer_state state;
size_t size;
struct btree_write_buffered_key *keys[2];
DARRAY(struct wb_key_ref) sorted;
struct btree_write_buffer_keys inc;
struct btree_write_buffer_keys flushing;
struct work_struct flush_work;
};
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
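
The inc/flushing split above is the heart of the rewrite: write-buffer keys now travel through the journal (BCH_JSET_ENTRY_write_buffer_keys), get slurped into wb->inc, and the flusher moves inc into flushing (and the sorted index) before draining it, so filling the buffer does not have to wait for a flush in progress. A stand-alone analogue of that two-stage shape, using POSIX threads and plain fixed arrays; the journal pins, sorting, and resizing are deliberately not modelled:

#include <pthread.h>
#include <string.h>

#define CAP 1024

struct stage {
	pthread_mutex_t	lock;		/* assume pthread_mutex_init() done */
	int		keys[CAP];
	size_t		nr;
};

struct two_stage_buf {
	struct stage	inc;		/* producers append here */
	struct stage	flushing;	/* only the flusher touches this */
};

static void buf_add(struct two_stage_buf *b, int key)
{
	pthread_mutex_lock(&b->inc.lock);
	if (b->inc.nr < CAP)
		b->inc.keys[b->inc.nr++] = key;
	pthread_mutex_unlock(&b->inc.lock);
}

static void buf_flush(struct two_stage_buf *b, void (*flush_one)(int))
{
	pthread_mutex_lock(&b->flushing.lock);

	/* move_keys_from_inc_to_flushing() equivalent: */
	pthread_mutex_lock(&b->inc.lock);
	memcpy(b->flushing.keys, b->inc.keys,
	       b->inc.nr * sizeof(b->inc.keys[0]));
	b->flushing.nr = b->inc.nr;
	b->inc.nr = 0;
	pthread_mutex_unlock(&b->inc.lock);

	/* drain flushing while producers keep filling inc: */
	for (size_t i = 0; i < b->flushing.nr; i++)
		flush_one(b->flushing.keys[i]);
	b->flushing.nr = 0;

	pthread_mutex_unlock(&b->flushing.lock);
}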

View File

@ -61,7 +61,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
usage->reserved += usage->persistent_reserved[i];
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
@ -214,7 +214,7 @@ void bch2_fs_usage_to_text(struct printbuf *out,
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
prt_printf(out, "\t");
@ -345,7 +345,7 @@ static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
static inline int __update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct bch_replicas_entry *r,
struct bch_replicas_entry_v1 *r,
s64 sectors)
{
int idx = bch2_replicas_entry_idx(c, r);
@ -359,7 +359,7 @@ static inline int __update_replicas(struct bch_fs *c,
}
static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
struct bch_replicas_entry *r, s64 sectors,
struct bch_replicas_entry_v1 *r, s64 sectors,
unsigned journal_seq, bool gc)
{
struct bch_fs_usage *fs_usage;
@ -453,8 +453,8 @@ int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
__replicas_deltas_realloc(trans, more, _gfp));
}
static inline int update_replicas_list(struct btree_trans *trans,
struct bch_replicas_entry *r,
int bch2_update_replicas_list(struct btree_trans *trans,
struct bch_replicas_entry_v1 *r,
s64 sectors)
{
struct replicas_delta_list *d;
@ -481,14 +481,13 @@ static inline int update_replicas_list(struct btree_trans *trans,
return 0;
}
static inline int update_cached_sectors_list(struct btree_trans *trans,
unsigned dev, s64 sectors)
int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
{
struct bch_replicas_padded r;
bch2_replicas_entry_cached(&r.e, dev);
return update_replicas_list(trans, &r.e, sectors);
return bch2_update_replicas_list(trans, &r.e, sectors);
}
int bch2_mark_alloc(struct btree_trans *trans,
@ -580,23 +579,6 @@ int bch2_mark_alloc(struct btree_trans *trans,
}
percpu_up_read(&c->mark_lock);
/*
* need to know if we're getting called from the invalidate path or
* not:
*/
if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
old_a->cached_sectors) {
ret = update_cached_sectors(c, new, ca->dev_idx,
-((s64) old_a->cached_sectors),
journal_seq, gc);
if (ret) {
bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
__func__);
return ret;
}
}
if (new_a->data_type == BCH_DATA_free &&
(!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
closure_wake_up(&c->freelist_wait);
@ -1470,7 +1452,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
r.e.data_type = data_type;
ret = update_replicas_list(trans, &r.e, sectors);
ret = bch2_update_replicas_list(trans, &r.e, sectors);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@ -1513,7 +1495,7 @@ static int __trans_mark_extent(struct btree_trans *trans,
if (p.ptr.cached) {
if (!stale) {
ret = update_cached_sectors_list(trans, p.ptr.dev,
ret = bch2_update_cached_sectors_list(trans, p.ptr.dev,
disk_sectors);
if (ret)
return ret;
@ -1532,7 +1514,7 @@ static int __trans_mark_extent(struct btree_trans *trans,
}
if (r.e.nr_devs)
ret = update_replicas_list(trans, &r.e, dirty_sectors);
ret = bch2_update_replicas_list(trans, &r.e, dirty_sectors);
return ret;
}
@ -1669,7 +1651,7 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
s64 sectors = le16_to_cpu(new_s->sectors);
bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
if (ret)
return ret;
}
@ -1678,7 +1660,7 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
bch2_bkey_to_replicas(&r.e, old);
ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
if (ret)
return ret;
}

View File

@ -315,6 +315,9 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
: c->usage[journal_seq & JOURNAL_BUF_MASK]);
}
int bch2_update_replicas_list(struct btree_trans *,
struct bch_replicas_entry_v1 *, s64);
int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
void bch2_fs_usage_initialize(struct bch_fs *);

View File

@ -444,7 +444,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
dst_end = (void *) arg->replicas + replica_entries_bytes;
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *src_e =
struct bch_replicas_entry_v1 *src_e =
cpu_replicas_entry(&c->replicas, i);
/* check that we have enough space for one replicas entry */

View File

@ -95,7 +95,6 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
unsigned long io_until,
unsigned long cpu_timeout)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct io_clock_wait wait;
wait.io_timer.expire = io_until;
@ -111,7 +110,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread && kthread_should_stop())
if (kthread_should_stop())
break;
if (wait.expired)

View File

@ -9,10 +9,12 @@ int __bch2_darray_resize(darray_void *d, size_t element_size, size_t new_size, g
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
void *data = krealloc_array(d->data, new_size, element_size, gfp);
void *data = kvmalloc_array(new_size, element_size, gfp);
if (!data)
return -ENOMEM;
memcpy(data, d->data, d->size * element_size);
kvfree(d->data);
d->data = data;
d->size = new_size;
}

View File

@ -92,7 +92,7 @@ do { \
#define darray_exit(_d) \
do { \
kfree((_d)->data); \
kvfree((_d)->data); \
darray_init(_d); \
} while (0)

View File

@ -1005,7 +1005,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
int ret = 0;
ret = bch2_btree_write_buffer_flush(trans);
ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;

View File

@ -5,7 +5,7 @@
#include "bcachefs_format.h"
struct bch_replicas_padded {
struct bch_replicas_entry e;
struct bch_replicas_entry_v1 e;
u8 pad[BCH_BKEY_PTRS_MAX];
};

View File

@ -150,7 +150,6 @@
x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \
x(0, backpointer_to_overwritten_btree_node) \
x(0, lock_fail_root_changed) \
x(0, journal_reclaim_would_deadlock) \

View File

@ -35,9 +35,9 @@ static void bio_check_or_release(struct bio *bio, bool check_dirty)
}
}
static void bch2_dio_read_complete(struct closure *cl)
static CLOSURE_CALLBACK(bch2_dio_read_complete)
{
struct dio_read *dio = container_of(cl, struct dio_read, cl);
closure_type(dio, struct dio_read, cl);
dio->req->ki_complete(dio->req, dio->ret);
bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
@ -325,9 +325,9 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
return 0;
}
static void bch2_dio_write_flush_done(struct closure *cl)
static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
{
struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
closure_type(dio, struct dio_write, op.cl);
struct bch_fs *c = dio->op.c;
closure_debug_destroy(cl);
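
/* Aside, not part of this commit: a minimal sketch of the callback conversion
 * pattern used throughout this update, with a made-up `my_op` type. Assumes
 * the updated <linux/closure.h> that provides CLOSURE_CALLBACK() and
 * closure_type(); everything else here is illustrative.
 */
struct my_op {
	struct closure	cl;
	int		result;
};

/* Old style: the callback received the closure directly. */
static void my_op_done_old(struct closure *cl)
{
	struct my_op *op = container_of(cl, struct my_op, cl);

	op->result = 0;
}

/* New style: CLOSURE_CALLBACK() declares a work_struct-based prototype, and
 * closure_type() recovers both the closure (as 'cl') and the parent object. */
static CLOSURE_CALLBACK(my_op_done_new)
{
	closure_type(op, struct my_op, cl);

	op->result = 0;
}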

View File

@ -861,7 +861,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
abs(pos_src - pos_dst) < len)
return -EINVAL;
bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
lock_two_nondirectories(&src->v, &dst->v);
bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
inode_dio_wait(&src->v);
inode_dio_wait(&dst->v);
@ -914,7 +915,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
ret = bch2_flush_inode(c, dst);
err:
bch2_quota_reservation_put(c, dst, &quota_res);
bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
unlock_two_nondirectories(&src->v, &dst->v);
return bch2_err_class(ret);
}

View File

@ -453,35 +453,33 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
struct bch_ioctl_subvolume arg)
{
struct filename *name;
struct path path;
struct inode *dir;
struct dentry *victim;
int ret = 0;
if (arg.flags)
return -EINVAL;
name = getname((const char __user *)(unsigned long)arg.dst_ptr);
victim = filename_path_locked(arg.dirfd, name, &path);
putname(name);
if (IS_ERR(victim))
return PTR_ERR(victim);
ret = user_path_at(arg.dirfd,
(const char __user *)(unsigned long)arg.dst_ptr,
LOOKUP_FOLLOW, &path);
if (ret)
return ret;
if (victim->d_sb->s_fs_info != c) {
if (path.dentry->d_sb->s_fs_info != c) {
ret = -EXDEV;
goto err;
}
dir = d_inode(path.dentry);
ret = __bch2_unlink(dir, victim, true);
if (!ret) {
fsnotify_rmdir(dir, victim);
d_delete(victim);
}
inode_unlock(dir);
dir = path.dentry->d_parent->d_inode;
ret = __bch2_unlink(dir, path.dentry, true);
if (ret)
goto err;
fsnotify_rmdir(dir, path.dentry);
d_delete(path.dentry);
err:
dput(victim);
path_put(&path);
return ret;
}

View File

@ -1667,8 +1667,7 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
if (!first)
seq_putc(seq, ':');
first = false;
seq_puts(seq, "/dev/");
seq_puts(seq, ca->name);
seq_puts(seq, ca->disk_sb.sb_name);
}
return 0;
@ -1901,7 +1900,7 @@ got_sb:
sb->s_flags |= SB_POSIXACL;
#endif
sb->s_shrink.seeks = 0;
sb->s_shrink->seeks = 0;
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);

View File

@ -77,9 +77,8 @@ static inline int ptrcmp(void *l, void *r)
}
enum bch_inode_lock_op {
INODE_LOCK = (1U << 0),
INODE_PAGECACHE_BLOCK = (1U << 1),
INODE_UPDATE_LOCK = (1U << 2),
INODE_PAGECACHE_BLOCK = (1U << 0),
INODE_UPDATE_LOCK = (1U << 1),
};
#define bch2_lock_inodes(_locks, ...) \
@ -91,8 +90,6 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
if ((_locks) & INODE_LOCK) \
down_write_nested(&a[i]->v.i_rwsem, i); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_get(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
@ -109,8 +106,6 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
if ((_locks) & INODE_LOCK) \
up_write(&a[i]->v.i_rwsem); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_put(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \

View File

@ -826,6 +826,18 @@ fsck_err:
goto out;
}
static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
{
struct btree_iter iter;
struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
int ret = bkey_err(k);
if (ret)
return ret;
bch2_trans_iter_exit(trans, &iter);
return k.k->type == KEY_TYPE_set;
}
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@ -890,6 +902,17 @@ static int check_inode(struct btree_trans *trans,
return 0;
}
if (u.bi_flags & BCH_INODE_unlinked &&
c->sb.version >= bcachefs_metadata_version_deleted_inodes) {
ret = check_inode_deleted_list(trans, k.k->p);
if (ret)
return ret;
fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
"inode %llu:%u unlinked, but not on deleted list",
u.bi_inum, k.k->p.snapshot);
}
if (u.bi_flags & BCH_INODE_unlinked &&
(!c->sb.clean ||
fsck_err(c, inode_unlinked_but_clean,

View File

@ -1157,10 +1157,6 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
again:
need_another_pass = false;
ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
/*
* Weird transaction restart handling here because on successful delete,
* bch2_inode_rm_snapshot() will return a nested transaction restart,
@ -1191,8 +1187,12 @@ again:
}
bch2_trans_iter_exit(trans, &iter);
if (!ret && need_another_pass)
if (!ret && need_another_pass) {
ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
goto again;
}
err:
bch2_trans_put(trans);

View File

@ -580,9 +580,9 @@ static inline void wp_update_state(struct write_point *wp, bool running)
__wp_update_state(wp, state);
}
static void bch2_write_index(struct closure *cl)
static CLOSURE_CALLBACK(bch2_write_index)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
closure_type(op, struct bch_write_op, cl);
struct write_point *wp = op->wp;
struct workqueue_struct *wq = index_update_wq(op);
unsigned long flags;
@ -1208,9 +1208,9 @@ static void __bch2_nocow_write_done(struct bch_write_op *op)
bch2_nocow_write_convert_unwritten(op);
}
static void bch2_nocow_write_done(struct closure *cl)
static CLOSURE_CALLBACK(bch2_nocow_write_done)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
closure_type(op, struct bch_write_op, cl);
__bch2_nocow_write_done(op);
bch2_write_done(cl);
@ -1363,7 +1363,7 @@ err:
op->insert_keys.top = op->insert_keys.keys;
} else if (op->flags & BCH_WRITE_SYNC) {
closure_sync(&op->cl);
bch2_nocow_write_done(&op->cl);
bch2_nocow_write_done(&op->cl.work);
} else {
/*
* XXX
@ -1566,9 +1566,9 @@ err:
* If op->discard is true, instead of inserting the data it invalidates the
* region of the cache represented by op->bio and op->inode.
*/
void bch2_write(struct closure *cl)
CLOSURE_CALLBACK(bch2_write)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
closure_type(op, struct bch_write_op, cl);
struct bio *bio = &op->wbio.bio;
struct bch_fs *c = op->c;
unsigned data_len;

View File

@ -90,8 +90,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
op->devs_need_flush = NULL;
}
void bch2_write(struct closure *);
CLOSURE_CALLBACK(bch2_write);
void bch2_write_point_do_index_updates(struct work_struct *);
static inline struct bch_write_bio *wbio_init(struct bio *bio)

View File

@ -10,6 +10,7 @@
#include "bkey_methods.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "error.h"
#include "journal.h"
@ -147,6 +148,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
bch2_journal_reclaim_fast(j);
if (write)
closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
wake_up(&j->wait);
}
/*
@ -184,6 +186,8 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
trace_journal_entry_close(c, vstruct_bytes(buf->data));
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
BUG_ON(sectors > buf->sectors);
@ -328,6 +332,7 @@ static int journal_entry_open(struct journal *j)
buf->must_flush = false;
buf->separate_flush = false;
buf->flush_time = 0;
buf->need_flush_to_write_buffer = true;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
@ -764,6 +769,75 @@ void bch2_journal_block(struct journal *j)
journal_quiesce(j);
}
/*
* XXX: ideally this would not be closing the current journal entry, but
* otherwise we do not have a way to avoid racing with res_get() - j->blocked
* will race.
*/
static bool journal_reservations_stopped(struct journal *j)
{
union journal_res_state s;
journal_entry_close(j);
s.v = atomic64_read_acquire(&j->reservations.counter);
return s.buf0_count == 0 &&
s.buf1_count == 0 &&
s.buf2_count == 0 &&
s.buf3_count == 0;
}
void bch2_journal_block_reservations(struct journal *j)
{
spin_lock(&j->lock);
j->blocked++;
spin_unlock(&j->lock);
wait_event(j->wait, journal_reservations_stopped(j));
}
static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
{
spin_lock(&j->lock);
max_seq = min(max_seq, journal_cur_seq(j));
for (u64 seq = journal_last_unwritten_seq(j);
seq <= max_seq;
seq++) {
unsigned idx = seq & JOURNAL_BUF_MASK;
struct journal_buf *buf = j->buf + idx;
union journal_res_state s;
if (!buf->need_flush_to_write_buffer)
continue;
if (seq == journal_cur_seq(j))
__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
s.v = atomic64_read_acquire(&j->reservations.counter);
if (journal_state_count(s, idx)) {
spin_unlock(&j->lock);
return ERR_PTR(-EAGAIN);
}
spin_unlock(&j->lock);
return buf;
}
spin_unlock(&j->lock);
return NULL;
}
struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
{
struct journal_buf *ret;
wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN));
return ret;
}
/* allocate journal on a device: */
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
@ -1215,6 +1289,7 @@ int bch2_fs_journal_init(struct journal *j)
static struct lock_class_key res_key;
unsigned i;
mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);

View File

@ -259,7 +259,7 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
{
union journal_res_state s;
s.v = atomic64_sub_return(((union journal_res_state) {
s.v = atomic64_sub_return_release(((union journal_res_state) {
.buf0_count = idx == 0,
.buf1_count = idx == 1,
.buf2_count = idx == 2,
@ -427,6 +427,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
void bch2_journal_block_reservations(struct journal *);
struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq);
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);

View File

@ -4,6 +4,7 @@
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
@ -713,6 +714,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs
journal_entry_btree_keys_to_text(out, c, entry);
}
static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
unsigned version, int big_endian,
enum bkey_invalid_flags flags)
{
return journal_entry_btree_keys_validate(c, jset, entry,
version, big_endian, READ);
}
static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
struct jset_entry *entry)
{
journal_entry_btree_keys_to_text(out, c, entry);
}
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
@ -1025,10 +1042,9 @@ next_block:
return 0;
}
static void bch2_journal_read_device(struct closure *cl)
static CLOSURE_CALLBACK(bch2_journal_read_device)
{
struct journal_device *ja =
container_of(cl, struct journal_device, read);
closure_type(ja, struct journal_device, read);
struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
struct bch_fs *c = ca->fs;
struct journal_list *jlist =
@ -1494,6 +1510,8 @@ done:
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
/* we aren't holding j->lock: */
unsigned new_size = READ_ONCE(j->buf_size_want);
void *new_buf;
@ -1501,6 +1519,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
if (buf->buf_size >= new_size)
return;
size_t btree_write_buffer_size = new_size / 64;
if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
return;
new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
@ -1520,9 +1543,9 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
}
static void journal_write_done(struct closure *cl)
static CLOSURE_CALLBACK(journal_write_done)
{
struct journal *j = container_of(cl, struct journal, io);
closure_type(j, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
@ -1590,6 +1613,7 @@ static void journal_write_done(struct closure *cl)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
bch2_journal_reclaim_fast(j);
bch2_journal_space_available(j);
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
@ -1641,9 +1665,9 @@ static void journal_write_endio(struct bio *bio)
percpu_ref_put(&ca->io_ref);
}
static void do_journal_write(struct closure *cl)
static CLOSURE_CALLBACK(do_journal_write)
{
struct journal *j = container_of(cl, struct journal, io);
closure_type(j, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
@ -1693,9 +1717,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct jset_entry *start, *end, *i, *next, *prev = NULL;
struct jset *jset = w->data;
struct journal_keys_to_wb wb = { NULL };
unsigned sectors, bytes, u64s;
bool validate_before_checksum = false;
unsigned long btree_roots_have = 0;
bool validate_before_checksum = false;
u64 seq = le64_to_cpu(jset->seq);
int ret;
/*
@ -1723,9 +1749,28 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
* to c->btree_roots we have to get any missing btree roots and
* add them to this journal entry:
*/
if (i->type == BCH_JSET_ENTRY_btree_root) {
switch (i->type) {
case BCH_JSET_ENTRY_btree_root:
bch2_journal_entry_to_btree_root(c, i);
__set_bit(i->btree_id, &btree_roots_have);
break;
case BCH_JSET_ENTRY_write_buffer_keys:
EBUG_ON(!w->need_flush_to_write_buffer);
if (!wb.wb)
bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
struct bkey_i *k;
jset_entry_for_each_key(i, k) {
ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
if (ret) {
bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
bch2_journal_keys_to_write_buffer_end(c, &wb);
return ret;
}
}
i->type = BCH_JSET_ENTRY_btree_keys;
break;
}
/* Can we merge with previous entry? */
@ -1748,6 +1793,10 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
memmove_u64s_down(prev, i, jset_u64s(u64s));
}
if (wb.wb)
bch2_journal_keys_to_write_buffer_end(c, &wb);
w->need_flush_to_write_buffer = false;
prev = prev ? vstruct_next(prev) : jset->start;
jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
@ -1755,8 +1804,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
bch2_journal_super_entries_add_common(c, &end,
le64_to_cpu(jset->seq));
bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@ -1779,7 +1827,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
j->last_empty_seq = le64_to_cpu(jset->seq);
j->last_empty_seq = seq;
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
@ -1853,9 +1901,9 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
return 0;
}
void bch2_journal_write(struct closure *cl)
CLOSURE_CALLBACK(bch2_journal_write)
{
struct journal *j = container_of(cl, struct journal, io);
closure_type(j, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
@ -1875,9 +1923,11 @@ void bch2_journal_write(struct closure *cl)
if (ret)
goto err;
mutex_lock(&j->buf_lock);
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
mutex_unlock(&j->buf_lock);
if (ret)
goto err;

View File

@ -60,6 +60,6 @@ void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
void bch2_journal_write(struct closure *);
CLOSURE_CALLBACK(bch2_journal_write);
#endif /* _BCACHEFS_JOURNAL_IO_H */

View File

@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_write_buffer.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
@ -50,20 +51,23 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
return available;
}
static inline void journal_set_watermark(struct journal *j)
void bch2_journal_set_watermark(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool low_on_space = j->space[journal_space_clean].total * 4 <=
j->space[journal_space_total].total;
bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
unsigned watermark = low_on_space || low_on_pin
bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
unsigned watermark = low_on_space || low_on_pin || low_on_wb
? BCH_WATERMARK_reclaim
: BCH_WATERMARK_stripe;
if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
&j->low_on_space_start, low_on_space) ||
track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
&j->low_on_pin_start, low_on_pin))
&j->low_on_pin_start, low_on_pin) ||
track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
&j->write_buffer_full_start, low_on_wb))
trace_and_count(c, journal_full, c);
swap(watermark, j->watermark);
@ -230,7 +234,7 @@ void bch2_journal_space_available(struct journal *j)
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
journal_set_watermark(j);
bch2_journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
@ -303,6 +307,7 @@ void bch2_journal_reclaim_fast(struct journal *j)
* all btree nodes got written out
*/
while (!fifo_empty(&j->pin) &&
j->pin.front <= j->seq_ondisk &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
j->pin.front++;
popped = true;
@ -635,7 +640,6 @@ static u64 journal_seq_to_flush(struct journal *j)
static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
bool kthread = (current->flags & PF_KTHREAD) != 0;
u64 seq_to_flush;
size_t min_nr, min_key_cache, nr_flushed;
unsigned flags;
@ -651,7 +655,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
flags = memalloc_noreclaim_save();
do {
if (kthread && kthread_should_stop())
if (kthread_should_stop())
break;
if (bch2_journal_error(j)) {

View File

@ -16,6 +16,7 @@ static inline void journal_reclaim_kick(struct journal *j)
unsigned bch2_journal_dev_buckets_available(struct journal *,
struct journal_device *,
enum journal_space_from);
void bch2_journal_set_watermark(struct journal *);
void bch2_journal_space_available(struct journal *);
static inline bool journal_pin_active(struct journal_entry_pin *pin)

View File

@ -36,6 +36,7 @@ struct journal_buf {
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
bool separate_flush;
bool need_flush_to_write_buffer;
};
/*
@ -181,6 +182,12 @@ struct journal {
*/
darray_u64 early_journal_entries;
/*
* Protects journal_buf->data, when accessing without a journal
* reservation: for synchronization between the btree write buffer code
* and the journal write path:
*/
struct mutex buf_lock;
/*
* Two journal entries -- one is currently open for new entries, the
* other is possibly being written out.
@ -271,6 +278,7 @@ struct journal {
u64 low_on_space_start;
u64 low_on_pin_start;
u64 max_in_flight_start;
u64 write_buffer_full_start;
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;

View File

@ -27,6 +27,13 @@
#include <linux/ioprio.h>
#include <linux/kthread.h>
const char * const bch2_data_ops_strs[] = {
#define x(t, n, ...) [n] = #t,
BCH_DATA_OPS()
#undef x
NULL
};
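
/* Aside, not part of this commit: the table above uses the usual X-macro
 * expansion. A self-contained sketch of how it works is below; the
 * EXAMPLE_DATA_OPS() list and its numbering are illustrative stand-ins,
 * not copied from the real BCH_DATA_OPS() header.
 */
#include <stdio.h>

#define EXAMPLE_DATA_OPS()			\
	x(scrub,		0)		\
	x(rereplicate,		1)		\
	x(migrate,		2)		\
	x(rewrite_old_nodes,	3)		\
	x(drop_extra_replicas,	4)

/* Each x(t, n) expands to [n] = "t", producing a NULL-terminated string
 * table that can be indexed by op number or scanned for a name match. */
static const char * const example_ops_strs[] = {
#define x(t, n, ...) [n] = #t,
	EXAMPLE_DATA_OPS()
#undef x
	NULL
};

int main(void)
{
	for (unsigned i = 0; example_ops_strs[i]; i++)
		printf("%u: %s\n", i, example_ops_strs[i]);
	return 0;
}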
static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
{
if (trace_move_extent_enabled()) {
@ -163,12 +170,17 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
closure_sync(&ctxt->cl);
}
void bch2_moving_ctxt_exit(struct moving_context *ctxt)
{
struct bch_fs *c = ctxt->trans->c;
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
closure_sync(&ctxt->cl);
bch2_moving_ctxt_flush_all(ctxt);
EBUG_ON(atomic_read(&ctxt->write_sectors));
EBUG_ON(atomic_read(&ctxt->write_ios));
@ -216,7 +228,7 @@ void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
trace_move_data(c, stats);
}
void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
memset(stats, 0, sizeof(*stats));
stats->data_type = BCH_DATA_user;
@ -484,8 +496,8 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
struct bch_fs *c = ctxt->trans->c;
u64 delay;
if (ctxt->wait_on_copygc && !c->copygc_running) {
bch2_trans_unlock_long(ctxt->trans);
if (ctxt->wait_on_copygc && c->copygc_running) {
bch2_moving_ctxt_flush_all(ctxt);
wait_event_killable(c->copygc_running_wq,
!c->copygc_running ||
kthread_should_stop());
@ -503,7 +515,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
set_current_state(TASK_INTERRUPTIBLE);
}
if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
return 1;
}
@ -512,7 +524,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
schedule_timeout(delay);
if (unlikely(freezing(current))) {
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
bch2_moving_ctxt_flush_all(ctxt);
try_to_freeze();
}
} while (delay);
@ -721,11 +733,10 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
fragmentation = a->fragmentation_lru;
ret = bch2_btree_write_buffer_flush(trans);
if (ret) {
ret = bch2_btree_write_buffer_tryflush(trans);
bch_err_msg(c, ret, "flushing btree write buffer");
if (ret)
goto err;
}
while (!(ret = bch2_move_ratelimit(ctxt))) {
bch2_trans_begin(trans);
@ -856,18 +867,17 @@ typedef bool (*move_btree_pred)(struct bch_fs *, void *,
struct data_update_opts *);
static int bch2_move_btree(struct bch_fs *c,
enum btree_id start_btree_id, struct bpos start_pos,
enum btree_id end_btree_id, struct bpos end_pos,
struct bbpos start,
struct bbpos end,
move_btree_pred pred, void *arg,
struct bch_move_stats *stats)
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct moving_context ctxt;
struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
enum btree_id id;
enum btree_id btree;
struct data_update_opts data_opts;
int ret = 0;
@ -878,26 +888,26 @@ static int bch2_move_btree(struct bch_fs *c,
stats->data_type = BCH_DATA_btree;
for (id = start_btree_id;
id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
id++) {
stats->pos = BBPOS(id, POS_MIN);
for (btree = start.btree;
btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
btree ++) {
stats->pos = BBPOS(btree, POS_MIN);
if (!bch2_btree_id_root(c, id)->b)
if (!bch2_btree_id_root(c, btree)->b)
continue;
bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
BTREE_ITER_PREFETCH);
retry:
ret = 0;
while (bch2_trans_begin(trans),
(b = bch2_btree_iter_peek_node(&iter)) &&
!(ret = PTR_ERR_OR_ZERO(b))) {
if (kthread && kthread_should_stop())
if (kthread_should_stop())
break;
if ((cmp_int(id, end_btree_id) ?:
bpos_cmp(b->key.k.p, end_pos)) > 0)
if ((cmp_int(btree, end.btree) ?:
bpos_cmp(b->key.k.p, end.pos)) > 0)
break;
stats->pos = BBPOS(iter.btree_id, iter.pos);
@ -918,7 +928,7 @@ next:
bch2_trans_iter_exit(trans, &iter);
if (kthread && kthread_should_stop())
if (kthread_should_stop())
break;
}
@ -1034,8 +1044,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
int ret;
ret = bch2_move_btree(c,
0, POS_MIN,
BTREE_ID_NR, SPOS_MAX,
BBPOS_MIN,
BBPOS_MAX,
rewrite_old_nodes_pred, c, stats);
if (!ret) {
mutex_lock(&c->sb_lock);
@ -1050,71 +1060,101 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
return ret;
}
static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
struct bkey_s_c k,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
unsigned durability = bch2_bkey_durability(c, k);
unsigned replicas = bkey_is_btree_ptr(k.k)
? c->opts.metadata_replicas
: io_opts->data_replicas;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
unsigned i = 0;
bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
unsigned d = bch2_extent_ptr_durability(c, &p);
if (d && durability - d >= replicas) {
data_opts->kill_ptrs |= BIT(i);
durability -= d;
}
i++;
}
return data_opts->kill_ptrs != 0;
}
static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
struct btree *b,
struct bch_io_opts *io_opts,
struct data_update_opts *data_opts)
{
return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
}
int bch2_data_job(struct bch_fs *c,
struct bch_move_stats *stats,
struct bch_ioctl_data op)
{
struct bbpos start = BBPOS(op.start_btree, op.start_pos);
struct bbpos end = BBPOS(op.end_btree, op.end_pos);
int ret = 0;
if (op.op >= BCH_DATA_OP_NR)
return -EINVAL;
bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
bch2_move_stats_init(stats, "rereplicate");
case BCH_DATA_OP_rereplicate:
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
ret = bch2_move_btree(c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
ret = bch2_move_btree(c, start, end,
rereplicate_btree_pred, c, stats) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
(struct bbpos) { op.start_btree, op.start_pos },
(struct bbpos) { op.end_btree, op.end_pos },
ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_MIGRATE:
case BCH_DATA_OP_migrate:
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
bch2_move_stats_init(stats, "migrate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
ret = bch2_move_btree(c,
op.start_btree, op.start_pos,
op.end_btree, op.end_pos,
ret = bch2_move_btree(c, start, end,
migrate_btree_pred, &op, stats) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
ret = bch2_move_data(c,
(struct bbpos) { op.start_btree, op.start_pos },
(struct bbpos) { op.end_btree, op.end_pos },
ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_REWRITE_OLD_NODES:
bch2_move_stats_init(stats, "rewrite_old_nodes");
case BCH_DATA_OP_rewrite_old_nodes:
ret = bch2_scan_old_btree_nodes(c, stats);
bch2_move_stats_exit(stats, c);
break;
case BCH_DATA_OP_drop_extra_replicas:
ret = bch2_move_btree(c, start, end,
drop_extra_replicas_btree_pred, c, stats) ?: ret;
ret = bch2_move_data(c, start, end, NULL, stats,
writepoint_hashed((unsigned long) current),
true,
drop_extra_replicas_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
default:
ret = -EINVAL;
}
bch2_move_stats_exit(stats, c);
return ret;
}

View File

@ -56,6 +56,8 @@ do { \
typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
struct bch_io_opts *, struct data_update_opts *);
extern const char * const bch2_data_ops_strs[];
void bch2_moving_ctxt_exit(struct moving_context *);
void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct bch_ratelimit *, struct bch_move_stats *,
@ -130,7 +132,7 @@ int bch2_data_job(struct bch_fs *,
void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
void bch2_move_stats_init(struct bch_move_stats *, char *);
void bch2_move_stats_init(struct bch_move_stats *, const char *);
void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);

View File

@ -153,8 +153,11 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
move_buckets_wait(ctxt, buckets_in_flight, false);
ret = bch2_btree_write_buffer_flush(trans);
if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
ret = bch2_btree_write_buffer_tryflush(trans);
if (bch2_err_matches(ret, EROFS))
return ret;
if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
__func__, bch2_err_str(ret)))
return ret;

View File

@ -233,11 +233,6 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Stash pointer to in memory btree node in btree ptr")\
x(btree_write_buffer_size, u32, \
OPT_FS|OPT_MOUNT, \
OPT_UINT(16, (1U << 20) - 1), \
BCH2_NO_SB_OPT, 1U << 13, \
NULL, "Number of btree write buffer entries") \
x(gc_reserve_percent, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(5, 21), \

View File

@ -159,6 +159,8 @@ static int bch2_journal_replay(struct bch_fs *c)
goto err;
}
BUG_ON(!atomic_read(&keys->ref));
/*
* First, attempt to replay keys in sorted order. This is more
* efficient - better locality of btree access - but some might fail if
@ -218,14 +220,15 @@ static int bch2_journal_replay(struct bch_fs *c)
bch2_trans_put(trans);
trans = NULL;
if (!c->opts.keep_journal)
bch2_journal_keys_put_initial(c);
replay_now_at(j, j->replay_journal_seq_end);
j->replay_journal_seq = 0;
bch2_journal_set_replay_done(j);
bch2_journal_flush_all_pins(j);
ret = bch2_journal_error(j);
if (keys->nr && !ret)
if (keys->nr)
bch2_journal_log_msg(c, "journal replay finished");
err:
if (trans)
@ -935,8 +938,12 @@ use_clean:
bch2_move_stats_init(&stats, "recovery");
bch_info(c, "scanning for old btree nodes");
ret = bch2_fs_read_write(c) ?:
struct printbuf buf = PRINTBUF;
bch2_version_to_text(&buf, c->sb.version_min);
bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
printbuf_exit(&buf);
ret = bch2_fs_read_write_early(c) ?:
bch2_scan_old_btree_nodes(c, &stats);
if (ret)
goto err;
@ -953,10 +960,8 @@ out:
bch2_flush_fsck_errs(c);
if (!c->opts.keep_journal &&
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) {
bch2_journal_keys_free(&c->journal_keys);
bch2_journal_entries_free(c);
}
test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
bch2_journal_keys_put_initial(c);
kfree(clean);
if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {

View File

@ -11,7 +11,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
/* Replicas tracking - in memory: */
static void verify_replicas_entry(struct bch_replicas_entry *e)
static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned i;
@ -26,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e)
#endif
}
void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
@ -53,7 +53,7 @@ static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
}
void bch2_replicas_entry_to_text(struct printbuf *out,
struct bch_replicas_entry *e)
struct bch_replicas_entry_v1 *e)
{
unsigned i;
@ -71,7 +71,7 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
void bch2_cpu_replicas_to_text(struct printbuf *out,
struct bch_replicas_cpu *r)
{
struct bch_replicas_entry *e;
struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_cpu_replicas_entry(r, e) {
@ -84,7 +84,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
}
static void extent_to_replicas(struct bkey_s_c k,
struct bch_replicas_entry *r)
struct bch_replicas_entry_v1 *r)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@ -104,7 +104,7 @@ static void extent_to_replicas(struct bkey_s_c k,
}
static void stripe_to_replicas(struct bkey_s_c k,
struct bch_replicas_entry *r)
struct bch_replicas_entry_v1 *r)
{
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
const struct bch_extent_ptr *ptr;
@ -117,7 +117,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
r->devs[r->nr_devs++] = ptr->dev;
}
void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
struct bkey_s_c k)
{
e->nr_devs = 0;
@ -142,7 +142,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
bch2_replicas_entry_sort(e);
}
void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
@ -164,7 +164,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
struct bch_replicas_entry *new_entry)
struct bch_replicas_entry_v1 *new_entry)
{
unsigned i;
struct bch_replicas_cpu new = {
@ -194,7 +194,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
}
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
struct bch_replicas_entry *search)
struct bch_replicas_entry_v1 *search)
{
int idx, entry_size = replicas_entry_bytes(search);
@ -212,7 +212,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
}
int bch2_replicas_entry_idx(struct bch_fs *c,
struct bch_replicas_entry *search)
struct bch_replicas_entry_v1 *search)
{
bch2_replicas_entry_sort(search);
@ -220,13 +220,13 @@ int bch2_replicas_entry_idx(struct bch_fs *c,
}
static bool __replicas_has_entry(struct bch_replicas_cpu *r,
struct bch_replicas_entry *search)
struct bch_replicas_entry_v1 *search)
{
return __replicas_entry_idx(r, search) >= 0;
}
bool bch2_replicas_marked(struct bch_fs *c,
struct bch_replicas_entry *search)
struct bch_replicas_entry_v1 *search)
{
bool marked;
@ -343,7 +343,7 @@ err:
static unsigned reserve_journal_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_replicas_entry *e;
struct bch_replicas_entry_v1 *e;
unsigned journal_res_u64s = 0;
/* nr_inodes: */
@ -368,7 +368,7 @@ static unsigned reserve_journal_replicas(struct bch_fs *c,
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
struct bch_replicas_entry *new_entry)
struct bch_replicas_entry_v1 *new_entry)
{
struct bch_replicas_cpu new_r, new_gc;
int ret = 0;
@ -433,7 +433,7 @@ err:
goto out;
}
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
return likely(bch2_replicas_marked(c, r))
? 0 : bch2_mark_replicas_slowpath(c, r);
@ -484,7 +484,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_entry *e;
struct bch_replicas_entry_v1 *e;
unsigned i = 0;
lockdep_assert_held(&c->replicas_gc_lock);
@ -559,7 +559,7 @@ retry:
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
if (e->data_type == BCH_DATA_journal ||
@ -590,7 +590,7 @@ retry:
}
int bch2_replicas_set_usage(struct bch_fs *c,
struct bch_replicas_entry *r,
struct bch_replicas_entry_v1 *r,
u64 sectors)
{
int ret, idx = bch2_replicas_entry_idx(c, r);
@ -623,7 +623,7 @@ static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
struct bch_replicas_cpu *cpu_r)
{
struct bch_replicas_entry *e, *dst;
struct bch_replicas_entry_v1 *e, *dst;
unsigned nr = 0, entry_size = 0, idx = 0;
for_each_replicas_entry(sb_r, e) {
@ -661,7 +661,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
nr++;
}
entry_size += sizeof(struct bch_replicas_entry) -
entry_size += sizeof(struct bch_replicas_entry_v1) -
sizeof(struct bch_replicas_entry_v0);
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
@ -672,7 +672,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
cpu_r->entry_size = entry_size;
for_each_replicas_entry(sb_r, e) {
struct bch_replicas_entry *dst =
struct bch_replicas_entry_v1 *dst =
cpu_replicas_entry(cpu_r, idx++);
dst->data_type = e->data_type;
@ -716,7 +716,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
{
struct bch_sb_field_replicas_v0 *sb_r;
struct bch_replicas_entry_v0 *dst;
struct bch_replicas_entry *src;
struct bch_replicas_entry_v1 *src;
size_t bytes;
bytes = sizeof(struct bch_sb_field_replicas);
@ -754,7 +754,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry *dst, *src;
struct bch_replicas_entry_v1 *dst, *src;
bool need_v1 = false;
size_t bytes;
@ -805,7 +805,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
memcmp, NULL);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry *e =
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i);
if (e->data_type >= BCH_DATA_NR) {
@ -835,7 +835,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
}
if (i + 1 < cpu_r->nr) {
struct bch_replicas_entry *n =
struct bch_replicas_entry_v1 *n =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
@ -872,7 +872,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
struct bch_sb_field *f)
{
struct bch_sb_field_replicas *r = field_to_type(f, replicas);
struct bch_replicas_entry *e;
struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_replicas_entry(r, e) {
@ -934,7 +934,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
unsigned flags, bool print)
{
struct bch_replicas_entry *e;
struct bch_replicas_entry_v1 *e;
bool ret = true;
percpu_down_read(&c->mark_lock);
@ -994,7 +994,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
if (replicas) {
struct bch_replicas_entry *r;
struct bch_replicas_entry_v1 *r;
for_each_replicas_entry(replicas, r)
for (i = 0; i < r->nr_devs; i++)

View File

@ -6,26 +6,26 @@
#include "eytzinger.h"
#include "replicas_types.h"
void bch2_replicas_entry_sort(struct bch_replicas_entry *);
void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry *);
struct bch_replicas_entry_v1 *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
static inline struct bch_replicas_entry *
static inline struct bch_replicas_entry_v1 *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
int bch2_replicas_entry_idx(struct bch_fs *,
struct bch_replicas_entry *);
struct bch_replicas_entry_v1 *);
void bch2_devlist_to_replicas(struct bch_replicas_entry *,
void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
enum bch_data_type,
struct bch_devs_list);
bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
int bch2_mark_replicas(struct bch_fs *,
struct bch_replicas_entry *);
struct bch_replicas_entry_v1 *);
static inline struct replicas_delta *
replicas_delta_next(struct replicas_delta *d)
@ -35,9 +35,9 @@ replicas_delta_next(struct replicas_delta *d)
int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
unsigned dev)
{
e->data_type = BCH_DATA_cached;
@ -57,7 +57,7 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned);
int bch2_replicas_gc2(struct bch_fs *);
int bch2_replicas_set_usage(struct bch_fs *,
struct bch_replicas_entry *,
struct bch_replicas_entry_v1 *,
u64);
#define for_each_cpu_replicas_entry(_r, _i) \

View File

@ -5,12 +5,12 @@
struct bch_replicas_cpu {
unsigned nr;
unsigned entry_size;
struct bch_replicas_entry *entries;
struct bch_replicas_entry_v1 *entries;
};
struct replicas_delta {
s64 delta;
struct bch_replicas_entry r;
struct bch_replicas_entry_v1 r;
} __packed;
struct replicas_delta_list {

View File

@ -235,7 +235,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),

View File

@ -248,7 +248,8 @@
x(root_inode_not_dir, 240) \
x(dir_loop, 241) \
x(hash_table_key_duplicate, 242) \
x(hash_table_key_wrong_offset, 243)
x(hash_table_key_wrong_offset, 243) \
x(unlinked_inode_not_on_deleted_list, 244)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,

View File

@ -324,7 +324,7 @@ bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
}
EXPORT_SYMBOL_GPL(six_relock_ip);
#ifdef CONFIG_LOCK_SPIN_ON_OWNER
#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN
static inline bool six_owner_running(struct six_lock *lock)
{

View File

@ -166,6 +166,7 @@ void bch2_free_super(struct bch_sb_handle *sb)
if (!IS_ERR_OR_NULL(sb->bdev))
blkdev_put(sb->bdev, sb->holder);
kfree(sb->holder);
kfree(sb->sb_name);
kfree(sb->sb);
memset(sb, 0, sizeof(*sb));
@ -657,12 +658,13 @@ reread:
return 0;
}
int bch2_read_super(const char *path, struct bch_opts *opts,
struct bch_sb_handle *sb)
int __bch2_read_super(const char *path, struct bch_opts *opts,
struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
{
u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
struct printbuf err = PRINTBUF;
struct printbuf err2 = PRINTBUF;
__le64 *i;
int ret;
#ifndef __KERNEL__
@ -675,6 +677,10 @@ retry:
if (!sb->holder)
return -ENOMEM;
sb->sb_name = kstrdup(path, GFP_KERNEL);
if (!sb->sb_name)
return -ENOMEM;
#ifndef __KERNEL__
if (opt_get(*opts, direct_io) == false)
sb->mode |= BLK_OPEN_BUFFERED;
@ -721,8 +727,14 @@ retry:
if (opt_defined(*opts, sb))
goto err;
printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
printk(KERN_INFO "%s", err2.buf);
else
printk(KERN_ERR "%s", err2.buf);
printbuf_exit(&err2);
printbuf_reset(&err);
/*
@ -798,6 +810,20 @@ err_no_print:
goto out;
}
int bch2_read_super(const char *path, struct bch_opts *opts,
struct bch_sb_handle *sb)
{
return __bch2_read_super(path, opts, sb, false);
}
/* provide a silenced version for mount.bcachefs */
int bch2_read_super_silent(const char *path, struct bch_opts *opts,
struct bch_sb_handle *sb)
{
return __bch2_read_super(path, opts, sb, true);
}
/* write superblock: */
static void write_super_endio(struct bio *bio)

View File

@ -74,6 +74,7 @@ void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);
void __bch2_check_set_feature(struct bch_fs *, unsigned);

View File

@ -314,7 +314,8 @@ void bch2_fs_read_only(struct bch_fs *c)
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
BUG_ON(c->btree_write_buffer.state.nr);
BUG_ON(c->btree_write_buffer.inc.keys.nr);
BUG_ON(c->btree_write_buffer.flushing.keys.nr);
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
@ -504,8 +505,8 @@ static void __bch2_fs_free(struct bch_fs *c)
bch2_io_clock_exit(&c->io_clock[WRITE]);
bch2_io_clock_exit(&c->io_clock[READ]);
bch2_fs_compress_exit(c);
bch2_journal_keys_free(&c->journal_keys);
bch2_journal_entries_free(c);
bch2_journal_keys_put_initial(c);
BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
free_percpu(c->online_reserved);
@ -704,6 +705,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
init_rwsem(&c->gc_lock);
mutex_init(&c->gc_gens_lock);
atomic_set(&c->journal_keys.ref, 1);
c->journal_keys.initial_ref_held = true;
for (i = 0; i < BCH_TIME_STAT_NR; i++)
bch2_time_stats_init(&c->times[i]);

View File

@ -5,6 +5,7 @@
struct bch_sb_handle {
struct bch_sb *sb;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
void *holder;
size_t buffer_size;

View File

@ -496,7 +496,7 @@ STORE(bch2_fs)
sc.gfp_mask = GFP_KERNEL;
sc.nr_to_scan = strtoul_or_return(buf);
c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
}
if (attr == &sysfs_btree_wakeup)

View File

@ -188,6 +188,25 @@ DEFINE_EVENT(bch_fs, journal_entry_full,
TP_ARGS(c)
);
TRACE_EVENT(journal_entry_close,
TP_PROTO(struct bch_fs *c, unsigned bytes),
TP_ARGS(c, bytes),
TP_STRUCT__entry(
__field(dev_t, dev )
__field(u32, bytes )
),
TP_fast_assign(
__entry->dev = c->dev;
__entry->bytes = bytes;
),
TP_printk("%d,%d entry bytes %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->bytes)
);
DEFINE_EVENT(bio, journal_write,
TP_PROTO(struct bio *bio),
TP_ARGS(bio)
@ -1313,21 +1332,38 @@ TRACE_EVENT(write_buffer_flush,
__entry->nr, __entry->size, __entry->skipped, __entry->fast)
);
TRACE_EVENT(write_buffer_flush_slowpath,
TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
TP_ARGS(trans, nr, size),
TRACE_EVENT(write_buffer_flush_sync,
TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
TP_ARGS(trans, caller_ip),
TP_STRUCT__entry(
__field(size_t, nr )
__field(size_t, size )
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
),
TP_fast_assign(
__entry->nr = nr;
__entry->size = size;
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
),
TP_printk("%zu/%zu", __entry->nr, __entry->size)
TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
);
TRACE_EVENT(write_buffer_flush_slowpath,
TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
TP_ARGS(trans, slowpath, total),
TP_STRUCT__entry(
__field(size_t, slowpath )
__field(size_t, total )
),
TP_fast_assign(
__entry->slowpath = slowpath;
__entry->total = total;
),
TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
);
#endif /* _TRACE_BCACHEFS_H */

View File

@ -36,7 +36,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
closure_debug_destroy(cl);
if (destructor)
destructor(cl);
destructor(&cl->work);
if (parent)
closure_put(parent);
@ -108,8 +108,9 @@ struct closure_syncer {
int done;
};
static void closure_sync_fn(struct closure *cl)
static CLOSURE_CALLBACK(closure_sync_fn)
{
struct closure *cl = container_of(ws, struct closure, work);
struct closure_syncer *s = cl->s;
struct task_struct *p;

View File

@ -12,7 +12,12 @@
static LIST_HEAD(shrinker_list);
static DEFINE_MUTEX(shrinker_lock);
int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
{
return calloc(sizeof(struct shrinker), 1);
}
int shrinker_register(struct shrinker *shrinker)
{
mutex_lock(&shrinker_lock);
list_add_tail(&shrinker->list, &shrinker_list);