From 018de5aa899937a9dc3bc8cb9819cb218a59abf3 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 May 2018 14:04:31 -0400 Subject: [PATCH] Update bcachefs sources to ed4aea2ad4 bcachefs: fix gcc warning --- .bcachefs_revision | 2 +- libbcachefs/alloc.c | 1 + libbcachefs/bcachefs.h | 35 +- libbcachefs/bcachefs_format.h | 53 +- libbcachefs/btree_io.c | 120 +- libbcachefs/btree_io.h | 2 +- libbcachefs/btree_iter.c | 4 +- libbcachefs/btree_update_interior.c | 1 + libbcachefs/btree_update_leaf.c | 13 +- libbcachefs/buckets.h | 3 +- libbcachefs/debug.c | 12 +- libbcachefs/error.c | 10 +- libbcachefs/error.h | 6 +- libbcachefs/extents.c | 89 +- libbcachefs/extents.h | 12 +- libbcachefs/extents_types.h | 1 - libbcachefs/fs-io.c | 271 ++- libbcachefs/fs.c | 1 + libbcachefs/fs.h | 2 + libbcachefs/io.c | 924 ++++++--- libbcachefs/io.h | 33 +- libbcachefs/io_types.h | 21 +- libbcachefs/journal.c | 2974 +++++---------------------- libbcachefs/journal.h | 128 +- libbcachefs/journal_io.c | 1423 +++++++++++++ libbcachefs/journal_io.h | 45 + libbcachefs/journal_reclaim.c | 411 ++++ libbcachefs/journal_reclaim.h | 36 + libbcachefs/journal_seq_blacklist.c | 358 ++++ libbcachefs/journal_seq_blacklist.h | 13 + libbcachefs/journal_types.h | 8 +- libbcachefs/move.c | 31 +- libbcachefs/super.c | 49 +- libbcachefs/super.h | 21 +- libbcachefs/sysfs.c | 100 +- libbcachefs/util.c | 201 +- libbcachefs/util.h | 101 +- 37 files changed, 4216 insertions(+), 3299 deletions(-) create mode 100644 libbcachefs/journal_io.c create mode 100644 libbcachefs/journal_io.h create mode 100644 libbcachefs/journal_reclaim.c create mode 100644 libbcachefs/journal_reclaim.h create mode 100644 libbcachefs/journal_seq_blacklist.c create mode 100644 libbcachefs/journal_seq_blacklist.h diff --git a/.bcachefs_revision b/.bcachefs_revision index a7c36b9e..37d51b2f 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -edf5f38218f699e53913a549465f35d36c4418f7 +ed4aea2ad4fa1b3891684cbd071d1a1ae9094342 diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index 16bdc48c..256adb51 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -69,6 +69,7 @@ #include "extents.h" #include "io.h" #include "journal.h" +#include "journal_io.h" #include "super-io.h" #include diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index bc10324f..206c30f4 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -271,17 +271,19 @@ do { \ #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() #endif -/* name, frequency_units, duration_units */ -#define BCH_TIME_STATS() \ - BCH_TIME_STAT(btree_node_mem_alloc, sec, us) \ - BCH_TIME_STAT(btree_gc, sec, ms) \ - BCH_TIME_STAT(btree_split, sec, us) \ - BCH_TIME_STAT(btree_sort, ms, us) \ - BCH_TIME_STAT(btree_read, ms, us) \ - BCH_TIME_STAT(journal_write, us, us) \ - BCH_TIME_STAT(journal_delay, ms, us) \ - BCH_TIME_STAT(journal_blocked, sec, ms) \ - BCH_TIME_STAT(journal_flush_seq, us, us) +#define BCH_TIME_STATS() \ + BCH_TIME_STAT(btree_node_mem_alloc) \ + BCH_TIME_STAT(btree_gc) \ + BCH_TIME_STAT(btree_split) \ + BCH_TIME_STAT(btree_sort) \ + BCH_TIME_STAT(btree_read) \ + BCH_TIME_STAT(data_write) \ + BCH_TIME_STAT(data_read) \ + BCH_TIME_STAT(data_promote) \ + BCH_TIME_STAT(journal_write) \ + BCH_TIME_STAT(journal_delay) \ + BCH_TIME_STAT(journal_blocked) \ + BCH_TIME_STAT(journal_flush_seq) #include "alloc_types.h" #include "buckets_types.h" @@ -416,7 +418,12 @@ struct bch_dev { struct work_struct io_error_work; /* The rest of this all shows up in sysfs */ - atomic_t 
latency[2]; + atomic64_t cur_latency[2]; + struct time_stats io_latency[2]; + +#define CONGESTED_MAX 1024 + atomic_t congested; + u64 congested_last; struct io_count __percpu *io_done; }; @@ -644,6 +651,7 @@ struct bch_fs { struct bio_set bio_write; struct mutex bio_bounce_pages_lock; mempool_t bio_bounce_pages; + struct rhashtable promote_table; mempool_t compression_bounce[2]; mempool_t compress_workspace[BCH_COMPRESSION_NR]; @@ -708,12 +716,13 @@ struct bch_fs { unsigned copy_gc_enabled:1; unsigned rebalance_enabled:1; unsigned rebalance_percent; + bool promote_whole_extents; #define BCH_DEBUG_PARAM(name, description) bool name; BCH_DEBUG_PARAMS_ALL() #undef BCH_DEBUG_PARAM -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ +#define BCH_TIME_STAT(name) \ struct time_stats name##_time; BCH_TIME_STATS() #undef BCH_TIME_STAT diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index eed6fb85..48d14a30 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1088,13 +1088,14 @@ LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); -LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, - struct bch_sb, flags[1], 28, 32); LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); +LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, + struct bch_sb, flags[2], 0, 4); + /* Features: */ enum bch_sb_features { BCH_FEATURE_LZ4 = 0, @@ -1193,29 +1194,41 @@ struct jset_entry { }; }; +#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) + +#define BCH_JSET_ENTRY_TYPES() \ + x(btree_keys, 0) \ + x(btree_root, 1) \ + x(prio_ptrs, 2) \ + x(blacklist, 3) \ + x(blacklist_v2, 4) + +enum { +#define x(f, nr) BCH_JSET_ENTRY_##f = nr, + BCH_JSET_ENTRY_TYPES() +#undef x + BCH_JSET_ENTRY_NR +}; + +/* + * Journal sequence numbers can be blacklisted: bsets record the max sequence + * number of all the journal entries they contain updates for, so that on + * recovery we can ignore those bsets that contain index updates newer that what + * made it into the journal. + * + * This means that we can't reuse that journal_seq - we have to skip it, and + * then record that we skipped it so that the next time we crash and recover we + * don't think there was a missing journal entry. + */ struct jset_entry_blacklist { struct jset_entry entry; __le64 seq; }; -#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) - -enum { - JOURNAL_ENTRY_BTREE_KEYS = 0, - JOURNAL_ENTRY_BTREE_ROOT = 1, - JOURNAL_ENTRY_PRIO_PTRS = 2, /* Obsolete */ - - /* - * Journal sequence numbers can be blacklisted: bsets record the max - * sequence number of all the journal entries they contain updates for, - * so that on recovery we can ignore those bsets that contain index - * updates newer that what made it into the journal. - * - * This means that we can't reuse that journal_seq - we have to skip it, - * and then record that we skipped it so that the next time we crash and - * recover we don't think there was a missing journal entry. 
- */ - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3, +struct jset_entry_blacklist_v2 { + struct jset_entry entry; + __le64 start; + __le64 end; }; /* diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 0525c3b8..1aa94229 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -13,7 +13,8 @@ #include "error.h" #include "extents.h" #include "io.h" -#include "journal.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" #include "super-io.h" #include @@ -947,6 +948,7 @@ enum btree_validate_ret { #define btree_err(type, c, b, i, msg, ...) \ ({ \ + __label__ out; \ char _buf[300], *out = _buf, *end = out + sizeof(_buf); \ \ out += btree_err_msg(c, b, i, b->written, write, out, end - out);\ @@ -956,7 +958,11 @@ enum btree_validate_ret { write == READ && \ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ mustfix_fsck_err(c, "%s", _buf); \ - } else { \ + goto out; \ + } \ + \ + switch (write) { \ + case READ: \ bch_err(c, "%s", _buf); \ \ switch (type) { \ @@ -976,7 +982,17 @@ enum btree_validate_ret { ret = BCH_FSCK_ERRORS_NOT_FIXED; \ goto fsck_err; \ } \ + break; \ + case WRITE: \ + bch_err(c, "corrupt metadata before write: %s", _buf); \ + \ + if (bch2_fs_inconsistent(c)) { \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + } \ + break; \ } \ +out: \ true; \ }) @@ -1323,37 +1339,48 @@ static void btree_node_read_work(struct work_struct *work) struct btree_read_bio *rb = container_of(work, struct btree_read_bio, work); struct bch_fs *c = rb->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); struct btree *b = rb->bio.bi_private; struct bio *bio = &rb->bio; struct bch_devs_mask avoid; + bool can_retry; memset(&avoid, 0, sizeof(avoid)); goto start; - do { + while (1) { bch_info(c, "retrying read"); + ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); bio_reset(bio); - bio_set_dev(bio, rb->pick.ca->disk_sb.bdev); bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); - submit_bio_wait(bio); -start: - bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read"); - percpu_ref_put(&rb->pick.ca->io_ref); - __set_bit(rb->pick.ca->dev_idx, avoid.d); - rb->pick = bch2_btree_pick_ptr(c, b, &avoid); + if (rb->have_ioref) { + bio_set_dev(bio, ca->disk_sb.bdev); + submit_bio_wait(bio); + } else { + bio->bi_status = BLK_STS_REMOVED; + } +start: + bch2_dev_io_err_on(bio->bi_status, ca, "btree read"); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; + + __set_bit(rb->pick.ptr.dev, avoid.d); + can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0; if (!bio->bi_status && - !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca))) - goto out; - } while (!IS_ERR_OR_NULL(rb->pick.ca)); + !bch2_btree_node_read_done(c, b, can_retry)) + break; - set_btree_node_read_error(b); -out: - if (!IS_ERR_OR_NULL(rb->pick.ca)) - percpu_ref_put(&rb->pick.ca->io_ref); + if (!can_retry) { + set_btree_node_read_error(b); + break; + } + } bch2_time_stats_update(&c->btree_read_time, rb->start_time); bio_put(&rb->bio); @@ -1365,10 +1392,13 @@ static void btree_node_read_endio(struct bio *bio) { struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, bio); + struct bch_fs *c = rb->c; - bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ); + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + bch2_latency_acct(ca, rb->start_time, READ); + } - 
INIT_WORK(&rb->work, btree_node_read_work); queue_work(system_unbound_wq, &rb->work); } @@ -1377,41 +1407,58 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, { struct extent_pick_ptr pick; struct btree_read_bio *rb; + struct bch_dev *ca; struct bio *bio; + int ret; trace_btree_read(c, b); - pick = bch2_btree_pick_ptr(c, b, NULL); - if (bch2_fs_fatal_err_on(!pick.ca, c, + ret = bch2_btree_pick_ptr(c, b, NULL, &pick); + if (bch2_fs_fatal_err_on(ret <= 0, c, "btree node read error: no device to read from")) { set_btree_node_read_error(b); return; } + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); rb->pick = pick; - bio_set_dev(bio, pick.ca->disk_sb.bdev); + INIT_WORK(&rb->work, btree_node_read_work); bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); + bio->bi_end_io = btree_node_read_endio; + bio->bi_private = b; bch2_bio_map(bio, b->data); - this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE], - bio_sectors(bio)); - set_btree_node_read_in_flight(b); - if (sync) { - submit_bio_wait(bio); - bio->bi_private = b; - btree_node_read_work(&rb->work); + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE], + bio_sectors(bio)); + bio_set_dev(bio, ca->disk_sb.bdev); + + if (sync) { + submit_bio_wait(bio); + + bio->bi_private = b; + btree_node_read_work(&rb->work); + } else { + submit_bio(bio); + } } else { - bio->bi_end_io = btree_node_read_endio; - bio->bi_private = b; - submit_bio(bio); + bio->bi_status = BLK_STS_REMOVED; + + if (sync) + btree_node_read_work(&rb->work); + else + queue_work(system_unbound_wq, &rb->work); + } } @@ -1593,20 +1640,21 @@ static void btree_node_write_endio(struct bio *bio) struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_write_bio *orig = parent ?: wbio; struct bch_fs *c = wbio->c; - struct bch_dev *ca = wbio->ca; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); unsigned long flags; - bch2_latency_acct(ca, wbio->submit_time_us, WRITE); + if (wbio->have_ioref) + bch2_latency_acct(ca, wbio->submit_time, WRITE); if (bio->bi_status == BLK_STS_REMOVED || bch2_dev_io_err_on(bio->bi_status, ca, "btree write") || bch2_meta_write_fault("btree")) { spin_lock_irqsave(&c->btree_write_error_lock, flags); - bch2_dev_list_add_dev(&orig->failed, ca->dev_idx); + bch2_dev_list_add_dev(&orig->failed, wbio->dev); spin_unlock_irqrestore(&c->btree_write_error_lock, flags); } - if (wbio->have_io_ref) + if (wbio->have_ioref) percpu_ref_put(&ca->io_ref); if (parent) { diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 01df817d..947685f9 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -12,8 +12,8 @@ struct btree_iter; struct btree_read_bio { struct bch_fs *c; - unsigned submit_time_us; u64 start_time; + unsigned have_ioref:1; struct extent_pick_ptr pick; struct work_struct work; struct bio bio; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 465aadba..69cad3bb 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -748,7 +748,9 @@ static void btree_iter_prefetch(struct btree_iter *iter) struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; BKEY_PADDED(k) tmp; - unsigned nr = iter->level > 1 ? 
1 : 8; + unsigned nr = test_bit(BCH_FS_STARTED, &iter->c->flags) + ? (iter->level > 1 ? 0 : 2) + : (iter->level > 1 ? 1 : 16); bool was_locked = btree_node_locked(iter, iter->level); while (nr) { diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 63696920..adba3092 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -12,6 +12,7 @@ #include "buckets.h" #include "extents.h" #include "journal.h" +#include "journal_reclaim.h" #include "keylist.h" #include "replicas.h" #include "super-io.h" diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 53b39de5..92fb5f61 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -8,6 +8,7 @@ #include "debug.h" #include "extents.h" #include "journal.h" +#include "journal_reclaim.h" #include "keylist.h" #include @@ -137,7 +138,7 @@ void bch2_btree_journal_key(struct btree_insert *trans, EBUG_ON(trans->journal_res.ref != !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); - if (likely(trans->journal_res.ref)) { + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { u64 seq = trans->journal_res.seq; bool needs_whiteout = insert->k.needs_whiteout; @@ -155,12 +156,16 @@ void bch2_btree_journal_key(struct btree_insert *trans, btree_bset_last(b)->journal_seq = cpu_to_le64(seq); } - if (unlikely(!journal_pin_active(&w->journal))) - bch2_journal_pin_add(j, &trans->journal_res, - &w->journal, + if (unlikely(!journal_pin_active(&w->journal))) { + u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) + ? trans->journal_res.seq + : j->replay_journal_seq; + + bch2_journal_pin_add(j, seq, &w->journal, btree_node_write_idx(b) == 0 ? btree_node_flush0 : btree_node_flush1); + } if (unlikely(!btree_node_dirty(b))) set_btree_node_dirty(b); diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 399a853c..01f0b314 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -142,7 +142,8 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, u64 total = ca->mi.nbuckets - ca->mi.first_bucket; if (WARN_ONCE(stats.buckets_unavailable > total, - "buckets_unavailable overflow\n")) + "buckets_unavailable overflow (%llu > %llu)\n", + stats.buckets_unavailable, total)) return 0; return total - stats.buckets_unavailable; diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index 7190990d..71f649bc 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -36,6 +36,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) struct btree_node *n_ondisk, *n_sorted, *n_inmemory; struct bset *sorted, *inmemory; struct extent_pick_ptr pick; + struct bch_dev *ca; struct bio *bio; if (c->opts.nochanges) @@ -54,12 +55,15 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) v->btree_id = b->btree_id; bch2_btree_keys_init(v, &c->expensive_debug_checks); - pick = bch2_btree_pick_ptr(c, b, NULL); - if (IS_ERR_OR_NULL(pick.ca)) + if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0) + return; + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + if (!bch2_dev_get_ioref(ca, READ)) return; bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio); - bio_set_dev(bio, pick.ca->disk_sb.bdev); + bio_set_dev(bio, ca->disk_sb.bdev); bio->bi_opf = REQ_OP_READ|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; bio->bi_iter.bi_size = btree_bytes(c); @@ -68,7 +72,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) submit_bio_wait(bio); bio_put(bio); - percpu_ref_put(&pick.ca->io_ref); + 
percpu_ref_put(&ca->io_ref); memcpy(n_ondisk, n_sorted, btree_bytes(c)); diff --git a/libbcachefs/error.c b/libbcachefs/error.c index ca2a06e2..2a357fc3 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -3,20 +3,22 @@ #include "io.h" #include "super.h" -void bch2_inconsistent_error(struct bch_fs *c) +bool bch2_inconsistent_error(struct bch_fs *c) { set_bit(BCH_FS_ERROR, &c->flags); switch (c->opts.errors) { case BCH_ON_ERROR_CONTINUE: - break; + return false; case BCH_ON_ERROR_RO: if (bch2_fs_emergency_read_only(c)) bch_err(c, "emergency read only"); - break; + return true; case BCH_ON_ERROR_PANIC: panic(bch2_fmt(c, "panic after error")); - break; + return true; + default: + BUG(); } } diff --git a/libbcachefs/error.h b/libbcachefs/error.h index ac3e96d2..ababaee0 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -45,13 +45,13 @@ do { \ * BCH_ON_ERROR_CONTINUE mode */ -void bch2_inconsistent_error(struct bch_fs *); +bool bch2_inconsistent_error(struct bch_fs *); #define bch2_fs_inconsistent(c, ...) \ -do { \ +({ \ bch_err(c, __VA_ARGS__); \ bch2_inconsistent_error(c); \ -} while (0) +}) #define bch2_fs_inconsistent_on(cond, c, ...) \ ({ \ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index c5d1e7cb..9efaa1ff 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -588,58 +588,51 @@ out: return out - buf; } -static inline bool dev_latency_better(struct bch_dev *dev1, - struct bch_dev *dev2) +static inline bool dev_latency_better(struct bch_fs *c, + const struct bch_extent_ptr *ptr1, + const struct bch_extent_ptr *ptr2) { - unsigned l1 = atomic_read(&dev1->latency[READ]); - unsigned l2 = atomic_read(&dev2->latency[READ]); + struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev); + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); + u64 l2 = atomic64_read(&dev2->cur_latency[READ]); /* Pick at random, biased in favor of the faster device: */ return bch2_rand_range(l1 + l2) > l1; } -static void extent_pick_read_device(struct bch_fs *c, - struct bkey_s_c_extent e, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *pick) +static int extent_pick_read_device(struct bch_fs *c, + struct bkey_s_c_extent e, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) { const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; + struct bch_dev *ca; + int ret = 0; extent_for_each_ptr_crc(e, ptr, crc) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + ca = bch_dev_bkey_exists(c, ptr->dev); if (ptr->cached && ptr_stale(ca, ptr)) continue; - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) + if (avoid && test_bit(ptr->dev, avoid->d)) continue; - if (avoid) { - if (test_bit(ca->dev_idx, avoid->d)) - continue; - - if (pick->ca && - test_bit(pick->ca->dev_idx, avoid->d)) - goto use; - } - - if (pick->ca && !dev_latency_better(ca, pick->ca)) + if (ret && !dev_latency_better(c, ptr, &pick->ptr)) continue; -use: - if (!percpu_ref_tryget(&ca->io_ref)) - continue; - - if (pick->ca) - percpu_ref_put(&pick->ca->io_ref); *pick = (struct extent_pick_ptr) { .ptr = *ptr, .crc = crc, - .ca = ca, }; + + ret = 1; } + + return ret; } /* Btree ptrs */ @@ -759,16 +752,12 @@ void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, #undef p } -struct extent_pick_ptr -bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, - struct bch_devs_mask *avoid) +int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) { - struct 
extent_pick_ptr pick = { .ca = NULL }; - - extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), - avoid, &pick); - - return pick; + return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), + avoid, pick); } /* Extents */ @@ -2057,37 +2046,33 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to * other devices, it will still pick a pointer from avoid. */ -void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, - struct bch_devs_mask *avoid, - struct extent_pick_ptr *ret) +int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) { - struct bkey_s_c_extent e; + int ret; switch (k.k->type) { case KEY_TYPE_DELETED: case KEY_TYPE_DISCARD: case KEY_TYPE_COOKIE: - ret->ca = NULL; - return; + return 0; case KEY_TYPE_ERROR: - ret->ca = ERR_PTR(-EIO); - return; + return -EIO; case BCH_EXTENT: case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - ret->ca = NULL; + ret = extent_pick_read_device(c, bkey_s_c_to_extent(k), + avoid, pick); - extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret); + if (!ret && !bkey_extent_is_cached(k.k)) + ret = -EIO; - if (!ret->ca && !bkey_extent_is_cached(e.k)) - ret->ca = ERR_PTR(-EIO); - return; + return ret; case BCH_RESERVATION: - ret->ca = NULL; - return; + return 0; default: BUG(); diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 8dc15484..338e9e01 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -53,13 +53,13 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct btree *, struct btree_node_iter_large *); -struct extent_pick_ptr -bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, - struct bch_devs_mask *avoid); +int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *); -void bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, - struct bch_devs_mask *, - struct extent_pick_ptr *); +int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_devs_mask *, + struct extent_pick_ptr *); enum btree_insert_ret bch2_insert_fixup_extent(struct btree_insert *, diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h index 15805cd2..76139f93 100644 --- a/libbcachefs/extents_types.h +++ b/libbcachefs/extents_types.h @@ -21,7 +21,6 @@ struct bch_extent_crc_unpacked { struct extent_pick_ptr { struct bch_extent_ptr ptr; struct bch_extent_crc_unpacked crc; - struct bch_dev *ca; }; #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index d1473f2a..a2455b42 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -124,13 +125,13 @@ static void bch2_quota_reservation_put(struct bch_fs *c, if (!res->sectors) return; - mutex_lock(&inode->ei_update_lock); + mutex_lock(&inode->ei_quota_lock); BUG_ON(res->sectors > inode->ei_quota_reserved); bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) res->sectors), BCH_QUOTA_PREALLOC); inode->ei_quota_reserved -= res->sectors; - mutex_unlock(&inode->ei_update_lock); + mutex_unlock(&inode->ei_quota_lock); res->sectors = 0; } @@ -143,14 +144,14 @@ static int bch2_quota_reservation_add(struct bch_fs *c, { int ret; - mutex_lock(&inode->ei_update_lock); + mutex_lock(&inode->ei_quota_lock); ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, check_enospc ? 
BCH_QUOTA_PREALLOC : BCH_QUOTA_NOCHECK); if (likely(!ret)) { inode->ei_quota_reserved += sectors; res->sectors += sectors; } - mutex_unlock(&inode->ei_update_lock); + mutex_unlock(&inode->ei_quota_lock); return ret; } @@ -195,9 +196,10 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c, return __bch2_write_inode(c, inode, inode_set_size, &new_size); } -static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, int sectors) +static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, int sectors) { + mutex_lock(&inode->ei_quota_lock); #ifdef CONFIG_BCACHEFS_QUOTA if (quota_res && sectors > 0) { BUG_ON(sectors > quota_res->sectors); @@ -210,14 +212,7 @@ static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, } #endif inode->v.i_blocks += sectors; -} - -static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, - struct quota_res *quota_res, int sectors) -{ - mutex_lock(&inode->ei_update_lock); - __i_sectors_acct(c, inode, quota_res, sectors); - mutex_unlock(&inode->ei_update_lock); + mutex_unlock(&inode->ei_quota_lock); } /* i_sectors accounting: */ @@ -265,7 +260,7 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h) if (h->new_i_size != U64_MAX) i_size_write(&h->inode->v, h->new_i_size); - __i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); + i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h); mutex_unlock(&h->inode->ei_update_lock); @@ -773,6 +768,7 @@ void bch2_invalidatepage(struct page *page, unsigned int offset, int bch2_releasepage(struct page *page, gfp_t gfp_mask) { + /* XXX: this can't take locks that are held while we allocate memory */ EBUG_ON(!PageLocked(page)); EBUG_ON(PageWriteback(page)); @@ -881,10 +877,12 @@ static int readpage_add_page(struct readpages_iter *iter, struct page *page) int ret; prefetchw(&page->flags); - page_state_init_for_read(page); ret = add_to_page_cache_lru(page, iter->mapping, page->index, GFP_NOFS); + if (!ret) + page_state_init_for_read(page); + put_page(page); return ret; } @@ -992,12 +990,13 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; + rbio->c = c; + rbio->start_time = local_clock(); + while (1) { - struct extent_pick_ptr pick; BKEY_PADDED(k) tmp; struct bkey_s_c k; unsigned bytes; - bool is_last; bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector)); @@ -1016,45 +1015,37 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, bch2_btree_iter_unlock(iter); k = bkey_i_to_s_c(&tmp.k); - bch2_extent_pick_ptr(c, k, NULL, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, bio, "no device to read from"); - bio_endio(bio); - return; - } + if (readpages_iter) { + bool want_full_extent = false; + + if (bkey_extent_is_data(k.k)) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + + extent_for_each_ptr_crc(e, ptr, crc) + want_full_extent |= !!crc.csum_type | + !!crc.compression_type; + } - if (readpages_iter) readpage_bio_extend(readpages_iter, bio, k.k->p.offset, - pick.ca && - (pick.crc.csum_type || - pick.crc.compression_type)); + want_full_extent); + } bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) - bio->bi_iter.bi_sector) << 9; - is_last = bytes == bio->bi_iter.bi_size; swap(bio->bi_iter.bi_size, bytes); + 
if (bytes == bio->bi_iter.bi_size) + flags |= BCH_READ_LAST_FRAGMENT; + if (bkey_extent_is_allocation(k.k)) bch2_add_page_sectors(bio, k); - if (pick.ca) { - if (!is_last) { - bio_inc_remaining(&rbio->bio); - flags |= BCH_READ_MUST_CLONE; - trace_read_split(&rbio->bio); - } + bch2_read_extent(c, rbio, k, flags); - bch2_read_extent(c, rbio, bkey_s_c_to_extent(k), - &pick, flags); - } else { - zero_fill_bio(bio); - - if (is_last) - bio_endio(bio); - } - - if (is_last) + if (flags & BCH_READ_LAST_FRAGMENT) return; swap(bio->bi_iter.bi_size, bytes); @@ -1487,6 +1478,194 @@ int bch2_write_end(struct file *file, struct address_space *mapping, return copied; } +#define WRITE_BATCH_PAGES 32 + +static int __bch2_buffered_write(struct bch_inode_info *inode, + struct address_space *mapping, + struct iov_iter *iter, + loff_t pos, unsigned len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct page *pages[WRITE_BATCH_PAGES]; + unsigned long index = pos >> PAGE_SHIFT; + unsigned offset = pos & (PAGE_SIZE - 1); + unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + unsigned i, copied = 0, nr_pages_copied = 0; + int ret = 0; + + BUG_ON(!len); + BUG_ON(nr_pages > ARRAY_SIZE(pages)); + + for (i = 0; i < nr_pages; i++) { + pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); + if (!pages[i]) { + nr_pages = i; + ret = -ENOMEM; + goto out; + } + } + + if (offset && !PageUptodate(pages[0])) { + ret = bch2_read_single_page(pages[0], mapping); + if (ret) + goto out; + } + + if ((pos + len) & (PAGE_SIZE - 1) && + !PageUptodate(pages[nr_pages - 1])) { + if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { + zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); + } else { + ret = bch2_read_single_page(pages[nr_pages - 1], mapping); + if (ret) + goto out; + } + } + + for (i = 0; i < nr_pages; i++) { + ret = bch2_get_page_reservation(c, inode, pages[i], true); + + if (ret && !PageUptodate(pages[i])) { + ret = bch2_read_single_page(pages[i], mapping); + if (ret) + goto out; + + ret = bch2_get_page_reservation(c, inode, pages[i], true); + } + + if (ret) + goto out; + } + + if (mapping_writably_mapped(mapping)) + for (i = 0; i < nr_pages; i++) + flush_dcache_page(pages[i]); + + while (copied < len) { + struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; + unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); + unsigned pg_bytes = min_t(unsigned, len - copied, + PAGE_SIZE - pg_offset); + unsigned pg_copied = iov_iter_copy_from_user_atomic(page, + iter, pg_offset, pg_bytes); + + if (!pg_copied) + break; + + flush_dcache_page(page); + iov_iter_advance(iter, pg_copied); + copied += pg_copied; + } + + if (!copied) + goto out; + + nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); + inode->ei_last_dirtied = (unsigned long) current; + + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); + + if (copied < len && + ((offset + copied) & (PAGE_SIZE - 1))) { + struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; + + if (!PageUptodate(page)) { + zero_user(page, 0, PAGE_SIZE); + copied -= (offset + copied) & (PAGE_SIZE - 1); + } + } +out: + for (i = 0; i < nr_pages_copied; i++) { + if (!PageUptodate(pages[i])) + SetPageUptodate(pages[i]); + if (!PageDirty(pages[i])) + set_page_dirty(pages[i]); + unlock_page(pages[i]); + put_page(pages[i]); + } + + for (i = nr_pages_copied; i < nr_pages; i++) { + if (!PageDirty(pages[i])) + bch2_put_page_reservation(c, inode, pages[i]); + unlock_page(pages[i]); + put_page(pages[i]); + } + + return copied ?: ret; +} + 
+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; + + pagecache_add_get(&mapping->add_lock); + + do { + unsigned offset = pos & (PAGE_SIZE - 1); + unsigned bytes = min_t(unsigned long, iov_iter_count(iter), + PAGE_SIZE * WRITE_BATCH_PAGES - offset); +again: + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { + bytes = min_t(unsigned long, iov_iter_count(iter), + PAGE_SIZE - offset); + + if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { + ret = -EFAULT; + break; + } + } + + if (unlikely(fatal_signal_pending(current))) { + ret = -EINTR; + break; + } + + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + if (unlikely(ret < 0)) + break; + + cond_resched(); + + if (unlikely(ret == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(iter)); + goto again; + } + pos += ret; + written += ret; + + balance_dirty_pages_ratelimited(mapping); + } while (iov_iter_count(iter)); + + pagecache_add_put(&mapping->add_lock); + + return written ? written : ret; +} + /* O_DIRECT reads */ static void bch2_dio_read_complete(struct closure *cl) @@ -1822,7 +2001,7 @@ static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = iocb->ki_flags & IOCB_DIRECT ? 
bch2_direct_write(iocb, from) - : generic_perform_write(file, from, iocb->ki_pos); + : bch2_buffered_write(iocb, from); if (likely(ret > 0)) iocb->ki_pos += ret; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index c7e842ee..fb30f0d9 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1028,6 +1028,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); + mutex_init(&inode->ei_quota_lock); inode->ei_journal_seq = 0; return &inode->v; diff --git a/libbcachefs/fs.h b/libbcachefs/fs.h index fddfb2d2..fbbc7a3a 100644 --- a/libbcachefs/fs.h +++ b/libbcachefs/fs.h @@ -15,6 +15,8 @@ struct bch_inode_info { u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; + + struct mutex ei_quota_lock; struct bch_qid ei_qid; struct bch_hash_info ei_str_hash; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 27e45081..bb656522 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -14,6 +14,7 @@ #include "compress.h" #include "clock.h" #include "debug.h" +#include "disk_groups.h" #include "error.h" #include "extents.h" #include "io.h" @@ -30,14 +31,71 @@ #include -/* Allocate, free from mempool: */ - -void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw) +static bool bch2_target_congested(struct bch_fs *c, u16 target) { + const struct bch_devs_mask *devs; + unsigned d, nr = 0, total = 0; + u64 now = local_clock(), last; + s64 congested; + struct bch_dev *ca; + + if (!target) + return false; + + rcu_read_lock(); + devs = bch2_target_to_mask(c, target); + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { + ca = rcu_dereference(c->devs[d]); + if (!ca) + continue; + + congested = atomic_read(&ca->congested); + last = READ_ONCE(ca->congested_last); + if (time_after64(now, last)) + congested -= (now - last) >> 12; + + total += max(congested, 0LL); + nr++; + } + rcu_read_unlock(); + + return bch2_rand_range(nr * CONGESTED_MAX) < total; +} + +static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, + u64 now, int rw) +{ + u64 latency_capable = + ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; + /* ideally we'd be taking into account the device's variance here: */ + u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); + s64 latency_over = io_latency - latency_threshold; + + if (latency_threshold && latency_over > 0) { + /* + * bump up congested by approximately latency_over * 4 / + * latency_threshold - we don't need much accuracy here so don't + * bother with the divide: + */ + if (atomic_read(&ca->congested) < CONGESTED_MAX) + atomic_add(latency_over >> + max_t(int, ilog2(latency_threshold) - 2, 0), + &ca->congested); + + ca->congested_last = now; + } else if (atomic_read(&ca->congested) > 0) { + atomic_dec(&ca->congested); + } +} + +void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) +{ + atomic64_t *latency = &ca->cur_latency[rw]; u64 now = local_clock(); - unsigned io_latency = (now >> 10) - submit_time_us; - atomic_t *latency = &ca->latency[rw]; - unsigned old, new, v = atomic_read(latency); + u64 io_latency = time_after64(now, submit_time) + ? 
now - submit_time + : 0; + u64 old, new, v = atomic64_read(latency); do { old = v; @@ -51,10 +109,16 @@ void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw) now & ~(~0 << 5)) break; - new = ewma_add((u64) old, io_latency, 6); - } while ((v = atomic_cmpxchg(latency, old, new)) != old); + new = ewma_add(old, io_latency, 5); + } while ((v = atomic64_cmpxchg(latency, old, new)) != old); + + bch2_congested_acct(ca, io_latency, now, rw); + + __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); } +/* Allocate, free from mempool: */ + void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { struct bio_vec *bv; @@ -169,22 +233,21 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, } n->c = c; - n->ca = ca; - n->submit_time_us = local_clock_us(); + n->dev = ptr->dev; + n->have_ioref = bch2_dev_get_ioref(ca, WRITE); + n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; if (!journal_flushes_device(ca)) n->bio.bi_opf |= REQ_FUA; - if (likely(percpu_ref_tryget(&ca->io_ref))) { + if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); - n->have_io_ref = true; bio_set_dev(&n->bio, ca->disk_sb.bdev); submit_bio(&n->bio); } else { - n->have_io_ref = false; n->bio.bi_status = BLK_STS_REMOVED; bio_endio(&n->bio); } @@ -196,15 +259,18 @@ static void __bch2_write(struct closure *); static void bch2_write_done(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; if (!op->error && (op->flags & BCH_WRITE_FLUSH)) - op->error = bch2_journal_error(&op->c->journal); + op->error = bch2_journal_error(&c->journal); if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) - bch2_disk_reservation_put(op->c, &op->res); - percpu_ref_put(&op->c->writes); + bch2_disk_reservation_put(c, &op->res); + percpu_ref_put(&c->writes); bch2_keylist_free(&op->insert_keys, op->inline_keys); + bch2_time_stats_update(&c->data_write_time, op->start_time); + closure_return(cl); } @@ -318,15 +384,15 @@ static void bch2_write_endio(struct bio *bio) struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; struct bch_fs *c = wbio->c; - struct bch_dev *ca = wbio->ca; - - bch2_latency_acct(ca, wbio->submit_time_us, WRITE); + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) - set_bit(ca->dev_idx, op->failed.d); + set_bit(wbio->dev, op->failed.d); - if (wbio->have_io_ref) + if (wbio->have_ioref) { + bch2_latency_acct(ca, wbio->submit_time, WRITE); percpu_ref_put(&ca->io_ref); + } if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -821,6 +887,8 @@ void bch2_write(struct closure *cl) BUG_ON(!bkey_cmp(op->pos, POS_MAX)); BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX); + op->start_time = local_clock(); + memset(&op->failed, 0, sizeof(op->failed)); bch2_keylist_init(&op->insert_keys, op->inline_keys); @@ -844,92 +912,27 @@ void bch2_write(struct closure *cl) struct promote_op { struct closure cl; + u64 start_time; + + struct rhash_head hash; + struct bpos pos; + struct migrate_write write; struct bio_vec bi_inline_vecs[0]; /* must be last */ }; -static void promote_done(struct closure *cl) +static const struct rhashtable_params bch_promote_params = { + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), +}; + +static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, + unsigned flags) { - struct promote_op *op = - container_of(cl, struct promote_op, cl); - struct bch_fs *c = op->write.op.c; - - percpu_ref_put(&c->writes); - bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); - kfree(op); -} - -static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) -{ - struct bch_fs *c = rbio->c; - struct closure *cl = &op->cl; - struct bio *bio = &op->write.op.wbio.bio; - - BUG_ON(!rbio->split || !rbio->bounce); - - if (!percpu_ref_tryget(&c->writes)) - return; - - trace_promote(&rbio->bio); - - /* we now own pages: */ - BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); - swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - rbio->promote = NULL; - - bch2_migrate_read_done(&op->write, rbio); - - closure_init(cl, NULL); - closure_call(&op->write.op.cl, bch2_write, c->wq, cl); - closure_return_with_destructor(cl, promote_done); -} - -/* - * XXX: multiple promotes can race with each other, wastefully. Keep a list of - * outstanding promotes? 
- */ -static struct promote_op *promote_alloc(struct bch_read_bio *rbio, - struct bkey_s_c k) -{ - struct bch_fs *c = rbio->c; - struct promote_op *op; - struct bio *bio; - /* data might have to be decompressed in the write path: */ - unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size, - PAGE_SECTORS); - int ret; - - BUG_ON(!rbio->bounce); - BUG_ON(pages < rbio->bio.bi_vcnt); - - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, - GFP_NOIO); - if (!op) - return NULL; - - bio = &op->write.op.wbio.bio; - bio_init(bio, bio->bi_inline_vecs, pages); - - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - - ret = bch2_migrate_write_init(c, &op->write, - writepoint_hashed((unsigned long) current), - rbio->opts, - DATA_PROMOTE, - (struct data_opts) { - .target = rbio->opts.promote_target - }, - k); - BUG_ON(ret); - - return op; -} - -static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e, - unsigned flags, u16 target) -{ - if (!target) + if (!opts.promote_target) return false; if (!(flags & BCH_READ_MAY_PROMOTE)) @@ -938,15 +941,182 @@ static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e, if (percpu_ref_is_dying(&c->writes)) return false; - return bch2_extent_has_target(c, e, target) == NULL; + if (!bkey_extent_is_data(k.k)) + return false; + + if (bch2_extent_has_target(c, bkey_s_c_to_extent(k), opts.promote_target)) + return false; + + if (bch2_target_congested(c, opts.promote_target)) + return false; + + if (rhashtable_lookup_fast(&c->promote_table, &pos, + bch_promote_params)) + return false; + + return true; +} + +static void promote_free(struct bch_fs *c, struct promote_op *op) +{ + int ret; + + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + percpu_ref_put(&c->writes); + kfree(op); +} + +static void promote_done(struct closure *cl) +{ + struct promote_op *op = + container_of(cl, struct promote_op, cl); + struct bch_fs *c = op->write.op.c; + + bch2_time_stats_update(&c->data_promote_time, op->start_time); + + bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); + promote_free(c, op); +} + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + struct closure *cl = &op->cl; + struct bio *bio = &op->write.op.wbio.bio; + + trace_promote(&rbio->bio); + + /* we now own pages: */ + BUG_ON(!rbio->bounce); + BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); + + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + + bch2_migrate_read_done(&op->write, rbio); + + closure_init(cl, NULL); + closure_call(&op->write.op.cl, bch2_write, c->wq, cl); + closure_return_with_destructor(cl, promote_done); +} + +noinline +static struct promote_op *__promote_alloc(struct bch_fs *c, + struct bpos pos, + struct extent_pick_ptr *pick, + struct bch_io_opts opts, + unsigned rbio_sectors, + struct bch_read_bio **rbio) +{ + struct promote_op *op = NULL; + struct bio *bio; + unsigned rbio_pages = DIV_ROUND_UP(rbio_sectors, PAGE_SECTORS); + /* data might have to be decompressed in the write path: */ + unsigned wbio_pages = DIV_ROUND_UP(pick->crc.uncompressed_size, + PAGE_SECTORS); + int ret; + + if (!percpu_ref_tryget(&c->writes)) + return NULL; + + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * wbio_pages, + GFP_NOIO); + if (!op) + goto err; + + op->start_time = local_clock(); + op->pos = pos; + + /* + * promotes require bouncing, but if the 
extent isn't + * checksummed/compressed it might be too big for the mempool: + */ + if (rbio_sectors > c->sb.encoded_extent_max) { + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * rbio_pages, + GFP_NOIO); + if (!*rbio) + goto err; + + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, + rbio_pages); + + (*rbio)->bio.bi_iter.bi_size = rbio_sectors << 9; + bch2_bio_map(&(*rbio)->bio, NULL); + + if (bio_alloc_pages(&(*rbio)->bio, GFP_NOIO)) + goto err; + + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; + } + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) + goto err; + + bio = &op->write.op.wbio.bio; + bio_init(bio, bio->bi_inline_vecs, wbio_pages); + + ret = bch2_migrate_write_init(c, &op->write, + writepoint_hashed((unsigned long) current), + opts, + DATA_PROMOTE, + (struct data_opts) { + .target = opts.promote_target + }, + bkey_s_c_null); + BUG_ON(ret); + + return op; +err: + if (*rbio) + bio_free_pages(&(*rbio)->bio); + kfree(*rbio); + *rbio = NULL; + kfree(op); + percpu_ref_put(&c->writes); + return NULL; +} + +static inline struct promote_op *promote_alloc(struct bch_fs *c, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_pick_ptr *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) +{ + bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + unsigned sectors = promote_full + ? pick->crc.compressed_size + : bvec_iter_sectors(iter); + struct bpos pos = promote_full + ? bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); + struct promote_op *promote; + + if (!should_promote(c, k, pos, opts, flags)) + return NULL; + + promote = __promote_alloc(c, pos, pick, opts, sectors, rbio); + if (!promote) + return NULL; + + *bounce = true; + *read_full = promote_full; + return promote; } /* Read */ -static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *, - struct bvec_iter, u64, - struct bch_devs_mask *, unsigned); - #define READ_RETRY_AVOID 1 #define READ_RETRY 2 #define READ_ERR 3 @@ -979,38 +1149,144 @@ static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { - struct bch_read_bio *parent = rbio->parent; - - BUG_ON(!rbio->split); + BUG_ON(rbio->bounce && !rbio->split); if (rbio->promote) - kfree(rbio->promote); + promote_free(rbio->c, rbio->promote); + rbio->promote = NULL; + if (rbio->bounce) bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - bio_put(&rbio->bio); - return parent; + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + + if (rbio->kmalloc) + kfree(rbio); + else + bio_put(&rbio->bio); + + rbio = parent; + } + + return rbio; } static void bch2_rbio_done(struct bch_read_bio *rbio) { - if (rbio->promote) - kfree(rbio->promote); - rbio->promote = NULL; - - if (rbio->split) - rbio = bch2_rbio_free(rbio); + bch2_time_stats_update(&rbio->c->data_read_time, rbio->start_time); bio_endio(&rbio->bio); } +static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) +{ + struct btree_iter iter; + BKEY_PADDED(k) tmp; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + rbio->pos, BTREE_ITER_SLOTS); +retry: + rbio->bio.bi_status = 0; + + k = 
bch2_btree_iter_peek_slot(&iter); + if (btree_iter_err(k)) { + bch2_btree_iter_unlock(&iter); + goto err; + } + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + if (!bkey_extent_is_data(k.k) || + !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + + ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) + goto err; + goto out; +err: + rbio->bio.bi_status = BLK_STS_IOERR; +out: + bch2_rbio_done(rbio); +} + +static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; +retry: + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_SLOTS, k) { + BKEY_PADDED(k) tmp; + unsigned bytes; + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + bytes = min_t(unsigned, bvec_iter.bi_size, + (k.k->p.offset - bvec_iter.bi_sector) << 9); + swap(bvec_iter.bi_size, bytes); + + ret = __bch2_read_extent(c, rbio, bvec_iter, k, avoid, flags); + switch (ret) { + case READ_RETRY: + goto retry; + case READ_ERR: + goto err; + }; + + if (bytes == bvec_iter.bi_size) + goto out; + + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + } + + /* + * If we get here, it better have been because there was an error + * reading a btree node + */ + ret = bch2_btree_iter_unlock(&iter); + BUG_ON(!ret); + __bcache_io_error(c, "btree IO error %i", ret); +err: + rbio->bio.bi_status = BLK_STS_IOERR; +out: + bch2_rbio_done(rbio); +} + static void bch2_rbio_retry(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bvec_iter iter = rbio->bvec_iter; - unsigned flags = rbio->flags; - u64 inode = rbio->pos.inode; + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + u64 inode = rbio->pos.inode; struct bch_devs_mask avoid; trace_read_retry(&rbio->bio); @@ -1018,26 +1294,19 @@ static void bch2_rbio_retry(struct work_struct *work) memset(&avoid, 0, sizeof(avoid)); if (rbio->retry == READ_RETRY_AVOID) - __set_bit(rbio->pick.ca->dev_idx, avoid.d); + __set_bit(rbio->pick.ptr.dev, avoid.d); - if (rbio->promote) - kfree(rbio->promote); - rbio->promote = NULL; + rbio->bio.bi_status = 0; - if (rbio->split) - rbio = bch2_rbio_free(rbio); - else - rbio->bio.bi_status = 0; + rbio = bch2_rbio_free(rbio); - if (!(flags & BCH_READ_NODECODE)) - flags |= BCH_READ_MUST_CLONE; flags |= BCH_READ_IN_RETRY; flags &= ~BCH_READ_MAY_PROMOTE; if (flags & BCH_READ_NODECODE) - bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry_nodecode(c, rbio, iter, inode, &avoid, flags); else - __bch2_read(c, rbio, iter, inode, &avoid, flags); + bch2_read_retry(c, rbio, iter, inode, &avoid, flags); } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, @@ -1049,7 +1318,9 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, return; if (retry == READ_ERR) { - bch2_rbio_parent(rbio)->bio.bi_status = error; + rbio = bch2_rbio_free(rbio); + + rbio->bio.bi_status = error; 
bch2_rbio_done(rbio); } else { bch2_rbio_punt(rbio, bch2_rbio_retry, @@ -1121,12 +1392,13 @@ out: bch2_btree_iter_unlock(&iter); } -static bool should_narrow_crcs(struct bkey_s_c_extent e, +static bool should_narrow_crcs(struct bkey_s_c k, struct extent_pick_ptr *pick, unsigned flags) { return !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(e, pick->crc); + bkey_extent_is_data(k.k) && + bch2_can_narrow_extent_crcs(bkey_s_c_to_extent(k), pick->crc); } /* Inner part that may run in process context */ @@ -1134,8 +1406,10 @@ static void __bch2_read_endio(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); @@ -1191,10 +1465,13 @@ static void __bch2_read_endio(struct work_struct *work) */ bch2_encrypt_bio(c, crc.csum_type, nonce, src); promote_start(rbio->promote, rbio); + rbio->promote = NULL; } nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); + } return; csum_err: /* @@ -1208,7 +1485,7 @@ csum_err: return; } - bch2_dev_io_error(rbio->pick.ca, + bch2_dev_io_error(ca, "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, @@ -1227,25 +1504,27 @@ static void bch2_read_endio(struct bio *bio) { struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; - bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ); - - percpu_ref_put(&rbio->pick.ca->io_ref); + if (rbio->have_ioref) { + bch2_latency_acct(ca, rbio->submit_time, READ); + percpu_ref_put(&ca->io_ref); + } if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_io_err_on(bio->bi_status, rbio->pick.ca, "data read")) { + if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) { bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } if (rbio->pick.ptr.cached && (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) { + ptr_stale(ca, &rbio->pick.ptr))) { atomic_long_inc(&c->read_realloc_races); if (rbio->flags & BCH_READ_RETRY_IF_STALE) @@ -1266,76 +1545,97 @@ static void bch2_read_endio(struct bio *bio) } int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c_extent e, - struct extent_pick_ptr *pick, unsigned flags) + struct bvec_iter iter, struct bkey_s_c k, + struct bch_devs_mask *avoid, unsigned flags) { - struct bch_read_bio *rbio; - bool split = false, bounce = false, read_full = false; - bool promote = false, narrow_crcs = false; - struct bpos pos = bkey_start_pos(e.k); - int ret = 0; + struct extent_pick_ptr pick; + struct bch_read_bio *rbio = NULL; + struct bch_dev *ca; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, 
narrow_crcs = false; + struct bpos pos = bkey_start_pos(k.k); + int pick_ret; - lg_local_lock(&c->usage_lock); - bucket_io_clock_reset(c, pick->ca, - PTR_BUCKET_NR(pick->ca, &pick->ptr), READ); - lg_local_unlock(&c->usage_lock); + pick_ret = bch2_extent_pick_ptr(c, k, avoid, &pick); - narrow_crcs = should_narrow_crcs(e, pick, flags); + /* hole or reservation - just zero fill: */ + if (!pick_ret) + goto hole; + + if (pick_ret < 0) + goto no_device; + + if (pick_ret > 0) + ca = bch_dev_bkey_exists(c, pick.ptr.dev); if (flags & BCH_READ_NODECODE) { - BUG_ON(iter.bi_size < pick->crc.compressed_size << 9); - iter.bi_size = pick->crc.compressed_size << 9; + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + goto hole; + + iter.bi_sector = pos.offset; + iter.bi_size = pick.crc.compressed_size << 9; goto noclone; } + if (!(flags & BCH_READ_LAST_FRAGMENT) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_MUST_CLONE; + + narrow_crcs = should_narrow_crcs(k, &pick, flags); + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) flags |= BCH_READ_MUST_BOUNCE; - EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector || - e.k->p.offset < bvec_iter_end_sector(iter)); + EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || + k.k->p.offset < bvec_iter_end_sector(iter)); - if (pick->crc.compression_type != BCH_COMPRESSION_NONE || - (pick->crc.csum_type != BCH_CSUM_NONE && - (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick->crc.csum_type) && + if (pick.crc.compression_type != BCH_COMPRESSION_NONE || + (pick.crc.csum_type != BCH_CSUM_NONE && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || (flags & BCH_READ_MUST_BOUNCE)))) { read_full = true; bounce = true; } - promote = should_promote(c, e, flags, orig->opts.promote_target); - /* could also set read_full */ - if (promote) - bounce = true; + promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); if (!read_full) { - EBUG_ON(pick->crc.compression_type); - EBUG_ON(pick->crc.csum_type && - (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || - bvec_iter_sectors(iter) != pick->crc.live_size || - pick->crc.offset || + EBUG_ON(pick.crc.compression_type); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || + pick.crc.offset || iter.bi_sector != pos.offset)); - pick->ptr.offset += pick->crc.offset + + pick.ptr.offset += pick.crc.offset + (iter.bi_sector - pos.offset); - pick->crc.compressed_size = bvec_iter_sectors(iter); - pick->crc.uncompressed_size = bvec_iter_sectors(iter); - pick->crc.offset = 0; - pick->crc.live_size = bvec_iter_sectors(iter); + pick.crc.compressed_size = bvec_iter_sectors(iter); + pick.crc.uncompressed_size = bvec_iter_sectors(iter); + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); pos.offset = iter.bi_sector; } - if (bounce) { - unsigned sectors = pick->crc.compressed_size; + if (rbio) { + /* promote already allocated bounce rbio */ + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - &c->bio_read_split), + DIV_ROUND_UP(sectors, PAGE_SECTORS), + &c->bio_read_split), orig->opts); bch2_bio_alloc_pages_pool(c, &rbio->bio, 
sectors << 9); - split = true; + rbio->bounce = true; + rbio->split = true; } else if (flags & BCH_READ_MUST_CLONE) { /* * Have to clone if there were any splits, due to error @@ -1349,156 +1649,138 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, &c->bio_read_split), orig->opts); rbio->bio.bi_iter = iter; - split = true; + rbio->split = true; } else { noclone: rbio = orig; rbio->bio.bi_iter = iter; - split = false; BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } - BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size); + BUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); rbio->c = c; - if (split) + rbio->submit_time = local_clock(); + if (rbio->split) rbio->parent = orig; else rbio->end_io = orig->bio.bi_end_io; rbio->bvec_iter = iter; - rbio->submit_time_us = local_clock_us(); rbio->flags = flags; - rbio->bounce = bounce; - rbio->split = split; + rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); rbio->narrow_crcs = narrow_crcs; + rbio->hole = 0; rbio->retry = 0; rbio->context = 0; - rbio->devs_have = bch2_extent_devs(e); - rbio->pick = *pick; + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; rbio->pos = pos; - rbio->version = e.k->version; - rbio->promote = promote ? promote_alloc(rbio, e.s_c) : NULL; + rbio->version = k.k->version; + rbio->promote = promote; INIT_WORK(&rbio->work, NULL); - bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev); rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick->ptr.offset; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; - if (bounce) + if (rbio->bounce) trace_read_bounce(&rbio->bio); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER], + + if (!rbio->have_ioref) + goto no_device_postclone; + + lg_local_lock(&c->usage_lock); + bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); + lg_local_unlock(&c->usage_lock); + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + if (likely(!(flags & BCH_READ_IN_RETRY))) { + if (!(flags & BCH_READ_LAST_FRAGMENT)) { + bio_inc_remaining(&orig->bio); + trace_read_split(&orig->bio); + } + submit_bio(&rbio->bio); + return 0; } else { + int ret; + submit_bio_wait(&rbio->bio); rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); ret = rbio->retry; - if (rbio->split) - rbio = bch2_rbio_free(rbio); - if (!ret) - bch2_rbio_done(rbio); + rbio = bch2_rbio_free(rbio); + + if (ret == READ_RETRY_AVOID) { + __set_bit(pick.ptr.dev, avoid->d); + ret = READ_RETRY; + } + + return ret; } - return ret; -} +no_device_postclone: + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + bch2_rbio_free(rbio); +no_device: + __bcache_io_error(c, "no device to read from"); -static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) -{ - struct extent_pick_ptr pick; - struct btree_iter iter; - BKEY_PADDED(k) tmp; - struct bkey_s_c k; - int ret; + if (likely(!(flags & BCH_READ_IN_RETRY))) { + orig->bio.bi_status = BLK_STS_IOERR; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS); -retry: - k = bch2_btree_iter_peek_slot(&iter); - if (btree_iter_err(k)) { - bch2_btree_iter_unlock(&iter); - goto err; + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; + } else { + return 
READ_ERR; } - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); - - if (!bkey_extent_is_data(k.k) || - !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), - rbio->pick.ptr, - rbio->pos.offset - - rbio->pick.crc.offset) || - bkey_start_offset(k.k) != bvec_iter.bi_sector) - goto err; - - bch2_extent_pick_ptr(c, k, avoid, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, &rbio->bio, "no device to read from"); - bio_endio(&rbio->bio); - return; - } - - if (!pick.ca) - goto err; - - if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) { - percpu_ref_put(&pick.ca->io_ref); - goto err; - - } - - ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k), - &pick, flags); - switch (ret) { - case READ_RETRY_AVOID: - __set_bit(pick.ca->dev_idx, avoid->d); - case READ_RETRY: - goto retry; - case READ_ERR: - bio_endio(&rbio->bio); - return; - }; - - return; -err: +hole: /* - * extent we wanted to read no longer exists, or - * was merged or partially overwritten (and thus - * possibly bigger than the memory that was - * originally allocated) + * won't normally happen in the BCH_READ_NODECODE + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: */ - rbio->bio.bi_status = BLK_STS_AGAIN; - bio_endio(&rbio->bio); - return; + if (flags & BCH_READ_NODECODE) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); + + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; } -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) +void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) { struct btree_iter iter; struct bkey_s_c k; + unsigned flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED; int ret; - EBUG_ON(flags & BCH_READ_NODECODE); -retry: + BUG_ON(rbio->_state); + BUG_ON(flags & BCH_READ_NODECODE); + BUG_ON(flags & BCH_READ_IN_RETRY); + + rbio->c = c; + rbio->start_time = local_clock(); + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), + POS(inode, rbio->bio.bi_iter.bi_sector), BTREE_ITER_SLOTS, k) { BKEY_PADDED(k) tmp; - struct extent_pick_ptr pick; - struct bvec_iter fragment; + unsigned bytes; /* * Unlock the iterator while the btree node's lock is still in @@ -1508,49 +1790,20 @@ retry: k = bkey_i_to_s_c(&tmp.k); bch2_btree_iter_unlock(&iter); - bch2_extent_pick_ptr(c, k, avoid, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, &rbio->bio, "no device to read from"); - bio_endio(&rbio->bio); - return; - } + bytes = min_t(unsigned, rbio->bio.bi_iter.bi_size, + (k.k->p.offset - rbio->bio.bi_iter.bi_sector) << 9); + swap(rbio->bio.bi_iter.bi_size, bytes); - fragment = bvec_iter; - fragment.bi_size = (min_t(u64, k.k->p.offset, - bvec_iter_end_sector(bvec_iter)) - - bvec_iter.bi_sector) << 9; + if (rbio->bio.bi_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; - if (pick.ca) { - if (fragment.bi_size != bvec_iter.bi_size) { - bio_inc_remaining(&rbio->bio); - flags |= BCH_READ_MUST_CLONE; - trace_read_split(&rbio->bio); - } + bch2_read_extent(c, rbio, k, flags); - ret = __bch2_read_extent(c, rbio, fragment, - bkey_s_c_to_extent(k), - &pick, flags); - switch (ret) { - case READ_RETRY_AVOID: - __set_bit(pick.ca->dev_idx, avoid->d); - case READ_RETRY: - goto retry; - case READ_ERR: - rbio->bio.bi_status = BLK_STS_IOERR; - bio_endio(&rbio->bio); - return; - }; - } else { 
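
The new bch2_read() loop above carves the request into one fragment per extent: it clamps the bio's size to the end of the current extent, tags the final piece with BCH_READ_LAST_FRAGMENT, and otherwise advances and keeps going. A rough userspace model of just that splitting logic (toy extent array, no bios or bkeys; all names here are hypothetical):

    /* Sketch: split one read into per-extent fragments, flagging the last. */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct extent { uint64_t start, end; };         /* [start, end) in sectors */

    static void read_fragment(uint64_t sector, uint64_t sectors, bool last)
    {
            printf("fragment at %llu, %llu sectors%s\n",
                   (unsigned long long) sector, (unsigned long long) sectors,
                   last ? " (last)" : "");
    }

    int main(void)
    {
            /* extents covering the keyspace, as BTREE_ITER_SLOTS would return: */
            const struct extent extents[] = { { 0, 8 }, { 8, 24 }, { 24, 64 } };
            uint64_t sector = 4, remaining = 40;    /* request: sectors 4..44 */

            for (unsigned i = 0;
                 i < sizeof(extents) / sizeof(extents[0]) && remaining; i++) {
                    const struct extent *e = &extents[i];
                    uint64_t frag;
                    bool last;

                    if (e->end <= sector)
                            continue;

                    frag = e->end - sector;
                    if (frag > remaining)
                            frag = remaining;

                    last = frag == remaining;       /* BCH_READ_LAST_FRAGMENT analogue */
                    read_fragment(sector, frag, last);

                    sector += frag;
                    remaining -= frag;
            }
            return 0;
    }

Only the last fragment is allowed to complete the parent bio; the earlier ones bump the parent's remaining count (bio_inc_remaining()), so per-fragment completions can arrive in any order.
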
- zero_fill_bio_iter(&rbio->bio, fragment); - - if (fragment.bi_size == bvec_iter.bi_size) - bio_endio(&rbio->bio); - } - - if (fragment.bi_size == bvec_iter.bi_size) + if (flags & BCH_READ_LAST_FRAGMENT) return; - bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size); + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); } /* @@ -1560,5 +1813,34 @@ retry: ret = bch2_btree_iter_unlock(&iter); BUG_ON(!ret); bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); - bio_endio(&rbio->bio); + bch2_rbio_done(rbio); +} + +void bch2_fs_io_exit(struct bch_fs *c) +{ + if (c->promote_table.tbl) + rhashtable_destroy(&c->promote_table); + mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->bio_write); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); +} + +int bch2_fs_io_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS) || + bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS) || + bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), + BIOSET_NEED_BVECS) || + mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->sb.encoded_extent_max) / + PAGE_SECTORS, 0) || + rhashtable_init(&c->promote_table, &bch_promote_params)) + return -ENOMEM; + + return 0; } diff --git a/libbcachefs/io.h b/libbcachefs/io.h index a0c795ab..8b1411c6 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -16,7 +16,7 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t); -void bch2_latency_acct(struct bch_dev *, unsigned, int); +void bch2_latency_acct(struct bch_dev *, u64, int); void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *); @@ -99,40 +99,28 @@ struct cache_promote_op; struct extent_pick_ptr; int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - struct bkey_s_c_extent e, struct extent_pick_ptr *, - unsigned); -void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - u64, struct bch_devs_mask *, unsigned); + struct bkey_s_c, struct bch_devs_mask *, unsigned); +void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); enum bch_read_flags { BCH_READ_RETRY_IF_STALE = 1 << 0, BCH_READ_MAY_PROMOTE = 1 << 1, BCH_READ_USER_MAPPED = 1 << 2, BCH_READ_NODECODE = 1 << 3, + BCH_READ_LAST_FRAGMENT = 1 << 4, /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 4, - BCH_READ_MUST_CLONE = 1 << 5, - BCH_READ_IN_RETRY = 1 << 6, + BCH_READ_MUST_BOUNCE = 1 << 5, + BCH_READ_MUST_CLONE = 1 << 6, + BCH_READ_IN_RETRY = 1 << 7, }; static inline void bch2_read_extent(struct bch_fs *c, struct bch_read_bio *rbio, - struct bkey_s_c_extent e, - struct extent_pick_ptr *pick, + struct bkey_s_c k, unsigned flags) { - __bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags); -} - -static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - u64 inode) -{ - BUG_ON(rbio->_state); - __bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL, - BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED); + __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags); } static inline struct bch_read_bio *rbio_init(struct bio *bio, @@ -146,4 +134,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio, return rbio; } +void bch2_fs_io_exit(struct bch_fs *); +int 
bch2_fs_io_init(struct bch_fs *); + #endif /* _BCACHEFS_IO_H */ diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index a022ab33..28281ea6 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -14,6 +14,8 @@ struct bch_read_bio { struct bch_fs *c; + u64 start_time; + u64 submit_time; /* * Reads will often have to be split, and if the extent being read from @@ -35,17 +37,19 @@ struct bch_read_bio { */ struct bvec_iter bvec_iter; - unsigned submit_time_us; - u8 flags; + u16 flags; union { struct { - u8 bounce:1, + u16 bounce:1, split:1, + kmalloc:1, + have_ioref:1, narrow_crcs:1, + hole:1, retry:2, context:2; }; - u8 _state; + u16 _state; }; struct bch_devs_list devs_have; @@ -66,20 +70,20 @@ struct bch_read_bio { struct bch_write_bio { struct bch_fs *c; - struct bch_dev *ca; struct bch_write_bio *parent; + u64 submit_time; + struct bch_devs_list failed; u8 order; + u8 dev; unsigned split:1, bounce:1, put_bio:1, - have_io_ref:1, + have_ioref:1, used_mempool:1; - unsigned submit_time_us; - struct bio bio; }; @@ -87,6 +91,7 @@ struct bch_write_op { struct closure cl; struct bch_fs *c; struct workqueue_struct *io_wq; + u64 start_time; unsigned written; /* sectors */ u16 flags; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index b525a85c..ea67af3d 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -7,1160 +7,16 @@ #include "bcachefs.h" #include "alloc.h" #include "bkey_methods.h" -#include "buckets.h" #include "btree_gc.h" -#include "btree_update.h" -#include "btree_update_interior.h" -#include "btree_io.h" -#include "checksum.h" -#include "debug.h" -#include "error.h" -#include "extents.h" -#include "io.h" -#include "keylist.h" +#include "buckets.h" #include "journal.h" -#include "replicas.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" #include "super-io.h" -#include "vstructs.h" #include -static void journal_write(struct closure *); -static void journal_reclaim_fast(struct journal *); -static void journal_pin_add_entry(struct journal *, - struct journal_entry_pin_list *, - struct journal_entry_pin *, - journal_pin_flush_fn); - -static inline void journal_wake(struct journal *j) -{ - wake_up(&j->wait); - closure_wake_up(&j->async_wait); -} - -static inline struct journal_buf *journal_cur_buf(struct journal *j) -{ - return j->buf + j->reservations.idx; -} - -static inline struct journal_buf *journal_prev_buf(struct journal *j) -{ - return j->buf + !j->reservations.idx; -} - -/* Sequence number of oldest dirty journal entry */ - -static inline u64 journal_last_seq(struct journal *j) -{ - return j->pin.front; -} - -static inline u64 journal_cur_seq(struct journal *j) -{ - BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); - - return j->pin.back - 1; -} - -static inline u64 journal_pin_seq(struct journal *j, - struct journal_entry_pin_list *pin_list) -{ - return fifo_entry_idx_abs(&j->pin, pin_list); -} - -u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin) -{ - u64 ret = 0; - - spin_lock(&j->lock); - if (journal_pin_active(pin)) - ret = journal_pin_seq(j, pin->pin_list); - spin_unlock(&j->lock); - - return ret; -} - -static inline void bch2_journal_add_entry_noreservation(struct journal_buf *buf, - unsigned type, enum btree_id id, - unsigned level, - const void *data, size_t u64s) -{ - struct jset *jset = buf->data; - - bch2_journal_add_entry_at(buf, le32_to_cpu(jset->u64s), - type, id, level, data, u64s); - le32_add_cpu(&jset->u64s, jset_u64s(u64s)); -} - -static struct 
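
The widened state bits in struct bch_read_bio above keep the union-with-an-aggregate-word idiom, which is what makes checks like BUG_ON(rbio->_state) cheap: a freshly initialized rbio has every flag clear exactly when _state reads zero. A compact standalone illustration with a hypothetical subset of the fields (C11 anonymous members; bitfields on a u16 are a compiler extension, as in the kernel):

    /* Sketch: flag bits plus an aggregate word view of the same storage. */
    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    struct rbio_state {
            union {
                    struct {
                            uint16_t bounce:1,
                                     split:1,
                                     have_ioref:1,
                                     hole:1,
                                     retry:2,
                                     context:2;
                    };
                    uint16_t _state;    /* all of the above as one word */
            };
    };

    int main(void)
    {
            struct rbio_state s;

            memset(&s, 0, sizeof(s));
            assert(!s._state);          /* "pristine" check, like BUG_ON(rbio->_state) */

            s.bounce = 1;
            s.retry  = 2;
            assert(s._state);           /* any set bit shows up in the aggregate view */

            s._state = 0;               /* clear every flag with one store */
            assert(!s.bounce && !s.retry);
            return 0;
    }
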
jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type, - enum btree_id id) -{ - struct jset_entry *entry; - - for_each_jset_entry_type(entry, j, type) - if (entry->btree_id == id) - return entry; - - return NULL; -} - -struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j, - enum btree_id id, unsigned *level) -{ - struct bkey_i *k; - struct jset_entry *entry = - bch2_journal_find_entry(j, JOURNAL_ENTRY_BTREE_ROOT, id); - - if (!entry) - return NULL; - - if (!entry->u64s) - return ERR_PTR(-EINVAL); - - k = entry->start; - *level = entry->level; - *level = entry->level; - return k; -} - -static void bch2_journal_add_btree_root(struct journal_buf *buf, - enum btree_id id, struct bkey_i *k, - unsigned level) -{ - bch2_journal_add_entry_noreservation(buf, - JOURNAL_ENTRY_BTREE_ROOT, id, level, - k, k->k.u64s); -} - -static void journal_seq_blacklist_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) -{ - struct bch_fs *c = - container_of(j, struct bch_fs, journal); - struct journal_seq_blacklist *bl = - container_of(pin, struct journal_seq_blacklist, pin); - struct blacklisted_node n; - struct closure cl; - unsigned i; - int ret; - - closure_init_stack(&cl); - - for (i = 0;; i++) { - struct btree_iter iter; - struct btree *b; - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); - - __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0); - - b = bch2_btree_iter_peek_node(&iter); - - /* The node might have already been rewritten: */ - - if (b->data->keys.seq == n.seq) { - ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0); - if (ret) { - bch2_btree_iter_unlock(&iter); - bch2_fs_fatal_error(c, - "error %i rewriting btree node with blacklisted journal seq", - ret); - bch2_journal_halt(j); - return; - } - } - - bch2_btree_iter_unlock(&iter); - } - - for (i = 0;; i++) { - struct btree_update *as; - struct pending_btree_node_free *d; - - mutex_lock(&j->blacklist_lock); - if (i >= bl->nr_entries) { - mutex_unlock(&j->blacklist_lock); - break; - } - n = bl->entries[i]; - mutex_unlock(&j->blacklist_lock); -redo_wait: - mutex_lock(&c->btree_interior_update_lock); - - /* - * Is the node on the list of pending interior node updates - - * being freed? 
If so, wait for that to finish: - */ - for_each_pending_btree_node_free(c, as, d) - if (n.seq == d->seq && - n.btree_id == d->btree_id && - !d->level && - !bkey_cmp(n.pos, d->key.k.p)) { - closure_wait(&as->wait, &cl); - mutex_unlock(&c->btree_interior_update_lock); - closure_sync(&cl); - goto redo_wait; - } - - mutex_unlock(&c->btree_interior_update_lock); - } - - mutex_lock(&j->blacklist_lock); - - bch2_journal_pin_drop(j, &bl->pin); - list_del(&bl->list); - kfree(bl->entries); - kfree(bl); - - mutex_unlock(&j->blacklist_lock); -} - -static struct journal_seq_blacklist * -journal_seq_blacklist_find(struct journal *j, u64 seq) -{ - struct journal_seq_blacklist *bl; - - lockdep_assert_held(&j->blacklist_lock); - - list_for_each_entry(bl, &j->seq_blacklist, list) - if (seq == bl->seq) - return bl; - - return NULL; -} - -static struct journal_seq_blacklist * -bch2_journal_seq_blacklisted_new(struct journal *j, u64 seq) -{ - struct journal_seq_blacklist *bl; - - lockdep_assert_held(&j->blacklist_lock); - - /* - * When we start the journal, bch2_journal_start() will skip over @seq: - */ - - bl = kzalloc(sizeof(*bl), GFP_KERNEL); - if (!bl) - return NULL; - - bl->seq = seq; - list_add_tail(&bl->list, &j->seq_blacklist); - return bl; -} - -/* - * Returns true if @seq is newer than the most recent journal entry that got - * written, and data corresponding to @seq should be ignored - also marks @seq - * as blacklisted so that on future restarts the corresponding data will still - * be ignored: - */ -int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) -{ - struct journal *j = &c->journal; - struct journal_seq_blacklist *bl = NULL; - struct blacklisted_node *n; - u64 journal_seq, i; - int ret = 0; - - if (!seq) - return 0; - - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - spin_unlock(&j->lock); - - /* Interier updates aren't journalled: */ - BUG_ON(b->level); - BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); - - /* - * Decrease this back to j->seq + 2 when we next rev the on disk format: - * increasing it temporarily to work around bug in old kernels - */ - bch2_fs_inconsistent_on(seq > journal_seq + 4, c, - "bset journal seq too far in the future: %llu > %llu", - seq, journal_seq); - - if (seq <= journal_seq && - list_empty_careful(&j->seq_blacklist)) - return 0; - - mutex_lock(&j->blacklist_lock); - - if (seq <= journal_seq) { - bl = journal_seq_blacklist_find(j, seq); - if (!bl) - goto out; - } else { - bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", - b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); - - for (i = journal_seq + 1; i <= seq; i++) { - bl = journal_seq_blacklist_find(j, i) ?: - bch2_journal_seq_blacklisted_new(j, i); - if (!bl) { - ret = -ENOMEM; - goto out; - } - } - } - - for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) - if (b->data->keys.seq == n->seq && - b->btree_id == n->btree_id && - !bkey_cmp(b->key.k.p, n->pos)) - goto found_entry; - - if (!bl->nr_entries || - is_power_of_2(bl->nr_entries)) { - n = krealloc(bl->entries, - max(bl->nr_entries * 2, 8UL) * sizeof(*n), - GFP_KERNEL); - if (!n) { - ret = -ENOMEM; - goto out; - } - bl->entries = n; - } - - bl->entries[bl->nr_entries++] = (struct blacklisted_node) { - .seq = b->data->keys.seq, - .btree_id = b->btree_id, - .pos = b->key.k.p, - }; -found_entry: - ret = 1; -out: - mutex_unlock(&j->blacklist_lock); - return ret; -} - -/* - * Journal replay/recovery: - * - * This code is all driven from 
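
The bch2_journal_seq_should_ignore() logic above (being moved into journal_seq_blacklist.c) reduces to a small decision: a bset whose journal sequence number is newer than anything that actually reached the journal must be ignored, and that sequence number gets blacklisted so later mounts ignore it too; a sequence number at or below the current one is ignored only if a previous mount already blacklisted it. A rough standalone model of just that decision (the real code also records which btree node referenced each blacklisted seq; names and the fixed-size table here are hypothetical):

    /* Sketch: should data tagged with this journal seq be ignored? */
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define MAX_BLACKLIST 16

    struct blacklist {
            uint64_t seqs[MAX_BLACKLIST];
            unsigned nr;
    };

    static bool blacklisted(const struct blacklist *bl, uint64_t seq)
    {
            for (unsigned i = 0; i < bl->nr; i++)
                    if (bl->seqs[i] == seq)
                            return true;
            return false;
    }

    static bool should_ignore(struct blacklist *bl, uint64_t journal_seq,
                              uint64_t bset_seq)
    {
            if (bset_seq > journal_seq) {
                    /* never made it to the journal: blacklist the whole range */
                    for (uint64_t s = journal_seq + 1;
                         s <= bset_seq && bl->nr < MAX_BLACKLIST; s++)
                            if (!blacklisted(bl, s))
                                    bl->seqs[bl->nr++] = s;
                    return true;
            }

            return blacklisted(bl, bset_seq);
    }

    int main(void)
    {
            struct blacklist bl = { .nr = 0 };

            printf("%d\n", should_ignore(&bl, 100, 102)); /* 1: future seq, 101..102 blacklisted */
            printf("%d\n", should_ignore(&bl, 105, 101)); /* 1: previously blacklisted */
            printf("%d\n", should_ignore(&bl, 105, 99));  /* 0: ordinary, replayable */
            return 0;
    }
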
bch2_fs_start(); we first read the journal - * entries, do some other stuff, then we mark all the keys in the journal - * entries (same as garbage collection would), then we replay them - reinserting - * them into the cache in precisely the same order as they appear in the - * journal. - * - * We only journal keys that go in leaf nodes, which simplifies things quite a - * bit. - */ - -struct journal_list { - struct closure cl; - struct mutex lock; - struct list_head *head; - int ret; -}; - -#define JOURNAL_ENTRY_ADD_OK 0 -#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 - -/* - * Given a journal entry we just read, add it to the list of journal entries to - * be replayed: - */ -static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, - struct journal_list *jlist, struct jset *j) -{ - struct journal_replay *i, *pos; - struct list_head *where; - size_t bytes = vstruct_bytes(j); - __le64 last_seq; - int ret; - - last_seq = !list_empty(jlist->head) - ? list_last_entry(jlist->head, struct journal_replay, - list)->j.last_seq - : 0; - - /* Is this entry older than the range we need? */ - if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { - ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; - goto out; - } - - /* Drop entries we don't need anymore */ - list_for_each_entry_safe(i, pos, jlist->head, list) { - if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) - break; - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); - } - - list_for_each_entry_reverse(i, jlist->head, list) { - /* Duplicate? */ - if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { - fsck_err_on(bytes != vstruct_bytes(&i->j) || - memcmp(j, &i->j, bytes), c, - "found duplicate but non identical journal entries (seq %llu)", - le64_to_cpu(j->seq)); - goto found; - } - - if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { - where = &i->list; - goto add; - } - } - - where = jlist->head; -add: - i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); - if (!i) { - ret = -ENOMEM; - goto out; - } - - list_add(&i->list, where); - i->devs.nr = 0; - memcpy(&i->j, j, bytes); -found: - if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) - bch2_dev_list_add_dev(&i->devs, ca->dev_idx); - else - fsck_err_on(1, c, "duplicate journal entries on same device"); - ret = JOURNAL_ENTRY_ADD_OK; -out: -fsck_err: - return ret; -} - -static struct nonce journal_nonce(const struct jset *jset) -{ - return (struct nonce) {{ - [0] = 0, - [1] = ((__le32 *) &jset->seq)[0], - [2] = ((__le32 *) &jset->seq)[1], - [3] = BCH_NONCE_JOURNAL, - }}; -} - -/* this fills in a range with empty jset_entries: */ -static void journal_entry_null_range(void *start, void *end) -{ - struct jset_entry *entry; - - for (entry = start; entry != end; entry = vstruct_next(entry)) - memset(entry, 0, sizeof(*entry)); -} - -static int journal_validate_key(struct bch_fs *c, struct jset *jset, - struct jset_entry *entry, - struct bkey_i *k, enum bkey_type key_type, - const char *type) -{ - void *next = vstruct_next(entry); - const char *invalid; - char buf[160]; - int ret = 0; - - if (mustfix_fsck_err_on(!k->k.u64s, c, - "invalid %s in journal: k->u64s 0", type)) { - entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } - - if (mustfix_fsck_err_on((void *) bkey_next(k) > - (void *) vstruct_next(entry), c, - "invalid %s in journal: extends past end of journal entry", - type)) { - entry->u64s = cpu_to_le16((u64 *) k - entry->_data); - 
journal_entry_null_range(vstruct_next(entry), next); - return 0; - } - - if (mustfix_fsck_err_on(k->k.format != KEY_FORMAT_CURRENT, c, - "invalid %s in journal: bad format %u", - type, k->k.format)) { - le16_add_cpu(&entry->u64s, -k->k.u64s); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } - - if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) - bch2_bkey_swab(key_type, NULL, bkey_to_packed(k)); - - invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k)); - if (invalid) { - bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf), - bkey_i_to_s_c(k)); - mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", - type, invalid, buf); - - le16_add_cpu(&entry->u64s, -k->k.u64s); - memmove(k, bkey_next(k), next - (void *) bkey_next(k)); - journal_entry_null_range(vstruct_next(entry), next); - return 0; - } -fsck_err: - return ret; -} - -#define JOURNAL_ENTRY_REREAD 5 -#define JOURNAL_ENTRY_NONE 6 -#define JOURNAL_ENTRY_BAD 7 - -#define journal_entry_err(c, msg, ...) \ -({ \ - if (write == READ) { \ - mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ - } else { \ - bch_err(c, "detected corrupt metadata before write:\n" \ - msg, ##__VA_ARGS__); \ - ret = BCH_FSCK_ERRORS_NOT_FIXED; \ - goto fsck_err; \ - } \ - true; \ -}) - -#define journal_entry_err_on(cond, c, msg, ...) \ - ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false) - -static int journal_entry_validate_entries(struct bch_fs *c, struct jset *jset, - int write) -{ - struct jset_entry *entry; - int ret = 0; - - vstruct_for_each(jset, entry) { - void *next = vstruct_next(entry); - struct bkey_i *k; - - if (journal_entry_err_on(vstruct_next(entry) > - vstruct_last(jset), c, - "journal entry extends past end of jset")) { - jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); - break; - } - - switch (entry->type) { - case JOURNAL_ENTRY_BTREE_KEYS: - vstruct_for_each(entry, k) { - ret = journal_validate_key(c, jset, entry, k, - bkey_type(entry->level, - entry->btree_id), - "key"); - if (ret) - goto fsck_err; - } - break; - - case JOURNAL_ENTRY_BTREE_ROOT: - k = entry->start; - - if (journal_entry_err_on(!entry->u64s || - le16_to_cpu(entry->u64s) != k->k.u64s, c, - "invalid btree root journal entry: wrong number of keys")) { - /* - * we don't want to null out this jset_entry, - * just the contents, so that later we can tell - * we were _supposed_ to have a btree root - */ - entry->u64s = 0; - journal_entry_null_range(vstruct_next(entry), next); - continue; - } - - ret = journal_validate_key(c, jset, entry, k, - BKEY_TYPE_BTREE, "btree root"); - if (ret) - goto fsck_err; - break; - - case JOURNAL_ENTRY_PRIO_PTRS: - break; - - case JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED: - if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, - "invalid journal seq blacklist entry: bad size")) { - journal_entry_null_range(entry, - vstruct_next(entry)); - } - - break; - default: - journal_entry_err(c, "invalid journal entry type %u", - entry->type); - journal_entry_null_range(entry, vstruct_next(entry)); - break; - } - } - -fsck_err: - return ret; -} - -static int journal_entry_validate(struct bch_fs *c, - struct jset *jset, u64 sector, - unsigned bucket_sectors_left, - unsigned sectors_read, - int write) -{ - size_t bytes = vstruct_bytes(jset); - struct bch_csum csum; - int ret = 0; - - if (le64_to_cpu(jset->magic) != jset_magic(c)) - return JOURNAL_ENTRY_NONE; - - if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) { - bch_err(c, "unknown journal entry version %u", - 
le32_to_cpu(jset->version)); - return BCH_FSCK_UNKNOWN_VERSION; - } - - if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, - "journal entry too big (%zu bytes), sector %lluu", - bytes, sector)) { - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; - } - - if (bytes > sectors_read << 9) - return JOURNAL_ENTRY_REREAD; - - if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, - "journal entry with unknown csum type %llu sector %lluu", - JSET_CSUM_TYPE(jset), sector)) - return JOURNAL_ENTRY_BAD; - - csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); - if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, - "journal checksum bad, sector %llu", sector)) { - /* XXX: retry IO, when we start retrying checksum errors */ - /* XXX: note we might have missing journal entries */ - return JOURNAL_ENTRY_BAD; - } - - bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), - jset->encrypted_start, - vstruct_end(jset) - (void *) jset->encrypted_start); - - if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, - "invalid journal entry: last_seq > seq")) - jset->last_seq = jset->seq; - - return 0; -fsck_err: - return ret; -} - -struct journal_read_buf { - void *data; - size_t size; -}; - -static int journal_read_buf_realloc(struct journal_read_buf *b, - size_t new_size) -{ - void *n; - - /* the bios are sized for this many pages, max: */ - if (new_size > JOURNAL_ENTRY_SIZE_MAX) - return -ENOMEM; - - new_size = roundup_pow_of_two(new_size); - n = kvpmalloc(new_size, GFP_KERNEL); - if (!n) - return -ENOMEM; - - kvpfree(b->data, b->size); - b->data = n; - b->size = new_size; - return 0; -} - -static int journal_read_bucket(struct bch_dev *ca, - struct journal_read_buf *buf, - struct journal_list *jlist, - unsigned bucket, u64 *seq, bool *entries_found) -{ - struct bch_fs *c = ca->fs; - struct journal_device *ja = &ca->journal; - struct bio *bio = ja->bio; - struct jset *j = NULL; - unsigned sectors, sectors_read = 0; - u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), - end = offset + ca->mi.bucket_size; - bool saw_bad = false; - int ret = 0; - - pr_debug("reading %u", bucket); - - while (offset < end) { - if (!sectors_read) { -reread: sectors_read = min_t(unsigned, - end - offset, buf->size >> 9); - - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_iter.bi_sector = offset; - bio->bi_iter.bi_size = sectors_read << 9; - bio_set_op_attrs(bio, REQ_OP_READ, 0); - bch2_bio_map(bio, buf->data); - - ret = submit_bio_wait(bio); - - if (bch2_dev_io_err_on(ret, ca, - "journal read from sector %llu", - offset) || - bch2_meta_read_fault("journal")) - return -EIO; - - j = buf->data; - } - - ret = journal_entry_validate(c, j, offset, - end - offset, sectors_read, - READ); - switch (ret) { - case BCH_FSCK_OK: - break; - case JOURNAL_ENTRY_REREAD: - if (vstruct_bytes(j) > buf->size) { - ret = journal_read_buf_realloc(buf, - vstruct_bytes(j)); - if (ret) - return ret; - } - goto reread; - case JOURNAL_ENTRY_NONE: - if (!saw_bad) - return 0; - sectors = c->opts.block_size; - goto next_block; - case JOURNAL_ENTRY_BAD: - saw_bad = true; - sectors = c->opts.block_size; - goto next_block; - default: - return ret; - } - - /* - * This happens sometimes if we don't have discards on - - * when we've partially overwritten a bucket with new - * journal entries. 
We don't need the rest of the - * bucket: - */ - if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - return 0; - - ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - - mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, jlist, j); - mutex_unlock(&jlist->lock); - - switch (ret) { - case JOURNAL_ENTRY_ADD_OK: - *entries_found = true; - break; - case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: - break; - default: - return ret; - } - - if (le64_to_cpu(j->seq) > *seq) - *seq = le64_to_cpu(j->seq); - - sectors = vstruct_sectors(j, c->block_bits); -next_block: - pr_debug("next"); - offset += sectors; - sectors_read -= sectors; - j = ((void *) j) + (sectors << 9); - } - - return 0; -} - -static void bch2_journal_read_device(struct closure *cl) -{ -#define read_bucket(b) \ - ({ \ - bool entries_found = false; \ - ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \ - &entries_found); \ - if (ret) \ - goto err; \ - __set_bit(b, bitmap); \ - entries_found; \ - }) - - struct journal_device *ja = - container_of(cl, struct journal_device, read); - struct bch_dev *ca = container_of(ja, struct bch_dev, journal); - struct journal_list *jlist = - container_of(cl->parent, struct journal_list, cl); - struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); - struct journal_read_buf buf = { NULL, 0 }; - - DECLARE_BITMAP(bitmap, ja->nr); - unsigned i, l, r; - u64 seq = 0; - int ret; - - if (!ja->nr) - goto out; - - bitmap_zero(bitmap, ja->nr); - ret = journal_read_buf_realloc(&buf, PAGE_SIZE); - if (ret) - goto err; - - pr_debug("%u journal buckets", ja->nr); - - /* - * If the device supports discard but not secure discard, we can't do - * the fancy fibonacci hash/binary search because the live journal - * entries might not form a contiguous range: - */ - for (i = 0; i < ja->nr; i++) - read_bucket(i); - goto search_done; - - if (!blk_queue_nonrot(q)) - goto linear_scan; - - /* - * Read journal buckets ordered by golden ratio hash to quickly - * find a sequence of buckets with valid journal entries - */ - for (i = 0; i < ja->nr; i++) { - l = (i * 2654435769U) % ja->nr; - - if (test_bit(l, bitmap)) - break; - - if (read_bucket(l)) - goto bsearch; - } - - /* - * If that fails, check all the buckets we haven't checked - * already - */ - pr_debug("falling back to linear search"); -linear_scan: - for (l = find_first_zero_bit(bitmap, ja->nr); - l < ja->nr; - l = find_next_zero_bit(bitmap, ja->nr, l + 1)) - if (read_bucket(l)) - goto bsearch; - - /* no journal entries on this device? */ - if (l == ja->nr) - goto out; -bsearch: - /* Binary search */ - r = find_next_bit(bitmap, ja->nr, l + 1); - pr_debug("starting binary search, l %u r %u", l, r); - - while (l + 1 < r) { - unsigned m = (l + r) >> 1; - u64 cur_seq = seq; - - read_bucket(m); - - if (cur_seq != seq) - l = m; - else - r = m; - } - -search_done: - /* - * Find the journal bucket with the highest sequence number: - * - * If there's duplicate journal entries in multiple buckets (which - * definitely isn't supposed to happen, but...) 
- make sure to start - * cur_idx at the last of those buckets, so we don't deadlock trying to - * allocate - */ - seq = 0; - - for (i = 0; i < ja->nr; i++) - if (ja->bucket_seq[i] >= seq && - ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) { - /* - * When journal_next_bucket() goes to allocate for - * the first time, it'll use the bucket after - * ja->cur_idx - */ - ja->cur_idx = i; - seq = ja->bucket_seq[i]; - } - - /* - * Set last_idx to indicate the entire journal is full and needs to be - * reclaimed - journal reclaim will immediately reclaim whatever isn't - * pinned when it first runs: - */ - ja->last_idx = (ja->cur_idx + 1) % ja->nr; - - /* - * Read buckets in reverse order until we stop finding more journal - * entries: - */ - for (i = (ja->cur_idx + ja->nr - 1) % ja->nr; - i != ja->cur_idx; - i = (i + ja->nr - 1) % ja->nr) - if (!test_bit(i, bitmap) && - !read_bucket(i)) - break; -out: - kvpfree(buf.data, buf.size); - percpu_ref_put(&ca->io_ref); - closure_return(cl); -err: - mutex_lock(&jlist->lock); - jlist->ret = ret; - mutex_unlock(&jlist->lock); - goto out; -#undef read_bucket -} - -void bch2_journal_entries_free(struct list_head *list) -{ - - while (!list_empty(list)) { - struct journal_replay *i = - list_first_entry(list, struct journal_replay, list); - list_del(&i->list); - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); - } -} - -static int journal_seq_blacklist_read(struct journal *j, - struct journal_replay *i, - struct journal_entry_pin_list *p) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct jset_entry *entry; - struct journal_seq_blacklist *bl; - u64 seq; - - for_each_jset_entry_type(entry, &i->j, - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED) { - struct jset_entry_blacklist *bl_entry = - container_of(entry, struct jset_entry_blacklist, entry); - seq = le64_to_cpu(bl_entry->seq); - - bch_verbose(c, "blacklisting existing journal seq %llu", seq); - - bl = bch2_journal_seq_blacklisted_new(j, seq); - if (!bl) - return -ENOMEM; - - journal_pin_add_entry(j, p, &bl->pin, - journal_seq_blacklist_flush); - bl->written = true; - } - - return 0; -} - -static inline bool journal_has_keys(struct list_head *list) -{ - struct journal_replay *i; - struct jset_entry *entry; - struct bkey_i *k, *_n; - - list_for_each_entry(i, list, list) - for_each_jset_key(k, _n, entry, &i->j) - return true; - - return false; -} - -int bch2_journal_read(struct bch_fs *c, struct list_head *list) -{ - struct journal *j = &c->journal; - struct journal_list jlist; - struct journal_replay *i; - struct journal_entry_pin_list *p; - struct bch_dev *ca; - u64 cur_seq, end_seq, seq; - unsigned iter, keys = 0, entries = 0; - size_t nr; - bool degraded = false; - int ret = 0; - - closure_init_stack(&jlist.cl); - mutex_init(&jlist.lock); - jlist.head = list; - jlist.ret = 0; - - for_each_member_device(ca, c, iter) { - if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) - continue; - - if ((ca->mi.state == BCH_MEMBER_STATE_RW || - ca->mi.state == BCH_MEMBER_STATE_RO) && - percpu_ref_tryget(&ca->io_ref)) - closure_call(&ca->journal.read, - bch2_journal_read_device, - system_unbound_wq, - &jlist.cl); - else - degraded = true; - } - - closure_sync(&jlist.cl); - - if (jlist.ret) - return jlist.ret; - - if (list_empty(list)){ - bch_err(c, "no journal entries found"); - return BCH_FSCK_REPAIR_IMPOSSIBLE; - } - - fsck_err_on(c->sb.clean && journal_has_keys(list), c, - "filesystem marked clean but journal has keys to replay"); - - list_for_each_entry(i, list, 
list) { - ret = journal_entry_validate_entries(c, &i->j, READ); - if (ret) - goto fsck_err; - - /* - * If we're mounting in degraded mode - if we didn't read all - * the devices - this is wrong: - */ - - if (!degraded && - (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL, - i->devs), c, - "superblock not marked as containing replicas (type %u)", - BCH_DATA_JOURNAL))) { - ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs); - if (ret) - return ret; - } - } - - i = list_last_entry(list, struct journal_replay, list); - - nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1; - - if (nr > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%zu open entries)", nr); - return -ENOMEM; - } - } - - atomic64_set(&j->seq, le64_to_cpu(i->j.seq)); - j->last_seq_ondisk = le64_to_cpu(i->j.last_seq); - - j->pin.front = le64_to_cpu(i->j.last_seq); - j->pin.back = le64_to_cpu(i->j.seq) + 1; - - fifo_for_each_entry_ptr(p, &j->pin, seq) { - INIT_LIST_HEAD(&p->list); - INIT_LIST_HEAD(&p->flushed); - atomic_set(&p->count, 0); - p->devs.nr = 0; - } - - mutex_lock(&j->blacklist_lock); - - list_for_each_entry(i, list, list) { - p = journal_seq_pin(j, le64_to_cpu(i->j.seq)); - - atomic_set(&p->count, 1); - p->devs = i->devs; - - if (journal_seq_blacklist_read(j, i, p)) { - mutex_unlock(&j->blacklist_lock); - return -ENOMEM; - } - } - - mutex_unlock(&j->blacklist_lock); - - cur_seq = journal_last_seq(j); - end_seq = le64_to_cpu(list_last_entry(list, - struct journal_replay, list)->j.seq); - - list_for_each_entry(i, list, list) { - struct jset_entry *entry; - struct bkey_i *k, *_n; - bool blacklisted; - - mutex_lock(&j->blacklist_lock); - while (cur_seq < le64_to_cpu(i->j.seq) && - journal_seq_blacklist_find(j, cur_seq)) - cur_seq++; - - blacklisted = journal_seq_blacklist_find(j, - le64_to_cpu(i->j.seq)); - mutex_unlock(&j->blacklist_lock); - - fsck_err_on(blacklisted, c, - "found blacklisted journal entry %llu", - le64_to_cpu(i->j.seq)); - - fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, - "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", - cur_seq, le64_to_cpu(i->j.seq) - 1, - journal_last_seq(j), end_seq); - - cur_seq = le64_to_cpu(i->j.seq) + 1; - - for_each_jset_key(k, _n, entry, &i->j) - keys++; - entries++; - } - - bch_info(c, "journal read done, %i keys in %i entries, seq %llu", - keys, entries, journal_cur_seq(j)); -fsck_err: - return ret; -} - -int bch2_journal_mark(struct bch_fs *c, struct list_head *list) -{ - struct bkey_i *k, *n; - struct jset_entry *j; - struct journal_replay *r; - int ret; - - list_for_each_entry(r, list, list) - for_each_jset_key(k, n, j, &r->j) { - enum bkey_type type = bkey_type(j->level, j->btree_id); - struct bkey_s_c k_s_c = bkey_i_to_s_c(k); - - if (btree_type_has_ptrs(type)) { - ret = bch2_btree_mark_key_initial(c, type, k_s_c); - if (ret) - return ret; - } - } - - return 0; -} - static bool journal_entry_is_open(struct journal *j) { return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; @@ -1174,15 +30,15 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) if (!need_write_just_set && test_bit(JOURNAL_NEED_WRITE, &j->flags)) - __bch2_time_stats_update(j->delay_time, - j->need_write_time); + bch2_time_stats_update(j->delay_time, + j->need_write_time); #if 0 - closure_call(&j->io, journal_write, NULL, NULL); + closure_call(&j->io, bch2_journal_write, NULL, NULL); #else /* Shut sparse up: */ closure_init(&j->io, NULL); - set_closure_fn(&j->io, journal_write, NULL); - journal_write(&j->io); + set_closure_fn(&j->io, bch2_journal_write, NULL); + bch2_journal_write(&j->io); #endif } @@ -1269,8 +125,8 @@ static enum { c->opts.block_size; BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); - journal_reclaim_fast(j); - /* XXX: why set this here, and not in journal_write()? */ + bch2_journal_reclaim_fast(j); + /* XXX: why set this here, and not in bch2_journal_write()? */ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); journal_pin_new_entry(j, 1); @@ -1285,6 +141,8 @@ static enum { bch2_bucket_seq_cleanup(c); } + c->bucket_journal_seq++; + /* ugh - might be called from __journal_res_get() under wait_event() */ __set_current_state(TASK_RUNNING); bch2_journal_buf_put(j, old.idx, need_write_just_set); @@ -1311,96 +169,6 @@ void bch2_journal_halt(struct journal *j) closure_wake_up(&journal_prev_buf(j)->wait); } -static unsigned journal_dev_buckets_available(struct journal *j, - struct bch_dev *ca) -{ - struct journal_device *ja = &ca->journal; - unsigned next = (ja->cur_idx + 1) % ja->nr; - unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; - - /* - * Hack to avoid a deadlock during journal replay: - * journal replay might require setting a new btree - * root, which requires writing another journal entry - - * thus, if the journal is full (and this happens when - * replaying the first journal bucket's entries) we're - * screwed. 
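
The journal_dev_buckets_available() helper above is ring arithmetic: cur_idx is the bucket currently being written, last_idx is the oldest bucket still holding dirty entries, and the space between the next write bucket and last_idx is what can still be allocated, minus a small reserve for the replay and last_seq corner cases described in the comment. A standalone sketch with hypothetical numbers and a collapsed reserve parameter:

    /* Sketch: free journal buckets on a ring of nr buckets. */
    #include <stdio.h>

    static unsigned buckets_available(unsigned nr, unsigned cur_idx,
                                      unsigned last_idx, int reserve)
    {
            unsigned next = (cur_idx + 1) % nr;
            int available = (int) ((last_idx + nr - next) % nr);

            available -= reserve;       /* replay / last_seq adjustments, simplified */

            return available > 0 ? (unsigned) available : 0;
    }

    int main(void)
    {
            /* 8 buckets, writing into bucket 5, oldest dirty bucket is 2: */
            printf("%u\n", buckets_available(8, 5, 2, 0));  /* 4 */
            printf("%u\n", buckets_available(8, 5, 2, 3));  /* 1 */

            /* full: the next write bucket is the oldest dirty one */
            printf("%u\n", buckets_available(8, 1, 2, 0));  /* 0 */
            return 0;
    }
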
- * - * So don't let the journal fill up unless we're in - * replay: - */ - if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) - available = max((int) available - 2, 0); - - /* - * Don't use the last bucket unless writing the new last_seq - * will make another bucket available: - */ - if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j)) - available = max((int) available - 1, 0); - - return available; -} - -/* returns number of sectors available for next journal entry: */ -static int journal_entry_sectors(struct journal *j) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); - unsigned sectors_available = UINT_MAX; - unsigned i, nr_online = 0, nr_devs = 0; - - lockdep_assert_held(&j->lock); - - rcu_read_lock(); - for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { - struct journal_device *ja = &ca->journal; - unsigned buckets_required = 0; - - if (!ja->nr) - continue; - - sectors_available = min_t(unsigned, sectors_available, - ca->mi.bucket_size); - - /* - * Note that we don't allocate the space for a journal entry - * until we write it out - thus, if we haven't started the write - * for the previous entry we have to make sure we have space for - * it too: - */ - if (bch2_extent_has_device(e.c, ca->dev_idx)) { - if (j->prev_buf_sectors > ja->sectors_free) - buckets_required++; - - if (j->prev_buf_sectors + sectors_available > - ja->sectors_free) - buckets_required++; - } else { - if (j->prev_buf_sectors + sectors_available > - ca->mi.bucket_size) - buckets_required++; - - buckets_required++; - } - - if (journal_dev_buckets_available(j, ca) >= buckets_required) - nr_devs++; - nr_online++; - } - rcu_read_unlock(); - - if (nr_online < c->opts.metadata_replicas_required) - return -EROFS; - - if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) - return 0; - - return sectors_available; -} - /* * should _only_ called from journal_res_get() - when we actually want a * journal reservation - journal entry is open means journal is dirty: @@ -1425,7 +193,7 @@ static int journal_entry_open(struct journal *j) if (!fifo_free(&j->pin)) return 0; - sectors = journal_entry_sectors(j); + sectors = bch2_journal_entry_sectors(j); if (sectors <= 0) return sectors; @@ -1468,8 +236,8 @@ static int journal_entry_open(struct journal *j) old.v, new.v)) != old.v); if (j->res_get_blocked_start) - __bch2_time_stats_update(j->blocked_time, - j->res_get_blocked_start); + bch2_time_stats_update(j->blocked_time, + j->res_get_blocked_start); j->res_get_blocked_start = 0; mod_delayed_work(system_freezable_wq, @@ -1479,110 +247,444 @@ static int journal_entry_open(struct journal *j) return 1; } -void bch2_journal_start(struct bch_fs *c) +/* + * returns true if there's nothing to flush and no journal write still in flight + */ +static bool journal_flush_write(struct journal *j) { - struct journal *j = &c->journal; - struct journal_seq_blacklist *bl; - u64 new_seq = 0; + bool ret; - list_for_each_entry(bl, &j->seq_blacklist, list) - new_seq = max(new_seq, bl->seq); + spin_lock(&j->lock); + ret = !j->reservations.prev_buf_unwritten; + + if (!journal_entry_is_open(j)) { + spin_unlock(&j->lock); + return ret; + } + + set_bit(JOURNAL_NEED_WRITE, &j->flags); + if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED) + ret = false; + else + spin_unlock(&j->lock); + return ret; +} + +static void journal_write_work(struct work_struct *work) +{ + struct journal *j = container_of(work, struct journal, 
write_work.work); + + journal_flush_write(j); +} + +/* + * Given an inode number, if that inode number has data in the journal that + * hasn't yet been flushed, return the journal sequence number that needs to be + * flushed: + */ +u64 bch2_inode_journal_seq(struct journal *j, u64 inode) +{ + size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); + u64 seq = 0; + + if (!test_bit(h, j->buf[0].has_inode) && + !test_bit(h, j->buf[1].has_inode)) + return 0; + + spin_lock(&j->lock); + if (test_bit(h, journal_cur_buf(j)->has_inode)) + seq = journal_cur_seq(j); + else if (test_bit(h, journal_prev_buf(j)->has_inode)) + seq = journal_cur_seq(j) - 1; + spin_unlock(&j->lock); + + return seq; +} + +static int __journal_res_get(struct journal *j, struct journal_res *res, + unsigned u64s_min, unsigned u64s_max) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf; + int ret; +retry: + ret = journal_res_get_fast(j, res, u64s_min, u64s_max); + if (ret) + return ret; + + spin_lock(&j->lock); + /* + * Recheck after taking the lock, so we don't race with another thread + * that just did journal_entry_open() and call journal_entry_close() + * unnecessarily + */ + ret = journal_res_get_fast(j, res, u64s_min, u64s_max); + if (ret) { + spin_unlock(&j->lock); + return 1; + } + + /* + * If we couldn't get a reservation because the current buf filled up, + * and we had room for a bigger entry on disk, signal that we want to + * realloc the journal bufs: + */ + buf = journal_cur_buf(j); + if (journal_entry_is_open(j) && + buf->size >> 9 < buf->disk_sectors && + buf->size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->size << 1); + + /* + * Close the current journal entry if necessary, then try to start a new + * one: + */ + switch (journal_buf_switch(j, false)) { + case JOURNAL_ENTRY_ERROR: + spin_unlock(&j->lock); + return -EROFS; + case JOURNAL_ENTRY_INUSE: + /* haven't finished writing out the previous one: */ + spin_unlock(&j->lock); + trace_journal_entry_full(c); + goto blocked; + case JOURNAL_ENTRY_CLOSED: + break; + case JOURNAL_UNLOCKED: + goto retry; + } + + /* We now have a new, closed journal buf - see if we can open it: */ + ret = journal_entry_open(j); + spin_unlock(&j->lock); + + if (ret < 0) + return ret; + if (ret) + goto retry; + + /* Journal's full, we have to wait */ + + /* + * Direct reclaim - can't rely on reclaim from work item + * due to freezing.. + */ + bch2_journal_reclaim_work(&j->reclaim_work.work); + + trace_journal_full(c); +blocked: + if (!j->res_get_blocked_start) + j->res_get_blocked_start = local_clock() ?: 1; + return 0; +} + +/* + * Essentially the entry function to the journaling code. When bcachefs is doing + * a btree insert, it calls this function to get the current journal write. + * Journal write is the structure used set up journal writes. The calling + * function will then add its keys to the structure, queuing them for the next + * write. + * + * To ensure forward progress, the current task must not be holding any + * btree node write locks. + */ +int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, + unsigned u64s_min, unsigned u64s_max) +{ + int ret; + + wait_event(j->wait, + (ret = __journal_res_get(j, res, u64s_min, + u64s_max))); + return ret < 0 ? 
ret : 0; +} + +u64 bch2_journal_last_unwritten_seq(struct journal *j) +{ + u64 seq; + + spin_lock(&j->lock); + seq = journal_cur_seq(j); + if (j->reservations.prev_buf_unwritten) + seq--; + spin_unlock(&j->lock); + + return seq; +} + +/** + * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't + * open yet, or wait if we cannot + * + * used by the btree interior update machinery, when it needs to write a new + * btree root - every journal entry contains the roots of all the btrees, so it + * doesn't need to bother with getting a journal reservation + */ +int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent) +{ + int ret; + + spin_lock(&j->lock); + BUG_ON(seq > journal_cur_seq(j)); + + if (seq < journal_cur_seq(j) || + journal_entry_is_open(j)) { + spin_unlock(&j->lock); + return 1; + } + + ret = journal_entry_open(j); + if (!ret) + closure_wait(&j->async_wait, parent); + spin_unlock(&j->lock); + + if (!ret) + bch2_journal_reclaim_work(&j->reclaim_work.work); + + return ret; +} + +/** + * bch2_journal_wait_on_seq - wait for a journal entry to be written + * + * does _not_ cause @seq to be written immediately - if there is no other + * activity to cause the relevant journal entry to be filled up or flushed it + * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is + * configurable). + */ +void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent) +{ + spin_lock(&j->lock); + + BUG_ON(seq > journal_cur_seq(j)); + + if (bch2_journal_error(j)) { + spin_unlock(&j->lock); + return; + } + + if (seq == journal_cur_seq(j)) { + if (!closure_wait(&journal_cur_buf(j)->wait, parent)) + BUG(); + } else if (seq + 1 == journal_cur_seq(j) && + j->reservations.prev_buf_unwritten) { + if (!closure_wait(&journal_prev_buf(j)->wait, parent)) + BUG(); + + smp_mb(); + + /* check if raced with write completion (or failure) */ + if (!j->reservations.prev_buf_unwritten || + bch2_journal_error(j)) + closure_wake_up(&journal_prev_buf(j)->wait); + } + + spin_unlock(&j->lock); +} + +/** + * bch2_journal_flush_seq_async - wait for a journal entry to be written + * + * like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * necessary + */ +void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) +{ + struct journal_buf *buf; spin_lock(&j->lock); - set_bit(JOURNAL_STARTED, &j->flags); + BUG_ON(seq > journal_cur_seq(j)); - while (journal_cur_seq(j) < new_seq) - journal_pin_new_entry(j, 0); + if (bch2_journal_error(j)) { + spin_unlock(&j->lock); + return; + } - /* - * journal_buf_switch() only inits the next journal entry when it - * closes an open journal entry - the very first journal entry gets - * initialized here: - */ - journal_pin_new_entry(j, 1); - bch2_journal_buf_init(j); + if (seq == journal_cur_seq(j)) { + bool set_need_write = false; + + buf = journal_cur_buf(j); + + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); + + if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { + j->need_write_time = local_clock(); + set_need_write = true; + } + + switch (journal_buf_switch(j, set_need_write)) { + case JOURNAL_ENTRY_ERROR: + if (parent) + closure_wake_up(&buf->wait); + break; + case JOURNAL_ENTRY_CLOSED: + /* + * Journal entry hasn't been opened yet, but caller + * claims it has something + */ + BUG(); + case JOURNAL_ENTRY_INUSE: + break; + case JOURNAL_UNLOCKED: + return; + } + } else if (parent && + seq + 1 == journal_cur_seq(j) && + 
j->reservations.prev_buf_unwritten) { + buf = journal_prev_buf(j); + + if (!closure_wait(&buf->wait, parent)) + BUG(); + + smp_mb(); + + /* check if raced with write completion (or failure) */ + if (!j->reservations.prev_buf_unwritten || + bch2_journal_error(j)) + closure_wake_up(&buf->wait); + } + + spin_unlock(&j->lock); +} + +static int journal_seq_flushed(struct journal *j, u64 seq) +{ + struct journal_buf *buf; + int ret = 1; + + spin_lock(&j->lock); + BUG_ON(seq > journal_cur_seq(j)); + + if (seq == journal_cur_seq(j)) { + bool set_need_write = false; + + ret = 0; + + buf = journal_cur_buf(j); + + if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { + j->need_write_time = local_clock(); + set_need_write = true; + } + + switch (journal_buf_switch(j, set_need_write)) { + case JOURNAL_ENTRY_ERROR: + ret = -EIO; + break; + case JOURNAL_ENTRY_CLOSED: + /* + * Journal entry hasn't been opened yet, but caller + * claims it has something + */ + BUG(); + case JOURNAL_ENTRY_INUSE: + break; + case JOURNAL_UNLOCKED: + return 0; + } + } else if (seq + 1 == journal_cur_seq(j) && + j->reservations.prev_buf_unwritten) { + ret = bch2_journal_error(j); + } spin_unlock(&j->lock); - /* - * Adding entries to the next journal entry before allocating space on - * disk for the next journal entry - this is ok, because these entries - * only have to go down with the next journal entry we write: - */ - list_for_each_entry(bl, &j->seq_blacklist, list) - if (!bl->written) { - bch2_journal_add_entry_noreservation(journal_cur_buf(j), - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED, - 0, 0, &bl->seq, 1); - - journal_pin_add_entry(j, - &fifo_peek_back(&j->pin), - &bl->pin, - journal_seq_blacklist_flush); - bl->written = true; - } - - queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0); -} - -int bch2_journal_replay(struct bch_fs *c, struct list_head *list) -{ - struct journal *j = &c->journal; - struct bkey_i *k, *_n; - struct jset_entry *entry; - struct journal_replay *i, *n; - int ret = 0; - - list_for_each_entry_safe(i, n, list, list) { - j->replay_pin_list = - journal_seq_pin(j, le64_to_cpu(i->j.seq)); - - for_each_jset_key(k, _n, entry, &i->j) { - - if (entry->btree_id == BTREE_ID_ALLOC) { - /* - * allocation code handles replay for - * BTREE_ID_ALLOC keys: - */ - ret = bch2_alloc_replay_key(c, k->k.p); - } else { - /* - * We might cause compressed extents to be - * split, so we need to pass in a - * disk_reservation: - */ - struct disk_reservation disk_res = - bch2_disk_reservation_init(c, 0); - - ret = bch2_btree_insert(c, entry->btree_id, k, - &disk_res, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY); - } - - if (ret) { - bch_err(c, "journal replay: error %d while replaying key", - ret); - goto err; - } - - cond_resched(); - } - - if (atomic_dec_and_test(&j->replay_pin_list->count)) - journal_wake(j); - } - - j->replay_pin_list = NULL; - - bch2_journal_set_replay_done(j); - ret = bch2_journal_flush_all_pins(j); -err: - bch2_journal_entries_free(list); return ret; } +int bch2_journal_flush_seq(struct journal *j, u64 seq) +{ + u64 start_time = local_clock(); + int ret, ret2; + + ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); + + bch2_time_stats_update(j->flush_seq_time, start_time); + + return ret ?: ret2 < 0 ? 
ret2 : 0; +} + +/** + * bch2_journal_meta_async - force a journal entry to be written + */ +void bch2_journal_meta_async(struct journal *j, struct closure *parent) +{ + struct journal_res res; + unsigned u64s = jset_u64s(0); + + memset(&res, 0, sizeof(res)); + + bch2_journal_res_get(j, &res, u64s, u64s); + bch2_journal_res_put(j, &res); + + bch2_journal_flush_seq_async(j, res.seq, parent); +} + +int bch2_journal_meta(struct journal *j) +{ + struct journal_res res; + unsigned u64s = jset_u64s(0); + int ret; + + memset(&res, 0, sizeof(res)); + + ret = bch2_journal_res_get(j, &res, u64s, u64s); + if (ret) + return ret; + + bch2_journal_res_put(j, &res); + + return bch2_journal_flush_seq(j, res.seq); +} + +/* + * bch2_journal_flush_async - if there is an open journal entry, or a journal + * still being written, write it and wait for the write to complete + */ +void bch2_journal_flush_async(struct journal *j, struct closure *parent) +{ + u64 seq, journal_seq; + + spin_lock(&j->lock); + journal_seq = journal_cur_seq(j); + + if (journal_entry_is_open(j)) { + seq = journal_seq; + } else if (journal_seq) { + seq = journal_seq - 1; + } else { + spin_unlock(&j->lock); + return; + } + spin_unlock(&j->lock); + + bch2_journal_flush_seq_async(j, seq, parent); +} + +int bch2_journal_flush(struct journal *j) +{ + u64 seq, journal_seq; + + spin_lock(&j->lock); + journal_seq = journal_cur_seq(j); + + if (journal_entry_is_open(j)) { + seq = journal_seq; + } else if (journal_seq) { + seq = journal_seq - 1; + } else { + spin_unlock(&j->lock); + return 0; + } + spin_unlock(&j->lock); + + return bch2_journal_flush_seq(j, seq); +} + +/* allocate journal on a device: */ + static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bool new_fs, struct closure *cl) { @@ -1745,1161 +847,6 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); } -/* Journalling */ - -/** - * journal_reclaim_fast - do the fast part of journal reclaim - * - * Called from IO submission context, does not block. Cleans up after btree - * write completions by advancing the journal pin and each cache's last_idx, - * kicking off discards and background reclaim as necessary. 
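
The journal_reclaim_fast() helper described in the comment above (also moving to journal_reclaim.c) is the cheap half of reclaim: it only pops entries off the front of the pin FIFO once their reference counts have dropped to zero, which is what advances journal_last_seq(). A standalone model of that step, with a plain array standing in for the lockless fifo and atomics:

    /* Sketch: advance the journal pin FIFO past fully-unpinned entries. */
    #include <stdbool.h>
    #include <stdio.h>

    #define PIN_FIFO_SIZE 8     /* power of two, like the real pin fifo */

    struct pin_fifo {
            unsigned front, back;               /* absolute sequence numbers */
            int      count[PIN_FIFO_SIZE];      /* refs per journal entry */
    };

    static bool reclaim_fast(struct pin_fifo *f)
    {
            bool popped = false;

            while (f->front < f->back &&
                   !f->count[f->front & (PIN_FIFO_SIZE - 1)]) {
                    f->front++;                 /* journal_last_seq() advances */
                    popped = true;
            }

            return popped;                      /* caller wakes journal waiters */
    }

    int main(void)
    {
            struct pin_fifo f = { .front = 10, .back = 14 };

            f.count[10 & 7] = 0;        /* seq 10: all btree nodes written */
            f.count[11 & 7] = 0;
            f.count[12 & 7] = 2;        /* seq 12: still pinned */
            f.count[13 & 7] = 1;

            reclaim_fast(&f);
            printf("last_seq is now %u\n", f.front);    /* 12 */
            return 0;
    }
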
- */ -static void journal_reclaim_fast(struct journal *j) -{ - struct journal_entry_pin_list temp; - bool popped = false; - - lockdep_assert_held(&j->lock); - - /* - * Unpin journal entries whose reference counts reached zero, meaning - * all btree nodes got written out - */ - while (!atomic_read(&fifo_peek_front(&j->pin).count)) { - BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); - BUG_ON(!fifo_pop(&j->pin, temp)); - popped = true; - } - - if (popped) - journal_wake(j); -} - -/* - * Journal entry pinning - machinery for holding a reference on a given journal - * entry, marking it as dirty: - */ - -static inline void __journal_pin_add(struct journal *j, - struct journal_entry_pin_list *pin_list, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - BUG_ON(journal_pin_active(pin)); - BUG_ON(!atomic_read(&pin_list->count)); - - atomic_inc(&pin_list->count); - pin->pin_list = pin_list; - pin->flush = flush_fn; - - if (flush_fn) - list_add(&pin->list, &pin_list->list); - else - INIT_LIST_HEAD(&pin->list); - - /* - * If the journal is currently full, we might want to call flush_fn - * immediately: - */ - journal_wake(j); -} - -static void journal_pin_add_entry(struct journal *j, - struct journal_entry_pin_list *pin_list, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - __journal_pin_add(j, pin_list, pin, flush_fn); - spin_unlock(&j->lock); -} - -void bch2_journal_pin_add(struct journal *j, - struct journal_res *res, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - struct journal_entry_pin_list *pin_list = res->ref - ? journal_seq_pin(j, res->seq) - : j->replay_pin_list; - - spin_lock(&j->lock); - __journal_pin_add(j, pin_list, pin, flush_fn); - spin_unlock(&j->lock); -} - -static inline void __journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) -{ - struct journal_entry_pin_list *pin_list = pin->pin_list; - - if (!journal_pin_active(pin)) - return; - - pin->pin_list = NULL; - list_del_init(&pin->list); - - /* - * Unpinning a journal entry make make journal_next_bucket() succeed, if - * writing a new last_seq will now make another bucket available: - */ - if (atomic_dec_and_test(&pin_list->count) && - pin_list == &fifo_peek_front(&j->pin)) - journal_reclaim_fast(j); -} - -void bch2_journal_pin_drop(struct journal *j, - struct journal_entry_pin *pin) -{ - spin_lock(&j->lock); - __journal_pin_drop(j, pin); - spin_unlock(&j->lock); -} - -void bch2_journal_pin_add_if_older(struct journal *j, - struct journal_entry_pin *src_pin, - struct journal_entry_pin *pin, - journal_pin_flush_fn flush_fn) -{ - spin_lock(&j->lock); - - if (journal_pin_active(src_pin) && - (!journal_pin_active(pin) || - journal_pin_seq(j, src_pin->pin_list) < - journal_pin_seq(j, pin->pin_list))) { - __journal_pin_drop(j, pin); - __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); - } - - spin_unlock(&j->lock); -} - -static struct journal_entry_pin * -__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) -{ - struct journal_entry_pin_list *pin_list; - struct journal_entry_pin *ret; - u64 iter; - - /* no need to iterate over empty fifo entries: */ - journal_reclaim_fast(j); - - fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { - if (iter > seq_to_flush) - break; - - ret = list_first_entry_or_null(&pin_list->list, - struct journal_entry_pin, list); - if (ret) { - /* must be list_del_init(), see bch2_journal_pin_drop() */ - list_move(&ret->list, &pin_list->flushed); - *seq = iter; - return ret; - } - } - - 
return NULL; -} - -static struct journal_entry_pin * -journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) -{ - struct journal_entry_pin *ret; - - spin_lock(&j->lock); - ret = __journal_get_next_pin(j, seq_to_flush, seq); - spin_unlock(&j->lock); - - return ret; -} - -static int journal_flush_done(struct journal *j, u64 seq_to_flush, - struct journal_entry_pin **pin, - u64 *pin_seq) -{ - int ret; - - *pin = NULL; - - ret = bch2_journal_error(j); - if (ret) - return ret; - - spin_lock(&j->lock); - /* - * If journal replay hasn't completed, the unreplayed journal entries - * hold refs on their corresponding sequence numbers - */ - ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL || - !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || - journal_last_seq(j) > seq_to_flush || - (fifo_used(&j->pin) == 1 && - atomic_read(&fifo_peek_front(&j->pin).count) == 1); - spin_unlock(&j->lock); - - return ret; -} - -int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_entry_pin *pin; - u64 pin_seq; - bool flush; - - if (!test_bit(JOURNAL_STARTED, &j->flags)) - return 0; -again: - wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq)); - if (pin) { - /* flushing a journal pin might cause a new one to be added: */ - pin->flush(j, pin, pin_seq); - goto again; - } - - spin_lock(&j->lock); - flush = journal_last_seq(j) != j->last_seq_ondisk || - (seq_to_flush == U64_MAX && c->btree_roots_dirty); - spin_unlock(&j->lock); - - return flush ? bch2_journal_meta(j) : 0; -} - -int bch2_journal_flush_all_pins(struct journal *j) -{ - return bch2_journal_flush_pins(j, U64_MAX); -} - -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -{ - bool ret; - - spin_lock(&j->lock); - ret = ja->nr && - (ja->last_idx != ja->cur_idx && - ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); - spin_unlock(&j->lock); - - return ret; -} - -/** - * journal_reclaim_work - free up journal buckets - * - * Background journal reclaim writes out btree nodes. It should be run - * early enough so that we never completely run out of journal buckets. - * - * High watermarks for triggering background reclaim: - * - FIFO has fewer than 512 entries left - * - fewer than 25% journal buckets free - * - * Background reclaim runs until low watermarks are reached: - * - FIFO has more than 1024 entries left - * - more than 50% journal buckets free - * - * As long as a reclaim can complete in the time it takes to fill up - * 512 journal entries or 25% of all journal buckets, then - * journal_next_bucket() should not stall. 
- */ -static void journal_reclaim_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(to_delayed_work(work), - struct bch_fs, journal.reclaim_work); - struct journal *j = &c->journal; - struct bch_dev *ca; - struct journal_entry_pin *pin; - u64 seq, seq_to_flush = 0; - unsigned iter, bucket_to_flush; - unsigned long next_flush; - bool reclaim_lock_held = false, need_flush; - - /* - * Advance last_idx to point to the oldest journal entry containing - * btree node updates that have not yet been written out - */ - for_each_rw_member(ca, c, iter) { - struct journal_device *ja = &ca->journal; - - if (!ja->nr) - continue; - - while (should_discard_bucket(j, ja)) { - if (!reclaim_lock_held) { - /* - * ugh: - * might be called from __journal_res_get() - * under wait_event() - have to go back to - * TASK_RUNNING before doing something that - * would block, but only if we're doing work: - */ - __set_current_state(TASK_RUNNING); - - mutex_lock(&j->reclaim_lock); - reclaim_lock_held = true; - /* recheck under reclaim_lock: */ - continue; - } - - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, - ja->buckets[ja->last_idx]), - ca->mi.bucket_size, GFP_NOIO, 0); - - spin_lock(&j->lock); - ja->last_idx = (ja->last_idx + 1) % ja->nr; - spin_unlock(&j->lock); - - journal_wake(j); - } - - /* - * Write out enough btree nodes to free up 50% journal - * buckets - */ - spin_lock(&j->lock); - bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; - seq_to_flush = max_t(u64, seq_to_flush, - ja->bucket_seq[bucket_to_flush]); - spin_unlock(&j->lock); - } - - if (reclaim_lock_held) - mutex_unlock(&j->reclaim_lock); - - /* Also flush if the pin fifo is more than half full */ - spin_lock(&j->lock); - seq_to_flush = max_t(s64, seq_to_flush, - (s64) journal_cur_seq(j) - - (j->pin.size >> 1)); - spin_unlock(&j->lock); - - /* - * If it's been longer than j->reclaim_delay_ms since we last flushed, - * make sure to flush at least one journal pin: - */ - next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); - need_flush = time_after(jiffies, next_flush); - - while ((pin = journal_get_next_pin(j, need_flush - ? U64_MAX - : seq_to_flush, &seq))) { - __set_current_state(TASK_RUNNING); - pin->flush(j, pin, seq); - need_flush = false; - - j->last_flushed = jiffies; - } - - if (!test_bit(BCH_FS_RO, &c->flags)) - queue_delayed_work(system_freezable_wq, &j->reclaim_work, - msecs_to_jiffies(j->reclaim_delay_ms)); -} - -/** - * journal_next_bucket - move on to the next journal bucket if possible - */ -static int journal_write_alloc(struct journal *j, struct journal_buf *w, - unsigned sectors) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bkey_s_extent e; - struct bch_extent_ptr *ptr; - struct journal_device *ja; - struct bch_dev *ca; - struct dev_alloc_list devs_sorted; - unsigned i, replicas, replicas_want = - READ_ONCE(c->opts.metadata_replicas); - - spin_lock(&j->lock); - e = bkey_i_to_s_extent(&j->key); - - /* - * Drop any pointers to devices that have been removed, are no longer - * empty, or filled up their current journal bucket: - * - * Note that a device may have had a small amount of free space (perhaps - * one sector) that wasn't enough for the smallest possible journal - * entry - that's why we drop pointers to devices <= current free space, - * i.e. whichever device was limiting the current journal entry size. 
- */ - extent_for_each_ptr_backwards(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - - if (ca->mi.state != BCH_MEMBER_STATE_RW || - ca->journal.sectors_free <= sectors) - __bch2_extent_drop_ptr(e, ptr); - else - ca->journal.sectors_free -= sectors; - } - - replicas = bch2_extent_nr_ptrs(e.c); - - rcu_read_lock(); - devs_sorted = bch2_wp_alloc_list(c, &j->wp, - &c->rw_devs[BCH_DATA_JOURNAL]); - - for (i = 0; i < devs_sorted.nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); - if (!ca) - continue; - - if (!ca->mi.durability) - continue; - - ja = &ca->journal; - if (!ja->nr) - continue; - - if (replicas >= replicas_want) - break; - - /* - * Check that we can use this device, and aren't already using - * it: - */ - if (bch2_extent_has_device(e.c, ca->dev_idx) || - !journal_dev_buckets_available(j, ca) || - sectors > ca->mi.bucket_size) - continue; - - j->wp.next_alloc[ca->dev_idx] += U32_MAX; - bch2_wp_rescale(c, ca, &j->wp); - - ja->sectors_free = ca->mi.bucket_size - sectors; - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - - extent_ptr_append(bkey_i_to_extent(&j->key), - (struct bch_extent_ptr) { - .offset = bucket_to_sector(ca, - ja->buckets[ja->cur_idx]), - .dev = ca->dev_idx, - }); - - replicas += ca->mi.durability; - } - rcu_read_unlock(); - - j->prev_buf_sectors = 0; - - bkey_copy(&w->key, &j->key); - spin_unlock(&j->lock); - - if (replicas < c->opts.metadata_replicas_required) - return -EROFS; - - BUG_ON(!replicas); - - return 0; -} - -static void journal_write_compact(struct jset *jset) -{ - struct jset_entry *i, *next, *prev = NULL; - - /* - * Simple compaction, dropping empty jset_entries (from journal - * reservations that weren't fully used) and merging jset_entries that - * can be. - * - * If we wanted to be really fancy here, we could sort all the keys in - * the jset and drop keys that were overwritten - probably not worth it: - */ - vstruct_for_each_safe(jset, i, next) { - unsigned u64s = le16_to_cpu(i->u64s); - - /* Empty entry: */ - if (!u64s) - continue; - - /* Can we merge with previous entry? */ - if (prev && - i->btree_id == prev->btree_id && - i->level == prev->level && - i->type == prev->type && - i->type == JOURNAL_ENTRY_BTREE_KEYS && - le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { - memmove_u64s_down(vstruct_next(prev), - i->_data, - u64s); - le16_add_cpu(&prev->u64s, u64s); - continue; - } - - /* Couldn't merge, move i into new position (after prev): */ - prev = prev ? vstruct_next(prev) : jset->start; - if (i != prev) - memmove_u64s_down(prev, i, jset_u64s(u64s)); - } - - prev = prev ? 
vstruct_next(prev) : jset->start; - jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -} - -static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -{ - /* we aren't holding j->lock: */ - unsigned new_size = READ_ONCE(j->buf_size_want); - void *new_buf; - - if (buf->size >= new_size) - return; - - new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); - if (!new_buf) - return; - - memcpy(new_buf, buf->data, buf->size); - kvpfree(buf->data, buf->size); - buf->data = new_buf; - buf->size = new_size; -} - -static void journal_write_done(struct closure *cl) -{ - struct journal *j = container_of(cl, struct journal, io); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_prev_buf(j); - struct bch_devs_list devs = - bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); - - if (!devs.nr) { - bch_err(c, "unable to write journal to sufficient devices"); - goto err; - } - - if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs)) - goto err; -out: - __bch2_time_stats_update(j->write_time, j->write_start_time); - - spin_lock(&j->lock); - j->last_seq_ondisk = le64_to_cpu(w->data->last_seq); - - journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs = devs; - - /* - * Updating last_seq_ondisk may let journal_reclaim_work() discard more - * buckets: - * - * Must come before signaling write completion, for - * bch2_fs_journal_stop(): - */ - mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); - - /* also must come before signalling write completion: */ - closure_debug_destroy(cl); - - BUG_ON(!j->reservations.prev_buf_unwritten); - atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, - &j->reservations.counter); - - closure_wake_up(&w->wait); - journal_wake(j); - - if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) - mod_delayed_work(system_freezable_wq, &j->write_work, 0); - spin_unlock(&j->lock); - return; -err: - bch2_fatal_error(c); - bch2_journal_halt(j); - goto out; -} - -static void journal_write_endio(struct bio *bio) -{ - struct bch_dev *ca = bio->bi_private; - struct journal *j = &ca->fs->journal; - - if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") || - bch2_meta_write_fault("journal")) { - struct journal_buf *w = journal_prev_buf(j); - unsigned long flags; - - spin_lock_irqsave(&j->err_lock, flags); - bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx); - spin_unlock_irqrestore(&j->err_lock, flags); - } - - closure_put(&j->io); - percpu_ref_put(&ca->io_ref); -} - -static void journal_write(struct closure *cl) -{ - struct journal *j = container_of(cl, struct journal, io); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct journal_buf *w = journal_prev_buf(j); - struct jset *jset; - struct bio *bio; - struct bch_extent_ptr *ptr; - unsigned i, sectors, bytes; - - journal_buf_realloc(j, w); - jset = w->data; - - j->write_start_time = local_clock(); - mutex_lock(&c->btree_root_lock); - for (i = 0; i < BTREE_ID_NR; i++) { - struct btree_root *r = &c->btree_roots[i]; - - if (r->alive) - bch2_journal_add_btree_root(w, i, &r->key, r->level); - } - c->btree_roots_dirty = false; - mutex_unlock(&c->btree_root_lock); - - journal_write_compact(jset); - - jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); - jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); - jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = cpu_to_le32(BCACHE_JSET_VERSION); - - SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); - SET_JSET_CSUM_TYPE(jset, 
bch2_meta_checksum_type(c)); - - if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && - journal_entry_validate_entries(c, jset, WRITE)) - goto err; - - bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), - jset->encrypted_start, - vstruct_end(jset) - (void *) jset->encrypted_start); - - jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), - journal_nonce(jset), jset); - - if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && - journal_entry_validate_entries(c, jset, WRITE)) - goto err; - - sectors = vstruct_sectors(jset, c->block_bits); - BUG_ON(sectors > j->prev_buf_sectors); - - bytes = vstruct_bytes(w->data); - memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); - - if (journal_write_alloc(j, w, sectors)) { - bch2_journal_halt(j); - bch_err(c, "Unable to allocate journal write"); - bch2_fatal_error(c); - continue_at(cl, journal_write_done, system_highpri_wq); - } - - /* - * XXX: we really should just disable the entire journal in nochanges - * mode - */ - if (c->opts.nochanges) - goto no_io; - - extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - if (!percpu_ref_tryget(&ca->io_ref)) { - /* XXX: fix this */ - bch_err(c, "missing device for journal write\n"); - continue; - } - - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], - sectors); - - bio = ca->journal.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_iter.bi_sector = ptr->offset; - bio->bi_iter.bi_size = sectors << 9; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - bio_set_op_attrs(bio, REQ_OP_WRITE, - REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); - bch2_bio_map(bio, jset); - - trace_journal_write(bio); - closure_bio_submit(bio, cl); - - ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq); - } - - for_each_rw_member(ca, c, i) - if (journal_flushes_device(ca) && - !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) { - percpu_ref_get(&ca->io_ref); - - bio = ca->journal.bio; - bio_reset(bio); - bio_set_dev(bio, ca->disk_sb.bdev); - bio->bi_opf = REQ_OP_FLUSH; - bio->bi_end_io = journal_write_endio; - bio->bi_private = ca; - closure_bio_submit(bio, cl); - } - -no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) - ptr->offset += sectors; - - continue_at(cl, journal_write_done, system_highpri_wq); -err: - bch2_inconsistent_error(c); - continue_at(cl, journal_write_done, system_highpri_wq); -} - -/* - * returns true if there's nothing to flush and no journal write still in flight - */ -static bool journal_flush_write(struct journal *j) -{ - bool ret; - - spin_lock(&j->lock); - ret = !j->reservations.prev_buf_unwritten; - - if (!journal_entry_is_open(j)) { - spin_unlock(&j->lock); - return ret; - } - - set_bit(JOURNAL_NEED_WRITE, &j->flags); - if (journal_buf_switch(j, false) == JOURNAL_UNLOCKED) - ret = false; - else - spin_unlock(&j->lock); - return ret; -} - -static void journal_write_work(struct work_struct *work) -{ - struct journal *j = container_of(work, struct journal, write_work.work); - - journal_flush_write(j); -} - -/* - * Given an inode number, if that inode number has data in the journal that - * hasn't yet been flushed, return the journal sequence number that needs to be - * flushed: - */ -u64 bch2_inode_journal_seq(struct journal *j, u64 inode) -{ - size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); - u64 seq = 0; - - if (!test_bit(h, j->buf[0].has_inode) && - !test_bit(h, j->buf[1].has_inode)) - return 0; - - spin_lock(&j->lock); - if (test_bit(h, 
journal_cur_buf(j)->has_inode)) - seq = journal_cur_seq(j); - else if (test_bit(h, journal_prev_buf(j)->has_inode)) - seq = journal_cur_seq(j) - 1; - spin_unlock(&j->lock); - - return seq; -} - -static int __journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *buf; - int ret; -retry: - ret = journal_res_get_fast(j, res, u64s_min, u64s_max); - if (ret) - return ret; - - spin_lock(&j->lock); - /* - * Recheck after taking the lock, so we don't race with another thread - * that just did journal_entry_open() and call journal_entry_close() - * unnecessarily - */ - ret = journal_res_get_fast(j, res, u64s_min, u64s_max); - if (ret) { - spin_unlock(&j->lock); - return 1; - } - - /* - * If we couldn't get a reservation because the current buf filled up, - * and we had room for a bigger entry on disk, signal that we want to - * realloc the journal bufs: - */ - buf = journal_cur_buf(j); - if (journal_entry_is_open(j) && - buf->size >> 9 < buf->disk_sectors && - buf->size < JOURNAL_ENTRY_SIZE_MAX) - j->buf_size_want = max(j->buf_size_want, buf->size << 1); - - /* - * Close the current journal entry if necessary, then try to start a new - * one: - */ - switch (journal_buf_switch(j, false)) { - case JOURNAL_ENTRY_ERROR: - spin_unlock(&j->lock); - return -EROFS; - case JOURNAL_ENTRY_INUSE: - /* haven't finished writing out the previous one: */ - spin_unlock(&j->lock); - trace_journal_entry_full(c); - goto blocked; - case JOURNAL_ENTRY_CLOSED: - break; - case JOURNAL_UNLOCKED: - goto retry; - } - - /* We now have a new, closed journal buf - see if we can open it: */ - ret = journal_entry_open(j); - spin_unlock(&j->lock); - - if (ret < 0) - return ret; - if (ret) - goto retry; - - /* Journal's full, we have to wait */ - - /* - * Direct reclaim - can't rely on reclaim from work item - * due to freezing.. - */ - journal_reclaim_work(&j->reclaim_work.work); - - trace_journal_full(c); -blocked: - if (!j->res_get_blocked_start) - j->res_get_blocked_start = local_clock() ?: 1; - return 0; -} - -/* - * Essentially the entry function to the journaling code. When bcachefs is doing - * a btree insert, it calls this function to get the current journal write. - * Journal write is the structure used set up journal writes. The calling - * function will then add its keys to the structure, queuing them for the next - * write. - * - * To ensure forward progress, the current task must not be holding any - * btree node write locks. - */ -int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) -{ - int ret; - - wait_event(j->wait, - (ret = __journal_res_get(j, res, u64s_min, - u64s_max))); - return ret < 0 ? 
ret : 0; -} - -u64 bch2_journal_last_unwritten_seq(struct journal *j) -{ - u64 seq; - - spin_lock(&j->lock); - seq = journal_cur_seq(j); - if (j->reservations.prev_buf_unwritten) - seq--; - spin_unlock(&j->lock); - - return seq; -} - -int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent) -{ - int ret; - - spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); - - if (seq < journal_cur_seq(j) || - journal_entry_is_open(j)) { - spin_unlock(&j->lock); - return 1; - } - - ret = journal_entry_open(j); - if (!ret) - closure_wait(&j->async_wait, parent); - spin_unlock(&j->lock); - - if (!ret) - journal_reclaim_work(&j->reclaim_work.work); - - return ret; -} - -void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent) -{ - spin_lock(&j->lock); - - BUG_ON(seq > journal_cur_seq(j)); - - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); - return; - } - - if (seq == journal_cur_seq(j)) { - if (!closure_wait(&journal_cur_buf(j)->wait, parent)) - BUG(); - } else if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - if (!closure_wait(&journal_prev_buf(j)->wait, parent)) - BUG(); - - smp_mb(); - - /* check if raced with write completion (or failure) */ - if (!j->reservations.prev_buf_unwritten || - bch2_journal_error(j)) - closure_wake_up(&journal_prev_buf(j)->wait); - } - - spin_unlock(&j->lock); -} - -void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) -{ - struct journal_buf *buf; - - spin_lock(&j->lock); - - BUG_ON(seq > journal_cur_seq(j)); - - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); - return; - } - - if (seq == journal_cur_seq(j)) { - bool set_need_write = false; - - buf = journal_cur_buf(j); - - if (parent && !closure_wait(&buf->wait, parent)) - BUG(); - - if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { - j->need_write_time = local_clock(); - set_need_write = true; - } - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_ERROR: - if (parent) - closure_wake_up(&buf->wait); - break; - case JOURNAL_ENTRY_CLOSED: - /* - * Journal entry hasn't been opened yet, but caller - * claims it has something - */ - BUG(); - case JOURNAL_ENTRY_INUSE: - break; - case JOURNAL_UNLOCKED: - return; - } - } else if (parent && - seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - buf = journal_prev_buf(j); - - if (!closure_wait(&buf->wait, parent)) - BUG(); - - smp_mb(); - - /* check if raced with write completion (or failure) */ - if (!j->reservations.prev_buf_unwritten || - bch2_journal_error(j)) - closure_wake_up(&buf->wait); - } - - spin_unlock(&j->lock); -} - -static int journal_seq_flushed(struct journal *j, u64 seq) -{ - struct journal_buf *buf; - int ret = 1; - - spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); - - if (seq == journal_cur_seq(j)) { - bool set_need_write = false; - - ret = 0; - - buf = journal_cur_buf(j); - - if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { - j->need_write_time = local_clock(); - set_need_write = true; - } - - switch (journal_buf_switch(j, set_need_write)) { - case JOURNAL_ENTRY_ERROR: - ret = -EIO; - break; - case JOURNAL_ENTRY_CLOSED: - /* - * Journal entry hasn't been opened yet, but caller - * claims it has something - */ - BUG(); - case JOURNAL_ENTRY_INUSE: - break; - case JOURNAL_UNLOCKED: - return 0; - } - } else if (seq + 1 == journal_cur_seq(j) && - j->reservations.prev_buf_unwritten) { - ret = bch2_journal_error(j); - } - - spin_unlock(&j->lock); - - return ret; -} - -int 
bch2_journal_flush_seq(struct journal *j, u64 seq) -{ - u64 start_time = local_clock(); - int ret, ret2; - - ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); - - bch2_time_stats_update(j->flush_seq_time, start_time); - - return ret ?: ret2 < 0 ? ret2 : 0; -} - -void bch2_journal_meta_async(struct journal *j, struct closure *parent) -{ - struct journal_res res; - unsigned u64s = jset_u64s(0); - - memset(&res, 0, sizeof(res)); - - bch2_journal_res_get(j, &res, u64s, u64s); - bch2_journal_res_put(j, &res); - - bch2_journal_flush_seq_async(j, res.seq, parent); -} - -int bch2_journal_meta(struct journal *j) -{ - struct journal_res res; - unsigned u64s = jset_u64s(0); - int ret; - - memset(&res, 0, sizeof(res)); - - ret = bch2_journal_res_get(j, &res, u64s, u64s); - if (ret) - return ret; - - bch2_journal_res_put(j, &res); - - return bch2_journal_flush_seq(j, res.seq); -} - -void bch2_journal_flush_async(struct journal *j, struct closure *parent) -{ - u64 seq, journal_seq; - - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - - if (journal_entry_is_open(j)) { - seq = journal_seq; - } else if (journal_seq) { - seq = journal_seq - 1; - } else { - spin_unlock(&j->lock); - return; - } - spin_unlock(&j->lock); - - bch2_journal_flush_seq_async(j, seq, parent); -} - -int bch2_journal_flush(struct journal *j) -{ - u64 seq, journal_seq; - - spin_lock(&j->lock); - journal_seq = journal_cur_seq(j); - - if (journal_entry_is_open(j)) { - seq = journal_seq; - } else if (journal_seq) { - seq = journal_seq - 1; - } else { - spin_unlock(&j->lock); - return 0; - } - spin_unlock(&j->lock); - - return bch2_journal_flush_seq(j, seq); -} - -int bch2_journal_flush_device(struct journal *j, int dev_idx) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_entry_pin_list *p; - struct bch_devs_list devs; - u64 iter, seq = 0; - int ret = 0; - - spin_lock(&j->lock); - fifo_for_each_entry_ptr(p, &j->pin, iter) - if (dev_idx >= 0 - ? 
bch2_dev_list_has_dev(p->devs, dev_idx) - : p->devs.nr < c->opts.metadata_replicas) - seq = iter; - spin_unlock(&j->lock); - - ret = bch2_journal_flush_pins(j, seq); - if (ret) - return ret; - - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); - - seq = 0; - - spin_lock(&j->lock); - while (!ret && seq < j->pin.back) { - seq = max(seq, journal_last_seq(j)); - devs = journal_seq_pin(j, seq)->devs; - seq++; - - spin_unlock(&j->lock); - ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs); - spin_lock(&j->lock); - } - spin_unlock(&j->lock); - - bch2_replicas_gc_end(c, ret); - mutex_unlock(&c->replicas_gc_lock); - - return ret; -} - /* startup/shutdown: */ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) @@ -2936,6 +883,43 @@ void bch2_fs_journal_stop(struct journal *j) cancel_delayed_work_sync(&j->reclaim_work); } +void bch2_fs_journal_start(struct journal *j) +{ + struct journal_seq_blacklist *bl; + u64 blacklist = 0; + + list_for_each_entry(bl, &j->seq_blacklist, list) + blacklist = max(blacklist, bl->end); + + spin_lock(&j->lock); + + set_bit(JOURNAL_STARTED, &j->flags); + + while (journal_cur_seq(j) < blacklist) + journal_pin_new_entry(j, 0); + + /* + * journal_buf_switch() only inits the next journal entry when it + * closes an open journal entry - the very first journal entry gets + * initialized here: + */ + journal_pin_new_entry(j, 1); + bch2_journal_buf_init(j); + + spin_unlock(&j->lock); + + /* + * Adding entries to the next journal entry before allocating space on + * disk for the next journal entry - this is ok, because these entries + * only have to go down with the next journal entry we write: + */ + bch2_journal_seq_blacklist_write(j); + + queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0); +} + +/* init/exit: */ + void bch2_dev_journal_exit(struct bch_dev *ca) { kfree(ca->journal.bio); @@ -2994,7 +978,7 @@ int bch2_fs_journal_init(struct journal *j) spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); INIT_DELAYED_WORK(&j->write_work, journal_write_work); - INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work); + INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); mutex_init(&j->blacklist_lock); INIT_LIST_HEAD(&j->seq_blacklist); mutex_init(&j->reclaim_lock); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index cf5cc9ba..4cec7bb5 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -112,72 +112,37 @@ #include "journal_types.h" -/* - * Only used for holding the journal entries we read in btree_journal_read() - * during cache_registration - */ -struct journal_replay { - struct list_head list; - struct bch_devs_list devs; - /* must be last: */ - struct jset j; -}; - -static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, - struct jset_entry *entry, unsigned type) -{ - while (entry < vstruct_last(jset)) { - if (entry->type == type) - return entry; - - entry = vstruct_next(entry); - } - - return NULL; -} - -#define for_each_jset_entry_type(entry, jset, type) \ - for (entry = (jset)->start; \ - (entry = __jset_entry_type_next(jset, entry, type)); \ - entry = vstruct_next(entry)) - -#define for_each_jset_key(k, _n, entry, jset) \ - for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \ - vstruct_for_each_safe(entry, k, _n) - -#define JOURNAL_PIN (32 * 1024) - -static inline bool journal_pin_active(struct journal_entry_pin *pin) -{ - return pin->pin_list != NULL; -} - -static inline struct journal_entry_pin_list * 
-journal_seq_pin(struct journal *j, u64 seq) -{ - return &j->pin.data[seq & j->pin.mask]; -} - -u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *); - -void bch2_journal_pin_add(struct journal *, struct journal_res *, - struct journal_entry_pin *, journal_pin_flush_fn); -void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); -void bch2_journal_pin_add_if_older(struct journal *, - struct journal_entry_pin *, - struct journal_entry_pin *, - journal_pin_flush_fn); -int bch2_journal_flush_pins(struct journal *, u64); -int bch2_journal_flush_all_pins(struct journal *); - -struct closure; struct bch_fs; -struct keylist; -struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *, - enum btree_id, unsigned *); +static inline void journal_wake(struct journal *j) +{ + wake_up(&j->wait); + closure_wake_up(&j->async_wait); +} -int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *); +static inline struct journal_buf *journal_cur_buf(struct journal *j) +{ + return j->buf + j->reservations.idx; +} + +static inline struct journal_buf *journal_prev_buf(struct journal *j) +{ + return j->buf + !j->reservations.idx; +} + +/* Sequence number of oldest dirty journal entry */ + +static inline u64 journal_last_seq(struct journal *j) +{ + return j->pin.front; +} + +static inline u64 journal_cur_seq(struct journal *j) +{ + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + + return j->pin.back - 1; +} u64 bch2_inode_journal_seq(struct journal *, u64); @@ -213,21 +178,18 @@ static inline unsigned jset_u64s(unsigned u64s) return u64s + sizeof(struct jset_entry) / sizeof(u64); } -static inline void bch2_journal_add_entry_at(struct journal_buf *buf, - unsigned offset, - unsigned type, enum btree_id id, - unsigned level, - const void *data, size_t u64s) +static inline struct jset_entry * +bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) { - struct jset_entry *entry = vstruct_idx(buf->data, offset); + struct jset *jset = buf->data; + struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); memset(entry, 0, sizeof(*entry)); - entry->u64s = cpu_to_le16(u64s); - entry->btree_id = id; - entry->level = level; - entry->type = type; + entry->u64s = cpu_to_le16(u64s); - memcpy_u64s(entry->_data, data, u64s); + le32_add_cpu(&jset->u64s, jset_u64s(u64s)); + + return entry; } static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, @@ -236,21 +198,27 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res const void *data, unsigned u64s) { struct journal_buf *buf = &j->buf[res->idx]; + struct jset_entry *entry = vstruct_idx(buf->data, res->offset); unsigned actual = jset_u64s(u64s); EBUG_ON(!res->ref); EBUG_ON(actual > res->u64s); - bch2_journal_add_entry_at(buf, res->offset, type, - id, level, data, u64s); res->offset += actual; res->u64s -= actual; + + memset(entry, 0, sizeof(*entry)); + entry->u64s = cpu_to_le16(u64s); + entry->type = type; + entry->btree_id = id; + entry->level = level; + memcpy_u64s(entry->_data, data, u64s); } static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, enum btree_id id, const struct bkey_i *k) { - bch2_journal_add_entry(j, res, JOURNAL_ENTRY_BTREE_KEYS, + bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, id, 0, k, k->k.u64s); } @@ -292,7 +260,7 @@ static inline void bch2_journal_res_put(struct journal *j, while (res->u64s) bch2_journal_add_entry(j, res, - JOURNAL_ENTRY_BTREE_KEYS, + 
BCH_JSET_ENTRY_btree_keys, 0, 0, NULL, 0); bch2_journal_buf_put(j, res->idx, false); @@ -368,7 +336,6 @@ void bch2_journal_meta_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, u64); int bch2_journal_flush(struct journal *); int bch2_journal_meta(struct journal *); -int bch2_journal_flush_device(struct journal *, int); void bch2_journal_halt(struct journal *); @@ -385,10 +352,8 @@ static inline bool journal_flushes_device(struct bch_dev *ca) return true; } -void bch2_journal_start(struct bch_fs *); int bch2_journal_mark(struct bch_fs *, struct list_head *); void bch2_journal_entries_free(struct list_head *); -int bch2_journal_read(struct bch_fs *, struct list_head *); int bch2_journal_replay(struct bch_fs *, struct list_head *); static inline void bch2_journal_set_replay_done(struct journal *j) @@ -404,6 +369,7 @@ int bch2_dev_journal_alloc(struct bch_dev *); void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); +void bch2_fs_journal_start(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct journal *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c new file mode 100644 index 00000000..2fd0d646 --- /dev/null +++ b/libbcachefs/journal_io.c @@ -0,0 +1,1423 @@ +#include "bcachefs.h" +#include "alloc.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "replicas.h" + +#include + +static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type, + enum btree_id id) +{ + struct jset_entry *entry; + + for_each_jset_entry_type(entry, j, type) + if (entry->btree_id == id) + return entry; + + return NULL; +} + +struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry = + bch2_journal_find_entry(j, BCH_JSET_ENTRY_btree_root, id); + + if (!entry) + return NULL; + + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + *level = entry->level; + return k; +} + +struct journal_list { + struct closure cl; + struct mutex lock; + struct list_head *head; + int ret; +}; + +#define JOURNAL_ENTRY_ADD_OK 0 +#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 + +/* + * Given a journal entry we just read, add it to the list of journal entries to + * be replayed: + */ +static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct journal_list *jlist, struct jset *j) +{ + struct journal_replay *i, *pos; + struct list_head *where; + size_t bytes = vstruct_bytes(j); + __le64 last_seq; + int ret; + + last_seq = !list_empty(jlist->head) + ? list_last_entry(jlist->head, struct journal_replay, + list)->j.last_seq + : 0; + + /* Is this entry older than the range we need? */ + if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { + ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + goto out; + } + + /* Drop entries we don't need anymore */ + list_for_each_entry_safe(i, pos, jlist->head, list) { + if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) + break; + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + } + + list_for_each_entry_reverse(i, jlist->head, list) { + /* Duplicate? 
*/ + if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { + fsck_err_on(bytes != vstruct_bytes(&i->j) || + memcmp(j, &i->j, bytes), c, + "found duplicate but non identical journal entries (seq %llu)", + le64_to_cpu(j->seq)); + goto found; + } + + if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { + where = &i->list; + goto add; + } + } + + where = jlist->head; +add: + i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + if (!i) { + ret = -ENOMEM; + goto out; + } + + list_add(&i->list, where); + i->devs.nr = 0; + memcpy(&i->j, j, bytes); +found: + if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) + bch2_dev_list_add_dev(&i->devs, ca->dev_idx); + else + fsck_err_on(1, c, "duplicate journal entries on same device"); + ret = JOURNAL_ENTRY_ADD_OK; +out: +fsck_err: + return ret; +} + +static struct nonce journal_nonce(const struct jset *jset) +{ + return (struct nonce) {{ + [0] = 0, + [1] = ((__le32 *) &jset->seq)[0], + [2] = ((__le32 *) &jset->seq)[1], + [3] = BCH_NONCE_JOURNAL, + }}; +} + +/* this fills in a range with empty jset_entries: */ +static void journal_entry_null_range(void *start, void *end) +{ + struct jset_entry *entry; + + for (entry = start; entry != end; entry = vstruct_next(entry)) + memset(entry, 0, sizeof(*entry)); +} + +#define JOURNAL_ENTRY_REREAD 5 +#define JOURNAL_ENTRY_NONE 6 +#define JOURNAL_ENTRY_BAD 7 + +#define journal_entry_err(c, msg, ...) \ +({ \ + switch (write) { \ + case READ: \ + mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ + break; \ + case WRITE: \ + bch_err(c, "corrupt metadata before write:\n" \ + msg, ##__VA_ARGS__); \ + if (bch2_fs_inconsistent(c)) { \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + } \ + break; \ + } \ + true; \ +}) + +#define journal_entry_err_on(cond, c, msg, ...) \ + ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) + +static int journal_validate_key(struct bch_fs *c, struct jset *jset, + struct jset_entry *entry, + struct bkey_i *k, enum bkey_type key_type, + const char *type, int write) +{ + void *next = vstruct_next(entry); + const char *invalid; + char buf[160]; + int ret = 0; + + if (journal_entry_err_on(!k->k.u64s, c, + "invalid %s in journal: k->u64s 0", type)) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (journal_entry_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), c, + "invalid %s in journal: extends past end of journal entry", + type)) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, + "invalid %s in journal: bad format %u", + type, k->k.format)) { + le16_add_cpu(&entry->u64s, -k->k.u64s); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN) + bch2_bkey_swab(key_type, NULL, bkey_to_packed(k)); + + invalid = bch2_bkey_invalid(c, key_type, bkey_i_to_s_c(k)); + if (invalid) { + bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf), + bkey_i_to_s_c(k)); + mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", + type, invalid, buf); + + le16_add_cpu(&entry->u64s, -k->k.u64s); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } +fsck_err: + return ret; +} + +static int journal_entry_validate_btree_keys(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct bkey_i *k; + + vstruct_for_each(entry, k) { + int ret = journal_validate_key(c, jset, entry, k, + bkey_type(entry->level, + entry->btree_id), + "key", write); + if (ret) + return ret; + } + + return 0; +} + +static int journal_entry_validate_btree_root(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct bkey_i *k = entry->start; + int ret = 0; + + if (journal_entry_err_on(!entry->u64s || + le16_to_cpu(entry->u64s) != k->k.u64s, c, + "invalid btree root journal entry: wrong number of keys")) { + void *next = vstruct_next(entry); + /* + * we don't want to null out this jset_entry, + * just the contents, so that later we can tell + * we were _supposed_ to have a btree root + */ + entry->u64s = 0; + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE, + "btree root", write); +fsck_err: + return ret; +} + +static int journal_entry_validate_prio_ptrs(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + /* obsolete, don't care: */ + return 0; +} + +static int journal_entry_validate_blacklist(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } +fsck_err: + return ret; +} + +static int journal_entry_validate_blacklist_v2(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_blacklist_v2 *bl_entry; + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, + "invalid journal seq blacklist 
entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } + + bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); + + if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > + le64_to_cpu(bl_entry->end), c, + "invalid journal seq blacklist entry: start > end")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } + +fsck_err: + return ret; +} + +struct jset_entry_ops { + int (*validate)(struct bch_fs *, struct jset *, + struct jset_entry *, int); +}; + +const struct jset_entry_ops bch2_jset_entry_ops[] = { +#define x(f, nr) \ + [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ + .validate = journal_entry_validate_##f, \ + }, + BCH_JSET_ENTRY_TYPES() +#undef x +}; + +static int journal_entry_validate(struct bch_fs *c, struct jset *jset, + struct jset_entry *entry, int write) +{ + int ret = 0; + + if (entry->type >= BCH_JSET_ENTRY_NR) { + journal_entry_err(c, "invalid journal entry type %u", + entry->type); + journal_entry_null_range(entry, vstruct_next(entry)); + return 0; + } + + ret = bch2_jset_entry_ops[entry->type].validate(c, jset, entry, write); +fsck_err: + return ret; +} + +static int jset_validate_entries(struct bch_fs *c, struct jset *jset, + int write) +{ + struct jset_entry *entry; + int ret = 0; + + vstruct_for_each(jset, entry) { + if (journal_entry_err_on(vstruct_next(entry) > + vstruct_last(jset), c, + "journal entry extends past end of jset")) { + jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); + break; + } + + ret = journal_entry_validate(c, jset, entry, write); + if (ret) + break; + } +fsck_err: + return ret; +} + +static int jset_validate(struct bch_fs *c, + struct jset *jset, u64 sector, + unsigned bucket_sectors_left, + unsigned sectors_read, + int write) +{ + size_t bytes = vstruct_bytes(jset); + struct bch_csum csum; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + + if (le32_to_cpu(jset->version) != BCACHE_JSET_VERSION) { + bch_err(c, "unknown journal entry version %u", + le32_to_cpu(jset->version)); + return BCH_FSCK_UNKNOWN_VERSION; + } + + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, + "journal entry too big (%zu bytes), sector %lluu", + bytes, sector)) { + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; + } + + if (bytes > sectors_read << 9) + return JOURNAL_ENTRY_REREAD; + + if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, + "journal entry with unknown csum type %llu sector %lluu", + JSET_CSUM_TYPE(jset), sector)) + return JOURNAL_ENTRY_BAD; + + csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); + if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, + "journal checksum bad, sector %llu", sector)) { + /* XXX: retry IO, when we start retrying checksum errors */ + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; + } + + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); + + if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, + "invalid journal entry: last_seq > seq")) + jset->last_seq = jset->seq; + + return 0; +fsck_err: + return ret; +} + +struct journal_read_buf { + void *data; + size_t size; +}; + +static int journal_read_buf_realloc(struct journal_read_buf *b, + size_t new_size) +{ + void *n; + + /* the bios are sized for this many pages, max: */ + if (new_size > JOURNAL_ENTRY_SIZE_MAX) + return -ENOMEM; + 
+ new_size = roundup_pow_of_two(new_size); + n = kvpmalloc(new_size, GFP_KERNEL); + if (!n) + return -ENOMEM; + + kvpfree(b->data, b->size); + b->data = n; + b->size = new_size; + return 0; +} + +static int journal_read_bucket(struct bch_dev *ca, + struct journal_read_buf *buf, + struct journal_list *jlist, + unsigned bucket, u64 *seq, bool *entries_found) +{ + struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; + struct bio *bio = ja->bio; + struct jset *j = NULL; + unsigned sectors, sectors_read = 0; + u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), + end = offset + ca->mi.bucket_size; + bool saw_bad = false; + int ret = 0; + + pr_debug("reading %u", bucket); + + while (offset < end) { + if (!sectors_read) { +reread: sectors_read = min_t(unsigned, + end - offset, buf->size >> 9); + + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_iter.bi_sector = offset; + bio->bi_iter.bi_size = sectors_read << 9; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + bch2_bio_map(bio, buf->data); + + ret = submit_bio_wait(bio); + + if (bch2_dev_io_err_on(ret, ca, + "journal read from sector %llu", + offset) || + bch2_meta_read_fault("journal")) + return -EIO; + + j = buf->data; + } + + ret = jset_validate(c, j, offset, + end - offset, sectors_read, + READ); + switch (ret) { + case BCH_FSCK_OK: + break; + case JOURNAL_ENTRY_REREAD: + if (vstruct_bytes(j) > buf->size) { + ret = journal_read_buf_realloc(buf, + vstruct_bytes(j)); + if (ret) + return ret; + } + goto reread; + case JOURNAL_ENTRY_NONE: + if (!saw_bad) + return 0; + sectors = c->opts.block_size; + goto next_block; + case JOURNAL_ENTRY_BAD: + saw_bad = true; + sectors = c->opts.block_size; + goto next_block; + default: + return ret; + } + + /* + * This happens sometimes if we don't have discards on - + * when we've partially overwritten a bucket with new + * journal entries. 
We don't need the rest of the + * bucket: + */ + if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) + return 0; + + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + + mutex_lock(&jlist->lock); + ret = journal_entry_add(c, ca, jlist, j); + mutex_unlock(&jlist->lock); + + switch (ret) { + case JOURNAL_ENTRY_ADD_OK: + *entries_found = true; + break; + case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: + break; + default: + return ret; + } + + if (le64_to_cpu(j->seq) > *seq) + *seq = le64_to_cpu(j->seq); + + sectors = vstruct_sectors(j, c->block_bits); +next_block: + pr_debug("next"); + offset += sectors; + sectors_read -= sectors; + j = ((void *) j) + (sectors << 9); + } + + return 0; +} + +static void bch2_journal_read_device(struct closure *cl) +{ +#define read_bucket(b) \ + ({ \ + bool entries_found = false; \ + ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \ + &entries_found); \ + if (ret) \ + goto err; \ + __set_bit(b, bitmap); \ + entries_found; \ + }) + + struct journal_device *ja = + container_of(cl, struct journal_device, read); + struct bch_dev *ca = container_of(ja, struct bch_dev, journal); + struct journal_list *jlist = + container_of(cl->parent, struct journal_list, cl); + struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); + struct journal_read_buf buf = { NULL, 0 }; + + DECLARE_BITMAP(bitmap, ja->nr); + unsigned i, l, r; + u64 seq = 0; + int ret; + + if (!ja->nr) + goto out; + + bitmap_zero(bitmap, ja->nr); + ret = journal_read_buf_realloc(&buf, PAGE_SIZE); + if (ret) + goto err; + + pr_debug("%u journal buckets", ja->nr); + + /* + * If the device supports discard but not secure discard, we can't do + * the fancy fibonacci hash/binary search because the live journal + * entries might not form a contiguous range: + */ + for (i = 0; i < ja->nr; i++) + read_bucket(i); + goto search_done; + + if (!blk_queue_nonrot(q)) + goto linear_scan; + + /* + * Read journal buckets ordered by golden ratio hash to quickly + * find a sequence of buckets with valid journal entries + */ + for (i = 0; i < ja->nr; i++) { + l = (i * 2654435769U) % ja->nr; + + if (test_bit(l, bitmap)) + break; + + if (read_bucket(l)) + goto bsearch; + } + + /* + * If that fails, check all the buckets we haven't checked + * already + */ + pr_debug("falling back to linear search"); +linear_scan: + for (l = find_first_zero_bit(bitmap, ja->nr); + l < ja->nr; + l = find_next_zero_bit(bitmap, ja->nr, l + 1)) + if (read_bucket(l)) + goto bsearch; + + /* no journal entries on this device? */ + if (l == ja->nr) + goto out; +bsearch: + /* Binary search */ + r = find_next_bit(bitmap, ja->nr, l + 1); + pr_debug("starting binary search, l %u r %u", l, r); + + while (l + 1 < r) { + unsigned m = (l + r) >> 1; + u64 cur_seq = seq; + + read_bucket(m); + + if (cur_seq != seq) + l = m; + else + r = m; + } + +search_done: + /* + * Find the journal bucket with the highest sequence number: + * + * If there's duplicate journal entries in multiple buckets (which + * definitely isn't supposed to happen, but...) 
- make sure to start + * cur_idx at the last of those buckets, so we don't deadlock trying to + * allocate + */ + seq = 0; + + for (i = 0; i < ja->nr; i++) + if (ja->bucket_seq[i] >= seq && + ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) { + /* + * When journal_next_bucket() goes to allocate for + * the first time, it'll use the bucket after + * ja->cur_idx + */ + ja->cur_idx = i; + seq = ja->bucket_seq[i]; + } + + /* + * Set last_idx to indicate the entire journal is full and needs to be + * reclaimed - journal reclaim will immediately reclaim whatever isn't + * pinned when it first runs: + */ + ja->last_idx = (ja->cur_idx + 1) % ja->nr; + + /* + * Read buckets in reverse order until we stop finding more journal + * entries: + */ + for (i = (ja->cur_idx + ja->nr - 1) % ja->nr; + i != ja->cur_idx; + i = (i + ja->nr - 1) % ja->nr) + if (!test_bit(i, bitmap) && + !read_bucket(i)) + break; +out: + kvpfree(buf.data, buf.size); + percpu_ref_put(&ca->io_ref); + closure_return(cl); +err: + mutex_lock(&jlist->lock); + jlist->ret = ret; + mutex_unlock(&jlist->lock); + goto out; +#undef read_bucket +} + +void bch2_journal_entries_free(struct list_head *list) +{ + + while (!list_empty(list)) { + struct journal_replay *i = + list_first_entry(list, struct journal_replay, list); + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + } +} + +static inline bool journal_has_keys(struct list_head *list) +{ + struct journal_replay *i; + struct jset_entry *entry; + struct bkey_i *k, *_n; + + list_for_each_entry(i, list, list) + for_each_jset_key(k, _n, entry, &i->j) + return true; + + return false; +} + +int bch2_journal_read(struct bch_fs *c, struct list_head *list) +{ + struct journal *j = &c->journal; + struct journal_list jlist; + struct journal_replay *i; + struct journal_entry_pin_list *p; + struct bch_dev *ca; + u64 cur_seq, end_seq, seq; + unsigned iter, keys = 0, entries = 0; + size_t nr; + bool degraded = false; + int ret = 0; + + closure_init_stack(&jlist.cl); + mutex_init(&jlist.lock); + jlist.head = list; + jlist.ret = 0; + + for_each_member_device(ca, c, iter) { + if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) + continue; + + if ((ca->mi.state == BCH_MEMBER_STATE_RW || + ca->mi.state == BCH_MEMBER_STATE_RO) && + percpu_ref_tryget(&ca->io_ref)) + closure_call(&ca->journal.read, + bch2_journal_read_device, + system_unbound_wq, + &jlist.cl); + else + degraded = true; + } + + closure_sync(&jlist.cl); + + if (jlist.ret) + return jlist.ret; + + if (list_empty(list)){ + bch_err(c, "no journal entries found"); + return BCH_FSCK_REPAIR_IMPOSSIBLE; + } + + fsck_err_on(c->sb.clean && journal_has_keys(list), c, + "filesystem marked clean but journal has keys to replay"); + + list_for_each_entry(i, list, list) { + ret = jset_validate_entries(c, &i->j, READ); + if (ret) + goto fsck_err; + + /* + * If we're mounting in degraded mode - if we didn't read all + * the devices - this is wrong: + */ + + if (!degraded && + (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL, + i->devs), c, + "superblock not marked as containing replicas (type %u)", + BCH_DATA_JOURNAL))) { + ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs); + if (ret) + return ret; + } + } + + i = list_last_entry(list, struct journal_replay, list); + + nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1; + + if (nr > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); + if 
(!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%zu open entries)", nr); + return -ENOMEM; + } + } + + atomic64_set(&j->seq, le64_to_cpu(i->j.seq)); + j->last_seq_ondisk = le64_to_cpu(i->j.last_seq); + + j->pin.front = le64_to_cpu(i->j.last_seq); + j->pin.back = le64_to_cpu(i->j.seq) + 1; + + fifo_for_each_entry_ptr(p, &j->pin, seq) { + INIT_LIST_HEAD(&p->list); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, 0); + p->devs.nr = 0; + } + + mutex_lock(&j->blacklist_lock); + + list_for_each_entry(i, list, list) { + p = journal_seq_pin(j, le64_to_cpu(i->j.seq)); + + atomic_set(&p->count, 1); + p->devs = i->devs; + + if (bch2_journal_seq_blacklist_read(j, i)) { + mutex_unlock(&j->blacklist_lock); + return -ENOMEM; + } + } + + mutex_unlock(&j->blacklist_lock); + + cur_seq = journal_last_seq(j); + end_seq = le64_to_cpu(list_last_entry(list, + struct journal_replay, list)->j.seq); + + list_for_each_entry(i, list, list) { + struct jset_entry *entry; + struct bkey_i *k, *_n; + bool blacklisted; + + mutex_lock(&j->blacklist_lock); + while (cur_seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_blacklist_find(j, cur_seq)) + cur_seq++; + + blacklisted = bch2_journal_seq_blacklist_find(j, + le64_to_cpu(i->j.seq)); + mutex_unlock(&j->blacklist_lock); + + fsck_err_on(blacklisted, c, + "found blacklisted journal entry %llu", + le64_to_cpu(i->j.seq)); + + fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c, + "journal entries %llu-%llu missing! (replaying %llu-%llu)", + cur_seq, le64_to_cpu(i->j.seq) - 1, + journal_last_seq(j), end_seq); + + cur_seq = le64_to_cpu(i->j.seq) + 1; + + for_each_jset_key(k, _n, entry, &i->j) + keys++; + entries++; + } + + bch_info(c, "journal read done, %i keys in %i entries, seq %llu", + keys, entries, journal_cur_seq(j)); +fsck_err: + return ret; +} + +/* journal replay: */ + +int bch2_journal_mark(struct bch_fs *c, struct list_head *list) +{ + struct bkey_i *k, *n; + struct jset_entry *j; + struct journal_replay *r; + int ret; + + list_for_each_entry(r, list, list) + for_each_jset_key(k, n, j, &r->j) { + enum bkey_type type = bkey_type(j->level, j->btree_id); + struct bkey_s_c k_s_c = bkey_i_to_s_c(k); + + if (btree_type_has_ptrs(type)) { + ret = bch2_btree_mark_key_initial(c, type, k_s_c); + if (ret) + return ret; + } + } + + return 0; +} + +int bch2_journal_replay(struct bch_fs *c, struct list_head *list) +{ + struct journal *j = &c->journal; + struct journal_entry_pin_list *pin_list; + struct bkey_i *k, *_n; + struct jset_entry *entry; + struct journal_replay *i, *n; + int ret = 0; + + list_for_each_entry_safe(i, n, list, list) { + + j->replay_journal_seq = le64_to_cpu(i->j.seq); + + for_each_jset_key(k, _n, entry, &i->j) { + + if (entry->btree_id == BTREE_ID_ALLOC) { + /* + * allocation code handles replay for + * BTREE_ID_ALLOC keys: + */ + ret = bch2_alloc_replay_key(c, k->k.p); + } else { + /* + * We might cause compressed extents to be + * split, so we need to pass in a + * disk_reservation: + */ + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + + ret = bch2_btree_insert(c, entry->btree_id, k, + &disk_res, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_REPLAY); + } + + if (ret) { + bch_err(c, "journal replay: error %d while replaying key", + ret); + goto err; + } + + cond_resched(); + } + + pin_list = journal_seq_pin(j, j->replay_journal_seq); + + if (atomic_dec_and_test(&pin_list->count)) + journal_wake(j); + } + + j->replay_journal_seq = 0; + + bch2_journal_set_replay_done(j); + ret = 
bch2_journal_flush_all_pins(j); +err: + bch2_journal_entries_free(list); + return ret; +} + +/* journal write: */ + +static void bch2_journal_add_btree_root(struct journal_buf *buf, + enum btree_id id, struct bkey_i *k, + unsigned level) +{ + struct jset_entry *entry; + + entry = bch2_journal_add_entry_noreservation(buf, k->k.u64s); + entry->type = BCH_JSET_ENTRY_btree_root; + entry->btree_id = id; + entry->level = level; + memcpy_u64s(entry->_data, k, k->k.u64s); +} + +static unsigned journal_dev_buckets_available(struct journal *j, + struct bch_dev *ca) +{ + struct journal_device *ja = &ca->journal; + unsigned next = (ja->cur_idx + 1) % ja->nr; + unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; + + /* + * Hack to avoid a deadlock during journal replay: + * journal replay might require setting a new btree + * root, which requires writing another journal entry - + * thus, if the journal is full (and this happens when + * replaying the first journal bucket's entries) we're + * screwed. + * + * So don't let the journal fill up unless we're in + * replay: + */ + if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) + available = max((int) available - 2, 0); + + /* + * Don't use the last bucket unless writing the new last_seq + * will make another bucket available: + */ + if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j)) + available = max((int) available - 1, 0); + + return available; +} + +/* returns number of sectors available for next journal entry: */ +int bch2_journal_entry_sectors(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); + unsigned sectors_available = UINT_MAX; + unsigned i, nr_online = 0, nr_devs = 0; + + lockdep_assert_held(&j->lock); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_JOURNAL]) { + struct journal_device *ja = &ca->journal; + unsigned buckets_required = 0; + + if (!ja->nr) + continue; + + sectors_available = min_t(unsigned, sectors_available, + ca->mi.bucket_size); + + /* + * Note that we don't allocate the space for a journal entry + * until we write it out - thus, if we haven't started the write + * for the previous entry we have to make sure we have space for + * it too: + */ + if (bch2_extent_has_device(e.c, ca->dev_idx)) { + if (j->prev_buf_sectors > ja->sectors_free) + buckets_required++; + + if (j->prev_buf_sectors + sectors_available > + ja->sectors_free) + buckets_required++; + } else { + if (j->prev_buf_sectors + sectors_available > + ca->mi.bucket_size) + buckets_required++; + + buckets_required++; + } + + if (journal_dev_buckets_available(j, ca) >= buckets_required) + nr_devs++; + nr_online++; + } + rcu_read_unlock(); + + if (nr_online < c->opts.metadata_replicas_required) + return -EROFS; + + if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas)) + return 0; + + return sectors_available; +} + +/** + * journal_next_bucket - move on to the next journal bucket if possible + */ +static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned sectors) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + struct journal_device *ja; + struct bch_dev *ca; + struct dev_alloc_list devs_sorted; + unsigned i, replicas, replicas_want = + READ_ONCE(c->opts.metadata_replicas); + + spin_lock(&j->lock); + e = bkey_i_to_s_extent(&j->key); + + /* + * Drop any pointers to devices that have been removed, are no 
longer + * empty, or filled up their current journal bucket: + * + * Note that a device may have had a small amount of free space (perhaps + * one sector) that wasn't enough for the smallest possible journal + * entry - that's why we drop pointers to devices <= current free space, + * i.e. whichever device was limiting the current journal entry size. + */ + extent_for_each_ptr_backwards(e, ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ca->mi.state != BCH_MEMBER_STATE_RW || + ca->journal.sectors_free <= sectors) + __bch2_extent_drop_ptr(e, ptr); + else + ca->journal.sectors_free -= sectors; + } + + replicas = bch2_extent_nr_ptrs(e.c); + + rcu_read_lock(); + devs_sorted = bch2_wp_alloc_list(c, &j->wp, + &c->rw_devs[BCH_DATA_JOURNAL]); + + for (i = 0; i < devs_sorted.nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + if (!ca) + continue; + + if (!ca->mi.durability) + continue; + + ja = &ca->journal; + if (!ja->nr) + continue; + + if (replicas >= replicas_want) + break; + + /* + * Check that we can use this device, and aren't already using + * it: + */ + if (bch2_extent_has_device(e.c, ca->dev_idx) || + !journal_dev_buckets_available(j, ca) || + sectors > ca->mi.bucket_size) + continue; + + j->wp.next_alloc[ca->dev_idx] += U32_MAX; + bch2_wp_rescale(c, ca, &j->wp); + + ja->sectors_free = ca->mi.bucket_size - sectors; + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); + + extent_ptr_append(bkey_i_to_extent(&j->key), + (struct bch_extent_ptr) { + .offset = bucket_to_sector(ca, + ja->buckets[ja->cur_idx]), + .dev = ca->dev_idx, + }); + + replicas += ca->mi.durability; + } + rcu_read_unlock(); + + j->prev_buf_sectors = 0; + + bkey_copy(&w->key, &j->key); + spin_unlock(&j->lock); + + if (replicas < c->opts.metadata_replicas_required) + return -EROFS; + + BUG_ON(!replicas); + + return 0; +} + +static void journal_write_compact(struct jset *jset) +{ + struct jset_entry *i, *next, *prev = NULL; + + /* + * Simple compaction, dropping empty jset_entries (from journal + * reservations that weren't fully used) and merging jset_entries that + * can be. + * + * If we wanted to be really fancy here, we could sort all the keys in + * the jset and drop keys that were overwritten - probably not worth it: + */ + vstruct_for_each_safe(jset, i, next) { + unsigned u64s = le16_to_cpu(i->u64s); + + /* Empty entry: */ + if (!u64s) + continue; + + /* Can we merge with previous entry? */ + if (prev && + i->btree_id == prev->btree_id && + i->level == prev->level && + i->type == prev->type && + i->type == BCH_JSET_ENTRY_btree_keys && + le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { + memmove_u64s_down(vstruct_next(prev), + i->_data, + u64s); + le16_add_cpu(&prev->u64s, u64s); + continue; + } + + /* Couldn't merge, move i into new position (after prev): */ + prev = prev ? vstruct_next(prev) : jset->start; + if (i != prev) + memmove_u64s_down(prev, i, jset_u64s(u64s)); + } + + prev = prev ? 
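/* prev was the last entry kept: everything up to the end of it is the compacted size */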
vstruct_next(prev) : jset->start; + jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); +} + +static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) +{ + /* we aren't holding j->lock: */ + unsigned new_size = READ_ONCE(j->buf_size_want); + void *new_buf; + + if (buf->size >= new_size) + return; + + new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); + if (!new_buf) + return; + + memcpy(new_buf, buf->data, buf->size); + kvpfree(buf->data, buf->size); + buf->data = new_buf; + buf->size = new_size; +} + +static void journal_write_done(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *w = journal_prev_buf(j); + struct bch_devs_list devs = + bch2_extent_devs(bkey_i_to_s_c_extent(&w->key)); + u64 seq = le64_to_cpu(w->data->seq); + + if (!devs.nr) { + bch_err(c, "unable to write journal to sufficient devices"); + goto err; + } + + if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs)) + goto err; +out: + bch2_time_stats_update(j->write_time, j->write_start_time); + + spin_lock(&j->lock); + j->last_seq_ondisk = seq; + if (seq >= j->pin.front) + journal_seq_pin(j, seq)->devs = devs; + + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: + * + * Must come before signaling write completion, for + * bch2_fs_journal_stop(): + */ + mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); + + /* also must come before signalling write completion: */ + closure_debug_destroy(cl); + + BUG_ON(!j->reservations.prev_buf_unwritten); + atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, + &j->reservations.counter); + + closure_wake_up(&w->wait); + journal_wake(j); + + if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) + mod_delayed_work(system_freezable_wq, &j->write_work, 0); + spin_unlock(&j->lock); + return; +err: + bch2_fatal_error(c); + bch2_journal_halt(j); + goto out; +} + +static void journal_write_endio(struct bio *bio) +{ + struct bch_dev *ca = bio->bi_private; + struct journal *j = &ca->fs->journal; + + if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") || + bch2_meta_write_fault("journal")) { + struct journal_buf *w = journal_prev_buf(j); + unsigned long flags; + + spin_lock_irqsave(&j->err_lock, flags); + bch2_extent_drop_device(bkey_i_to_s_extent(&w->key), ca->dev_idx); + spin_unlock_irqrestore(&j->err_lock, flags); + } + + closure_put(&j->io); + percpu_ref_put(&ca->io_ref); +} + +void bch2_journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_prev_buf(j); + struct jset *jset; + struct bio *bio; + struct bch_extent_ptr *ptr; + unsigned i, sectors, bytes; + + journal_buf_realloc(j, w); + jset = w->data; + + j->write_start_time = local_clock(); + mutex_lock(&c->btree_root_lock); + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_root *r = &c->btree_roots[i]; + + if (r->alive) + bch2_journal_add_btree_root(w, i, &r->key, r->level); + } + c->btree_roots_dirty = false; + mutex_unlock(&c->btree_root_lock); + + journal_write_compact(jset); + + jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); + jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); + jset->magic = cpu_to_le64(jset_magic(c)); + jset->version = cpu_to_le32(BCACHE_JSET_VERSION); + + SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); + SET_JSET_CSUM_TYPE(jset, 
bch2_meta_checksum_type(c)); + + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && + jset_validate_entries(c, jset, WRITE)) + goto err; + + bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); + + jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), + journal_nonce(jset), jset); + + if (!bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)) && + jset_validate_entries(c, jset, WRITE)) + goto err; + + sectors = vstruct_sectors(jset, c->block_bits); + BUG_ON(sectors > j->prev_buf_sectors); + + bytes = vstruct_bytes(w->data); + memset((void *) w->data + bytes, 0, (sectors << 9) - bytes); + + if (journal_write_alloc(j, w, sectors)) { + bch2_journal_halt(j); + bch_err(c, "Unable to allocate journal write"); + bch2_fatal_error(c); + continue_at(cl, journal_write_done, system_highpri_wq); + } + + /* + * XXX: we really should just disable the entire journal in nochanges + * mode + */ + if (c->opts.nochanges) + goto no_io; + + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!percpu_ref_tryget(&ca->io_ref)) { + /* XXX: fix this */ + bch_err(c, "missing device for journal write\n"); + continue; + } + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], + sectors); + + bio = ca->journal.bio; + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_iter.bi_size = sectors << 9; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + bio_set_op_attrs(bio, REQ_OP_WRITE, + REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); + bch2_bio_map(bio, jset); + + trace_journal_write(bio); + closure_bio_submit(bio, cl); + + ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq); + } + + for_each_rw_member(ca, c, i) + if (journal_flushes_device(ca) && + !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) { + percpu_ref_get(&ca->io_ref); + + bio = ca->journal.bio; + bio_reset(bio); + bio_set_dev(bio, ca->disk_sb.bdev); + bio->bi_opf = REQ_OP_FLUSH; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); + } + +no_io: + extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) + ptr->offset += sectors; + + continue_at(cl, journal_write_done, system_highpri_wq); +err: + bch2_inconsistent_error(c); + continue_at(cl, journal_write_done, system_highpri_wq); +} diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h new file mode 100644 index 00000000..4236b7fc --- /dev/null +++ b/libbcachefs/journal_io.h @@ -0,0 +1,45 @@ +#ifndef _BCACHEFS_JOURNAL_IO_H +#define _BCACHEFS_JOURNAL_IO_H + +struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *, + enum btree_id, unsigned *); + +/* + * Only used for holding the journal entries we read in btree_journal_read() + * during cache_registration + */ +struct journal_replay { + struct list_head list; + struct bch_devs_list devs; + /* must be last: */ + struct jset j; +}; + +static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, + struct jset_entry *entry, unsigned type) +{ + while (entry < vstruct_last(jset)) { + if (entry->type == type) + return entry; + + entry = vstruct_next(entry); + } + + return NULL; +} + +#define for_each_jset_entry_type(entry, jset, type) \ + for (entry = (jset)->start; \ + (entry = __jset_entry_type_next(jset, entry, type)); \ + entry = vstruct_next(entry)) + +#define for_each_jset_key(k, _n, entry, jset) \ + for_each_jset_entry_type(entry, jset, 
BCH_JSET_ENTRY_btree_keys) \ + vstruct_for_each_safe(entry, k, _n) + +int bch2_journal_read(struct bch_fs *, struct list_head *); + +int bch2_journal_entry_sectors(struct journal *); +void bch2_journal_write(struct closure *); + +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c new file mode 100644 index 00000000..0e3e5b6a --- /dev/null +++ b/libbcachefs/journal_reclaim.c @@ -0,0 +1,411 @@ + +#include "bcachefs.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "replicas.h" +#include "super.h" + +/* + * Journal entry pinning - machinery for holding a reference on a given journal + * entry, holding it open to ensure it gets replayed during recovery: + */ + +static inline u64 journal_pin_seq(struct journal *j, + struct journal_entry_pin_list *pin_list) +{ + return fifo_entry_idx_abs(&j->pin, pin_list); +} + +u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin) +{ + u64 ret = 0; + + spin_lock(&j->lock); + if (journal_pin_active(pin)) + ret = journal_pin_seq(j, pin->pin_list); + spin_unlock(&j->lock); + + return ret; +} + +static inline void __journal_pin_add(struct journal *j, + struct journal_entry_pin_list *pin_list, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + BUG_ON(journal_pin_active(pin)); + BUG_ON(!atomic_read(&pin_list->count)); + + atomic_inc(&pin_list->count); + pin->pin_list = pin_list; + pin->flush = flush_fn; + + if (flush_fn) + list_add(&pin->list, &pin_list->list); + else + INIT_LIST_HEAD(&pin->list); + + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + journal_wake(j); +} + +void bch2_journal_pin_add(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock(&j->lock); + __journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn); + spin_unlock(&j->lock); +} + +static inline void __journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) +{ + struct journal_entry_pin_list *pin_list = pin->pin_list; + + if (!journal_pin_active(pin)) + return; + + pin->pin_list = NULL; + list_del_init(&pin->list); + + /* + * Unpinning a journal entry make make journal_next_bucket() succeed, if + * writing a new last_seq will now make another bucket available: + */ + if (atomic_dec_and_test(&pin_list->count) && + pin_list == &fifo_peek_front(&j->pin)) + bch2_journal_reclaim_fast(j); +} + +void bch2_journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) +{ + spin_lock(&j->lock); + __journal_pin_drop(j, pin); + spin_unlock(&j->lock); +} + +void bch2_journal_pin_add_if_older(struct journal *j, + struct journal_entry_pin *src_pin, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock(&j->lock); + + if (journal_pin_active(src_pin) && + (!journal_pin_active(pin) || + journal_pin_seq(j, src_pin->pin_list) < + journal_pin_seq(j, pin->pin_list))) { + __journal_pin_drop(j, pin); + __journal_pin_add(j, src_pin->pin_list, pin, flush_fn); + } + + spin_unlock(&j->lock); +} + +/* + * Journal reclaim: flush references to open journal entries to reclaim space in + * the journal + * + * May be done by the journal code in the background as needed to free up space + * for more journal entries, or as part of doing a clean shutdown, or to migrate + * data off of a specific device: + */ + +/** + * bch2_journal_reclaim_fast - do the fast part of journal reclaim + * + * Called from IO submission context, does not block. 
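As a usage sketch of the pin API above (not from this patch; my_node, my_flush_fn() and my_write_node() are invented for illustration): a subsystem embeds a journal_entry_pin in whatever it has dirtied, registers it against the journal sequence number the update landed in, and drops it once that state is durable on its own, so that reclaim can invoke the flush hook when it needs the space back sooner:

struct my_node {
	struct journal_entry_pin	pin;
	// ... the dirty state itself ...
};

static void my_flush_fn(struct journal *j, struct journal_entry_pin *pin,
			u64 seq)
{
	struct my_node *n = container_of(pin, struct my_node, pin);

	// reclaim wants sequence number @seq freed: write n out now
	my_write_node(n);
}

// the update landed in journal entry @seq - keep that entry alive:
bch2_journal_pin_add(j, seq, &n->pin, my_flush_fn);

// once n itself has been written, the entry may be reclaimed:
bch2_journal_pin_drop(j, &n->pin);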
Cleans up after btree + * write completions by advancing the journal pin and each cache's last_idx, + * kicking off discards and background reclaim as necessary. + */ +void bch2_journal_reclaim_fast(struct journal *j) +{ + struct journal_entry_pin_list temp; + bool popped = false; + + lockdep_assert_held(&j->lock); + + /* + * Unpin journal entries whose reference counts reached zero, meaning + * all btree nodes got written out + */ + while (!atomic_read(&fifo_peek_front(&j->pin).count)) { + BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); + BUG_ON(!fifo_pop(&j->pin, temp)); + popped = true; + } + + if (popped) + journal_wake(j); +} + +static struct journal_entry_pin * +__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret; + u64 iter; + + /* no need to iterate over empty fifo entries: */ + bch2_journal_reclaim_fast(j); + + fifo_for_each_entry_ptr(pin_list, &j->pin, iter) { + if (iter > seq_to_flush) + break; + + ret = list_first_entry_or_null(&pin_list->list, + struct journal_entry_pin, list); + if (ret) { + /* must be list_del_init(), see bch2_journal_pin_drop() */ + list_move(&ret->list, &pin_list->flushed); + *seq = iter; + return ret; + } + } + + return NULL; +} + +static struct journal_entry_pin * +journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq) +{ + struct journal_entry_pin *ret; + + spin_lock(&j->lock); + ret = __journal_get_next_pin(j, seq_to_flush, seq); + spin_unlock(&j->lock); + + return ret; +} + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + bool ret; + + spin_lock(&j->lock); + ret = ja->nr && + (ja->last_idx != ja->cur_idx && + ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk); + spin_unlock(&j->lock); + + return ret; +} + +/** + * bch2_journal_reclaim_work - free up journal buckets + * + * Background journal reclaim writes out btree nodes. It should be run + * early enough so that we never completely run out of journal buckets. + * + * High watermarks for triggering background reclaim: + * - FIFO has fewer than 512 entries left + * - fewer than 25% journal buckets free + * + * Background reclaim runs until low watermarks are reached: + * - FIFO has more than 1024 entries left + * - more than 50% journal buckets free + * + * As long as a reclaim can complete in the time it takes to fill up + * 512 journal entries or 25% of all journal buckets, then + * journal_next_bucket() should not stall. 
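For a concrete feel for the ring arithmetic used by reclaim and by journal_dev_buckets_available() (a standalone sketch, not code from the patch): with nr = 8 journal buckets, cur_idx = 6 being written and last_idx = 3 the oldest bucket still holding pinned entries, the buckets free for new writes are those between them going forward:

// mirrors the core of journal_dev_buckets_available(), minus the
// replay and last_seq adjustments:
static unsigned example_buckets_available(unsigned cur_idx, unsigned last_idx,
					  unsigned nr)
{
	unsigned next = (cur_idx + 1) % nr;	// 7

	return (last_idx + nr - next) % nr;	// (3 + 8 - 7) % 8 == 4
}

The 50% target below uses the same arithmetic: bucket_to_flush = (cur_idx + nr/2) % nr picks the bucket halfway around the ring, and everything pinned at or before its sequence number gets flushed.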
+ */ +void bch2_journal_reclaim_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(to_delayed_work(work), + struct bch_fs, journal.reclaim_work); + struct journal *j = &c->journal; + struct bch_dev *ca; + struct journal_entry_pin *pin; + u64 seq, seq_to_flush = 0; + unsigned iter, bucket_to_flush; + unsigned long next_flush; + bool reclaim_lock_held = false, need_flush; + + /* + * Advance last_idx to point to the oldest journal entry containing + * btree node updates that have not yet been written out + */ + for_each_rw_member(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + if (!ja->nr) + continue; + + while (should_discard_bucket(j, ja)) { + if (!reclaim_lock_held) { + /* + * ugh: + * might be called from __journal_res_get() + * under wait_event() - have to go back to + * TASK_RUNNING before doing something that + * would block, but only if we're doing work: + */ + __set_current_state(TASK_RUNNING); + + mutex_lock(&j->reclaim_lock); + reclaim_lock_held = true; + /* recheck under reclaim_lock: */ + continue; + } + + if (ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, + ja->buckets[ja->last_idx]), + ca->mi.bucket_size, GFP_NOIO, 0); + + spin_lock(&j->lock); + ja->last_idx = (ja->last_idx + 1) % ja->nr; + spin_unlock(&j->lock); + + journal_wake(j); + } + + /* + * Write out enough btree nodes to free up 50% journal + * buckets + */ + spin_lock(&j->lock); + bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr; + seq_to_flush = max_t(u64, seq_to_flush, + ja->bucket_seq[bucket_to_flush]); + spin_unlock(&j->lock); + } + + if (reclaim_lock_held) + mutex_unlock(&j->reclaim_lock); + + /* Also flush if the pin fifo is more than half full */ + spin_lock(&j->lock); + seq_to_flush = max_t(s64, seq_to_flush, + (s64) journal_cur_seq(j) - + (j->pin.size >> 1)); + spin_unlock(&j->lock); + + /* + * If it's been longer than j->reclaim_delay_ms since we last flushed, + * make sure to flush at least one journal pin: + */ + next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms); + need_flush = time_after(jiffies, next_flush); + + while ((pin = journal_get_next_pin(j, need_flush + ? 
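/* overdue: flush the oldest pin regardless of seq_to_flush */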
U64_MAX + : seq_to_flush, &seq))) { + __set_current_state(TASK_RUNNING); + pin->flush(j, pin, seq); + need_flush = false; + + j->last_flushed = jiffies; + } + + if (!test_bit(BCH_FS_RO, &c->flags)) + queue_delayed_work(system_freezable_wq, &j->reclaim_work, + msecs_to_jiffies(j->reclaim_delay_ms)); +} + +static int journal_flush_done(struct journal *j, u64 seq_to_flush, + struct journal_entry_pin **pin, + u64 *pin_seq) +{ + int ret; + + *pin = NULL; + + ret = bch2_journal_error(j); + if (ret) + return ret; + + spin_lock(&j->lock); + /* + * If journal replay hasn't completed, the unreplayed journal entries + * hold refs on their corresponding sequence numbers + */ + ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL || + !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + journal_last_seq(j) > seq_to_flush || + (fifo_used(&j->pin) == 1 && + atomic_read(&fifo_peek_front(&j->pin).count) == 1); + spin_unlock(&j->lock); + + return ret; +} + +int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin *pin; + u64 pin_seq; + bool flush; + + if (!test_bit(JOURNAL_STARTED, &j->flags)) + return 0; +again: + wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq)); + if (pin) { + /* flushing a journal pin might cause a new one to be added: */ + pin->flush(j, pin, pin_seq); + goto again; + } + + spin_lock(&j->lock); + flush = journal_last_seq(j) != j->last_seq_ondisk || + (seq_to_flush == U64_MAX && c->btree_roots_dirty); + spin_unlock(&j->lock); + + return flush ? bch2_journal_meta(j) : 0; +} + +int bch2_journal_flush_all_pins(struct journal *j) +{ + return bch2_journal_flush_pins(j, U64_MAX); +} + +int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin_list *p; + struct bch_devs_list devs; + u64 iter, seq = 0; + int ret = 0; + + spin_lock(&j->lock); + fifo_for_each_entry_ptr(p, &j->pin, iter) + if (dev_idx >= 0 + ? 
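/* flushing one specific device, vs. flushing anything under-replicated */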
bch2_dev_list_has_dev(p->devs, dev_idx) + : p->devs.nr < c->opts.metadata_replicas) + seq = iter; + spin_unlock(&j->lock); + + ret = bch2_journal_flush_pins(j, seq); + if (ret) + return ret; + + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); + + seq = 0; + + spin_lock(&j->lock); + while (!ret && seq < j->pin.back) { + seq = max(seq, journal_last_seq(j)); + devs = journal_seq_pin(j, seq)->devs; + seq++; + + spin_unlock(&j->lock); + ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs); + spin_lock(&j->lock); + } + spin_unlock(&j->lock); + + bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + + return ret; +} diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h new file mode 100644 index 00000000..7d460c35 --- /dev/null +++ b/libbcachefs/journal_reclaim.h @@ -0,0 +1,36 @@ +#ifndef _BCACHEFS_JOURNAL_RECLAIM_H +#define _BCACHEFS_JOURNAL_RECLAIM_H + +#define JOURNAL_PIN (32 * 1024) + +static inline bool journal_pin_active(struct journal_entry_pin *pin) +{ + return pin->pin_list != NULL; +} + +static inline struct journal_entry_pin_list * +journal_seq_pin(struct journal *j, u64 seq) +{ + BUG_ON(seq < j->pin.front || seq >= j->pin.back); + + return &j->pin.data[seq & j->pin.mask]; +} + +u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *); + +void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); +void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); +void bch2_journal_pin_add_if_older(struct journal *, + struct journal_entry_pin *, + struct journal_entry_pin *, + journal_pin_flush_fn); + +void bch2_journal_reclaim_fast(struct journal *); +void bch2_journal_reclaim_work(struct work_struct *); + +int bch2_journal_flush_pins(struct journal *, u64); +int bch2_journal_flush_all_pins(struct journal *); +int bch2_journal_flush_device_pins(struct journal *, int); + +#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c new file mode 100644 index 00000000..b5301d96 --- /dev/null +++ b/libbcachefs/journal_seq_blacklist.c @@ -0,0 +1,358 @@ + +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" + +/* + * journal_seq_blacklist machinery: + * + * To guarantee order of btree updates after a crash, we need to detect when a + * btree node entry (bset) is newer than the newest journal entry that was + * successfully written, and ignore it - effectively ignoring any btree updates + * that didn't make it into the journal. + * + * If we didn't do this, we might have two btree nodes, a and b, both with + * updates that weren't written to the journal yet: if b was updated after a, + * but b was flushed and not a - oops; on recovery we'll find that the updates + * to b happened, but not the updates to a that happened before it. + * + * Ignoring bsets that are newer than the newest journal entry is always safe, + * because everything they contain will also have been journalled - and must + * still be present in the journal on disk until a journal entry has been + * written _after_ that bset was written. 
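A sketch of where that check sits (simplified and not from this patch; the real caller is in the btree read path, and drop_bset() is a stand-in): while validating a btree node, each bset's journal_seq is handed to bch2_journal_seq_should_ignore(), defined below, and a positive return means its keys must be ignored:

// i is the struct bset being validated, b the btree node it came from
int ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b);
if (ret < 0)
	return ret;		// e.g. -ENOMEM
if (ret)
	drop_bset(b, i);	// newer than the newest journal entry: ignore its keys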
+ * + * To accomplish this, bsets record the newest journal sequence number they + * contain updates for; then, on startup, the btree code queries the journal + * code to ask "Is this sequence number newer than the newest journal entry? If + * so, ignore it." + * + * When this happens, we must blacklist that journal sequence number: the + * journal must not write any entries with that sequence number, and it must + * record that it was blacklisted so that a) on recovery we don't think we have + * missing journal entries and b) so that the btree code continues to ignore + * that bset, until that btree node is rewritten. + * + * Blacklisted journal sequence numbers are themselves recorded as entries in + * the journal. + */ + +/* + * Called when journal needs to evict a blacklist entry to reclaim space: find + * any btree nodes that refer to the blacklist journal sequence numbers, and + * rewrite them: + */ +static void journal_seq_blacklist_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) +{ + struct bch_fs *c = + container_of(j, struct bch_fs, journal); + struct journal_seq_blacklist *bl = + container_of(pin, struct journal_seq_blacklist, pin); + struct blacklisted_node n; + struct closure cl; + unsigned i; + int ret; + + closure_init_stack(&cl); + + for (i = 0;; i++) { + struct btree_iter iter; + struct btree *b; + + mutex_lock(&j->blacklist_lock); + if (i >= bl->nr_entries) { + mutex_unlock(&j->blacklist_lock); + break; + } + n = bl->entries[i]; + mutex_unlock(&j->blacklist_lock); + + __bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0); + + b = bch2_btree_iter_peek_node(&iter); + + /* The node might have already been rewritten: */ + + if (b->data->keys.seq == n.seq) { + ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0); + if (ret) { + bch2_btree_iter_unlock(&iter); + bch2_fs_fatal_error(c, + "error %i rewriting btree node with blacklisted journal seq", + ret); + bch2_journal_halt(j); + return; + } + } + + bch2_btree_iter_unlock(&iter); + } + + for (i = 0;; i++) { + struct btree_update *as; + struct pending_btree_node_free *d; + + mutex_lock(&j->blacklist_lock); + if (i >= bl->nr_entries) { + mutex_unlock(&j->blacklist_lock); + break; + } + n = bl->entries[i]; + mutex_unlock(&j->blacklist_lock); +redo_wait: + mutex_lock(&c->btree_interior_update_lock); + + /* + * Is the node on the list of pending interior node updates - + * being freed? 
If so, wait for that to finish: + */ + for_each_pending_btree_node_free(c, as, d) + if (n.seq == d->seq && + n.btree_id == d->btree_id && + !d->level && + !bkey_cmp(n.pos, d->key.k.p)) { + closure_wait(&as->wait, &cl); + mutex_unlock(&c->btree_interior_update_lock); + closure_sync(&cl); + goto redo_wait; + } + + mutex_unlock(&c->btree_interior_update_lock); + } + + mutex_lock(&j->blacklist_lock); + + bch2_journal_pin_drop(j, &bl->pin); + list_del(&bl->list); + kfree(bl->entries); + kfree(bl); + + mutex_unlock(&j->blacklist_lock); +} + +/* + * Determine if a particular sequence number is blacklisted - if so, return + * blacklist entry: + */ +struct journal_seq_blacklist * +bch2_journal_seq_blacklist_find(struct journal *j, u64 seq) +{ + struct journal_seq_blacklist *bl; + + lockdep_assert_held(&j->blacklist_lock); + + list_for_each_entry(bl, &j->seq_blacklist, list) + if (seq >= bl->start && seq <= bl->end) + return bl; + + return NULL; +} + +/* + * Allocate a new, in memory blacklist entry: + */ +static struct journal_seq_blacklist * +bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end) +{ + struct journal_seq_blacklist *bl; + + lockdep_assert_held(&j->blacklist_lock); + + /* + * When we start the journal, bch2_journal_start() will skip over @seq: + */ + + bl = kzalloc(sizeof(*bl), GFP_KERNEL); + if (!bl) + return NULL; + + bl->start = start; + bl->end = end; + + list_add_tail(&bl->list, &j->seq_blacklist); + return bl; +} + +/* + * Returns true if @seq is newer than the most recent journal entry that got + * written, and data corresponding to @seq should be ignored - also marks @seq + * as blacklisted so that on future restarts the corresponding data will still + * be ignored: + */ +int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b) +{ + struct journal *j = &c->journal; + struct journal_seq_blacklist *bl = NULL; + struct blacklisted_node *n; + u64 journal_seq; + int ret = 0; + + if (!seq) + return 0; + + spin_lock(&j->lock); + journal_seq = journal_cur_seq(j); + spin_unlock(&j->lock); + + /* Interier updates aren't journalled: */ + BUG_ON(b->level); + BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)); + + /* + * Decrease this back to j->seq + 2 when we next rev the on disk format: + * increasing it temporarily to work around bug in old kernels + */ + fsck_err_on(seq > journal_seq + 4, c, + "bset journal seq too far in the future: %llu > %llu", + seq, journal_seq); + + if (seq <= journal_seq && + list_empty_careful(&j->seq_blacklist)) + return 0; + + mutex_lock(&j->blacklist_lock); + + if (seq <= journal_seq) { + bl = bch2_journal_seq_blacklist_find(j, seq); + if (!bl) + goto out; + } else { + bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting", + b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq); + + if (!j->new_blacklist) { + j->new_blacklist = bch2_journal_seq_blacklisted_new(j, + journal_seq + 1, + journal_seq + 1); + if (!j->new_blacklist) { + ret = -ENOMEM; + goto out; + } + } + bl = j->new_blacklist; + bl->end = max(bl->end, seq); + } + + for (n = bl->entries; n < bl->entries + bl->nr_entries; n++) + if (b->data->keys.seq == n->seq && + b->btree_id == n->btree_id && + !bkey_cmp(b->key.k.p, n->pos)) + goto found_entry; + + if (!bl->nr_entries || + is_power_of_2(bl->nr_entries)) { + n = krealloc(bl->entries, + max(bl->nr_entries * 2, 8UL) * sizeof(*n), + GFP_KERNEL); + if (!n) { + ret = -ENOMEM; + goto out; + } + bl->entries = n; + } + + bl->entries[bl->nr_entries++] = 
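/* remember this node so journal_seq_blacklist_flush() can find and rewrite it */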
(struct blacklisted_node) { + .seq = b->data->keys.seq, + .btree_id = b->btree_id, + .pos = b->key.k.p, + }; +found_entry: + ret = 1; +out: +fsck_err: + mutex_unlock(&j->blacklist_lock); + return ret; +} + +static int __bch2_journal_seq_blacklist_read(struct journal *j, + struct journal_replay *i, + u64 start, u64 end) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_seq_blacklist *bl; + + bch_verbose(c, "blacklisting existing journal seq %llu-%llu", + start, end); + + bl = bch2_journal_seq_blacklisted_new(j, start, end); + if (!bl) + return -ENOMEM; + + bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin, + journal_seq_blacklist_flush); + return 0; +} + +/* + * After reading the journal, find existing journal seq blacklist entries and + * read them into memory: + */ +int bch2_journal_seq_blacklist_read(struct journal *j, + struct journal_replay *i) +{ + struct jset_entry *entry; + int ret = 0; + + vstruct_for_each(&i->j, entry) { + switch (entry->type) { + case BCH_JSET_ENTRY_blacklist: { + struct jset_entry_blacklist *bl_entry = + container_of(entry, struct jset_entry_blacklist, entry); + + ret = __bch2_journal_seq_blacklist_read(j, i, + le64_to_cpu(bl_entry->seq), + le64_to_cpu(bl_entry->seq)); + break; + } + case BCH_JSET_ENTRY_blacklist_v2: { + struct jset_entry_blacklist_v2 *bl_entry = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + ret = __bch2_journal_seq_blacklist_read(j, i, + le64_to_cpu(bl_entry->start), + le64_to_cpu(bl_entry->end)); + break; + } + } + + if (ret) + break; + } + + return ret; +} + +/* + * After reading the journal and walking the btree, we might have new journal + * sequence numbers to blacklist - add entries to the next journal entry to be + * written: + */ +void bch2_journal_seq_blacklist_write(struct journal *j) +{ + struct journal_seq_blacklist *bl = j->new_blacklist; + struct jset_entry_blacklist_v2 *bl_entry; + struct jset_entry *entry; + + if (!bl) + return; + + entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j), + (sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64)); + + bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); + bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2; + bl_entry->start = cpu_to_le64(bl->start); + bl_entry->end = cpu_to_le64(bl->end); + + bch2_journal_pin_add(j, + journal_cur_seq(j), + &bl->pin, + journal_seq_blacklist_flush); + + j->new_blacklist = NULL; +} diff --git a/libbcachefs/journal_seq_blacklist.h b/libbcachefs/journal_seq_blacklist.h new file mode 100644 index 00000000..95ea6e90 --- /dev/null +++ b/libbcachefs/journal_seq_blacklist.h @@ -0,0 +1,13 @@ +#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H +#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H + +struct journal_replay; + +struct journal_seq_blacklist * +bch2_journal_seq_blacklist_find(struct journal *, u64); +int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *); +int bch2_journal_seq_blacklist_read(struct journal *, + struct journal_replay *); +void bch2_journal_seq_blacklist_write(struct journal *); + +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index e39b18f2..a27e0548 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -59,8 +59,9 @@ struct blacklisted_node { struct journal_seq_blacklist { struct list_head list; - u64 seq; - bool written; + u64 start; + u64 end; + struct journal_entry_pin pin; struct blacklisted_node *entries; @@ -171,10 +172,11 @@ struct journal { u64 front, 
back, size, mask; struct journal_entry_pin_list *data; } pin; - struct journal_entry_pin_list *replay_pin_list; + u64 replay_journal_seq; struct mutex blacklist_lock; struct list_head seq_blacklist; + struct journal_seq_blacklist *new_blacklist; BKEY_PADDED(key); struct write_point wp; diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 87e6e80c..0431fb81 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -5,6 +5,7 @@ #include "buckets.h" #include "inode.h" #include "io.h" +#include "journal_reclaim.h" #include "move.h" #include "replicas.h" #include "super-io.h" @@ -22,7 +23,6 @@ struct moving_io { struct closure cl; bool read_completed; - unsigned read_dev; unsigned read_sectors; unsigned write_sectors; @@ -42,7 +42,7 @@ struct moving_context { struct list_head reads; /* in flight sectors: */ - atomic_t read_sectors[BCH_SB_MEMBERS_MAX]; + atomic_t read_sectors; atomic_t write_sectors; wait_queue_head_t wait; @@ -306,7 +306,8 @@ static void move_write(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); - if (likely(!io->rbio.bio.bi_status)) { + if (likely(!io->rbio.bio.bi_status && + !io->rbio.hole)) { bch2_migrate_read_done(&io->write, &io->rbio); atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); @@ -330,7 +331,7 @@ static void move_read_endio(struct bio *bio) struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); struct moving_context *ctxt = io->write.ctxt; - atomic_sub(io->read_sectors, &ctxt->read_sectors[io->read_dev]); + atomic_sub(io->read_sectors, &ctxt->read_sectors); io->read_completed = true; if (next_pending_write(ctxt)) @@ -376,7 +377,6 @@ static int bch2_move_extent(struct bch_fs *c, enum data_cmd data_cmd, struct data_opts data_opts) { - struct extent_pick_ptr pick; struct moving_io *io; const struct bch_extent_ptr *ptr; struct bch_extent_crc_unpacked crc; @@ -387,12 +387,8 @@ static int bch2_move_extent(struct bch_fs *c, atomic_read(&ctxt->write_sectors) < SECTORS_IN_FLIGHT_PER_DEVICE); - bch2_extent_pick_ptr(c, e.s_c, NULL, &pick); - if (IS_ERR_OR_NULL(pick.ca)) - return pick.ca ? 
PTR_ERR(pick.ca) : 0; - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->read_sectors[pick.ca->dev_idx]) < + atomic_read(&ctxt->read_sectors) < SECTORS_IN_FLIGHT_PER_DEVICE); /* write path might have to decompress data: */ @@ -406,8 +402,7 @@ static int bch2_move_extent(struct bch_fs *c, goto err; io->write.ctxt = ctxt; - io->read_dev = pick.ca->dev_idx; - io->read_sectors = pick.crc.uncompressed_size; + io->read_sectors = e.k->size; io->write_sectors = e.k->size; bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); @@ -421,6 +416,7 @@ static int bch2_move_extent(struct bch_fs *c, io->rbio.opts = io_opts; bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); + io->rbio.bio.bi_vcnt = pages; bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); io->rbio.bio.bi_iter.bi_size = sectors << 9; @@ -438,7 +434,7 @@ static int bch2_move_extent(struct bch_fs *c, trace_move_extent(e.k); - atomic_add(io->read_sectors, &ctxt->read_sectors[io->read_dev]); + atomic_add(io->read_sectors, &ctxt->read_sectors); list_add_tail(&io->list, &ctxt->reads); /* @@ -446,14 +442,15 @@ static int bch2_move_extent(struct bch_fs *c, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(c, &io->rbio, e, &pick, BCH_READ_NODECODE); + bch2_read_extent(c, &io->rbio, e.s_c, + BCH_READ_NODECODE| + BCH_READ_LAST_FRAGMENT); return 0; err_free_pages: bio_free_pages(&io->write.op.wbio.bio); err_free: kfree(io); err: - percpu_ref_put(&pick.ca->io_ref); trace_move_alloc_fail(e.k); return ret; } @@ -728,7 +725,7 @@ int bch2_data_job(struct bch_fs *c, switch (op.op) { case BCH_DATA_OP_REREPLICATE: stats->data_type = BCH_DATA_JOURNAL; - ret = bch2_journal_flush_device(&c->journal, -1); + ret = bch2_journal_flush_device_pins(&c->journal, -1); ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; ret = bch2_gc_btree_replicas(c) ?: ret; @@ -745,7 +742,7 @@ int bch2_data_job(struct bch_fs *c, return -EINVAL; stats->data_type = BCH_DATA_JOURNAL; - ret = bch2_journal_flush_device(&c->journal, op.migrate.dev); + ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; ret = bch2_gc_btree_replicas(c) ?: ret; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 05910c40..16b8cbfc 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -26,6 +26,8 @@ #include "inode.h" #include "io.h" #include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" #include "keylist.h" #include "move.h" #include "migrate.h" @@ -396,9 +398,15 @@ err: static void bch2_fs_free(struct bch_fs *c) { +#define BCH_TIME_STAT(name) \ + bch2_time_stats_exit(&c->name##_time); + BCH_TIME_STATS() +#undef BCH_TIME_STAT + bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); bch2_fs_encryption_exit(c); + bch2_fs_io_exit(c); bch2_fs_btree_cache_exit(c); bch2_fs_journal_exit(&c->journal); bch2_io_clock_exit(&c->io_clock[WRITE]); @@ -407,10 +415,6 @@ static void bch2_fs_free(struct bch_fs *c) lg_lock_free(&c->usage_lock); free_percpu(c->usage_percpu); mempool_exit(&c->btree_bounce_pool); - mempool_exit(&c->bio_bounce_pages); - bioset_exit(&c->bio_write); - bioset_exit(&c->bio_read_split); - bioset_exit(&c->bio_read); bioset_exit(&c->btree_bio); mempool_exit(&c->btree_interior_update_pool); mempool_exit(&c->btree_reserve_pool); @@ -561,8 +565,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) init_rwsem(&c->gc_lock); -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - spin_lock_init(&c->name##_time.lock); 
+#define BCH_TIME_STAT(name) \ + bch2_time_stats_init(&c->name##_time); BCH_TIME_STATS() #undef BCH_TIME_STAT @@ -587,9 +591,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) seqcount_init(&c->gc_pos_lock); - c->copy_gc_enabled = 1; - c->rebalance_enabled = 1; - c->rebalance_percent = 10; + c->copy_gc_enabled = 1; + c->rebalance_enabled = 1; + c->rebalance_percent = 10; + c->promote_whole_extents = true; c->journal.write_time = &c->journal_write_time; c->journal.delay_time = &c->journal_delay_time; @@ -640,17 +645,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) max(offsetof(struct btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || - bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), - BIOSET_NEED_BVECS) || - bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), - BIOSET_NEED_BVECS) || - mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->sb.encoded_extent_max) / - PAGE_SECTORS, 0) || !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || lg_lock_init(&c->usage_lock) || mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || @@ -658,6 +652,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_io_clock_init(&c->io_clock[WRITE]) || bch2_fs_journal_init(&c->journal) || bch2_fs_btree_cache_init(c) || + bch2_fs_io_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || bch2_fs_fsio_init(c)) @@ -774,11 +769,11 @@ const char *bch2_fs_start(struct bch_fs *c) goto recovery_done; /* - * bch2_journal_start() can't happen sooner, or btree_gc_finish() + * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish() * will give spurious errors about oldest_gen > bucket_gen - * this is a hack but oh well. 
*/ - bch2_journal_start(c); + bch2_fs_journal_start(&c->journal); err = "error starting allocator"; if (bch2_fs_allocator_start(c)) @@ -834,7 +829,7 @@ const char *bch2_fs_start(struct bch_fs *c) * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: */ - bch2_journal_start(c); + bch2_fs_journal_start(&c->journal); bch2_journal_set_replay_done(&c->journal); err = "error starting allocator"; @@ -993,6 +988,9 @@ static void bch2_dev_free(struct bch_dev *ca) bioset_exit(&ca->replica_set); bch2_dev_buckets_free(ca); + bch2_time_stats_exit(&ca->io_latency[WRITE]); + bch2_time_stats_exit(&ca->io_latency[READ]); + percpu_ref_exit(&ca->io_ref); percpu_ref_exit(&ca->ref); kobject_put(&ca->kobj); @@ -1089,6 +1087,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, INIT_WORK(&ca->io_error_work, bch2_io_error_work); + bch2_time_stats_init(&ca->io_latency[READ]); + bch2_time_stats_init(&ca->io_latency[WRITE]); + ca->mi = bch2_mi_to_cpu(member); ca->uuid = member->uuid; @@ -1421,7 +1422,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) goto err; } - ret = bch2_journal_flush_device(&c->journal, ca->dev_idx); + ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); if (ret) { bch_err(ca, "Remove failed: error %i flushing journal", ret); goto err; diff --git a/libbcachefs/super.h b/libbcachefs/super.h index a52ee3bb..231bc529 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -27,7 +27,26 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) static inline bool bch2_dev_is_online(struct bch_dev *ca) { - return ca->disk_sb.bdev != NULL; + return !percpu_ref_is_zero(&ca->io_ref); +} + +static inline bool bch2_dev_is_readable(struct bch_dev *ca) +{ + return bch2_dev_is_online(ca) && + ca->mi.state != BCH_MEMBER_STATE_FAILED; +} + +static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) +{ + if (!percpu_ref_tryget(&ca->io_ref)) + return false; + + if (ca->mi.state == BCH_MEMBER_STATE_RW || + (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) + return true; + + percpu_ref_put(&ca->io_ref); + return false; } static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index e8089db9..65345d80 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -141,11 +141,19 @@ read_attribute(btree_node_size); read_attribute(first_bucket); read_attribute(nbuckets); read_attribute(durability); -read_attribute(iostats); -read_attribute(last_read_quantiles); -read_attribute(last_write_quantiles); -read_attribute(fragmentation_quantiles); -read_attribute(oldest_gen_quantiles); +read_attribute(iodone); + +read_attribute(io_latency_read); +read_attribute(io_latency_write); +read_attribute(io_latency_stats_read); +read_attribute(io_latency_stats_write); +read_attribute(congested); + +read_attribute(bucket_quantiles_last_read); +read_attribute(bucket_quantiles_last_write); +read_attribute(bucket_quantiles_fragmentation); +read_attribute(bucket_quantiles_oldest_gen); + read_attribute(reserve_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); @@ -177,6 +185,7 @@ sysfs_pd_controller_attribute(copy_gc); rw_attribute(rebalance_enabled); rw_attribute(rebalance_percent); sysfs_pd_controller_attribute(rebalance); +rw_attribute(promote_whole_extents); rw_attribute(pd_controllers_update_seconds); @@ -189,8 +198,9 @@ read_attribute(data_replicas_have); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM -#define 
BCH_TIME_STAT(name, frequency_units, duration_units) \ - sysfs_time_stats_attribute(name, frequency_units, duration_units); +#define BCH_TIME_STAT(_name) \ + static struct attribute sysfs_time_stat_##_name = \ + { .name = #_name, .mode = S_IRUGO }; BCH_TIME_STATS() #undef BCH_TIME_STAT @@ -332,9 +342,10 @@ SHOW(bch2_fs) sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled); sysfs_print(rebalance_percent, c->rebalance_percent); - sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */ + sysfs_print(promote_whole_extents, c->promote_whole_extents); + sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true)); sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false)); @@ -406,6 +417,8 @@ STORE(__bch2_fs) sysfs_strtoul(rebalance_percent, c->rebalance_percent); sysfs_pd_controller_store(rebalance, &c->rebalance_pd); + sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); + /* Debugging: */ #define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); @@ -462,6 +475,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_journal_reclaim_delay_ms, &sysfs_rebalance_percent, + &sysfs_promote_whole_extents, &sysfs_compression_stats, NULL @@ -531,9 +545,16 @@ STORE(bch2_fs_opts_dir) struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); const struct bch_option *opt = container_of(attr, struct bch_option, attr); int ret, id = opt - bch2_opt_table; + char *tmp; u64 v; - ret = bch2_opt_parse(c, opt, buf, &v); + tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = bch2_opt_parse(c, opt, strim(tmp), &v); + kfree(tmp); + if (ret < 0) return ret; @@ -592,9 +613,9 @@ SHOW(bch2_fs_time_stats) { struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - sysfs_print_time_stats(&c->name##_time, name, \ - frequency_units, duration_units); +#define BCH_TIME_STAT(name) \ + if (attr == &sysfs_time_stat_##name) \ + return bch2_time_stats_print(&c->name##_time, buf, PAGE_SIZE); BCH_TIME_STATS() #undef BCH_TIME_STAT @@ -603,23 +624,15 @@ SHOW(bch2_fs_time_stats) STORE(bch2_fs_time_stats) { - struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); - -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - sysfs_clear_time_stats(&c->name##_time, name); - BCH_TIME_STATS() -#undef BCH_TIME_STAT - return size; } SYSFS_OPS(bch2_fs_time_stats); struct attribute *bch2_fs_time_stats_files[] = { -#define BCH_TIME_STAT(name, frequency_units, duration_units) \ - sysfs_time_stats_attribute_list(name, frequency_units, duration_units) +#define BCH_TIME_STAT(name) \ + &sysfs_time_stat_##name, BCH_TIME_STATS() #undef BCH_TIME_STAT - NULL }; @@ -774,7 +787,7 @@ static const char * const bch2_rw[] = { NULL }; -static ssize_t show_dev_iostats(struct bch_dev *ca, char *buf) +static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) { char *out = buf, *end = buf + PAGE_SIZE; int rw, i, cpu; @@ -851,16 +864,28 @@ SHOW(bch2_dev) return out - buf; } - if (attr == &sysfs_iostats) - return show_dev_iostats(ca, buf); + if (attr == &sysfs_iodone) + return show_dev_iodone(ca, buf); - if (attr == &sysfs_last_read_quantiles) + sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); + sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); + + if (attr == &sysfs_io_latency_stats_read) + return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE); + if (attr == &sysfs_io_latency_stats_write) + return 
bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE); + + sysfs_printf(congested, "%u%%", + clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) + * 100 / CONGESTED_MAX); + + if (attr == &sysfs_bucket_quantiles_last_read) return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0); - if (attr == &sysfs_last_write_quantiles) + if (attr == &sysfs_bucket_quantiles_last_write) return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1); - if (attr == &sysfs_fragmentation_quantiles) + if (attr == &sysfs_bucket_quantiles_fragmentation) return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL); - if (attr == &sysfs_oldest_gen_quantiles) + if (attr == &sysfs_bucket_quantiles_oldest_gen) return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL); if (attr == &sysfs_reserve_stats) @@ -944,13 +969,20 @@ struct attribute *bch2_dev_files[] = { &sysfs_label, &sysfs_has_data, - &sysfs_iostats, + &sysfs_iodone, + + &sysfs_io_latency_read, + &sysfs_io_latency_write, + &sysfs_io_latency_stats_read, + &sysfs_io_latency_stats_write, + &sysfs_congested, /* alloc info - other stats: */ - &sysfs_last_read_quantiles, - &sysfs_last_write_quantiles, - &sysfs_fragmentation_quantiles, - &sysfs_oldest_gen_quantiles, + &sysfs_bucket_quantiles_last_read, + &sysfs_bucket_quantiles_last_write, + &sysfs_bucket_quantiles_fragmentation, + &sysfs_bucket_quantiles_oldest_gen, + &sysfs_reserve_stats, /* debug: */ diff --git a/libbcachefs/util.c b/libbcachefs/util.c index fa853750..1f2c23b9 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -13,12 +13,15 @@ #include #include #include +#include +#include #include #include #include #include #include +#include "eytzinger.h" #include "util.h" #define simple_strtoint(c, end, base) simple_strtol(c, end, base) @@ -200,59 +203,189 @@ bool bch2_is_zero(const void *_p, size_t n) return true; } -void bch2_time_stats_clear(struct time_stats *stats) +void bch2_quantiles_update(struct quantiles *q, u64 v) { - spin_lock(&stats->lock); + unsigned i = 0; - stats->count = 0; - stats->last_duration = 0; - stats->max_duration = 0; - stats->average_duration = 0; - stats->average_frequency = 0; - stats->last = 0; + while (i < ARRAY_SIZE(q->entries)) { + struct quantile_entry *e = q->entries + i; - spin_unlock(&stats->lock); + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } } -void __bch2_time_stats_update(struct time_stats *stats, u64 start_time) +/* time stats: */ + +static void bch2_time_stats_update_one(struct time_stats *stats, + u64 start, u64 end) { - u64 now, duration, last; + u64 duration, freq; + + duration = time_after64(end, start) + ? end - start : 0; + freq = time_after64(end, stats->last_event) + ? end - stats->last_event : 0; stats->count++; - now = local_clock(); - duration = time_after64(now, start_time) - ? now - start_time : 0; - last = time_after64(now, stats->last) - ? now - stats->last : 0; + stats->average_duration = stats->average_duration + ? ewma_add(stats->average_duration, duration, 6) + : duration; + + stats->average_frequency = stats->average_frequency + ? 
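/* the first event seeds the average, later ones decay into it */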
ewma_add(stats->average_frequency, freq, 6) + : freq; - stats->last_duration = duration; stats->max_duration = max(stats->max_duration, duration); - if (stats->last) { - stats->average_duration = ewma_add(stats->average_duration, - duration << 8, 3); + stats->last_event = end; - if (stats->average_frequency) - stats->average_frequency = - ewma_add(stats->average_frequency, - last << 8, 3); - else - stats->average_frequency = last << 8; + bch2_quantiles_update(&stats->quantiles, duration); +} + +void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + bch2_time_stats_update_one(stats, start, end); + + if (stats->average_frequency < 32 && + stats->count > 1024) + stats->buffer = + alloc_percpu_gfp(struct time_stat_buffer, + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); } else { - stats->average_duration = duration << 8; + struct time_stat_buffer_entry *i; + struct time_stat_buffer *b; + + preempt_disable(); + b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct time_stat_buffer_entry) { + .start = start, + .end = end + }; + + if (b->nr == ARRAY_SIZE(b->entries)) { + spin_lock_irqsave(&stats->lock, flags); + for (i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + bch2_time_stats_update_one(stats, i->start, i->end); + spin_unlock_irqrestore(&stats->lock, flags); + + b->nr = 0; + } + + preempt_enable(); + } +} + +static const struct time_unit { + const char *name; + u32 nsecs; +} time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "sec", NSEC_PER_SEC }, +}; + +static const struct time_unit *pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +static size_t pr_time_units(char *buf, size_t len, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + return scnprintf(buf, len, "%llu %s", div_u64(ns, u->nsecs), u->name); +} + +size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len) +{ + char *out = buf, *end = buf + len; + const struct time_unit *u; + u64 freq = READ_ONCE(stats->average_frequency); + u64 q, last_q = 0; + int i; + + out += scnprintf(out, end - out, "count:\t\t%llu\n", + stats->count); + out += scnprintf(out, end - out, "rate:\t\t%llu/sec\n", + freq ? div64_u64(NSEC_PER_SEC, freq) : 0); + + out += scnprintf(out, end - out, "frequency:\t"); + out += pr_time_units(out, end - out, freq); + + out += scnprintf(out, end - out, "\navg duration:\t"); + out += pr_time_units(out, end - out, stats->average_duration); + + out += scnprintf(out, end - out, "\nmax duration:\t"); + out += pr_time_units(out, end - out, stats->max_duration); + + i = eytzinger0_first(NR_QUANTILES); + u = pick_time_units(stats->quantiles.entries[i].m); + + out += scnprintf(out, end - out, "\nquantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + q = max(stats->quantiles.entries[i].m, last_q); + out += scnprintf(out, end - out, "%llu%s", + div_u64(q, u->nsecs), + is_last ? 
"\n" : " "); + last_q = q; } - stats->last = now ?: 1; + return out - buf; } -void bch2_time_stats_update(struct time_stats *stats, u64 start_time) +void bch2_time_stats_exit(struct time_stats *stats) { - spin_lock(&stats->lock); - __bch2_time_stats_update(stats, start_time); - spin_unlock(&stats->lock); + free_percpu(stats->buffer); } +void bch2_time_stats_init(struct time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + spin_lock_init(&stats->lock); +} + +/* ratelimit: */ + /** * bch2_ratelimit_delay() - return how long to delay until the next time to do * some work @@ -310,6 +443,8 @@ int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d) } } +/* pd controller: */ + /* * Updates pd_controller. Attempts to scale inputed values to units per second. * @target: desired value @@ -404,6 +539,8 @@ size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) derivative, change, next_io); } +/* misc: */ + void bch2_bio_map(struct bio *bio, void *base) { size_t size = bio->bi_iter.bi_size; diff --git a/libbcachefs/util.h b/libbcachefs/util.h index cc89da1f..7c7264f4 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -371,87 +371,50 @@ ssize_t bch2_read_string_list(const char *, const char * const[]); ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64); u64 bch2_read_flag_list(char *, const char * const[]); +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) +#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) +#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) + +struct quantiles { + struct quantile_entry { + u64 m; + u64 step; + } entries[NR_QUANTILES]; +}; + +struct time_stat_buffer { + unsigned nr; + struct time_stat_buffer_entry { + u64 start; + u64 end; + } entries[32]; +}; + struct time_stats { spinlock_t lock; u64 count; - /* - * all fields are in nanoseconds, averages are ewmas stored left shifted - * by 8 - */ - u64 last_duration; - u64 max_duration; + /* all fields are in nanoseconds */ u64 average_duration; u64 average_frequency; - u64 last; + u64 max_duration; + u64 last_event; + struct quantiles quantiles; + + struct time_stat_buffer __percpu *buffer; }; -void bch2_time_stats_clear(struct time_stats *stats); -void __bch2_time_stats_update(struct time_stats *stats, u64 time); -void bch2_time_stats_update(struct time_stats *stats, u64 time); +void __bch2_time_stats_update(struct time_stats *stats, u64, u64); -static inline unsigned local_clock_us(void) +static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) { - return local_clock() >> 10; + __bch2_time_stats_update(stats, start, local_clock()); } -#define NSEC_PER_ns 1L -#define NSEC_PER_us NSEC_PER_USEC -#define NSEC_PER_ms NSEC_PER_MSEC -#define NSEC_PER_sec NSEC_PER_SEC +size_t bch2_time_stats_print(struct time_stats *, char *, size_t); -#define __print_time_stat(stats, name, stat, units) \ - sysfs_print(name ## _ ## stat ## _ ## units, \ - div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) - -#define sysfs_print_time_stats(stats, name, \ - frequency_units, \ - duration_units) \ -do { \ - __print_time_stat(stats, name, \ - average_frequency, frequency_units); \ - __print_time_stat(stats, name, \ - average_duration, duration_units); \ - sysfs_print(name ## _ ##count, (stats)->count); \ - sysfs_print(name ## _ ##last_duration ## _ ## duration_units, \ - div_u64((stats)->last_duration, \ - NSEC_PER_ ## duration_units)); \ - sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \ - 
div_u64((stats)->max_duration, \ - NSEC_PER_ ## duration_units)); \ - \ - sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ - ? div_s64(local_clock() - (stats)->last, \ - NSEC_PER_ ## frequency_units) \ - : -1LL); \ -} while (0) - -#define sysfs_clear_time_stats(stats, name) \ -do { \ - if (attr == &sysfs_ ## name ## _clear) \ - bch2_time_stats_clear(stats); \ -} while (0) - -#define sysfs_time_stats_attribute(name, \ - frequency_units, \ - duration_units) \ -write_attribute(name ## _clear); \ -read_attribute(name ## _count); \ -read_attribute(name ## _average_frequency_ ## frequency_units); \ -read_attribute(name ## _average_duration_ ## duration_units); \ -read_attribute(name ## _last_duration_ ## duration_units); \ -read_attribute(name ## _max_duration_ ## duration_units); \ -read_attribute(name ## _last_ ## frequency_units) - -#define sysfs_time_stats_attribute_list(name, \ - frequency_units, \ - duration_units) \ -&sysfs_ ## name ## _clear, \ -&sysfs_ ## name ## _count, \ -&sysfs_ ## name ## _average_frequency_ ## frequency_units, \ -&sysfs_ ## name ## _average_duration_ ## duration_units, \ -&sysfs_ ## name ## _last_duration_ ## duration_units, \ -&sysfs_ ## name ## _max_duration_ ## duration_units, \ -&sysfs_ ## name ## _last_ ## frequency_units, +void bch2_time_stats_exit(struct time_stats *); +void bch2_time_stats_init(struct time_stats *); #define ewma_add(ewma, val, weight) \ ({ \