Update bcachefs sources to ed4aea2ad4 bcachefs: fix gcc warning

Kent Overstreet 2018-05-04 14:04:31 -04:00
parent c598d91dcb
commit 018de5aa89
37 changed files with 4216 additions and 3299 deletions


@ -1 +1 @@
edf5f38218f699e53913a549465f35d36c4418f7
ed4aea2ad4fa1b3891684cbd071d1a1ae9094342


@ -69,6 +69,7 @@
#include "extents.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "super-io.h"
#include <linux/blkdev.h>


@ -271,17 +271,19 @@ do { \
#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS()
#endif
/* name, frequency_units, duration_units */
#define BCH_TIME_STATS() \
BCH_TIME_STAT(btree_node_mem_alloc, sec, us) \
BCH_TIME_STAT(btree_gc, sec, ms) \
BCH_TIME_STAT(btree_split, sec, us) \
BCH_TIME_STAT(btree_sort, ms, us) \
BCH_TIME_STAT(btree_read, ms, us) \
BCH_TIME_STAT(journal_write, us, us) \
BCH_TIME_STAT(journal_delay, ms, us) \
BCH_TIME_STAT(journal_blocked, sec, ms) \
BCH_TIME_STAT(journal_flush_seq, us, us)
BCH_TIME_STAT(btree_node_mem_alloc) \
BCH_TIME_STAT(btree_gc) \
BCH_TIME_STAT(btree_split) \
BCH_TIME_STAT(btree_sort) \
BCH_TIME_STAT(btree_read) \
BCH_TIME_STAT(data_write) \
BCH_TIME_STAT(data_read) \
BCH_TIME_STAT(data_promote) \
BCH_TIME_STAT(journal_write) \
BCH_TIME_STAT(journal_delay) \
BCH_TIME_STAT(journal_blocked) \
BCH_TIME_STAT(journal_flush_seq)
#include "alloc_types.h"
#include "buckets_types.h"
@ -416,7 +418,12 @@ struct bch_dev {
struct work_struct io_error_work;
/* The rest of this all shows up in sysfs */
atomic_t latency[2];
atomic64_t cur_latency[2];
struct time_stats io_latency[2];
#define CONGESTED_MAX 1024
atomic_t congested;
u64 congested_last;
struct io_count __percpu *io_done;
};
@ -644,6 +651,7 @@ struct bch_fs {
struct bio_set bio_write;
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
struct rhashtable promote_table;
mempool_t compression_bounce[2];
mempool_t compress_workspace[BCH_COMPRESSION_NR];
@ -708,12 +716,13 @@ struct bch_fs {
unsigned copy_gc_enabled:1;
unsigned rebalance_enabled:1;
unsigned rebalance_percent;
bool promote_whole_extents;
#define BCH_DEBUG_PARAM(name, description) bool name;
BCH_DEBUG_PARAMS_ALL()
#undef BCH_DEBUG_PARAM
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
#define BCH_TIME_STAT(name) \
struct time_stats name##_time;
BCH_TIME_STATS()
#undef BCH_TIME_STAT
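Aside: BCH_TIME_STATS()/BCH_TIME_STAT() is an x-macro list. A minimal standalone sketch of how the new single-argument form expands (the names and the stub time_stats here are illustrative, not the bcachefs definitions):

#include <stdint.h>

struct time_stats { uint64_t count, total_ns; };        /* stub for the sketch */

#define EXAMPLE_TIME_STATS()                    \
        EXAMPLE_TIME_STAT(node_alloc)           \
        EXAMPLE_TIME_STAT(journal_write)

struct example_stats {
#define EXAMPLE_TIME_STAT(name) struct time_stats name##_time;
        EXAMPLE_TIME_STATS()
#undef EXAMPLE_TIME_STAT
};

/* expands to:
 *         struct time_stats node_alloc_time;
 *         struct time_stats journal_write_time;
 */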


@ -1088,13 +1088,14 @@ LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS,
LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24);
LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28);
LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
struct bch_sb, flags[1], 28, 32);
LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40);
LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52);
LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64);
LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
struct bch_sb, flags[2], 0, 4);
/* Features: */
enum bch_sb_features {
BCH_FEATURE_LZ4 = 0,
@ -1193,29 +1194,41 @@ struct jset_entry {
};
};
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
#define BCH_JSET_ENTRY_TYPES() \
x(btree_keys, 0) \
x(btree_root, 1) \
x(prio_ptrs, 2) \
x(blacklist, 3) \
x(blacklist_v2, 4)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
BCH_JSET_ENTRY_TYPES()
#undef x
BCH_JSET_ENTRY_NR
};
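For reference, the x() expansion above is mechanical; the preprocessed enum comes out as:

enum {
        BCH_JSET_ENTRY_btree_keys       = 0,
        BCH_JSET_ENTRY_btree_root       = 1,
        BCH_JSET_ENTRY_prio_ptrs        = 2,
        BCH_JSET_ENTRY_blacklist        = 3,
        BCH_JSET_ENTRY_blacklist_v2     = 4,
        BCH_JSET_ENTRY_NR
};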
/*
* Journal sequence numbers can be blacklisted: bsets record the max sequence
* number of all the journal entries they contain updates for, so that on
recovery we can ignore those bsets that contain index updates newer than what
* made it into the journal.
*
* This means that we can't reuse that journal_seq - we have to skip it, and
* then record that we skipped it so that the next time we crash and recover we
* don't think there was a missing journal entry.
*/
struct jset_entry_blacklist {
struct jset_entry entry;
__le64 seq;
};
#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
enum {
JOURNAL_ENTRY_BTREE_KEYS = 0,
JOURNAL_ENTRY_BTREE_ROOT = 1,
JOURNAL_ENTRY_PRIO_PTRS = 2, /* Obsolete */
/*
* Journal sequence numbers can be blacklisted: bsets record the max
* sequence number of all the journal entries they contain updates for,
* so that on recovery we can ignore those bsets that contain index
updates newer than what made it into the journal.
*
* This means that we can't reuse that journal_seq - we have to skip it,
* and then record that we skipped it so that the next time we crash and
* recover we don't think there was a missing journal entry.
*/
JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3,
struct jset_entry_blacklist_v2 {
struct jset_entry entry;
__le64 start;
__le64 end;
};
/*


@ -13,7 +13,8 @@
#include "error.h"
#include "extents.h"
#include "io.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "super-io.h"
#include <trace/events/bcachefs.h>
@ -947,6 +948,7 @@ enum btree_validate_ret {
#define btree_err(type, c, b, i, msg, ...) \
({ \
__label__ out; \
char _buf[300], *out = _buf, *end = out + sizeof(_buf); \
\
out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
@ -956,7 +958,11 @@ enum btree_validate_ret {
write == READ && \
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
mustfix_fsck_err(c, "%s", _buf); \
} else { \
goto out; \
} \
\
switch (write) { \
case READ: \
bch_err(c, "%s", _buf); \
\
switch (type) { \
@ -976,7 +982,17 @@ enum btree_validate_ret {
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
goto fsck_err; \
} \
break; \
case WRITE: \
bch_err(c, "corrupt metadata before write: %s", _buf); \
\
if (bch2_fs_inconsistent(c)) { \
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
goto fsck_err; \
} \
break; \
} \
out: \
true; \
})
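The btree_err() macro above leans on two GNU C extensions, statement expressions and local labels; a minimal sketch of that shape in isolation (the macro name and body here are made up, only the idiom matches):

#define check_nonneg(x)                                         \
({                                                              \
        __label__ out;                                          \
        int _ret = 1;                                           \
        if ((x) >= 0)                                           \
                goto out;  /* early exit, like 'goto out' above */ \
        _ret = 0;                                               \
out:                                                            \
        _ret;   /* value of the whole ({ ... }) expression */   \
})

/* usage: if (!check_nonneg(v)) handle_bad_value(); */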
@ -1323,37 +1339,48 @@ static void btree_node_read_work(struct work_struct *work)
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bch_devs_mask avoid;
bool can_retry;
memset(&avoid, 0, sizeof(avoid));
goto start;
do {
while (1) {
bch_info(c, "retrying read");
ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio);
bio_set_dev(bio, rb->pick.ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
submit_bio_wait(bio);
start:
bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
percpu_ref_put(&rb->pick.ca->io_ref);
__set_bit(rb->pick.ca->dev_idx, avoid.d);
rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
if (rb->have_ioref) {
bio_set_dev(bio, ca->disk_sb.bdev);
submit_bio_wait(bio);
} else {
bio->bi_status = BLK_STS_REMOVED;
}
start:
bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
if (rb->have_ioref)
percpu_ref_put(&ca->io_ref);
rb->have_ioref = false;
__set_bit(rb->pick.ptr.dev, avoid.d);
can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0;
if (!bio->bi_status &&
!bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
goto out;
} while (!IS_ERR_OR_NULL(rb->pick.ca));
!bch2_btree_node_read_done(c, b, can_retry))
break;
if (!can_retry) {
set_btree_node_read_error(b);
out:
if (!IS_ERR_OR_NULL(rb->pick.ca))
percpu_ref_put(&rb->pick.ca->io_ref);
break;
}
}
bch2_time_stats_update(&c->btree_read_time, rb->start_time);
bio_put(&rb->bio);
@ -1365,10 +1392,13 @@ static void btree_node_read_endio(struct bio *bio)
{
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
struct bch_fs *c = rb->c;
bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
if (rb->have_ioref) {
struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
bch2_latency_acct(ca, rb->start_time, READ);
}
INIT_WORK(&rb->work, btree_node_read_work);
queue_work(system_unbound_wq, &rb->work);
}
@ -1377,42 +1407,59 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
{
struct extent_pick_ptr pick;
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
int ret;
trace_btree_read(c, b);
pick = bch2_btree_pick_ptr(c, b, NULL);
if (bch2_fs_fatal_err_on(!pick.ca, c,
ret = bch2_btree_pick_ptr(c, b, NULL, &pick);
if (bch2_fs_fatal_err_on(ret <= 0, c,
"btree node read error: no device to read from")) {
set_btree_node_read_error(b);
return;
}
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
rb->start_time = local_clock();
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->pick = pick;
bio_set_dev(bio, pick.ca->disk_sb.bdev);
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
bio->bi_end_io = btree_node_read_endio;
bio->bi_private = b;
bch2_bio_map(bio, b->data);
this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE],
bio_sectors(bio));
set_btree_node_read_in_flight(b);
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
bio_sectors(bio));
bio_set_dev(bio, ca->disk_sb.bdev);
if (sync) {
submit_bio_wait(bio);
bio->bi_private = b;
btree_node_read_work(&rb->work);
} else {
bio->bi_end_io = btree_node_read_endio;
bio->bi_private = b;
submit_bio(bio);
}
} else {
bio->bi_status = BLK_STS_REMOVED;
if (sync)
btree_node_read_work(&rb->work);
else
queue_work(system_unbound_wq, &rb->work);
}
}
int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
@ -1593,20 +1640,21 @@ static void btree_node_write_endio(struct bio *bio)
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_write_bio *orig = parent ?: wbio;
struct bch_fs *c = wbio->c;
struct bch_dev *ca = wbio->ca;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
unsigned long flags;
bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
if (wbio->have_ioref)
bch2_latency_acct(ca, wbio->submit_time, WRITE);
if (bio->bi_status == BLK_STS_REMOVED ||
bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
if (wbio->have_io_ref)
if (wbio->have_ioref)
percpu_ref_put(&ca->io_ref);
if (parent) {


@ -12,8 +12,8 @@ struct btree_iter;
struct btree_read_bio {
struct bch_fs *c;
unsigned submit_time_us;
u64 start_time;
unsigned have_ioref:1;
struct extent_pick_ptr pick;
struct work_struct work;
struct bio bio;


@ -748,7 +748,9 @@ static void btree_iter_prefetch(struct btree_iter *iter)
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
BKEY_PADDED(k) tmp;
unsigned nr = iter->level > 1 ? 1 : 8;
unsigned nr = test_bit(BCH_FS_STARTED, &iter->c->flags)
? (iter->level > 1 ? 0 : 2)
: (iter->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(iter, iter->level);
while (nr) {


@ -12,6 +12,7 @@
#include "buckets.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "replicas.h"
#include "super-io.h"


@ -8,6 +8,7 @@
#include "debug.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include <linux/sort.h>
@ -137,7 +138,7 @@ void bch2_btree_journal_key(struct btree_insert *trans,
EBUG_ON(trans->journal_res.ref !=
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
if (likely(trans->journal_res.ref)) {
if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
u64 seq = trans->journal_res.seq;
bool needs_whiteout = insert->k.needs_whiteout;
@ -155,12 +156,16 @@ void bch2_btree_journal_key(struct btree_insert *trans,
btree_bset_last(b)->journal_seq = cpu_to_le64(seq);
}
if (unlikely(!journal_pin_active(&w->journal)))
bch2_journal_pin_add(j, &trans->journal_res,
&w->journal,
if (unlikely(!journal_pin_active(&w->journal))) {
u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
? trans->journal_res.seq
: j->replay_journal_seq;
bch2_journal_pin_add(j, seq, &w->journal,
btree_node_write_idx(b) == 0
? btree_node_flush0
: btree_node_flush1);
}
if (unlikely(!btree_node_dirty(b)))
set_btree_node_dirty(b);


@ -142,7 +142,8 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca,
u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
if (WARN_ONCE(stats.buckets_unavailable > total,
"buckets_unavailable overflow\n"))
"buckets_unavailable overflow (%llu > %llu)\n",
stats.buckets_unavailable, total))
return 0;
return total - stats.buckets_unavailable;


@ -36,6 +36,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
struct bset *sorted, *inmemory;
struct extent_pick_ptr pick;
struct bch_dev *ca;
struct bio *bio;
if (c->opts.nochanges)
@ -54,12 +55,15 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
v->btree_id = b->btree_id;
bch2_btree_keys_init(v, &c->expensive_debug_checks);
pick = bch2_btree_pick_ptr(c, b, NULL);
if (IS_ERR_OR_NULL(pick.ca))
if (bch2_btree_pick_ptr(c, b, NULL, &pick) <= 0)
return;
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
if (!bch2_dev_get_ioref(ca, READ))
return;
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
bio_set_dev(bio, pick.ca->disk_sb.bdev);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
@ -68,7 +72,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
submit_bio_wait(bio);
bio_put(bio);
percpu_ref_put(&pick.ca->io_ref);
percpu_ref_put(&ca->io_ref);
memcpy(n_ondisk, n_sorted, btree_bytes(c));


@ -3,20 +3,22 @@
#include "io.h"
#include "super.h"
void bch2_inconsistent_error(struct bch_fs *c)
bool bch2_inconsistent_error(struct bch_fs *c)
{
set_bit(BCH_FS_ERROR, &c->flags);
switch (c->opts.errors) {
case BCH_ON_ERROR_CONTINUE:
break;
return false;
case BCH_ON_ERROR_RO:
if (bch2_fs_emergency_read_only(c))
bch_err(c, "emergency read only");
break;
return true;
case BCH_ON_ERROR_PANIC:
panic(bch2_fmt(c, "panic after error"));
break;
return true;
default:
BUG();
}
}


@ -45,13 +45,13 @@ do { \
* BCH_ON_ERROR_CONTINUE mode
*/
void bch2_inconsistent_error(struct bch_fs *);
bool bch2_inconsistent_error(struct bch_fs *);
#define bch2_fs_inconsistent(c, ...) \
do { \
({ \
bch_err(c, __VA_ARGS__); \
bch2_inconsistent_error(c); \
} while (0)
})
#define bch2_fs_inconsistent_on(cond, c, ...) \
({ \


@ -588,58 +588,51 @@ out:
return out - buf;
}
static inline bool dev_latency_better(struct bch_dev *dev1,
struct bch_dev *dev2)
static inline bool dev_latency_better(struct bch_fs *c,
const struct bch_extent_ptr *ptr1,
const struct bch_extent_ptr *ptr2)
{
unsigned l1 = atomic_read(&dev1->latency[READ]);
unsigned l2 = atomic_read(&dev2->latency[READ]);
struct bch_dev *dev1 = bch_dev_bkey_exists(c, ptr1->dev);
struct bch_dev *dev2 = bch_dev_bkey_exists(c, ptr2->dev);
u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
/* Pick at random, biased in favor of the faster device: */
return bch2_rand_range(l1 + l2) > l1;
}
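The one-liner above prefers a device with probability proportional to the other device's latency, so faster devices win more often without starving slower ones. A standalone sketch of the same weighting (rand_range() is a hypothetical stand-in for bch2_rand_range()):

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* uniform in [0, max); ignores modulo bias, assumes max > 0 */
static uint64_t rand_range(uint64_t max)
{
        return (uint64_t) rand() % max;
}

/* returns true (prefer device 1) with probability ~ l2 / (l1 + l2) */
static bool prefer_dev1(uint64_t l1, uint64_t l2)
{
        return rand_range(l1 + l2) > l1;
}

/* e.g. l1 = 1ms, l2 = 3ms: device 1 is preferred roughly 75% of the time */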
static void extent_pick_read_device(struct bch_fs *c,
static int extent_pick_read_device(struct bch_fs *c,
struct bkey_s_c_extent e,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *pick)
{
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
struct bch_dev *ca;
int ret = 0;
extent_for_each_ptr_crc(e, ptr, crc) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
ca = bch_dev_bkey_exists(c, ptr->dev);
if (ptr->cached && ptr_stale(ca, ptr))
continue;
if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
if (avoid && test_bit(ptr->dev, avoid->d))
continue;
if (avoid) {
if (test_bit(ca->dev_idx, avoid->d))
if (ret && !dev_latency_better(c, ptr, &pick->ptr))
continue;
if (pick->ca &&
test_bit(pick->ca->dev_idx, avoid->d))
goto use;
}
if (pick->ca && !dev_latency_better(ca, pick->ca))
continue;
use:
if (!percpu_ref_tryget(&ca->io_ref))
continue;
if (pick->ca)
percpu_ref_put(&pick->ca->io_ref);
*pick = (struct extent_pick_ptr) {
.ptr = *ptr,
.crc = crc,
.ca = ca,
};
ret = 1;
}
return ret;
}
/* Btree ptrs */
@ -759,16 +752,12 @@ void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
#undef p
}
struct extent_pick_ptr
bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
struct bch_devs_mask *avoid)
int bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *pick)
{
struct extent_pick_ptr pick = { .ca = NULL };
extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
avoid, &pick);
return pick;
return extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
avoid, pick);
}
/* Extents */
@ -2057,37 +2046,33 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
* Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
* other devices, it will still pick a pointer from avoid.
*/
void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
int bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *ret)
struct extent_pick_ptr *pick)
{
struct bkey_s_c_extent e;
int ret;
switch (k.k->type) {
case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD:
case KEY_TYPE_COOKIE:
ret->ca = NULL;
return;
return 0;
case KEY_TYPE_ERROR:
ret->ca = ERR_PTR(-EIO);
return;
return -EIO;
case BCH_EXTENT:
case BCH_EXTENT_CACHED:
e = bkey_s_c_to_extent(k);
ret->ca = NULL;
ret = extent_pick_read_device(c, bkey_s_c_to_extent(k),
avoid, pick);
extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret);
if (!ret && !bkey_extent_is_cached(k.k))
ret = -EIO;
if (!ret->ca && !bkey_extent_is_cached(e.k))
ret->ca = ERR_PTR(-EIO);
return;
return ret;
case BCH_RESERVATION:
ret->ca = NULL;
return;
return 0;
default:
BUG();
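The picker now follows a three-way return convention: negative errno on hard error, 0 when there is nothing to read from, positive when *pick was filled in. A minimal sketch of the caller shape this implies (the stub and messages are hypothetical; only the convention is from the diff):

#include <stdio.h>

/* stand-in for the picker: pretend it found a pointer */
static int pick_ptr_stub(void)
{
        return 1;       /* would be -EIO on error, 0 for a hole */
}

int main(void)
{
        int ret = pick_ptr_stub();

        if (ret < 0)
                printf("hard error %d: fail the read\n", ret);
        else if (!ret)
                printf("no pointer: zero-fill, treat as a hole\n");
        else
                printf("got a pick: submit the read\n");
        return 0;
}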


@ -53,11 +53,11 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
struct btree *,
struct btree_node_iter_large *);
struct extent_pick_ptr
bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
struct bch_devs_mask *avoid);
int bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
struct bch_devs_mask *avoid,
struct extent_pick_ptr *);
void bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
int bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c,
struct bch_devs_mask *,
struct extent_pick_ptr *);


@ -21,7 +21,6 @@ struct bch_extent_crc_unpacked {
struct extent_pick_ptr {
struct bch_extent_ptr ptr;
struct bch_extent_crc_unpacked crc;
struct bch_dev *ca;
};
#endif /* _BCACHEFS_EXTENTS_TYPES_H */


@ -20,6 +20,7 @@
#include <linux/migrate.h>
#include <linux/mmu_context.h>
#include <linux/pagevec.h>
#include <linux/sched/signal.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/uio.h>
#include <linux/writeback.h>
@ -124,13 +125,13 @@ static void bch2_quota_reservation_put(struct bch_fs *c,
if (!res->sectors)
return;
mutex_lock(&inode->ei_update_lock);
mutex_lock(&inode->ei_quota_lock);
BUG_ON(res->sectors > inode->ei_quota_reserved);
bch2_quota_acct(c, inode->ei_qid, Q_SPC,
-((s64) res->sectors), BCH_QUOTA_PREALLOC);
inode->ei_quota_reserved -= res->sectors;
mutex_unlock(&inode->ei_update_lock);
mutex_unlock(&inode->ei_quota_lock);
res->sectors = 0;
}
@ -143,14 +144,14 @@ static int bch2_quota_reservation_add(struct bch_fs *c,
{
int ret;
mutex_lock(&inode->ei_update_lock);
mutex_lock(&inode->ei_quota_lock);
ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors,
check_enospc ? BCH_QUOTA_PREALLOC : BCH_QUOTA_NOCHECK);
if (likely(!ret)) {
inode->ei_quota_reserved += sectors;
res->sectors += sectors;
}
mutex_unlock(&inode->ei_update_lock);
mutex_unlock(&inode->ei_quota_lock);
return ret;
}
@ -195,9 +196,10 @@ static int __must_check bch2_write_inode_size(struct bch_fs *c,
return __bch2_write_inode(c, inode, inode_set_size, &new_size);
}
static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
struct quota_res *quota_res, int sectors)
{
mutex_lock(&inode->ei_quota_lock);
#ifdef CONFIG_BCACHEFS_QUOTA
if (quota_res && sectors > 0) {
BUG_ON(sectors > quota_res->sectors);
@ -210,14 +212,7 @@ static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
}
#endif
inode->v.i_blocks += sectors;
}
static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
struct quota_res *quota_res, int sectors)
{
mutex_lock(&inode->ei_update_lock);
__i_sectors_acct(c, inode, quota_res, sectors);
mutex_unlock(&inode->ei_update_lock);
mutex_unlock(&inode->ei_quota_lock);
}
/* i_sectors accounting: */
@ -265,7 +260,7 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
if (h->new_i_size != U64_MAX)
i_size_write(&h->inode->v, h->new_i_size);
__i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
mutex_unlock(&h->inode->ei_update_lock);
@ -773,6 +768,7 @@ void bch2_invalidatepage(struct page *page, unsigned int offset,
int bch2_releasepage(struct page *page, gfp_t gfp_mask)
{
/* XXX: this can't take locks that are held while we allocate memory */
EBUG_ON(!PageLocked(page));
EBUG_ON(PageWriteback(page));
@ -881,10 +877,12 @@ static int readpage_add_page(struct readpages_iter *iter, struct page *page)
int ret;
prefetchw(&page->flags);
page_state_init_for_read(page);
ret = add_to_page_cache_lru(page, iter->mapping,
page->index, GFP_NOFS);
if (!ret)
page_state_init_for_read(page);
put_page(page);
return ret;
}
@ -992,12 +990,13 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
int flags = BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE;
rbio->c = c;
rbio->start_time = local_clock();
while (1) {
struct extent_pick_ptr pick;
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
unsigned bytes;
bool is_last;
bch2_btree_iter_set_pos(iter, POS(inum, bio->bi_iter.bi_sector));
@ -1016,45 +1015,37 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter,
bch2_btree_iter_unlock(iter);
k = bkey_i_to_s_c(&tmp.k);
bch2_extent_pick_ptr(c, k, NULL, &pick);
if (IS_ERR(pick.ca)) {
bcache_io_error(c, bio, "no device to read from");
bio_endio(bio);
return;
if (readpages_iter) {
bool want_full_extent = false;
if (bkey_extent_is_data(k.k)) {
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
extent_for_each_ptr_crc(e, ptr, crc)
want_full_extent |= !!crc.csum_type |
!!crc.compression_type;
}
if (readpages_iter)
readpage_bio_extend(readpages_iter,
bio, k.k->p.offset,
pick.ca &&
(pick.crc.csum_type ||
pick.crc.compression_type));
want_full_extent);
}
bytes = (min_t(u64, k.k->p.offset, bio_end_sector(bio)) -
bio->bi_iter.bi_sector) << 9;
is_last = bytes == bio->bi_iter.bi_size;
swap(bio->bi_iter.bi_size, bytes);
if (bytes == bio->bi_iter.bi_size)
flags |= BCH_READ_LAST_FRAGMENT;
if (bkey_extent_is_allocation(k.k))
bch2_add_page_sectors(bio, k);
if (pick.ca) {
if (!is_last) {
bio_inc_remaining(&rbio->bio);
flags |= BCH_READ_MUST_CLONE;
trace_read_split(&rbio->bio);
}
bch2_read_extent(c, rbio, k, flags);
bch2_read_extent(c, rbio, bkey_s_c_to_extent(k),
&pick, flags);
} else {
zero_fill_bio(bio);
if (is_last)
bio_endio(bio);
}
if (is_last)
if (flags & BCH_READ_LAST_FRAGMENT)
return;
swap(bio->bi_iter.bi_size, bytes);
@ -1487,6 +1478,194 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
return copied;
}
#define WRITE_BATCH_PAGES 32
static int __bch2_buffered_write(struct bch_inode_info *inode,
struct address_space *mapping,
struct iov_iter *iter,
loff_t pos, unsigned len)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct page *pages[WRITE_BATCH_PAGES];
unsigned long index = pos >> PAGE_SHIFT;
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
unsigned i, copied = 0, nr_pages_copied = 0;
int ret = 0;
BUG_ON(!len);
BUG_ON(nr_pages > ARRAY_SIZE(pages));
for (i = 0; i < nr_pages; i++) {
pages[i] = grab_cache_page_write_begin(mapping, index + i, 0);
if (!pages[i]) {
nr_pages = i;
ret = -ENOMEM;
goto out;
}
}
if (offset && !PageUptodate(pages[0])) {
ret = bch2_read_single_page(pages[0], mapping);
if (ret)
goto out;
}
if ((pos + len) & (PAGE_SIZE - 1) &&
!PageUptodate(pages[nr_pages - 1])) {
if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) {
zero_user(pages[nr_pages - 1], 0, PAGE_SIZE);
} else {
ret = bch2_read_single_page(pages[nr_pages - 1], mapping);
if (ret)
goto out;
}
}
for (i = 0; i < nr_pages; i++) {
ret = bch2_get_page_reservation(c, inode, pages[i], true);
if (ret && !PageUptodate(pages[i])) {
ret = bch2_read_single_page(pages[i], mapping);
if (ret)
goto out;
ret = bch2_get_page_reservation(c, inode, pages[i], true);
}
if (ret)
goto out;
}
if (mapping_writably_mapped(mapping))
for (i = 0; i < nr_pages; i++)
flush_dcache_page(pages[i]);
while (copied < len) {
struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1);
unsigned pg_bytes = min_t(unsigned, len - copied,
PAGE_SIZE - pg_offset);
unsigned pg_copied = iov_iter_copy_from_user_atomic(page,
iter, pg_offset, pg_bytes);
if (!pg_copied)
break;
flush_dcache_page(page);
iov_iter_advance(iter, pg_copied);
copied += pg_copied;
}
if (!copied)
goto out;
nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
inode->ei_last_dirtied = (unsigned long) current;
if (pos + copied > inode->v.i_size)
i_size_write(&inode->v, pos + copied);
if (copied < len &&
((offset + copied) & (PAGE_SIZE - 1))) {
struct page *page = pages[(offset + copied) >> PAGE_SHIFT];
if (!PageUptodate(page)) {
zero_user(page, 0, PAGE_SIZE);
copied -= (offset + copied) & (PAGE_SIZE - 1);
}
}
out:
for (i = 0; i < nr_pages_copied; i++) {
if (!PageUptodate(pages[i]))
SetPageUptodate(pages[i]);
if (!PageDirty(pages[i]))
set_page_dirty(pages[i]);
unlock_page(pages[i]);
put_page(pages[i]);
}
for (i = nr_pages_copied; i < nr_pages; i++) {
if (!PageDirty(pages[i]))
bch2_put_page_reservation(c, inode, pages[i]);
unlock_page(pages[i]);
put_page(pages[i]);
}
return copied ?: ret;
}
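The index/offset/nr_pages arithmetic at the top of __bch2_buffered_write() is easiest to see with numbers; a standalone worked example assuming 4KiB pages (the values are illustrative):

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long pos = 6000, len = 9000;
        unsigned long index    = pos >> PAGE_SHIFT;                     /* 1 */
        unsigned long offset   = pos & (PAGE_SIZE - 1);                 /* 1904 */
        unsigned long nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); /* 3 */

        /* the write touches pages 1..3: a partial first and last page */
        printf("%lu %lu %lu\n", index, offset, nr_pages);
        return 0;
}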
static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct bch_inode_info *inode = file_bch_inode(file);
loff_t pos = iocb->ki_pos;
ssize_t written = 0;
int ret = 0;
pagecache_add_get(&mapping->add_lock);
do {
unsigned offset = pos & (PAGE_SIZE - 1);
unsigned bytes = min_t(unsigned long, iov_iter_count(iter),
PAGE_SIZE * WRITE_BATCH_PAGES - offset);
again:
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*
* Not only is this an optimisation, but it is also required
* to check that the address is actually valid, when atomic
* usercopies are used, below.
*/
if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
bytes = min_t(unsigned long, iov_iter_count(iter),
PAGE_SIZE - offset);
if (unlikely(iov_iter_fault_in_readable(iter, bytes))) {
ret = -EFAULT;
break;
}
}
if (unlikely(fatal_signal_pending(current))) {
ret = -EINTR;
break;
}
ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
if (unlikely(ret < 0))
break;
cond_resched();
if (unlikely(ret == 0)) {
/*
* If we were unable to copy any data at all, we must
* fall back to a single segment length write.
*
* If we didn't fallback here, we could livelock
* because not all segments in the iov can be copied at
* once without a pagefault.
*/
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_single_seg_count(iter));
goto again;
}
pos += ret;
written += ret;
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(iter));
pagecache_add_put(&mapping->add_lock);
return written ? written : ret;
}
/* O_DIRECT reads */
static void bch2_dio_read_complete(struct closure *cl)
@ -1822,7 +2001,7 @@ static ssize_t __bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = iocb->ki_flags & IOCB_DIRECT
? bch2_direct_write(iocb, from)
: generic_perform_write(file, from, iocb->ki_pos);
: bch2_buffered_write(iocb, from);
if (likely(ret > 0))
iocb->ki_pos += ret;


@ -1028,6 +1028,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb)
inode_init_once(&inode->v);
mutex_init(&inode->ei_update_lock);
mutex_init(&inode->ei_quota_lock);
inode->ei_journal_seq = 0;
return &inode->v;


@ -15,6 +15,8 @@ struct bch_inode_info {
u64 ei_journal_seq;
u64 ei_quota_reserved;
unsigned long ei_last_dirtied;
struct mutex ei_quota_lock;
struct bch_qid ei_qid;
struct bch_hash_info ei_str_hash;

File diff suppressed because it is too large


@ -16,7 +16,7 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *);
void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t);
void bch2_latency_acct(struct bch_dev *, unsigned, int);
void bch2_latency_acct(struct bch_dev *, u64, int);
void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
enum bch_data_type, const struct bkey_i *);
@ -99,40 +99,28 @@ struct cache_promote_op;
struct extent_pick_ptr;
int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
struct bkey_s_c_extent e, struct extent_pick_ptr *,
unsigned);
void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter,
u64, struct bch_devs_mask *, unsigned);
struct bkey_s_c, struct bch_devs_mask *, unsigned);
void bch2_read(struct bch_fs *, struct bch_read_bio *, u64);
enum bch_read_flags {
BCH_READ_RETRY_IF_STALE = 1 << 0,
BCH_READ_MAY_PROMOTE = 1 << 1,
BCH_READ_USER_MAPPED = 1 << 2,
BCH_READ_NODECODE = 1 << 3,
BCH_READ_LAST_FRAGMENT = 1 << 4,
/* internal: */
BCH_READ_MUST_BOUNCE = 1 << 4,
BCH_READ_MUST_CLONE = 1 << 5,
BCH_READ_IN_RETRY = 1 << 6,
BCH_READ_MUST_BOUNCE = 1 << 5,
BCH_READ_MUST_CLONE = 1 << 6,
BCH_READ_IN_RETRY = 1 << 7,
};
static inline void bch2_read_extent(struct bch_fs *c,
struct bch_read_bio *rbio,
struct bkey_s_c_extent e,
struct extent_pick_ptr *pick,
struct bkey_s_c k,
unsigned flags)
{
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags);
}
static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
u64 inode)
{
BUG_ON(rbio->_state);
__bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL,
BCH_READ_RETRY_IF_STALE|
BCH_READ_MAY_PROMOTE|
BCH_READ_USER_MAPPED);
__bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, NULL, flags);
}
static inline struct bch_read_bio *rbio_init(struct bio *bio,
@ -146,4 +134,7 @@ static inline struct bch_read_bio *rbio_init(struct bio *bio,
return rbio;
}
void bch2_fs_io_exit(struct bch_fs *);
int bch2_fs_io_init(struct bch_fs *);
#endif /* _BCACHEFS_IO_H */


@ -14,6 +14,8 @@
struct bch_read_bio {
struct bch_fs *c;
u64 start_time;
u64 submit_time;
/*
* Reads will often have to be split, and if the extent being read from
@ -35,17 +37,19 @@ struct bch_read_bio {
*/
struct bvec_iter bvec_iter;
unsigned submit_time_us;
u8 flags;
u16 flags;
union {
struct {
u8 bounce:1,
u16 bounce:1,
split:1,
kmalloc:1,
have_ioref:1,
narrow_crcs:1,
hole:1,
retry:2,
context:2;
};
u8 _state;
u16 _state;
};
struct bch_devs_list devs_have;
@ -66,20 +70,20 @@ struct bch_read_bio {
struct bch_write_bio {
struct bch_fs *c;
struct bch_dev *ca;
struct bch_write_bio *parent;
u64 submit_time;
struct bch_devs_list failed;
u8 order;
u8 dev;
unsigned split:1,
bounce:1,
put_bio:1,
have_io_ref:1,
have_ioref:1,
used_mempool:1;
unsigned submit_time_us;
struct bio bio;
};
@ -87,6 +91,7 @@ struct bch_write_op {
struct closure cl;
struct bch_fs *c;
struct workqueue_struct *io_wq;
u64 start_time;
unsigned written; /* sectors */
u16 flags;

File diff suppressed because it is too large


@ -112,72 +112,37 @@
#include "journal_types.h"
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
*/
struct journal_replay {
struct list_head list;
struct bch_devs_list devs;
/* must be last: */
struct jset j;
};
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
while (entry < vstruct_last(jset)) {
if (entry->type == type)
return entry;
entry = vstruct_next(entry);
}
return NULL;
}
#define for_each_jset_entry_type(entry, jset, type) \
for (entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
#define for_each_jset_key(k, _n, entry, jset) \
for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \
vstruct_for_each_safe(entry, k, _n)
#define JOURNAL_PIN (32 * 1024)
static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
return pin->pin_list != NULL;
}
static inline struct journal_entry_pin_list *
journal_seq_pin(struct journal *j, u64 seq)
{
return &j->pin.data[seq & j->pin.mask];
}
u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add(struct journal *, struct journal_res *,
struct journal_entry_pin *, journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,
journal_pin_flush_fn);
int bch2_journal_flush_pins(struct journal *, u64);
int bch2_journal_flush_all_pins(struct journal *);
struct closure;
struct bch_fs;
struct keylist;
struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
enum btree_id, unsigned *);
static inline void journal_wake(struct journal *j)
{
wake_up(&j->wait);
closure_wake_up(&j->async_wait);
}
int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
static inline struct journal_buf *journal_cur_buf(struct journal *j)
{
return j->buf + j->reservations.idx;
}
static inline struct journal_buf *journal_prev_buf(struct journal *j)
{
return j->buf + !j->reservations.idx;
}
/* Sequence number of oldest dirty journal entry */
static inline u64 journal_last_seq(struct journal *j)
{
return j->pin.front;
}
static inline u64 journal_cur_seq(struct journal *j)
{
BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
return j->pin.back - 1;
}
u64 bch2_inode_journal_seq(struct journal *, u64);
@ -213,21 +178,18 @@ static inline unsigned jset_u64s(unsigned u64s)
return u64s + sizeof(struct jset_entry) / sizeof(u64);
}
static inline void bch2_journal_add_entry_at(struct journal_buf *buf,
unsigned offset,
unsigned type, enum btree_id id,
unsigned level,
const void *data, size_t u64s)
static inline struct jset_entry *
bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
{
struct jset_entry *entry = vstruct_idx(buf->data, offset);
struct jset *jset = buf->data;
struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s));
memset(entry, 0, sizeof(*entry));
entry->u64s = cpu_to_le16(u64s);
entry->btree_id = id;
entry->level = level;
entry->type = type;
memcpy_u64s(entry->_data, data, u64s);
le32_add_cpu(&jset->u64s, jset_u64s(u64s));
return entry;
}
static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res,
@ -236,21 +198,27 @@ static inline void bch2_journal_add_entry(struct journal *j, struct journal_res
const void *data, unsigned u64s)
{
struct journal_buf *buf = &j->buf[res->idx];
struct jset_entry *entry = vstruct_idx(buf->data, res->offset);
unsigned actual = jset_u64s(u64s);
EBUG_ON(!res->ref);
EBUG_ON(actual > res->u64s);
bch2_journal_add_entry_at(buf, res->offset, type,
id, level, data, u64s);
res->offset += actual;
res->u64s -= actual;
memset(entry, 0, sizeof(*entry));
entry->u64s = cpu_to_le16(u64s);
entry->type = type;
entry->btree_id = id;
entry->level = level;
memcpy_u64s(entry->_data, data, u64s);
}
static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res,
enum btree_id id, const struct bkey_i *k)
{
bch2_journal_add_entry(j, res, JOURNAL_ENTRY_BTREE_KEYS,
bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys,
id, 0, k, k->k.u64s);
}
@ -292,7 +260,7 @@ static inline void bch2_journal_res_put(struct journal *j,
while (res->u64s)
bch2_journal_add_entry(j, res,
JOURNAL_ENTRY_BTREE_KEYS,
BCH_JSET_ENTRY_btree_keys,
0, 0, NULL, 0);
bch2_journal_buf_put(j, res->idx, false);
@ -368,7 +336,6 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
int bch2_journal_flush_seq(struct journal *, u64);
int bch2_journal_flush(struct journal *);
int bch2_journal_meta(struct journal *);
int bch2_journal_flush_device(struct journal *, int);
void bch2_journal_halt(struct journal *);
@ -385,10 +352,8 @@ static inline bool journal_flushes_device(struct bch_dev *ca)
return true;
}
void bch2_journal_start(struct bch_fs *);
int bch2_journal_mark(struct bch_fs *, struct list_head *);
void bch2_journal_entries_free(struct list_head *);
int bch2_journal_read(struct bch_fs *, struct list_head *);
int bch2_journal_replay(struct bch_fs *, struct list_head *);
static inline void bch2_journal_set_replay_done(struct journal *j)
@ -404,6 +369,7 @@ int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
void bch2_fs_journal_start(struct journal *);
void bch2_dev_journal_exit(struct bch_dev *);
int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *);
void bch2_fs_journal_exit(struct journal *);

libbcachefs/journal_io.c (1423 lines)

File diff suppressed because it is too large

libbcachefs/journal_io.h (45 lines)

@ -0,0 +1,45 @@
#ifndef _BCACHEFS_JOURNAL_IO_H
#define _BCACHEFS_JOURNAL_IO_H
struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
enum btree_id, unsigned *);
/*
* Only used for holding the journal entries we read in btree_journal_read()
* during cache_registration
*/
struct journal_replay {
struct list_head list;
struct bch_devs_list devs;
/* must be last: */
struct jset j;
};
static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
struct jset_entry *entry, unsigned type)
{
while (entry < vstruct_last(jset)) {
if (entry->type == type)
return entry;
entry = vstruct_next(entry);
}
return NULL;
}
#define for_each_jset_entry_type(entry, jset, type) \
for (entry = (jset)->start; \
(entry = __jset_entry_type_next(jset, entry, type)); \
entry = vstruct_next(entry))
#define for_each_jset_key(k, _n, entry, jset) \
for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \
vstruct_for_each_safe(entry, k, _n)
int bch2_journal_read(struct bch_fs *, struct list_head *);
int bch2_journal_entry_sectors(struct journal *);
void bch2_journal_write(struct closure *);
#endif /* _BCACHEFS_JOURNAL_IO_H */


@ -0,0 +1,411 @@
#include "bcachefs.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "super.h"
/*
* Journal entry pinning - machinery for holding a reference on a given journal
* entry, holding it open to ensure it gets replayed during recovery:
*/
static inline u64 journal_pin_seq(struct journal *j,
struct journal_entry_pin_list *pin_list)
{
return fifo_entry_idx_abs(&j->pin, pin_list);
}
u64 bch2_journal_pin_seq(struct journal *j, struct journal_entry_pin *pin)
{
u64 ret = 0;
spin_lock(&j->lock);
if (journal_pin_active(pin))
ret = journal_pin_seq(j, pin->pin_list);
spin_unlock(&j->lock);
return ret;
}
static inline void __journal_pin_add(struct journal *j,
struct journal_entry_pin_list *pin_list,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
BUG_ON(journal_pin_active(pin));
BUG_ON(!atomic_read(&pin_list->count));
atomic_inc(&pin_list->count);
pin->pin_list = pin_list;
pin->flush = flush_fn;
if (flush_fn)
list_add(&pin->list, &pin_list->list);
else
INIT_LIST_HEAD(&pin->list);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
journal_wake(j);
}
void bch2_journal_pin_add(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
spin_lock(&j->lock);
__journal_pin_add(j, journal_seq_pin(j, seq), pin, flush_fn);
spin_unlock(&j->lock);
}
static inline void __journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
struct journal_entry_pin_list *pin_list = pin->pin_list;
if (!journal_pin_active(pin))
return;
pin->pin_list = NULL;
list_del_init(&pin->list);
/*
* Unpinning a journal entry may make journal_next_bucket() succeed, if
* writing a new last_seq will now make another bucket available:
*/
if (atomic_dec_and_test(&pin_list->count) &&
pin_list == &fifo_peek_front(&j->pin))
bch2_journal_reclaim_fast(j);
}
void bch2_journal_pin_drop(struct journal *j,
struct journal_entry_pin *pin)
{
spin_lock(&j->lock);
__journal_pin_drop(j, pin);
spin_unlock(&j->lock);
}
void bch2_journal_pin_add_if_older(struct journal *j,
struct journal_entry_pin *src_pin,
struct journal_entry_pin *pin,
journal_pin_flush_fn flush_fn)
{
spin_lock(&j->lock);
if (journal_pin_active(src_pin) &&
(!journal_pin_active(pin) ||
journal_pin_seq(j, src_pin->pin_list) <
journal_pin_seq(j, pin->pin_list))) {
__journal_pin_drop(j, pin);
__journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
}
spin_unlock(&j->lock);
}
/*
* Journal reclaim: flush references to open journal entries to reclaim space in
* the journal
*
* May be done by the journal code in the background as needed to free up space
* for more journal entries, or as part of doing a clean shutdown, or to migrate
* data off of a specific device:
*/
/**
* bch2_journal_reclaim_fast - do the fast part of journal reclaim
*
* Called from IO submission context, does not block. Cleans up after btree
* write completions by advancing the journal pin and each cache's last_idx,
* kicking off discards and background reclaim as necessary.
*/
void bch2_journal_reclaim_fast(struct journal *j)
{
struct journal_entry_pin_list temp;
bool popped = false;
lockdep_assert_held(&j->lock);
/*
* Unpin journal entries whose reference counts reached zero, meaning
* all btree nodes got written out
*/
while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
BUG_ON(!fifo_pop(&j->pin, temp));
popped = true;
}
if (popped)
journal_wake(j);
}
static struct journal_entry_pin *
__journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
struct journal_entry_pin_list *pin_list;
struct journal_entry_pin *ret;
u64 iter;
/* no need to iterate over empty fifo entries: */
bch2_journal_reclaim_fast(j);
fifo_for_each_entry_ptr(pin_list, &j->pin, iter) {
if (iter > seq_to_flush)
break;
ret = list_first_entry_or_null(&pin_list->list,
struct journal_entry_pin, list);
if (ret) {
/* must be list_del_init(), see bch2_journal_pin_drop() */
list_move(&ret->list, &pin_list->flushed);
*seq = iter;
return ret;
}
}
return NULL;
}
static struct journal_entry_pin *
journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
{
struct journal_entry_pin *ret;
spin_lock(&j->lock);
ret = __journal_get_next_pin(j, seq_to_flush, seq);
spin_unlock(&j->lock);
return ret;
}
static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
{
bool ret;
spin_lock(&j->lock);
ret = ja->nr &&
(ja->last_idx != ja->cur_idx &&
ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
spin_unlock(&j->lock);
return ret;
}
/**
* bch2_journal_reclaim_work - free up journal buckets
*
* Background journal reclaim writes out btree nodes. It should be run
* early enough so that we never completely run out of journal buckets.
*
* High watermarks for triggering background reclaim:
* - FIFO has fewer than 512 entries left
* - fewer than 25% journal buckets free
*
* Background reclaim runs until low watermarks are reached:
* - FIFO has more than 1024 entries left
* - more than 50% journal buckets free
*
* As long as a reclaim can complete in the time it takes to fill up
* 512 journal entries or 25% of all journal buckets, then
* journal_next_bucket() should not stall.
*/
void bch2_journal_reclaim_work(struct work_struct *work)
{
struct bch_fs *c = container_of(to_delayed_work(work),
struct bch_fs, journal.reclaim_work);
struct journal *j = &c->journal;
struct bch_dev *ca;
struct journal_entry_pin *pin;
u64 seq, seq_to_flush = 0;
unsigned iter, bucket_to_flush;
unsigned long next_flush;
bool reclaim_lock_held = false, need_flush;
/*
* Advance last_idx to point to the oldest journal entry containing
* btree node updates that have not yet been written out
*/
for_each_rw_member(ca, c, iter) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
continue;
while (should_discard_bucket(j, ja)) {
if (!reclaim_lock_held) {
/*
* ugh:
* might be called from __journal_res_get()
* under wait_event() - have to go back to
* TASK_RUNNING before doing something that
* would block, but only if we're doing work:
*/
__set_current_state(TASK_RUNNING);
mutex_lock(&j->reclaim_lock);
reclaim_lock_held = true;
/* recheck under reclaim_lock: */
continue;
}
if (ca->mi.discard &&
blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
blkdev_issue_discard(ca->disk_sb.bdev,
bucket_to_sector(ca,
ja->buckets[ja->last_idx]),
ca->mi.bucket_size, GFP_NOIO, 0);
spin_lock(&j->lock);
ja->last_idx = (ja->last_idx + 1) % ja->nr;
spin_unlock(&j->lock);
journal_wake(j);
}
/*
* Write out enough btree nodes to free up 50% journal
* buckets
*/
spin_lock(&j->lock);
bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
seq_to_flush = max_t(u64, seq_to_flush,
ja->bucket_seq[bucket_to_flush]);
spin_unlock(&j->lock);
}
if (reclaim_lock_held)
mutex_unlock(&j->reclaim_lock);
/* Also flush if the pin fifo is more than half full */
spin_lock(&j->lock);
seq_to_flush = max_t(s64, seq_to_flush,
(s64) journal_cur_seq(j) -
(j->pin.size >> 1));
spin_unlock(&j->lock);
/*
* If it's been longer than j->reclaim_delay_ms since we last flushed,
* make sure to flush at least one journal pin:
*/
next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
need_flush = time_after(jiffies, next_flush);
while ((pin = journal_get_next_pin(j, need_flush
? U64_MAX
: seq_to_flush, &seq))) {
__set_current_state(TASK_RUNNING);
pin->flush(j, pin, seq);
need_flush = false;
j->last_flushed = jiffies;
}
if (!test_bit(BCH_FS_RO, &c->flags))
queue_delayed_work(system_freezable_wq, &j->reclaim_work,
msecs_to_jiffies(j->reclaim_delay_ms));
}
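The "free up 50% of the journal buckets" target above is plain ring arithmetic on ja->cur_idx and ja->nr; a worked sketch with hypothetical numbers:

#include <stdio.h>

static unsigned bucket_to_flush(unsigned cur_idx, unsigned nr)
{
        return (cur_idx + (nr >> 1)) % nr;
}

int main(void)
{
        /* 8 journal buckets, currently filling bucket 6: flushing every pin
         * up to bucket_seq[2] frees roughly half the ring */
        printf("%u\n", bucket_to_flush(6, 8));  /* prints 2 */
        return 0;
}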
static int journal_flush_done(struct journal *j, u64 seq_to_flush,
struct journal_entry_pin **pin,
u64 *pin_seq)
{
int ret;
*pin = NULL;
ret = bch2_journal_error(j);
if (ret)
return ret;
spin_lock(&j->lock);
/*
* If journal replay hasn't completed, the unreplayed journal entries
* hold refs on their corresponding sequence numbers
*/
ret = (*pin = __journal_get_next_pin(j, seq_to_flush, pin_seq)) != NULL ||
!test_bit(JOURNAL_REPLAY_DONE, &j->flags) ||
journal_last_seq(j) > seq_to_flush ||
(fifo_used(&j->pin) == 1 &&
atomic_read(&fifo_peek_front(&j->pin).count) == 1);
spin_unlock(&j->lock);
return ret;
}
int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin *pin;
u64 pin_seq;
bool flush;
if (!test_bit(JOURNAL_STARTED, &j->flags))
return 0;
again:
wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
if (pin) {
/* flushing a journal pin might cause a new one to be added: */
pin->flush(j, pin, pin_seq);
goto again;
}
spin_lock(&j->lock);
flush = journal_last_seq(j) != j->last_seq_ondisk ||
(seq_to_flush == U64_MAX && c->btree_roots_dirty);
spin_unlock(&j->lock);
return flush ? bch2_journal_meta(j) : 0;
}
int bch2_journal_flush_all_pins(struct journal *j)
{
return bch2_journal_flush_pins(j, U64_MAX);
}
int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
struct bch_devs_list devs;
u64 iter, seq = 0;
int ret = 0;
spin_lock(&j->lock);
fifo_for_each_entry_ptr(p, &j->pin, iter)
if (dev_idx >= 0
? bch2_dev_list_has_dev(p->devs, dev_idx)
: p->devs.nr < c->opts.metadata_replicas)
seq = iter;
spin_unlock(&j->lock);
ret = bch2_journal_flush_pins(j, seq);
if (ret)
return ret;
mutex_lock(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL);
seq = 0;
spin_lock(&j->lock);
while (!ret && seq < j->pin.back) {
seq = max(seq, journal_last_seq(j));
devs = journal_seq_pin(j, seq)->devs;
seq++;
spin_unlock(&j->lock);
ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
spin_lock(&j->lock);
}
spin_unlock(&j->lock);
bch2_replicas_gc_end(c, ret);
mutex_unlock(&c->replicas_gc_lock);
return ret;
}


@ -0,0 +1,36 @@
#ifndef _BCACHEFS_JOURNAL_RECLAIM_H
#define _BCACHEFS_JOURNAL_RECLAIM_H
#define JOURNAL_PIN (32 * 1024)
static inline bool journal_pin_active(struct journal_entry_pin *pin)
{
return pin->pin_list != NULL;
}
static inline struct journal_entry_pin_list *
journal_seq_pin(struct journal *j, u64 seq)
{
BUG_ON(seq < j->pin.front || seq >= j->pin.back);
return &j->pin.data[seq & j->pin.mask];
}
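journal_seq_pin() maps a sequence number onto a power-of-two ring; a tiny sketch of the masking (the sizes here are made up):

#include <stdio.h>

int main(void)
{
        unsigned long size = 8, mask = size - 1;  /* like j->pin.size / j->pin.mask */
        unsigned long seq = 21;

        printf("%lu\n", seq & mask);    /* sequence 21 lives in slot 5 */
        return 0;
}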
u64 bch2_journal_pin_seq(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);
void bch2_journal_pin_add_if_older(struct journal *,
struct journal_entry_pin *,
struct journal_entry_pin *,
journal_pin_flush_fn);
void bch2_journal_reclaim_fast(struct journal *);
void bch2_journal_reclaim_work(struct work_struct *);
int bch2_journal_flush_pins(struct journal *, u64);
int bch2_journal_flush_all_pins(struct journal *);
int bch2_journal_flush_device_pins(struct journal *, int);
#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */


@ -0,0 +1,358 @@
#include "bcachefs.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
/*
* journal_seq_blacklist machinery:
*
* To guarantee order of btree updates after a crash, we need to detect when a
* btree node entry (bset) is newer than the newest journal entry that was
* successfully written, and ignore it - effectively ignoring any btree updates
* that didn't make it into the journal.
*
* If we didn't do this, we might have two btree nodes, a and b, both with
* updates that weren't written to the journal yet: if b was updated after a,
* but b was flushed and not a - oops; on recovery we'll find that the updates
* to b happened, but not the updates to a that happened before it.
*
* Ignoring bsets that are newer than the newest journal entry is always safe,
* because everything they contain will also have been journalled - and must
* still be present in the journal on disk until a journal entry has been
* written _after_ that bset was written.
*
* To accomplish this, bsets record the newest journal sequence number they
* contain updates for; then, on startup, the btree code queries the journal
* code to ask "Is this sequence number newer than the newest journal entry? If
* so, ignore it."
*
* When this happens, we must blacklist that journal sequence number: the
* journal must not write any entries with that sequence number, and it must
* record that it was blacklisted so that a) on recovery we don't think we have
* missing journal entries and b) so that the btree code continues to ignore
* that bset, until that btree node is rewritten.
*
* Blacklisted journal sequence numbers are themselves recorded as entries in
* the journal.
*/
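The rule described above boils down to one check per bset; a minimal sketch of the decision (not the bcachefs API, just the shape of bch2_journal_seq_should_ignore() in miniature):

#include <stdbool.h>
#include <stdint.h>

/* ignore a bset if it is newer than the newest journal entry that made it to
 * disk, or if its sequence number was blacklisted by an earlier recovery */
static bool should_ignore_bset(uint64_t bset_seq, uint64_t newest_written_seq,
                               bool seq_is_blacklisted)
{
        return bset_seq > newest_written_seq || seq_is_blacklisted;
}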
/*
* Called when journal needs to evict a blacklist entry to reclaim space: find
* any btree nodes that refer to the blacklist journal sequence numbers, and
* rewrite them:
*/
static void journal_seq_blacklist_flush(struct journal *j,
struct journal_entry_pin *pin, u64 seq)
{
struct bch_fs *c =
container_of(j, struct bch_fs, journal);
struct journal_seq_blacklist *bl =
container_of(pin, struct journal_seq_blacklist, pin);
struct blacklisted_node n;
struct closure cl;
unsigned i;
int ret;
closure_init_stack(&cl);
for (i = 0;; i++) {
struct btree_iter iter;
struct btree *b;
mutex_lock(&j->blacklist_lock);
if (i >= bl->nr_entries) {
mutex_unlock(&j->blacklist_lock);
break;
}
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
__bch2_btree_iter_init(&iter, c, n.btree_id, n.pos, 0, 0, 0);
b = bch2_btree_iter_peek_node(&iter);
/* The node might have already been rewritten: */
if (b->data->keys.seq == n.seq) {
ret = bch2_btree_node_rewrite(c, &iter, n.seq, 0);
if (ret) {
bch2_btree_iter_unlock(&iter);
bch2_fs_fatal_error(c,
"error %i rewriting btree node with blacklisted journal seq",
ret);
bch2_journal_halt(j);
return;
}
}
bch2_btree_iter_unlock(&iter);
}
for (i = 0;; i++) {
struct btree_update *as;
struct pending_btree_node_free *d;
mutex_lock(&j->blacklist_lock);
if (i >= bl->nr_entries) {
mutex_unlock(&j->blacklist_lock);
break;
}
n = bl->entries[i];
mutex_unlock(&j->blacklist_lock);
redo_wait:
mutex_lock(&c->btree_interior_update_lock);
/*
* Is the node on the list of pending interior node updates -
* being freed? If so, wait for that to finish:
*/
for_each_pending_btree_node_free(c, as, d)
if (n.seq == d->seq &&
n.btree_id == d->btree_id &&
!d->level &&
!bkey_cmp(n.pos, d->key.k.p)) {
closure_wait(&as->wait, &cl);
mutex_unlock(&c->btree_interior_update_lock);
closure_sync(&cl);
goto redo_wait;
}
mutex_unlock(&c->btree_interior_update_lock);
}
mutex_lock(&j->blacklist_lock);
bch2_journal_pin_drop(j, &bl->pin);
list_del(&bl->list);
kfree(bl->entries);
kfree(bl);
mutex_unlock(&j->blacklist_lock);
}
/*
* Determine if a particular sequence number is blacklisted - if so, return
* blacklist entry:
*/
struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *j, u64 seq)
{
struct journal_seq_blacklist *bl;
lockdep_assert_held(&j->blacklist_lock);
list_for_each_entry(bl, &j->seq_blacklist, list)
if (seq >= bl->start && seq <= bl->end)
return bl;
return NULL;
}
/*
* Allocate a new, in memory blacklist entry:
*/
static struct journal_seq_blacklist *
bch2_journal_seq_blacklisted_new(struct journal *j, u64 start, u64 end)
{
struct journal_seq_blacklist *bl;
lockdep_assert_held(&j->blacklist_lock);
/*
* When we start the journal, bch2_journal_start() will skip over @seq:
*/
bl = kzalloc(sizeof(*bl), GFP_KERNEL);
if (!bl)
return NULL;
bl->start = start;
bl->end = end;
list_add_tail(&bl->list, &j->seq_blacklist);
return bl;
}
/*
* Returns true if @seq is newer than the most recent journal entry that got
* written, and data corresponding to @seq should be ignored - also marks @seq
* as blacklisted so that on future restarts the corresponding data will still
* be ignored:
*/
int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
{
struct journal *j = &c->journal;
struct journal_seq_blacklist *bl = NULL;
struct blacklisted_node *n;
u64 journal_seq;
int ret = 0;
if (!seq)
return 0;
spin_lock(&j->lock);
journal_seq = journal_cur_seq(j);
spin_unlock(&j->lock);
/* Interior updates aren't journalled: */
BUG_ON(b->level);
BUG_ON(seq > journal_seq && test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags));
/*
* Decrease this back to j->seq + 2 when we next rev the on disk format:
* increasing it temporarily to work around bug in old kernels
*/
fsck_err_on(seq > journal_seq + 4, c,
"bset journal seq too far in the future: %llu > %llu",
seq, journal_seq);
if (seq <= journal_seq &&
list_empty_careful(&j->seq_blacklist))
return 0;
mutex_lock(&j->blacklist_lock);
if (seq <= journal_seq) {
bl = bch2_journal_seq_blacklist_find(j, seq);
if (!bl)
goto out;
} else {
bch_verbose(c, "btree node %u:%llu:%llu has future journal sequence number %llu, blacklisting",
b->btree_id, b->key.k.p.inode, b->key.k.p.offset, seq);
if (!j->new_blacklist) {
j->new_blacklist = bch2_journal_seq_blacklisted_new(j,
journal_seq + 1,
journal_seq + 1);
if (!j->new_blacklist) {
ret = -ENOMEM;
goto out;
}
}
bl = j->new_blacklist;
bl->end = max(bl->end, seq);
}
for (n = bl->entries; n < bl->entries + bl->nr_entries; n++)
if (b->data->keys.seq == n->seq &&
b->btree_id == n->btree_id &&
!bkey_cmp(b->key.k.p, n->pos))
goto found_entry;
if (!bl->nr_entries ||
is_power_of_2(bl->nr_entries)) {
n = krealloc(bl->entries,
max(bl->nr_entries * 2, 8UL) * sizeof(*n),
GFP_KERNEL);
if (!n) {
ret = -ENOMEM;
goto out;
}
bl->entries = n;
}
bl->entries[bl->nr_entries++] = (struct blacklisted_node) {
.seq = b->data->keys.seq,
.btree_id = b->btree_id,
.pos = b->key.k.p,
};
found_entry:
ret = 1;
out:
fsck_err:
mutex_unlock(&j->blacklist_lock);
return ret;
}
static int __bch2_journal_seq_blacklist_read(struct journal *j,
struct journal_replay *i,
u64 start, u64 end)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_seq_blacklist *bl;
bch_verbose(c, "blacklisting existing journal seq %llu-%llu",
start, end);
bl = bch2_journal_seq_blacklisted_new(j, start, end);
if (!bl)
return -ENOMEM;
bch2_journal_pin_add(j, le64_to_cpu(i->j.seq), &bl->pin,
journal_seq_blacklist_flush);
return 0;
}
/*
* After reading the journal, find existing journal seq blacklist entries and
* read them into memory:
*/
int bch2_journal_seq_blacklist_read(struct journal *j,
struct journal_replay *i)
{
struct jset_entry *entry;
int ret = 0;
vstruct_for_each(&i->j, entry) {
switch (entry->type) {
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
ret = __bch2_journal_seq_blacklist_read(j, i,
le64_to_cpu(bl_entry->seq),
le64_to_cpu(bl_entry->seq));
break;
}
case BCH_JSET_ENTRY_blacklist_v2: {
struct jset_entry_blacklist_v2 *bl_entry =
container_of(entry, struct jset_entry_blacklist_v2, entry);
ret = __bch2_journal_seq_blacklist_read(j, i,
le64_to_cpu(bl_entry->start),
le64_to_cpu(bl_entry->end));
break;
}
}
if (ret)
break;
}
return ret;
}
/*
* After reading the journal and walking the btree, we might have new journal
* sequence numbers to blacklist - add entries to the next journal entry to be
* written:
*/
void bch2_journal_seq_blacklist_write(struct journal *j)
{
struct journal_seq_blacklist *bl = j->new_blacklist;
struct jset_entry_blacklist_v2 *bl_entry;
struct jset_entry *entry;
if (!bl)
return;
entry = bch2_journal_add_entry_noreservation(journal_cur_buf(j),
(sizeof(*bl_entry) - sizeof(*entry)) / sizeof(u64));
bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry);
bl_entry->entry.type = BCH_JSET_ENTRY_blacklist_v2;
bl_entry->start = cpu_to_le64(bl->start);
bl_entry->end = cpu_to_le64(bl->end);
bch2_journal_pin_add(j,
journal_cur_seq(j),
&bl->pin,
journal_seq_blacklist_flush);
j->new_blacklist = NULL;
}

View File

@ -0,0 +1,13 @@
#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H
struct journal_replay;
struct journal_seq_blacklist *
bch2_journal_seq_blacklist_find(struct journal *, u64);
int bch2_journal_seq_should_ignore(struct bch_fs *, u64, struct btree *);
int bch2_journal_seq_blacklist_read(struct journal *,
struct journal_replay *);
void bch2_journal_seq_blacklist_write(struct journal *);
#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */

View File

@ -59,8 +59,9 @@ struct blacklisted_node {
struct journal_seq_blacklist {
struct list_head list;
u64 seq;
bool written;
u64 start;
u64 end;
struct journal_entry_pin pin;
struct blacklisted_node *entries;
@ -171,10 +172,11 @@ struct journal {
u64 front, back, size, mask;
struct journal_entry_pin_list *data;
} pin;
struct journal_entry_pin_list *replay_pin_list;
u64 replay_journal_seq;
struct mutex blacklist_lock;
struct list_head seq_blacklist;
struct journal_seq_blacklist *new_blacklist;
BKEY_PADDED(key);
struct write_point wp;

View File

@ -5,6 +5,7 @@
#include "buckets.h"
#include "inode.h"
#include "io.h"
#include "journal_reclaim.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
@ -22,7 +23,6 @@ struct moving_io {
struct closure cl;
bool read_completed;
unsigned read_dev;
unsigned read_sectors;
unsigned write_sectors;
@ -42,7 +42,7 @@ struct moving_context {
struct list_head reads;
/* in flight sectors: */
atomic_t read_sectors[BCH_SB_MEMBERS_MAX];
atomic_t read_sectors;
atomic_t write_sectors;
wait_queue_head_t wait;
@ -306,7 +306,8 @@ static void move_write(struct closure *cl)
{
struct moving_io *io = container_of(cl, struct moving_io, cl);
if (likely(!io->rbio.bio.bi_status)) {
if (likely(!io->rbio.bio.bi_status &&
!io->rbio.hole)) {
bch2_migrate_read_done(&io->write, &io->rbio);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
@ -330,7 +331,7 @@ static void move_read_endio(struct bio *bio)
struct moving_io *io = container_of(bio, struct moving_io, rbio.bio);
struct moving_context *ctxt = io->write.ctxt;
atomic_sub(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
atomic_sub(io->read_sectors, &ctxt->read_sectors);
io->read_completed = true;
if (next_pending_write(ctxt))
@ -376,7 +377,6 @@ static int bch2_move_extent(struct bch_fs *c,
enum data_cmd data_cmd,
struct data_opts data_opts)
{
struct extent_pick_ptr pick;
struct moving_io *io;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
@ -387,12 +387,8 @@ static int bch2_move_extent(struct bch_fs *c,
atomic_read(&ctxt->write_sectors) <
SECTORS_IN_FLIGHT_PER_DEVICE);
bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
if (IS_ERR_OR_NULL(pick.ca))
return pick.ca ? PTR_ERR(pick.ca) : 0;
move_ctxt_wait_event(ctxt,
atomic_read(&ctxt->read_sectors[pick.ca->dev_idx]) <
atomic_read(&ctxt->read_sectors) <
SECTORS_IN_FLIGHT_PER_DEVICE);
/* write path might have to decompress data: */
@ -406,8 +402,7 @@ static int bch2_move_extent(struct bch_fs *c,
goto err;
io->write.ctxt = ctxt;
io->read_dev = pick.ca->dev_idx;
io->read_sectors = pick.crc.uncompressed_size;
io->read_sectors = e.k->size;
io->write_sectors = e.k->size;
bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
@ -421,6 +416,7 @@ static int bch2_move_extent(struct bch_fs *c,
io->rbio.opts = io_opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
io->rbio.bio.bi_vcnt = pages;
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
@ -438,7 +434,7 @@ static int bch2_move_extent(struct bch_fs *c,
trace_move_extent(e.k);
atomic_add(io->read_sectors, &ctxt->read_sectors[io->read_dev]);
atomic_add(io->read_sectors, &ctxt->read_sectors);
list_add_tail(&io->list, &ctxt->reads);
/*
@ -446,14 +442,15 @@ static int bch2_move_extent(struct bch_fs *c,
* ctxt when doing wakeup
*/
closure_get(&ctxt->cl);
bch2_read_extent(c, &io->rbio, e, &pick, BCH_READ_NODECODE);
bch2_read_extent(c, &io->rbio, e.s_c,
BCH_READ_NODECODE|
BCH_READ_LAST_FRAGMENT);
return 0;
err_free_pages:
bio_free_pages(&io->write.op.wbio.bio);
err_free:
kfree(io);
err:
percpu_ref_put(&pick.ca->io_ref);
trace_move_alloc_fail(e.k);
return ret;
}
@ -728,7 +725,7 @@ int bch2_data_job(struct bch_fs *c,
switch (op.op) {
case BCH_DATA_OP_REREPLICATE:
stats->data_type = BCH_DATA_JOURNAL;
ret = bch2_journal_flush_device(&c->journal, -1);
ret = bch2_journal_flush_device_pins(&c->journal, -1);
ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
ret = bch2_gc_btree_replicas(c) ?: ret;
@ -745,7 +742,7 @@ int bch2_data_job(struct bch_fs *c,
return -EINVAL;
stats->data_type = BCH_DATA_JOURNAL;
ret = bch2_journal_flush_device(&c->journal, op.migrate.dev);
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
ret = bch2_gc_btree_replicas(c) ?: ret;

View File

@ -26,6 +26,8 @@
#include "inode.h"
#include "io.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "move.h"
#include "migrate.h"
@ -396,9 +398,15 @@ err:
static void bch2_fs_free(struct bch_fs *c)
{
#define BCH_TIME_STAT(name) \
bch2_time_stats_exit(&c->name##_time);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
bch2_fs_quota_exit(c);
bch2_fs_fsio_exit(c);
bch2_fs_encryption_exit(c);
bch2_fs_io_exit(c);
bch2_fs_btree_cache_exit(c);
bch2_fs_journal_exit(&c->journal);
bch2_io_clock_exit(&c->io_clock[WRITE]);
@ -407,10 +415,6 @@ static void bch2_fs_free(struct bch_fs *c)
lg_lock_free(&c->usage_lock);
free_percpu(c->usage_percpu);
mempool_exit(&c->btree_bounce_pool);
mempool_exit(&c->bio_bounce_pages);
bioset_exit(&c->bio_write);
bioset_exit(&c->bio_read_split);
bioset_exit(&c->bio_read);
bioset_exit(&c->btree_bio);
mempool_exit(&c->btree_interior_update_pool);
mempool_exit(&c->btree_reserve_pool);
@ -561,8 +565,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
init_rwsem(&c->gc_lock);
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
spin_lock_init(&c->name##_time.lock);
#define BCH_TIME_STAT(name) \
bch2_time_stats_init(&c->name##_time);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
@ -590,6 +594,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->copy_gc_enabled = 1;
c->rebalance_enabled = 1;
c->rebalance_percent = 10;
c->promote_whole_extents = true;
c->journal.write_time = &c->journal_write_time;
c->journal.delay_time = &c->journal_delay_time;
@ -640,17 +645,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
max(offsetof(struct btree_read_bio, bio),
offsetof(struct btree_write_bio, wbio.bio)),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
BIOSET_NEED_BVECS) ||
mempool_init_page_pool(&c->bio_bounce_pages,
max_t(unsigned,
c->opts.btree_node_size,
c->sb.encoded_extent_max) /
PAGE_SECTORS, 0) ||
!(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
lg_lock_init(&c->usage_lock) ||
mempool_init_vp_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) ||
@ -658,6 +652,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_io_clock_init(&c->io_clock[WRITE]) ||
bch2_fs_journal_init(&c->journal) ||
bch2_fs_btree_cache_init(c) ||
bch2_fs_io_init(c) ||
bch2_fs_encryption_init(c) ||
bch2_fs_compress_init(c) ||
bch2_fs_fsio_init(c))
@ -774,11 +769,11 @@ const char *bch2_fs_start(struct bch_fs *c)
goto recovery_done;
/*
* bch2_journal_start() can't happen sooner, or btree_gc_finish()
* bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
* will give spurious errors about oldest_gen > bucket_gen -
* this is a hack but oh well.
*/
bch2_journal_start(c);
bch2_fs_journal_start(&c->journal);
err = "error starting allocator";
if (bch2_fs_allocator_start(c))
@ -834,7 +829,7 @@ const char *bch2_fs_start(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
bch2_journal_start(c);
bch2_fs_journal_start(&c->journal);
bch2_journal_set_replay_done(&c->journal);
err = "error starting allocator";
@ -993,6 +988,9 @@ static void bch2_dev_free(struct bch_dev *ca)
bioset_exit(&ca->replica_set);
bch2_dev_buckets_free(ca);
bch2_time_stats_exit(&ca->io_latency[WRITE]);
bch2_time_stats_exit(&ca->io_latency[READ]);
percpu_ref_exit(&ca->io_ref);
percpu_ref_exit(&ca->ref);
kobject_put(&ca->kobj);
@ -1089,6 +1087,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
bch2_time_stats_init(&ca->io_latency[READ]);
bch2_time_stats_init(&ca->io_latency[WRITE]);
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
@ -1421,7 +1422,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
goto err;
}
ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
if (ret) {
bch_err(ca, "Remove failed: error %i flushing journal", ret);
goto err;

View File

@ -27,7 +27,26 @@ static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s)
static inline bool bch2_dev_is_online(struct bch_dev *ca)
{
return ca->disk_sb.bdev != NULL;
return !percpu_ref_is_zero(&ca->io_ref);
}
static inline bool bch2_dev_is_readable(struct bch_dev *ca)
{
return bch2_dev_is_online(ca) &&
ca->mi.state != BCH_MEMBER_STATE_FAILED;
}
static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw)
{
if (!percpu_ref_tryget(&ca->io_ref))
return false;
if (ca->mi.state == BCH_MEMBER_STATE_RW ||
(ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ))
return true;
percpu_ref_put(&ca->io_ref);
return false;
}
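bch2_dev_get_ioref() above is the usual try-get/validate/put-on-failure idiom: take the reference first, then check that the member state still allows the requested access, and drop the reference again if it does not. A standalone sketch of the same idiom using C11 atomics in place of the kernel's percpu_ref (all names invented for illustration):

/* Sketch: acquire a ref, validate state, release on mismatch. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum state { STATE_RW, STATE_RO, STATE_FAILED };

struct dev {
	atomic_uint refs;	/* 0 means "going away": tryget must fail */
	enum state state;
};

static bool dev_tryget(struct dev *d)
{
	unsigned v = atomic_load(&d->refs);

	while (v)
		if (atomic_compare_exchange_weak(&d->refs, &v, v + 1))
			return true;
	return false;
}

static void dev_put(struct dev *d)
{
	atomic_fetch_sub(&d->refs, 1);
}

/* Take a reference only if the device may be used for the given access. */
static bool dev_get_ioref(struct dev *d, bool write)
{
	if (!dev_tryget(d))
		return false;

	if (d->state == STATE_RW || (d->state == STATE_RO && !write))
		return true;

	dev_put(d);		/* state check failed: undo the get */
	return false;
}

int main(void)
{
	static struct dev d = { .state = STATE_RO };

	atomic_init(&d.refs, 1);	/* base reference held by the owner */

	if (dev_get_ioref(&d, false)) {		/* RO device: reads allowed */
		printf("read ref taken\n");
		dev_put(&d);
	}
	if (!dev_get_ioref(&d, true))		/* ...but writes are refused */
		printf("write ref refused\n");
	return 0;
}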
static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)

View File

@ -141,11 +141,19 @@ read_attribute(btree_node_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
read_attribute(durability);
read_attribute(iostats);
read_attribute(last_read_quantiles);
read_attribute(last_write_quantiles);
read_attribute(fragmentation_quantiles);
read_attribute(oldest_gen_quantiles);
read_attribute(iodone);
read_attribute(io_latency_read);
read_attribute(io_latency_write);
read_attribute(io_latency_stats_read);
read_attribute(io_latency_stats_write);
read_attribute(congested);
read_attribute(bucket_quantiles_last_read);
read_attribute(bucket_quantiles_last_write);
read_attribute(bucket_quantiles_fragmentation);
read_attribute(bucket_quantiles_oldest_gen);
read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
@ -177,6 +185,7 @@ sysfs_pd_controller_attribute(copy_gc);
rw_attribute(rebalance_enabled);
rw_attribute(rebalance_percent);
sysfs_pd_controller_attribute(rebalance);
rw_attribute(promote_whole_extents);
rw_attribute(pd_controllers_update_seconds);
@ -189,8 +198,9 @@ read_attribute(data_replicas_have);
BCH_DEBUG_PARAMS()
#undef BCH_DEBUG_PARAM
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
sysfs_time_stats_attribute(name, frequency_units, duration_units);
#define BCH_TIME_STAT(_name) \
static struct attribute sysfs_time_stat_##_name = \
{ .name = #_name, .mode = S_IRUGO };
BCH_TIME_STATS()
#undef BCH_TIME_STAT
@ -332,9 +342,10 @@ SHOW(bch2_fs)
sysfs_printf(rebalance_enabled, "%i", c->rebalance_enabled);
sysfs_print(rebalance_percent, c->rebalance_percent);
sysfs_pd_controller_show(rebalance, &c->rebalance_pd); /* XXX */
sysfs_print(promote_whole_extents, c->promote_whole_extents);
sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true));
sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false));
@ -406,6 +417,8 @@ STORE(__bch2_fs)
sysfs_strtoul(rebalance_percent, c->rebalance_percent);
sysfs_pd_controller_store(rebalance, &c->rebalance_pd);
sysfs_strtoul(promote_whole_extents, c->promote_whole_extents);
/* Debugging: */
#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name);
@ -462,6 +475,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_journal_reclaim_delay_ms,
&sysfs_rebalance_percent,
&sysfs_promote_whole_extents,
&sysfs_compression_stats,
NULL
@ -531,9 +545,16 @@ STORE(bch2_fs_opts_dir)
struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir);
const struct bch_option *opt = container_of(attr, struct bch_option, attr);
int ret, id = opt - bch2_opt_table;
char *tmp;
u64 v;
ret = bch2_opt_parse(c, opt, buf, &v);
tmp = kstrdup(buf, GFP_KERNEL);
if (!tmp)
return -ENOMEM;
ret = bch2_opt_parse(c, opt, strim(tmp), &v);
kfree(tmp);
if (ret < 0)
return ret;
@ -592,9 +613,9 @@ SHOW(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
sysfs_print_time_stats(&c->name##_time, name, \
frequency_units, duration_units);
#define BCH_TIME_STAT(name) \
if (attr == &sysfs_time_stat_##name) \
return bch2_time_stats_print(&c->name##_time, buf, PAGE_SIZE);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
@ -603,23 +624,15 @@ SHOW(bch2_fs_time_stats)
STORE(bch2_fs_time_stats)
{
struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
sysfs_clear_time_stats(&c->name##_time, name);
BCH_TIME_STATS()
#undef BCH_TIME_STAT
return size;
}
SYSFS_OPS(bch2_fs_time_stats);
struct attribute *bch2_fs_time_stats_files[] = {
#define BCH_TIME_STAT(name, frequency_units, duration_units) \
sysfs_time_stats_attribute_list(name, frequency_units, duration_units)
#define BCH_TIME_STAT(name) \
&sysfs_time_stat_##name,
BCH_TIME_STATS()
#undef BCH_TIME_STAT
NULL
};
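The BCH_TIME_STAT()/BCH_TIME_STATS() pairs used throughout this patch are an X-macro: a single list of names is expanded several times, each time with a different per-name definition (struct fields, init and exit calls, sysfs attributes, show handlers). A small self-contained illustration of the idiom, with invented stat names:

/* X-macro sketch: one name list, expanded three different ways. */
#include <stdio.h>

#define TIME_STATS()		\
	x(btree_read)		\
	x(journal_write)	\
	x(data_promote)

/* 1) one counter field per name */
struct stats {
#define x(name) unsigned long name##_count;
	TIME_STATS()
#undef x
};

/* 2) a printable name table */
static const char * const stat_names[] = {
#define x(name) #name,
	TIME_STATS()
#undef x
};

/* 3) one dump line per name */
static void dump(const struct stats *s)
{
#define x(name) printf("%-16s %lu\n", #name, s->name##_count);
	TIME_STATS()
#undef x
}

int main(void)
{
	struct stats s = { .btree_read_count = 3, .journal_write_count = 7 };

	dump(&s);
	printf("%zu stats total\n", sizeof(stat_names) / sizeof(stat_names[0]));
	return 0;
}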
@ -774,7 +787,7 @@ static const char * const bch2_rw[] = {
NULL
};
static ssize_t show_dev_iostats(struct bch_dev *ca, char *buf)
static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf)
{
char *out = buf, *end = buf + PAGE_SIZE;
int rw, i, cpu;
@ -851,16 +864,28 @@ SHOW(bch2_dev)
return out - buf;
}
if (attr == &sysfs_iostats)
return show_dev_iostats(ca, buf);
if (attr == &sysfs_iodone)
return show_dev_iodone(ca, buf);
if (attr == &sysfs_last_read_quantiles)
sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ]));
sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE]));
if (attr == &sysfs_io_latency_stats_read)
return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE);
if (attr == &sysfs_io_latency_stats_write)
return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE);
sysfs_printf(congested, "%u%%",
clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
* 100 / CONGESTED_MAX);
if (attr == &sysfs_bucket_quantiles_last_read)
return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0);
if (attr == &sysfs_last_write_quantiles)
if (attr == &sysfs_bucket_quantiles_last_write)
return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1);
if (attr == &sysfs_fragmentation_quantiles)
if (attr == &sysfs_bucket_quantiles_fragmentation)
return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL);
if (attr == &sysfs_oldest_gen_quantiles)
if (attr == &sysfs_bucket_quantiles_oldest_gen)
return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL);
if (attr == &sysfs_reserve_stats)
@ -944,13 +969,20 @@ struct attribute *bch2_dev_files[] = {
&sysfs_label,
&sysfs_has_data,
&sysfs_iostats,
&sysfs_iodone,
&sysfs_io_latency_read,
&sysfs_io_latency_write,
&sysfs_io_latency_stats_read,
&sysfs_io_latency_stats_write,
&sysfs_congested,
/* alloc info - other stats: */
&sysfs_last_read_quantiles,
&sysfs_last_write_quantiles,
&sysfs_fragmentation_quantiles,
&sysfs_oldest_gen_quantiles,
&sysfs_bucket_quantiles_last_read,
&sysfs_bucket_quantiles_last_write,
&sysfs_bucket_quantiles_fragmentation,
&sysfs_bucket_quantiles_oldest_gen,
&sysfs_reserve_stats,
/* debug: */

View File

@ -13,12 +13,15 @@
#include <linux/kthread.h>
#include <linux/log2.h>
#include <linux/math64.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/random.h>
#include <linux/seq_file.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/sched/clock.h>
#include "eytzinger.h"
#include "util.h"
#define simple_strtoint(c, end, base) simple_strtol(c, end, base)
@ -200,59 +203,189 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
void bch2_time_stats_clear(struct time_stats *stats)
void bch2_quantiles_update(struct quantiles *q, u64 v)
{
spin_lock(&stats->lock);
unsigned i = 0;
stats->count = 0;
stats->last_duration = 0;
stats->max_duration = 0;
stats->average_duration = 0;
stats->average_frequency = 0;
stats->last = 0;
while (i < ARRAY_SIZE(q->entries)) {
struct quantile_entry *e = q->entries + i;
spin_unlock(&stats->lock);
if (unlikely(!e->step)) {
e->m = v;
e->step = max_t(unsigned, v / 2, 1024);
} else if (e->m > v) {
e->m = e->m >= e->step
? e->m - e->step
: 0;
} else if (e->m < v) {
e->m = e->m + e->step > e->m
? e->m + e->step
: U32_MAX;
}
void __bch2_time_stats_update(struct time_stats *stats, u64 start_time)
if ((e->m > v ? e->m - v : v - e->m) < e->step)
e->step = max_t(unsigned, e->step / 2, 1);
if (v >= e->m)
break;
i = eytzinger0_child(i, v > e->m);
}
}
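bch2_quantiles_update() above is a small streaming quantile estimator: each entry keeps an estimate m and an adaptive step, every sample nudges the estimates along one root-to-leaf path of an implicit eytzinger-ordered tree, and an entry's step halves once its estimate lands within one step of the sample. A standalone sketch of the same update, assuming plain heap-style child indexing in place of eytzinger0_child() and dropping the U32_MAX overflow guard for brevity:

/* Sketch: streaming quantile estimation by stepwise nudging. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_Q 15

struct qentry { uint64_t m, step; };

static uint64_t max_u64(uint64_t a, uint64_t b) { return a > b ? a : b; }

static void quantiles_update(struct qentry *q, uint64_t v)
{
	unsigned i = 0;

	while (i < NR_Q) {
		struct qentry *e = &q[i];

		if (!e->step) {
			/* first sample seen by this entry seeds the estimate */
			e->m = v;
			e->step = max_u64(v / 2, 1024);
		} else if (e->m > v) {
			e->m = e->m >= e->step ? e->m - e->step : 0;
		} else if (e->m < v) {
			e->m += e->step;	/* overflow guard omitted */
		}

		/* estimate is close to the sample: refine the step */
		if ((e->m > v ? e->m - v : v - e->m) < e->step)
			e->step = max_u64(e->step / 2, 1);

		if (v >= e->m)
			break;

		/* descend to a child, as eytzinger0_child() does */
		i = 2 * i + 1 + (v > e->m);
	}
}

int main(void)
{
	struct qentry q[NR_Q] = {{0, 0}};

	for (int n = 0; n < 1000000; n++)
		quantiles_update(q, 1 + rand() % 1000000);

	for (unsigned i = 0; i < NR_Q; i++)
		printf("entry %2u: m=%llu step=%llu\n", i,
		       (unsigned long long)q[i].m,
		       (unsigned long long)q[i].step);
	return 0;
}

Since descent only happens while the sample is still below the current estimate, deeper entries converge toward progressively lower quantiles.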
/* time stats: */
static void bch2_time_stats_update_one(struct time_stats *stats,
u64 start, u64 end)
{
u64 now, duration, last;
u64 duration, freq;
duration = time_after64(end, start)
? end - start : 0;
freq = time_after64(end, stats->last_event)
? end - stats->last_event : 0;
stats->count++;
now = local_clock();
duration = time_after64(now, start_time)
? now - start_time : 0;
last = time_after64(now, stats->last)
? now - stats->last : 0;
stats->average_duration = stats->average_duration
? ewma_add(stats->average_duration, duration, 6)
: duration;
stats->average_frequency = stats->average_frequency
? ewma_add(stats->average_frequency, freq, 6)
: freq;
stats->last_duration = duration;
stats->max_duration = max(stats->max_duration, duration);
if (stats->last) {
stats->average_duration = ewma_add(stats->average_duration,
duration << 8, 3);
stats->last_event = end;
if (stats->average_frequency)
stats->average_frequency =
ewma_add(stats->average_frequency,
last << 8, 3);
else
stats->average_frequency = last << 8;
} else {
stats->average_duration = duration << 8;
bch2_quantiles_update(&stats->quantiles, duration);
}
stats->last = now ?: 1;
}
void bch2_time_stats_update(struct time_stats *stats, u64 start_time)
void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end)
{
spin_lock(&stats->lock);
__bch2_time_stats_update(stats, start_time);
spin_unlock(&stats->lock);
unsigned long flags;
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
bch2_time_stats_update_one(stats, start, end);
if (stats->average_frequency < 32 &&
stats->count > 1024)
stats->buffer =
alloc_percpu_gfp(struct time_stat_buffer,
GFP_ATOMIC);
spin_unlock_irqrestore(&stats->lock, flags);
} else {
struct time_stat_buffer_entry *i;
struct time_stat_buffer *b;
preempt_disable();
b = this_cpu_ptr(stats->buffer);
BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
b->entries[b->nr++] = (struct time_stat_buffer_entry) {
.start = start,
.end = end
};
if (b->nr == ARRAY_SIZE(b->entries)) {
spin_lock_irqsave(&stats->lock, flags);
for (i = b->entries;
i < b->entries + ARRAY_SIZE(b->entries);
i++)
bch2_time_stats_update_one(stats, i->start, i->end);
spin_unlock_irqrestore(&stats->lock, flags);
b->nr = 0;
}
preempt_enable();
}
}
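The function above keeps the slow path (spinlock per event) until an event proves frequent, then switches to buffering (start, end) pairs per CPU and folding a full buffer into the stats under a single lock acquisition. A much-simplified, single-lock, thread-local sketch of that batching idea (illustrative only, not the kernel's percpu machinery):

/* Sketch: batch samples locally, commit them under the lock in bulk. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define BATCH 32

struct stats {
	pthread_mutex_t lock;
	uint64_t count;
	uint64_t total_ns;
};

static _Thread_local struct {
	unsigned nr;
	uint64_t durations[BATCH];
} buf;

static void stats_commit_one(struct stats *s, uint64_t d)
{
	s->count++;
	s->total_ns += d;
}

static void stats_update(struct stats *s, uint64_t duration_ns)
{
	buf.durations[buf.nr++] = duration_ns;

	if (buf.nr == BATCH) {
		/* one lock round-trip per BATCH events instead of per event */
		pthread_mutex_lock(&s->lock);
		for (unsigned i = 0; i < BATCH; i++)
			stats_commit_one(s, buf.durations[i]);
		pthread_mutex_unlock(&s->lock);
		buf.nr = 0;
	}
}

int main(void)
{
	static struct stats s = { .lock = PTHREAD_MUTEX_INITIALIZER };

	for (uint64_t i = 0; i < 1024; i++)
		stats_update(&s, 100 + i % 7);

	printf("count=%llu avg=%llu ns\n",
	       (unsigned long long)s.count,
	       (unsigned long long)(s.total_ns / s.count));
	return 0;
}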
static const struct time_unit {
const char *name;
u32 nsecs;
} time_units[] = {
{ "ns", 1 },
{ "us", NSEC_PER_USEC },
{ "ms", NSEC_PER_MSEC },
{ "sec", NSEC_PER_SEC },
};
static const struct time_unit *pick_time_units(u64 ns)
{
const struct time_unit *u;
for (u = time_units;
u + 1 < time_units + ARRAY_SIZE(time_units) &&
ns >= u[1].nsecs << 1;
u++)
;
return u;
}
static size_t pr_time_units(char *buf, size_t len, u64 ns)
{
const struct time_unit *u = pick_time_units(ns);
return scnprintf(buf, len, "%llu %s", div_u64(ns, u->nsecs), u->name);
}
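pick_time_units()/pr_time_units() above choose the largest unit that still leaves at least two of that unit, then print the value integer-divided by it. A userspace sketch of the same selection, with hard-coded constants standing in for NSEC_PER_USEC and friends:

/* Sketch: auto-scaling a nanosecond value to a readable unit. */
#include <stdint.h>
#include <stdio.h>

struct time_unit { const char *name; uint32_t nsecs; };

static const struct time_unit time_units[] = {
	{ "ns",  1          },
	{ "us",  1000       },
	{ "ms",  1000000    },
	{ "sec", 1000000000 },
};

static const struct time_unit *pick_time_units(uint64_t ns)
{
	const struct time_unit *u;

	/* advance while the value is still at least two of the next unit */
	for (u = time_units;
	     u + 1 < time_units + sizeof(time_units) / sizeof(time_units[0]) &&
	     ns >= (uint64_t) u[1].nsecs << 1;
	     u++)
		;
	return u;
}

int main(void)
{
	const uint64_t samples[] = { 740, 53000, 8200000, 3100000000ULL };

	/* prints "740 ns", "53 us", "8 ms", "3 sec" */
	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		const struct time_unit *u = pick_time_units(samples[i]);

		printf("%llu %s\n",
		       (unsigned long long)(samples[i] / u->nsecs), u->name);
	}
	return 0;
}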
size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len)
{
char *out = buf, *end = buf + len;
const struct time_unit *u;
u64 freq = READ_ONCE(stats->average_frequency);
u64 q, last_q = 0;
int i;
out += scnprintf(out, end - out, "count:\t\t%llu\n",
stats->count);
out += scnprintf(out, end - out, "rate:\t\t%llu/sec\n",
freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
out += scnprintf(out, end - out, "frequency:\t");
out += pr_time_units(out, end - out, freq);
out += scnprintf(out, end - out, "\navg duration:\t");
out += pr_time_units(out, end - out, stats->average_duration);
out += scnprintf(out, end - out, "\nmax duration:\t");
out += pr_time_units(out, end - out, stats->max_duration);
i = eytzinger0_first(NR_QUANTILES);
u = pick_time_units(stats->quantiles.entries[i].m);
out += scnprintf(out, end - out, "\nquantiles (%s):\t", u->name);
eytzinger0_for_each(i, NR_QUANTILES) {
bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
q = max(stats->quantiles.entries[i].m, last_q);
out += scnprintf(out, end - out, "%llu%s",
div_u64(q, u->nsecs),
is_last ? "\n" : " ");
last_q = q;
}
return out - buf;
}
void bch2_time_stats_exit(struct time_stats *stats)
{
free_percpu(stats->buffer);
}
void bch2_time_stats_init(struct time_stats *stats)
{
memset(stats, 0, sizeof(*stats));
spin_lock_init(&stats->lock);
}
/* ratelimit: */
/**
* bch2_ratelimit_delay() - return how long to delay until the next time to do
* some work
@ -310,6 +443,8 @@ int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d)
}
}
/* pd controller: */
/*
* Updates pd_controller. Attempts to scale input values to units per second.
* @target: desired value
@ -404,6 +539,8 @@ size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf)
derivative, change, next_io);
}
/* misc: */
void bch2_bio_map(struct bio *bio, void *base)
{
size_t size = bio->bi_iter.bi_size;

View File

@ -371,87 +371,50 @@ ssize_t bch2_read_string_list(const char *, const char * const[]);
ssize_t bch2_scnprint_flag_list(char *, size_t, const char * const[], u64);
u64 bch2_read_flag_list(char *, const char * const[]);
#define NR_QUANTILES 15
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES)
#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES)
struct quantiles {
struct quantile_entry {
u64 m;
u64 step;
} entries[NR_QUANTILES];
};
struct time_stat_buffer {
unsigned nr;
struct time_stat_buffer_entry {
u64 start;
u64 end;
} entries[32];
};
struct time_stats {
spinlock_t lock;
u64 count;
/*
* all fields are in nanoseconds, averages are ewmas stored left shifted
* by 8
*/
u64 last_duration;
u64 max_duration;
/* all fields are in nanoseconds */
u64 average_duration;
u64 average_frequency;
u64 last;
u64 max_duration;
u64 last_event;
struct quantiles quantiles;
struct time_stat_buffer __percpu *buffer;
};
void bch2_time_stats_clear(struct time_stats *stats);
void __bch2_time_stats_update(struct time_stats *stats, u64 time);
void bch2_time_stats_update(struct time_stats *stats, u64 time);
void __bch2_time_stats_update(struct time_stats *stats, u64, u64);
static inline unsigned local_clock_us(void)
static inline void bch2_time_stats_update(struct time_stats *stats, u64 start)
{
return local_clock() >> 10;
__bch2_time_stats_update(stats, start, local_clock());
}
#define NSEC_PER_ns 1L
#define NSEC_PER_us NSEC_PER_USEC
#define NSEC_PER_ms NSEC_PER_MSEC
#define NSEC_PER_sec NSEC_PER_SEC
size_t bch2_time_stats_print(struct time_stats *, char *, size_t);
#define __print_time_stat(stats, name, stat, units) \
sysfs_print(name ## _ ## stat ## _ ## units, \
div_u64((stats)->stat >> 8, NSEC_PER_ ## units))
#define sysfs_print_time_stats(stats, name, \
frequency_units, \
duration_units) \
do { \
__print_time_stat(stats, name, \
average_frequency, frequency_units); \
__print_time_stat(stats, name, \
average_duration, duration_units); \
sysfs_print(name ## _ ##count, (stats)->count); \
sysfs_print(name ## _ ##last_duration ## _ ## duration_units, \
div_u64((stats)->last_duration, \
NSEC_PER_ ## duration_units)); \
sysfs_print(name ## _ ##max_duration ## _ ## duration_units, \
div_u64((stats)->max_duration, \
NSEC_PER_ ## duration_units)); \
\
sysfs_print(name ## _last_ ## frequency_units, (stats)->last \
? div_s64(local_clock() - (stats)->last, \
NSEC_PER_ ## frequency_units) \
: -1LL); \
} while (0)
#define sysfs_clear_time_stats(stats, name) \
do { \
if (attr == &sysfs_ ## name ## _clear) \
bch2_time_stats_clear(stats); \
} while (0)
#define sysfs_time_stats_attribute(name, \
frequency_units, \
duration_units) \
write_attribute(name ## _clear); \
read_attribute(name ## _count); \
read_attribute(name ## _average_frequency_ ## frequency_units); \
read_attribute(name ## _average_duration_ ## duration_units); \
read_attribute(name ## _last_duration_ ## duration_units); \
read_attribute(name ## _max_duration_ ## duration_units); \
read_attribute(name ## _last_ ## frequency_units)
#define sysfs_time_stats_attribute_list(name, \
frequency_units, \
duration_units) \
&sysfs_ ## name ## _clear, \
&sysfs_ ## name ## _count, \
&sysfs_ ## name ## _average_frequency_ ## frequency_units, \
&sysfs_ ## name ## _average_duration_ ## duration_units, \
&sysfs_ ## name ## _last_duration_ ## duration_units, \
&sysfs_ ## name ## _max_duration_ ## duration_units, \
&sysfs_ ## name ## _last_ ## frequency_units,
void bch2_time_stats_exit(struct time_stats *);
void bch2_time_stats_init(struct time_stats *);
#define ewma_add(ewma, val, weight) \
({ \