Update bcachefs sources to 26409a8f75 bcachefs: Journal updates to dev usage

Kent Overstreet 2021-02-02 14:26:28 -05:00
parent 7eef5f46dd
commit 4064aa126e
31 changed files with 804 additions and 704 deletions

View File

@ -1 +1 @@
ea3414eed52e5d90c248453e84b2dcd91c960306
26409a8f755b8faa620a49796d7935566204daaf

View File

@ -572,14 +572,10 @@ int cmd_list_journal(int argc, char *argv[])
printf("journal entry %8llu\n"
" version %8u\n"
" last seq %8llu\n"
" read clock %8u\n"
" write clock %8u\n"
,
le64_to_cpu(p->j.seq),
le32_to_cpu(p->j.version),
le64_to_cpu(p->j.last_seq),
le16_to_cpu(p->j.read_clock),
le16_to_cpu(p->j.write_clock));
le64_to_cpu(p->j.last_seq));
for_each_jset_key(k, _n, entry, &p->j) {
char buf[200];

View File

@ -623,8 +623,6 @@ static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
printf(" flags: %x", le32_to_cpu(clean->flags));
printf(" read clock: %x", le16_to_cpu(clean->read_clock));
printf(" write clock: %x", le16_to_cpu(clean->write_clock));
printf(" journal seq: %llx", le64_to_cpu(clean->journal_seq));
}

View File

@ -14,6 +14,7 @@
#include "ec.h"
#include "error.h"
#include "recovery.h"
#include "varint.h"
#include <linux/kthread.h>
#include <linux/math64.h>
@ -24,15 +25,12 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
static const char * const bch2_alloc_field_names[] = {
#define x(name, bytes) #name,
BCH_ALLOC_FIELDS()
static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
BCH_ALLOC_FIELDS_V1()
#undef x
NULL
};
static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
/* Ratelimiting/PD controllers */
static void pd_controllers_update(struct work_struct *work)
@ -67,10 +65,10 @@ static void pd_controllers_update(struct work_struct *work)
/* Persistent alloc info: */
static inline u64 get_alloc_field(const struct bch_alloc *a,
const void **p, unsigned field)
static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
const void **p, unsigned field)
{
unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
u64 v;
if (!(a->fields & (1 << field)))
@ -97,10 +95,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a,
return v;
}
static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
unsigned field, u64 v)
static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
unsigned field, u64 v)
{
unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
if (!v)
return;
@ -127,55 +125,149 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
*p += bytes;
}
static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
struct bkey_s_c k)
{
const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
const void *d = in->data;
unsigned idx = 0;
out->gen = in->gen;
#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
BCH_ALLOC_FIELDS_V1()
#undef x
}
static void bch2_alloc_pack_v1(struct bkey_alloc_buf *dst,
const struct bkey_alloc_unpacked src)
{
struct bkey_i_alloc *a = bkey_alloc_init(&dst->k);
void *d = a->v.data;
unsigned bytes, idx = 0;
a->k.p = POS(src.dev, src.bucket);
a->v.fields = 0;
a->v.gen = src.gen;
#define x(_name, _bits) alloc_field_v1_put(a, &d, idx++, src._name);
BCH_ALLOC_FIELDS_V1()
#undef x
bytes = (void *) d - (void *) &a->v;
set_bkey_val_bytes(&a->k, bytes);
memset_u64s_tail(&a->v, 0, bytes);
}
static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
struct bkey_s_c k)
{
struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
const u8 *in = a.v->data;
const u8 *end = bkey_val_end(a);
unsigned fieldnr = 0;
int ret;
u64 v;
out->gen = a.v->gen;
out->oldest_gen = a.v->oldest_gen;
out->data_type = a.v->data_type;
#define x(_name, _bits) \
if (fieldnr < a.v->nr_fields) { \
ret = bch2_varint_decode(in, end, &v); \
if (ret < 0) \
return ret; \
in += ret; \
} else { \
v = 0; \
} \
out->_name = v; \
if (v != out->_name) \
return -1; \
fieldnr++;
BCH_ALLOC_FIELDS_V2()
#undef x
return 0;
}
static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
const struct bkey_alloc_unpacked src)
{
struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
u8 *out = a->v.data;
u8 *end = (void *) &dst[1];
u8 *last_nonzero_field = out;
unsigned bytes;
a->k.p = POS(src.dev, src.bucket);
a->v.gen = src.gen;
a->v.oldest_gen = src.oldest_gen;
a->v.data_type = src.data_type;
#define x(_name, _bits) \
nr_fields++; \
\
if (src._name) { \
out += bch2_varint_encode(out, src._name); \
\
last_nonzero_field = out; \
last_nonzero_fieldnr = nr_fields; \
} else { \
*out++ = 0; \
}
BCH_ALLOC_FIELDS_V2()
#undef x
BUG_ON(out > end);
out = last_nonzero_field;
a->v.nr_fields = last_nonzero_fieldnr;
bytes = (u8 *) out - (u8 *) &a->v;
set_bkey_val_bytes(&a->k, bytes);
memset_u64s_tail(&a->v, 0, bytes);
}
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
struct bkey_alloc_unpacked ret = { .gen = 0 };
struct bkey_alloc_unpacked ret = {
.dev = k.k->p.inode,
.bucket = k.k->p.offset,
.gen = 0,
};
if (k.k->type == KEY_TYPE_alloc) {
const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
const void *d = a->data;
unsigned idx = 0;
if (k.k->type == KEY_TYPE_alloc_v2)
bch2_alloc_unpack_v2(&ret, k);
else if (k.k->type == KEY_TYPE_alloc)
bch2_alloc_unpack_v1(&ret, k);
ret.gen = a->gen;
#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++);
BCH_ALLOC_FIELDS()
#undef x
}
return ret;
}
void bch2_alloc_pack(struct bkey_i_alloc *dst,
void bch2_alloc_pack(struct bch_fs *c,
struct bkey_alloc_buf *dst,
const struct bkey_alloc_unpacked src)
{
unsigned idx = 0;
void *d = dst->v.data;
unsigned bytes;
dst->v.fields = 0;
dst->v.gen = src.gen;
#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name);
BCH_ALLOC_FIELDS()
#undef x
bytes = (void *) d - (void *) &dst->v;
set_bkey_val_bytes(&dst->k, bytes);
memset_u64s_tail(&dst->v, 0, bytes);
if (c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))
bch2_alloc_pack_v2(dst, src);
else
bch2_alloc_pack_v1(dst, src);
}
static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
{
unsigned i, bytes = offsetof(struct bch_alloc, data);
for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
if (a->fields & (1 << i))
bytes += BCH_ALLOC_FIELD_BYTES[i];
bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
return DIV_ROUND_UP(bytes, sizeof(u64));
}
const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
@ -190,20 +282,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
return NULL;
}
void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
const void *d = a.v->data;
unsigned i;
struct bkey_alloc_unpacked u;
pr_buf(out, "gen %u", a.v->gen);
if (k.k->p.inode >= c->sb.nr_devices ||
!c->devs[k.k->p.inode])
return "invalid device";
for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
if (a.v->fields & (1 << i))
pr_buf(out, " %s %llu",
bch2_alloc_field_names[i],
get_alloc_field(a.v, &d, i));
if (bch2_alloc_unpack_v2(&u, k))
return "unpack error";
return NULL;
}
void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
struct bkey_s_c k)
{
struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
pr_buf(out, "gen %u oldest_gen %u data_type %u",
u.gen, u.oldest_gen, u.data_type);
#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name);
BCH_ALLOC_FIELDS_V2()
#undef x
}
static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
@ -213,7 +315,9 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
struct bucket *g;
struct bkey_alloc_unpacked u;
if (level || k.k->type != KEY_TYPE_alloc)
if (level ||
(k.k->type != KEY_TYPE_alloc &&
k.k->type != KEY_TYPE_alloc_v2))
return 0;
ca = bch_dev_bkey_exists(c, k.k->p.inode);
@ -234,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
{
struct bch_dev *ca;
unsigned i;
int ret = 0;
int ret;
down_read(&c->gc_lock);
ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
@ -248,26 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
return ret;
}
percpu_down_write(&c->mark_lock);
bch2_dev_usage_from_buckets(c);
percpu_up_write(&c->mark_lock);
mutex_lock(&c->bucket_clock[READ].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
bch2_recalc_oldest_io(c, ca, READ);
up_read(&ca->bucket_lock);
}
mutex_unlock(&c->bucket_clock[READ].lock);
mutex_lock(&c->bucket_clock[WRITE].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
bch2_recalc_oldest_io(c, ca, WRITE);
up_read(&ca->bucket_lock);
}
mutex_unlock(&c->bucket_clock[WRITE].lock);
return 0;
}
@ -281,8 +363,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
struct bucket *g;
struct bucket_mark m;
struct bkey_alloc_unpacked old_u, new_u;
__BKEY_PADDED(k, 8) alloc_key; /* hack: */
struct bkey_i_alloc *a;
struct bkey_alloc_buf a;
int ret;
retry:
bch2_trans_begin(trans);
@ -303,17 +384,14 @@ retry:
ca = bch_dev_bkey_exists(c, iter->pos.inode);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
new_u = alloc_mem_to_key(g, m);
new_u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
if (!bkey_alloc_unpacked_cmp(old_u, new_u))
return 0;
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
bch2_alloc_pack(a, new_u);
bch2_trans_update(trans, iter, &a->k_i,
bch2_alloc_pack(c, &a, new_u);
bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_NORUN);
ret = bch2_trans_commit(trans, NULL, NULL,
BTREE_INSERT_NOFAIL|flags);
@ -358,114 +436,6 @@ err:
/* Bucket IO clocks: */
static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
{
struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets = bucket_array(ca);
struct bucket *g;
u16 max_last_io = 0;
unsigned i;
lockdep_assert_held(&c->bucket_clock[rw].lock);
/* Recalculate max_last_io for this device: */
for_each_bucket(g, buckets)
max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
ca->max_last_bucket_io[rw] = max_last_io;
/* Recalculate global max_last_io: */
max_last_io = 0;
for_each_member_device(ca, c, i)
max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
clock->max_last_io = max_last_io;
}
static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
{
struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets;
struct bch_dev *ca;
struct bucket *g;
unsigned i;
trace_rescale_prios(c);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
g->io_time[rw] = clock->hand -
bucket_last_io(c, g, rw) / 2;
bch2_recalc_oldest_io(c, ca, rw);
up_read(&ca->bucket_lock);
}
}
static inline u64 bucket_clock_freq(u64 capacity)
{
return max(capacity >> 10, 2028ULL);
}
static void bch2_inc_clock_hand(struct io_timer *timer)
{
struct bucket_clock *clock = container_of(timer,
struct bucket_clock, rescale);
struct bch_fs *c = container_of(clock,
struct bch_fs, bucket_clock[clock->rw]);
struct bch_dev *ca;
u64 capacity;
unsigned i;
mutex_lock(&clock->lock);
/* if clock cannot be advanced more, rescale prio */
if (clock->max_last_io >= U16_MAX - 2)
bch2_rescale_bucket_io_times(c, clock->rw);
BUG_ON(clock->max_last_io >= U16_MAX - 2);
for_each_member_device(ca, c, i)
ca->max_last_bucket_io[clock->rw]++;
clock->max_last_io++;
clock->hand++;
mutex_unlock(&clock->lock);
capacity = READ_ONCE(c->capacity);
if (!capacity)
return;
/*
* we only increment when 0.1% of the filesystem capacity has been read
* or written too, this determines if it's time
*
* XXX: we shouldn't really be going off of the capacity of devices in
* RW mode (that will be 0 when we're RO, yet we can still service
* reads)
*/
timer->expire += bucket_clock_freq(capacity);
bch2_io_timer_add(&c->io_clock[clock->rw], timer);
}
static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
{
struct bucket_clock *clock = &c->bucket_clock[rw];
clock->hand = 1;
clock->rw = rw;
clock->rescale.fn = bch2_inc_clock_hand;
clock->rescale.expire = bucket_clock_freq(c->capacity);
mutex_init(&clock->lock);
}
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
size_t bucket_nr, int rw)
{
@ -473,9 +443,9 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
struct btree_iter *iter;
struct bucket *g;
struct bkey_i_alloc *a;
struct bkey_alloc_buf *a;
struct bkey_alloc_unpacked u;
u16 *time;
u64 *time, now;
int ret = 0;
iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
@ -486,28 +456,25 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
if (ret)
goto out;
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
percpu_down_read(&c->mark_lock);
g = bucket(ca, bucket_nr);
u = alloc_mem_to_key(g, READ_ONCE(g->mark));
u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
time = rw == READ ? &u.read_time : &u.write_time;
if (*time == c->bucket_clock[rw].hand)
now = atomic64_read(&c->io_clock[rw].now);
if (*time == now)
goto out;
*time = c->bucket_clock[rw].hand;
*time = now;
bch2_alloc_pack(a, u);
ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?:
bch2_alloc_pack(c, a, u);
ret = bch2_trans_update(trans, iter, &a->k, 0) ?:
bch2_trans_commit(trans, NULL, NULL, 0);
out:
bch2_trans_iter_put(trans, iter);
@ -576,23 +543,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
return ret;
}
static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
size_t bucket,
struct bucket_mark mark)
static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
struct bucket_mark m)
{
u8 gc_gen;
if (!is_available_bucket(mark))
if (!is_available_bucket(m))
return false;
if (mark.owned_by_allocator)
if (m.owned_by_allocator)
return false;
if (ca->buckets_nouse &&
test_bit(bucket, ca->buckets_nouse))
test_bit(b, ca->buckets_nouse))
return false;
gc_gen = bucket_gc_gen(ca, bucket);
gc_gen = bucket_gc_gen(bucket(ca, b));
if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
ca->inc_gen_needs_gc++;
@ -606,43 +572,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
/*
* Determines what order we're going to reuse buckets, smallest bucket_key()
* first.
*
*
* - We take into account the read prio of the bucket, which gives us an
* indication of how hot the data is -- we scale the prio so that the prio
* farthest from the clock is worth 1/8th of the closest.
*
* - The number of sectors of cached data in the bucket, which gives us an
* indication of the cost in cache misses this eviction will cause.
*
* - If hotness * sectors used compares equal, we pick the bucket with the
* smallest bucket_gc_gen() - since incrementing the same bucket's generation
* number repeatedly forces us to run mark and sweep gc to avoid generation
* number wraparound.
*/
static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark m)
static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
u64 now, u64 last_seq_ondisk)
{
unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
unsigned max_last_io = ca->max_last_bucket_io[READ];
unsigned used = bucket_sectors_used(m);
/*
* Time since last read, scaled to [0, 8) where larger value indicates
* more recently read data:
*/
unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
if (used) {
/*
* Prefer to keep buckets that have been read more recently, and
* buckets that have more data in them:
*/
u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
/* How much we want to keep the data in this bucket: */
unsigned long data_wantness =
(hotness + 1) * bucket_sectors_used(m);
unsigned long needs_journal_commit =
bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
return (data_wantness << 9) |
(needs_journal_commit << 8) |
(bucket_gc_gen(ca, b) / 16);
return -last_read_scaled;
} else {
/*
* Prefer to use buckets with smaller gc_gen so that we don't
* have to walk the btree and recalculate oldest_gen - but shift
* off the low bits so that buckets will still have equal sort
* keys when there's only a small difference, so that we can
* keep sequential buckets together:
*/
return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
(bucket_gc_gen(g) >> 4);
}
}
static inline int bucket_alloc_cmp(alloc_heap *h,
@ -665,16 +621,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
struct alloc_heap_entry e = { 0 };
u64 now, last_seq_ondisk;
size_t b, i, nr = 0;
ca->alloc_heap.used = 0;
mutex_lock(&c->bucket_clock[READ].lock);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
bch2_recalc_oldest_io(c, ca, READ);
ca->alloc_heap.used = 0;
now = atomic64_read(&c->io_clock[READ].now);
last_seq_ondisk = c->journal.last_seq_ondisk;
/*
* Find buckets with lowest read priority, by building a maxheap sorted
@ -682,8 +637,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
* all buckets have been visited.
*/
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
unsigned long key = bucket_sort_key(c, ca, b, m);
struct bucket *g = &buckets->b[b];
struct bucket_mark m = READ_ONCE(g->mark);
unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
@ -718,7 +674,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
}
up_read(&ca->bucket_lock);
mutex_unlock(&c->bucket_clock[READ].lock);
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@ -863,14 +818,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
struct btree_iter *iter,
u64 *journal_seq, unsigned flags)
{
#if 0
__BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
#else
/* hack: */
__BKEY_PADDED(k, 8) alloc_key;
#endif
struct bch_fs *c = trans->c;
struct bkey_i_alloc *a;
struct bkey_alloc_buf a;
struct bkey_alloc_unpacked u;
struct bucket *g;
struct bucket_mark m;
@ -920,8 +869,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
goto out;
}
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
retry:
ret = bch2_btree_iter_traverse(iter);
@ -931,7 +878,7 @@ retry:
percpu_down_read(&c->mark_lock);
g = bucket(ca, iter->pos.offset);
m = READ_ONCE(g->mark);
u = alloc_mem_to_key(g, m);
u = alloc_mem_to_key(iter, g, m);
percpu_up_read(&c->mark_lock);
@ -941,14 +888,11 @@ retry:
u.data_type = 0;
u.dirty_sectors = 0;
u.cached_sectors = 0;
u.read_time = c->bucket_clock[READ].hand;
u.write_time = c->bucket_clock[WRITE].hand;
u.read_time = atomic64_read(&c->io_clock[READ].now);
u.write_time = atomic64_read(&c->io_clock[WRITE].now);
a = bkey_alloc_init(&alloc_key.k);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i,
bch2_alloc_pack(c, &a, u);
bch2_trans_update(trans, iter, &a.k,
BTREE_TRIGGER_BUCKET_INVALIDATE);
/*
@ -1455,8 +1399,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
void bch2_fs_allocator_background_init(struct bch_fs *c)
{
spin_lock_init(&c->freelist_lock);
bch2_bucket_clock_init(c, READ);
bch2_bucket_clock_init(c, WRITE);
c->pd_controllers_update_seconds = 5;
INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
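
A rough illustration of the v2 packing scheme used by bch2_alloc_pack_v2()/bch2_alloc_unpack_v2() above: each field is stored as a variable-length integer and trailing zero fields are dropped, with nr_fields recording how many were kept. The plain LEB128-style varint below is only a stand-in for bch2_varint_encode()/bch2_varint_decode(), whose actual byte format is not part of this diff.

/* Sketch only -- not bcachefs's on-disk varint encoding. */
#include <stdint.h>
#include <stddef.h>

static size_t varint_encode(uint8_t *out, uint64_t v)
{
	size_t n = 0;

	do {
		out[n++] = (v & 0x7f) | (v > 0x7f ? 0x80 : 0);
		v >>= 7;
	} while (v);

	return n;
}

static size_t pack_fields(uint8_t *out, const uint64_t *fields,
			  unsigned nr, unsigned *nr_fields)
{
	uint8_t *p = out, *last_nonzero = out;
	unsigned last_nonzero_fieldnr = 0;

	for (unsigned i = 0; i < nr; i++) {
		if (fields[i]) {
			p += varint_encode(p, fields[i]);
			last_nonzero = p;
			last_nonzero_fieldnr = i + 1;
		} else {
			*p++ = 0;	/* zero field: a single zero byte */
		}
	}

	/* trailing zero fields are truncated, like bch2_alloc_pack_v2() */
	*nr_fields = last_nonzero_fieldnr;
	return last_nonzero - out;
}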

View File

@ -7,12 +7,33 @@
#include "debug.h"
struct bkey_alloc_unpacked {
u64 bucket;
u8 dev;
u8 gen;
u8 oldest_gen;
u8 data_type;
#define x(_name, _bits) u##_bits _name;
BCH_ALLOC_FIELDS()
BCH_ALLOC_FIELDS_V2()
#undef x
};
struct bkey_alloc_buf {
struct bkey_i k;
union {
struct {
#define x(_name, _bits) + _bits / 8
u8 _pad[8 + BCH_ALLOC_FIELDS_V1()];
#undef x
} _v1;
struct {
#define x(_name, _bits) + 8 + _bits / 8
u8 _pad[8 + BCH_ALLOC_FIELDS_V2()];
#undef x
} _v2;
};
} __attribute__((packed, aligned(8)));
/* How out of date a pointer gen is allowed to be: */
#define BUCKET_GC_GEN_MAX 96U
@ -20,23 +41,28 @@ struct bkey_alloc_unpacked {
static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
struct bkey_alloc_unpacked r)
{
return l.gen != r.gen
#define x(_name, _bits) || l._name != r._name
BCH_ALLOC_FIELDS()
return l.gen != r.gen ||
l.oldest_gen != r.oldest_gen ||
l.data_type != r.data_type
#define x(_name, ...) || l._name != r._name
BCH_ALLOC_FIELDS_V2()
#undef x
;
}
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
void bch2_alloc_pack(struct bkey_i_alloc *,
void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *,
const struct bkey_alloc_unpacked);
int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
static inline struct bkey_alloc_unpacked
alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
alloc_mem_to_key(struct btree_iter *iter,
struct bucket *g, struct bucket_mark m)
{
return (struct bkey_alloc_unpacked) {
.dev = iter->pos.inode,
.bucket = iter->pos.offset,
.gen = m.gen,
.oldest_gen = g->oldest_gen,
.data_type = m.data_type,
@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc (struct bkey_ops) { \
.key_invalid = bch2_alloc_invalid, \
.key_invalid = bch2_alloc_v1_invalid, \
.val_to_text = bch2_alloc_to_text, \
}
#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) { \
.key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
}
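
The net effect of these header changes on callers: the old bkey_i_alloc plus __BKEY_PADDED hack is replaced by struct bkey_alloc_buf (sized for either format), alloc_mem_to_key() now takes the iterator so it can fill in .dev/.bucket, and bch2_alloc_pack() picks v1 or v2 from the alloc_v2 feature bit. A condensed sketch of the flow the rest of this commit converts callers to; the helper name is hypothetical, and locking, buffer allocation via bch2_trans_kmalloc() and error handling are elided:

/* Hypothetical helper, illustrating the calling pattern only. */
static int alloc_update_one_bucket(struct btree_trans *trans,
				   struct btree_iter *iter,
				   struct bucket *g, struct bucket_mark m)
{
	struct bch_fs *c = trans->c;
	struct bkey_alloc_buf a;
	struct bkey_alloc_unpacked u;

	u = alloc_mem_to_key(iter, g, m);	/* unpack in-memory state */

	/* ...modify whichever fields need updating... */
	u.read_time = atomic64_read(&c->io_clock[READ].now);

	bch2_alloc_pack(c, &a, u);		/* packs as v1 or v2 */
	return bch2_trans_update(trans, iter, &a.k, 0);
}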

View File

@ -10,30 +10,6 @@
struct ec_bucket_buf;
/* There's two of these clocks, one for reads and one for writes: */
struct bucket_clock {
/*
* "now" in (read/write) IO time - incremented whenever we do X amount
* of reads or writes.
*
* Goes with the bucket read/write prios: when we read or write to a
* bucket we reset the bucket's prio to the current hand; thus hand -
* prio = time since bucket was last read/written.
*
* The units are some amount (bytes/sectors) of data read/written, and
* the units can change on the fly if we need to rescale to fit
* everything in a u16 - your only guarantee is that the units are
* consistent.
*/
u16 hand;
u16 max_last_io;
int rw;
struct io_timer rescale;
struct mutex lock;
};
enum alloc_reserve {
RESERVE_BTREE_MOVINGGC = -2,
RESERVE_BTREE = -1,

View File

@ -429,7 +429,9 @@ struct bch_dev {
unsigned long *buckets_nouse;
struct rw_semaphore bucket_lock;
struct bch_dev_usage __percpu *usage[2];
struct bch_dev_usage *usage_base;
struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR];
struct bch_dev_usage __percpu *usage_gc;
/* Allocator: */
struct task_struct __rcu *alloc_thread;
@ -451,9 +453,6 @@ struct bch_dev {
size_t fifo_last_bucket;
/* last calculated minimum prio */
u16 max_last_bucket_io[2];
size_t inc_gen_needs_gc;
size_t inc_gen_really_needs_gc;
@ -473,6 +472,7 @@ struct bch_dev {
atomic64_t rebalance_work;
struct journal_device journal;
u64 prev_journal_sector;
struct work_struct io_error_work;
@ -584,6 +584,8 @@ struct bch_fs {
struct journal_entry_res replicas_journal_res;
struct journal_entry_res dev_usage_journal_res;
struct bch_disk_groups_cpu __rcu *disk_groups;
struct bch_opts opts;
@ -691,14 +693,6 @@ struct bch_fs {
struct mutex usage_scratch_lock;
struct bch_fs_usage *usage_scratch;
/*
* When we invalidate buckets, we use both the priority and the amount
* of good data to determine which buckets to reuse first - to weight
* those together consistently we keep track of the smallest nonzero
* priority of any bucket.
*/
struct bucket_clock bucket_clock[2];
struct io_clock io_clock[2];
/* JOURNAL SEQ BLACKLIST */

View File

@ -341,7 +341,8 @@ static inline void bkey_init(struct bkey *k)
x(reflink_v, 16) \
x(inline_data, 17) \
x(btree_ptr_v2, 18) \
x(indirect_inline_data, 19)
x(indirect_inline_data, 19) \
x(alloc_v2, 20)
enum bch_bkey_type {
#define x(name, nr) KEY_TYPE_##name = nr,
@ -551,9 +552,11 @@ struct bch_extent_stripe_ptr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:5,
block:8,
idx:51;
redundancy:4,
idx:47;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 idx:51,
__u64 idx:47,
redundancy:4,
block:8,
type:5;
#endif
@ -799,35 +802,40 @@ struct bch_alloc {
__u8 data[];
} __attribute__((packed, aligned(8)));
#define BCH_ALLOC_FIELDS() \
#define BCH_ALLOC_FIELDS_V1() \
x(read_time, 16) \
x(write_time, 16) \
x(data_type, 8) \
x(dirty_sectors, 16) \
x(cached_sectors, 16) \
x(oldest_gen, 8)
x(oldest_gen, 8) \
x(stripe, 32) \
x(stripe_redundancy, 8)
struct bch_alloc_v2 {
struct bch_val v;
__u8 nr_fields;
__u8 gen;
__u8 oldest_gen;
__u8 data_type;
__u8 data[];
} __attribute__((packed, aligned(8)));
#define BCH_ALLOC_FIELDS_V2() \
x(read_time, 64) \
x(write_time, 64) \
x(dirty_sectors, 16) \
x(cached_sectors, 16) \
x(stripe, 32) \
x(stripe_redundancy, 8)
enum {
#define x(name, bytes) BCH_ALLOC_FIELD_##name,
BCH_ALLOC_FIELDS()
#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
BCH_ALLOC_FIELDS_V1()
#undef x
BCH_ALLOC_FIELD_NR
};
static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
BCH_ALLOC_FIELDS()
#undef x
};
#define x(name, bits) + (bits / 8)
static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
DIV_ROUND_UP(offsetof(struct bch_alloc, data)
BCH_ALLOC_FIELDS(), sizeof(u64));
#undef x
#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX)
/* Quotas: */
enum quota_types {
@ -1131,8 +1139,8 @@ struct bch_sb_field_clean {
struct bch_sb_field field;
__le32 flags;
__le16 read_clock;
__le16 write_clock;
__le16 _read_clock; /* no longer used */
__le16 _write_clock;
__le64 journal_seq;
union {
@ -1305,6 +1313,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64);
LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28);
/*
* Features:
@ -1332,7 +1341,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
x(btree_updates_journalled, 13) \
x(reflink_inline_data, 14) \
x(new_varint, 15) \
x(journal_no_flush, 16)
x(journal_no_flush, 16) \
x(alloc_v2, 17)
#define BCH_SB_FEATURES_ALL \
((1ULL << BCH_FEATURE_new_siphash)| \
@ -1340,7 +1350,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16);
(1ULL << BCH_FEATURE_btree_ptr_v2)| \
(1ULL << BCH_FEATURE_extents_above_btree_updates)|\
(1ULL << BCH_FEATURE_new_varint)| \
(1ULL << BCH_FEATURE_journal_no_flush))
(1ULL << BCH_FEATURE_journal_no_flush)| \
(1ULL << BCH_FEATURE_alloc_v2))
enum bch_sb_feature {
#define x(f, n) BCH_FEATURE_##f,
@ -1493,7 +1504,9 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(blacklist, 3) \
x(blacklist_v2, 4) \
x(usage, 5) \
x(data_usage, 6)
x(data_usage, 6) \
x(clock, 7) \
x(dev_usage, 8)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@ -1541,6 +1554,30 @@ struct jset_entry_data_usage {
struct bch_replicas_entry r;
} __attribute__((packed));
struct jset_entry_clock {
struct jset_entry entry;
__u8 rw;
__u8 pad[7];
__le64 time;
} __attribute__((packed));
struct jset_entry_dev_usage_type {
__le64 buckets;
__le64 sectors;
__le64 fragmented;
} __attribute__((packed));
struct jset_entry_dev_usage {
struct jset_entry entry;
__le32 dev;
__u32 pad;
__le64 buckets_ec;
__le64 buckets_unavailable;
struct jset_entry_dev_usage_type d[];
} __attribute__((packed));
/*
* On disk format for a journal entry:
* seq is monotonically increasing; every journal entry has its own unique
@ -1563,8 +1600,8 @@ struct jset {
__u8 encrypted_start[0];
__le16 read_clock;
__le16 write_clock;
__le16 _read_clock; /* no longer used */
__le16 _write_clock;
/* Sequence number of oldest dirty journal entry */
__le64 last_seq;

View File

@ -530,6 +530,7 @@ BKEY_VAL_ACCESSORS(reflink_v);
BKEY_VAL_ACCESSORS(inline_data);
BKEY_VAL_ACCESSORS(btree_ptr_v2);
BKEY_VAL_ACCESSORS(indirect_inline_data);
BKEY_VAL_ACCESSORS(alloc_v2);
/* byte order helpers */

View File

@ -706,8 +706,8 @@ static void bch2_gc_free(struct bch_fs *c)
ca->mi.nbuckets * sizeof(struct bucket));
ca->buckets[1] = NULL;
free_percpu(ca->usage[1]);
ca->usage[1] = NULL;
free_percpu(ca->usage_gc);
ca->usage_gc = NULL;
}
free_percpu(c->usage_gc);
@ -720,7 +720,7 @@ static int bch2_gc_done(struct bch_fs *c,
struct bch_dev *ca;
bool verify = (!initial ||
(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
unsigned i;
unsigned i, dev;
int ret = 0;
#define copy_field(_f, _msg, ...) \
@ -786,7 +786,10 @@ static int bch2_gc_done(struct bch_fs *c,
}
}
for_each_member_device(ca, c, i) {
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
for_each_member_device(ca, c, dev) {
struct bucket_array *dst = __bucket_array(ca, 0);
struct bucket_array *src = __bucket_array(ca, 1);
size_t b;
@ -801,13 +804,24 @@ static int bch2_gc_done(struct bch_fs *c,
dst->b[b].oldest_gen = src->b[b].oldest_gen;
}
{
struct bch_dev_usage *dst = ca->usage_base;
struct bch_dev_usage *src = (void *)
bch2_acc_percpu_u64s((void *) ca->usage_gc,
dev_usage_u64s());
copy_dev_field(buckets_ec, "buckets_ec");
copy_dev_field(buckets_unavailable, "buckets_unavailable");
for (i = 0; i < BCH_DATA_NR; i++) {
copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]);
copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]);
copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
}
}
};
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
bch2_dev_usage_from_buckets(c);
{
unsigned nr = fs_usage_u64s(c);
struct bch_fs_usage *dst = c->usage_base;
@ -862,7 +876,7 @@ static int bch2_gc_start(struct bch_fs *c)
for_each_member_device(ca, c, i) {
BUG_ON(ca->buckets[1]);
BUG_ON(ca->usage[1]);
BUG_ON(ca->usage_gc);
ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
@ -873,9 +887,9 @@ static int bch2_gc_start(struct bch_fs *c)
return -ENOMEM;
}
ca->usage[1] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[1]) {
bch_err(c, "error allocating ca->usage[gc]");
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
if (!ca->usage_gc) {
bch_err(c, "error allocating ca->usage_gc");
percpu_ref_put(&ca->ref);
return -ENOMEM;
}
@ -1489,7 +1503,7 @@ static int bch2_gc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last = atomic_long_read(&clock->now);
unsigned long last = atomic64_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
int ret;
@ -1510,7 +1524,7 @@ static int bch2_gc_thread(void *arg)
if (c->btree_gc_periodic) {
unsigned long next = last + c->capacity / 16;
if (atomic_long_read(&clock->now) >= next)
if (atomic64_read(&clock->now) >= next)
break;
bch2_io_clock_schedule_timeout(clock, next);
@ -1522,7 +1536,7 @@ static int bch2_gc_thread(void *arg)
}
__set_current_state(TASK_RUNNING);
last = atomic_long_read(&clock->now);
last = atomic64_read(&clock->now);
last_kick = atomic_read(&c->kick_gc);
/*

View File

@ -222,7 +222,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
mutex_unlock(&c->btree_reserve_cache_lock);
retry:
wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
wp = bch2_alloc_sectors_start(c,
c->opts.metadata_target ?:
c->opts.foreground_target,
0,
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,

View File

@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
void bch2_fs_usage_initialize(struct bch_fs *c)
{
struct bch_fs_usage *usage;
struct bch_dev *ca;
unsigned i;
percpu_down_write(&c->mark_lock);
@ -155,6 +156,14 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
}
for_each_member_device(ca, c, i) {
struct bch_dev_usage dev = bch2_dev_usage_read(ca);
usage->hidden += (dev.d[BCH_DATA_sb].buckets +
dev.d[BCH_DATA_journal].buckets) *
ca->mi.bucket_size;
}
percpu_up_write(&c->mark_lock);
}
@ -189,14 +198,27 @@ out_pool:
return ret;
}
static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
unsigned journal_seq,
bool gc)
{
return this_cpu_ptr(gc
? ca->usage_gc
: ca->usage[journal_seq & JOURNAL_BUF_MASK]);
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
{
struct bch_fs *c = ca->fs;
struct bch_dev_usage ret;
unsigned seq, i, u64s = dev_usage_u64s();
memset(&ret, 0, sizeof(ret));
acc_u64s_percpu((u64 *) &ret,
(u64 __percpu *) ca->usage[0],
sizeof(ret) / sizeof(u64));
do {
seq = read_seqcount_begin(&c->usage_lock);
memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
} while (read_seqcount_retry(&c->usage_lock, seq));
return ret;
}
@ -261,7 +283,8 @@ retry:
void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
unsigned u64s = fs_usage_u64s(c);
struct bch_dev *ca;
unsigned i, u64s = fs_usage_u64s(c);
BUG_ON(idx >= ARRAY_SIZE(c->usage));
@ -272,6 +295,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
(u64 __percpu *) c->usage[idx], u64s);
percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
rcu_read_lock();
for_each_member_device_rcu(ca, c, i, NULL) {
u64s = dev_usage_u64s();
acc_u64s_percpu((u64 *) ca->usage_base,
(u64 __percpu *) ca->usage[idx], u64s);
percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
}
rcu_read_unlock();
write_seqcount_end(&c->usage_lock);
preempt_enable();
}
@ -454,14 +487,14 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
struct bch_fs_usage *fs_usage,
struct bucket_mark old, struct bucket_mark new,
bool gc)
u64 journal_seq, bool gc)
{
struct bch_dev_usage *u;
percpu_rwsem_assert_held(&c->mark_lock);
preempt_disable();
u = this_cpu_ptr(ca->usage[gc]);
u = dev_usage_ptr(ca, journal_seq, gc);
if (bucket_type(old))
account_bucket(fs_usage, u, bucket_type(old),
@ -491,31 +524,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
bch2_wake_allocator(ca);
}
__flatten
void bch2_dev_usage_from_buckets(struct bch_fs *c)
{
struct bch_dev *ca;
struct bucket_mark old = { .v.counter = 0 };
struct bucket_array *buckets;
struct bucket *g;
unsigned i;
int cpu;
c->usage_base->hidden = 0;
for_each_member_device(ca, c, i) {
for_each_possible_cpu(cpu)
memset(per_cpu_ptr(ca->usage[0], cpu), 0,
sizeof(*ca->usage[0]));
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
bch2_dev_usage_update(c, ca, c->usage_base,
old, g->mark, false);
}
}
static inline int update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
struct bch_replicas_entry *r,
@ -653,7 +661,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
new.owned_by_allocator = owned_by_allocator;
}));
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
/*
* XXX: this is wrong, this means we'll be doing updates to the percpu
* buckets_alloc counter that don't have an open journal buffer and
* we'll race with the machinery that accumulates that to ca->usage_base
*/
bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc);
BUG_ON(!gc &&
!owned_by_allocator && !old.owned_by_allocator);
@ -685,7 +698,8 @@ static int bch2_mark_alloc(struct bch_fs *c,
struct bucket_mark old_m, m;
/* We don't do anything for deletions - do we?: */
if (new.k->type != KEY_TYPE_alloc)
if (new.k->type != KEY_TYPE_alloc &&
new.k->type != KEY_TYPE_alloc_v2)
return 0;
/*
@ -708,6 +722,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
m.data_type = u.data_type;
m.dirty_sectors = u.dirty_sectors;
m.cached_sectors = u.cached_sectors;
m.stripe = u.stripe != 0;
if (journal_seq) {
m.journal_seq_valid = 1;
@ -715,12 +730,14 @@ static int bch2_mark_alloc(struct bch_fs *c,
}
}));
bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
g->io_time[READ] = u.read_time;
g->io_time[WRITE] = u.write_time;
g->oldest_gen = u.oldest_gen;
g->gen_valid = 1;
g->stripe = u.stripe;
g->stripe_redundancy = u.stripe_redundancy;
/*
* need to know if we're getting called from the invalidate path or
@ -778,7 +795,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (c)
bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
old, new, gc);
old, new, 0, gc);
return 0;
}
@ -915,11 +932,10 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
unsigned ptr_idx,
struct bch_fs_usage *fs_usage,
u64 journal_seq, unsigned flags,
bool enabled)
u64 journal_seq, unsigned flags)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
unsigned nr_data = s->nr_blocks - s->nr_redundant;
@ -932,8 +948,13 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
char buf[200];
int ret;
if (enabled)
g->ec_redundancy = s->nr_redundant;
if (g->stripe && g->stripe != k.k->p.offset) {
bch2_fs_inconsistent(c,
"bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
return -EINVAL;
}
old = bucket_cmpxchg(g, new, ({
ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
@ -941,23 +962,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
if (ret)
return ret;
if (new.stripe && enabled)
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
if (!new.stripe && !enabled)
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
"bucket %u:%zu gen %u: deleting stripe but not marked\n%s",
ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
(bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
new.stripe = enabled;
if ((flags & BTREE_TRIGGER_GC) && parity) {
new.data_type = enabled ? BCH_DATA_parity : 0;
new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0;
if (parity) {
new.data_type = BCH_DATA_parity;
new.dirty_sectors = le16_to_cpu(s->sectors);
}
if (journal_seq) {
@ -966,10 +973,10 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
}
}));
if (!enabled)
g->ec_redundancy = 0;
g->stripe = k.k->p.offset;
g->stripe_redundancy = s->nr_redundant;
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
return 0;
}
@ -1036,7 +1043,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
old.v.counter,
new.v.counter)) != old.v.counter);
bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
BUG_ON(!gc && bucket_became_unavailable(old, new));
@ -1163,6 +1170,8 @@ static int bch2_mark_stripe(struct bch_fs *c,
unsigned i;
int ret;
BUG_ON(gc && old_s);
if (!m || (old_s && !m->alive)) {
bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
idx);
@ -1170,48 +1179,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
}
if (!new_s) {
/* Deleting: */
for (i = 0; i < old_s->nr_blocks; i++) {
ret = bucket_set_stripe(c, old, i, fs_usage,
journal_seq, flags, false);
if (ret)
return ret;
}
if (!gc && m->on_heap) {
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_del(c, m, idx);
spin_unlock(&c->ec_stripes_heap_lock);
}
if (gc)
update_replicas(c, fs_usage, &m->r.e,
-((s64) m->sectors * m->nr_redundant));
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_del(c, m, idx);
spin_unlock(&c->ec_stripes_heap_lock);
memset(m, 0, sizeof(*m));
} else {
BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks);
BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant);
for (i = 0; i < new_s->nr_blocks; i++) {
if (!old_s ||
memcmp(new_s->ptrs + i,
old_s->ptrs + i,
sizeof(struct bch_extent_ptr))) {
if (old_s) {
bucket_set_stripe(c, old, i, fs_usage,
journal_seq, flags, false);
if (ret)
return ret;
}
ret = bucket_set_stripe(c, new, i, fs_usage,
journal_seq, flags, true);
if (ret)
return ret;
}
}
m->alive = true;
m->sectors = le16_to_cpu(new_s->sectors);
m->algorithm = new_s->algorithm;
@ -1220,27 +1193,13 @@ static int bch2_mark_stripe(struct bch_fs *c,
m->blocks_nonempty = 0;
for (i = 0; i < new_s->nr_blocks; i++) {
unsigned s = stripe_blockcount_get(new_s, i);
/*
* gc recalculates this field from stripe ptr
* references:
*/
if (!gc)
m->block_sectors[i] = s;
m->blocks_nonempty += !!s;
m->block_sectors[i] =
stripe_blockcount_get(new_s, i);
m->blocks_nonempty += !!m->block_sectors[i];
}
if (gc && old_s)
update_replicas(c, fs_usage, &m->r.e,
-((s64) m->sectors * m->nr_redundant));
bch2_bkey_to_replicas(&m->r.e, new);
if (gc)
update_replicas(c, fs_usage, &m->r.e,
((s64) m->sectors * m->nr_redundant));
if (!gc) {
spin_lock(&c->ec_stripes_heap_lock);
bch2_stripes_heap_update(c, m, idx);
@ -1248,6 +1207,25 @@ static int bch2_mark_stripe(struct bch_fs *c,
}
}
if (gc) {
/*
* gc recalculates this field from stripe ptr
* references:
*/
memset(m->block_sectors, 0, sizeof(m->block_sectors));
m->blocks_nonempty = 0;
for (i = 0; i < new_s->nr_blocks; i++) {
ret = mark_stripe_bucket(c, new, i, fs_usage,
journal_seq, flags);
if (ret)
return ret;
}
update_replicas(c, fs_usage, &m->r.e,
((s64) m->sectors * m->nr_redundant));
}
return 0;
}
@ -1271,6 +1249,7 @@ static int bch2_mark_key_locked(struct bch_fs *c,
switch (k.k->type) {
case KEY_TYPE_alloc:
case KEY_TYPE_alloc_v2:
ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags);
break;
case KEY_TYPE_btree_ptr:
@ -1539,9 +1518,10 @@ static int trans_get_key(struct btree_trans *trans,
return ret;
}
static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
const struct bch_extent_ptr *ptr,
struct bkey_alloc_unpacked *u)
static struct bkey_alloc_buf *
bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
const struct bch_extent_ptr *ptr,
struct bkey_alloc_unpacked *u)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
@ -1549,8 +1529,13 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
struct bucket *g;
struct btree_iter *iter;
struct bkey_s_c k;
struct bkey_alloc_buf *a;
int ret;
a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
if (IS_ERR(a))
return a;
iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k);
if (iter) {
*u = bch2_alloc_unpack(k);
@ -1562,17 +1547,17 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
ret = bch2_btree_iter_traverse(iter);
if (ret) {
bch2_trans_iter_put(trans, iter);
return ret;
return ERR_PTR(ret);
}
percpu_down_read(&c->mark_lock);
g = bucket(ca, pos.offset);
*u = alloc_mem_to_key(g, READ_ONCE(g->mark));
*u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
percpu_up_read(&c->mark_lock);
}
*_iter = iter;
return 0;
return a;
}
static int bch2_trans_mark_pointer(struct btree_trans *trans,
@ -1582,27 +1567,20 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
struct bkey_alloc_buf *a;
int ret;
ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
if (ret)
return ret;
a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
if (IS_ERR(a))
return PTR_ERR(a);
ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
&u.dirty_sectors, &u.cached_sectors);
if (ret)
goto out;
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto out;
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
bch2_alloc_pack(c, a, u);
bch2_trans_update(trans, iter, &a->k, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;
@ -1713,34 +1691,51 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
}
static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
const struct bch_extent_ptr *ptr,
s64 sectors, bool parity)
struct bkey_s_c_stripe s,
unsigned idx, bool deleting)
{
struct bkey_i_alloc *a;
struct bch_fs *c = trans->c;
const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
struct bkey_alloc_buf *a;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
int ret;
bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
int ret = 0;
ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
if (ret)
return ret;
a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
if (IS_ERR(a))
return PTR_ERR(a);
if (parity) {
s64 sectors = le16_to_cpu(s.v->sectors);
if (deleting)
sectors = -sectors;
u.dirty_sectors += sectors;
u.data_type = u.dirty_sectors
? BCH_DATA_parity
: 0;
}
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
goto err;
if (!deleting) {
if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
"bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
iter->pos.inode, iter->pos.offset, u.gen,
u.stripe, s.k->p.offset)) {
ret = -EIO;
goto err;
}
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
u.stripe = s.k->p.offset;
u.stripe_redundancy = s.v->nr_redundant;
} else {
u.stripe = 0;
u.stripe_redundancy = 0;
}
bch2_alloc_pack(c, a, u);
bch2_trans_update(trans, iter, &a->k, 0);
err:
bch2_trans_iter_put(trans, iter);
return ret;
@ -1750,51 +1745,50 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
{
const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(old).v : NULL;
const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
? bkey_s_c_to_stripe(new).v : NULL;
struct bkey_s_c_stripe old_s = { NULL };
struct bkey_s_c_stripe new_s = { NULL };
struct bch_replicas_padded r;
unsigned i;
int ret = 0;
if (old.k->type == KEY_TYPE_stripe)
old_s = bkey_s_c_to_stripe(old);
if (new.k->type == KEY_TYPE_stripe)
new_s = bkey_s_c_to_stripe(new);
/*
* If the pointers aren't changing, we don't need to do anything:
*/
if (new_s && old_s &&
!memcmp(old_s->ptrs, new_s->ptrs,
new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
if (new_s.k && old_s.k &&
new_s.v->nr_blocks == old_s.v->nr_blocks &&
new_s.v->nr_redundant == old_s.v->nr_redundant &&
!memcmp(old_s.v->ptrs, new_s.v->ptrs,
new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
return 0;
if (new_s) {
unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant;
s64 sectors = le16_to_cpu(new_s->sectors);
if (new_s.k) {
s64 sectors = le16_to_cpu(new_s.v->sectors);
bch2_bkey_to_replicas(&r.e, new);
update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
for (i = 0; i < new_s->nr_blocks; i++) {
bool parity = i >= nr_data;
ret = bch2_trans_mark_stripe_alloc_ref(trans,
&new_s->ptrs[i], sectors, parity);
for (i = 0; i < new_s.v->nr_blocks; i++) {
ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
i, false);
if (ret)
return ret;
}
}
if (old_s) {
unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant;
s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
if (old_s.k) {
s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
bch2_bkey_to_replicas(&r.e, old);
update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
for (i = 0; i < old_s->nr_blocks; i++) {
bool parity = i >= nr_data;
ret = bch2_trans_mark_stripe_alloc_ref(trans,
&old_s->ptrs[i], sectors, parity);
for (i = 0; i < old_s.v->nr_blocks; i++) {
ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
i, true);
if (ret)
return ret;
}
@ -2065,21 +2059,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter *iter;
struct bkey_alloc_unpacked u;
struct bkey_i_alloc *a;
struct bkey_alloc_buf *a;
struct bch_extent_ptr ptr = {
.dev = ca->dev_idx,
.offset = bucket_to_sector(ca, b),
};
int ret = 0;
a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
ret = PTR_ERR_OR_ZERO(a);
if (ret)
return ret;
ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
if (ret)
return ret;
a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
if (IS_ERR(a))
return PTR_ERR(a);
if (u.data_type && u.data_type != type) {
bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
@ -2112,10 +2101,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
u.data_type = type;
u.dirty_sectors = sectors;
bkey_alloc_init(&a->k_i);
a->k.p = iter->pos;
bch2_alloc_pack(a, u);
bch2_trans_update(trans, iter, &a->k_i, 0);
bch2_alloc_pack(c, a, u);
bch2_trans_update(trans, iter, &a->k, 0);
out:
bch2_trans_iter_put(trans, iter);
return ret;
@ -2422,13 +2409,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
free_percpu(ca->usage[0]);
for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
free_percpu(ca->usage[i]);
kfree(ca->usage_base);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
unsigned i;
ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
if (!ca->usage_base)
return -ENOMEM;
for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
ca->usage[i] = alloc_percpu(struct bch_dev_usage);
if (!ca->usage[i])
return -ENOMEM;
}
return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
}
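
With this commit a device's usage counters follow the same scheme as filesystem usage: ca->usage_base holds accumulated totals, ca->usage[] holds one set of percpu deltas per journal buffer, and bch2_dev_usage_read() above folds the deltas into a copy of the base under c->usage_lock's seqcount while bch2_fs_usage_acc_to_base() flushes them. A small standalone model of that read pattern (plain arrays stand in for the percpu buffers; the names are illustrative, not bcachefs API):

#include <stdint.h>
#include <string.h>

#define JOURNAL_BUF_NR	2
#define NR_COUNTERS	8

struct dev_usage_model {
	unsigned seq;					/* even = stable, odd = writer active */
	uint64_t base[NR_COUNTERS];			/* accumulated totals */
	uint64_t delta[JOURNAL_BUF_NR][NR_COUNTERS];	/* stands in for percpu bufs */
};

static void usage_read(const struct dev_usage_model *u, uint64_t ret[NR_COUNTERS])
{
	unsigned seq;

	do {
		seq = u->seq;				/* read_seqcount_begin() */
		memcpy(ret, u->base, sizeof(u->base));
		for (unsigned i = 0; i < JOURNAL_BUF_NR; i++)
			for (unsigned j = 0; j < NR_COUNTERS; j++)
				ret[j] += u->delta[i][j];
	} while ((seq & 1) || seq != u->seq);		/* read_seqcount_retry() */
}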

View File

@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
return __bucket(ca, b, false);
}
static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
{
return c->bucket_clock[rw].hand - g->io_time[rw];
}
/*
* bucket_gc_gen() returns the difference between the bucket's current gen and
* the oldest gen of any pointer into that bucket in the btree.
*/
static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
static inline u8 bucket_gc_gen(struct bucket *g)
{
struct bucket *g = bucket(ca, b);
return g->mark.gen - g->oldest_gen;
}
@ -169,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
void bch2_dev_usage_from_buckets(struct bch_fs *);
static inline u64 __dev_buckets_available(struct bch_dev *ca,
struct bch_dev_usage stats)
{
@ -214,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c)
READ_ONCE(c->replicas.nr);
}
static inline unsigned dev_usage_u64s(void)
{
return sizeof(struct bch_dev_usage) / sizeof(u64);
}
void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *);
struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *);

View File

@ -37,11 +37,12 @@ struct bucket {
const struct bucket_mark mark;
};
u16 io_time[2];
u64 io_time[2];
u8 oldest_gen;
u8 gc_gen;
unsigned gen_valid:1;
u8 ec_redundancy;
u8 stripe_redundancy;
u32 stripe;
};
struct bucket_array {

View File

@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
spin_lock(&clock->timer_lock);
if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
if (time_after_eq((unsigned long) atomic64_read(&clock->now),
timer->expire)) {
spin_unlock(&clock->timer_lock);
timer->fn(timer);
@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
{
struct io_timer *timer;
unsigned long now = atomic_long_add_return(sectors, &clock->now);
unsigned long now = atomic64_add_return(sectors, &clock->now);
while ((timer = get_expired_timer(clock, now)))
timer->fn(timer);
@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
unsigned i;
spin_lock(&clock->timer_lock);
now = atomic_long_read(&clock->now);
now = atomic64_read(&clock->now);
for (i = 0; i < clock->timers.used; i++)
pr_buf(out, "%ps:\t%li\n",
@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock)
int bch2_io_clock_init(struct io_clock *clock)
{
atomic_long_set(&clock->now, 0);
atomic64_set(&clock->now, 0);
spin_lock_init(&clock->timer_lock);
clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();

View File

@ -26,7 +26,7 @@ struct io_timer {
typedef HEAP(struct io_timer *) io_timer_heap;
struct io_clock {
atomic_long_t now;
atomic64_t now;
u16 __percpu *pcpu_buf;
unsigned max_slop;

View File

@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
if (!bkey_cmp(k.k->p, POS_MIN))
return "stripe at pos 0";
if (k.k->p.inode)
return "invalid stripe key";
@ -279,10 +282,14 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
struct bch_csum got = ec_block_checksum(buf, i, offset);
if (bch2_crc_cmp(want, got)) {
char buf2[200];
bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
bch_err_ratelimited(c,
"stripe checksum error at %u:%u: csum type %u, expected %llx got %llx",
i, j, v->csum_type,
want.lo, got.lo);
"stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
(void *) _RET_IP_, i, j, v->csum_type,
want.lo, got.lo, buf2);
clear_bit(i, buf->valid);
break;
}
@ -335,6 +342,8 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
static void ec_block_endio(struct bio *bio)
{
struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
struct bch_stripe *v = &ec_bio->buf->key.v;
struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
struct bch_dev *ca = ec_bio->ca;
struct closure *cl = bio->bi_private;
@ -343,6 +352,13 @@ static void ec_block_endio(struct bio *bio)
bch2_blk_status_to_str(bio->bi_status)))
clear_bit(ec_bio->idx, ec_bio->buf->valid);
if (ptr_stale(ca, ptr)) {
bch_err_ratelimited(ca->fs,
"error %s stripe: stale pointer after io",
bio_data_dir(bio) == READ ? "reading from" : "writing to");
clear_bit(ec_bio->idx, ec_bio->buf->valid);
}
bio_put(&ec_bio->bio);
percpu_ref_put(&ca->io_ref);
closure_put(cl);
@ -652,7 +668,6 @@ void bch2_stripes_heap_update(struct bch_fs *c,
static int ec_stripe_delete(struct bch_fs *c, size_t idx)
{
//pr_info("deleting stripe %zu", idx);
return bch2_btree_delete_range(c, BTREE_ID_EC,
POS(0, idx),
POS(0, idx + 1),
@ -795,6 +810,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e,
*dst = (struct bch_extent_stripe_ptr) {
.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
.block = block,
.redundancy = s->key.v.nr_redundant,
.idx = s->key.k.p.offset,
};
}
@ -1054,8 +1070,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
if (!ob)
return;
//pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset);
ec = ob->ec;
mutex_lock(&ec->lock);
@ -1348,12 +1362,14 @@ static s64 get_existing_stripe(struct bch_fs *c,
struct stripe *m;
size_t heap_idx;
u64 stripe_idx;
s64 ret = -1;
if (may_create_new_stripe(c))
return -1;
spin_lock(&c->ec_stripes_heap_lock);
for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
/* No blocks worth reusing, stripe will just be deleted: */
if (!h->data[heap_idx].blocks_nonempty)
continue;
@ -1365,13 +1381,12 @@ static s64 get_existing_stripe(struct bch_fs *c,
m->sectors == head->blocksize &&
m->blocks_nonempty < m->nr_blocks - m->nr_redundant) {
bch2_stripes_heap_del(c, m, stripe_idx);
spin_unlock(&c->ec_stripes_heap_lock);
return stripe_idx;
ret = stripe_idx;
break;
}
}
spin_unlock(&c->ec_stripes_heap_lock);
return -1;
return ret;
}
struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,

View File

@ -704,14 +704,8 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
if (p.ptr.cached)
continue;
if (p.has_ec) {
struct stripe *s =
genradix_ptr(&c->stripes[0], p.ec.idx);
WARN_ON(!s);
if (s)
replicas += s->nr_redundant;
}
if (p.has_ec)
replicas += p.ec.redundancy;
replicas++;
@ -734,16 +728,9 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
durability = max_t(unsigned, durability, ca->mi.durability);
if (p.has_ec) {
struct stripe *s =
genradix_ptr(&c->stripes[0], p.ec.idx);
if (p.has_ec)
durability += p.ec.redundancy;
if (WARN_ON(!s))
goto out;
durability += s->nr_redundant;
}
out:
return durability;
}

View File

@ -1121,6 +1121,9 @@ int bch2_fs_journal_init(struct journal *j)
j->entry_u64s_reserved +=
BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX);
j->entry_u64s_reserved +=
2 * (sizeof(struct jset_entry_clock) / sizeof(u64));
atomic64_set(&j->reservations.counter,
((union journal_res_state)
{ .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);

View File

@ -5,6 +5,7 @@
#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
#include "error.h"
#include "io.h"
#include "journal.h"
@ -426,6 +427,69 @@ fsck_err:
return ret;
}
static int journal_entry_validate_clock(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
int write)
{
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
int ret = 0;
if (journal_entry_err_on(bytes != sizeof(*clock),
c, "invalid journal entry clock: bad size")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(clock->rw > 1,
c, "invalid journal entry clock: bad rw")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
fsck_err:
return ret;
}
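/*
 * Hedged sketch of the layout journal_entry_validate_clock() above is
 * checking, reconstructed from the fields it references (rw, the size check,
 * and the 64-bit time written by bch2_journal_super_entries_add_common()).
 * Field order and padding here are assumptions, not copied from
 * bcachefs_format.h.
 */
#include <stdint.h>

struct example_jset_entry_clock {
	uint8_t		header[8];	/* struct jset_entry header (u64s, btree_id, level, type, pad[3]) */
	uint8_t		rw;		/* 0 = READ clock, 1 = WRITE clock */
	uint8_t		pad[7];
	uint64_t	time;		/* io clock hand, little endian on disk */
};

/* the validator requires exactly one of these (3 u64s) and rw <= 1 */
_Static_assert(sizeof(struct example_jset_entry_clock) == 24, "3 u64s total");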
static int journal_entry_validate_dev_usage(struct bch_fs *c,
struct jset *jset,
struct jset_entry *entry,
int write)
{
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
unsigned dev;
int ret = 0;
if (journal_entry_err_on(bytes < expected,
c, "invalid journal entry dev usage: bad size (%u < %u)",
bytes, expected)) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
dev = le32_to_cpu(u->dev);
if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
c, "invalid journal entry dev usage: bad dev")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
if (journal_entry_err_on(u->pad,
c, "invalid journal entry dev usage: bad pad")) {
journal_entry_null_range(entry, vstruct_next(entry));
return ret;
}
fsck_err:
return ret;
}
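/*
 * Hedged sketch of the dev usage entry that journal_entry_validate_dev_usage()
 * above is checking, reconstructed from the fields used here and in journal
 * replay (dev, pad, buckets_ec, buckets_unavailable,
 * d[i].{buckets,sectors,fragmented}).  Exact layout is an assumption, not a
 * copy of the real struct.
 */
#include <stdint.h>

struct example_dev_usage_type {
	uint64_t	buckets;
	uint64_t	sectors;
	uint64_t	fragmented;
};

struct example_jset_entry_dev_usage {
	uint8_t		header[8];	/* struct jset_entry header */
	uint32_t	dev;		/* member device index, must exist */
	uint32_t	pad;		/* must be zero */
	uint64_t	buckets_ec;
	uint64_t	buckets_unavailable;
	struct example_dev_usage_type d[];	/* one slot per BCH_DATA_* type */
};

/*
 * "expected" above is the fixed part plus 7 per-type slots (BCH_DATA_NR at
 * the time of this commit); larger entries are accepted so new data types can
 * be appended without breaking older readers.
 */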
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, int);
@ -937,6 +1001,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
bch2_replicas_entry_sort(&replicas.e);
/*
* If we're mounting in degraded mode - if we didn't read all
* the devices - this is wrong:
@ -1032,16 +1098,20 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
unsigned sectors)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_devs_mask devs;
struct journal_device *ja;
struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
unsigned target = c->opts.metadata_target ?:
c->opts.foreground_target;
unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
rcu_read_lock();
retry:
devs = target_rw_devs(c, BCH_DATA_journal, target);
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
&c->rw_devs[BCH_DATA_journal]);
devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
@ -1073,6 +1143,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
if (replicas < replicas_want && target) {
/* Retry from all devices: */
target = 0;
goto retry;
}
done:
rcu_read_unlock();
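/*
 * Hedged sketch (illustrative names only) of the write-target policy added
 * above: journal writes now prefer devices in metadata_target, falling back
 * to foreground_target, and finally retry with no target at all when the
 * restricted set cannot provide enough replicas -- the same shape as the
 * retry: label and the "if (replicas < replicas_want && target)" block.
 */
static int example_alloc_with_fallback(unsigned metadata_target,
				       unsigned foreground_target,
				       unsigned replicas_want,
				       /* hypothetical allocator: adds replicas from a target, 0 == any rw device */
				       unsigned (*alloc_from)(unsigned target, unsigned have, unsigned want))
{
	unsigned target = metadata_target ? metadata_target : foreground_target;
	unsigned replicas = 0;

	/* first pass: restricted to the configured target, if any */
	replicas = alloc_from(target, replicas, replicas_want);

	/* second pass: drop the restriction, like "target = 0; goto retry" */
	if (replicas < replicas_want && target)
		replicas = alloc_from(0, replicas, replicas_want);

	return replicas >= replicas_want ? 0 : -1;
}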
@ -1278,6 +1354,9 @@ static void do_journal_write(struct closure *cl)
bio->bi_private = ca;
bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
ca->prev_journal_sector = bio->bi_iter.bi_sector;
if (!JSET_NO_FLUSH(w->data))
bio->bi_opf |= REQ_FUA;
if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
@ -1348,8 +1427,8 @@ void bch2_journal_write(struct closure *cl)
end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
end = bch2_journal_super_entries_add_common(c, end,
le64_to_cpu(jset->seq));
bch2_journal_super_entries_add_common(c, &end,
le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@ -1358,10 +1437,7 @@ void bch2_journal_write(struct closure *cl)
journal_write_compact(jset);
jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version);

View File

@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
data_opts->rewrite_dev = p.ptr.dev;
if (p.has_ec) {
struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx);
data_opts->nr_replicas += m->nr_redundant;
}
if (p.has_ec)
data_opts->nr_replicas += p.ec.redundancy;
return DATA_REWRITE;
}
@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c)
bucket_sectors_used(m) >= ca->mi.bucket_size)
continue;
WARN_ON(m.stripe && !g->ec_redundancy);
WARN_ON(m.stripe && !g->stripe_redundancy);
e = (struct copygc_heap_entry) {
.dev = dev_idx,
.gen = m.gen,
.replicas = 1 + g->ec_redundancy,
.replicas = 1 + g->stripe_redundancy,
.fragmentation = bucket_sectors_used(m) * (1U << 15)
/ ca->mi.bucket_size,
.sectors = bucket_sectors_used(m),
@ -301,7 +298,7 @@ static int bch2_copygc_thread(void *arg)
{
struct bch_fs *c = arg;
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last, wait;
u64 last, wait;
set_freezable();
@ -309,7 +306,7 @@ static int bch2_copygc_thread(void *arg)
if (kthread_wait_freezable(c->copy_gc_enabled))
break;
last = atomic_long_read(&clock->now);
last = atomic64_read(&clock->now);
wait = bch2_copygc_wait_amount(c);
if (wait > clock->max_slop) {

View File

@ -136,6 +136,11 @@ enum opt_type {
OPT_STR(bch2_str_hash_types), \
BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \
NULL, "Hash function for directory entries and xattrs")\
x(metadata_target, u16, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
OPT_FN(bch2_opt_target), \
BCH_SB_METADATA_TARGET, 0, \
"(target)", "Device or disk group for metadata writes") \
x(foreground_target, u16, \
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \
OPT_FN(bch2_opt_target), \

View File

@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg)
unsigned long start, prev_start;
unsigned long prev_run_time, prev_run_cputime;
unsigned long cputime, prev_cputime;
unsigned long io_start;
u64 io_start;
long throttle;
set_freezable();
io_start = atomic_long_read(&clock->now);
io_start = atomic64_read(&clock->now);
p = rebalance_work(c);
prev_start = jiffies;
prev_cputime = curr_cputime();
@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg)
(20 - w.dev_most_full_percent),
50);
if (atomic_long_read(&clock->now) + clock->max_slop <
if (atomic64_read(&clock->now) + clock->max_slop <
r->throttled_until_iotime) {
r->throttled_until_cputime = start + throttle;
r->state = REBALANCE_THROTTLED;
@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg)
max(p.dev_most_full_percent, 1U) /
max(w.dev_most_full_percent, 1U));
io_start = atomic_long_read(&clock->now);
io_start = atomic64_read(&clock->now);
p = w;
prev_start = start;
prev_cputime = cputime;
@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
case REBALANCE_THROTTLED:
bch2_hprint(&PBUF(h1),
(r->throttled_until_iotime -
atomic_long_read(&c->io_clock[WRITE].now)) << 9);
atomic64_read(&c->io_clock[WRITE].now)) << 9);
pr_buf(out, "throttled for %lu sec or %s io\n",
(r->throttled_until_cputime - jiffies) / HZ,
h1);

View File

@ -17,7 +17,7 @@ struct bch_fs_rebalance {
atomic64_t work_unknown_dev;
enum rebalance_state state;
unsigned long throttled_until_iotime;
u64 throttled_until_iotime;
unsigned long throttled_until_cputime;
struct bch_move_stats move_stats;

View File

@ -825,10 +825,31 @@ static int journal_replay_entry_early(struct bch_fs *c,
case BCH_JSET_ENTRY_data_usage: {
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
ret = bch2_replicas_set_usage(c, &u->r,
le64_to_cpu(u->v));
break;
}
case BCH_JSET_ENTRY_dev_usage: {
struct jset_entry_dev_usage *u =
container_of(entry, struct jset_entry_dev_usage, entry);
struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
sizeof(struct jset_entry_dev_usage_type);
unsigned i;
ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
ca->usage_base->buckets_unavailable = le64_to_cpu(u->buckets_unavailable);
for (i = 0; i < nr_types; i++) {
ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
}
break;
}
case BCH_JSET_ENTRY_blacklist: {
struct jset_entry_blacklist *bl_entry =
container_of(entry, struct jset_entry_blacklist, entry);
@ -847,6 +868,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(bl_entry->end) + 1);
break;
}
case BCH_JSET_ENTRY_clock: {
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
}
}
return ret;
@ -861,9 +888,6 @@ static int journal_replay_early(struct bch_fs *c,
int ret;
if (clean) {
c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
for (entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
@ -876,9 +900,6 @@ static int journal_replay_early(struct bch_fs *c,
if (i->ignore)
continue;
c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
vstruct_for_each(&i->j, entry) {
ret = journal_replay_entry_early(c, entry);
if (ret)
@ -942,13 +963,6 @@ static int verify_superblock_clean(struct bch_fs *c,
return 0;
}
mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
"superblock read clock %u doesn't match journal %u after clean shutdown",
clean->read_clock, j->read_clock);
mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
"superblock write clock %u doesn't match journal %u after clean shutdown",
clean->write_clock, j->write_clock);
for (i = 0; i < BTREE_ID_NR; i++) {
char buf1[200], buf2[200];
struct bkey_i *k1, *k2;

View File

@ -26,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e)
#endif
}
static void replicas_entry_sort(struct bch_replicas_entry *e)
void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
break;
}
replicas_entry_sort(e);
bch2_replicas_entry_sort(e);
}
void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
@ -142,7 +142,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
for (i = 0; i < devs.nr; i++)
e->devs[e->nr_devs++] = devs.devs[i];
replicas_entry_sort(e);
bch2_replicas_entry_sort(e);
}
static struct bch_replicas_cpu
@ -197,7 +197,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
int bch2_replicas_entry_idx(struct bch_fs *c,
struct bch_replicas_entry *search)
{
replicas_entry_sort(search);
bch2_replicas_entry_sort(search);
return __replicas_entry_idx(&c->replicas, search);
}
@ -681,7 +681,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
for_each_replicas_entry(sb_r, e) {
dst = cpu_replicas_entry(cpu_r, idx++);
memcpy(dst, e, replicas_entry_bytes(e));
replicas_entry_sort(dst);
bch2_replicas_entry_sort(dst);
}
return 0;
@ -718,7 +718,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
dst->nr_devs = e->nr_devs;
dst->nr_required = 1;
memcpy(dst->devs, e->devs, e->nr_devs);
replicas_entry_sort(dst);
bch2_replicas_entry_sort(dst);
}
return 0;

View File

@ -5,6 +5,7 @@
#include "eytzinger.h"
#include "replicas_types.h"
void bch2_replicas_entry_sort(struct bch_replicas_entry *);
void bch2_replicas_entry_to_text(struct printbuf *,
struct bch_replicas_entry *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);

View File

@ -963,31 +963,28 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
return ret;
}
static void
entry_init_u64s(struct jset_entry *entry, unsigned u64s)
static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
{
memset(entry, 0, u64s * sizeof(u64));
struct jset_entry *entry = *end;
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
memset(entry, 0, u64s * sizeof(u64));
/*
* The u64s field counts from the start of data, ignoring the shared
* fields.
*/
entry->u64s = u64s - 1;
*end = vstruct_next(*end);
return entry;
}
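/*
 * Hedged worked example of the u64s math in jset_entry_init() above: the
 * entry's u64s field counts only the payload, not the 8-byte jset_entry
 * header, so a 24-byte clock entry (8 header + 16 payload) rounds up to
 * 3 u64s and is stored as u64s = 2.
 */
#include <assert.h>
#include <stdint.h>

#define EXAMPLE_DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	size_t size = 24;	/* e.g. a clock entry */
	unsigned u64s = EXAMPLE_DIV_ROUND_UP(size, sizeof(uint64_t));

	assert(u64s == 3);
	assert(u64s - 1 == 2);	/* value stored in entry->u64s */
	return 0;
}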
static void
entry_init_size(struct jset_entry *entry, size_t size)
void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
{
unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
entry_init_u64s(entry, u64s);
}
struct jset_entry *
bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry *entry,
u64 journal_seq)
{
unsigned i;
struct bch_dev *ca;
unsigned i, dev;
percpu_down_write(&c->mark_lock);
@ -1000,58 +997,77 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
{
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_INODES;
u->v = cpu_to_le64(c->usage_base->nr_inodes);
entry = vstruct_next(entry);
}
{
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_KEY_VERSION;
u->v = cpu_to_le64(atomic64_read(&c->key_version));
entry = vstruct_next(entry);
}
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
entry_init_size(entry, sizeof(*u));
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = FS_USAGE_RESERVED;
u->entry.level = i;
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
entry = vstruct_next(entry);
}
for (i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
struct jset_entry_data_usage, entry);
entry_init_size(entry, sizeof(*u) + e->nr_devs);
u->entry.type = BCH_JSET_ENTRY_data_usage;
u->v = cpu_to_le64(c->usage_base->replicas[i]);
memcpy(&u->r, e, replicas_entry_bytes(e));
}
entry = vstruct_next(entry);
for_each_member_device(ca, c, dev) {
unsigned b = sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
struct jset_entry_dev_usage *u =
container_of(jset_entry_init(end, b),
struct jset_entry_dev_usage, entry);
u->entry.type = BCH_JSET_ENTRY_dev_usage;
u->dev = cpu_to_le32(dev);
u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
u->buckets_unavailable = cpu_to_le64(ca->usage_base->buckets_unavailable);
for (i = 0; i < BCH_DATA_NR; i++) {
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
}
}
percpu_up_write(&c->mark_lock);
return entry;
for (i = 0; i < 2; i++) {
struct jset_entry_clock *clock =
container_of(jset_entry_init(end, sizeof(*clock)),
struct jset_entry_clock, entry);
clock->entry.type = BCH_JSET_ENTRY_clock;
clock->rw = i;
clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
}
}
void bch2_fs_mark_clean(struct bch_fs *c)
@ -1080,15 +1096,13 @@ void bch2_fs_mark_clean(struct bch_fs *c)
}
sb_clean->flags = 0;
sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
/* Trying to catch outstanding bug: */
BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
entry = sb_clean->start;
entry = bch2_journal_super_entries_add_common(c, entry, 0);
bch2_journal_super_entries_add_common(c, &entry, 0);
entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
BUG_ON((void *) entry > vstruct_end(&sb_clean->field));

View File

@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
/* BCH_SB_FIELD_clean: */
struct jset_entry *
bch2_journal_super_entries_add_common(struct bch_fs *,
struct jset_entry *, u64);
void bch2_journal_super_entries_add_common(struct bch_fs *,
struct jset_entry **, u64);
void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);

View File

@ -148,6 +148,22 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
return c;
}
static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
{
struct bch_dev *ca;
unsigned i, nr = 0, u64s =
(sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR) /
sizeof(u64);
rcu_read_lock();
for_each_member_device_rcu(ca, c, i, NULL)
nr++;
rcu_read_unlock();
bch2_journal_entry_res_resize(&c->journal,
&c->dev_usage_journal_res, u64s * nr);
}
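/*
 * Hedged arithmetic behind the reservation above, using the sizes assumed in
 * the earlier dev-usage sketch (32 fixed bytes plus 24 bytes per data type,
 * BCH_DATA_NR == 7 at this point): 32 + 7 * 24 = 200 bytes = 25 u64s per
 * device, multiplied by the number of member devices.  Real sizes come from
 * bcachefs_format.h; this is only a worked example.
 */
#include <stdint.h>

static unsigned example_dev_usage_reserve_u64s(unsigned nr_devs)
{
	unsigned fixed = 32, per_type = 24, nr_types = 7;
	unsigned bytes_per_dev = fixed + per_type * nr_types;	/* 200 */

	return (bytes_per_dev / sizeof(uint64_t)) * nr_devs;	/* 25 * nr_devs */
}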
/* Filesystem RO/RW: */
/*
@ -174,9 +190,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
bch2_copygc_stop(c);
bch2_gc_thread_stop(c);
bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
/*
* Flush journal before stopping allocators, because flushing journal
* blacklist entries involves allocating new btree nodes:
@ -399,9 +412,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
for_each_rw_member(ca, c, i) {
ret = bch2_dev_allocator_start(ca);
if (ret) {
@ -779,6 +789,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
bch2_dev_usage_journal_reserve(c);
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
@ -1521,6 +1533,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
mutex_unlock(&c->sb_lock);
up_write(&c->state_lock);
bch2_dev_usage_journal_reserve(c);
return 0;
err:
if (ca->mi.state == BCH_MEMBER_STATE_RW &&
@ -1530,19 +1544,6 @@ err:
return ret;
}
static void dev_usage_clear(struct bch_dev *ca)
{
struct bucket_array *buckets;
percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
up_read(&ca->bucket_lock);
}
/* Add new device to running filesystem: */
int bch2_dev_add(struct bch_fs *c, const char *path)
{
@ -1600,8 +1601,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (ret)
goto err;
dev_usage_clear(ca);
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
@ -1655,6 +1654,8 @@ have_slot:
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
bch2_dev_usage_journal_reserve(c);
err = "error marking superblock";
ret = bch2_trans_mark_dev_sb(c, NULL, ca);
if (ret)

View File

@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
{
int rw = (private ? 1 : 0);
return bucket_last_io(c, bucket(ca, b), rw);
return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
}
static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
return bucket_gc_gen(ca, b);
return bucket_gc_gen(bucket(ca, b));
}
static int unsigned_cmp(const void *_l, const void *_r)