Update bcachefs sources to d5225e0d46f8 fixup! bcachefs: bcachefs_metadata_version_rebalance_v2

Kent Overstreet 2025-11-01 13:03:07 -04:00
parent d64a69c6ee
commit 6fbb4b40d1
56 changed files with 2494 additions and 853 deletions

View File

@ -1 +1 @@
5fe20ac58af402e8ad9ace0bcf9daad524e3005d
d5225e0d46f8ddaf6e74e91b3683de839b2c0d04

View File

@ -250,7 +250,9 @@ fsck_err:
return ret;
}
void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_pos *k)
void bch2_accounting_key_to_text(struct printbuf *out,
struct bch_fs *c,
struct disk_accounting_pos *k)
{
if (k->type >= BCH_DISK_ACCOUNTING_TYPE_NR) {
prt_printf(out, "unknown type %u", k->type);
@ -283,6 +285,17 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po
prt_str(out, "btree=");
bch2_btree_id_to_text(out, k->btree.id);
break;
case BCH_DISK_ACCOUNTING_rebalance_work_v2:
bch2_prt_rebalance_accounting_type(out, k->rebalance_work_v2.type);
break;
case BCH_DISK_ACCOUNTING_dev_leaving: {
struct bch_dev *ca = c ? bch2_dev_rcu_noerror(c, k->dev_leaving.dev) : NULL;
if (ca)
prt_printf(out, "%s ", ca->name);
else
prt_printf(out, "%u ", k->dev_leaving.dev);
break;
}
}
}
@ -292,7 +305,7 @@ void bch2_accounting_to_text(struct printbuf *out, struct bch_fs *c, struct bkey
struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, k.k->p);
bch2_accounting_key_to_text(out, &acc_k);
bch2_accounting_key_to_text(out, c, &acc_k);
for (unsigned i = 0; i < bch2_accounting_counters(k.k); i++)
prt_printf(out, " %lli", acc.v->d[i]);
@ -607,7 +620,7 @@ int bch2_gc_accounting_done(struct bch_fs *c)
if (memcmp(dst_v, src_v, nr * sizeof(u64))) {
printbuf_reset(&buf);
prt_str(&buf, "accounting mismatch for ");
bch2_accounting_key_to_text(&buf, &acc_k);
bch2_accounting_key_to_text(&buf, c, &acc_k);
prt_str(&buf, ":\n got");
for (unsigned j = 0; j < nr; j++)
@ -672,7 +685,7 @@ static int disk_accounting_invalid_dev(struct btree_trans *trans,
unsigned dev)
{
CLASS(printbuf, buf)();
bch2_accounting_key_to_text(&buf, acc);
bch2_accounting_key_to_text(&buf, trans->c, acc);
int ret = 0;
if (fsck_err(trans, accounting_to_invalid_device,
@ -719,7 +732,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
trans, accounting_replicas_not_marked,
"accounting not marked in superblock replicas\n%s",
(printbuf_reset(&buf),
bch2_accounting_key_to_text(&buf, acc),
bch2_accounting_key_to_text(&buf, c, acc),
buf.buf)))
try(bch2_mark_replicas(c, &r.e));
break;
@ -849,7 +862,7 @@ static int accounting_read_mem_fixups(struct btree_trans *trans)
bch2_log_msg_start(c, &underflow_err);
prt_printf(&underflow_err, "Accounting underflow for\n");
}
bch2_accounting_key_to_text(&underflow_err, &k);
bch2_accounting_key_to_text(&underflow_err, c, &k);
for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++)
prt_printf(&underflow_err, " %lli", v[j]);

View File

@ -124,7 +124,7 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool);
int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c,
struct bkey_validate_context);
void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *);
void bch2_accounting_key_to_text(struct printbuf *, struct bch_fs *, struct disk_accounting_pos *);
void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
void bch2_accounting_swab(struct bkey_s);

View File

@ -110,7 +110,9 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
x(snapshot, 5, 1) \
x(btree, 6, 3) \
x(rebalance_work, 7, 1) \
x(inum, 8, 3)
x(inum, 8, 3) \
x(rebalance_work_v2, 9, 1) \
x(dev_leaving, 10, 1)
enum disk_accounting_type {
#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr,
@ -210,6 +212,19 @@ struct bch_acct_inum {
struct bch_acct_rebalance_work {
};
struct bch_acct_rebalance_work_v2 {
__u8 type;
};
struct bch_acct_dev_leaving {
__u32 dev;
};
/*
* XXX: need per-device counters for "how much data are we going to move off of
* this device"
*/
struct disk_accounting_pos {
union {
struct {
@ -224,6 +239,8 @@ struct disk_accounting_pos {
struct bch_acct_btree btree;
struct bch_acct_rebalance_work rebalance_work;
struct bch_acct_inum inum;
struct bch_acct_rebalance_work_v2 rebalance_work_v2;
struct bch_acct_dev_leaving dev_leaving;
} __packed;
} __packed;
struct bpos _pad;
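A minimal usage sketch, not part of this diff: filling in a disk_accounting_pos for the new dev_leaving counter and printing it with the updated to_text helper, which now takes the filesystem pointer so it can resolve the device name. The variables c, buf and dev_idx are hypothetical.

	/* sketch: position for the new per-device "leaving" counter */
	struct disk_accounting_pos acc = {
		.type			= BCH_DISK_ACCOUNTING_dev_leaving,
		.dev_leaving.dev	= dev_idx,
	};

	/* new signature from this commit: needs c to print the device name */
	bch2_accounting_key_to_text(&buf, c, &acc);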

View File

@ -317,8 +317,7 @@ int bch2_check_fix_ptrs(struct btree_trans *trans,
if (do_update) {
struct bkey_i *new =
errptr_try(bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
sizeof(struct bch_extent_rebalance)));
errptr_try(bch2_trans_kmalloc(trans, BKEY_EXTENT_U64s_MAX * sizeof(u64)));
bkey_reassemble(new, k);
scoped_guard(rcu)
@ -386,7 +385,7 @@ found:
struct bch_inode_opts opts;
try(bch2_bkey_get_io_opts(trans, NULL, k, &opts));
try(bch2_bkey_set_needs_rebalance(c, &opts, new, SET_NEEDS_REBALANCE_opt_change, 0));
try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, new, SET_NEEDS_REBALANCE_opt_change, 0));
if (!(flags & BTREE_TRIGGER_is_root)) {
CLASS(btree_node_iter, iter)(trans, btree, new->k.p, 0, level,
@ -888,7 +887,7 @@ int bch2_trigger_extent(struct btree_trans *trans,
try(__trigger_extent(trans, btree, level, new.s_c,
flags & ~BTREE_TRIGGER_overwrite));
try(bch2_trigger_extent_rebalance(trans, old, new.s_c, flags));
try(bch2_trigger_extent_rebalance(trans, btree, level, old, new, flags));
}
return 0;

View File

@ -3,6 +3,8 @@
#include "alloc/disk_groups.h"
#include "data/rebalance.h"
#include "init/dev.h"
#include "sb/members.h"
@ -469,9 +471,18 @@ int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
guard(mutex)(&c->sb_lock);
return __bch2_dev_group_set(c, ca, name) ?:
bch2_write_super(c);
struct rebalance_scan s = { .type = REBALANCE_SCAN_pending };
try(bch2_set_rebalance_needs_scan(c, s, false));
/* bch2_rebalance_wakeup_pending goes here */
scoped_guard(mutex, &c->sb_lock) {
try(__bch2_dev_group_set(c, ca, name));
try(bch2_write_super(c));
}
try(bch2_set_rebalance_needs_scan(c, s, true));
return 0;
}
int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,

View File

@ -16,25 +16,40 @@ DEFINE_CLASS(bch_replicas_cpu, struct bch_replicas_cpu,
kfree(_T.entries),
(struct bch_replicas_cpu) {}, void)
static inline struct bch_replicas_entry_v1 *
static inline struct bch_replicas_entry_cpu *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
static inline unsigned __cpu_replicas_entry_bytes(unsigned v1_bytes)
{
return offsetof(struct bch_replicas_entry_cpu, e) + v1_bytes;
}
static inline unsigned cpu_replicas_entry_bytes(struct bch_replicas_entry_cpu *e)
{
return __cpu_replicas_entry_bytes(replicas_entry_bytes(&e->e));
}
#define for_each_cpu_replicas_entry(_r, _i) \
for (struct bch_replicas_entry_v1 *_i = (_r)->entries; \
for (struct bch_replicas_entry_cpu *_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size; \
_i = (void *) (_i) + (_r)->entry_size)
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
static int bch2_memcmp(const void *l, const void *r, const void *priv)
static int cpu_replicas_entry_cmp(const struct bch_replicas_entry_cpu *l,
const struct bch_replicas_entry_cpu *r,
size_t size)
{
size_t size = (size_t) priv;
return memcmp(l, r, size);
return memcmp(&l->e, &r->e, size - offsetof(struct bch_replicas_entry_cpu, e));
}
static int cpu_replicas_entry_cmp_r(const void *l, const void *r, const void *priv)
{
return cpu_replicas_entry_cmp(l, r, (size_t) priv);
}
/* Replicas tracking - in memory: */
@ -60,7 +75,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
cpu_replicas_entry_cmp_r, NULL,
(void *)(size_t)r->entry_size);
}
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@ -85,6 +101,13 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
prt_printf(out, "]");
}
static void bch2_replicas_entry_cpu_to_text(struct printbuf *out,
struct bch_replicas_entry_cpu *e)
{
prt_printf(out, "ref=%u ", atomic_read(&e->ref));
bch2_replicas_entry_to_text(out, &e->e);
}
static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
struct bch_sb *sb,
struct printbuf *err)
@ -151,7 +174,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
prt_printf(out, " ");
first = false;
bch2_replicas_entry_to_text(out, i);
bch2_replicas_entry_cpu_to_text(out, i);
}
}
@ -232,6 +255,44 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
bch2_replicas_entry_sort(e);
}
/* @l is bch_replicas_entry_v1, @r is bch_replicas_entry_cpu */
static int replicas_entry_search_cmp(const void *_l, const void *_r, const void *priv)
{
const struct bch_replicas_entry_v1 *l = _l;
const struct bch_replicas_entry_cpu *r = _r;
size_t size = (size_t) priv;
return memcmp(l, &r->e, size);
}
static inline struct bch_replicas_entry_cpu *
replicas_entry_search(struct bch_replicas_cpu *r,
struct bch_replicas_entry_v1 *search)
{
verify_replicas_entry(search);
size_t entry_size = replicas_entry_bytes(search);
int idx = likely(__cpu_replicas_entry_bytes(entry_size) <= r->entry_size)
? eytzinger0_find_r(r->entries, r->nr, r->entry_size,
replicas_entry_search_cmp,
(void *) entry_size, search)
: -1;
return idx >= 0 ? cpu_replicas_entry(r, idx) : NULL;
}
bool bch2_replicas_marked_locked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
return !search->nr_devs || replicas_entry_search(&c->replicas, search);
}
bool bch2_replicas_marked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
guard(percpu_read)(&c->mark_lock);
return bch2_replicas_marked_locked(c, search);
}
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_fs *c,
struct bch_replicas_cpu *old,
@ -240,9 +301,12 @@ cpu_replicas_add_entry(struct bch_fs *c,
struct bch_replicas_cpu new = {
.nr = old->nr + 1,
.entry_size = max_t(unsigned, old->entry_size,
replicas_entry_bytes(new_entry)),
__cpu_replicas_entry_bytes(replicas_entry_bytes(new_entry))),
};
/* alignment */
new.entry_size = round_up(new.entry_size, sizeof(atomic_t));
new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL);
if (!new.entries)
return new;
@ -252,7 +316,7 @@ cpu_replicas_add_entry(struct bch_fs *c,
cpu_replicas_entry(old, i),
old->entry_size);
memcpy(cpu_replicas_entry(&new, old->nr),
memcpy(&cpu_replicas_entry(&new, old->nr)->e,
new_entry,
replicas_entry_bytes(new_entry));
@ -260,152 +324,56 @@ cpu_replicas_add_entry(struct bch_fs *c,
return new;
}
static inline struct bch_replicas_entry_v1 *
replicas_entry_search(struct bch_replicas_cpu *r,
struct bch_replicas_entry_v1 *search)
{
verify_replicas_entry(search);
size_t entry_size = replicas_entry_bytes(search);
int idx = likely(entry_size <= r->entry_size)
? eytzinger0_find_r(r->entries, r->nr, r->entry_size,
bch2_memcmp, (void *) entry_size, search)
: -1;
return idx >= 0 ? cpu_replicas_entry(r, idx) : NULL;
}
bool bch2_replicas_marked_locked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
return !search->nr_devs ||
(replicas_entry_search(&c->replicas, search) &&
(likely((!c->replicas_gc.entries)) ||
replicas_entry_search(&c->replicas_gc, search)));
}
bool bch2_replicas_marked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
guard(percpu_read)(&c->mark_lock);
return bch2_replicas_marked_locked(c, search);
}
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
struct bch_replicas_entry_v1 *new_entry)
struct bch_replicas_entry_v1 *new_entry,
unsigned ref)
{
verify_replicas_entry(new_entry);
CLASS(bch_replicas_cpu, new_r)();
CLASS(bch_replicas_cpu, new_gc)();
guard(mutex)(&c->sb_lock);
bool write_sb = false;
if (c->replicas_gc.entries &&
!replicas_entry_search(&c->replicas_gc, new_entry)) {
new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
if (!new_gc.entries)
return bch_err_throw(c, ENOMEM_cpu_replicas);
}
if (!replicas_entry_search(&c->replicas, new_entry)) {
new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
if (!new_r.entries)
return bch_err_throw(c, ENOMEM_cpu_replicas);
try(bch2_cpu_replicas_to_sb_replicas(c, &new_r));
}
if (!new_r.entries &&
!new_gc.entries)
return 0;
/* allocations done, now commit: */
if (new_r.entries)
bch2_write_super(c);
/* don't update in memory replicas until changes are persistent */
scoped_guard(percpu_write, &c->mark_lock) {
if (new_r.entries)
if (!replicas_entry_search(&c->replicas, new_entry)) {
CLASS(bch_replicas_cpu, new_r)();
new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
if (!new_r.entries)
return bch_err_throw(c, ENOMEM_cpu_replicas);
try(bch2_cpu_replicas_to_sb_replicas(c, &new_r));
swap(c->replicas, new_r);
if (new_gc.entries)
swap(new_gc, c->replicas_gc);
write_sb = true;
}
atomic_add(ref, &replicas_entry_search(&c->replicas, new_entry)->ref);
}
/* After dropping mark_lock */
if (write_sb)
bch2_write_super(c);
return 0;
}
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
return likely(bch2_replicas_marked(c, r))
? 0 : bch2_mark_replicas_slowpath(c, r);
? 0 : bch2_mark_replicas_slowpath(c, r, 0);
}
/*
* Old replicas_gc mechanism: only used for journal replicas entries now, should
* die at some point:
*/
int bch2_replicas_gc_end(struct bch_fs *c, int ret)
static void __replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_cpu *e)
{
lockdep_assert_held(&c->replicas_gc_lock);
struct bch_replicas_cpu *r = &c->replicas;
guard(mutex)(&c->sb_lock);
scoped_guard(percpu_write, &c->mark_lock) {
ret = ret ?:
bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc);
if (!ret)
swap(c->replicas, c->replicas_gc);
memcpy(e, cpu_replicas_entry(r, --r->nr), r->entry_size);
bch2_cpu_replicas_sort(r);
kfree(c->replicas_gc.entries);
c->replicas_gc.entries = NULL;
}
if (!ret)
bch2_write_super(c);
return ret;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
lockdep_assert_held(&c->replicas_gc_lock);
guard(mutex)(&c->sb_lock);
BUG_ON(c->replicas_gc.entries);
c->replicas_gc.nr = 0;
c->replicas_gc.entry_size = 0;
for_each_cpu_replicas_entry(&c->replicas, e) {
/* Preserve unknown data types */
if (e->data_type >= BCH_DATA_NR ||
!(BIT(e->data_type) & typemask)) {
c->replicas_gc.nr++;
c->replicas_gc.entry_size =
max_t(unsigned, c->replicas_gc.entry_size,
replicas_entry_bytes(e));
}
}
c->replicas_gc.entries = kcalloc(c->replicas_gc.nr,
c->replicas_gc.entry_size,
GFP_KERNEL);
if (!c->replicas_gc.entries) {
bch_err(c, "error allocating c->replicas_gc");
return bch_err_throw(c, ENOMEM_replicas_gc);
}
unsigned i = 0;
for_each_cpu_replicas_entry(&c->replicas, e)
if (e->data_type >= BCH_DATA_NR ||
!(BIT(e->data_type) & typemask))
memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
e, c->replicas_gc.entry_size);
bch2_cpu_replicas_sort(&c->replicas_gc);
return 0;
int ret = bch2_cpu_replicas_to_sb_replicas(c, r);
if (WARN(ret, "bch2_cpu_replicas_to_sb_replicas() error: %s", bch2_err_str(ret)))
return;
}
void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *kill)
@ -413,18 +381,95 @@ void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *ki
lockdep_assert_held(&c->mark_lock);
lockdep_assert_held(&c->sb_lock);
struct bch_replicas_cpu *r = &c->replicas;
struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, kill);
struct bch_replicas_entry_v1 *e = replicas_entry_search(&c->replicas, kill);
if (WARN(!e, "replicas entry not found in sb"))
return;
memcpy(e, cpu_replicas_entry(r, --r->nr), r->entry_size);
__replicas_entry_kill(c, e);
bch2_cpu_replicas_sort(r);
/* caller does write_super() after dropping mark_lock */
}
int ret = bch2_cpu_replicas_to_sb_replicas(c, r);
WARN(ret, "bch2_cpu_replicas_to_sb_replicas() error: %s", bch2_err_str(ret));
void bch2_replicas_entry_put_many(struct bch_fs *c, struct bch_replicas_entry_v1 *r, unsigned nr)
{
if (!r->nr_devs)
return;
BUG_ON(r->data_type != BCH_DATA_journal);
verify_replicas_entry(r);
scoped_guard(percpu_read, &c->mark_lock) {
struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r);
int v = atomic_sub_return(nr, &e->ref);
BUG_ON(v < 0);
if (v)
return;
}
guard(mutex)(&c->sb_lock);
scoped_guard(percpu_write, &c->mark_lock) {
struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r);
if (e && !atomic_read(&e->ref))
__replicas_entry_kill(c, e);
}
bch2_write_super(c);
}
static inline bool bch2_replicas_entry_get_inmem(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
guard(percpu_read)(&c->mark_lock);
struct bch_replicas_entry_cpu *e = replicas_entry_search(&c->replicas, r);
if (e)
atomic_inc(&e->ref);
return e != NULL;
}
int bch2_replicas_entry_get(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
if (!r->nr_devs)
return 0;
BUG_ON(r->data_type != BCH_DATA_journal);
verify_replicas_entry(r);
return bch2_replicas_entry_get_inmem(c, r)
? 0
: bch2_mark_replicas_slowpath(c, r, 1);
}
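A usage sketch, not part of this diff, for the new refcounted journal replicas entries: the getter marks the entry in the superblock if necessary and takes a reference, and dropping the last reference removes the journal-only entry again and rewrites the superblock. The wrapper function below is hypothetical.

static int journal_replicas_ref_sketch(struct bch_fs *c,
				       struct bch_replicas_entry_v1 *r)
{
	/* marks the entry in the superblock if needed and takes a reference */
	try(bch2_replicas_entry_get(c, r));

	/* ... journal writes that rely on @r being marked ... */

	/* dropping the last reference kills the sb entry and writes the sb */
	bch2_replicas_entry_put(c, r);
	return 0;
}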
int bch2_replicas_gc_reffed(struct bch_fs *c)
{
bool write_sb = false;
guard(mutex)(&c->sb_lock);
scoped_guard(percpu_write, &c->mark_lock) {
unsigned dst = 0;
for (unsigned i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry_cpu *e =
cpu_replicas_entry(&c->replicas, i);
if (e->e.data_type != BCH_DATA_journal ||
atomic_read(&e->ref))
memcpy(cpu_replicas_entry(&c->replicas, dst++),
e,
c->replicas.entry_size);
}
if (c->replicas.nr != dst) {
c->replicas.nr = dst;
bch2_cpu_replicas_sort(&c->replicas);
try(bch2_cpu_replicas_to_sb_replicas(c, &c->replicas));
}
}
if (write_sb)
bch2_write_super(c);
return 0;
}
/* Replicas tracking - superblock: */
@ -441,6 +486,9 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
nr++;
}
entry_size = __cpu_replicas_entry_bytes(entry_size);
entry_size = round_up(entry_size, sizeof(atomic_t));
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries)
return -BCH_ERR_ENOMEM_cpu_replicas;
@ -448,10 +496,10 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
cpu_r->nr = nr;
cpu_r->entry_size = entry_size;
for_each_replicas_entry(sb_r, e) {
struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++);
memcpy(dst, e, replicas_entry_bytes(e));
bch2_replicas_entry_sort(dst);
for_each_replicas_entry(sb_r, src) {
struct bch_replicas_entry_cpu *dst = cpu_replicas_entry(cpu_r, idx++);
memcpy(&dst->e, src, replicas_entry_bytes(src));
bch2_replicas_entry_sort(&dst->e);
}
return 0;
@ -469,9 +517,13 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
nr++;
}
entry_size = __cpu_replicas_entry_bytes(entry_size);
entry_size += sizeof(struct bch_replicas_entry_v1) -
sizeof(struct bch_replicas_entry_v0);
entry_size = round_up(entry_size, sizeof(atomic_t));
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
if (!cpu_r->entries)
return -BCH_ERR_ENOMEM_cpu_replicas;
@ -480,14 +532,14 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
cpu_r->entry_size = entry_size;
for_each_replicas_entry(sb_r, src) {
struct bch_replicas_entry_v1 *dst =
struct bch_replicas_entry_cpu *dst =
cpu_replicas_entry(cpu_r, idx++);
dst->data_type = src->data_type;
dst->nr_devs = src->nr_devs;
dst->nr_required = 1;
memcpy(dst->devs, src->devs, src->nr_devs);
bch2_replicas_entry_sort(dst);
dst->e.data_type = src->data_type;
dst->e.nr_devs = src->nr_devs;
dst->e.nr_required = 1;
memcpy(dst->e.devs, src->devs, src->nr_devs);
bch2_replicas_entry_sort(&dst->e);
}
return 0;
@ -495,6 +547,12 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
/*
* If called after fs is started (after journal read), we'll be blowing
* away refcounts
*/
BUG_ON(test_bit(BCH_FS_started, &c->flags));
struct bch_sb_field_replicas *sb_v1;
struct bch_sb_field_replicas_v0 *sb_v0;
CLASS(bch_replicas_cpu, new_r)();
@ -522,7 +580,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
bytes = sizeof(struct bch_sb_field_replicas);
for_each_cpu_replicas_entry(r, src)
bytes += replicas_entry_bytes(src) - 1;
bytes += replicas_entry_bytes(&src->e) - 1;
sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0,
DIV_ROUND_UP(bytes, sizeof(u64)));
@ -538,9 +596,9 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
dst = sb_r->entries;
for_each_cpu_replicas_entry(r, src) {
dst->data_type = src->data_type;
dst->nr_devs = src->nr_devs;
memcpy(dst->devs, src->devs, src->nr_devs);
dst->data_type = src->e.data_type;
dst->nr_devs = src->e.nr_devs;
memcpy(dst->devs, src->e.devs, src->e.nr_devs);
dst = replicas_entry_next(dst);
@ -561,8 +619,8 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
bytes = sizeof(struct bch_sb_field_replicas);
for_each_cpu_replicas_entry(r, src) {
bytes += replicas_entry_bytes(src);
if (src->nr_required != 1)
bytes += replicas_entry_bytes(&src->e);
if (src->e.nr_required != 1)
need_v1 = true;
}
@ -583,7 +641,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
dst = sb_r->entries;
for_each_cpu_replicas_entry(r, src) {
memcpy(dst, src, replicas_entry_bytes(src));
memcpy(dst, &src->e, replicas_entry_bytes(&src->e));
dst = replicas_entry_next(dst);
@ -602,24 +660,26 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
sort_r(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
bch2_memcmp, NULL,
cpu_replicas_entry_cmp_r, NULL,
(void *)(size_t)cpu_r->entry_size);
for (i = 0; i < cpu_r->nr; i++) {
struct bch_replicas_entry_v1 *e =
struct bch_replicas_entry_cpu *e =
cpu_replicas_entry(cpu_r, i);
try(bch2_replicas_entry_sb_validate(e, sb, err));
try(bch2_replicas_entry_sb_validate(&e->e, sb, err));
if (i + 1 < cpu_r->nr) {
struct bch_replicas_entry_v1 *n =
struct bch_replicas_entry_cpu *n =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
int cmp = cpu_replicas_entry_cmp(e, n, cpu_r->entry_size);
if (!memcmp(e, n, cpu_r->entry_size)) {
BUG_ON(cmp > 0);
if (!cmp) {
prt_printf(err, "duplicate replicas entry ");
bch2_replicas_entry_to_text(err, e);
bch2_replicas_entry_to_text(err, &e->e);
return -BCH_ERR_invalid_sb_replicas;
}
}
@ -702,7 +762,9 @@ bool bch2_can_read_fs_with_devs(struct bch_fs *c, struct bch_devs_mask devs,
unsigned flags, struct printbuf *err)
{
guard(percpu_read)(&c->mark_lock);
for_each_cpu_replicas_entry(&c->replicas, e) {
for_each_cpu_replicas_entry(&c->replicas, i) {
struct bch_replicas_entry_v1 *e = &i->e;
unsigned nr_online = 0, nr_failed = 0, dflags = 0;
bool metadata = e->data_type < BCH_DATA_user;
@ -820,6 +882,25 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
return bch2_can_read_fs_with_devs(c, devs, flags, err);
}
bool bch2_sb_has_journal(struct bch_sb *sb)
{
struct bch_sb_field_replicas *replicas = bch2_sb_field_get(sb, replicas);
struct bch_sb_field_replicas_v0 *replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
if (replicas) {
for_each_replicas_entry(replicas, r)
if (r->data_type == BCH_DATA_journal)
return true;
} else if (replicas_v0) {
for_each_replicas_entry(replicas_v0, r)
if (r->data_type == BCH_DATA_journal)
return true;
}
return false;
}
unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
{
struct bch_sb_field_replicas *replicas;
@ -863,5 +944,4 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
void bch2_fs_replicas_exit(struct bch_fs *c)
{
kfree(c->replicas.entries);
kfree(c->replicas_gc.entries);
}

View File

@ -39,13 +39,22 @@ bool bch2_can_read_fs_with_devs(struct bch_fs *, struct bch_devs_mask,
bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask,
unsigned, struct printbuf *, bool);
bool bch2_sb_has_journal(struct bch_sb *);
unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
void bch2_replicas_entry_put_many(struct bch_fs *, struct bch_replicas_entry_v1 *, unsigned);
static inline void bch2_replicas_entry_put(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
bch2_replicas_entry_put_many(c, r, 1);
}
int bch2_replicas_entry_get(struct bch_fs *, struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_kill(struct bch_fs *, struct bch_replicas_entry_v1 *);
int bch2_replicas_gc_reffed(struct bch_fs *);
static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r, unsigned dev)
{
for (unsigned i = 0; i < r->nr_devs; i++)
@ -54,6 +63,12 @@ static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r,
return false;
}
static inline bool bch2_replicas_entry_eq(struct bch_replicas_entry_v1 *l,
struct bch_replicas_entry_v1 *r)
{
return l->nr_devs == r->nr_devs && !memcmp(l, r, replicas_entry_bytes(l));
}
/* iterate over superblock replicas - used by userspace tools: */
#define replicas_entry_next(_i) \

View File

@ -2,10 +2,16 @@
#ifndef _BCACHEFS_REPLICAS_TYPES_H
#define _BCACHEFS_REPLICAS_TYPES_H
/* unsized - bch_replicas_entry_v1 is variable length */
struct bch_replicas_entry_cpu {
atomic_t ref;
struct bch_replicas_entry_v1 e;
};
struct bch_replicas_cpu {
unsigned nr;
unsigned entry_size;
struct bch_replicas_entry_v1 *entries;
unsigned nr;
unsigned entry_size;
struct bch_replicas_entry_cpu *entries;
};
union bch_replicas_padded {

View File

@ -808,8 +808,6 @@ struct bch_fs {
struct bch_accounting_mem accounting;
struct bch_replicas_cpu replicas;
struct bch_replicas_cpu replicas_gc;
struct mutex replicas_gc_lock;
struct journal_entry_res btree_root_journal_res;
struct journal_entry_res clock_journal_res;
@ -1072,6 +1070,7 @@ struct bch_fs {
GENRADIX(struct gc_stripe) gc_stripes;
struct hlist_head ec_stripes_new[32];
struct hlist_head ec_stripes_new_buckets[64];
spinlock_t ec_stripes_new_lock;
/* ERASURE CODING */

View File

@ -708,7 +708,8 @@ struct bch_sb_field_ext {
x(inode_has_case_insensitive, BCH_VERSION(1, 28)) \
x(extent_snapshot_whiteouts, BCH_VERSION(1, 29)) \
x(31bit_dirent_offset, BCH_VERSION(1, 30)) \
x(btree_node_accounting, BCH_VERSION(1, 31))
x(btree_node_accounting, BCH_VERSION(1, 31)) \
x(rebalance_v2, BCH_VERSION(1, 32))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
@ -1427,6 +1428,17 @@ enum btree_id_flags {
BTREE_IS_snapshot_field| \
BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_accounting)) \
x(rebalance_hipri, 21, \
BTREE_IS_snapshot_field| \
BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)) \
x(rebalance_pending, 22, \
BTREE_IS_snapshot_field| \
BTREE_IS_write_buffer, \
BIT_ULL(KEY_TYPE_set)) \
x(rebalance_scan, 23, 0, \
BIT_ULL(KEY_TYPE_cookie)| \
BIT_ULL(KEY_TYPE_backpointer))
enum btree_id {
#define x(name, nr, ...) BTREE_ID_##name = nr,

View File

@ -682,9 +682,11 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
try(bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
BTREE_TRIGGER_check_repair|flags));
if (bch2_trans_has_updates(trans))
return bch2_trans_commit(trans, NULL, NULL, 0) ?:
if (bch2_trans_has_updates(trans)) {
CLASS(disk_reservation, res)(c);
return bch2_trans_commit(trans, &res.r, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
-BCH_ERR_transaction_restart_nested;
}
try(bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(k),
BTREE_TRIGGER_gc|BTREE_TRIGGER_insert|flags));

View File

@ -22,6 +22,7 @@
#include "data/extents.h"
#include "data/keylist.h"
#include "data/rebalance.h"
#include "data/write.h"
#include "init/error.h"
@ -654,6 +655,35 @@ static void btree_update_new_nodes_mark_sb(struct btree_update *as)
bch2_write_super(c);
}
static void bkey_strip_rebalance(struct bkey_s k)
{
bool dropped;
do {
dropped = false;
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry;
bkey_extent_entry_for_each(ptrs, entry)
if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance ||
extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance_bp) {
extent_entry_drop(k, entry);
dropped = true;
break;
}
} while (dropped);
}
static bool bkey_has_rebalance(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
bkey_extent_entry_for_each(ptrs, entry)
if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
return true;
return false;
}
/*
* The transactional part of an interior btree node update, where we journal the
* update we did to the interior node and update alloc info:
@ -661,26 +691,69 @@ static void btree_update_new_nodes_mark_sb(struct btree_update *as)
static int btree_update_nodes_written_trans(struct btree_trans *trans,
struct btree_update *as)
{
struct bch_inode_opts opts;
bch2_inode_opts_get(as->c, &opts, true);
trans->journal_pin = &as->journal;
darray_for_each(as->old_nodes, i)
darray_for_each(as->old_nodes, i) {
try(bch2_key_trigger_old(trans, as->btree_id, i->level + 1, bkey_i_to_s_c(&i->key),
BTREE_TRIGGER_transactional));
darray_for_each(as->new_nodes, i) {
try(bch2_key_trigger_new(trans, as->btree_id, i->level + 1, bkey_i_to_s(&i->key),
BTREE_TRIGGER_transactional));
journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans,
jset_u64s(i->key.k.u64s))),
i->root
? BCH_JSET_ENTRY_btree_root
: BCH_JSET_ENTRY_btree_keys,
BCH_JSET_ENTRY_overwrite,
as->btree_id,
i->root ? i->level : i->level + 1,
i->level + 1,
&i->key, i->key.k.u64s);
}
darray_for_each(as->new_nodes, i) {
i->update_node_key = false;
bkey_strip_rebalance(bkey_i_to_s(&i->key));
try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, &i->key,
SET_NEEDS_REBALANCE_foreground, 0));
if (bkey_has_rebalance(bkey_i_to_s_c(&i->key))) {
CLASS(btree_iter_uninit, iter)(trans);
int ret = bch2_btree_node_get_iter(trans, &iter, i->b);
if (ret && ret != -BCH_ERR_btree_node_dying)
return ret;
if (!ret)
i->update_node_key = true;
else
bkey_strip_rebalance(bkey_i_to_s(&i->key));
}
try(bch2_key_trigger_new(trans, as->btree_id, i->level + 1, bkey_i_to_s(&i->key),
BTREE_TRIGGER_transactional));
if (!i->update_node_key || i->root) {
journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans,
jset_u64s(i->key.k.u64s))),
i->root
? BCH_JSET_ENTRY_btree_root
: BCH_JSET_ENTRY_btree_keys,
as->btree_id,
i->root ? i->level : i->level + 1,
&i->key, i->key.k.u64s);
} else {
CLASS(btree_node_iter, parent_iter)(trans,
as->btree_id,
i->key.k.p,
0,
i->level + 1,
BTREE_ITER_intent);
try(bch2_btree_iter_traverse(&parent_iter));
/*
* XXX: we shouldn't be logging overwrites here, need a
* flag for that
*/
try(bch2_trans_update(trans, &parent_iter, &i->key, BTREE_TRIGGER_norun));
}
}
return 0;
}
@ -760,19 +833,23 @@ static void btree_update_nodes_written(struct btree_update *as)
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_journal_reclaim,
btree_update_nodes_written_trans(trans, as));
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
"%s", bch2_err_str(ret));
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal),
c, "%s", bch2_err_str(ret));
/*
* Clear will_make_reachable while we still hold intent locks on
* all our new nodes, to avoid racing with
* btree_node_update_key():
*/
darray_for_each(as->new_nodes, i)
darray_for_each(as->new_nodes, i) {
if (i->update_node_key)
bkey_copy(&i->b->key, &i->key);
if (i->b) {
BUG_ON(i->b->will_make_reachable != (unsigned long) as);
i->b->will_make_reachable = 0;
clear_btree_node_will_make_reachable(i->b);
}
}
}
/*
@ -2422,7 +2499,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
*/
}
try(bch2_trans_commit(trans, NULL, NULL, commit_flags));
CLASS(disk_reservation, res)(c);
try(bch2_trans_commit(trans, &res.r, NULL, commit_flags));
bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
bkey_copy(&b->key, new_key);

View File

@ -26,6 +26,7 @@ struct btree_update_node {
struct btree *b;
unsigned level;
bool root;
bool update_node_key;
__le64 seq;
__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
};

View File

@ -438,10 +438,10 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
* sequence number with a new btree node write, we want to re-journal
* the update
*/
if (ck->journal.seq == journal_last_seq(j))
if (ck->journal.seq == j->last_seq)
commit_flags |= BCH_WATERMARK_reclaim;
if (ck->journal.seq != journal_last_seq(j) ||
if (ck->journal.seq != j->last_seq ||
!journal_low_on_space(&c->journal))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;

View File

@ -560,7 +560,7 @@ struct btree_trans {
struct bch_fs_usage_base fs_usage_delta;
unsigned journal_u64s;
unsigned extra_disk_res; /* XXX kill */
u64 extra_disk_res;
__BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX);

View File

@ -143,6 +143,17 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c,
return bch2_csum_opt_to_type(opts.data_checksum, true);
}
static inline enum bch_csum_type bch2_data_checksum_type_rb(struct bch_fs *c,
struct bch_extent_rebalance opts)
{
if (c->sb.encryption_type)
return c->opts.wide_macs
? BCH_CSUM_chacha20_poly1305_128
: BCH_CSUM_chacha20_poly1305_80;
return bch2_csum_opt_to_type(opts.data_checksum, true);
}
static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
{
if (c->sb.encryption_type)

View File

@ -881,8 +881,60 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans,
* Hash table of open stripes:
* Stripes that are being created or modified are kept in a hash table, so that
* stripe deletion can skip them.
*
* Additionally, we have a hash table for buckets that have stripes being
* created, to avoid racing with rebalance:
*/
static bool __bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket)
{
unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets)));
struct ec_stripe_new_bucket *s;
hlist_for_each_entry(s, &c->ec_stripes_new_buckets[hash], hash)
if (s->dev_bucket == dev_bucket)
return true;
return false;
}
bool bch2_bucket_has_new_stripe(struct bch_fs *c, u64 dev_bucket)
{
guard(spinlock)(&c->ec_stripes_new_lock);
return __bch2_bucket_has_new_stripe(c, dev_bucket);
}
static void stripe_new_bucket_add(struct bch_fs *c, struct ec_stripe_new_bucket *s, u64 dev_bucket)
{
s->dev_bucket = dev_bucket;
unsigned hash = hash_64(dev_bucket, ilog2(ARRAY_SIZE(c->ec_stripes_new_buckets)));
hlist_add_head(&s->hash, &c->ec_stripes_new_buckets[hash]);
}
static void stripe_new_buckets_add(struct bch_fs *c, struct ec_stripe_new *s)
{
unsigned nr_blocks = s->nr_data + s->nr_parity;
guard(spinlock)(&c->ec_stripes_new_lock);
for (unsigned i = 0; i < nr_blocks; i++) {
if (!s->blocks[i])
continue;
struct open_bucket *ob = c->open_buckets + s->blocks[i];
struct bpos bucket = POS(ob->dev, ob->bucket);
stripe_new_bucket_add(c, &s->buckets[i], bucket_to_u64(bucket));
}
}
static void stripe_new_buckets_del(struct bch_fs *c, struct ec_stripe_new *s)
{
struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v;
for (unsigned i = 0; i < v->nr_blocks; i++)
hlist_del_init(&s->buckets[i].hash);
}
static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx)
{
unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new)));
@ -923,6 +975,8 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s)
hlist_del_init(&s->hash);
s->idx = 0;
stripe_new_buckets_del(c, s);
}
/* stripe deletion */
@ -1087,7 +1141,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
.idx = s->key.k.p.offset,
};
struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)));
struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, BKEY_EXTENT_U64s_MAX * sizeof(u64)));
bkey_reassemble(n, k);
@ -1103,8 +1157,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans,
struct bch_inode_opts opts;
try(bch2_bkey_get_io_opts(trans, NULL, bkey_i_to_s_c(n), &opts));
try(bch2_bkey_set_needs_rebalance(trans->c, &opts, n,
SET_NEEDS_REBALANCE_other, 0));
try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, SET_NEEDS_REBALANCE_other, 0));
try(bch2_trans_update(trans, &iter, n, 0));
return 0;
@ -1126,10 +1179,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit);
wb_maybe_flush_init(&last_flushed);
CLASS(disk_reservation, res)(c);
return for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers,
bucket_pos_to_bp_start(ca, bucket_pos),
bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k,
NULL, NULL,
&res.r, NULL,
BCH_TRANS_COMMIT_no_check_rw|
BCH_TRANS_COMMIT_no_enospc, ({
if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0)))
@ -1143,6 +1198,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
continue;
wb_maybe_flush_inc(&last_flushed);
bch2_disk_reservation_put(c, &res.r);
ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, bp, &last_flushed);
}));
}
@ -1978,6 +2034,7 @@ allocate_buf:
if (ret)
goto err;
stripe_new_buckets_add(c, s);
s->allocated = true;
allocated:
BUG_ON(!s->idx);

View File

@ -191,6 +191,11 @@ enum ec_stripe_ref {
STRIPE_REF_NR
};
struct ec_stripe_new_bucket {
struct hlist_node hash;
u64 dev_bucket;
};
struct ec_stripe_new {
struct bch_fs *c;
struct ec_stripe_head *h;
@ -217,6 +222,8 @@ struct ec_stripe_new {
open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX];
struct disk_reservation res;
struct ec_stripe_new_bucket buckets[BCH_BKEY_PTRS_MAX];
struct ec_stripe_buf new_stripe;
struct ec_stripe_buf existing_stripe;
};
@ -248,6 +255,8 @@ struct ec_stripe_head {
int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey_s_c);
bool bch2_bucket_has_new_stripe(struct bch_fs *, u64);
void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *, int);

View File

@ -598,31 +598,16 @@ static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
bch2_crc_cmp(l.csum, r.csum));
}
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
struct bch_extent_crc_unpacked n)
static union bch_extent_entry *bkey_crc_find(struct bkey_i *k, struct bch_extent_crc_unpacked crc)
{
return !crc_is_compressed(u) &&
u.csum_type &&
u.uncompressed_size > u.live_size &&
bch2_csum_type_is_encryption(u.csum_type) ==
bch2_csum_type_is_encryption(n.csum_type);
}
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
struct bch_extent_crc_unpacked i;
union bch_extent_entry *entry;
bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
struct bch_extent_crc_unpacked n)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *i;
if (!n.csum_type)
return false;
bkey_for_each_crc(k.k, ptrs, crc, i)
if (can_narrow_crc(crc, n))
return true;
return false;
bkey_for_each_crc(&k->k, ptrs, i, entry)
if (!bch2_crc_unpacked_cmp(i, crc))
return entry;
return NULL;
}
/*
@ -634,44 +619,31 @@ bool bch2_can_narrow_extent_crcs(struct bkey_s_c k,
* currently live (so that readers won't have to bounce) while we've got the
* checksum we need:
*/
bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n)
bool bch2_bkey_narrow_crc(struct bkey_i *k,
struct bch_extent_crc_unpacked old,
struct bch_extent_crc_unpacked new)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
struct bch_extent_crc_unpacked u;
struct extent_ptr_decoded p;
union bch_extent_entry *i;
bool ret = false;
BUG_ON(crc_is_compressed(new));
BUG_ON(new.offset);
BUG_ON(new.live_size != k->k.size);
/* Find a checksum entry that covers only live data: */
if (!n.csum_type) {
bkey_for_each_crc(&k->k, ptrs, u, i)
if (!crc_is_compressed(u) &&
u.csum_type &&
u.live_size == u.uncompressed_size) {
n = u;
goto found;
}
union bch_extent_entry *old_e = bkey_crc_find(k, old);
if (!old_e)
return false;
struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
union bch_extent_entry *i;
bkey_extent_entry_for_each_from(ptrs, i, extent_entry_next(old_e)) {
if (extent_entry_is_crc(i))
break;
if (extent_entry_is_ptr(i))
i->ptr.offset += old.offset;
}
found:
BUG_ON(crc_is_compressed(n));
BUG_ON(n.offset);
BUG_ON(n.live_size != k->k.size);
restart_narrow_pointers:
ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
if (can_narrow_crc(p.crc, n)) {
bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
p.ptr.offset += p.crc.offset;
p.crc = n;
bch2_extent_ptr_decoded_append(k, &p);
ret = true;
goto restart_narrow_pointers;
}
return ret;
bch2_extent_crc_pack(entry_to_crc(old_e), new, extent_entry_type(old_e));
return true;
}
static void bch2_extent_crc_pack(union bch_extent_crc *dst,
@ -909,6 +881,15 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)
return durability;
}
void bch2_bkey_extent_entry_drop_s(struct bkey_s k, union bch_extent_entry *entry)
{
union bch_extent_entry *end = bkey_val_end(k);
union bch_extent_entry *next = extent_entry_next(entry);
memmove_u64s(entry, next, (u64 *) end - (u64 *) next);
k.k->u64s -= extent_entry_u64s(entry);
}
void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
{
union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
@ -1410,6 +1391,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags);
break;
case BCH_EXTENT_ENTRY_rebalance_bp:
prt_printf(out, "idx %llu", (u64) entry->rebalance_bp.idx);
break;
default:
prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
return;
@ -1463,6 +1448,18 @@ fsck_err:
return ret;
}
static inline bool btree_ptr_entry_type_allowed(enum bch_extent_entry_type type)
{
switch (type) {
case BCH_EXTENT_ENTRY_ptr:
case BCH_EXTENT_ENTRY_rebalance:
case BCH_EXTENT_ENTRY_rebalance_bp:
return true;
default:
return false;
};
}
int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_validate_context from)
{
@ -1473,23 +1470,26 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
unsigned nonce = UINT_MAX;
unsigned nr_ptrs = 0;
bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
bool have_inval_dev_ptrs = false, have_non_inval_dev_ptrs = false;
int ret = 0;
if (bkey_is_btree_ptr(k.k))
size_ondisk = btree_sectors(c);
bkey_extent_entry_for_each(ptrs, entry) {
bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX,
enum bch_extent_entry_type type = __extent_entry_type(entry);
bkey_fsck_err_on(type >= BCH_EXTENT_ENTRY_MAX,
c, extent_ptrs_invalid_entry,
"invalid extent entry type (got %u, max %u)",
__extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
type, BCH_EXTENT_ENTRY_MAX);
bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
!extent_entry_is_ptr(entry),
!btree_ptr_entry_type_allowed(type),
c, btree_ptr_has_non_ptr,
"has non ptr field");
"has non allowed field");
switch (extent_entry_type(entry)) {
switch (type) {
case BCH_EXTENT_ENTRY_ptr:
try(extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false));
@ -1504,6 +1504,12 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
have_ec = false;
crc_since_last_ptr = false;
if (entry->ptr.dev == BCH_SB_MEMBER_INVALID)
have_inval_dev_ptrs = true;
else
have_non_inval_dev_ptrs = true;
nr_ptrs++;
break;
case BCH_EXTENT_ENTRY_crc32:
@ -1551,30 +1557,18 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
c, ptr_stripe_redundant,
"redundant stripe entry");
have_ec = true;
have_non_inval_dev_ptrs = true;
break;
case BCH_EXTENT_ENTRY_rebalance: {
/*
* this shouldn't be a fsck error, for forward
* compatibility; the rebalance code should just refetch
* the compression opt if it's unknown
*/
#if 0
const struct bch_extent_rebalance *r = &entry->rebalance;
if (!bch2_compression_opt_valid(r->compression)) {
union bch_compression_opt opt = { .value = r->compression };
prt_printf(err, "invalid compression opt %u:%u",
opt.type, opt.level);
return bch_err_throw(c, invalid_bkey);
}
#endif
case BCH_EXTENT_ENTRY_rebalance:
try(bch2_extent_rebalance_validate(c, k, from, &entry->rebalance));
break;
}
case BCH_EXTENT_ENTRY_flags:
bkey_fsck_err_on(entry != ptrs.start,
c, extent_flags_not_at_start,
"extent flags entry not at start");
break;
case BCH_EXTENT_ENTRY_rebalance_bp:
break;
}
}
@ -1596,6 +1590,9 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k,
bkey_fsck_err_on(have_ec,
c, extent_ptrs_redundant_stripe,
"redundant stripe entry");
bkey_fsck_err_on(have_inval_dev_ptrs && !have_non_inval_dev_ptrs,
c, extent_ptrs_all_invalid,
"extent ptrs all to BCH_SB_MEMBER_INVALID");
fsck_err:
return ret;
}
@ -1708,6 +1705,7 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
case BCH_EXTENT_ENTRY_stripe_ptr:
case BCH_EXTENT_ENTRY_rebalance:
case BCH_EXTENT_ENTRY_flags:
case BCH_EXTENT_ENTRY_rebalance_bp:
break;
}

View File

@ -461,9 +461,9 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
/* Extent checksum entries: */
bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
struct bch_extent_crc_unpacked);
bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
bool bch2_bkey_narrow_crc(struct bkey_i *,
struct bch_extent_crc_unpacked,
struct bch_extent_crc_unpacked);
void bch2_extent_crc_append(struct bkey_i *,
struct bch_extent_crc_unpacked);
@ -607,6 +607,7 @@ bool bch2_bkey_devs_rw(struct bch_fs *, struct bkey_s_c);
bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
bool bch2_bkey_in_target(struct bch_fs *, struct bkey_s_c, unsigned);
void bch2_bkey_extent_entry_drop_s(struct bkey_s, union bch_extent_entry *);
void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr)

View File

@ -80,8 +80,9 @@
x(crc128, 3) \
x(stripe_ptr, 4) \
x(rebalance, 5) \
x(flags, 6)
#define BCH_EXTENT_ENTRY_MAX 7
x(flags, 6) \
x(rebalance_bp, 7)
#define BCH_EXTENT_ENTRY_MAX 8
enum bch_extent_entry_type {
#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@ -270,13 +271,13 @@ struct bch_extent {
} __packed __aligned(8);
/* Maximum size (in u64s) a single pointer could be: */
#define BKEY_EXTENT_PTR_U64s_MAX\
#define BKEY_EXTENT_PTR_U64s_MAX \
((sizeof(struct bch_extent_crc128) + \
sizeof(struct bch_extent_ptr)) / sizeof(__u64))
/* Maximum possible size of an entire extent value: */
#define BKEY_EXTENT_VAL_U64s_MAX \
(1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
(5 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
/* * Maximum possible size of an entire extent, key + value: */
#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
@ -284,7 +285,9 @@ struct bch_extent {
/* Btree pointers don't carry around checksums: */
#define BKEY_BTREE_PTR_VAL_U64s_MAX \
((sizeof(struct bch_btree_ptr_v2) + \
sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX + \
sizeof(struct bch_extent_rebalance) + \
sizeof(struct bch_extent_rebalance_bp)) / sizeof(__u64))
#define BKEY_BTREE_PTR_U64s_MAX \
(BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)

View File

@ -75,14 +75,15 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
if (!bch2_bkey_has_device_c(k, dev_idx))
return 0;
struct bkey_i *n =
errptr_try(bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_internal_snapshot_node));
/* allocate room for rebalance entries bch2_bkey_set_needs_rebalance() may add below */
struct bkey_i *n = errptr_try(bch2_trans_kmalloc(trans, BKEY_EXTENT_U64s_MAX * sizeof(u64)));
bkey_reassemble(n, k);
try(drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, err, false));
struct bch_inode_opts opts;
try(bch2_bkey_get_io_opts(trans, NULL, k, &opts));
try(bch2_bkey_set_needs_rebalance(c, &opts, n, SET_NEEDS_REBALANCE_opt_change, 0));
try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, n, SET_NEEDS_REBALANCE_opt_change, 0));
/*
* Since we're not inserting through an extent iterator
@ -92,7 +93,7 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
*/
if (bkey_deleted(&n->k))
n->k.size = 0;
return 0;
return bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node);
}
static int bch2_dev_btree_drop_key(struct btree_trans *trans,
@ -116,6 +117,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c,
unsigned flags, struct printbuf *err)
{
CLASS(btree_trans, trans)(c);
CLASS(disk_reservation, res)(c);
/* FIXME: this does not handle unknown btrees with data pointers */
for (unsigned id = 0; id < BTREE_ID_NR; id++) {
@ -126,14 +128,13 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c,
if (id == BTREE_ID_stripes)
continue;
int ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
try(for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
&res.r, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_disk_reservation_put(c, &res.r);
bch2_progress_update_iter(trans, progress, &iter, "dropping user data") ?:
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags, err);
}));
if (ret)
return ret;
})));
}
return 0;
@ -218,6 +219,7 @@ int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsig
struct printbuf *err)
{
CLASS(btree_trans, trans)(c);
CLASS(disk_reservation, res)(c);
struct wb_maybe_flush last_flushed __cleanup(wb_maybe_flush_exit);
wb_maybe_flush_init(&last_flushed);
@ -226,11 +228,12 @@ int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsig
for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers,
POS(dev_idx, 0),
POS(dev_idx, U64_MAX), 0, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
&res.r, NULL, BCH_TRANS_COMMIT_no_enospc, ({
if (k.k->type != KEY_TYPE_backpointer)
continue;
wb_maybe_flush_inc(&last_flushed);
bch2_disk_reservation_put(c, &res.r);
data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k),
&last_flushed, flags, err);

View File

@ -324,8 +324,11 @@ int bch2_move_extent(struct moving_context *ctxt,
struct bch_inode_opts opts;
try(bch2_bkey_get_io_opts(trans, snapshot_io_opts, k, &opts));
try(bch2_update_rebalance_opts(trans, &opts, iter, k, SET_NEEDS_REBALANCE_other));
try(bch2_trans_commit_lazy(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc));
try(bch2_update_rebalance_opts(trans, snapshot_io_opts, &opts, iter, level, k,
SET_NEEDS_REBALANCE_other));
CLASS(disk_reservation, res)(c);
try(bch2_trans_commit_lazy(trans, &res.r, NULL, BCH_TRANS_COMMIT_no_enospc));
struct data_update_opts data_opts = {};
int ret = pred(trans, arg, iter->btree_id, k, &opts, &data_opts);

View File

@ -784,7 +784,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
bkey_reassemble(new, k);
if (!bch2_bkey_narrow_crcs(new, *new_crc))
if (!bch2_bkey_narrow_crc(new, rbio->pick.crc, *new_crc))
return bch_err_throw(c, rbio_narrow_crcs_fail);
return bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node);
@ -794,7 +794,8 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
struct bch_fs *c = rbio->c;
if (crc_is_compressed(rbio->pick.crc))
if (!rbio->pick.crc.csum_type ||
crc_is_compressed(rbio->pick.crc))
return;
u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
@ -1070,6 +1071,13 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
bch2_fs_inconsistent(c, "%s", buf.buf);
}
static inline bool can_narrow_crc(struct bch_extent_crc_unpacked n)
{
return n.csum_type &&
n.uncompressed_size < n.live_size &&
!crc_is_compressed(n);
}
int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bvec_iter iter, struct bpos read_pos,
enum btree_id data_btree, struct bkey_s_c k,
@ -1170,8 +1178,7 @@ retry_pick:
bio_flagged(&orig->bio, BIO_CHAIN))
flags |= BCH_READ_must_clone;
narrow_crcs = !(flags & BCH_READ_in_retry) &&
bch2_can_narrow_extent_crcs(k, pick.crc);
narrow_crcs = !(flags & BCH_READ_in_retry) && can_narrow_crc(pick.crc);
if (narrow_crcs && (flags & BCH_READ_user_mapped))
flags |= BCH_READ_must_bounce;

File diff suppressed because it is too large.

View File

@ -6,10 +6,14 @@
#include "alloc/disk_groups.h"
#include "rebalance_types.h"
int bch2_extent_rebalance_validate(struct bch_fs *, struct bkey_s_c,
struct bkey_validate_context,
const struct bch_extent_rebalance *);
static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_fs *c,
struct bch_inode_opts *opts)
{
struct bch_extent_rebalance r = {
return (struct bch_extent_rebalance) {
.type = BIT(BCH_EXTENT_ENTRY_rebalance),
#define x(_name) \
._name = opts->_name, \
@ -17,22 +21,37 @@ static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_f
BCH_REBALANCE_OPTS()
#undef x
};
if (r.background_target &&
!bch2_target_accepts_data(c, BCH_DATA_user, r.background_target))
r.background_target = 0;
return r;
};
void bch2_extent_rebalance_to_text(struct printbuf *, struct bch_fs *,
const struct bch_extent_rebalance *);
int bch2_trigger_extent_rebalance(struct btree_trans *,
struct bkey_s_c, struct bkey_s_c,
enum btree_iter_update_trigger_flags);
const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c);
int __bch2_trigger_extent_rebalance(struct btree_trans *,
enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
const struct bch_extent_rebalance *,
const struct bch_extent_rebalance *,
enum btree_iter_update_trigger_flags);
static inline unsigned rb_needs_trigger(const struct bch_extent_rebalance *r)
{
return r ? r->need_rb|r->ptrs_moving : 0;
}
static inline int bch2_trigger_extent_rebalance(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s new,
enum btree_iter_update_trigger_flags flags)
{
const struct bch_extent_rebalance *old_r = bch2_bkey_rebalance_opts(old);
const struct bch_extent_rebalance *new_r = bch2_bkey_rebalance_opts(new.s_c);
return rb_needs_trigger(old_r) || rb_needs_trigger(new_r)
? __bch2_trigger_extent_rebalance(trans, btree, level, old, new, old_r, new_r, flags)
: 0;
}
enum set_needs_rebalance_ctx {
SET_NEEDS_REBALANCE_opt_change,
@ -41,9 +60,6 @@ enum set_needs_rebalance_ctx {
SET_NEEDS_REBALANCE_other,
};
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_inode_opts *,
struct bkey_i *, enum set_needs_rebalance_ctx, u32);
/* Inodes in different snapshots may have different IO options: */
struct snapshot_io_opts_entry {
u32 snapshot;
@ -52,6 +68,10 @@ struct snapshot_io_opts_entry {
struct per_snapshot_io_opts {
u64 cur_inum;
bool fs_scan_cookie;
bool inum_scan_cookie;
struct bch_devs_mask dev_cookie;
struct bch_inode_opts fs_io_opts;
DARRAY(struct snapshot_io_opts_entry) d;
};
@ -74,20 +94,27 @@ DEFINE_CLASS(per_snapshot_io_opts, struct per_snapshot_io_opts,
per_snapshot_io_opts_init(c),
struct bch_fs *c);
int bch2_update_rebalance_opts(struct btree_trans *,
struct bch_inode_opts *,
struct btree_iter *,
struct bkey_s_c,
enum set_needs_rebalance_ctx);
int bch2_bkey_get_io_opts(struct btree_trans *,
struct per_snapshot_io_opts *, struct bkey_s_c,
struct bch_inode_opts *opts);
int bch2_update_rebalance_opts(struct btree_trans *,
struct per_snapshot_io_opts *,
struct bch_inode_opts *,
struct btree_iter *,
unsigned level,
struct bkey_s_c,
enum set_needs_rebalance_ctx);
int bch2_bkey_set_needs_rebalance(struct btree_trans *,
struct per_snapshot_io_opts *, struct bch_inode_opts *,
struct bkey_i *, enum set_needs_rebalance_ctx, u32);
struct rebalance_scan {
enum rebalance_scan_type {
REBALANCE_SCAN_fs,
REBALANCE_SCAN_metadata,
REBALANCE_SCAN_pending,
REBALANCE_SCAN_device,
REBALANCE_SCAN_inum,
} type;
@ -99,7 +126,7 @@ struct rebalance_scan {
};
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, struct rebalance_scan);
int bch2_set_rebalance_needs_scan(struct bch_fs *, struct rebalance_scan);
int bch2_set_rebalance_needs_scan(struct bch_fs *, struct rebalance_scan, bool);
int bch2_set_fs_needs_rebalance(struct bch_fs *);
static inline void bch2_rebalance_wakeup(struct bch_fs *c)
@ -112,6 +139,7 @@ static inline void bch2_rebalance_wakeup(struct bch_fs *c)
}
void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_scan_pending_to_text(struct printbuf *, struct bch_fs *);
void bch2_rebalance_stop(struct bch_fs *);
int bch2_rebalance_start(struct bch_fs *);

View File

@ -2,52 +2,137 @@
#ifndef _BCACHEFS_REBALANCE_FORMAT_H
#define _BCACHEFS_REBALANCE_FORMAT_H
/*
* rebalance on disk data structures:
*
* extents will contain a bch_extent_rebalance if they have background
* processing pending; additionally, indirect extents will always have a
* bch_extent_rebalance if they had any io path options set on the inode, since
* we don't (yet) have backpointers that would let us look up the "owning" inode
* of an indirect extent to recover the io path options.
*
* We also have 4 btrees for keeping track of pending rebalance work:
*
* BTREE_ID_rebalance_scan:
* Inum 0:
* Holds "scan cookies", which are created on option change to indicate that
* new options need to be propagated to each extent; this happens before the
* actual data processing.
*
* A scan cookie may be for the entire filesystem, a specific device, or a
* specific inode.
*
* Inum 1:
* Btree nodes that need background processing cannot be tracked by the
* other rebalance btrees; instead they have backpointers
* (KEY_TYPE_backpointer) created here.
*
* This has the added benefit that btree nodes will be processed before
* regular data, which is beneficial if e.g. we're recovering from data
* being degraded.
*
* BTREE_ID_rebalance_work:
* The main "pending rebalance work" btree: it's a simple bitset btree where
* a set bit indicates that an extent in BTREE_ID_extents or
* BTREE_ID_reflink needs to be processed.
*
* BTREE_ID_rebalance_hipri:
* If bch_extent_rebalance.hipri is set, the extent will be tracked here
* instead of BTREE_ID_rebalance_work and processed ahead of extents in
* BTREE_ID_rebalance_work; this is so that we can evacuate failed devices
* before other work.
*
* BTREE_ID_rebalance_pending:
* If we'd like to move an extent to a specific target, but can't because the
* target is full, we set bch_extent_rebalance.pending and switch to tracking
* it here; pending rebalance work is re-attempted on device resize, add, or
* label change.
*/
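To make the routing described above concrete, here is a minimal standalone sketch (editor-added, not code from this patch; the helper and the plain enum are hypothetical stand-ins, and it assumes hipri wins if both bits were somehow set) of how the hipri/pending bits pick the tracking btree:

#include <stdio.h>
#include <stdbool.h>

/* hypothetical stand-ins for the real BTREE_ID_* constants */
enum work_btree {
	WORK_BTREE_rebalance_work,
	WORK_BTREE_rebalance_hipri,
	WORK_BTREE_rebalance_pending,
};

static enum work_btree rb_tracking_btree(bool hipri, bool pending)
{
	if (hipri)		/* e.g. evacuating a failed device */
		return WORK_BTREE_rebalance_hipri;
	if (pending)		/* destination target currently full */
		return WORK_BTREE_rebalance_pending;
	return WORK_BTREE_rebalance_work;	/* ordinary background work */
}

int main(void)
{
	static const char * const names[] = {
		"rebalance_work", "rebalance_hipri", "rebalance_pending",
	};

	for (int hipri = 0; hipri <= 1; hipri++)
		for (int pending = 0; pending <= 1; pending++)
			printf("hipri=%d pending=%d -> BTREE_ID_%s\n",
			       hipri, pending,
			       names[rb_tracking_btree(hipri, pending)]);
	return 0;
}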
struct bch_extent_rebalance {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:6,
unused:3,
__u64 type:6,
ptrs_moving:5,
hipri:1,
pending:1,
need_rb:5,
promote_target_from_inode:1,
erasure_code_from_inode:1,
data_checksum_from_inode:1,
background_compression_from_inode:1,
data_replicas_from_inode:1,
background_target_from_inode:1,
data_replicas_from_inode:1,
data_checksum_from_inode:1,
erasure_code_from_inode:1,
background_compression_from_inode:1,
background_target_from_inode:1,
promote_target_from_inode:1,
promote_target:16,
erasure_code:1,
data_checksum:4,
data_replicas:4,
background_compression:8, /* enum bch_compression_opt */
background_target:16;
data_replicas:3,
data_checksum:4,
erasure_code:1,
background_compression:8, /* enum bch_compression_opt */
background_target:12,
promote_target:12;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 background_target:16,
background_compression:8,
data_replicas:4,
data_checksum:4,
erasure_code:1,
promote_target:16,
__u64 promote_target:12,
background_target:12,
background_compression:8,
erasure_code:1,
data_checksum:4,
data_replicas:3,
background_target_from_inode:1,
data_replicas_from_inode:1,
background_compression_from_inode:1,
data_checksum_from_inode:1,
erasure_code_from_inode:1,
promote_target_from_inode:1,
promote_target_from_inode:1,
background_target_from_inode:1,
background_compression_from_inode:1,
erasure_code_from_inode:1,
data_checksum_from_inode:1,
data_replicas_from_inode:1,
unused:3,
type:6;
need_rb:5,
pending:1,
hipri:1,
ptrs_moving:5,
type:6;
#endif
};
struct bch_extent_rebalance_bp {
#if defined(__LITTLE_ENDIAN_BITFIELD)
__u64 type:8,
idx:56;
#elif defined (__BIG_ENDIAN_BITFIELD)
__u64 idx:56,
type:8;
#endif
};
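As a quick cross-check on the repacked layouts above (an editor-added standalone sketch, not part of the patch), the new field widths still sum to exactly 64 bits, so both structs continue to occupy a single __u64:

/* widths copied from the little-endian branch of struct bch_extent_rebalance */
_Static_assert(6	/* type */
	     + 5	/* ptrs_moving */
	     + 1	/* hipri */
	     + 1	/* pending */
	     + 5	/* need_rb */
	     + 6	/* the six *_from_inode flags */
	     + 3	/* data_replicas */
	     + 4	/* data_checksum */
	     + 1	/* erasure_code */
	     + 8	/* background_compression */
	     + 12	/* background_target */
	     + 12	/* promote_target */
	     == 64, "bch_extent_rebalance must pack into one __u64");

/* and for the new backpointer entry */
_Static_assert(8 /* type */ + 56 /* idx */ == 64,
	       "bch_extent_rebalance_bp must pack into one __u64");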
/* subset of BCH_INODE_OPTS */
#define BCH_REBALANCE_OPTS() \
x(data_checksum) \
x(background_compression) \
x(data_replicas) \
x(promote_target) \
x(data_checksum) \
x(erasure_code) \
x(background_compression) \
x(background_target) \
x(erasure_code)
x(promote_target)
enum bch_rebalance_opts {
#define x(n) BCH_REBALANCE_##n,
BCH_REBALANCE_OPTS()
#undef x
};
#define BCH_REBALANCE_ACCOUNTING() \
x(replicas, 0) \
x(checksum, 1) \
x(erasure_code, 2) \
x(compression, 3) \
x(target, 4) \
x(high_priority, 5) \
x(pending, 6) \
enum bch_rebalance_accounting_type {
#define x(t, n) BCH_REBALANCE_ACCOUNTING_##t = n,
BCH_REBALANCE_ACCOUNTING()
#undef x
BCH_REBALANCE_ACCOUNTING_NR,
};
#endif /* _BCACHEFS_REBALANCE_FORMAT_H */
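BCH_REBALANCE_ACCOUNTING() above is an x-macro list: opts.c builds the matching name table (__bch2_rebalance_accounting_types, further down in this commit) by redefining x() before expanding the list. A minimal standalone sketch of that technique (editor-added; the stringifying x() definition below is an assumption, since the actual definition in opts.c is not shown in this hunk):

#include <stdio.h>

#define BCH_REBALANCE_ACCOUNTING()	\
	x(replicas,		0)	\
	x(checksum,		1)	\
	x(erasure_code,		2)	\
	x(compression,		3)	\
	x(target,		4)	\
	x(high_priority,	5)	\
	x(pending,		6)

/* one expansion builds the enum... */
enum bch_rebalance_accounting_type {
#define x(t, n) BCH_REBALANCE_ACCOUNTING_##t = n,
	BCH_REBALANCE_ACCOUNTING()
#undef x
	BCH_REBALANCE_ACCOUNTING_NR,
};

/* ...and a second expansion builds the name table from the same list */
static const char * const rebalance_accounting_types[] = {
#define x(t, n) [n] = #t,
	BCH_REBALANCE_ACCOUNTING()
#undef x
};

int main(void)
{
	for (int i = 0; i < BCH_REBALANCE_ACCOUNTING_NR; i++)
		printf("%d: %s\n", i, rebalance_accounting_types[i]);
	return 0;
}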

View File

@ -298,7 +298,6 @@ restart_drop_extra_replicas:
extent_for_each_ptr_decode(extent_i_to_s(new), p, entry)
bch2_extent_ptr_decoded_append(insert, &p);
bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
bch2_bkey_drop_extra_cached_ptrs(c, &m->op.opts, bkey_i_to_s(insert));
ret = bch2_sum_sector_overwrites(trans, &iter, insert,
@ -328,7 +327,7 @@ restart_drop_extra_replicas:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
bch2_bkey_get_io_opts(trans, NULL, k, &opts) ?:
bch2_bkey_set_needs_rebalance(c, &opts, insert,
bch2_bkey_set_needs_rebalance(trans, NULL, &opts, insert,
SET_NEEDS_REBALANCE_foreground,
m->op.opts.change_cookie) ?:
bch2_trans_update(trans, &iter, insert,
@ -784,6 +783,53 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m)
return 0;
}
/*
* When an extent has both checksummed and non-checksummed pointers, special
* handling is needed:
*
* We don't want to blindly apply an existing checksum to non-checksummed data,
* or lose our ability to detect that different replicas in the same extent have
* or had different data, so:
*
* - prefer to read from the specific replica being rewritten
* - if we're rewriting a replica without a checksum, only rewrite that specific
* replica in this data update
*/
static void checksummed_and_non_checksummed_handling(struct data_update *u, struct bkey_ptrs_c ptrs)
{
bool have_checksummed = false, have_non_checksummed = false;
struct bkey_s_c k = bkey_i_to_s_c(u->k.k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (p.crc.csum_type)
have_checksummed = true;
else
have_non_checksummed = true;
}
if (unlikely(have_checksummed && have_non_checksummed)) {
unsigned ptr_bit = 1;
int rewrite_checksummed = -1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
if (ptr_bit & u->opts.ptrs_rewrite) {
if (rewrite_checksummed < 0) {
rewrite_checksummed = p.crc.csum_type != 0;
u->opts.read_dev = p.ptr.dev;
}
if (rewrite_checksummed != (p.crc.csum_type != 0) ||
(!rewrite_checksummed && p.ptr.dev != u->opts.read_dev))
u->opts.ptrs_rewrite &= ~ptr_bit;
}
ptr_bit <<= 1;
}
}
}
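A worked toy example of that rule (editor-added standalone model, not bcachefs code; the pointer array and values are hypothetical): with a checksummed replica on dev 0 and an unchecksummed one on dev 1 both selected for rewrite, only the first pointer examined keeps its rewrite bit, and the read is pinned to its device:

#include <stdio.h>

struct toy_ptr { unsigned dev; unsigned csum_type; };	/* csum_type 0 == no checksum */

int main(void)
{
	struct toy_ptr ptrs[] = { { .dev = 0, .csum_type = 4 }, { .dev = 1, .csum_type = 0 } };
	unsigned ptrs_rewrite = 0x3;	/* caller asked to rewrite both replicas */
	int rewrite_checksummed = -1;
	int read_dev = -1;

	unsigned ptr_bit = 1;
	for (unsigned i = 0; i < 2; i++, ptr_bit <<= 1) {
		if (!(ptr_bit & ptrs_rewrite))
			continue;
		if (rewrite_checksummed < 0) {
			/* first selected pointer decides the mode and the read device */
			rewrite_checksummed = ptrs[i].csum_type != 0;
			read_dev = ptrs[i].dev;
		}
		if (rewrite_checksummed != (ptrs[i].csum_type != 0) ||
		    (!rewrite_checksummed && (int) ptrs[i].dev != read_dev))
			ptrs_rewrite &= ~ptr_bit;	/* mismatched pointer: leave it for a later update */
	}

	/* prints ptrs_rewrite=0x1 read_dev=0: only the checksummed replica is rewritten now */
	printf("ptrs_rewrite=%#x read_dev=%d\n", ptrs_rewrite, read_dev);
	return 0;
}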
int bch2_data_update_init(struct btree_trans *trans,
struct btree_iter *iter,
struct moving_context *ctxt,
@ -844,6 +890,9 @@ int bch2_data_update_init(struct btree_trans *trans,
unsigned buf_bytes = 0;
bool unwritten = false;
if (m->opts.ptrs_rewrite)
checksummed_and_non_checksummed_handling(m, ptrs);
scoped_guard(rcu) {
unsigned ptr_bit = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
@ -957,6 +1006,11 @@ int bch2_data_update_init(struct btree_trans *trans,
}
}
/*
* Check if we have checksummed and non-checksummed pointers, prefer to
* read from the pointer we're operating on
*/
m->ptrs_held = bkey_get_dev_refs(c, k);
if (c->opts.nocow_enabled) {

View File

@ -355,7 +355,7 @@ int bch2_extent_update(struct btree_trans *trans,
bch2_inode_opts_get_inode(c, &inode, &opts);
try(bch2_bkey_set_needs_rebalance(c, &opts, k,
try(bch2_bkey_set_needs_rebalance(trans, NULL, &opts, k,
SET_NEEDS_REBALANCE_foreground,
change_cookie));
try(bch2_trans_update(trans, iter, k, 0));
@ -390,6 +390,13 @@ static int bch2_write_index_default(struct bch_write_op *op)
bch2_trans_begin(trans);
k = bch2_keylist_front(keys);
/*
* If we did a degraded write, bch2_bkey_set_needs_rebalance() will add
* pointers to BCH_SB_MEMBER_INVALID so the extent is accounted as
* degraded
*/
bch2_bkey_buf_realloc(&sk, k->k.u64s + 1 + BCH_REPLICAS_MAX);
bch2_bkey_buf_copy(&sk, k);
int ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &sk.k->k.p.snapshot);
@ -1224,8 +1231,16 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
}
struct bch_fs *c = trans->c;
/*
* If we did a degraded write, bch2_bkey_set_needs_rebalance() will add
* pointers to BCH_SB_MEMBER_INVALID so the extent is accounted as
* degraded
*/
struct bkey_i *new = errptr_try(bch2_trans_kmalloc_nomemzero(trans,
bkey_bytes(k.k) + sizeof(struct bch_extent_rebalance)));
bkey_bytes(k.k) +
sizeof(struct bch_extent_rebalance) +
sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX));
bkey_reassemble(new, k);
bch2_cut_front(bkey_start_pos(&orig->k), new);
@ -1253,7 +1268,7 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
return bch2_extent_update_i_size_sectors(trans, iter,
min(new->k.p.offset << 9, new_i_size), 0, &inode) ?:
(bch2_inode_opts_get_inode(c, &inode, &opts),
bch2_bkey_set_needs_rebalance(c, &opts, new,
bch2_bkey_set_needs_rebalance(trans, NULL, &opts, new,
SET_NEEDS_REBALANCE_foreground,
op->opts.change_cookie)) ?:
bch2_trans_update(trans, iter, new,
@ -1270,7 +1285,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_intent, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
&op->res, NULL,
BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, op, orig, k, op->new_i_size);
}));
if (ret)

View File

@ -218,6 +218,7 @@ read_attribute(copy_gc_wait);
sysfs_pd_controller_attribute(rebalance);
read_attribute(rebalance_status);
read_attribute(rebalance_scan_pending);
read_attribute(snapshot_delete_status);
read_attribute(recovery_status);
@ -340,6 +341,9 @@ SHOW(bch2_fs)
if (attr == &sysfs_rebalance_status)
bch2_rebalance_status_to_text(out, c);
if (attr == &sysfs_rebalance_scan_pending)
bch2_rebalance_scan_pending_to_text(out, c);
if (attr == &sysfs_snapshot_delete_status)
bch2_snapshot_delete_status_to_text(out, c);
@ -517,6 +521,7 @@ struct attribute *bch2_fs_files[] = {
&sysfs_btree_write_stats,
&sysfs_rebalance_status,
&sysfs_rebalance_scan_pending,
&sysfs_snapshot_delete_status,
&sysfs_recovery_status,

View File

@ -226,6 +226,7 @@
x(EINVAL, erasure_coding_found_btree_node) \
x(EINVAL, option_negative) \
x(EINVAL, topology_repair) \
x(EINVAL, unaligned_io) \
x(BCH_ERR_topology_repair, topology_repair_drop_this_node) \
x(BCH_ERR_topology_repair, topology_repair_drop_prev_node) \
x(BCH_ERR_topology_repair, topology_repair_did_fill_from_scan) \

View File

@ -543,6 +543,17 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
bch_notice(ca, "%s", bch2_member_states[new_state]);
bool do_rebalance_scan =
new_state == BCH_MEMBER_STATE_rw ||
new_state == BCH_MEMBER_STATE_failed;
struct rebalance_scan s = new_state == BCH_MEMBER_STATE_rw
? (struct rebalance_scan) { .type = REBALANCE_SCAN_pending }
: (struct rebalance_scan) { .type = REBALANCE_SCAN_device, .dev = ca->dev_idx };
if (do_rebalance_scan)
try(bch2_set_rebalance_needs_scan(c, s, false));
scoped_guard(mutex, &c->sb_lock) {
struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
SET_BCH_MEMBER_STATE(m, new_state);
@ -552,7 +563,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
if (new_state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
bch2_rebalance_wakeup(c);
if (do_rebalance_scan)
try(bch2_set_rebalance_needs_scan(c, s, true));
return ret;
}
@ -740,6 +752,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path, struct printbuf *err)
if (ret)
goto err;
struct rebalance_scan s = { .type = REBALANCE_SCAN_pending };
try(bch2_set_rebalance_needs_scan(c, s, false));
scoped_guard(rwsem_write, &c->state_lock) {
scoped_guard(mutex, &c->sb_lock) {
SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true);
@ -824,6 +839,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path, struct printbuf *err)
};
kobject_uevent_env(&ca->disk_sb.bdev->bd_device.kobj, KOBJ_CHANGE, envp);
}
try(bch2_set_rebalance_needs_scan(c, s, true));
out:
bch_err_fn(c, ret);
return ret;
@ -936,6 +953,11 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets, struct p
return -EINVAL;
}
bool wakeup_rebalance_pending = nbuckets > ca->mi.nbuckets;
struct rebalance_scan s = { .type = REBALANCE_SCAN_pending };
if (wakeup_rebalance_pending)
try(bch2_set_rebalance_needs_scan(c, s, false));
if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) {
prt_printf(err, "New device size too big (%llu greater than max %u)\n",
nbuckets, BCH_MEMBER_NBUCKETS_MAX);
@ -979,6 +1001,9 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets, struct p
}
bch2_recalc_capacity(c);
if (wakeup_rebalance_pending)
try(bch2_set_rebalance_needs_scan(c, s, true));
return 0;
}

View File

@ -375,9 +375,6 @@ void bch2_fs_read_only(struct bch_fs *c)
BUG_ON(c->btree_write_buffer.inc.keys.nr);
BUG_ON(c->btree_write_buffer.flushing.keys.nr);
bch2_verify_accounting_clean(c);
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
} else {
/* Make sure error counts/counters are persisted */
guard(mutex)(&c->sb_lock);
@ -473,7 +470,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
try(bch2_fs_init_rw(c));
try(bch2_sb_members_v2_init(c));
try(bch2_fs_mark_dirty(c));
clear_bit(BCH_FS_clean_shutdown, &c->flags);
@ -918,7 +914,7 @@ static int bch2_fs_opt_version_init(struct bch_fs *c)
}
if (c->sb.version_incompat_allowed != c->sb.version) {
prt_printf(&p, "\nallowing incompatible features above ");
prt_printf(&p, "\nallowing incompatible features up to ");
bch2_version_to_text(&p, c->sb.version_incompat_allowed);
}
@ -1052,7 +1048,6 @@ static int bch2_fs_init(struct bch_fs *c, struct bch_sb *sb,
init_rwsem(&c->state_lock);
mutex_init(&c->sb_lock);
mutex_init(&c->replicas_gc_lock);
mutex_init(&c->btree_root_lock);
INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);

View File

@ -610,8 +610,7 @@ fsck_err:
int bch2_fs_recovery(struct bch_fs *c)
{
struct bch_sb_field_clean *clean = NULL;
struct jset *last_journal_entry = NULL;
u64 last_seq = 0, blacklist_seq, journal_seq;
struct journal_start_info journal_start = {};
int ret = 0;
if (c->sb.clean) {
@ -637,7 +636,7 @@ int bch2_fs_recovery(struct bch_fs *c)
struct journal_replay **i;
bch_verbose(c, "starting journal read");
ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq);
ret = bch2_journal_read(c, &journal_start);
if (ret)
goto err;
@ -648,22 +647,21 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->opts.read_journal_only)
goto out;
if (mustfix_fsck_err_on(c->sb.clean && !journal_start.clean,
c, clean_but_journal_not_empty,
"filesystem marked clean but journal not empty")) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
}
struct jset *last_journal_entry = NULL;
genradix_for_each_reverse(&c->journal_entries, iter, i)
if (!journal_replay_ignore(*i)) {
last_journal_entry = &(*i)->j;
break;
}
if (mustfix_fsck_err_on(c->sb.clean &&
last_journal_entry &&
!journal_entry_empty(last_journal_entry), c,
clean_but_journal_not_empty,
"filesystem marked clean but journal not empty")) {
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->sb.clean = false;
}
if (!last_journal_entry) {
fsck_err_on(!c->sb.clean, c,
dirty_but_no_journal_entries,
@ -705,11 +703,12 @@ use_clean:
goto err;
}
blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1;
journal_start.start_seq = le64_to_cpu(clean->journal_seq) + 1;
}
c->journal_replay_seq_start = last_seq;
c->journal_replay_seq_end = blacklist_seq - 1;
c->journal_replay_seq_start = journal_start.seq_read_start;
c->journal_replay_seq_end = journal_start.seq_read_end;
zero_out_btree_mem_ptr(&c->journal_keys);
@ -756,13 +755,15 @@ use_clean:
* journal sequence numbers:
*/
if (!c->sb.clean)
journal_seq += JOURNAL_BUF_NR * 4;
journal_start.start_seq += JOURNAL_BUF_NR * 4;
if (blacklist_seq != journal_seq) {
if (journal_start.seq_read_end &&
journal_start.seq_read_end + 1 != journal_start.start_seq) {
u64 blacklist_seq = journal_start.seq_read_end + 1;
ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
blacklist_seq, journal_seq) ?:
blacklist_seq, journal_start.start_seq) ?:
bch2_journal_seq_blacklist_add(c,
blacklist_seq, journal_seq);
blacklist_seq, journal_start.start_seq);
if (ret) {
bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
goto err;
@ -770,8 +771,10 @@ use_clean:
}
ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
journal_seq, last_seq, blacklist_seq - 1) ?:
bch2_fs_journal_start(&c->journal, last_seq, journal_seq);
journal_start.start_seq,
journal_start.seq_read_start,
journal_start.seq_read_end) ?:
bch2_fs_journal_start(&c->journal, journal_start);
if (ret)
goto err;
@ -1014,7 +1017,8 @@ int bch2_fs_initialize(struct bch_fs *c)
* journal_res_get() will crash if called before this has
* set up the journal.pin FIFO and journal.cur pointer:
*/
ret = bch2_fs_journal_start(&c->journal, 1, 1);
struct journal_start_info journal_start = { .start_seq = 1 };
ret = bch2_fs_journal_start(&c->journal, journal_start);
if (ret)
goto err;

View File

@ -11,6 +11,7 @@
#include "alloc/foreground.h"
#include "alloc/replicas.h"
#include "btree/update.h"
#include "init/error.h"
/* allocate journal on a device: */
@ -367,29 +368,30 @@ void bch2_fs_journal_stop(struct journal *j)
clear_bit(JOURNAL_running, &j->flags);
}
int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq)
int bch2_fs_journal_start(struct journal *j, struct journal_start_info info)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_entry_pin_list *p;
struct journal_replay *i, **_i;
struct genradix_iter iter;
bool had_entries = false;
int ret = 0;
/*
*
* XXX pick most recent non blacklisted sequence number
*/
cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c));
info.start_seq = max(info.start_seq, bch2_journal_last_blacklisted_seq(c));
if (cur_seq >= JOURNAL_SEQ_MAX) {
if (info.start_seq >= JOURNAL_SEQ_MAX) {
bch_err(c, "cannot start: journal seq overflow");
return -EINVAL;
}
/* Clean filesystem? */
if (!last_seq)
last_seq = cur_seq;
u64 cur_seq = info.start_seq;
u64 last_seq = info.seq_read_start ?: info.start_seq;
u64 nr = cur_seq - last_seq;
if (nr * sizeof(struct journal_entry_pin_list) > 1U << 30) {
@ -419,6 +421,7 @@ int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq)
j->seq_write_started = cur_seq - 1;
j->seq_ondisk = cur_seq - 1;
j->pin.front = last_seq;
j->last_seq = last_seq;
j->pin.back = cur_seq;
atomic64_set(&j->seq, cur_seq - 1);
@ -441,12 +444,26 @@ int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq)
if (journal_entry_empty(&i->j))
j->last_empty_seq = le64_to_cpu(i->j.seq);
struct bch_devs_list seq_devs = {};
darray_for_each(i->ptrs, ptr)
seq_devs.data[seq_devs.nr++] = ptr->dev;
if (!info.clean) {
struct bch_devs_list seq_devs = {};
darray_for_each(i->ptrs, ptr)
seq_devs.data[seq_devs.nr++] = ptr->dev;
p = journal_seq_pin(j, seq);
bch2_devlist_to_replicas(&p->devs.e, BCH_DATA_journal, seq_devs);
p = journal_seq_pin(j, seq);
bch2_devlist_to_replicas(&p->devs.e, BCH_DATA_journal, seq_devs);
CLASS(printbuf, buf)();
bch2_replicas_entry_to_text(&buf, &p->devs.e);
fsck_err_on(!test_bit(JOURNAL_degraded, &j->flags) &&
!bch2_replicas_marked(c, &p->devs.e),
c, journal_entry_replicas_not_marked,
"superblock not marked as containing replicas for journal entry %llu\n%s",
le64_to_cpu(i->j.seq), buf.buf);
if (bch2_replicas_entry_get(c, &p->devs.e))
p->devs.e.nr_devs = 0;
}
had_entries = true;
}
@ -460,7 +477,9 @@ int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq)
c->last_bucket_seq_cleanup = journal_cur_seq(j);
}
return 0;
try(bch2_replicas_gc_reffed(c));
fsck_err:
return ret;
}
void bch2_journal_set_replay_done(struct journal *j)
@ -585,6 +604,7 @@ void bch2_fs_journal_init_early(struct journal *j)
init_waitqueue_head(&j->reclaim_wait);
init_waitqueue_head(&j->pin_flush_wait);
mutex_init(&j->reclaim_lock);
mutex_init(&j->last_seq_ondisk_lock);
mutex_init(&j->discard_lock);
lockdep_init_map(&j->res_map, "journal res", &res_key, 0);

View File

@ -11,7 +11,7 @@ int bch2_fs_journal_alloc(struct bch_fs *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);
int bch2_fs_journal_start(struct journal *, u64, u64);
int bch2_fs_journal_start(struct journal *, struct journal_start_info);
void bch2_journal_set_replay_done(struct journal *);
void bch2_dev_journal_exit(struct bch_dev *);

View File

@ -187,7 +187,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq)
lockdep_assert_held(&j->lock);
if (__bch2_journal_pin_put(j, seq))
bch2_journal_reclaim_fast(j);
bch2_journal_update_last_seq(j);
bch2_journal_do_writes(j);
/*
@ -235,10 +235,10 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
struct journal_entry_pin_list *pin_list =
journal_seq_pin(j, journal_cur_seq(j));
pin_list->bytes = roundup_pow_of_two(vstruct_bytes(buf->data));
j->dirty_entry_bytes += pin_list->bytes;
size_t bytes = roundup_pow_of_two(vstruct_bytes(buf->data));
journal_seq_pin(j, journal_cur_seq(j))->bytes = bytes;
j->dirty_entry_bytes += bytes;
if (trace_journal_entry_close_enabled() && trace) {
CLASS(printbuf, err)();
@ -280,7 +280,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
* contain either what the old pin protected or what the new pin
* protects.
*
* After the old pin is dropped journal_last_seq() won't include the old
* After the old pin is dropped j->last_seq won't include the old
* pin, so we can only write the updated last_seq on the entry that
* contains whatever the new pin protects.
*
@ -291,7 +291,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t
* Hence, we want update/set last_seq on the current journal entry right
* before we open a new one:
*/
buf->last_seq = journal_last_seq(j);
buf->last_seq = j->last_seq;
buf->data->last_seq = cpu_to_le64(buf->last_seq);
BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq));
@ -358,7 +358,6 @@ static int journal_entry_open(struct journal *j)
lockdep_assert_held(&j->lock);
BUG_ON(journal_entry_is_open(j));
BUG_ON(c->sb.clean);
if (j->blocked)
return bch_err_throw(c, journal_blocked);
@ -416,7 +415,7 @@ static int journal_entry_open(struct journal *j)
/*
* The fifo_push() needs to happen at the same time as j->seq is
* incremented for journal_last_seq() to be calculated correctly
* incremented for j->last_seq to be calculated correctly
*/
atomic64_inc(&j->seq);
journal_pin_list_init(fifo_push_ref(&j->pin), 1);
@ -1092,7 +1091,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
prt_printf(out, "seq:\t%llu\n", journal_cur_seq(j));
prt_printf(out, "seq_ondisk:\t%llu\n", j->seq_ondisk);
prt_printf(out, "last_seq:\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq:\t%llu\n", j->last_seq);
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
prt_printf(out, "watermark:\t%s\n", bch2_watermarks[j->watermark]);

View File

@ -129,11 +129,6 @@ static inline bool journal_low_on_space(struct journal *j)
/* Sequence number of oldest dirty journal entry */
static inline u64 journal_last_seq(struct journal *j)
{
return j->pin.front;
}
static inline u64 journal_cur_seq(struct journal *j)
{
return atomic64_read(&j->seq);

View File

@ -1346,18 +1346,17 @@ fsck_err:
return ret;
}
int bch2_journal_read(struct bch_fs *c,
u64 *last_seq,
u64 *blacklist_seq,
u64 *start_seq)
int bch2_journal_read(struct bch_fs *c, struct journal_start_info *info)
{
struct journal_list jlist;
struct journal_replay *i, **_i;
struct genradix_iter radix_iter;
bool degraded = false, last_write_torn = false;
bool last_write_torn = false;
u64 seq;
int ret = 0;
memset(info, 0, sizeof(*info));
closure_init_stack(&jlist.cl);
mutex_init(&jlist.lock);
jlist.last_seq = 0;
@ -1377,7 +1376,7 @@ int bch2_journal_read(struct bch_fs *c,
system_unbound_wq,
&jlist.cl);
else
degraded = true;
set_bit(JOURNAL_degraded, &c->journal.flags);
}
while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2))
@ -1386,10 +1385,6 @@ int bch2_journal_read(struct bch_fs *c,
if (jlist.ret)
return jlist.ret;
*last_seq = 0;
*start_seq = 0;
*blacklist_seq = 0;
/*
* Find most recent flush entry, and ignore newer non flush entries -
* those entries will be blacklisted:
@ -1400,8 +1395,8 @@ int bch2_journal_read(struct bch_fs *c,
if (journal_replay_ignore(i))
continue;
if (!*start_seq)
*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
if (!info->start_seq)
info->start_seq = le64_to_cpu(i->j.seq) + 1;
if (JSET_NO_FLUSH(&i->j)) {
i->ignore_blacklisted = true;
@ -1426,27 +1421,28 @@ int bch2_journal_read(struct bch_fs *c,
le64_to_cpu(i->j.seq)))
i->j.last_seq = i->j.seq;
*last_seq = le64_to_cpu(i->j.last_seq);
*blacklist_seq = le64_to_cpu(i->j.seq) + 1;
info->seq_read_start = le64_to_cpu(i->j.last_seq);
info->seq_read_end = le64_to_cpu(i->j.seq);
info->clean = journal_entry_empty(&i->j);
break;
}
if (!*start_seq) {
if (!info->start_seq) {
bch_info(c, "journal read done, but no entries found");
return 0;
}
if (!*last_seq) {
if (!info->seq_read_end) {
fsck_err(c, dirty_but_no_journal_entries_post_drop_nonflushes,
"journal read done, but no entries found after dropping non-flushes");
return 0;
}
u64 drop_before = *last_seq;
u64 drop_before = info->seq_read_start;
{
CLASS(printbuf, buf)();
prt_printf(&buf, "journal read done, replaying entries %llu-%llu",
*last_seq, *blacklist_seq - 1);
info->seq_read_start, info->seq_read_end);
/*
* Drop blacklisted entries and entries older than last_seq (or start of
@ -1457,9 +1453,11 @@ int bch2_journal_read(struct bch_fs *c,
prt_printf(&buf, " (rewinding from %llu)", c->opts.journal_rewind);
}
*last_seq = drop_before;
if (*start_seq != *blacklist_seq)
prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1);
info->seq_read_start = drop_before;
if (info->seq_read_end + 1 != info->start_seq)
prt_printf(&buf, " (unflushed %llu-%llu)",
info->seq_read_end + 1,
info->start_seq - 1);
bch_info(c, "%s", buf.buf);
}
@ -1483,7 +1481,7 @@ int bch2_journal_read(struct bch_fs *c,
}
}
try(bch2_journal_check_for_missing(c, drop_before, *blacklist_seq - 1));
try(bch2_journal_check_for_missing(c, drop_before, info->seq_read_end));
genradix_for_each(&c->journal_entries, radix_iter, _i) {
union bch_replicas_padded replicas = {
@ -1516,17 +1514,6 @@ int bch2_journal_read(struct bch_fs *c,
replicas_entry_add_dev(&replicas.e, ptr->dev);
bch2_replicas_entry_sort(&replicas.e);
CLASS(printbuf, buf)();
bch2_replicas_entry_to_text(&buf, &replicas.e);
if (!degraded &&
!bch2_replicas_marked(c, &replicas.e) &&
(le64_to_cpu(i->j.seq) == *last_seq ||
fsck_err(c, journal_entry_replicas_not_marked,
"superblock not marked as containing replicas for journal entry %llu\n%s",
le64_to_cpu(i->j.seq), buf.buf)))
try(bch2_mark_replicas(c, &replicas.e));
}
fsck_err:
return ret;

View File

@ -70,6 +70,6 @@ struct u64_range {
struct u64_range bch2_journal_entry_missing_range(struct bch_fs *, u64, u64);
int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
int bch2_journal_read(struct bch_fs *, struct journal_start_info *);
#endif /* _BCACHEFS_JOURNAL_READ_H */

View File

@ -211,7 +211,7 @@ void bch2_journal_space_available(struct journal *j)
continue;
while (ja->dirty_idx != ja->cur_idx &&
ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j))
ja->bucket_seq[ja->dirty_idx] < j->last_seq)
ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
while (ja->dirty_idx_ondisk != ja->dirty_idx &&
@ -325,37 +325,66 @@ void bch2_journal_do_discards(struct journal *j)
* entry, holding it open to ensure it gets replayed during recovery:
*/
void bch2_journal_reclaim_fast(struct journal *j)
void bch2_journal_update_last_seq(struct journal *j)
{
bool popped = false;
lockdep_assert_held(&j->lock);
/*
* Unpin journal entries whose reference counts reached zero, meaning
* all btree nodes got written out
*/
u64 old = j->last_seq;
struct journal_entry_pin_list *pin_list;
while (!fifo_empty(&j->pin) &&
j->pin.front <= j->seq_ondisk &&
!atomic_read(&(pin_list = &fifo_peek_front(&j->pin))->count)) {
while (j->last_seq < j->pin.back &&
j->last_seq <= j->seq_ondisk &&
!atomic_read(&(pin_list = journal_seq_pin(j, j->last_seq))->count))
j->last_seq++;
if (WARN_ON(j->dirty_entry_bytes < pin_list->bytes))
pin_list->bytes = j->dirty_entry_bytes;
j->dirty_entry_bytes -= pin_list->bytes;
pin_list->bytes = 0;
j->pin.front++;
popped = true;
}
if (popped) {
if (old != j->last_seq) {
bch2_journal_space_available(j);
__closure_wake_up(&j->reclaim_flush_wait);
}
}
void bch2_journal_update_last_seq_ondisk(struct journal *j, u64 last_seq_ondisk)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union bch_replicas_padded replicas;
unsigned nr_refs = 0;
size_t dirty_entry_bytes = 0;
scoped_guard(mutex, &j->last_seq_ondisk_lock)
while (j->last_seq_ondisk < last_seq_ondisk) {
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, j->last_seq_ondisk);
if (pin_list->devs.e.nr_devs) {
if (nr_refs &&
!bch2_replicas_entry_eq(&replicas.e, &pin_list->devs.e)) {
bch2_replicas_entry_put_many(c, &replicas.e, nr_refs);
nr_refs = 0;
}
memcpy(&replicas, &pin_list->devs, replicas_entry_bytes(&pin_list->devs.e));
pin_list->devs.e.nr_devs = 0;
nr_refs++;
}
dirty_entry_bytes += pin_list->bytes;
pin_list->bytes = 0;
j->last_seq_ondisk++;
}
scoped_guard(spinlock, &j->lock) {
if (WARN_ON(j->dirty_entry_bytes < dirty_entry_bytes))
dirty_entry_bytes = j->dirty_entry_bytes;
j->dirty_entry_bytes -= dirty_entry_bytes;
}
if (nr_refs)
bch2_replicas_entry_put_many(c, &replicas.e, nr_refs);
}
bool __bch2_journal_pin_put(struct journal *j, u64 seq)
{
struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
@ -367,7 +396,7 @@ void bch2_journal_pin_put(struct journal *j, u64 seq)
{
if (__bch2_journal_pin_put(j, seq)) {
guard(spinlock)(&j->lock);
bch2_journal_reclaim_fast(j);
bch2_journal_update_last_seq(j);
}
}
@ -394,7 +423,7 @@ static inline bool __journal_pin_drop(struct journal *j,
* writing a new last_seq will now make another bucket available:
*/
return atomic_dec_and_test(&pin_list->count) &&
pin_list == &fifo_peek_front(&j->pin);
pin_list == journal_seq_pin(j, j->last_seq);
}
void bch2_journal_pin_drop(struct journal *j,
@ -402,7 +431,7 @@ void bch2_journal_pin_drop(struct journal *j,
{
guard(spinlock)(&j->lock);
if (__journal_pin_drop(j, pin))
bch2_journal_reclaim_fast(j);
bch2_journal_update_last_seq(j);
}
static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
@ -453,7 +482,7 @@ void bch2_journal_pin_copy(struct journal *j,
u64 seq = READ_ONCE(src->seq);
if (seq < journal_last_seq(j)) {
if (seq < j->last_seq) {
/*
* bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
* the src pin - with the pin dropped, the entry to pin might no
@ -468,13 +497,13 @@ void bch2_journal_pin_copy(struct journal *j,
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
bch2_journal_update_last_seq(j);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
if (seq == journal_last_seq(j))
if (seq == j->last_seq)
journal_wake(j);
}
@ -485,19 +514,19 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
bool wake;
scoped_guard(spinlock, &j->lock) {
BUG_ON(seq < journal_last_seq(j));
BUG_ON(seq < j->last_seq);
bool reclaim = __journal_pin_drop(j, pin);
bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
bch2_journal_update_last_seq(j);
/*
* If the journal is currently full, we might want to call flush_fn
* immediately:
*/
wake = seq == journal_last_seq(j);
wake = seq == j->last_seq;
}
if (wake)
@ -929,8 +958,8 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
*/
guard(spinlock)(&j->lock);
return !test_bit(JOURNAL_replay_done, &j->flags) ||
journal_last_seq(j) > seq_to_flush ||
!fifo_used(&j->pin);
j->last_seq > seq_to_flush ||
j->last_seq == j->pin.back;
}
bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
@ -964,39 +993,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
try(bch2_journal_error(j));
guard(mutex)(&c->replicas_gc_lock);
bch2_replicas_gc_start(c, 1 << BCH_DATA_journal);
/*
* Now that we've populated replicas_gc, write to the journal to mark
* active journal devices. This handles the case where the journal might
* be empty. Otherwise we could clear all journal replicas and
* temporarily put the fs into an unrecoverable state. Journal recovery
* expects to find devices marked for journal data on unclean mount.
*/
int ret = bch2_journal_meta(&c->journal);
if (ret)
goto err;
seq = 0;
scoped_guard(spinlock, &j->lock)
while (!ret) {
seq = max(seq, journal_last_seq(j));
if (seq > j->seq_ondisk)
break;
union bch_replicas_padded replicas;
memcpy(&replicas, &journal_seq_pin(j, seq)->devs, sizeof(replicas));
seq++;
if (replicas.e.nr_devs) {
spin_unlock(&j->lock);
ret = bch2_mark_replicas(c, &replicas.e);
spin_lock(&j->lock);
}
}
err:
return bch2_replicas_gc_end(c, ret);
return 0;
}
bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq)
@ -1010,7 +1007,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
if (!test_bit(JOURNAL_running, &j->flags))
return true;
*seq = max(*seq, j->pin.front);
*seq = max(*seq, j->last_seq);
if (*seq >= j->pin.back)
return true;

View File

@ -43,7 +43,9 @@ journal_seq_pin(struct journal *j, u64 seq)
return &j->pin.data[seq & j->pin.mask];
}
void bch2_journal_reclaim_fast(struct journal *);
void bch2_journal_update_last_seq(struct journal *);
void bch2_journal_update_last_seq_ondisk(struct journal *, u64);
bool __bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_put(struct journal *, u64);
void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *);

View File

@ -149,6 +149,7 @@ enum journal_space_from {
};
#define JOURNAL_FLAGS() \
x(degraded) \
x(replay_done) \
x(running) \
x(may_skip_flush) \
@ -265,6 +266,8 @@ struct journal {
u64 front, back, size, mask;
struct journal_entry_pin_list *data;
} pin;
u64 last_seq;
size_t dirty_entry_bytes;
struct journal_space space[journal_space_nr];
@ -276,6 +279,7 @@ struct journal {
spinlock_t err_lock;
struct mutex reclaim_lock;
struct mutex last_seq_ondisk_lock;
/*
* Used for waiting until journal reclaim has freed up space in the
* journal:
@ -352,4 +356,11 @@ struct journal_entry_res {
unsigned u64s;
};
struct journal_start_info {
u64 seq_read_start;
u64 seq_read_end;
u64 start_seq;
bool clean;
};
#endif /* _BCACHEFS_JOURNAL_TYPES_H */
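A simplified standalone illustration of how the new struct's fields fit together (editor-added; the struct mirror and the example numbers are hypothetical, while the derivation matches what bch2_fs_journal_start() does earlier in this commit): seq_read_start..seq_read_end is the range the journal read found on disk, and start_seq is where the running journal begins writing.

#include <stdio.h>

/* simplified mirror of the new struct; the real one uses u64/bool */
struct journal_start_info {
	unsigned long long seq_read_start;	/* oldest on-disk entry that needs replay */
	unsigned long long seq_read_end;	/* newest flush entry found by the journal read */
	unsigned long long start_seq;		/* first sequence number new writes will use */
	int clean;
};

int main(void)
{
	/* e.g. the journal read found flushed entries 100..110; new writes start at 111 */
	struct journal_start_info info = {
		.seq_read_start	= 100,
		.seq_read_end	= 110,
		.start_seq	= 111,
	};

	/* same derivation bch2_fs_journal_start() performs (it uses the ?: shorthand) */
	unsigned long long cur_seq  = info.start_seq;
	unsigned long long last_seq = info.seq_read_start ? info.seq_read_start : info.start_seq;

	/* pin FIFO covers last_seq..cur_seq-1; j->seq is set to cur_seq-1 */
	printf("pin fifo spans %llu..%llu, next write is %llu\n", last_seq, cur_seq - 1, cur_seq);
	return 0;
}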

View File

@ -189,6 +189,7 @@ static CLOSURE_CALLBACK(journal_write_done)
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
u64 seq = le64_to_cpu(w->data->seq);
u64 seq_wrote = seq;
int err = 0;
bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
@ -197,8 +198,12 @@ static CLOSURE_CALLBACK(journal_write_done)
if (w->had_error) {
struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, seq)->devs.e;
bch2_replicas_entry_put(c, r);
bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written);
err = bch2_replicas_entry_get(c, r);
if (err)
r->nr_devs = 0;
}
if (!w->devs_written.nr)
@ -225,7 +230,6 @@ static CLOSURE_CALLBACK(journal_write_done)
BUG_ON(seq < j->pin.front);
if (err && (!j->err_seq || seq < j->err_seq))
j->err_seq = seq;
w->write_done = true;
if (!j->free_buf || j->free_buf_size < w->buf_size) {
swap(j->free_buf, w->data);
@ -243,22 +247,31 @@ static CLOSURE_CALLBACK(journal_write_done)
}
bool completed = false;
bool do_discards = false;
bool last_seq_ondisk_updated = false;
again:
for (seq = journal_last_unwritten_seq(j);
seq <= journal_cur_seq(j);
seq++) {
w = j->buf + (seq & JOURNAL_BUF_MASK);
if (!w->write_done)
if (!w->write_done && seq != seq_wrote)
break;
if (!j->err_seq && !w->noflush) {
j->flushed_seq_ondisk = seq;
j->last_seq_ondisk = w->last_seq;
if (j->last_seq_ondisk < w->last_seq) {
spin_unlock(&j->lock);
/*
* this needs to happen _before_ updating
* j->flushed_seq_ondisk, for flushing to work
* properly - when the flush completes, replicas
* refs need to have been dropped
*/
bch2_journal_update_last_seq_ondisk(j, w->last_seq);
last_seq_ondisk_updated = true;
spin_lock(&j->lock);
goto again;
}
closure_wake_up(&c->freelist_wait);
bch2_reset_alloc_cursors(c);
do_discards = true;
j->flushed_seq_ondisk = seq;
}
j->seq_ondisk = seq;
@ -277,8 +290,10 @@ static CLOSURE_CALLBACK(journal_write_done)
completed = true;
}
j->buf[seq_wrote & JOURNAL_BUF_MASK].write_done = true;
if (completed) {
bch2_journal_reclaim_fast(j);
bch2_journal_update_last_seq(j);
bch2_journal_space_available(j);
track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
@ -286,6 +301,8 @@ static CLOSURE_CALLBACK(journal_write_done)
journal_wake(j);
}
j->pin.front = min(j->pin.back, j->last_seq_ondisk);
if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
struct journal_buf *buf = journal_cur_buf(j);
@ -308,8 +325,11 @@ static CLOSURE_CALLBACK(journal_write_done)
bch2_journal_do_writes(j);
spin_unlock(&j->lock);
if (do_discards)
if (last_seq_ondisk_updated) {
bch2_reset_alloc_cursors(c);
closure_wake_up(&c->freelist_wait);
bch2_do_discards(c);
}
closure_put(&c->cl);
}
@ -635,7 +655,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
BUG_ON(!w->write_started);
BUG_ON(w->write_allocated);
BUG_ON(w->write_done);
@ -702,9 +721,11 @@ CLOSURE_CALLBACK(bch2_journal_write)
*/
struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs.e;
bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written);
ret = bch2_mark_replicas(c, r);
if (ret)
ret = bch2_replicas_entry_get(c, r);
if (ret) {
r->nr_devs = 0;
goto err;
}
if (c->opts.nochanges)
goto no_io;

View File

@ -108,6 +108,11 @@ static const char * const __bch2_fs_usage_types[] = {
NULL
};
const char * const __bch2_rebalance_accounting_types[] = {
BCH_REBALANCE_ACCOUNTING()
NULL
};
#undef x
static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
@ -132,6 +137,7 @@ PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt);
PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
PRT_STR_OPT_BOUNDSCHECKED(rebalance_accounting_type, enum bch_rebalance_accounting_type);
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err)
@ -525,7 +531,8 @@ void bch2_opts_to_text(struct printbuf *out,
}
}
static int opt_hook_io(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, bool post)
static int opt_hook_io(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id,
u64 v, bool post)
{
if (!test_bit(BCH_FS_started, &c->flags))
return 0;
@ -544,11 +551,23 @@ static int opt_hook_io(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_
.inum = inum,
};
try(bch2_set_rebalance_needs_scan(c, s));
if (post)
bch2_rebalance_wakeup(c);
try(bch2_set_rebalance_needs_scan(c, s, post));
break;
}
case Opt_metadata_target:
case Opt_metadata_checksum:
case Opt_metadata_replicas:
try(bch2_set_rebalance_needs_scan(c,
(struct rebalance_scan) { .type = REBALANCE_SCAN_metadata, .dev = inum }, post));
break;
case Opt_durability:
if (!post && v > ca->mi.durability)
try(bch2_set_rebalance_needs_scan(c,
(struct rebalance_scan) { .type = REBALANCE_SCAN_pending}, post));
try(bch2_set_rebalance_needs_scan(c,
(struct rebalance_scan) { .type = REBALANCE_SCAN_device, .dev = inum }, post));
break;
default:
break;
}
@ -578,7 +597,7 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum b
}
if (change)
try(opt_hook_io(c, ca, inum, id, false));
try(opt_hook_io(c, ca, inum, id, v, false));
return 0;
}
@ -594,7 +613,7 @@ int bch2_opts_hooks_pre_set(struct bch_fs *c)
void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
enum bch_opt_id id, u64 v)
{
opt_hook_io(c, ca, inum, id, true);
opt_hook_io(c, ca, inum, id, v, true);
switch (id) {
case Opt_rebalance_enabled:

View File

@ -25,6 +25,7 @@ extern const char * const __bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
extern const char * const __bch2_rebalance_accounting_types[];
extern const char * const bch2_d_types[];
void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type);
@ -34,6 +35,7 @@ void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt);
void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
void bch2_prt_rebalance_accounting_type(struct printbuf *, enum bch_rebalance_accounting_type);
static inline const char *bch2_d_type_str(unsigned d_type)
{

View File

@ -256,18 +256,10 @@ const struct bch_sb_field_ops bch_sb_field_ops_clean = {
.to_text = bch2_sb_clean_to_text,
};
int bch2_fs_mark_dirty(struct bch_fs *c)
void bch2_fs_mark_dirty(struct bch_fs *c)
{
/*
* Unconditionally write superblock, to verify it hasn't changed before
* we go rw:
*/
guard(mutex)(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
return bch2_write_super(c);
}
void bch2_fs_mark_clean(struct bch_fs *c)
@ -277,7 +269,6 @@ void bch2_fs_mark_clean(struct bch_fs *c)
unsigned u64s;
int ret;
guard(mutex)(&c->sb_lock);
if (BCH_SB_CLEAN(c->disk_sb.sb))
return;
@ -321,6 +312,4 @@ void bch2_fs_mark_clean(struct bch_fs *c)
}
bch2_journal_pos_from_member_info_set(c);
bch2_write_super(c);
}

View File

@ -10,7 +10,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **
extern const struct bch_sb_field_ops bch_sb_field_ops_clean;
int bch2_fs_mark_dirty(struct bch_fs *);
void bch2_fs_mark_dirty(struct bch_fs *);
void bch2_fs_mark_clean(struct bch_fs *);
#endif /* _BCACHEFS_SB_CLEAN_H */

View File

@ -110,7 +110,16 @@
BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set)\
x(btree_node_accounting, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
BCH_FSCK_ERR_accounting_mismatch)
BCH_FSCK_ERR_accounting_mismatch) \
x(rebalance_v2, \
BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work), \
BCH_FSCK_ERR_accounting_mismatch, \
BCH_FSCK_ERR_extent_io_opts_not_set)
#define UPGRADE_TABLE_INCOMPAT() \
x(rebalance_v2, \
BIT_ULL(BCH_RECOVERY_PASS_check_rebalance_work), \
BCH_FSCK_ERR_extent_io_opts_not_set)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
@ -175,17 +184,32 @@ struct upgrade_downgrade_entry {
UPGRADE_TABLE()
#undef x
#define x(ver, passes, ...) static const u16 upgrade_incompat_##ver##_errors[] = { __VA_ARGS__ };
UPGRADE_TABLE_INCOMPAT()
#undef x
static const struct upgrade_downgrade_entry upgrade_table[] = {
#define x(ver, passes, ...) { \
.recovery_passes = passes, \
.version = bcachefs_metadata_version_##ver,\
.nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \
.errors = upgrade_##ver##_errors, \
#define x(ver, passes, ...) { \
.recovery_passes = passes, \
.version = bcachefs_metadata_version_##ver, \
.nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \
.errors = upgrade_##ver##_errors, \
},
UPGRADE_TABLE()
#undef x
};
static const struct upgrade_downgrade_entry upgrade_table_incompat[] = {
#define x(ver, passes, ...) { \
.recovery_passes = passes, \
.version = bcachefs_metadata_version_##ver, \
.nr_errors = ARRAY_SIZE(upgrade_incompat_##ver##_errors), \
.errors = upgrade_incompat_##ver##_errors, \
},
UPGRADE_TABLE_INCOMPAT()
#undef x
};
static int have_stripes(struct bch_fs *c)
{
if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b))
@ -219,17 +243,17 @@ int bch2_sb_set_upgrade_extra(struct bch_fs *c)
return ret < 0 ? ret : 0;
}
void bch2_sb_set_upgrade(struct bch_fs *c,
unsigned old_version,
unsigned new_version)
static void __bch2_sb_set_upgrade(struct bch_fs *c,
unsigned old_version,
unsigned new_version,
const struct upgrade_downgrade_entry *table,
size_t nr_entries)
{
lockdep_assert_held(&c->sb_lock);
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
for (const struct upgrade_downgrade_entry *i = upgrade_table;
i < upgrade_table + ARRAY_SIZE(upgrade_table);
i++)
for (const struct upgrade_downgrade_entry *i = table; i < table + nr_entries; i++)
if (i->version > old_version && i->version <= new_version) {
u64 passes = i->recovery_passes;
@ -245,6 +269,24 @@ void bch2_sb_set_upgrade(struct bch_fs *c,
}
}
void bch2_sb_set_upgrade(struct bch_fs *c,
unsigned old_version,
unsigned new_version)
{
return __bch2_sb_set_upgrade(c, old_version, new_version,
upgrade_table,
ARRAY_SIZE(upgrade_table));
}
void bch2_sb_set_upgrade_incompat(struct bch_fs *c,
unsigned old_version,
unsigned new_version)
{
return __bch2_sb_set_upgrade(c, old_version, new_version,
upgrade_table_incompat,
ARRAY_SIZE(upgrade_table_incompat));
}
#define x(ver, passes, ...) static const u16 downgrade_##ver##_errors[] = { __VA_ARGS__ };
DOWNGRADE_TABLE()
#undef x

View File

@ -6,6 +6,7 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
int bch2_sb_downgrade_update(struct bch_fs *);
void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
void bch2_sb_set_upgrade_incompat(struct bch_fs *, unsigned, unsigned);
int bch2_sb_set_upgrade_extra(struct bch_fs *);
void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);

View File

@ -160,6 +160,9 @@ enum bch_fsck_flags {
x(extent_ptrs_redundant_stripe, 139, 0) \
x(extent_ptrs_unwritten, 140, 0) \
x(extent_ptrs_written_and_unwritten, 141, 0) \
x(extent_ptrs_all_invalid, 338, 0) \
x(extent_rebalance_bad_pending, 332, 0) \
x(extent_rebalance_bad_hipri, 333, 0) \
x(ptr_to_invalid_device, 142, 0) \
x(ptr_to_removed_device, 322, FSCK_AUTOFIX) \
x(ptr_to_duplicate_device, 143, 0) \
@ -339,9 +342,15 @@ enum bch_fsck_flags {
x(dirent_cf_name_too_big, 304, 0) \
x(dirent_stray_data_after_cf_name, 305, 0) \
x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \
x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \
x(validate_error_in_commit, 329, 0) \
x(MAX, 330, 0)
x(extent_io_opts_not_set, 330, FSCK_AUTOFIX) \
x(extent_io_opts_unneeded, 331, FSCK_AUTOFIX) \
x(rebalance_bp_to_missing_btree_ptr, 310, FSCK_AUTOFIX) \
x(rebalance_bp_to_leaf_node_key, 334, FSCK_AUTOFIX) \
x(btree_ptr_with_no_rebalance_bp, 335, FSCK_AUTOFIX) \
x(btree_ptr_with_bad_rebalance_bp, 336, FSCK_AUTOFIX) \
x(btree_ptr_to_bad_rebalance_bp, 337, FSCK_AUTOFIX) \
x(MAX, 339, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,

View File

@ -1021,6 +1021,11 @@ int bch2_write_super(struct bch_fs *c)
closure_init_stack(cl);
memset(&sb_written, 0, sizeof(sb_written));
if (bch2_sb_has_journal(c->disk_sb.sb))
bch2_fs_mark_dirty(c);
else
bch2_fs_mark_clean(c);
/*
* Note: we do writes to RO devices here, and we might want to change
* that in the future.
@ -1276,6 +1281,8 @@ void bch2_sb_upgrade_incompat(struct bch_fs *c)
c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb,
max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version));
bch2_sb_set_upgrade_incompat(c, c->sb.version_incompat_allowed, c->sb.version);
bch2_write_super(c);
}

View File

@ -1136,8 +1136,10 @@ void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c)
}
scoped_guard(mutex, &d->progress_lock) {
bch2_snapshot_delete_nodes_to_text(out, d);
prt_str(out, "Current position: ");
bch2_bbpos_to_text(out, d->pos);
prt_newline(out);
bch2_snapshot_delete_nodes_to_text(out, d);
}
}

View File

@ -71,7 +71,7 @@ enum printbuf_si {
PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */
};
#define PRINTBUF_INLINE_TABSTOPS 6
#define PRINTBUF_INLINE_TABSTOPS 8
struct printbuf {
char *buf;

View File

@ -86,7 +86,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
/* bios must be 512 byte aligned: */
if ((offset|iter->count) & (SECTOR_SIZE - 1))
return -EINVAL;
return bch_err_throw(c, unaligned_io);
ret = min_t(loff_t, iter->count,
max_t(loff_t, 0, i_size_read(&inode->v) - offset));
@ -627,7 +627,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
goto err_put_write_ref;
if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
ret = -EINVAL;
ret = bch_err_throw(c, unaligned_io);
goto err_put_write_ref;
}