Update bcachefs sources to 5fe20ac58af4 bcachefs: Don't bail out of check_inode() if check_has_case_sensitive() fails

Kent Overstreet 2025-10-30 16:42:35 -04:00
parent b2b4a5e78b
commit dc8c10a4b0
38 changed files with 610 additions and 532 deletions

View File

@ -1 +1 @@
b552eb12225133c8bf869b461faba6b72e35d2be
5fe20ac58af402e8ad9ace0bcf9daad524e3005d

View File

@ -440,25 +440,39 @@ static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e)
return true;
}
void bch2_accounting_mem_gc(struct bch_fs *c)
void __bch2_accounting_maybe_kill(struct bch_fs *c, struct bpos pos)
{
struct bch_accounting_mem *acc = &c->accounting;
struct disk_accounting_pos acc_k;
bpos_to_disk_accounting_pos(&acc_k, pos);
guard(percpu_write)(&c->mark_lock);
struct accounting_mem_entry *dst = acc->k.data;
if (acc_k.type != BCH_DISK_ACCOUNTING_replicas)
return;
darray_for_each(acc->k, src) {
if (accounting_mem_entry_is_zero(src)) {
free_percpu(src->v[0]);
free_percpu(src->v[1]);
} else {
*dst++ = *src;
guard(mutex)(&c->sb_lock);
scoped_guard(percpu_write, &c->mark_lock) {
struct bch_accounting_mem *acc = &c->accounting;
unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &pos);
if (idx < acc->k.nr) {
struct accounting_mem_entry *e = acc->k.data + idx;
if (!accounting_mem_entry_is_zero(e))
return;
free_percpu(e->v[0]);
free_percpu(e->v[1]);
swap(*e, darray_last(acc->k));
--acc->k.nr;
eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, NULL);
}
bch2_replicas_entry_kill(c, &acc_k.replicas);
}
acc->k.nr = dst - acc->k.data;
eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, NULL);
bch2_write_super(c);
}
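For illustration: __bch2_accounting_maybe_kill() above drops a zeroed entry from the eytzinger-ordered accounting array by swapping it with the last element, shrinking the array, and re-sorting. A minimal standalone sketch of that swap-with-last removal pattern on an ordinary sorted array (plain qsort() stands in for eytzinger0_sort(); none of these names are the bcachefs helpers):

#include <stdio.h>
#include <stdlib.h>

static int cmp_int(const void *a, const void *b)
{
    int l = *(const int *) a, r = *(const int *) b;
    return (l > r) - (l < r);
}

/* Remove entries[idx] without memmove(): swap with the last slot, then re-sort. */
static void remove_and_resort(int *entries, size_t *nr, size_t idx)
{
    int tmp = entries[idx];
    entries[idx] = entries[*nr - 1];
    entries[*nr - 1] = tmp;
    (*nr)--;
    qsort(entries, *nr, sizeof(entries[0]), cmp_int);
}

int main(void)
{
    int entries[] = { 2, 5, 9, 14, 21 };
    size_t nr = 5;

    remove_and_resort(entries, &nr, 2);   /* drop the value 9 */
    for (size_t i = 0; i < nr; i++)
        printf("%d ", entries[i]);        /* prints: 2 5 14 21 */
    printf("\n");
    return 0;
}
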
/*
@ -472,9 +486,6 @@ void bch2_accounting_mem_gc(struct bch_fs *c)
int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
{
struct bch_accounting_mem *acc = &c->accounting;
int ret = 0;
darray_init(usage);
guard(percpu_read)(&c->mark_lock);
darray_for_each(acc->k, i) {
@ -492,24 +503,19 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage)
bch2_accounting_mem_read_counters(acc, i - acc->k.data, &sectors, 1, false);
u.r.sectors = sectors;
ret = darray_make_room(usage, replicas_usage_bytes(&u.r));
if (ret)
break;
try(darray_make_room(usage, replicas_usage_bytes(&u.r)));
memcpy(&darray_top(*usage), &u.r, replicas_usage_bytes(&u.r));
usage->nr += replicas_usage_bytes(&u.r);
}
if (ret)
darray_exit(usage);
return ret;
return 0;
}
int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned accounting_types_mask)
{
struct bch_accounting_mem *acc = &c->accounting;
int ret = 0;
darray_init(out_buf);
@ -521,10 +527,8 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc
if (!(accounting_types_mask & BIT(a_p.type)))
continue;
ret = darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
sizeof(u64) * i->nr_counters);
if (ret)
break;
try(darray_make_room(out_buf, sizeof(struct bkey_i_accounting) +
sizeof(u64) * i->nr_counters));
struct bkey_i_accounting *a_out =
bkey_accounting_init((void *) &darray_top(*out_buf));
@ -537,9 +541,7 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc
out_buf->nr += bkey_bytes(&a_out->k);
}
if (ret)
darray_exit(out_buf);
return ret;
return 0;
}
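Several hunks here (and throughout this commit) replace the open-coded "ret = fn(); if (ret) break/return ret;" pattern with try(), plus the errptr_try()/bkey_try() variants for pointer and bkey returns. A rough userspace approximation of the early-return idiom, assuming a macro along these lines (the in-tree definition may differ):

#include <errno.h>
#include <stdio.h>

/* Illustrative only: evaluate an int-returning expression, bail out on error. */
#define try(_expr)                          \
do {                                        \
    int _ret = (_expr);                     \
    if (_ret)                               \
        return _ret;                        \
} while (0)

static int reserve(size_t bytes)
{
    return bytes > 4096 ? -ENOMEM : 0;
}

static int fill_buffer(size_t bytes)
{
    try(reserve(bytes));    /* replaces: ret = reserve(...); if (ret) return ret; */
    printf("reserved %zu bytes\n", bytes);
    return 0;
}

int main(void)
{
    return fill_buffer(512) ?: fill_buffer(1 << 20);
}
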
static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc)

View File

@ -43,6 +43,21 @@ static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst,
dst->k.bversion = src.k->bversion;
}
void __bch2_accounting_maybe_kill(struct bch_fs *, struct bpos pos);
static inline void bch2_accounting_accumulate_maybe_kill(struct bch_fs *c,
struct bkey_i_accounting *dst,
struct bkey_s_c_accounting src)
{
bch2_accounting_accumulate(dst, src);
for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++)
if (dst->v.d[i])
return;
__bch2_accounting_maybe_kill(c, dst->k.p);
}
static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
enum bch_data_type data_type,
s64 sectors)
@ -137,7 +152,6 @@ enum bch_accounting_mode {
int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode);
void bch2_accounting_mem_gc(struct bch_fs *);
static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc)
{
@ -205,13 +219,10 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans,
while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &a.k->p)) >= acc->k.nr) {
int ret = 0;
if (unlikely(write_locked))
ret = bch2_accounting_mem_insert_locked(c, a, mode);
try(bch2_accounting_mem_insert_locked(c, a, mode));
else
ret = bch2_accounting_mem_insert(c, a, mode);
if (ret)
return ret;
try(bch2_accounting_mem_insert(c, a, mode));
}
struct accounting_mem_entry *e = &acc->k.data[idx];

View File

@ -12,6 +12,21 @@
#include <linux/sort.h>
DEFINE_CLASS(bch_replicas_cpu, struct bch_replicas_cpu,
kfree(_T.entries),
(struct bch_replicas_cpu) {}, void)
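DEFINE_CLASS()/CLASS() are the kernel's scope-based cleanup helpers from <linux/cleanup.h>: a class pairs an initializer with a destructor, so the CLASS(bch_replicas_cpu, ...) declarations further down free their entries automatically on every exit path. A loose userspace analogue built on __attribute__((cleanup)), shown only to illustrate the shape (these names are hypothetical, not the kernel macros):

#include <stdlib.h>

struct buf { char *data; };

static void buf_release(struct buf *b)
{
    free(b->data);              /* runs automatically when b leaves scope */
}

/* Declare a struct buf that is torn down at end of scope. */
#define SCOPED_BUF(name, size)                                  \
    struct buf name __attribute__((cleanup(buf_release))) =     \
        { .data = malloc(size) }

static int demo(int fail_early)
{
    SCOPED_BUF(b, 64);

    if (fail_early)
        return -1;              /* no explicit free() on the error path */

    /* ... use b.data ... */
    return 0;
}

int main(void)
{
    return demo(0);
}
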
static inline struct bch_replicas_entry_v1 *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
#define for_each_cpu_replicas_entry(_r, _i) \
for (struct bch_replicas_entry_v1 *_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size; \
_i = (void *) (_i) + (_r)->entry_size)
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
@ -129,15 +144,14 @@ bad:
void bch2_cpu_replicas_to_text(struct printbuf *out,
struct bch_replicas_cpu *r)
{
struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_cpu_replicas_entry(r, e) {
for_each_cpu_replicas_entry(r, i) {
if (!first)
prt_printf(out, " ");
first = false;
bch2_replicas_entry_to_text(out, e);
bch2_replicas_entry_to_text(out, i);
}
}
@ -246,45 +260,27 @@ cpu_replicas_add_entry(struct bch_fs *c,
return new;
}
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
struct bch_replicas_entry_v1 *search)
static inline struct bch_replicas_entry_v1 *
replicas_entry_search(struct bch_replicas_cpu *r,
struct bch_replicas_entry_v1 *search)
{
int idx, entry_size = replicas_entry_bytes(search);
verify_replicas_entry(search);
if (unlikely(entry_size > r->entry_size))
return -1;
#define entry_cmp(_l, _r) memcmp(_l, _r, entry_size)
idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
entry_cmp, search);
#undef entry_cmp
return idx < r->nr ? idx : -1;
}
int bch2_replicas_entry_idx(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
bch2_replicas_entry_sort(search);
return __replicas_entry_idx(&c->replicas, search);
}
static bool __replicas_has_entry(struct bch_replicas_cpu *r,
struct bch_replicas_entry_v1 *search)
{
return __replicas_entry_idx(r, search) >= 0;
size_t entry_size = replicas_entry_bytes(search);
int idx = likely(entry_size <= r->entry_size)
? eytzinger0_find_r(r->entries, r->nr, r->entry_size,
bch2_memcmp, (void *) entry_size, search)
: -1;
return idx >= 0 ? cpu_replicas_entry(r, idx) : NULL;
}
bool bch2_replicas_marked_locked(struct bch_fs *c,
struct bch_replicas_entry_v1 *search)
{
verify_replicas_entry(search);
return !search->nr_devs ||
(__replicas_has_entry(&c->replicas, search) &&
(replicas_entry_search(&c->replicas, search) &&
(likely((!c->replicas_gc.entries)) ||
__replicas_has_entry(&c->replicas_gc, search)));
replicas_entry_search(&c->replicas_gc, search)));
}
bool bch2_replicas_marked(struct bch_fs *c,
@ -298,40 +294,31 @@ noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
struct bch_replicas_entry_v1 *new_entry)
{
struct bch_replicas_cpu new_r, new_gc;
int ret = 0;
verify_replicas_entry(new_entry);
memset(&new_r, 0, sizeof(new_r));
memset(&new_gc, 0, sizeof(new_gc));
CLASS(bch_replicas_cpu, new_r)();
CLASS(bch_replicas_cpu, new_gc)();
guard(mutex)(&c->sb_lock);
if (c->replicas_gc.entries &&
!__replicas_has_entry(&c->replicas_gc, new_entry)) {
!replicas_entry_search(&c->replicas_gc, new_entry)) {
new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry);
if (!new_gc.entries) {
ret = bch_err_throw(c, ENOMEM_cpu_replicas);
goto out;
}
if (!new_gc.entries)
return bch_err_throw(c, ENOMEM_cpu_replicas);
}
if (!__replicas_has_entry(&c->replicas, new_entry)) {
if (!replicas_entry_search(&c->replicas, new_entry)) {
new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry);
if (!new_r.entries) {
ret = bch_err_throw(c, ENOMEM_cpu_replicas);
goto out;
}
if (!new_r.entries)
return bch_err_throw(c, ENOMEM_cpu_replicas);
ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r);
if (ret)
goto out;
try(bch2_cpu_replicas_to_sb_replicas(c, &new_r));
}
if (!new_r.entries &&
!new_gc.entries)
goto out;
return 0;
/* allocations done, now commit: */
@ -345,12 +332,8 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
if (new_gc.entries)
swap(new_gc, c->replicas_gc);
}
out:
kfree(new_r.entries);
kfree(new_gc.entries);
bch_err_msg(c, ret, "adding replicas entry");
return ret;
return 0;
}
int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
@ -387,9 +370,6 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_entry_v1 *e;
unsigned i = 0;
lockdep_assert_held(&c->replicas_gc_lock);
guard(mutex)(&c->sb_lock);
@ -401,7 +381,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
for_each_cpu_replicas_entry(&c->replicas, e) {
/* Preserve unknown data types */
if (e->data_type >= BCH_DATA_NR ||
!((1 << e->data_type) & typemask)) {
!(BIT(e->data_type) & typemask)) {
c->replicas_gc.nr++;
c->replicas_gc.entry_size =
max_t(unsigned, c->replicas_gc.entry_size,
@ -417,9 +397,10 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
return bch_err_throw(c, ENOMEM_replicas_gc);
}
unsigned i = 0;
for_each_cpu_replicas_entry(&c->replicas, e)
if (e->data_type >= BCH_DATA_NR ||
!((1 << e->data_type) & typemask))
!(BIT(e->data_type) & typemask))
memcpy(cpu_replicas_entry(&c->replicas_gc, i++),
e, c->replicas_gc.entry_size);
@ -427,73 +408,23 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
return 0;
}
/*
* New much simpler mechanism for clearing out unneeded replicas entries - drop
* replicas entries that have 0 sectors used.
*
* However, we don't track sector counts for journal usage, so this doesn't drop
* any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism
* is retained for that.
*/
int bch2_replicas_gc2(struct bch_fs *c)
void bch2_replicas_entry_kill(struct bch_fs *c, struct bch_replicas_entry_v1 *kill)
{
struct bch_replicas_cpu new = { 0 };
unsigned nr;
int ret = 0;
lockdep_assert_held(&c->mark_lock);
lockdep_assert_held(&c->sb_lock);
bch2_accounting_mem_gc(c);
retry:
nr = READ_ONCE(c->replicas.nr);
new.entry_size = READ_ONCE(c->replicas.entry_size);
new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL);
if (!new.entries) {
bch_err(c, "error allocating c->replicas_gc");
return bch_err_throw(c, ENOMEM_replicas_gc);
}
struct bch_replicas_cpu *r = &c->replicas;
guard(mutex)(&c->sb_lock);
scoped_guard(percpu_write, &c->mark_lock) {
if (nr != c->replicas.nr ||
new.entry_size != c->replicas.entry_size) {
kfree(new.entries);
goto retry;
}
struct bch_replicas_entry_v1 *e = replicas_entry_search(&c->replicas, kill);
if (WARN(!e, "replicas entry not found in sb"))
return;
for (unsigned i = 0; i < c->replicas.nr; i++) {
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
memcpy(e, cpu_replicas_entry(r, --r->nr), r->entry_size);
struct disk_accounting_pos k = {
.type = BCH_DISK_ACCOUNTING_replicas,
};
bch2_cpu_replicas_sort(r);
unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e),
"embedded variable length struct");
struct bpos p = disk_accounting_pos_to_bpos(&k);
struct bch_accounting_mem *acc = &c->accounting;
bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
accounting_pos_cmp, &p) >= acc->k.nr;
if (e->data_type == BCH_DATA_journal || !kill)
memcpy(cpu_replicas_entry(&new, new.nr++),
e, new.entry_size);
}
bch2_cpu_replicas_sort(&new);
ret = bch2_cpu_replicas_to_sb_replicas(c, &new);
if (!ret)
swap(c->replicas, new);
kfree(new.entries);
}
if (!ret)
bch2_write_super(c);
return ret;
int ret = bch2_cpu_replicas_to_sb_replicas(c, r);
WARN(ret, "bch2_cpu_replicas_to_sb_replicas() error: %s", bch2_err_str(ret));
}
/* Replicas tracking - superblock: */
@ -502,7 +433,6 @@ static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
struct bch_replicas_cpu *cpu_r)
{
struct bch_replicas_entry_v1 *e, *dst;
unsigned nr = 0, entry_size = 0, idx = 0;
for_each_replicas_entry(sb_r, e) {
@ -519,7 +449,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
cpu_r->entry_size = entry_size;
for_each_replicas_entry(sb_r, e) {
dst = cpu_replicas_entry(cpu_r, idx++);
struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++);
memcpy(dst, e, replicas_entry_bytes(e));
bch2_replicas_entry_sort(dst);
}
@ -531,7 +461,6 @@ static int
__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
struct bch_replicas_cpu *cpu_r)
{
struct bch_replicas_entry_v0 *e;
unsigned nr = 0, entry_size = 0, idx = 0;
for_each_replicas_entry(sb_r, e) {
@ -550,14 +479,14 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
cpu_r->nr = nr;
cpu_r->entry_size = entry_size;
for_each_replicas_entry(sb_r, e) {
for_each_replicas_entry(sb_r, src) {
struct bch_replicas_entry_v1 *dst =
cpu_replicas_entry(cpu_r, idx++);
dst->data_type = e->data_type;
dst->nr_devs = e->nr_devs;
dst->data_type = src->data_type;
dst->nr_devs = src->nr_devs;
dst->nr_required = 1;
memcpy(dst->devs, e->devs, e->nr_devs);
memcpy(dst->devs, src->devs, src->nr_devs);
bch2_replicas_entry_sort(dst);
}
@ -568,7 +497,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
struct bch_sb_field_replicas *sb_v1;
struct bch_sb_field_replicas_v0 *sb_v0;
struct bch_replicas_cpu new_r = { 0, 0, NULL };
CLASS(bch_replicas_cpu, new_r)();
if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas)))
try(__bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r));
@ -580,8 +509,6 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
guard(percpu_write)(&c->mark_lock);
swap(c->replicas, new_r);
kfree(new_r.entries);
return 0;
}
@ -590,7 +517,6 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
{
struct bch_sb_field_replicas_v0 *sb_r;
struct bch_replicas_entry_v0 *dst;
struct bch_replicas_entry_v1 *src;
size_t bytes;
bytes = sizeof(struct bch_sb_field_replicas);
@ -628,7 +554,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry_v1 *dst, *src;
struct bch_replicas_entry_v1 *dst;
bool need_v1 = false;
size_t bytes;
@ -707,12 +633,11 @@ static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f,
{
struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
struct bch_replicas_cpu cpu_r;
CLASS(bch_replicas_cpu, cpu_r)();
try(__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r));
try(bch2_cpu_replicas_validate(&cpu_r, sb, err));
int ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
kfree(cpu_r.entries);
return ret;
return 0;
}
static void bch2_sb_replicas_to_text(struct printbuf *out,
@ -720,7 +645,6 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
struct bch_sb_field *f)
{
struct bch_sb_field_replicas *r = field_to_type(f, replicas);
struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_replicas_entry(r, e) {
@ -743,12 +667,11 @@ static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *
{
struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
struct bch_replicas_cpu cpu_r;
CLASS(bch_replicas_cpu, cpu_r)();
try(__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r));
try(bch2_cpu_replicas_validate(&cpu_r, sb, err));
int ret = bch2_cpu_replicas_validate(&cpu_r, sb, err);
kfree(cpu_r.entries);
return ret;
return 0;
}
static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
@ -756,7 +679,6 @@ static void bch2_sb_replicas_v0_to_text(struct printbuf *out,
struct bch_sb_field *f)
{
struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0);
struct bch_replicas_entry_v0 *e;
bool first = true;
for_each_replicas_entry(sb_r, e) {
@ -779,8 +701,6 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
bool bch2_can_read_fs_with_devs(struct bch_fs *c, struct bch_devs_mask devs,
unsigned flags, struct printbuf *err)
{
struct bch_replicas_entry_v1 *e;
guard(percpu_read)(&c->mark_lock);
for_each_cpu_replicas_entry(&c->replicas, e) {
unsigned nr_online = 0, nr_failed = 0, dflags = 0;
@ -910,8 +830,6 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
if (replicas) {
struct bch_replicas_entry_v1 *r;
for_each_replicas_entry(replicas, r) {
if (r->data_type >= sizeof(data_has) * 8)
continue;
@ -922,9 +840,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
}
} else if (replicas_v0) {
struct bch_replicas_entry_v0 *r;
for_each_replicas_entry_v0(replicas_v0, r) {
for_each_replicas_entry(replicas_v0, r) {
if (r->data_type >= sizeof(data_has) * 8)
continue;

View File

@ -13,15 +13,6 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
struct bch_fs *, struct printbuf *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
static inline struct bch_replicas_entry_v1 *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
int bch2_replicas_entry_idx(struct bch_fs *,
struct bch_replicas_entry_v1 *);
void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
enum bch_data_type,
struct bch_devs_list);
@ -53,12 +44,15 @@ unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
int bch2_replicas_gc2(struct bch_fs *);
void bch2_replicas_entry_kill(struct bch_fs *, struct bch_replicas_entry_v1 *);
#define for_each_cpu_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
_i = (void *) (_i) + (_r)->entry_size)
static inline bool bch2_replicas_entry_has_dev(struct bch_replicas_entry_v1 *r, unsigned dev)
{
for (unsigned i = 0; i < r->nr_devs; i++)
if (r->devs[i] == dev)
return true;
return false;
}
/* iterate over superblock replicas - used by userspace tools: */
@ -66,12 +60,7 @@ int bch2_replicas_gc2(struct bch_fs *);
((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i)))
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
#define for_each_replicas_entry_v0(_r, _i) \
for (_i = (_r)->entries; \
for (typeof(&(_r)->entries[0]) _i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))

View File

@ -8,4 +8,10 @@ struct bch_replicas_cpu {
struct bch_replicas_entry_v1 *entries;
};
union bch_replicas_padded {
u8 bytes[struct_size_t(struct bch_replicas_entry_v1,
devs, BCH_BKEY_PTRS_MAX)];
struct bch_replicas_entry_v1 e;
};
#endif /* _BCACHEFS_REPLICAS_TYPES_H */

View File

@ -609,6 +609,18 @@ static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *
closure_wake_up(&c->btree_interior_update_wait);
}
static void bch2_btree_update_add_key(btree_update_nodes *nodes,
unsigned level, struct bkey_i *k)
{
BUG_ON(darray_make_room(nodes, 1));
struct btree_update_node *n = &darray_top(*nodes);
nodes->nr++;
*n = (struct btree_update_node) { .level = level };
bkey_copy(&n->key, k);
}
static void bch2_btree_update_add_node(struct bch_fs *c, btree_update_nodes *nodes, struct btree *b)
{
BUG_ON(darray_make_room(nodes, 1));
@ -649,20 +661,26 @@ static void btree_update_new_nodes_mark_sb(struct btree_update *as)
static int btree_update_nodes_written_trans(struct btree_trans *trans,
struct btree_update *as)
{
struct jset_entry *e = errptr_try(bch2_trans_jset_entry_alloc(trans, as->journal_u64s));
memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));
trans->journal_pin = &as->journal;
darray_for_each(as->old_nodes, i)
try(bch2_key_trigger_old(trans, as->btree_id, i->level + 1, bkey_i_to_s_c(&i->key),
BTREE_TRIGGER_transactional));
darray_for_each(as->new_nodes, i)
darray_for_each(as->new_nodes, i) {
try(bch2_key_trigger_new(trans, as->btree_id, i->level + 1, bkey_i_to_s(&i->key),
BTREE_TRIGGER_transactional));
journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans,
jset_u64s(i->key.k.u64s))),
i->root
? BCH_JSET_ENTRY_btree_root
: BCH_JSET_ENTRY_btree_keys,
as->btree_id,
i->root ? i->level : i->level + 1,
&i->key, i->key.k.u64s);
}
return 0;
}
@ -749,11 +767,12 @@ static void btree_update_nodes_written(struct btree_update *as)
* all our new nodes, to avoid racing with
* btree_node_update_key():
*/
darray_for_each(as->new_nodes, i) {
BUG_ON(i->b->will_make_reachable != (unsigned long) as);
i->b->will_make_reachable = 0;
clear_btree_node_will_make_reachable(i->b);
}
darray_for_each(as->new_nodes, i)
if (i->b) {
BUG_ON(i->b->will_make_reachable != (unsigned long) as);
i->b->will_make_reachable = 0;
clear_btree_node_will_make_reachable(i->b);
}
}
/*
@ -841,11 +860,12 @@ static void btree_update_nodes_written(struct btree_update *as)
bch2_journal_pin_drop(&c->journal, &as->journal);
darray_for_each(as->new_nodes, i) {
btree_node_lock_nopath_nofail(trans, &i->b->c, SIX_LOCK_read);
btree_node_write_if_need(trans, i->b, SIX_LOCK_read);
six_unlock_read(&i->b->c.lock);
}
darray_for_each(as->new_nodes, i)
if (i->b) {
btree_node_lock_nopath_nofail(trans, &i->b->c, SIX_LOCK_read);
btree_node_write_if_need(trans, i->b, SIX_LOCK_read);
six_unlock_read(&i->b->c.lock);
}
for (unsigned i = 0; i < as->nr_open_buckets; i++)
bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
@ -931,25 +951,13 @@ static void btree_update_reparent(struct btree_update *as,
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
struct bkey_i *insert = &b->key;
struct bch_fs *c = as->c;
BUG_ON(as->mode != BTREE_UPDATE_none);
as->mode = BTREE_UPDATE_root;
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries));
as->journal_u64s +=
journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
BCH_JSET_ENTRY_btree_root,
b->c.btree_id, b->c.level,
insert, insert->k.u64s);
scoped_guard(mutex, &c->btree_interior_update_lock) {
scoped_guard(mutex, &c->btree_interior_update_lock)
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
as->mode = BTREE_UPDATE_root;
}
}
/*
@ -1323,7 +1331,6 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
{
struct bch_fs *c = as->c;
struct bkey_packed *k;
CLASS(printbuf, buf)();
unsigned long old, new;
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
@ -1344,15 +1351,6 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
dump_stack();
}
BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
ARRAY_SIZE(as->journal_entries));
as->journal_u64s +=
journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
BCH_JSET_ENTRY_btree_keys,
b->c.btree_id, b->c.level,
insert, insert->k.u64s);
while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
bch2_btree_node_iter_advance(node_iter, b);
@ -2105,6 +2103,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write_trans(trans, n, SIX_LOCK_intent, 0);
bch2_btree_update_add_key(&as->new_nodes, n->c.level, &delete);
bch2_btree_update_add_node(c, &as->new_nodes, n);
bch2_btree_node_free_inmem(trans, trans->paths + path, b);
@ -2386,15 +2385,6 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
struct bch_fs *c = trans->c;
if (!btree_node_will_make_reachable(b)) {
if (!skip_triggers) {
try(bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
bkey_i_to_s_c(&b->key),
BTREE_TRIGGER_transactional));
try(bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
bkey_i_to_s(new_key),
BTREE_TRIGGER_transactional));
}
if (!btree_node_is_root(c, b)) {
CLASS(btree_node_iter, parent_iter)(trans,
b->c.btree_id,
@ -2404,15 +2394,32 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
BTREE_ITER_intent);
try(bch2_btree_iter_traverse(&parent_iter));
try(bch2_trans_update(trans, &parent_iter, new_key, BTREE_TRIGGER_norun));
try(bch2_trans_update(trans, &parent_iter, new_key, skip_triggers ? BTREE_TRIGGER_norun : 0));
} else {
struct jset_entry *e = errptr_try(bch2_trans_jset_entry_alloc(trans,
jset_u64s(new_key->k.u64s)));
if (!skip_triggers)
try(bch2_key_trigger(trans, b->c.btree_id, b->c.level + 1,
bkey_i_to_s_c(&b->key),
bkey_i_to_s(new_key),
BTREE_TRIGGER_insert|
BTREE_TRIGGER_overwrite|
BTREE_TRIGGER_transactional));
journal_entry_set(e,
journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans,
jset_u64s(b->key.k.u64s))),
BCH_JSET_ENTRY_overwrite,
b->c.btree_id, b->c.level + 1,
&b->key, b->key.k.u64s);
journal_entry_set(errptr_try(bch2_trans_jset_entry_alloc(trans,
jset_u64s(new_key->k.u64s))),
BCH_JSET_ENTRY_btree_root,
b->c.btree_id, b->c.level,
new_key, new_key->k.u64s);
/*
* propagated back to c->btree_roots[].key by
* bch2_journal_entry_to_btree_root() incorrect for
*/
}
try(bch2_trans_commit(trans, NULL, NULL, commit_flags));

View File

@ -8,8 +8,6 @@
#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES)
#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
#define BTREE_UPDATE_MODES() \
@ -111,9 +109,6 @@ struct btree_update {
BCH_REPLICAS_MAX];
open_bucket_idx_t nr_open_buckets;
unsigned journal_u64s;
u64 journal_entries[BTREE_UPDATE_JOURNAL_RES];
/* Only here to reduce stack usage on recursive splits: */
struct keylist parent_keys;
/*

View File

@ -736,6 +736,19 @@ void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
/* Btree path: traverse, set_pos: */
static noinline_for_stack int btree_node_root_err(struct btree_trans *trans, struct btree *b)
{
struct bch_fs *c = trans->c;
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
prt_str(&buf, "btree root doesn't cover expected range:\n");
bch2_btree_pos_to_text(&buf, c, b);
prt_newline(&buf);
return __bch2_topology_error(c, &buf);
}
static inline int btree_path_lock_root(struct btree_trans *trans,
struct btree_path *path,
unsigned depth_want,
@ -783,6 +796,13 @@ static inline int btree_path_lock_root(struct btree_trans *trans,
if (likely(b == READ_ONCE(r->b) &&
b->c.level == path->level &&
!race_fault())) {
if (unlikely(!bpos_eq(b->data->min_key, POS_MIN) ||
!bpos_eq(b->key.k.p, SPOS_MAX))) {
ret = btree_node_root_err(trans, b);
six_unlock_type(&b->c.lock, lock_type);
return ret;
}
for (i = 0; i < path->level; i++)
path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root);
path->l[path->level].b = b;

View File

@ -557,7 +557,7 @@ void *__bch2_trans_subbuf_alloc(struct btree_trans *trans,
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
enum btree_id btree, struct bpos start, struct bpos end)
{
bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent);
bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent|BTREE_ITER_with_updates);
struct bkey_s_c k = bkey_try(bch2_btree_iter_peek_prev(iter));
if (bpos_lt(iter->pos, start))

View File

@ -158,8 +158,9 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite
struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, iter), &u);
if (k.k->type == KEY_TYPE_accounting)
bch2_accounting_accumulate(bkey_i_to_accounting(&wb->k),
bkey_s_c_to_accounting(k));
bch2_accounting_accumulate_maybe_kill(trans->c,
bkey_i_to_accounting(&wb->k),
bkey_s_c_to_accounting(k));
}
*accounting_accumulated = true;

View File

@ -4,12 +4,6 @@
#include "bcachefs_format.h"
union bch_replicas_padded {
u8 bytes[struct_size_t(struct bch_replicas_entry_v1,
devs, BCH_BKEY_PTRS_MAX)];
struct bch_replicas_entry_v1 e;
};
struct stripe {
size_t heap_idx;
u16 sectors;

View File

@ -994,7 +994,6 @@ int bch2_data_job(struct bch_fs *c,
true,
rereplicate_pred, c) ?: ret;
bch2_btree_interior_updates_flush(c);
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_migrate:
if (op->migrate.dev >= c->sb.nr_devices)
@ -1010,7 +1009,6 @@ int bch2_data_job(struct bch_fs *c,
true,
migrate_pred, op) ?: ret;
bch2_btree_interior_updates_flush(c);
ret = bch2_replicas_gc2(c) ?: ret;
break;
case BCH_DATA_OP_rewrite_old_nodes:
ret = bch2_scan_old_btree_nodes(c, stats);
@ -1020,7 +1018,6 @@ int bch2_data_job(struct bch_fs *c,
writepoint_hashed((unsigned long) current),
true,
drop_extra_replicas_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
break;
default:
ret = -EINVAL;

View File

@ -296,7 +296,7 @@ int bch2_bkey_get_io_opts(struct btree_trans *trans,
if (!snapshot_opts) {
bch2_inode_opts_get(c, opts, metadata);
if (k.k->p.snapshot) {
if (!metadata && k.k->p.snapshot) {
struct bch_inode_unpacked inode;
int ret = bch2_inode_find_by_inum_snapshot(trans, k.k->p.inode, k.k->p.snapshot,
&inode, BTREE_ITER_cached);
@ -313,7 +313,7 @@ int bch2_bkey_get_io_opts(struct btree_trans *trans,
snapshot_opts->d.nr = 0;
}
if (k.k->p.snapshot) {
if (!metadata && k.k->p.snapshot) {
if (snapshot_opts->cur_inum != k.k->p.inode) {
snapshot_opts->d.nr = 0;
@ -362,6 +362,8 @@ int bch2_bkey_get_io_opts(struct btree_trans *trans,
#undef x
}
BUG_ON(metadata && opts->erasure_code);
return 0;
}
@ -374,10 +376,46 @@ static const char * const bch2_rebalance_state_strs[] = {
#undef x
};
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
static u64 rebalance_scan_encode(struct rebalance_scan s)
{
switch (s.type) {
case REBALANCE_SCAN_fs:
return 0;
case REBALANCE_SCAN_metadata:
return 1;
case REBALANCE_SCAN_device:
return s.dev + 32;
case REBALANCE_SCAN_inum:
return s.inum;
default:
BUG();
}
}
static struct rebalance_scan rebalance_scan_decode(u64 v)
{
if (v == 0)
return (struct rebalance_scan) { .type = REBALANCE_SCAN_fs };
if (v == 1)
return (struct rebalance_scan) { .type = REBALANCE_SCAN_metadata };
if (v < BCACHEFS_ROOT_INO)
return (struct rebalance_scan) {
.type = REBALANCE_SCAN_device,
.dev = v - 32,
};
return (struct rebalance_scan) {
.type = REBALANCE_SCAN_inum,
.inum = v,
};
}
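The scan cookie's key packs its target into a single u64: 0 means a filesystem-wide scan, 1 means metadata, small values offset by 32 name a device, and anything at or above BCACHEFS_ROOT_INO is an inode number. A standalone round-trip check of that mapping, mirroring the two functions above (BCACHEFS_ROOT_INO's value is assumed here, and the names are local to the sketch):

#include <assert.h>
#include <stdint.h>

#define ROOT_INO 4096ULL        /* assumed stand-in for BCACHEFS_ROOT_INO */

enum scan_type { SCAN_FS, SCAN_METADATA, SCAN_DEVICE, SCAN_INUM };
struct scan { enum scan_type type; uint64_t v; };

static uint64_t scan_encode(struct scan s)
{
    switch (s.type) {
    case SCAN_FS:       return 0;
    case SCAN_METADATA: return 1;
    case SCAN_DEVICE:   return s.v + 32;
    default:            return s.v;     /* SCAN_INUM: inum >= ROOT_INO */
    }
}

static struct scan scan_decode(uint64_t v)
{
    if (v == 0)         return (struct scan) { SCAN_FS, 0 };
    if (v == 1)         return (struct scan) { SCAN_METADATA, 0 };
    if (v < ROOT_INO)   return (struct scan) { SCAN_DEVICE, v - 32 };
    return (struct scan) { SCAN_INUM, v };
}

int main(void)
{
    assert(scan_decode(scan_encode((struct scan) { SCAN_DEVICE, 3 })).v == 3);
    assert(scan_decode(scan_encode((struct scan) { SCAN_INUM, 8192 })).type == SCAN_INUM);
    assert(scan_decode(0).type == SCAN_FS);
    return 0;
}
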
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, struct rebalance_scan s)
{
CLASS(btree_iter, iter)(trans, BTREE_ID_rebalance_work,
SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
SPOS(rebalance_scan_encode(s),
REBALANCE_WORK_SCAN_OFFSET,
U32_MAX),
BTREE_ITER_intent);
struct bkey_s_c k = bkey_try(bch2_btree_iter_peek_slot(&iter));
@ -394,16 +432,17 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum)
return bch2_trans_update(trans, &iter, &cookie->k_i, 0);
}
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
int bch2_set_rebalance_needs_scan(struct bch_fs *c, struct rebalance_scan s)
{
CLASS(btree_trans, trans)(c);
return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_set_rebalance_needs_scan_trans(trans, inum));
bch2_set_rebalance_needs_scan_trans(trans, s));
}
int bch2_set_fs_needs_rebalance(struct bch_fs *c)
{
return bch2_set_rebalance_needs_scan(c, 0);
return bch2_set_rebalance_needs_scan(c,
(struct rebalance_scan) { .type = REBALANCE_SCAN_fs });
}
static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
@ -647,7 +686,7 @@ root_err:
noinline_for_stack
static int do_rebalance_scan(struct moving_context *ctxt,
struct per_snapshot_io_opts *snapshot_io_opts,
u64 inum, u64 cookie, u64 *sectors_scanned)
u64 scan_v, u64 cookie, u64 *sectors_scanned)
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
@ -658,7 +697,8 @@ static int do_rebalance_scan(struct moving_context *ctxt,
r->state = BCH_REBALANCE_scanning;
if (!inum) {
struct rebalance_scan s = rebalance_scan_decode(scan_v);
if (s.type == REBALANCE_SCAN_fs) {
r->scan_start = BBPOS_MIN;
r->scan_end = BBPOS_MAX;
@ -670,16 +710,16 @@ static int do_rebalance_scan(struct moving_context *ctxt,
try(do_rebalance_scan_btree(ctxt, snapshot_io_opts, btree, 0,
POS_MIN, SPOS_MAX));
}
} else {
r->scan_start = BBPOS(BTREE_ID_extents, POS(inum, 0));
r->scan_end = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
} else if (s.type == REBALANCE_SCAN_inum) {
r->scan_start = BBPOS(BTREE_ID_extents, POS(s.inum, 0));
r->scan_end = BBPOS(BTREE_ID_extents, POS(s.inum, U64_MAX));
try(do_rebalance_scan_btree(ctxt, snapshot_io_opts, BTREE_ID_extents, 0,
r->scan_start.pos, r->scan_end.pos));
}
try(commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_clear_rebalance_needs_scan(trans, inum, cookie)));
bch2_clear_rebalance_needs_scan(trans, scan_v, cookie)));
*sectors_scanned += atomic64_read(&r->scan_stats.sectors_seen);
/*

View File

@ -84,8 +84,22 @@ int bch2_bkey_get_io_opts(struct btree_trans *,
struct per_snapshot_io_opts *, struct bkey_s_c,
struct bch_inode_opts *opts);
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64);
int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
struct rebalance_scan {
enum rebalance_scan_type {
REBALANCE_SCAN_fs,
REBALANCE_SCAN_metadata,
REBALANCE_SCAN_device,
REBALANCE_SCAN_inum,
} type;
union {
unsigned dev;
u64 inum;
};
};
int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, struct rebalance_scan);
int bch2_set_rebalance_needs_scan(struct bch_fs *, struct rebalance_scan);
int bch2_set_fs_needs_rebalance(struct bch_fs *);
static inline void bch2_rebalance_wakeup(struct bch_fs *c)

View File

@ -693,6 +693,9 @@ static int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
struct bch_inode_opts *io_opts,
unsigned buf_bytes)
{
/* be paranoid */
buf_bytes = round_up(buf_bytes, c->opts.block_size);
unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE);
m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL);
@ -702,7 +705,7 @@ static int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c,
bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ);
bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0);
if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) {
if (bch2_bio_alloc_pages(&m->op.wbio.bio, c->opts.block_size, buf_bytes, GFP_KERNEL)) {
kfree(m->bvecs);
m->bvecs = NULL;
return -ENOMEM;

View File

@ -807,6 +807,19 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
struct bio *bio;
unsigned output_available =
min(wp->sectors_free << 9, src->bi_iter.bi_size);
/*
* XXX: we'll want to delete this later, there's no reason we can't
* issue > 2MB bios if we're allocating high order pages
*
* But bch2_bio_alloc_pages() BUGS() if we ask it to allocate more pages
* than fit in the bio, and we're using bio_alloc_bioset() which is
* limited to BIO_MAX_VECS
*/
output_available = min(output_available, BIO_MAX_VECS * PAGE_SIZE);
BUG_ON(output_available & (c->opts.block_size - 1));
unsigned pages = DIV_ROUND_UP(output_available +
(buf
? ((unsigned long) buf & (PAGE_SIZE - 1))
@ -814,8 +827,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
pages = min(pages, BIO_MAX_VECS);
bio = bio_alloc_bioset(NULL, pages, 0,
GFP_NOFS, &c->bio_write);
bio = bio_alloc_bioset(NULL, pages, 0, GFP_NOFS, &c->bio_write);
wbio = wbio_init(bio);
wbio->put_bio = true;
/* copy WRITE_SYNC flag */
@ -839,6 +851,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
if (bio->bi_iter.bi_size < output_available)
*page_alloc_failed =
bch2_bio_alloc_pages(bio,
c->opts.block_size,
output_available -
bio->bi_iter.bi_size,
GFP_NOFS) != 0;

View File

@ -196,6 +196,7 @@ read_attribute(btree_reserve_cache);
read_attribute(open_buckets);
read_attribute(open_buckets_partial);
read_attribute(nocow_lock_table);
read_attribute(replicas);
read_attribute(read_refs);
read_attribute(write_refs);
@ -389,6 +390,9 @@ SHOW(bch2_fs)
if (attr == &sysfs_nocow_lock_table)
bch2_nocow_locks_to_text(out, &c->nocow_locks);
if (attr == &sysfs_replicas)
bch2_cpu_replicas_to_text(out, &c->replicas);
if (attr == &sysfs_disk_groups)
bch2_disk_groups_to_text(out, c);
@ -600,6 +604,7 @@ struct attribute *bch2_fs_internal_files[] = {
&sysfs_open_buckets_partial,
&sysfs_write_refs,
&sysfs_nocow_lock_table,
&sysfs_replicas,
&sysfs_io_timers_read,
&sysfs_io_timers_write,

View File

@ -913,6 +913,9 @@ static int check_inode(struct btree_trans *trans,
}
ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update);
if (bch2_err_matches(ret, ENOENT)) /* disconnected inode; will be fixed by a later pass */
ret = 0;
bch_err_msg(c, ret, "bch2_check_inode_has_case_insensitive()");
if (ret)
goto err;
@ -1627,7 +1630,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
new_d->k.p.inode = d.k->p.inode;
new_d->k.p.snapshot = d.k->p.snapshot;
struct btree_iter dup_iter = {};
CLASS(btree_iter_uninit, dup_iter)(trans);
return bch2_hash_delete_at(trans,
bch2_dirent_hash_desc, hash_info, iter,
BTREE_UPDATE_internal_snapshot_node) ?:

View File

@ -549,7 +549,7 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans,
hash_info, dir, &lookup_name, flags));
int ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
return ret > 0 ? -ENOENT : 0;
return ret > 0 ? -ENOENT : ret;
}
u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,

View File

@ -832,10 +832,8 @@ int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ",
inode->bi_inum, inode->bi_snapshot);
ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot,
snapshot_overwrites, &buf);
if (ret)
goto out;
try(bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot,
snapshot_overwrites, &buf));
if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) {
inode->bi_flags |= BCH_INODE_has_case_insensitive;
@ -844,7 +842,7 @@ int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
}
if (!(inode->bi_flags & BCH_INODE_has_case_insensitive))
goto out;
return 0;
struct bch_inode_unpacked dir = *inode;
u32 snapshot = dir.bi_snapshot;
@ -852,30 +850,22 @@ int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
while (!(dir.bi_inum == BCACHEFS_ROOT_INO &&
dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
if (dir.bi_parent_subvol) {
ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot);
if (ret)
goto out;
try(bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot));
snapshot_overwrites = NULL;
}
ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0);
if (ret)
goto out;
try(bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0));
if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) {
prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n");
ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot,
snapshot_overwrites, &buf);
if (ret)
goto out;
try(bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot,
snapshot_overwrites, &buf));
if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) {
dir.bi_flags |= BCH_INODE_has_case_insensitive;
ret = __bch2_fsck_write_inode(trans, &dir);
if (ret)
goto out;
try(__bch2_fsck_write_inode(trans, &dir));
}
}
@ -886,15 +876,11 @@ int bch2_check_inode_has_case_insensitive(struct btree_trans *trans,
if (!repairing_parents)
break;
}
out:
fsck_err:
bch_err_fn(trans->c, ret);
if (ret)
return ret;
if (repairing_parents)
return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
bch_err_throw(trans->c, transaction_restart_nested);
return 0;
fsck_err:
return ret;
}

View File

@ -118,7 +118,7 @@ static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q)
prt_printf(out, "d_fieldmask\t%x\n", q->d_fieldmask);
prt_printf(out, "d_spc_hardlimit\t%llu\n", q->d_spc_hardlimit);
prt_printf(out, "d_spc_softlimit\t%llu\n", q->d_spc_softlimit);
prt_printf(out, "d_ino_hardlimit\%llu\n", q->d_ino_hardlimit);
prt_printf(out, "d_ino_hardlimit\t%llu\n", q->d_ino_hardlimit);
prt_printf(out, "d_ino_softlimit\t%llu\n", q->d_ino_softlimit);
prt_printf(out, "d_space\t%llu\n", q->d_space);
prt_printf(out, "d_ino_count\t%llu\n", q->d_ino_count);

View File

@ -218,6 +218,50 @@ static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans
return 0;
}
static int str_hash_dup_entries(struct btree_trans *trans,
struct snapshots_seen *s,
const struct bch_hash_desc *desc,
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c k,
struct btree_iter *dup_iter, struct bkey_s_c dup_k,
bool *updated_before_k_pos)
{
struct bch_fs *c = trans->c;
CLASS(printbuf, buf)();
int ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k);
if (ret < 0)
return ret;
if (!fsck_err(trans, hash_table_key_duplicate,
"duplicate hash table keys%s:\n%s",
ret != 2 ? "" : ", both point to valid inodes",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
prt_newline(&buf),
bch2_bkey_val_to_text(&buf, c, dup_k),
buf.buf)))
return 0;
switch (ret) {
case 0:
try(bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0));
break;
case 1:
try(bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0));
break;
case 2:
try(bch2_fsck_rename_dirent(trans, s, *desc, hash_info,
bkey_s_c_to_dirent(k),
updated_before_k_pos));
try(bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0));
break;
}
return bch2_trans_commit_lazy(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
fsck_err:
return ret;
}
/* Put a str_hash key in its proper location, checking for duplicates */
int bch2_str_hash_repair_key(struct btree_trans *trans,
struct snapshots_seen *s,
@ -227,96 +271,65 @@ int bch2_str_hash_repair_key(struct btree_trans *trans,
struct btree_iter *dup_iter, struct bkey_s_c dup_k,
bool *updated_before_k_pos)
{
struct bch_fs *c = trans->c;
CLASS(printbuf, buf)();
bool free_snapshots_seen = false;
int ret = 0;
CLASS(snapshots_seen, s_onstack)();
if (!s) {
s = bch2_trans_kmalloc(trans, sizeof(*s));
ret = PTR_ERR_OR_ZERO(s);
if (ret)
goto out;
s = &s_onstack;
s->pos = k_iter->pos;
darray_init(&s->ids);
ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids);
if (ret)
goto out;
free_snapshots_seen = true;
try(bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids));
}
if (!dup_k.k) {
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
ret = PTR_ERR_OR_ZERO(new);
if (ret)
goto out;
struct bkey_i *new = errptr_try(bch2_bkey_make_mut_noupdate(trans, k));
dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info,
dup_k = bkey_try(bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info,
(subvol_inum) { 0, new->k.p.inode },
new->k.p.snapshot, new,
STR_HASH_must_create|
BTREE_ITER_with_updates|
BTREE_UPDATE_internal_snapshot_node);
ret = bkey_err(dup_k);
if (ret)
goto out;
if (dup_k.k)
goto duplicate_entries;
BTREE_UPDATE_internal_snapshot_node));
if (bpos_lt(new->k.p, k.k->p))
*updated_before_k_pos = true;
ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id,
k_iter->pos, new->k.p) ?:
bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
BTREE_ITER_with_updates|
BTREE_UPDATE_internal_snapshot_node) ?:
bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?:
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
bch_err_throw(c, transaction_restart_commit);
} else {
duplicate_entries:
ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k);
if (ret < 0)
goto out;
if (!fsck_err(trans, hash_table_key_duplicate,
"duplicate hash table keys%s:\n%s",
ret != 2 ? "" : ", both point to valid inodes",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
prt_newline(&buf),
bch2_bkey_val_to_text(&buf, c, dup_k),
buf.buf)))
goto out;
switch (ret) {
case 0:
ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0);
break;
case 1:
ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0);
break;
case 2:
ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info,
bkey_s_c_to_dirent(k),
updated_before_k_pos) ?:
bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
BTREE_ITER_with_updates);
goto out;
if (!dup_k.k) {
try(bch2_insert_snapshot_whiteouts(trans, desc->btree_id,
k_iter->pos, new->k.p));
try(bch2_hash_delete_at(trans, *desc, hash_info, k_iter,
BTREE_UPDATE_internal_snapshot_node));
try(bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new));
try(bch2_trans_commit_lazy(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc));
}
ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
bch_err_throw(c, transaction_restart_commit);
}
out:
if (dup_k.k)
try(str_hash_dup_entries(trans, s, desc, hash_info,
k_iter, k, dup_iter, dup_k,
updated_before_k_pos));
return 0;
}
static int str_hash_bad_hash(struct btree_trans *trans,
struct snapshots_seen *s,
const struct bch_hash_desc *desc,
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c hash_k,
bool *updated_before_k_pos,
struct btree_iter *iter, u64 hash)
{
CLASS(printbuf, buf)();
int ret = 0;
/*
* Before doing any repair, check hash_info itself:
*/
try(check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info));
if (fsck_err(trans, hash_table_key_wrong_offset,
"hash table key at wrong offset: should be at %llu\n%s",
hash,
(bch2_bkey_val_to_text(&buf, trans->c, hash_k), buf.buf)))
ret = bch2_str_hash_repair_key(trans, s, desc, hash_info,
k_iter, hash_k,
iter, bkey_s_c_null,
updated_before_k_pos);
fsck_err:
bch2_trans_iter_exit(dup_iter);
if (free_snapshots_seen)
darray_exit(&s->ids);
return ret;
}
@ -327,57 +340,36 @@ int __bch2_str_hash_check_key(struct btree_trans *trans,
struct btree_iter *k_iter, struct bkey_s_c hash_k,
bool *updated_before_k_pos)
{
struct bch_fs *c = trans->c;
struct btree_iter iter = {};
CLASS(printbuf, buf)();
u64 hash = desc->hash_bkey(hash_info, hash_k);
CLASS(btree_iter, iter)(trans, desc->btree_id,
SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
BTREE_ITER_slots);
if (hash_k.k->p.offset < hash)
return str_hash_bad_hash(trans, s, desc, hash_info, k_iter, hash_k,
updated_before_k_pos, &iter, hash);
struct bkey_s_c k;
int ret = 0;
u64 hash = desc->hash_bkey(hash_info, hash_k);
if (hash_k.k->p.offset < hash)
goto bad_hash;
bch2_trans_iter_init(trans, &iter, desc->btree_id,
SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot),
BTREE_ITER_slots|
BTREE_ITER_with_updates);
for_each_btree_key_continue_norestart(iter,
BTREE_ITER_slots|
BTREE_ITER_with_updates, k, ret) {
BTREE_ITER_slots, k, ret) {
if (bkey_eq(k.k->p, hash_k.k->p))
break;
if (k.k->type == desc->key_type &&
!desc->cmp_bkey(k, hash_k)) {
ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode,
hash_info) ?:
bch2_str_hash_repair_key(trans, s, desc, hash_info,
k_iter, hash_k,
&iter, k, updated_before_k_pos);
/* dup */
try(check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info));
try(bch2_str_hash_repair_key(trans, s, desc, hash_info, k_iter, hash_k,
&iter, k, updated_before_k_pos));
break;
}
if (bkey_deleted(k.k))
goto bad_hash;
return str_hash_bad_hash(trans, s, desc, hash_info, k_iter, hash_k,
updated_before_k_pos, &iter, hash);
}
bch2_trans_iter_exit(&iter);
fsck_err:
return ret;
bad_hash:
bch2_trans_iter_exit(&iter);
/*
* Before doing any repair, check hash_info itself:
*/
try(check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info));
if (fsck_err(trans, hash_table_key_wrong_offset,
"hash table key at wrong offset: should be at %llu\n%s",
hash,
(bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf)))
ret = bch2_str_hash_repair_key(trans, s, desc, hash_info,
k_iter, hash_k,
&iter, bkey_s_c_null,
updated_before_k_pos);
return ret;
}

View File

@ -447,8 +447,13 @@ int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb, struct prin
lockdep_assert_held(&c->state_lock);
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb.sb->seq))
bch2_sb_to_fs(c, sb->sb);
le64_to_cpu(c->disk_sb.sb->seq)) {
/*
* rewind, we'll lose some updates but it's not safe to call
* bch2_sb_to_fs() after fs is started
*/
sb->sb->seq = c->disk_sb.sb->seq;
}
BUG_ON(!bch2_dev_exists(c, sb->sb->dev_idx));
@ -628,11 +633,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags,
goto err;
}
ret = bch2_replicas_gc2(c);
if (ret) {
prt_printf(err, "bch2_replicas_gc2() error: %s\n", bch2_err_str(ret));
goto err;
}
/*
* flushing the journal should be sufficient, but it's the write buffer
* flush that kills superblock replicas entries after they've gone to 0
* so bch2_dev_has_data() returns the correct value:
*/
data = bch2_dev_has_data(c, ca);
if (data) {

View File

@ -9,6 +9,7 @@
#include "journal/seq_blacklist.h"
#include "alloc/foreground.h"
#include "alloc/replicas.h"
#include "btree/update.h"
/* allocate journal on a device: */
@ -440,11 +441,12 @@ int bch2_fs_journal_start(struct journal *j, u64 last_seq, u64 cur_seq)
if (journal_entry_empty(&i->j))
j->last_empty_seq = le64_to_cpu(i->j.seq);
p = journal_seq_pin(j, seq);
p->devs.nr = 0;
struct bch_devs_list seq_devs = {};
darray_for_each(i->ptrs, ptr)
bch2_dev_list_add_dev(&p->devs, ptr->dev);
seq_devs.data[seq_devs.nr++] = ptr->dev;
p = journal_seq_pin(j, seq);
bch2_devlist_to_replicas(&p->devs.e, BCH_DATA_journal, seq_devs);
had_entries = true;
}

View File

@ -442,6 +442,7 @@ static int journal_entry_open(struct journal *j)
buf->write_started = false;
buf->write_allocated = false;
buf->write_done = false;
buf->had_error = false;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));

View File

@ -410,20 +410,14 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re
unsigned u64s, unsigned flags,
struct btree_trans *trans)
{
int ret;
EBUG_ON(res->ref);
EBUG_ON(!test_bit(JOURNAL_running, &j->flags));
res->u64s = u64s;
if (journal_res_get_fast(j, res, flags))
goto out;
if (!journal_res_get_fast(j, res, flags))
try(bch2_journal_res_get_slowpath(j, res, flags, trans));
ret = bch2_journal_res_get_slowpath(j, res, flags, trans);
if (ret)
return ret;
out:
if (!(flags & JOURNAL_RES_GET_CHECK)) {
lock_acquire_shared(&j->res_map, 0,
(flags & JOURNAL_RES_GET_NONBLOCK) != 0,

View File

@ -956,8 +956,8 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
scoped_guard(spinlock, &j->lock)
fifo_for_each_entry_ptr(p, &j->pin, iter)
if (dev_idx >= 0
? bch2_dev_list_has_dev(p->devs, dev_idx)
: p->devs.nr < c->opts.metadata_replicas)
? bch2_replicas_entry_has_dev(&p->devs.e, dev_idx)
: p->devs.e.nr_devs < c->opts.metadata_replicas)
seq = iter;
bch2_journal_flush_pins(j, seq);
@ -981,13 +981,12 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
seq = 0;
scoped_guard(spinlock, &j->lock)
while (!ret) {
union bch_replicas_padded replicas;
seq = max(seq, journal_last_seq(j));
if (seq >= j->pin.back)
if (seq > j->seq_ondisk)
break;
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
journal_seq_pin(j, seq)->devs);
union bch_replicas_padded replicas;
memcpy(&replicas, &journal_seq_pin(j, seq)->devs, sizeof(replicas));
seq++;
if (replicas.e.nr_devs) {
@ -1021,6 +1020,9 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
prt_printf(out, "%llu: count %u\n", *seq, atomic_read(&pin_list->count));
guard(printbuf_indent)(out);
bch2_replicas_entry_to_text(out, &pin_list->devs.e);
prt_newline(out);
prt_printf(out, "unflushed:\n");
for (unsigned i = 0; i < ARRAY_SIZE(pin_list->unflushed); i++)
list_for_each_entry(pin, &pin_list->unflushed[i], list)

View File

@ -26,7 +26,7 @@ static inline void journal_pin_list_init(struct journal_entry_pin_list *p, int c
for (unsigned i = 0; i < ARRAY_SIZE(p->flushed); i++)
INIT_LIST_HEAD(&p->flushed[i]);
atomic_set(&p->count, count);
p->devs.nr = 0;
p->devs.e.nr_devs = 0;
p->bytes = 0;
}

View File

@ -5,6 +5,7 @@
#include <linux/cache.h>
#include <linux/workqueue.h>
#include "alloc/replicas_types.h"
#include "alloc/types.h"
#include "init/dev_types.h"
#include "util/fifo.h"
@ -48,6 +49,7 @@ struct journal_buf {
bool write_started:1;
bool write_allocated:1;
bool write_done:1;
bool had_error:1;
u8 idx;
};
@ -70,7 +72,7 @@ struct journal_entry_pin_list {
struct list_head unflushed[JOURNAL_PIN_TYPE_NR];
struct list_head flushed[JOURNAL_PIN_TYPE_NR];
atomic_t count;
struct bch_devs_list devs;
union bch_replicas_padded devs;
size_t bytes;
};
@ -113,7 +115,14 @@ union journal_res_state {
/* bytes: */
#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */
#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) /* 16M */
/*
* The block layer is fragile with large bios - it should be able to process any
* IO incrementally, but...
*
* 4MB corresponds to bio_kmalloc() -> UIO_MAXIOV
*/
#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */
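
For reference on the 4M cap: the bio_kmalloc() limit mentioned above is UIO_MAXIOV (1024) bio_vecs, and on 4 KiB-page systems that works out to 1024 × 4 KiB = 4 MiB (the 4 KiB page size is an assumption; larger pages would raise the ceiling).
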
/*
* We stash some journal state as sentinal values in cur_entry_offset:

View File

@ -188,7 +188,6 @@ static CLOSURE_CALLBACK(journal_write_done)
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union bch_replicas_padded replicas;
u64 seq = le64_to_cpu(w->data->seq);
int err = 0;
@ -196,14 +195,15 @@ static CLOSURE_CALLBACK(journal_write_done)
? j->flush_write_time
: j->noflush_write_time, j->write_start_time);
if (!w->devs_written.nr) {
err = bch_err_throw(c, journal_write_err);
} else {
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
w->devs_written);
err = bch2_mark_replicas(c, &replicas.e);
if (w->had_error) {
struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, seq)->devs.e;
bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written);
}
if (!w->devs_written.nr)
err = bch_err_throw(c, journal_write_err);
if (err && !bch2_journal_error(j)) {
CLASS(printbuf, buf)();
bch2_log_msg_start(c, &buf);
@@ -222,8 +222,7 @@ static CLOSURE_CALLBACK(journal_write_done)
closure_debug_destroy(cl);
spin_lock(&j->lock);
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = w->devs_written;
BUG_ON(seq < j->pin.front);
if (err && (!j->err_seq || seq < j->err_seq))
j->err_seq = seq;
w->write_done = true;
@@ -334,6 +333,7 @@ static void journal_write_endio(struct bio *bio)
unsigned long flags;
spin_lock_irqsave(&j->err_lock, flags);
bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
w->had_error = true;
spin_unlock_irqrestore(&j->err_lock, flags);
}
@@ -632,7 +632,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
closure_type(w, struct journal_buf, io);
struct journal *j = container_of(w, struct journal, buf[w->idx]);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union bch_replicas_padded replicas;
unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]);
int ret;
@@ -701,9 +700,9 @@ CLOSURE_CALLBACK(bch2_journal_write)
* Mark journal replicas before we submit the write to guarantee
* recovery will find the journal entries after a crash.
*/
bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
w->devs_written);
ret = bch2_mark_replicas(c, &replicas.e);
struct bch_replicas_entry_v1 *r = &journal_seq_pin(j, le64_to_cpu(w->data->seq))->devs.e;
bch2_devlist_to_replicas(r, BCH_DATA_journal, w->devs_written);
ret = bch2_mark_replicas(c, r);
if (ret)
goto err;

View File

@@ -525,6 +525,37 @@ void bch2_opts_to_text(struct printbuf *out,
}
}
static int opt_hook_io(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, bool post)
{
if (!test_bit(BCH_FS_started, &c->flags))
return 0;
switch (id) {
case Opt_foreground_target:
case Opt_background_target:
case Opt_promote_target:
case Opt_compression:
case Opt_background_compression:
case Opt_data_checksum:
case Opt_data_replicas:
case Opt_erasure_code: {
struct rebalance_scan s = {
.type = !inum ? REBALANCE_SCAN_fs : REBALANCE_SCAN_inum,
.inum = inum,
};
try(bch2_set_rebalance_needs_scan(c, s));
if (post)
bch2_rebalance_wakeup(c);
break;
}
default:
break;
}
return 0;
}
int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum bch_opt_id id, u64 v,
bool change)
{
@@ -546,16 +577,8 @@ int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, enum b
break;
}
if (change &&
test_bit(BCH_FS_started, &c->flags) &&
(id == Opt_foreground_target ||
id == Opt_background_target ||
id == Opt_promote_target ||
id == Opt_compression ||
id == Opt_background_compression ||
id == Opt_data_checksum ||
id == Opt_data_replicas))
try(bch2_set_rebalance_needs_scan(c, inum));
if (change)
try(opt_hook_io(c, ca, inum, id, false));
return 0;
}
@@ -571,17 +594,7 @@ int bch2_opts_hooks_pre_set(struct bch_fs *c)
void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum,
enum bch_opt_id id, u64 v)
{
if (test_bit(BCH_FS_started, &c->flags) &&
(id == Opt_foreground_target ||
id == Opt_background_target ||
id == Opt_promote_target ||
id == Opt_compression ||
id == Opt_background_compression ||
id == Opt_data_checksum ||
id == Opt_data_replicas)) {
bch2_set_rebalance_needs_scan(c, inum);
bch2_rebalance_wakeup(c);
}
opt_hook_io(c, ca, inum, id, true);
switch (id) {
case Opt_rebalance_enabled:
@@ -838,6 +851,7 @@ void bch2_inode_opts_get(struct bch_fs *c, struct bch_inode_opts *ret, bool meta
ret->background_target = c->opts.metadata_target ?: c->opts.foreground_target;
ret->data_replicas = c->opts.metadata_replicas;
ret->data_checksum = c->opts.metadata_checksum;
ret->erasure_code = false;
} else {
bch2_io_opts_fixups(ret);
}

View File

@@ -72,10 +72,7 @@ static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
unsigned dev)
{
darray_for_each(devs, i)
if (*i == dev)
return true;
return false;
return darray_find(devs, dev) != NULL;
}
static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,

View File

@@ -96,7 +96,7 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t, bool);
#define darray_find_p(_d, _i, cond) \
({ \
typeof((_d).data) _ret = NULL; \
typeof(&(_d).data[0]) _ret = NULL; \
\
darray_for_each(_d, _i) \
if (cond) { \
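The darray_find_p tweak above swaps typeof((_d).data) for typeof(&(_d).data[0]). The practical difference, presumably the point of the fix, is that taking the address of element 0 always yields a plain pointer-to-element, whereas typeof() on the member lvalue carries over any qualifier on the darray itself (a const darray, say), which would make the later _ret assignment refuse to compile. A standalone GNU C sketch of the distinction, using an illustrative stand-in struct rather than a real darray:

#include <stddef.h>

struct strings {
	const char	**data;
	size_t		nr;
};

static const char *first_nonempty(const struct strings *d)
{
	/*
	 * typeof((*d).data) would be "const char **const" here, because the
	 * struct is reached through a const pointer, and the assignment in
	 * the loop below would then fail to compile.  &(*d).data[0] is a
	 * plain pointer-to-element, mirroring the new darray_find_p().
	 */
	typeof(&(*d).data[0]) ret = NULL;

	for (size_t i = 0; i < d->nr; i++)
		if (d->data[i][0]) {
			ret = &d->data[i];
			break;
		}

	return ret ? *ret : NULL;
}

int main(void)
{
	const char *strs[] = { "", "hello", "world" };
	struct strings d = { .data = strs, .nr = 3 };

	return first_nonempty(&d) == NULL;	/* 0: found "hello" */
}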

View File

@@ -278,20 +278,51 @@ static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size,
return n - 1;
}
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
size_t _size = (size); \
void *_base1 = (void *)(base) - _size; \
const void *_search = (search); \
size_t _nr = (nr); \
size_t _i = 1; \
int _res; \
\
while (_i <= _nr && \
(_res = _cmp(_search, _base1 + _i * _size))) \
_i = eytzinger1_child(_i, _res > 0); \
_i - 1; \
})
/* 0 == not found */
static inline int eytzinger1_find_r(void *base, unsigned nr, unsigned size,
cmp_r_func_t cmp_fn, const void *priv,
const void *search)
{
unsigned i = 1;
while (i <= nr) {
int cmp = cmp_fn(search, base + i * size, priv);
if (!cmp)
return i;
i = eytzinger1_child(i, cmp > 0);
}
return 0;
}
/* 0 == not found */
static inline int eytzinger1_find(void *base, unsigned nr, unsigned size,
cmp_func_t cmp_fn, const void *search)
{
unsigned i = 1;
while (i <= nr) {
int cmp = cmp_fn(search, base + i * size);
if (!cmp)
return i;
i = eytzinger1_child(i, cmp > 0);
}
return 0;
}
/* -1 == not found */
static inline int eytzinger0_find_r(void *base, unsigned nr, unsigned size,
cmp_r_func_t cmp_fn, const void *priv,
const void *search)
{
return eytzinger1_find_r(base - size, nr, size, cmp_fn, priv, search) - 1;
}
/* -1 == not found */
static inline int eytzinger0_find(void *base, unsigned nr, unsigned size,
cmp_func_t cmp_fn, const void *search)
{
return eytzinger1_find(base - size, nr, size, cmp_fn, search) - 1;
}
void eytzinger0_sort_r(void *, size_t, size_t,
cmp_r_func_t, swap_r_func_t, const void *);
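For reference, a self-contained userspace sketch of the 1-indexed descent the new helpers implement, handy for checking the "0 == not found" / "-1 == not found" conventions above. The sample array is the sorted set {1..7} laid out in eytzinger order, and eytzinger1_child() is redefined locally (as (i << 1) + child) so the sketch compiles on its own; everything here is illustrative, not lifted from eytzinger.h.

#include <stdio.h>

static unsigned eytzinger1_child(unsigned i, unsigned right)
{
	return (i << 1) + right;
}

/* 0 == not found, matching eytzinger1_find() above */
static unsigned sketch_eytzinger1_find(const int *base1, unsigned nr, int search)
{
	unsigned i = 1;

	while (i <= nr) {
		int cmp = (search > base1[i]) - (search < base1[i]);

		if (!cmp)
			return i;
		i = eytzinger1_child(i, cmp > 0);
	}
	return 0;
}

int main(void)
{
	/* slot 0 unused: eytzinger1_* indexing starts at 1 */
	static const int tree[] = { 0, 4, 2, 6, 1, 3, 5, 7 };

	printf("5 -> slot %u\n", sketch_eytzinger1_find(tree, 7, 5));	/* 6 */
	printf("9 -> slot %u\n", sketch_eytzinger1_find(tree, 7, 9));	/* 0 */
	return 0;
}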

View File

@@ -612,24 +612,51 @@ void bch2_bio_map(struct bio *bio, void *base, size_t size)
bio_add_virt_nofail(bio, base, size);
}
int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
int bch2_bio_alloc_pages(struct bio *bio, unsigned bs, size_t size, gfp_t gfp_mask)
{
BUG_ON(size & (bs - 1));
unsigned bs_pages = DIV_ROUND_UP(bs, PAGE_SIZE);
/*
* XXX: we could do this by allocating higher order pages, but
*
* - the page allocator gets slower at a certain order (5?) - we'd have
* to check for this
*
* - bch2_bio_free_pages_pool() probably does not handle compound pages
* yet
*/
DARRAY_PREALLOCATED(struct page *, 16) pages;
darray_init(&pages);
darray_make_room_gfp(&pages, bs_pages, gfp_mask|__GFP_NOFAIL);
int ret = 0;
while (size) {
struct page *page = alloc_pages(gfp_mask, 0);
unsigned len = min_t(size_t, PAGE_SIZE, size);
while (pages.nr < bs_pages) {
struct page *page = alloc_pages(gfp_mask, 0);
if (!page) {
ret = -ENOMEM;
goto out;
}
if (!page)
return -ENOMEM;
if (unlikely(!bio_add_page(bio, page, len, 0))) {
__free_page(page);
break;
BUG_ON(darray_push(&pages, page));
}
size -= len;
}
while (pages.nr) {
BUG_ON(!size);
return 0;
unsigned len = min(PAGE_SIZE, size);
size -= len;
struct page *page = darray_pop(&pages);
BUG_ON(!bio_add_page(bio, page, len, 0));
}
}
out:
darray_for_each(pages, i)
__free_page(*i);
darray_exit(&pages);
return ret;
}
u64 bch2_get_random_u64_below(u64 ceil)
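The rewritten bch2_bio_alloc_pages() batches allocation per block: it collects bs_pages pages before adding any of them to the bio, so a failed alloc_pages() can never leave a block half-attached. A userspace sketch of that reserve-then-commit pattern, with malloc() standing in for the page allocator and purely illustrative sizes:

#include <stdio.h>
#include <stdlib.h>

#define SKETCH_PAGE_SIZE	4096u

/* Reserve every page a block needs before committing any of them */
static int fill_block(void **pages, unsigned bs_pages)
{
	unsigned n;

	for (n = 0; n < bs_pages; n++) {
		pages[n] = malloc(SKETCH_PAGE_SIZE);
		if (!pages[n])
			goto err;
	}
	return 0;
err:
	/* Nothing was handed out yet, so unwinding is trivial */
	while (n--)
		free(pages[n]);
	return -1;
}

int main(void)
{
	void *pages[4];		/* e.g. one 16k block on 4k pages */

	if (fill_block(pages, 4)) {
		fprintf(stderr, "allocation failed, block untouched\n");
		return 1;
	}

	/* Commit step: this is where bio_add_page() would run in the real code */
	for (unsigned i = 0; i < 4; i++)
		free(pages[i]);

	return 0;
}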

View File

@@ -370,7 +370,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
}
void bch2_bio_map(struct bio *bio, void *base, size_t);
int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t);
int bch2_bio_alloc_pages(struct bio *, unsigned, size_t, gfp_t);
#define closure_bio_submit(bio, cl) \
do { \

View File

@@ -123,7 +123,10 @@ static int bch2_write_inode_trans(struct btree_trans *trans,
struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u);
*rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r));
if (*rebalance_changed)
try(bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum));
try(bch2_set_rebalance_needs_scan_trans(trans,
(struct rebalance_scan) {
.type = REBALANCE_SCAN_inum,
.inum = inode_u.bi_inum }));
try(bch2_inode_write(trans, &iter, &inode_u));
try(bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc));