Update bcachefs sources to edf5f38218 bcachefs: Refactor superblock code

Kent Overstreet 2018-04-10 19:19:09 -04:00
parent ff5e165532
commit c598d91dcb
59 changed files with 2532 additions and 2221 deletions
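The whole diff hinges on one change: the bare struct bch_sb * that callers used to hold (c->disk_sb, the sb built in bch2_format(), and so on) becomes a struct bch_sb_handle that owns the buffer, with callers dereferencing .sb when they need the on-disk structure. A minimal sketch of the shape of the change — the handle's fields beyond ->sb are assumed here, not taken from this diff:

    struct bch_sb;                          /* on-disk superblock layout */

    struct bch_sb_handle {
            struct bch_sb   *sb;            /* in-memory copy of the superblock */
            /* buffer size / IO state owned by the superblock code (elided) */
    };

    /*
     * Call sites change mechanically, as the hunks below show:
     *
     *   bch2_sb_get_crypt(c->disk_sb)             ->  bch2_sb_get_crypt(c->disk_sb.sb)
     *   SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1) ->  SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1)
     *   bch2_fs_sb_resize_crypt(c, u64s)          ->  bch2_sb_resize_crypt(&c->disk_sb, u64s)
     *
     * so the superblock code can reallocate the buffer behind the handle
     * instead of handing every caller a pointer that may go stale.
     */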

View File

@ -1 +1 @@
9fc6ccd8659598d4ca885220a795889071b619f4
edf5f38218f699e53913a549465f35d36c4418f7

View File

@ -86,7 +86,7 @@ int cmd_set_passphrase(int argc, char *argv[])
if (IS_ERR(c))
die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c)));
struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb);
struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb);
if (!crypt)
die("Filesystem does not have encryption enabled");
@ -100,7 +100,7 @@ int cmd_set_passphrase(int argc, char *argv[])
char *new_passphrase = read_passphrase_twice("Enter new passphrase: ");
struct bch_key passphrase_key = derive_passphrase(crypt, new_passphrase);
if (bch2_chacha_encrypt_key(&passphrase_key, __bch2_sb_key_nonce(c->disk_sb),
if (bch2_chacha_encrypt_key(&passphrase_key, __bch2_sb_key_nonce(c->disk_sb.sb),
&new_key, sizeof(new_key)))
die("error encrypting key");
crypt->key = new_key;
@ -123,7 +123,7 @@ int cmd_remove_passphrase(int argc, char *argv[])
if (IS_ERR(c))
die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c)));
struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb);
struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb);
if (!crypt)
die("Filesystem does not have encryption enabled");

View File

@ -31,6 +31,7 @@
#include "libbcachefs/fs.h"
#include "libbcachefs/inode.h"
#include "libbcachefs/io.h"
#include "libbcachefs/replicas.h"
#include "libbcachefs/str_hash.h"
#include "libbcachefs/super.h"
#include "libbcachefs/xattr.h"

View File

@ -15,7 +15,7 @@
#define BUG_ON(cond) assert(!(cond))
#define WARN_ON_ONCE(cond) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define WARN_ONCE(cond, msg) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define WARN_ONCE(cond, ...) ({ bool _r = (cond); if (_r) assert(0); _r; })
#define __WARN() assert(0)
#define __WARN_printf(arg...) assert(0)

View File

@ -319,7 +319,7 @@ TRACE_EVENT(btree_gc_coalesce_fail,
TP_fast_assign(
__entry->reason = reason;
memcpy(__entry->uuid, c->disk_sb->user_uuid.b, 16);
memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16);
),
TP_printk("%pU: %u", __entry->uuid, __entry->reason)

View File

@ -14,12 +14,14 @@
#include <uuid/uuid.h>
#include "libbcachefs/bcachefs_format.h"
#include "libbcachefs/checksum.h"
#include "crypto.h"
#include "libbcachefs.h"
#include "crypto.h"
#include "libbcachefs/bcachefs_format.h"
#include "libbcachefs/btree_cache.h"
#include "libbcachefs/checksum.h"
#include "libbcachefs/disk_groups.h"
#include "libbcachefs/opts.h"
#include "libbcachefs/replicas.h"
#include "libbcachefs/super-io.h"
#define NSEC_PER_SEC 1000000000L
@ -124,8 +126,8 @@ void bch2_pick_bucket_size(struct format_opts opts, struct dev_opts *dev)
}
static unsigned parse_target(struct dev_opts *devs, size_t nr_devs,
struct bch_sb_field_disk_groups *gi,
static unsigned parse_target(struct bch_sb_handle *sb,
struct dev_opts *devs, size_t nr_devs,
const char *s)
{
struct dev_opts *i;
@ -138,7 +140,7 @@ static unsigned parse_target(struct dev_opts *devs, size_t nr_devs,
if (!strcmp(s, i->path))
return dev_to_target(i - devs);
idx = __bch2_disk_group_find(gi, s);
idx = bch2_disk_path_find(sb, s);
if (idx >= 0)
return group_to_target(idx);
@ -149,11 +151,9 @@ static unsigned parse_target(struct dev_opts *devs, size_t nr_devs,
struct bch_sb *bch2_format(struct format_opts opts,
struct dev_opts *devs, size_t nr_devs)
{
struct bch_sb *sb;
struct bch_sb_handle sb = { NULL };
struct dev_opts *i;
struct bch_sb_field_members *mi;
struct bch_sb_field_disk_groups *gi = NULL;
unsigned u64s;
/* calculate block size: */
if (!opts.block_size)
@ -184,58 +184,51 @@ struct bch_sb *bch2_format(struct format_opts opts,
if (uuid_is_null(opts.uuid.b))
uuid_generate(opts.uuid.b);
sb = calloc(1, sizeof(*sb) +
sizeof(struct bch_sb_field_members) +
sizeof(struct bch_member) * nr_devs +
sizeof(struct bch_sb_field_disk_groups) +
sizeof(struct bch_disk_group) * nr_devs +
sizeof(struct bch_sb_field_crypt));
if (bch2_sb_realloc(&sb, 0))
die("insufficient memory");
sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);
sb->magic = BCACHE_MAGIC;
sb->block_size = cpu_to_le16(opts.block_size);
sb->user_uuid = opts.uuid;
sb->nr_devices = nr_devs;
sb.sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);
sb.sb->magic = BCACHE_MAGIC;
sb.sb->block_size = cpu_to_le16(opts.block_size);
sb.sb->user_uuid = opts.uuid;
sb.sb->nr_devices = nr_devs;
uuid_generate(sb->uuid.b);
uuid_generate(sb.sb->uuid.b);
if (opts.label)
strncpy((char *) sb->label, opts.label, sizeof(sb->label));
strncpy((char *) sb.sb->label, opts.label, sizeof(sb.sb->label));
SET_BCH_SB_CSUM_TYPE(sb, opts.meta_csum_type);
SET_BCH_SB_META_CSUM_TYPE(sb, opts.meta_csum_type);
SET_BCH_SB_DATA_CSUM_TYPE(sb, opts.data_csum_type);
SET_BCH_SB_COMPRESSION_TYPE(sb, opts.compression_type);
SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(sb, opts.background_compression_type);
SET_BCH_SB_CSUM_TYPE(sb.sb, opts.meta_csum_type);
SET_BCH_SB_META_CSUM_TYPE(sb.sb, opts.meta_csum_type);
SET_BCH_SB_DATA_CSUM_TYPE(sb.sb, opts.data_csum_type);
SET_BCH_SB_COMPRESSION_TYPE(sb.sb, opts.compression_type);
SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(sb.sb,
opts.background_compression_type);
SET_BCH_SB_BTREE_NODE_SIZE(sb, opts.btree_node_size);
SET_BCH_SB_GC_RESERVE(sb, 8);
SET_BCH_SB_META_REPLICAS_WANT(sb, opts.meta_replicas);
SET_BCH_SB_META_REPLICAS_REQ(sb, opts.meta_replicas_required);
SET_BCH_SB_DATA_REPLICAS_WANT(sb, opts.data_replicas);
SET_BCH_SB_DATA_REPLICAS_REQ(sb, opts.data_replicas_required);
SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action);
SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH);
SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, ilog2(opts.encoded_extent_max));
SET_BCH_SB_BTREE_NODE_SIZE(sb.sb, opts.btree_node_size);
SET_BCH_SB_GC_RESERVE(sb.sb, 8);
SET_BCH_SB_META_REPLICAS_WANT(sb.sb, opts.meta_replicas);
SET_BCH_SB_META_REPLICAS_REQ(sb.sb, opts.meta_replicas_required);
SET_BCH_SB_DATA_REPLICAS_WANT(sb.sb, opts.data_replicas);
SET_BCH_SB_DATA_REPLICAS_REQ(sb.sb, opts.data_replicas_required);
SET_BCH_SB_ERROR_ACTION(sb.sb, opts.on_error_action);
SET_BCH_SB_STR_HASH_TYPE(sb.sb, BCH_STR_HASH_SIPHASH);
SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb.sb,ilog2(opts.encoded_extent_max));
SET_BCH_SB_POSIX_ACL(sb, 1);
SET_BCH_SB_POSIX_ACL(sb.sb, 1);
struct timespec now;
if (clock_gettime(CLOCK_REALTIME, &now))
die("error getting current time: %m");
sb->time_base_lo = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec);
sb->time_precision = cpu_to_le32(1);
mi = vstruct_end(sb);
u64s = (sizeof(struct bch_sb_field_members) +
sizeof(struct bch_member) * nr_devs) / sizeof(u64);
le32_add_cpu(&sb->u64s, u64s);
le32_add_cpu(&mi->field.u64s, u64s);
mi->field.type = BCH_SB_FIELD_members;
sb.sb->time_base_lo = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec);
sb.sb->time_precision = cpu_to_le32(1);
/* Member info: */
mi = bch2_sb_resize_members(&sb,
(sizeof(*mi) + sizeof(struct bch_member) *
nr_devs) / sizeof(u64));
for (i = devs; i < devs + nr_devs; i++) {
struct bch_member *m = mi->members + (i - devs);
@ -253,63 +246,38 @@ struct bch_sb *bch2_format(struct format_opts opts,
/* Disk groups */
for (i = devs; i < devs + nr_devs; i++) {
struct bch_member *m = mi->members + (i - devs);
struct bch_disk_group *g;
size_t len;
int idx;
if (!i->group)
continue;
len = min_t(size_t, strlen(i->group) + 1, BCH_SB_LABEL_SIZE);
idx = bch2_disk_path_find_or_create(&sb, i->group);
if (idx < 0)
die("error creating disk path: %s", idx);
if (!gi) {
gi = vstruct_end(sb);
u64s = sizeof(*gi) / sizeof(u64);
le32_add_cpu(&sb->u64s, u64s);
le32_add_cpu(&gi->field.u64s, u64s);
gi->field.type = BCH_SB_FIELD_disk_groups;
}
idx = __bch2_disk_group_find(gi, i->group);
if (idx >= 0) {
g = gi->entries + idx;
} else {
u64s = sizeof(*g) / sizeof(u64);
g = vstruct_end(&gi->field);
le32_add_cpu(&sb->u64s, u64s);
le32_add_cpu(&gi->field.u64s, u64s);
memcpy(g->label, i->group, len);
SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
}
SET_BCH_MEMBER_GROUP(m, (g - gi->entries) + 1);
SET_BCH_MEMBER_GROUP(m, idx + 1);
}
SET_BCH_SB_FOREGROUND_TARGET(sb,
parse_target(devs, nr_devs, gi, opts.foreground_target));
SET_BCH_SB_BACKGROUND_TARGET(sb,
parse_target(devs, nr_devs, gi, opts.background_target));
SET_BCH_SB_PROMOTE_TARGET(sb,
parse_target(devs, nr_devs, gi, opts.promote_target));
SET_BCH_SB_FOREGROUND_TARGET(sb.sb,
parse_target(&sb, devs, nr_devs, opts.foreground_target));
SET_BCH_SB_BACKGROUND_TARGET(sb.sb,
parse_target(&sb, devs, nr_devs, opts.background_target));
SET_BCH_SB_PROMOTE_TARGET(sb.sb,
parse_target(&sb, devs, nr_devs, opts.promote_target));
/* Crypt: */
if (opts.encrypted) {
struct bch_sb_field_crypt *crypt = vstruct_end(sb);
struct bch_sb_field_crypt *crypt =
bch2_sb_resize_crypt(&sb, sizeof(*crypt) / sizeof(u64));
u64s = sizeof(struct bch_sb_field_crypt) / sizeof(u64);
le32_add_cpu(&sb->u64s, u64s);
crypt->field.u64s = cpu_to_le32(u64s);
crypt->field.type = BCH_SB_FIELD_crypt;
bch_sb_crypt_init(sb, crypt, opts.passphrase);
SET_BCH_SB_ENCRYPTION_TYPE(sb, 1);
bch_sb_crypt_init(sb.sb, crypt, opts.passphrase);
SET_BCH_SB_ENCRYPTION_TYPE(sb.sb, 1);
}
for (i = devs; i < devs + nr_devs; i++) {
sb->dev_idx = i - devs;
sb.sb->dev_idx = i - devs;
init_layout(&sb->layout, opts.block_size,
init_layout(&sb.sb->layout, opts.block_size,
i->sb_offset, i->sb_end);
if (i->sb_offset == BCH_SB_SECTOR) {
@ -319,11 +287,11 @@ struct bch_sb *bch2_format(struct format_opts opts,
xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
}
bch2_super_write(i->fd, sb);
bch2_super_write(i->fd, sb.sb);
close(i->fd);
}
return sb;
return sb.sb;
}
void bch2_super_write(int fd, struct bch_sb *sb)
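Put side by side, the bch2_format() hunks above swap the hand-rolled field appends for the new resize helpers; both halves below are taken from those hunks, only reindented:

    /* before: append at vstruct_end() and account u64s by hand */
    mi = vstruct_end(sb);
    u64s = (sizeof(struct bch_sb_field_members) +
            sizeof(struct bch_member) * nr_devs) / sizeof(u64);
    le32_add_cpu(&sb->u64s, u64s);
    le32_add_cpu(&mi->field.u64s, u64s);
    mi->field.type = BCH_SB_FIELD_members;

    /* after: one call that grows the buffer and sizes/types the field */
    mi = bch2_sb_resize_members(&sb,
            (sizeof(*mi) + sizeof(struct bch_member) * nr_devs) / sizeof(u64));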
@ -553,11 +521,11 @@ static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f,
typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
struct bch_sb_field_ops {
struct bch_sb_field_toolops {
sb_field_print_fn print;
};
static const struct bch_sb_field_ops bch2_sb_field_ops[] = {
static const struct bch_sb_field_toolops bch2_sb_field_ops[] = {
#define x(f, nr) \
[BCH_SB_FIELD_##f] = { \
.print = bch2_sb_print_##f, \

View File

@ -58,11 +58,13 @@
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "checksum.h"
#include "clock.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "io.h"
@ -79,7 +81,7 @@
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
static void bch2_recalc_min_prio(struct bch_fs *, struct bch_dev *, int);
static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
/* Ratelimiting/PD controllers */
@ -130,8 +132,7 @@ static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
return DIV_ROUND_UP(bytes, sizeof(u64));
}
static const char *bch2_alloc_invalid(const struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (k.k->p.inode >= c->sb.nr_devices ||
!c->devs[k.k->p.inode])
@ -152,8 +153,8 @@ static const char *bch2_alloc_invalid(const struct bch_fs *c,
return NULL;
}
static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_alloc_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
buf[0] = '\0';
@ -163,11 +164,6 @@ static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
}
}
const struct bkey_ops bch2_bkey_alloc_ops = {
.key_invalid = bch2_alloc_invalid,
.val_to_text = bch2_alloc_to_text,
};
static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
{
unsigned v;
@ -236,9 +232,9 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
d = a.v->data;
if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
g->prio[READ] = get_alloc_field(&d, 2);
g->io_time[READ] = get_alloc_field(&d, 2);
if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
g->prio[WRITE] = get_alloc_field(&d, 2);
g->io_time[WRITE] = get_alloc_field(&d, 2);
lg_local_unlock(&c->usage_lock);
}
@ -270,21 +266,21 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
bch2_alloc_read_key(c, bkey_i_to_s_c(k));
}
mutex_lock(&c->prio_clock[READ].lock);
mutex_lock(&c->bucket_clock[READ].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
bch2_recalc_min_prio(c, ca, READ);
bch2_recalc_oldest_io(c, ca, READ);
up_read(&ca->bucket_lock);
}
mutex_unlock(&c->prio_clock[READ].lock);
mutex_unlock(&c->bucket_clock[READ].lock);
mutex_lock(&c->prio_clock[WRITE].lock);
mutex_lock(&c->bucket_clock[WRITE].lock);
for_each_member_device(ca, c, i) {
down_read(&ca->bucket_lock);
bch2_recalc_min_prio(c, ca, WRITE);
bch2_recalc_oldest_io(c, ca, WRITE);
up_read(&ca->bucket_lock);
}
mutex_unlock(&c->prio_clock[WRITE].lock);
mutex_unlock(&c->bucket_clock[WRITE].lock);
return 0;
}
@ -320,9 +316,9 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
d = a->v.data;
if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
put_alloc_field(&d, 2, g->prio[READ]);
put_alloc_field(&d, 2, g->io_time[READ]);
if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
put_alloc_field(&d, 2, g->prio[WRITE]);
put_alloc_field(&d, 2, g->io_time[WRITE]);
lg_local_unlock(&c->usage_lock);
ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
@ -395,38 +391,34 @@ int bch2_alloc_write(struct bch_fs *c)
/* Bucket IO clocks: */
static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
{
struct prio_clock *clock = &c->prio_clock[rw];
struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets = bucket_array(ca);
struct bucket *g;
u16 max_delta = 1;
u16 max_last_io = 0;
unsigned i;
lockdep_assert_held(&c->prio_clock[rw].lock);
lockdep_assert_held(&c->bucket_clock[rw].lock);
/* Determine min prio for this particular device */
/* Recalculate max_last_io for this device: */
for_each_bucket(g, buckets)
max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
ca->min_prio[rw] = clock->hand - max_delta;
ca->max_last_bucket_io[rw] = max_last_io;
/*
* This may possibly increase the min prio for the whole device, check
* that as well.
*/
max_delta = 1;
/* Recalculate global max_last_io: */
max_last_io = 0;
for_each_member_device(ca, c, i)
max_delta = max(max_delta,
(u16) (clock->hand - ca->min_prio[rw]));
max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
clock->min_prio = clock->hand - max_delta;
clock->max_last_io = max_last_io;
}
static void bch2_rescale_prios(struct bch_fs *c, int rw)
static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
{
struct prio_clock *clock = &c->prio_clock[rw];
struct bucket_clock *clock = &c->bucket_clock[rw];
struct bucket_array *buckets;
struct bch_dev *ca;
struct bucket *g;
@ -439,10 +431,10 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw)
buckets = bucket_array(ca);
for_each_bucket(g, buckets)
g->prio[rw] = clock->hand -
(clock->hand - g->prio[rw]) / 2;
g->io_time[rw] = clock->hand -
bucket_last_io(c, g, rw) / 2;
bch2_recalc_min_prio(c, ca, rw);
bch2_recalc_oldest_io(c, ca, rw);
up_read(&ca->bucket_lock);
}
@ -450,19 +442,26 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw)
static void bch2_inc_clock_hand(struct io_timer *timer)
{
struct prio_clock *clock = container_of(timer,
struct prio_clock, rescale);
struct bucket_clock *clock = container_of(timer,
struct bucket_clock, rescale);
struct bch_fs *c = container_of(clock,
struct bch_fs, prio_clock[clock->rw]);
struct bch_fs, bucket_clock[clock->rw]);
struct bch_dev *ca;
u64 capacity;
unsigned i;
mutex_lock(&clock->lock);
clock->hand++;
/* if clock cannot be advanced more, rescale prio */
if (clock->hand == (u16) (clock->min_prio - 1))
bch2_rescale_prios(c, clock->rw);
if (clock->max_last_io >= U16_MAX - 2)
bch2_rescale_bucket_io_times(c, clock->rw);
BUG_ON(clock->max_last_io >= U16_MAX - 2);
for_each_member_device(ca, c, i)
ca->max_last_bucket_io[clock->rw]++;
clock->max_last_io++;
clock->hand++;
mutex_unlock(&clock->lock);
@ -484,9 +483,9 @@ static void bch2_inc_clock_hand(struct io_timer *timer)
bch2_io_timer_add(&c->io_clock[clock->rw], timer);
}
static void bch2_prio_timer_init(struct bch_fs *c, int rw)
static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
{
struct prio_clock *clock = &c->prio_clock[rw];
struct bucket_clock *clock = &c->bucket_clock[rw];
clock->hand = 1;
clock->rw = rw;
@ -536,7 +535,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
while (1) {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread_should_stop()) {
ret = -1;
ret = 1;
break;
}
@ -635,13 +634,14 @@ static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
size_t b, struct bucket_mark m)
{
unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
unsigned max_last_io = ca->max_last_bucket_io[READ];
/*
* Time since last read, scaled to [0, 8) where larger value indicates
* more recently read data:
*/
unsigned long hotness =
(bucket(ca, b)->prio[READ] - ca->min_prio[READ]) * 7 /
(c->prio_clock[READ].hand - ca->min_prio[READ]);
unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
/* How much we want to keep the data in this bucket: */
unsigned long data_wantness =
@ -659,23 +659,25 @@ static inline int bucket_alloc_cmp(alloc_heap *h,
struct alloc_heap_entry l,
struct alloc_heap_entry r)
{
return (l.key > r.key) - (l.key < r.key);
return (l.key > r.key) - (l.key < r.key) ?:
(l.nr < r.nr) - (l.nr > r.nr) ?:
(l.bucket > r.bucket) - (l.bucket < r.bucket);
}
static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
struct bucket_array *buckets;
struct alloc_heap_entry e;
struct alloc_heap_entry e = { 0 };
size_t b;
ca->alloc_heap.used = 0;
mutex_lock(&c->prio_clock[READ].lock);
mutex_lock(&c->bucket_clock[READ].lock);
down_read(&ca->bucket_lock);
buckets = bucket_array(ca);
bch2_recalc_min_prio(c, ca, READ);
bch2_recalc_oldest_io(c, ca, READ);
/*
* Find buckets with lowest read priority, by building a maxheap sorted
@ -684,30 +686,45 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
*/
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
unsigned long key = bucket_sort_key(c, ca, b, m);
if (!bch2_can_invalidate_bucket(ca, b, m))
continue;
e = (struct alloc_heap_entry) {
.bucket = b,
.key = bucket_sort_key(c, ca, b, m)
};
if (e.nr && e.bucket + e.nr == b && e.key == key) {
e.nr++;
} else {
if (e.nr)
heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
e = (struct alloc_heap_entry) {
.bucket = b,
.nr = 1,
.key = key,
};
}
cond_resched();
}
if (e.nr)
heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
up_read(&ca->bucket_lock);
mutex_unlock(&c->prio_clock[READ].lock);
mutex_unlock(&c->bucket_clock[READ].lock);
heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
/*
* If we run out of buckets to invalidate, bch2_allocator_thread() will
* kick stuff and retry us
*/
while (!fifo_full(&ca->free_inc) &&
heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp))
bch2_invalidate_one_bucket(c, ca, e.bucket);
while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) {
for (b = e.bucket;
b < e.bucket + e.nr;
b++) {
if (fifo_full(&ca->free_inc))
return;
bch2_invalidate_one_bucket(c, ca, b);
}
}
}
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@ -729,6 +746,8 @@ static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
if (bch2_can_invalidate_bucket(ca, b, m))
bch2_invalidate_one_bucket(c, ca, b);
cond_resched();
}
}
@ -749,6 +768,8 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca
if (bch2_can_invalidate_bucket(ca, b, m))
bch2_invalidate_one_bucket(c, ca, b);
cond_resched();
}
}
@ -850,7 +871,7 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
if ((current->flags & PF_KTHREAD) &&
kthread_should_stop()) {
ret = -1;
ret = 1;
break;
}
@ -880,7 +901,7 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
ca->mi.bucket_size, GFP_NOIO, 0);
if (push_invalidated_bucket(c, ca, bucket))
return -1;
return 1;
}
return 0;
@ -905,17 +926,32 @@ static int bch2_allocator_thread(void *arg)
while (1) {
while (1) {
cond_resched();
pr_debug("discarding %zu invalidated buckets",
ca->nr_invalidated);
ret = discard_invalidated_buckets(c, ca);
if (ret)
return 0;
goto stop;
if (fifo_empty(&ca->free_inc))
break;
pr_debug("invalidating %zu buckets",
fifo_used(&ca->free_inc));
journal_seq = 0;
ret = bch2_invalidate_free_inc(c, ca, &journal_seq, SIZE_MAX);
if (ret)
return 0;
if (ret) {
bch_err(ca, "error invalidating buckets: %i", ret);
goto stop;
}
if (!ca->nr_invalidated) {
bch_err(ca, "allocator thread unable to make forward progress!");
goto stop;
}
if (ca->allocator_invalidating_data)
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
@ -927,22 +963,29 @@ static int bch2_allocator_thread(void *arg)
* journal error - buckets haven't actually been
* invalidated, can't discard them:
*/
if (ret)
return 0;
if (ret) {
bch_err(ca, "journal error: %i", ret);
goto stop;
}
}
pr_debug("free_inc now empty");
/* Reset front/back so we can easily sort fifo entries later: */
ca->free_inc.front = ca->free_inc.back = 0;
ca->allocator_journal_seq_flush = 0;
ca->allocator_invalidating_data = false;
down_read(&c->gc_lock);
if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
up_read(&c->gc_lock);
return 0;
}
while (1) {
size_t prev = fifo_used(&ca->free_inc);
if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
up_read(&c->gc_lock);
bch_err(ca, "gc failure");
goto stop;
}
/*
* Find some buckets that we can invalidate, either
* they're completely unused, or only contain clean data
@ -950,7 +993,14 @@ static int bch2_allocator_thread(void *arg)
* another cache tier
*/
pr_debug("scanning for reclaimable buckets");
find_reclaimable_buckets(c, ca);
pr_debug("found %zu buckets (free_inc %zu/%zu)",
fifo_used(&ca->free_inc) - prev,
fifo_used(&ca->free_inc), ca->free_inc.size);
trace_alloc_batch(ca, fifo_used(&ca->free_inc),
ca->free_inc.size);
@ -977,15 +1027,20 @@ static int bch2_allocator_thread(void *arg)
ca->allocator_blocked = true;
closure_wake_up(&c->freelist_wait);
if (wait_buckets_available(c, ca)) {
ret = wait_buckets_available(c, ca);
if (ret) {
up_read(&c->gc_lock);
return 0;
goto stop;
}
}
ca->allocator_blocked = false;
up_read(&c->gc_lock);
pr_debug("free_inc now %zu/%zu",
fifo_used(&ca->free_inc),
ca->free_inc.size);
sort_free_inc(c, ca);
/*
@ -993,6 +1048,10 @@ static int bch2_allocator_thread(void *arg)
* write out the new bucket gens:
*/
}
stop:
pr_debug("alloc thread stopping (ret %i)", ret);
return 0;
}
/* Allocation */
@ -1046,8 +1105,8 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
return ob;
}
/* _only_ for allocating the journal and btree roots on a brand new fs: */
int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
/* _only_ for allocating the journal on a new device: */
long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
{
struct bucket_array *buckets;
ssize_t b;
@ -1056,14 +1115,8 @@ int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
buckets = bucket_array(ca);
for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
if (is_available_bucket(buckets->b[b].mark)) {
bch2_mark_alloc_bucket(c, ca, b, true,
gc_pos_alloc(c, NULL),
BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
BCH_BUCKET_MARK_GC_LOCK_HELD);
set_bit(b, ca->buckets_dirty);
if (is_available_bucket(buckets->b[b].mark))
goto success;
}
b = -1;
success:
rcu_read_unlock();
@ -1135,9 +1188,8 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
break;
}
if (unlikely(test_bit(BCH_FS_BRAND_NEW_FS, &c->flags)) &&
(bucket = bch2_bucket_alloc_startup(c, ca)) >= 0)
goto out;
if (cl)
closure_wait(&c->freelist_wait, cl);
spin_unlock(&c->freelist_lock);
@ -1218,7 +1270,7 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
*v = *v < scale ? 0 : *v - scale;
}
static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c,
struct write_point *wp,
unsigned nr_replicas,
enum alloc_reserve reserve,
@ -1284,52 +1336,22 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
break;
}
}
rcu_read_unlock();
EBUG_ON(reserve == RESERVE_MOVINGGC &&
ret != ALLOC_SUCCESS &&
ret != OPEN_BUCKETS_EMPTY);
rcu_read_unlock();
return ret;
}
static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
unsigned nr_replicas,
enum alloc_reserve reserve,
struct bch_devs_mask *devs,
struct closure *cl)
{
bool waiting = false;
while (1) {
switch (__bch2_bucket_alloc_set(c, wp, nr_replicas,
reserve, devs, cl)) {
case ALLOC_SUCCESS:
if (waiting)
closure_wake_up(&c->freelist_wait);
return 0;
case NO_DEVICES:
if (waiting)
closure_wake_up(&c->freelist_wait);
return -EROFS;
case FREELIST_EMPTY:
if (!cl)
return -ENOSPC;
if (waiting)
return -EAGAIN;
/* Retry allocation after adding ourself to waitlist: */
closure_wait(&c->freelist_wait, cl);
waiting = true;
break;
case OPEN_BUCKETS_EMPTY:
return cl ? -EAGAIN : -ENOSPC;
default:
BUG();
}
switch (ret) {
case ALLOC_SUCCESS:
return 0;
case NO_DEVICES:
return -EROFS;
case FREELIST_EMPTY:
case OPEN_BUCKETS_EMPTY:
return cl ? -EAGAIN : -ENOSPC;
default:
BUG();
}
}
@ -1530,11 +1552,12 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
nr_ptrs_have = wp->first_ptr;
/* does writepoint have ptrs we don't want to use? */
writepoint_for_each_ptr(wp, ob, i)
if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
}
if (target)
writepoint_for_each_ptr(wp, ob, i)
if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
wp->first_ptr++;
}
if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) {
ret = open_bucket_add_buckets(c, target, wp, devs_have,
@ -1551,7 +1574,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
nr_replicas, reserve, cl);
}
if (ret)
if (ret && ret != -EROFS)
goto err;
alloc_done:
/* check for more than one cache: */
@ -1584,6 +1607,13 @@ alloc_done:
nr_ptrs_effective += ca->mi.durability;
}
if (ret == -EROFS &&
nr_ptrs_effective >= nr_replicas_required)
ret = 0;
if (ret)
goto err;
if (nr_ptrs_effective > nr_replicas) {
writepoint_for_each_ptr(wp, ob, i) {
ca = bch_dev_bkey_exists(c, ob->ptr.dev);
@ -1749,14 +1779,14 @@ void bch2_recalc_capacity(struct bch_fs *c)
if (c->capacity) {
bch2_io_timer_add(&c->io_clock[READ],
&c->prio_clock[READ].rescale);
&c->bucket_clock[READ].rescale);
bch2_io_timer_add(&c->io_clock[WRITE],
&c->prio_clock[WRITE].rescale);
&c->bucket_clock[WRITE].rescale);
} else {
bch2_io_timer_del(&c->io_clock[READ],
&c->prio_clock[READ].rescale);
&c->bucket_clock[READ].rescale);
bch2_io_timer_del(&c->io_clock[WRITE],
&c->prio_clock[WRITE].rescale);
&c->bucket_clock[WRITE].rescale);
}
/* Wake up case someone was waiting for buckets */
@ -1889,7 +1919,8 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
if (ca->alloc_thread)
return 0;
p = kthread_create(bch2_allocator_thread, ca, "bcache_allocator");
p = kthread_create(bch2_allocator_thread, ca,
"bch_alloc[%s]", ca->name);
if (IS_ERR(p))
return PTR_ERR(p);
@ -1923,7 +1954,7 @@ static void allocator_start_issue_discards(struct bch_fs *c)
static int __bch2_fs_allocator_start(struct bch_fs *c)
{
struct bch_dev *ca;
size_t bu, i, devs_have_enough = 0;
size_t bu, i;
unsigned dev_iter;
u64 journal_seq = 0;
bool invalidating_data = false;
@ -1964,16 +1995,21 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
/* did we find enough buckets? */
for_each_rw_member(ca, c, dev_iter)
devs_have_enough += (fifo_used(&ca->free_inc) >=
ca->free[RESERVE_BTREE].size);
if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) {
percpu_ref_put(&ca->io_ref);
goto not_enough;
}
if (devs_have_enough >= c->opts.metadata_replicas)
return 0;
return 0;
not_enough:
pr_debug("did not find enough empty buckets; issuing discards");
/* clear out free_inc - find_reclaimable_buckets() assumes it's empty */
for_each_rw_member(ca, c, dev_iter)
discard_invalidated_buckets(c, ca);
pr_debug("scanning for reclaimable buckets");
for_each_rw_member(ca, c, dev_iter) {
BUG_ON(!fifo_empty(&ca->free_inc));
ca->free_inc.front = ca->free_inc.back = 0;
@ -1988,6 +2024,8 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
break;
}
pr_debug("done scanning for reclaimable buckets");
/*
* We're moving buckets to freelists _before_ they've been marked as
* invalidated on disk - we have to so that we can allocate new btree
@ -1997,10 +2035,13 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
* have cached data in them, which is live until they're marked as
* invalidated on disk:
*/
if (invalidating_data)
if (invalidating_data) {
pr_debug("invalidating existing data");
set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
else
} else {
pr_debug("issuing discards");
allocator_start_issue_discards(c);
}
/*
* XXX: it's possible for this to deadlock waiting on journal reclaim,
@ -2017,13 +2058,15 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
}
if (invalidating_data) {
pr_debug("flushing journal");
ret = bch2_journal_flush_seq(&c->journal, journal_seq);
if (ret)
return ret;
}
if (invalidating_data)
pr_debug("issuing discards");
allocator_start_issue_discards(c);
}
for_each_rw_member(ca, c, dev_iter)
while (ca->nr_invalidated) {
@ -2038,19 +2081,43 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
struct bucket_table *tbl;
struct rhash_head *pos;
struct btree *b;
bool flush_updates;
size_t nr_pending_updates;
clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
again:
pr_debug("flushing dirty btree nodes");
cond_resched();
flush_updates = false;
nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
rcu_read_lock();
for_each_cached_btree(b, c, tbl, i, pos)
if (btree_node_dirty(b) && (!b->written || b->level)) {
rcu_read_unlock();
six_lock_read(&b->lock);
bch2_btree_node_write(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;
if (btree_node_may_write(b)) {
rcu_read_unlock();
six_lock_read(&b->lock);
bch2_btree_node_write(c, b, SIX_LOCK_read);
six_unlock_read(&b->lock);
goto again;
} else {
flush_updates = true;
}
}
rcu_read_unlock();
/*
* This is ugly, but it's needed to flush btree node writes
* without spinning...
*/
if (flush_updates) {
closure_wait_event(&c->btree_interior_update_wait,
bch2_btree_interior_updates_nr_pending(c) <
nr_pending_updates);
goto again;
}
}
return 0;
@ -2087,8 +2154,8 @@ void bch2_fs_allocator_init(struct bch_fs *c)
mutex_init(&c->write_points_hash_lock);
spin_lock_init(&c->freelist_lock);
bch2_prio_timer_init(c, READ);
bch2_prio_timer_init(c, WRITE);
bch2_bucket_clock_init(c, READ);
bch2_bucket_clock_init(c, WRITE);
/* open bucket 0 is a sentinal NULL: */
spin_lock_init(&c->open_buckets[0].lock);
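The reclaim changes read more easily with numbers plugged into the new bucket_sort_key(); a worked example with made-up values:

    /*
     * With ca->max_last_bucket_io[READ] == 1000 ticks:
     *
     *   bucket last read  100 ticks ago:  hotness = (1000 - 100) * 7 / 1000 = 6
     *   bucket last read  900 ticks ago:  hotness = (1000 - 900) * 7 / 1000 = 0
     *
     * Recently-read buckets get a larger key and are kept; cold buckets end up
     * at the front of the reclaim order. find_reclaimable_buckets_lru() now
     * also coalesces runs of adjacent buckets with the same key into a single
     * alloc_heap_entry (e.nr), and bucket_alloc_cmp() breaks ties on run
     * length, so free_inc tends to be filled with contiguous ranges.
     */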

View File

@ -9,6 +9,14 @@ struct bch_dev;
struct bch_fs;
struct bch_devs_List;
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_alloc_ops (struct bkey_ops) { \
.key_invalid = bch2_alloc_invalid, \
.val_to_text = bch2_alloc_to_text, \
}
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
@ -30,6 +38,8 @@ enum bucket_alloc_ret {
NO_DEVICES = -3, /* -EROFS */
};
long bch2_bucket_alloc_new_fs(struct bch_dev *);
int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
struct closure *);
@ -127,6 +137,4 @@ int bch2_alloc_write(struct bch_fs *);
int bch2_fs_allocator_start(struct bch_fs *);
void bch2_fs_allocator_init(struct bch_fs *);
extern const struct bkey_ops bch2_bkey_alloc_ops;
#endif /* _BCACHEFS_ALLOC_H */

View File

@ -8,7 +8,7 @@
#include "fifo.h"
/* There's two of these clocks, one for reads and one for writes: */
struct prio_clock {
struct bucket_clock {
/*
* "now" in (read/write) IO time - incremented whenever we do X amount
* of reads or writes.
@ -23,7 +23,7 @@ struct prio_clock {
* consistent.
*/
u16 hand;
u16 min_prio;
u16 max_last_io;
int rw;
@ -80,6 +80,7 @@ struct write_point_specifier {
struct alloc_heap_entry {
size_t bucket;
size_t nr;
unsigned long key;
};
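The prio_clock -> bucket_clock rename above changes what gets tracked: rather than a minimum priority, each clock tracks how stale the oldest bucket is. A short sketch of the relationship; the helper is the one added to buckets.h later in this diff:

    static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
    {
            /* ticks of (read or write) bucket-clock time since this bucket's last IO */
            return c->bucket_clock[rw].hand - g->io_time[rw];
    }

    /*
     * clock->max_last_io and ca->max_last_bucket_io[rw] cache the largest such
     * delta; bch2_inc_clock_hand() bumps them each tick, and once the maximum
     * nears U16_MAX, bch2_rescale_bucket_io_times() halves every bucket's
     * delta so the u16 arithmetic never wraps.
     */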

View File

@ -384,7 +384,7 @@ struct bch_dev {
alloc_fifo free[RESERVE_NR];
alloc_fifo free_inc;
spinlock_t freelist_lock;
unsigned nr_invalidated;
size_t nr_invalidated;
u8 open_buckets_partial[OPEN_BUCKETS_COUNT];
unsigned open_buckets_partial_nr;
@ -392,7 +392,7 @@ struct bch_dev {
size_t fifo_last_bucket;
/* last calculated minimum prio */
u16 min_prio[2];
u16 max_last_bucket_io[2];
atomic_long_t saturated_count;
size_t inc_gen_needs_gc;
@ -431,11 +431,11 @@ struct bch_dev {
*/
enum {
/* startup: */
BCH_FS_BRAND_NEW_FS,
BCH_FS_ALLOC_READ_DONE,
BCH_FS_ALLOCATOR_STARTED,
BCH_FS_INITIAL_GC_DONE,
BCH_FS_FSCK_DONE,
BCH_FS_STARTED,
/* shutdown: */
BCH_FS_EMERGENCY_RO,
@ -519,8 +519,7 @@ struct bch_fs {
u64 features;
} sb;
struct bch_sb *disk_sb;
unsigned disk_sb_order;
struct bch_sb_handle disk_sb;
unsigned short block_bits; /* ilog2(block_size) */
@ -595,7 +594,7 @@ struct bch_fs {
* those together consistently we keep track of the smallest nonzero
* priority of any bucket.
*/
struct prio_clock prio_clock[2];
struct bucket_clock bucket_clock[2];
struct io_clock io_clock[2];

View File

@ -955,8 +955,9 @@ struct bch_disk_group {
__le64 flags[2];
};
LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
struct bch_sb_field_disk_groups {
struct bch_sb_field field;

View File

@ -10,20 +10,20 @@
#include "quota.h"
#include "xattr.h"
const struct bkey_ops *bch2_bkey_ops[] = {
[BKEY_TYPE_EXTENTS] = &bch2_bkey_extent_ops,
[BKEY_TYPE_INODES] = &bch2_bkey_inode_ops,
[BKEY_TYPE_DIRENTS] = &bch2_bkey_dirent_ops,
[BKEY_TYPE_XATTRS] = &bch2_bkey_xattr_ops,
[BKEY_TYPE_ALLOC] = &bch2_bkey_alloc_ops,
[BKEY_TYPE_QUOTAS] = &bch2_bkey_quota_ops,
[BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops,
const struct bkey_ops bch2_bkey_ops[] = {
[BKEY_TYPE_EXTENTS] = bch2_bkey_extent_ops,
[BKEY_TYPE_INODES] = bch2_bkey_inode_ops,
[BKEY_TYPE_DIRENTS] = bch2_bkey_dirent_ops,
[BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops,
[BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops,
[BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops,
[BKEY_TYPE_BTREE] = bch2_bkey_btree_ops,
};
const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
const struct bkey_ops *ops = &bch2_bkey_ops[type];
switch (k.k->type) {
case KEY_TYPE_DELETED:
@ -51,7 +51,7 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
const struct bkey_ops *ops = &bch2_bkey_ops[type];
if (k.k->u64s < BKEY_U64s)
return "u64s too small";
@ -100,7 +100,7 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
{
enum bkey_type type = btree_node_type(b);
const struct bkey_ops *ops = bch2_bkey_ops[type];
const struct bkey_ops *ops = &bch2_bkey_ops[type];
const char *invalid;
BUG_ON(!k.k->u64s);
@ -141,7 +141,7 @@ int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
char *buf, size_t size, struct bkey_s_c k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
const struct bkey_ops *ops = &bch2_bkey_ops[type];
char *out = buf, *end = buf + size;
switch (k.k->type) {
@ -182,7 +182,7 @@ void bch2_bkey_swab(enum bkey_type type,
const struct bkey_format *f,
struct bkey_packed *k)
{
const struct bkey_ops *ops = bch2_bkey_ops[type];
const struct bkey_ops *ops = &bch2_bkey_ops[type];
bch2_bkey_swab_key(f, k);

View File

@ -81,6 +81,6 @@ int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
struct bkey_packed *);
extern const struct bkey_ops *bch2_bkey_ops[];
extern const struct bkey_ops bch2_bkey_ops[];
#endif /* _BCACHEFS_BKEY_METHODS_H */
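The same pattern repeats for every key type (alloc.h and dirent.h above, and presumably the remaining per-type headers in the full 59-file diff): the ops move from an extern const struct referenced by pointer to a compound-literal macro, so the dispatch table can hold them by value:

    /* per-type header, e.g. alloc.h above: */
    #define bch2_bkey_alloc_ops (struct bkey_ops) {          \
            .key_invalid    = bch2_alloc_invalid,            \
            .val_to_text    = bch2_alloc_to_text,            \
    }

    /* bkey_methods.c: the table now holds structs, not pointers */
    const struct bkey_ops bch2_bkey_ops[] = {
            [BKEY_TYPE_ALLOC]       = bch2_bkey_alloc_ops,
            /* ... */
    };

    /* lookups switch from bch2_bkey_ops[type] to &bch2_bkey_ops[type] */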

View File

@ -18,6 +18,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
#include <linux/slab.h>
@ -317,7 +318,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
unsigned i;
u64 b;
lockdep_assert_held(&c->sb_lock);
if (c)
lockdep_assert_held(&c->sb_lock);
for (i = 0; i < layout->nr_superblocks; i++) {
u64 offset = le64_to_cpu(layout->sb_offset[i]);
@ -331,7 +333,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
BCH_DATA_SB, flags);
}
spin_lock(&c->journal.lock);
if (c)
spin_lock(&c->journal.lock);
for (i = 0; i < ca->journal.nr; i++) {
b = ca->journal.buckets[i];
@ -340,7 +343,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
gc_phase(GC_PHASE_SB), flags);
}
spin_unlock(&c->journal.lock);
if (c)
spin_unlock(&c->journal.lock);
}
static void bch2_mark_superblocks(struct bch_fs *c)
@ -1034,8 +1038,8 @@ static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
int ret;
mutex_lock(&c->sb_lock);
if (!bch2_sb_get_replicas(c->disk_sb)) {
if (BCH_SB_INITIALIZED(c->disk_sb))
if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
if (BCH_SB_INITIALIZED(c->disk_sb.sb))
bch_info(c, "building replicas info");
set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
}

View File

@ -1290,16 +1290,19 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
struct bkey_s_c k;
k = bch2_btree_iter_peek_slot(iter);
if (btree_iter_err(k))
return k;
}
iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
/*
* XXX: when we just need to relock we should be able to avoid
* calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
* for that to work
*/
btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
return bch2_btree_iter_peek_slot(iter);
}
if (!bkey_deleted(&iter->k))
__btree_iter_advance(&iter->l[0]);
@ -1318,6 +1321,8 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
iter->c = c;
iter->pos = pos;
bkey_init(&iter->k);
iter->k.p = pos;
iter->flags = flags;
iter->uptodate = BTREE_ITER_NEED_TRAVERSE;
iter->btree_id = btree_id;
@ -1330,6 +1335,10 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
iter->l[iter->level].b = BTREE_ITER_NOT_END;
iter->next = iter;
if (unlikely((flags & BTREE_ITER_IS_EXTENTS) &&
!bkey_cmp(pos, POS_MAX)))
iter->uptodate = BTREE_ITER_END;
prefetch(c->btree_roots[btree_id].b);
}

View File

@ -231,6 +231,20 @@ static inline int btree_iter_cmp(const struct btree_iter *l,
return __btree_iter_cmp(l->btree_id, l->pos, r);
}
/*
* Unlocks before scheduling
* Note: does not revalidate iterator
*/
static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
{
if (need_resched()) {
bch2_btree_iter_unlock(iter);
schedule();
} else if (race_fault()) {
bch2_btree_iter_unlock(iter);
}
}
#define __for_each_btree_node(_iter, _c, _btree_id, _start, \
_locks_want, _depth, _flags, _b) \
for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \
@ -253,6 +267,8 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
unsigned flags)
{
bch2_btree_iter_cond_resched(iter);
return flags & BTREE_ITER_SLOTS
? bch2_btree_iter_next_slot(iter)
: bch2_btree_iter_next(iter);
@ -275,18 +291,4 @@ static inline int btree_iter_err(struct bkey_s_c k)
return PTR_ERR_OR_ZERO(k.k);
}
/*
* Unlocks before scheduling
* Note: does not revalidate iterator
*/
static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
{
if (need_resched()) {
bch2_btree_iter_unlock(iter);
schedule();
} else if (race_fault()) {
bch2_btree_iter_unlock(iter);
}
}
#endif /* _BCACHEFS_BTREE_ITER_H */

View File

@ -299,7 +299,7 @@ static inline enum bkey_type btree_node_type(struct btree *b)
static inline const struct bkey_ops *btree_node_ops(struct btree *b)
{
return bch2_bkey_ops[btree_node_type(b)];
return &bch2_bkey_ops[btree_node_type(b)];
}
static inline bool btree_node_has_ptrs(struct btree *b)

View File

@ -13,6 +13,7 @@
#include "extents.h"
#include "journal.h"
#include "keylist.h"
#include "replicas.h"
#include "super-io.h"
#include <linux/random.h>
@ -2116,3 +2117,16 @@ ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
return out - buf;
}
size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
{
size_t ret = 0;
struct list_head *i;
mutex_lock(&c->btree_interior_update_lock);
list_for_each(i, &c->btree_interior_update_list)
ret++;
mutex_unlock(&c->btree_interior_update_lock);
return ret;
}
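This counter exists so the allocator-start path (in the alloc.c hunks above) can flush btree nodes that aren't yet writable without spinning; paraphrasing that hunk:

    size_t nr_pending = bch2_btree_interior_updates_nr_pending(c);

    /* write out whichever dirty nodes btree_node_may_write() allows ... */

    /* ... then, if some were blocked on interior updates, wait for progress: */
    closure_wait_event(&c->btree_interior_update_wait,
                       bch2_btree_interior_updates_nr_pending(c) < nr_pending);
    /* and rescan the btree node cache (the "goto again" above) */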

View File

@ -343,4 +343,6 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans,
ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */

View File

@ -443,8 +443,20 @@ split:
* potentially blocks the allocator:
*/
ret = bch2_btree_split_leaf(c, split, trans->flags);
/*
* This can happen when we insert part of an extent - with an update
* with multiple keys, we don't want to redo the entire update - that's
* just too confusing:
*/
if (!ret &&
(trans->flags & BTREE_INSERT_ATOMIC) &&
trans->did_work)
ret = -EINTR;
if (ret)
goto err;
/*
* if the split didn't have to drop locks the insert will still be
* atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()

View File

@ -309,7 +309,7 @@ static bool bucket_became_unavailable(struct bch_fs *c,
{
return is_available_bucket(old) &&
!is_available_bucket(new) &&
c && c->gc_pos.phase == GC_PHASE_DONE;
(!c || c->gc_pos.phase == GC_PHASE_DONE);
}
void bch2_fs_usage_apply(struct bch_fs *c,
@ -351,12 +351,16 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
{
struct bch_dev_usage *dev_usage;
lockdep_assert_held(&c->usage_lock);
if (c)
lockdep_assert_held(&c->usage_lock);
bch2_fs_inconsistent_on(old.data_type && new.data_type &&
old.data_type != new.data_type, c,
if (old.data_type && new.data_type &&
old.data_type != new.data_type) {
BUG_ON(!c);
bch2_fs_inconsistent(c,
"different types of data in same bucket: %u, %u",
old.data_type, new.data_type);
}
dev_usage = this_cpu_ptr(ca->usage_percpu);
@ -466,21 +470,29 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(!type);
lg_local_lock(&c->usage_lock);
g = bucket(ca, b);
if (likely(c)) {
lg_local_lock(&c->usage_lock);
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos)) {
lg_local_unlock(&c->usage_lock);
return;
if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
gc_will_visit(c, pos)) {
lg_local_unlock(&c->usage_lock);
return;
}
}
preempt_disable();
g = bucket(ca, b);
old = bucket_data_cmpxchg(c, ca, g, new, ({
saturated_add(ca, new.dirty_sectors, sectors,
GC_MAX_SECTORS_USED);
new.data_type = type;
}));
lg_local_unlock(&c->usage_lock);
preempt_enable();
if (likely(c))
lg_local_unlock(&c->usage_lock);
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
bucket_became_unavailable(c, old, new));
@ -859,9 +871,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
bch2_copygc_stop(ca);
down_write(&c->gc_lock);
down_write(&ca->bucket_lock);
lg_global_lock(&c->usage_lock);
if (resize) {
down_write(&c->gc_lock);
down_write(&ca->bucket_lock);
lg_global_lock(&c->usage_lock);
}
old_buckets = bucket_array(ca);
@ -885,7 +899,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
swap(ca->oldest_gens, oldest_gens);
swap(ca->buckets_dirty, buckets_dirty);
lg_global_unlock(&c->usage_lock);
if (resize)
lg_global_unlock(&c->usage_lock);
spin_lock(&c->freelist_lock);
for (i = 0; i < RESERVE_NR; i++) {
@ -904,8 +919,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
nbuckets = ca->mi.nbuckets;
up_write(&ca->bucket_lock);
up_write(&c->gc_lock);
if (resize) {
up_write(&ca->bucket_lock);
up_write(&c->gc_lock);
}
if (start_copygc &&
bch2_copygc_start(c, ca))

View File

@ -31,6 +31,7 @@
static inline struct bucket_array *bucket_array(struct bch_dev *ca)
{
return rcu_dereference_check(ca->buckets,
!ca->fs ||
lockdep_is_held(&ca->fs->usage_lock) ||
lockdep_is_held(&ca->fs->gc_lock) ||
lockdep_is_held(&ca->bucket_lock));
@ -47,7 +48,12 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
size_t b, int rw)
{
bucket(ca, b)->prio[rw] = c->prio_clock[rw].hand;
bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand;
}
static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
{
return c->bucket_clock[rw].hand - g->io_time[rw];
}
/*

View File

@ -31,12 +31,12 @@ struct bucket_mark {
};
struct bucket {
u16 prio[2];
union {
struct bucket_mark _mark;
const struct bucket_mark mark;
};
u16 io_time[2];
};
struct bucket_array {
@ -85,8 +85,9 @@ struct disk_reservation {
};
struct copygc_heap_entry {
u8 gen;
u32 sectors;
u64 offset;
struct bucket_mark mark;
};
typedef HEAP(struct copygc_heap_entry) copygc_heap;

View File

@ -372,6 +372,9 @@ static long bch2_ioctl_usage(struct bch_fs *c,
unsigned i, j;
int ret;
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EINVAL;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT;
@ -460,7 +463,7 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
sb = ca->disk_sb.sb;
} else {
sb = c->disk_sb;
sb = c->disk_sb.sb;
}
if (vstruct_bytes(sb) > arg.size) {
@ -535,13 +538,22 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/* ioctls that do require admin cap: */
switch (cmd) {
case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start);
case BCH_IOCTL_STOP:
return bch2_ioctl_stop(c);
case BCH_IOCTL_READ_SUPER:
BCH_IOCTL(read_super, struct bch_ioctl_read_super);
case BCH_IOCTL_DISK_GET_IDX:
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
}
if (!test_bit(BCH_FS_STARTED, &c->flags))
return -EINVAL;
/* ioctls that do require admin cap: */
switch (cmd) {
case BCH_IOCTL_DISK_ADD:
BCH_IOCTL(disk_add, struct bch_ioctl_disk);
case BCH_IOCTL_DISK_REMOVE:
@ -554,10 +566,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
case BCH_IOCTL_DATA:
BCH_IOCTL(data, struct bch_ioctl_data);
case BCH_IOCTL_READ_SUPER:
BCH_IOCTL(read_super, struct bch_ioctl_read_super);
case BCH_IOCTL_DISK_GET_IDX:
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
case BCH_IOCTL_DISK_RESIZE:
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);

View File

@ -569,7 +569,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
if (!bch2_key_is_encrypted(&sb_key))
goto out;
ret = bch2_request_key(c->disk_sb, &user_key);
ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
bch_err(c, "error requesting encryption key: %i", ret);
goto err;
@ -623,7 +623,7 @@ int bch2_disable_encryption(struct bch_fs *c)
mutex_lock(&c->sb_lock);
crypt = bch2_sb_get_crypt(c->disk_sb);
crypt = bch2_sb_get_crypt(c->disk_sb.sb);
if (!crypt)
goto out;
@ -639,7 +639,7 @@ int bch2_disable_encryption(struct bch_fs *c)
crypt->key.magic = BCH_KEY_MAGIC;
crypt->key.key = key;
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 0);
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
bch2_write_super(c);
out:
mutex_unlock(&c->sb_lock);
@ -657,7 +657,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
mutex_lock(&c->sb_lock);
/* Do we already have an encryption key? */
if (bch2_sb_get_crypt(c->disk_sb))
if (bch2_sb_get_crypt(c->disk_sb.sb))
goto err;
ret = bch2_alloc_ciphers(c);
@ -668,7 +668,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
get_random_bytes(&key.key, sizeof(key.key));
if (keyed) {
ret = bch2_request_key(c->disk_sb, &user_key);
ret = bch2_request_key(c->disk_sb.sb, &user_key);
if (ret) {
bch_err(c, "error requesting encryption key: %i", ret);
goto err;
@ -685,7 +685,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
if (ret)
goto err;
crypt = bch2_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64));
crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
if (!crypt) {
ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
goto err;
@ -694,7 +694,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
crypt->key = key;
/* write superblock */
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1);
SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
@ -728,7 +728,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
goto out;
}
crypt = bch2_sb_get_crypt(c->disk_sb);
crypt = bch2_sb_get_crypt(c->disk_sb.sb);
if (!crypt)
goto out;

View File

@ -117,6 +117,7 @@ static const unsigned bch_crc_bytes[] = {
[BCH_CSUM_CHACHA20_POLY1305_128] = 16,
};
/* returns true if not equal */
static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
{
/*

View File

@ -3,7 +3,7 @@
#include "util.h"
#define NR_IO_TIMERS 8
#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3)
/*
* Clocks/timers in units of sectors of IO:

View File

@ -500,7 +500,7 @@ int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
return ret;
}
c->disk_sb->features[0] |= cpu_to_le64(f);
c->disk_sb.sb->features[0] |= cpu_to_le64(f);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);

View File

@ -212,17 +212,20 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
if (!i->size)
return i->ret;
for_each_btree_key(&iter, i->c, i->id, i->from,
BTREE_ITER_PREFETCH, k) {
i->from = iter.pos;
bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
k = bch2_btree_iter_peek(&iter);
while (k.k && !(err = btree_iter_err(k))) {
bch2_bkey_val_to_text(i->c, bkey_type(0, i->id),
i->buf, sizeof(i->buf), k);
i->buf, sizeof(i->buf), k);
i->bytes = strlen(i->buf);
BUG_ON(i->bytes >= PAGE_SIZE);
i->buf[i->bytes] = '\n';
i->bytes++;
k = bch2_btree_iter_next(&iter);
i->from = iter.pos;
err = flush_buf(i);
if (err)
break;
@ -230,7 +233,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
if (!i->size)
break;
}
err = bch2_btree_iter_unlock(&iter) ?: err;
bch2_btree_iter_unlock(&iter);
return err < 0 ? err : i->ret;
}

View File

@ -79,8 +79,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
.cmp_bkey = dirent_cmp_bkey,
};
static const char *bch2_dirent_invalid(const struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_dirent d;
unsigned len;
@ -116,8 +115,8 @@ static const char *bch2_dirent_invalid(const struct bch_fs *c,
}
}
static void bch2_dirent_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_dirent_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
struct bkey_s_c_dirent d;
size_t n = 0;
@ -136,11 +135,6 @@ static void bch2_dirent_to_text(struct bch_fs *c, char *buf,
}
}
const struct bkey_ops bch2_bkey_dirent_ops = {
.key_invalid = bch2_dirent_invalid,
.val_to_text = bch2_dirent_to_text,
};
static struct bkey_i_dirent *dirent_create_key(u8 type,
const struct qstr *name, u64 dst)
{

View File

@ -4,7 +4,14 @@
#include "str_hash.h"
extern const struct bch_hash_desc bch2_dirent_hash_desc;
extern const struct bkey_ops bch2_bkey_dirent_ops;
const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_dirent_ops (struct bkey_ops) { \
.key_invalid = bch2_dirent_invalid, \
.val_to_text = bch2_dirent_to_text, \
}
struct qstr;
struct file;

libbcachefs/disk_groups.c (new file, 462 lines added)
View File

@ -0,0 +1,462 @@
#include "bcachefs.h"
#include "disk_groups.h"
#include "super-io.h"
#include <linux/sort.h>
static int group_cmp(const void *_l, const void *_r)
{
const struct bch_disk_group *l = _l;
const struct bch_disk_group *r = _r;
return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
(BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
(BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
strncmp(l->label, r->label, sizeof(l->label));
}
const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
struct bch_disk_group *g, *sorted = NULL;
struct bch_sb_field_members *mi;
struct bch_member *m;
unsigned i, nr_groups, len;
const char *err = NULL;
mi = bch2_sb_get_members(sb);
groups = bch2_sb_get_disk_groups(sb);
nr_groups = disk_groups_nr(groups);
for (m = mi->members;
m < mi->members + sb->nr_devices;
m++) {
unsigned g;
if (!BCH_MEMBER_GROUP(m))
continue;
g = BCH_MEMBER_GROUP(m) - 1;
if (g >= nr_groups ||
BCH_GROUP_DELETED(&groups->entries[g]))
return "disk has invalid group";
}
if (!nr_groups)
return NULL;
for (g = groups->entries;
g < groups->entries + nr_groups;
g++) {
if (BCH_GROUP_DELETED(g))
continue;
len = strnlen(g->label, sizeof(g->label));
if (!len) {
err = "group with empty label";
goto err;
}
}
sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
if (!sorted)
return "cannot allocate memory";
memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
for (i = 0; i + 1 < nr_groups; i++)
if (!BCH_GROUP_DELETED(sorted + i) &&
!group_cmp(sorted + i, sorted + i + 1)) {
err = "duplicate groups";
goto err;
}
err = NULL;
err:
kfree(sorted);
return err;
}
static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size,
struct bch_sb *sb,
struct bch_sb_field *f)
{
char *out = buf, *end = buf + size;
struct bch_sb_field_disk_groups *groups =
field_to_type(f, disk_groups);
struct bch_disk_group *g;
unsigned nr_groups = disk_groups_nr(groups);
for (g = groups->entries;
g < groups->entries + nr_groups;
g++) {
if (g != groups->entries)
out += scnprintf(out, end - out, " ");
if (BCH_GROUP_DELETED(g))
out += scnprintf(out, end - out, "[deleted]");
else
out += scnprintf(out, end - out,
"[parent %llu name %s]",
BCH_GROUP_PARENT(g),
g->label);
}
return out - buf;
}
const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
.validate = bch2_sb_disk_groups_validate,
.to_text = bch2_sb_disk_groups_to_text
};
int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
{
struct bch_sb_field_members *mi;
struct bch_sb_field_disk_groups *groups;
struct bch_disk_groups_cpu *cpu_g, *old_g;
unsigned i, g, nr_groups;
lockdep_assert_held(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb.sb);
groups = bch2_sb_get_disk_groups(c->disk_sb.sb);
nr_groups = disk_groups_nr(groups);
if (!groups)
return 0;
cpu_g = kzalloc(sizeof(*cpu_g) +
sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
if (!cpu_g)
return -ENOMEM;
cpu_g->nr = nr_groups;
for (i = 0; i < nr_groups; i++) {
struct bch_disk_group *src = &groups->entries[i];
struct bch_disk_group_cpu *dst = &cpu_g->entries[i];
dst->deleted = BCH_GROUP_DELETED(src);
dst->parent = BCH_GROUP_PARENT(src);
}
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
struct bch_member *m = mi->members + i;
struct bch_disk_group_cpu *dst =
&cpu_g->entries[BCH_MEMBER_GROUP(m)];
if (!bch2_member_exists(m))
continue;
g = BCH_MEMBER_GROUP(m);
while (g) {
dst = &cpu_g->entries[g - 1];
__set_bit(i, dst->devs.d);
g = dst->parent;
}
}
old_g = c->disk_groups;
rcu_assign_pointer(c->disk_groups, cpu_g);
if (old_g)
kfree_rcu(old_g, rcu);
return 0;
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
{
struct target t = target_decode(target);
switch (t.type) {
case TARGET_DEV: {
struct bch_dev *ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
return ca ? &ca->self : NULL;
}
case TARGET_GROUP: {
struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
return t.group < g->nr && !g->entries[t.group].deleted
? &g->entries[t.group].devs
: NULL;
}
default:
BUG();
}
}
static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
unsigned parent,
const char *name, unsigned namelen)
{
unsigned i, nr_groups = disk_groups_nr(groups);
if (!namelen || namelen > BCH_SB_LABEL_SIZE)
return -EINVAL;
for (i = 0; i < nr_groups; i++) {
struct bch_disk_group *g = groups->entries + i;
if (BCH_GROUP_DELETED(g))
continue;
if (!BCH_GROUP_DELETED(g) &&
BCH_GROUP_PARENT(g) == parent &&
strnlen(g->label, sizeof(g->label)) == namelen &&
!memcmp(name, g->label, namelen))
return i;
}
return -1;
}
static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
const char *name, unsigned namelen)
{
struct bch_sb_field_disk_groups *groups =
bch2_sb_get_disk_groups(sb->sb);
unsigned i, nr_groups = disk_groups_nr(groups);
struct bch_disk_group *g;
if (!namelen || namelen > BCH_SB_LABEL_SIZE)
return -EINVAL;
for (i = 0;
i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
i++)
;
if (i == nr_groups) {
unsigned u64s =
(sizeof(struct bch_sb_field_disk_groups) +
sizeof(struct bch_disk_group) * (nr_groups + 1)) /
sizeof(u64);
groups = bch2_sb_resize_disk_groups(sb, u64s);
if (!groups)
return -ENOSPC;
nr_groups = disk_groups_nr(groups);
}
BUG_ON(i >= nr_groups);
g = &groups->entries[i];
memcpy(g->label, name, namelen);
if (namelen < sizeof(g->label))
g->label[namelen] = '\0';
SET_BCH_GROUP_DELETED(g, 0);
SET_BCH_GROUP_PARENT(g, parent);
SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
return i;
}
int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
{
struct bch_sb_field_disk_groups *groups =
bch2_sb_get_disk_groups(sb->sb);
int v = -1;
do {
const char *next = strchrnul(name, '.');
unsigned len = next - name;
if (*next == '.')
next++;
v = __bch2_disk_group_find(groups, v + 1, name, len);
name = next;
} while (*name && v >= 0);
return v;
}
int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
{
struct bch_sb_field_disk_groups *groups;
unsigned parent = 0;
int v = -1;
do {
const char *next = strchrnul(name, '.');
unsigned len = next - name;
if (*next == '.')
next++;
groups = bch2_sb_get_disk_groups(sb->sb);
v = __bch2_disk_group_find(groups, parent, name, len);
if (v < 0)
v = __bch2_disk_group_add(sb, parent, name, len);
if (v < 0)
return v;
parent = v + 1;
name = next;
} while (*name && v >= 0);
return v;
}
int bch2_disk_path_print(struct bch_sb_handle *sb,
char *buf, size_t len, unsigned v)
{
char *out = buf, *end = out + len;
struct bch_sb_field_disk_groups *groups =
bch2_sb_get_disk_groups(sb->sb);
struct bch_disk_group *g;
unsigned nr = 0;
u16 path[32];
while (1) {
if (nr == ARRAY_SIZE(path))
goto inval;
if (v >= disk_groups_nr(groups))
goto inval;
g = groups->entries + v;
if (BCH_GROUP_DELETED(g))
goto inval;
path[nr++] = v;
if (!BCH_GROUP_PARENT(g))
break;
v = BCH_GROUP_PARENT(g) - 1;
}
while (nr) {
unsigned b = 0;
v = path[--nr];
g = groups->entries + v;
if (end != out)
b = min_t(size_t, end - out,
strnlen(g->label, sizeof(g->label)));
memcpy(out, g->label, b);
if (b < end - out)
out[b] = '\0';
out += b;
if (nr)
out += scnprintf(out, end - out, ".");
}
return out - buf;
inval:
return scnprintf(buf, len, "invalid group %u", v);
}
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
{
struct bch_member *mi;
int v = -1;
mutex_lock(&c->sb_lock);
if (!strlen(name) || !strcmp(name, "none"))
goto write_sb;
v = bch2_disk_path_find_or_create(&c->disk_sb, name);
if (v < 0) {
mutex_unlock(&c->sb_lock);
return v;
}
write_sb:
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
SET_BCH_MEMBER_GROUP(mi, v + 1);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
{
struct bch_dev *ca;
int g;
if (!strlen(buf) || !strcmp(buf, "none")) {
*v = 0;
return 0;
}
/* Is it a device? */
ca = bch2_dev_lookup(c, buf);
if (!IS_ERR(ca)) {
*v = dev_to_target(ca->dev_idx);
percpu_ref_put(&ca->ref);
return 0;
}
mutex_lock(&c->sb_lock);
g = bch2_disk_path_find(&c->disk_sb, buf);
mutex_unlock(&c->sb_lock);
if (g >= 0) {
*v = group_to_target(g);
return 0;
}
return -EINVAL;
}
int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v)
{
struct target t = target_decode(v);
int ret;
switch (t.type) {
case TARGET_NULL:
return scnprintf(buf, len, "none");
case TARGET_DEV: {
struct bch_dev *ca;
rcu_read_lock();
ca = t.dev < c->sb.nr_devices
? rcu_dereference(c->devs[t.dev])
: NULL;
if (ca && percpu_ref_tryget(&ca->io_ref)) {
char b[BDEVNAME_SIZE];
ret = scnprintf(buf, len, "/dev/%s",
bdevname(ca->disk_sb.bdev, b));
percpu_ref_put(&ca->io_ref);
} else if (ca) {
ret = scnprintf(buf, len, "offline device %u", t.dev);
} else {
ret = scnprintf(buf, len, "invalid device %u", t.dev);
}
rcu_read_unlock();
break;
}
case TARGET_GROUP:
mutex_lock(&c->sb_lock);
ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group);
mutex_unlock(&c->sb_lock);
break;
default:
BUG();
}
return ret;
}
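/*
 * Editor's note: bch2_disk_path_find() above resolves dotted labels one
 * component at a time with strchrnul().  A minimal sketch of that split,
 * assuming a GNU libc environment for strchrnul(); the label "ssd.fast"
 * is hypothetical and the snippet is not part of the commit:
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *name = "ssd.fast";

	while (*name) {
		const char *next = strchrnul(name, '.');
		unsigned len = next - name;

		printf("component: %.*s\n", (int) len, name);	/* "ssd", then "fast" */

		name = *next == '.' ? next + 1 : next;
	}
	return 0;
}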

libbcachefs/disk_groups.h (new file, 99 lines)
View File

@ -0,0 +1,99 @@
#ifndef _BCACHEFS_DISK_GROUPS_H
#define _BCACHEFS_DISK_GROUPS_H
extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
{
return groups
? (vstruct_end(&groups->field) -
(void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
: 0;
}
struct target {
enum {
TARGET_NULL,
TARGET_DEV,
TARGET_GROUP,
} type;
union {
unsigned dev;
unsigned group;
};
};
#define TARGET_DEV_START 1
#define TARGET_GROUP_START (256 + TARGET_DEV_START)
static inline u16 dev_to_target(unsigned dev)
{
return TARGET_DEV_START + dev;
}
static inline u16 group_to_target(unsigned group)
{
return TARGET_GROUP_START + group;
}
static inline struct target target_decode(unsigned target)
{
if (target >= TARGET_GROUP_START)
return (struct target) {
.type = TARGET_GROUP,
.group = target - TARGET_GROUP_START
};
if (target >= TARGET_DEV_START)
return (struct target) {
.type = TARGET_DEV,
.group = target - TARGET_DEV_START
};
return (struct target) { .type = TARGET_NULL };
}
static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
{
struct target t = target_decode(target);
switch (t.type) {
case TARGET_NULL:
return false;
case TARGET_DEV:
return ca->dev_idx == t.dev;
case TARGET_GROUP:
return ca->mi.group && ca->mi.group - 1 == t.group;
default:
BUG();
}
}
static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
{
bool ret;
rcu_read_lock();
ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
rcu_read_unlock();
return ret;
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
int bch2_disk_path_find(struct bch_sb_handle *, const char *);
int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);
int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
const char *bch2_sb_validate_disk_groups(struct bch_sb *,
struct bch_sb_field *);
#endif /* _BCACHEFS_DISK_GROUPS_H */
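/*
 * Editor's note: a standalone sketch of the target number space defined in
 * the header above -- 0 means no target, device targets occupy
 * [TARGET_DEV_START, TARGET_GROUP_START) and group targets start at
 * TARGET_GROUP_START.  The two constants are copied from the header; the
 * rest is illustrative and not part of the commit:
 */
#include <assert.h>
#include <stdio.h>

#define TARGET_DEV_START	1
#define TARGET_GROUP_START	(256 + TARGET_DEV_START)

int main(void)
{
	unsigned dev_target   = TARGET_DEV_START + 3;	/* device index 3 */
	unsigned group_target = TARGET_GROUP_START + 0;	/* disk group index 0 */

	/* decode the same way target_decode() does: */
	assert(dev_target >= TARGET_DEV_START && dev_target < TARGET_GROUP_START);
	assert(group_target >= TARGET_GROUP_START);

	printf("target %u -> device %u\n", dev_target, dev_target - TARGET_DEV_START);
	printf("target %u -> group %u\n", group_target, group_target - TARGET_GROUP_START);
	return 0;
}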

View File

@ -14,10 +14,12 @@
#include "checksum.h"
#include "debug.h"
#include "dirent.h"
#include "disk_groups.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "journal.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "util.h"
@ -25,9 +27,6 @@
#include <trace/events/bcachefs.h>
static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
struct bkey_i *, struct bkey_i *);
static void sort_key_next(struct btree_node_iter_large *iter,
struct btree *b,
struct btree_node_iter_set *i)
@ -160,9 +159,13 @@ bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned targ
{
const struct bch_extent_ptr *ptr;
extent_for_each_ptr(e, ptr)
if (dev_in_target(c->devs[ptr->dev], target))
extent_for_each_ptr(e, ptr) {
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (dev_in_target(ca, target) &&
(!ptr->cached || !ptr_stale(ca, ptr)))
return ptr;
}
return NULL;
}
@ -356,11 +359,25 @@ restart_narrow_pointers:
return true;
}
/* returns true if not equal */
static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
struct bch_extent_crc_unpacked r)
{
return (l.csum_type != r.csum_type ||
l.compression_type != r.compression_type ||
l.compressed_size != r.compressed_size ||
l.uncompressed_size != r.uncompressed_size ||
l.offset != r.offset ||
l.live_size != r.live_size ||
l.nonce != r.nonce ||
bch2_crc_cmp(l.csum, r.csum));
}
void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
{
union bch_extent_entry *entry = e.v->start;
union bch_extent_crc *crc, *prev = NULL;
struct bch_extent_crc_unpacked u, prev_u;
struct bch_extent_crc_unpacked u, prev_u = { 0 };
while (entry != extent_entry_last(e)) {
union bch_extent_entry *next = extent_entry_next(entry);
@ -382,7 +399,7 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
goto drop;
}
if (prev && !memcmp(&u, &prev_u, sizeof(u))) {
if (prev && !bch2_crc_unpacked_cmp(u, prev_u)) {
/* identical to previous crc entry: */
goto drop;
}
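/*
 * Editor's note: the hunk above replaces a raw memcmp() of two unpacked CRC
 * structs with a field-by-field comparison and zero-initializes prev_u;
 * memcmp() can report a difference on padding bytes even when every field
 * matches.  A minimal illustration -- struct layout invented, not from the
 * commit; typical ABIs insert padding after the char member:
 */
#include <stdio.h>
#include <string.h>

struct padded {
	char	a;	/* compilers typically insert padding after this */
	int	b;
};

int main(void)
{
	struct padded x, y;

	memset(&x, 0xff, sizeof(x));	/* dirty x's padding bytes */
	memset(&y, 0x00, sizeof(y));

	x.a = y.a = 1;
	x.b = y.b = 2;

	printf("fields equal, memcmp says: %s\n",
	       memcmp(&x, &y, sizeof(x)) ? "different" : "equal");
	return 0;
}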
@ -439,13 +456,12 @@ static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
bch2_extent_drop_redundant_crcs(e);
}
static bool bch2_ptr_normalize(struct bch_fs *c, struct btree *bk,
struct bkey_s k)
bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k)
{
return bch2_extent_normalize(c, k);
}
static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
{
switch (k->type) {
case BCH_EXTENT:
@ -628,8 +644,7 @@ use:
/* Btree ptrs */
static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (bkey_extent_is_cached(k.k))
return "cached";
@ -671,8 +686,8 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
}
}
static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
struct bkey_s_c k)
void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
struct bkey_s_c k)
{
struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
const struct bch_extent_ptr *ptr;
@ -727,8 +742,8 @@ err:
mark.gen, (unsigned) mark.counter);
}
static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
char *out = buf, *end = buf + size;
const char *invalid;
@ -756,13 +771,6 @@ bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
return pick;
}
const struct bkey_ops bch2_bkey_btree_ops = {
.key_invalid = bch2_btree_ptr_invalid,
.key_debugcheck = btree_ptr_debugcheck,
.val_to_text = bch2_btree_ptr_to_text,
.swab = bch2_ptr_swab,
};
/* Extents */
static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
@ -1436,7 +1444,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
}
static enum btree_insert_ret
bch2_delete_fixup_extent(struct extent_insert_state *s)
__bch2_delete_fixup_extent(struct extent_insert_state *s)
{
struct bch_fs *c = s->trans->c;
struct btree_iter *iter = s->insert->iter;
@ -1450,8 +1458,7 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
s->whiteout = *insert;
s->do_journal = false;
s->whiteout = *insert;
while (bkey_cmp(s->committed, insert->k.p) < 0 &&
(ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
@ -1474,12 +1481,12 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
overlap = bch2_extent_overlap(&insert->k, k.k);
ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
if (ret != BTREE_INSERT_OK)
goto stop;
if (ret)
break;
ret = extent_insert_advance_pos(s, k.s_c);
if (ret)
goto stop;
break;
s->do_journal = true;
@ -1520,25 +1527,65 @@ next:
bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
}
if (ret == BTREE_INSERT_OK &&
bkey_cmp(s->committed, insert->k.p) < 0)
ret = extent_insert_advance_pos(s, bkey_s_c_null);
stop:
extent_insert_committed(s);
return ret;
}
bch2_fs_usage_apply(c, &s->stats, s->trans->disk_res,
gc_pos_btree_node(b));
static enum btree_insert_ret
__bch2_insert_fixup_extent(struct extent_insert_state *s)
{
struct btree_iter *iter = s->insert->iter;
struct btree_iter_level *l = &iter->l[0];
struct btree *b = l->b;
struct btree_node_iter *node_iter = &l->iter;
struct bkey_packed *_k;
struct bkey unpacked;
struct bkey_i *insert = s->insert->k;
enum btree_insert_ret ret = BTREE_INSERT_OK;
EBUG_ON(bkey_cmp(iter->pos, s->committed));
EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
!!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
while (bkey_cmp(s->committed, insert->k.p) < 0 &&
(ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
(_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
struct bset_tree *t = bch2_bkey_to_bset(b, _k);
struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
enum bch_extent_overlap overlap;
bch2_cut_front(iter->pos, insert);
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
if (insert->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
ret = BTREE_INSERT_NEED_TRAVERSE;
if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
break;
EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
overlap = bch2_extent_overlap(&insert->k, k.k);
ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
if (ret)
break;
if (!k.k->size)
goto squash;
/*
* Only call advance pos & call hook for nonzero size extents:
*/
ret = extent_insert_advance_pos(s, k.s_c);
if (ret)
break;
if (k.k->size &&
(k.k->needs_whiteout || bset_written(b, bset(b, t))))
insert->k.needs_whiteout = true;
if (overlap == BCH_EXTENT_OVERLAP_ALL &&
bkey_whiteout(k.k) &&
k.k->needs_whiteout) {
unreserve_whiteout(b, t, _k);
_k->needs_whiteout = false;
}
squash:
ret = extent_squash(s, insert, t, _k, k, overlap);
if (ret != BTREE_INSERT_OK)
break;
}
return ret;
}
@ -1590,9 +1637,6 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
struct btree_iter *iter = insert->iter;
struct btree_iter_level *l = &iter->l[0];
struct btree *b = l->b;
struct btree_node_iter *node_iter = &l->iter;
struct bkey_packed *_k;
struct bkey unpacked;
enum btree_insert_ret ret = BTREE_INSERT_OK;
struct extent_insert_state s = {
@ -1605,9 +1649,6 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
EBUG_ON(iter->level);
EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
if (s.deleting)
return bch2_delete_fixup_extent(&s);
/*
* As we process overlapping extents, we advance @iter->pos both to
* signal to our caller (btree_insert_key()) how much of @insert->k has
@ -1616,67 +1657,32 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
*/
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
if (!s.deleting &&
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
bch2_add_sectors(&s, bkey_i_to_s_c(insert->k),
bkey_start_offset(&insert->k->k),
insert->k->k.size);
while (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
(ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK &&
(_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
struct bset_tree *t = bch2_bkey_to_bset(b, _k);
struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
enum bch_extent_overlap overlap;
EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0)
break;
overlap = bch2_extent_overlap(&insert->k->k, k.k);
ret = extent_insert_check_split_compressed(&s, k.s_c, overlap);
if (ret != BTREE_INSERT_OK)
goto stop;
if (!k.k->size)
goto squash;
/*
* Only call advance pos & call hook for nonzero size extents:
*/
ret = extent_insert_advance_pos(&s, k.s_c);
if (ret != BTREE_INSERT_OK)
goto stop;
if (k.k->size &&
(k.k->needs_whiteout || bset_written(b, bset(b, t))))
insert->k->k.needs_whiteout = true;
if (overlap == BCH_EXTENT_OVERLAP_ALL &&
bkey_whiteout(k.k) &&
k.k->needs_whiteout) {
unreserve_whiteout(b, t, _k);
_k->needs_whiteout = false;
}
squash:
ret = extent_squash(&s, insert->k, t, _k, k, overlap);
if (ret != BTREE_INSERT_OK)
goto stop;
}
ret = !s.deleting
? __bch2_insert_fixup_extent(&s)
: __bch2_delete_fixup_extent(&s);
if (ret == BTREE_INSERT_OK &&
bkey_cmp(s.committed, insert->k->k.p) < 0)
ret = extent_insert_advance_pos(&s, bkey_s_c_null);
stop:
extent_insert_committed(&s);
if (s.deleting)
bch2_cut_front(iter->pos, insert->k);
/*
* Subtract any remaining sectors from @insert, if we bailed out early
* and didn't fully insert @insert:
*/
if (insert->k->k.size &&
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
if (!s.deleting &&
!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
insert->k->k.size)
bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
bkey_start_offset(&insert->k->k),
insert->k->k.size);
@ -1692,13 +1698,13 @@ stop:
if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
ret = BTREE_INSERT_NEED_TRAVERSE;
EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0),
"ret %u insert->k.size %u", ret, insert->k->k.size);
return ret;
}
static const char *bch2_extent_invalid(const struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
return "value too big";
@ -1865,8 +1871,7 @@ bad_ptr:
return;
}
static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
struct bkey_s_c k)
void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
{
switch (k.k->type) {
case BCH_EXTENT:
@ -1880,8 +1885,8 @@ static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
}
}
static void bch2_extent_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_extent_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
char *out = buf, *end = buf + size;
const char *invalid;
@ -1963,7 +1968,7 @@ void bch2_extent_crc_append(struct bkey_i_extent *e,
extent_for_each_crc(extent_i_to_s(e), crc, i)
;
if (!memcmp(&crc, &new, sizeof(crc)))
if (!bch2_crc_unpacked_cmp(crc, new))
return;
bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
@ -2089,9 +2094,8 @@ void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
}
}
static enum merge_result bch2_extent_merge(struct bch_fs *c,
struct btree *bk,
struct bkey_i *l, struct bkey_i *r)
enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
struct bkey_i *l, struct bkey_i *r)
{
struct bkey_s_extent el, er;
union bch_extent_entry *en_l, *en_r;
@ -2410,13 +2414,3 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
return ret;
}
const struct bkey_ops bch2_bkey_extent_ops = {
.key_invalid = bch2_extent_invalid,
.key_debugcheck = bch2_extent_debugcheck,
.val_to_text = bch2_extent_to_text,
.swab = bch2_ptr_swab,
.key_normalize = bch2_ptr_normalize,
.key_merge = bch2_extent_merge,
.is_extents = true,
};

View File

@ -15,6 +15,36 @@ struct extent_insert_hook;
struct bch_devs_mask;
union bch_extent_crc;
const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
struct bkey_s_c);
void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
#define bch2_bkey_btree_ops (struct bkey_ops) { \
.key_invalid = bch2_btree_ptr_invalid, \
.key_debugcheck = bch2_btree_ptr_debugcheck, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
}
const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s);
enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
struct bkey_i *, struct bkey_i *);
#define bch2_bkey_extent_ops (struct bkey_ops) { \
.key_invalid = bch2_extent_invalid, \
.key_debugcheck = bch2_extent_debugcheck, \
.val_to_text = bch2_extent_to_text, \
.swab = bch2_ptr_swab, \
.key_normalize = bch2_ptr_normalize, \
.key_merge = bch2_extent_merge, \
.is_extents = true, \
}
struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *,
struct btree *,
struct btree_node_iter_large *);
@ -23,9 +53,6 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
struct btree *,
struct btree_node_iter_large *);
extern const struct bkey_ops bch2_bkey_btree_ops;
extern const struct bkey_ops bch2_bkey_extent_ops;
struct extent_pick_ptr
bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
struct bch_devs_mask *avoid);

View File

@ -468,7 +468,10 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
}
BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k)));
BUG_ON(!ret != !k->k.size);
if (WARN_ONCE(!ret != !k->k.size,
"ret %i k->size %u", ret, k->k.size))
ret = k->k.size ? -EINTR : 0;
err:
if (ret == -EINTR)
continue;

View File

@ -175,8 +175,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
return 0;
}
static const char *bch2_inode_invalid(const struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
if (k.k->p.offset)
return "nonzero offset";
@ -224,8 +223,8 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
}
}
static void bch2_inode_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_inode_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
char *out = buf, *end = out + size;
struct bkey_s_c_inode inode;
@ -247,11 +246,6 @@ static void bch2_inode_to_text(struct bch_fs *c, char *buf,
}
}
const struct bkey_ops bch2_bkey_inode_ops = {
.key_invalid = bch2_inode_invalid,
.val_to_text = bch2_inode_to_text,
};
void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
struct bch_inode_unpacked *parent)

View File

@ -5,7 +5,13 @@
#include <linux/math64.h>
extern const struct bkey_ops bch2_bkey_inode_ops;
const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_inode_ops (struct bkey_ops) { \
.key_invalid = bch2_inode_invalid, \
.val_to_text = bch2_inode_to_text, \
}
struct bch_inode_unpacked {
u64 bi_inum;

View File

@ -20,6 +20,7 @@
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "tier.h"
@ -196,8 +197,6 @@ static void bch2_write_done(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
BUG_ON(!(op->flags & BCH_WRITE_DONE));
if (!op->error && (op->flags & BCH_WRITE_FLUSH))
op->error = bch2_journal_error(&op->c->journal);
@ -205,7 +204,6 @@ static void bch2_write_done(struct closure *cl)
bch2_disk_reservation_put(op->c, &op->res);
percpu_ref_put(&op->c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED);
closure_return(cl);
}
@ -232,9 +230,8 @@ int bch2_write_index_default(struct bch_write_op *op)
/**
* bch_write_index - after a write, update index to point to new data
*/
static void bch2_write_index(struct closure *cl)
static void __bch2_write_index(struct bch_write_op *op)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
struct keylist *keys = &op->insert_keys;
struct bkey_s_extent e;
@ -242,8 +239,6 @@ static void bch2_write_index(struct closure *cl)
struct bkey_i *src, *dst = keys->keys, *n, *k;
int ret;
op->flags |= BCH_WRITE_LOOPED;
for (src = keys->keys; src != keys->top; src = n) {
n = bkey_next(src);
bkey_copy(dst, src);
@ -292,9 +287,19 @@ static void bch2_write_index(struct closure *cl)
}
out:
bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
return;
err:
keys->top = keys->keys;
op->error = ret;
goto out;
}
if (!(op->flags & BCH_WRITE_DONE))
continue_at(cl, __bch2_write, op->io_wq);
static void bch2_write_index(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
__bch2_write_index(op);
if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
bch2_journal_flush_seq_async(&c->journal,
@ -304,12 +309,6 @@ out:
} else {
continue_at_nobarrier(cl, bch2_write_done, NULL);
}
return;
err:
keys->top = keys->keys;
op->error = ret;
op->flags |= BCH_WRITE_DONE;
goto out;
}
static void bch2_write_endio(struct bio *bio)
@ -730,18 +729,18 @@ static void __bch2_write(struct closure *cl)
struct bch_fs *c = op->c;
struct write_point *wp;
int ret;
again:
do {
/* +1 for possible cache device: */
if (op->open_buckets_nr + op->nr_replicas + 1 >
ARRAY_SIZE(op->open_buckets))
continue_at(cl, bch2_write_index, index_update_wq(op));
goto flush_io;
if (bch2_keylist_realloc(&op->insert_keys,
op->inline_keys,
ARRAY_SIZE(op->inline_keys),
BKEY_EXTENT_U64s_MAX))
continue_at(cl, bch2_write_index, index_update_wq(op));
goto flush_io;
wp = bch2_alloc_sectors_start(c,
op->target,
@ -760,33 +759,7 @@ static void __bch2_write(struct closure *cl)
goto err;
}
/*
* If we already have some keys, must insert them first
* before allocating another open bucket. We only hit
* this case if open_bucket_nr > 1.
*/
if (!bch2_keylist_empty(&op->insert_keys))
continue_at(cl, bch2_write_index,
index_update_wq(op));
/*
* If we've looped, we're running out of a workqueue -
* not the bch2_write() caller's context - and we don't
* want to block the workqueue:
*/
if (op->flags & BCH_WRITE_LOOPED)
continue_at(cl, __bch2_write, op->io_wq);
/*
* Otherwise, we do want to block the caller on alloc
* failure instead of letting it queue up more and more
* writes:
* XXX: this technically needs a try_to_freeze() -
* except that that's not safe because caller may have
* issued other IO... hmm..
*/
closure_sync(cl);
continue;
goto flush_io;
}
ret = bch2_write_extent(op, wp);
@ -802,28 +775,24 @@ static void __bch2_write(struct closure *cl)
goto err;
} while (ret);
op->flags |= BCH_WRITE_DONE;
continue_at(cl, bch2_write_index, index_update_wq(op));
err:
/*
* Right now we can only error here if we went RO - the
* allocation failed, but we already checked for -ENOSPC when we
* got our reservation.
*
* XXX capacity might have changed, but we don't check for that
* yet:
*/
op->error = ret;
op->flags |= BCH_WRITE_DONE;
/*
* No reason not to insert keys for whatever data was successfully
* written (especially for a cmpxchg operation that's moving data
* around)
*/
continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
? bch2_write_index
: bch2_write_done, index_update_wq(op));
flush_io:
closure_sync(cl);
if (!bch2_keylist_empty(&op->insert_keys)) {
__bch2_write_index(op);
if (op->error)
continue_at_nobarrier(cl, bch2_write_done, NULL);
}
goto again;
}
/**
@ -969,7 +938,7 @@ static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e,
if (percpu_ref_is_dying(&c->writes))
return false;
return bch2_extent_has_target(c, e, target);
return bch2_extent_has_target(c, e, target) == NULL;
}
/* Read */

View File

@ -36,8 +36,6 @@ enum bch_write_flags {
/* Internal: */
BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9),
BCH_WRITE_DONE = (1 << 10),
BCH_WRITE_LOOPED = (1 << 11),
};
static inline u64 *op_journal_seq(struct bch_write_op *op)

View File

@ -19,6 +19,7 @@
#include "io.h"
#include "keylist.h"
#include "journal.h"
#include "replicas.h"
#include "super-io.h"
#include "vstructs.h"
@ -1582,40 +1583,19 @@ err:
return ret;
}
/*
* Allocate more journal space at runtime - not currently making use if it, but
* the code works:
*/
static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
unsigned nr)
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
bool new_fs, struct closure *cl)
{
struct journal *j = &c->journal;
struct bch_fs *c = ca->fs;
struct journal_device *ja = &ca->journal;
struct bch_sb_field_journal *journal_buckets;
struct disk_reservation disk_res = { 0, 0 };
struct closure cl;
u64 *new_bucket_seq = NULL, *new_buckets = NULL;
int ret = 0;
closure_init_stack(&cl);
/* don't handle reducing nr of buckets yet: */
if (nr <= ja->nr)
return 0;
/*
* note: journal buckets aren't really counted as _sectors_ used yet, so
* we don't need the disk reservation to avoid the BUG_ON() in buckets.c
* when space used goes up without a reservation - but we do need the
* reservation to ensure we'll actually be able to allocate:
*/
if (bch2_disk_reservation_get(c, &disk_res,
bucket_to_sector(ca, nr - ja->nr), 1, 0))
return -ENOSPC;
mutex_lock(&c->sb_lock);
ret = -ENOMEM;
new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL);
new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL);
@ -1627,29 +1607,41 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
if (!journal_buckets)
goto err;
spin_lock(&j->lock);
if (c)
spin_lock(&c->journal.lock);
memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64));
memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64));
swap(new_buckets, ja->buckets);
swap(new_bucket_seq, ja->bucket_seq);
spin_unlock(&j->lock);
if (c)
spin_unlock(&c->journal.lock);
while (ja->nr < nr) {
struct open_bucket *ob;
size_t bucket;
int ob_idx;
struct open_bucket *ob = NULL;
long bucket;
ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl);
if (ob_idx < 0) {
if (!closure_wait(&c->freelist_wait, &cl))
closure_sync(&cl);
continue;
if (new_fs) {
bucket = bch2_bucket_alloc_new_fs(ca);
if (bucket < 0) {
ret = -ENOSPC;
goto err;
}
} else {
int ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl);
if (ob_idx < 0) {
ret = cl ? -EAGAIN : -ENOSPC;
goto err;
}
ob = c->open_buckets + ob_idx;
bucket = sector_to_bucket(ca, ob->ptr.offset);
}
ob = c->open_buckets + ob_idx;
bucket = sector_to_bucket(ca, ob->ptr.offset);
if (c)
spin_lock(&c->journal.lock);
spin_lock(&j->lock);
__array_insert_item(ja->buckets, ja->nr, ja->last_idx);
__array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx);
__array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx);
@ -1664,34 +1656,77 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
ja->last_idx++;
}
ja->nr++;
spin_unlock(&j->lock);
if (c)
spin_unlock(&c->journal.lock);
bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB), 0);
ca->mi.bucket_size,
gc_phase(GC_PHASE_SB),
new_fs
? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
: 0);
bch2_open_bucket_put(c, ob);
if (!new_fs)
bch2_open_bucket_put(c, ob);
}
bch2_write_super(c);
ret = 0;
err:
mutex_unlock(&c->sb_lock);
kfree(new_bucket_seq);
kfree(new_buckets);
bch2_disk_reservation_put(c, &disk_res);
if (!ret)
bch2_dev_allocator_add(c, ca);
closure_sync(&cl);
return ret;
}
int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca)
/*
* Allocate more journal space at runtime - not currently making use if it, but
* the code works:
*/
int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
unsigned nr)
{
struct journal_device *ja = &ca->journal;
struct closure cl;
unsigned current_nr;
int ret;
closure_init_stack(&cl);
do {
struct disk_reservation disk_res = { 0, 0 };
closure_sync(&cl);
mutex_lock(&c->sb_lock);
current_nr = ja->nr;
/*
* note: journal buckets aren't really counted as _sectors_ used yet, so
* we don't need the disk reservation to avoid the BUG_ON() in buckets.c
* when space used goes up without a reservation - but we do need the
* reservation to ensure we'll actually be able to allocate:
*/
if (bch2_disk_reservation_get(c, &disk_res,
bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
mutex_unlock(&c->sb_lock);
return -ENOSPC;
}
ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
bch2_disk_reservation_put(c, &disk_res);
if (ja->nr != current_nr)
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
} while (ret == -EAGAIN);
return ret;
}
int bch2_dev_journal_alloc(struct bch_dev *ca)
{
unsigned nr;
@ -1707,7 +1742,7 @@ int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca)
min(1 << 10,
(1 << 20) / ca->mi.bucket_size));
return bch2_set_nr_journal_buckets(c, ca, nr);
return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
}
/* Journalling */
@ -2320,8 +2355,8 @@ static void journal_write(struct closure *cl)
journal_write_compact(jset);
jset->read_clock = cpu_to_le16(c->prio_clock[READ].hand);
jset->write_clock = cpu_to_le16(c->prio_clock[WRITE].hand);
jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
jset->version = cpu_to_le32(BCACHE_JSET_VERSION);

View File

@ -400,7 +400,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
ssize_t bch2_journal_print_debug(struct journal *, char *);
ssize_t bch2_journal_print_pins(struct journal *, char *);
int bch2_dev_journal_alloc(struct bch_fs *, struct bch_dev *);
int bch2_dev_journal_alloc(struct bch_dev *);
void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
void bch2_fs_journal_stop(struct journal *);

View File

@ -11,6 +11,7 @@
#include "keylist.h"
#include "migrate.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,

View File

@ -6,6 +6,7 @@
#include "inode.h"
#include "io.h"
#include "move.h"
#include "replicas.h"
#include "super-io.h"
#include "keylist.h"

View File

@ -9,6 +9,7 @@
#include "btree_update.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
#include "extents.h"
#include "eytzinger.h"
#include "io.h"
@ -51,7 +52,7 @@ static inline int sectors_used_cmp(copygc_heap *heap,
struct copygc_heap_entry l,
struct copygc_heap_entry r)
{
return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
return (l.sectors > r.sectors) - (l.sectors < r.sectors);
}
static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
@ -78,7 +79,7 @@ static bool __copygc_pred(struct bch_dev *ca,
return (i >= 0 &&
ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
ptr->gen == h->data[i].mark.gen);
ptr->gen == h->data[i].gen);
}
return false;
@ -154,8 +155,9 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
continue;
e = (struct copygc_heap_entry) {
.offset = bucket_to_sector(ca, b),
.mark = m
.gen = m.gen,
.sectors = bucket_sectors_used(m),
.offset = bucket_to_sector(ca, b),
};
heap_add_or_replace(h, e, -sectors_used_cmp);
}
@ -163,11 +165,11 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
up_read(&c->gc_lock);
for (i = h->data; i < h->data + h->used; i++)
sectors_to_move += bucket_sectors_used(i->mark);
sectors_to_move += i->sectors;
while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
sectors_to_move -= bucket_sectors_used(e.mark);
sectors_to_move -= e.sectors;
}
buckets_to_move = h->used;
@ -191,7 +193,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
size_t b = sector_to_bucket(ca, i->offset);
struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
if (i->gen == m.gen && bucket_sectors_used(m)) {
sectors_not_moved += bucket_sectors_used(m);
buckets_not_moved++;
}
@ -284,7 +286,8 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
if (bch2_fs_init_fault("copygc_start"))
return -ENOMEM;
t = kthread_create(bch2_copygc_thread, ca, "bch_copygc");
t = kthread_create(bch2_copygc_thread, ca,
"bch_copygc[%s]", ca->name);
if (IS_ERR(t))
return PTR_ERR(t);

View File

@ -2,6 +2,7 @@
#include <linux/kernel.h>
#include "bcachefs.h"
#include "disk_groups.h"
#include "opts.h"
#include "super-io.h"
#include "util.h"

View File

@ -4,7 +4,22 @@
#include "quota.h"
#include "super-io.h"
static const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
static const char *bch2_sb_validate_quota(struct bch_sb *sb,
struct bch_sb_field *f)
{
struct bch_sb_field_quota *q = field_to_type(f, quota);
if (vstruct_bytes(&q->field) != sizeof(*q))
return "invalid field quota: wrong size";
return NULL;
}
const struct bch_sb_field_ops bch_sb_field_ops_quota = {
.validate = bch2_sb_validate_quota,
};
const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
struct bkey_s_c_quota dq;
@ -30,8 +45,8 @@ static const char * const bch2_quota_counters[] = {
"inodes",
};
static void bch2_quota_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_quota_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
char *out = buf, *end= buf + size;
struct bkey_s_c_quota dq;
@ -50,11 +65,6 @@ static void bch2_quota_to_text(struct bch_fs *c, char *buf,
}
}
const struct bkey_ops bch2_bkey_quota_ops = {
.key_invalid = bch2_quota_invalid,
.val_to_text = bch2_quota_to_text,
};
#ifdef CONFIG_BCACHEFS_QUOTA
#include <linux/cred.h>
@ -399,7 +409,7 @@ static void bch2_sb_quota_read(struct bch_fs *c)
struct bch_sb_field_quota *sb_quota;
unsigned i, j;
sb_quota = bch2_sb_get_quota(c->disk_sb);
sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
if (!sb_quota)
return;
@ -476,13 +486,13 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
mutex_lock(&c->sb_lock);
if (uflags & FS_QUOTA_UDQ_ENFD)
SET_BCH_SB_USRQUOTA(c->disk_sb, true);
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
if (uflags & FS_QUOTA_GDQ_ENFD)
SET_BCH_SB_GRPQUOTA(c->disk_sb, true);
SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
if (uflags & FS_QUOTA_PDQ_ENFD)
SET_BCH_SB_PRJQUOTA(c->disk_sb, true);
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@ -499,13 +509,13 @@ static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
mutex_lock(&c->sb_lock);
if (uflags & FS_QUOTA_UDQ_ENFD)
SET_BCH_SB_USRQUOTA(c->disk_sb, false);
SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
if (uflags & FS_QUOTA_GDQ_ENFD)
SET_BCH_SB_GRPQUOTA(c->disk_sb, false);
SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
if (uflags & FS_QUOTA_PDQ_ENFD)
SET_BCH_SB_PRJQUOTA(c->disk_sb, false);
SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@ -616,9 +626,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
q = &c->quotas[type];
mutex_lock(&c->sb_lock);
sb_quota = bch2_sb_get_quota(c->disk_sb);
sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
if (!sb_quota) {
sb_quota = bch2_fs_sb_resize_quota(c, sizeof(*sb_quota) / sizeof(u64));
sb_quota = bch2_sb_resize_quota(&c->disk_sb,
sizeof(*sb_quota) / sizeof(u64));
if (!sb_quota)
return -ENOSPC;
}

View File

@ -1,9 +1,18 @@
#ifndef _BCACHEFS_QUOTA_H
#define _BCACHEFS_QUOTA_H
#include "inode.h"
#include "quota_types.h"
extern const struct bkey_ops bch2_bkey_quota_ops;
extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_quota_ops (struct bkey_ops) { \
.key_invalid = bch2_quota_invalid, \
.val_to_text = bch2_quota_to_text, \
}
enum quota_acct_mode {
BCH_QUOTA_PREALLOC,

libbcachefs/replicas.c (new file, 698 lines)
View File

@ -0,0 +1,698 @@
#include "bcachefs.h"
#include "replicas.h"
#include "super-io.h"
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
/* Replicas tracking - in memory: */
#define for_each_cpu_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
_i = (void *) (_i) + (_r)->entry_size)
static inline struct bch_replicas_cpu_entry *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
}
static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
}
static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
unsigned dev)
{
e->devs[dev >> 3] |= 1 << (dev & 7);
}
static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
{
return (r->entry_size -
offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
}
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
char *buf, size_t size)
{
char *out = buf, *end = out + size;
struct bch_replicas_cpu_entry *e;
bool first = true;
unsigned i;
for_each_cpu_replicas_entry(r, e) {
bool first_e = true;
if (!first)
out += scnprintf(out, end - out, " ");
first = false;
out += scnprintf(out, end - out, "%u: [", e->data_type);
for (i = 0; i < replicas_dev_slots(r); i++)
if (replicas_test_dev(e, i)) {
if (!first_e)
out += scnprintf(out, end - out, " ");
first_e = false;
out += scnprintf(out, end - out, "%u", i);
}
out += scnprintf(out, end - out, "]");
}
return out - buf;
}
static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
{
const struct bch_extent_ptr *ptr;
unsigned nr = 0;
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
memset(r, 0, sizeof(*r));
r->data_type = data_type;
*max_dev = 0;
extent_for_each_ptr(e, ptr)
if (!ptr->cached) {
*max_dev = max_t(unsigned, *max_dev, ptr->dev);
replicas_set_dev(r, ptr->dev);
nr++;
}
return nr;
}
static inline void devlist_to_replicas(struct bch_devs_list devs,
enum bch_data_type data_type,
struct bch_replicas_cpu_entry *r,
unsigned *max_dev)
{
unsigned i;
BUG_ON(!data_type ||
data_type == BCH_DATA_SB ||
data_type >= BCH_DATA_NR);
memset(r, 0, sizeof(*r));
r->data_type = data_type;
*max_dev = 0;
for (i = 0; i < devs.nr; i++) {
*max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
replicas_set_dev(r, devs.devs[i]);
}
}
static struct bch_replicas_cpu *
cpu_replicas_add_entry(struct bch_replicas_cpu *old,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
struct bch_replicas_cpu *new;
unsigned i, nr, entry_size;
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
entry_size = max(entry_size, old->entry_size);
nr = old->nr + 1;
new = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!new)
return NULL;
new->nr = nr;
new->entry_size = entry_size;
for (i = 0; i < old->nr; i++)
memcpy(cpu_replicas_entry(new, i),
cpu_replicas_entry(old, i),
min(new->entry_size, old->entry_size));
memcpy(cpu_replicas_entry(new, old->nr),
&new_entry,
new->entry_size);
bch2_cpu_replicas_sort(new);
return new;
}
static bool replicas_has_entry(struct bch_replicas_cpu *r,
struct bch_replicas_cpu_entry search,
unsigned max_dev)
{
return max_dev < replicas_dev_slots(r) &&
eytzinger0_find(r->entries, r->nr,
r->entry_size,
memcmp, &search) < r->nr;
}
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
int ret = -ENOMEM;
mutex_lock(&c->sb_lock);
old_gc = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
if (!new_gc)
goto err;
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
if (!replicas_has_entry(old_r, new_entry, max_dev)) {
new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
if (!new_r)
goto err;
ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
if (ret)
goto err;
}
/* allocations done, now commit: */
if (new_r)
bch2_write_super(c);
/* don't update in memory replicas until changes are persistent */
if (new_gc) {
rcu_assign_pointer(c->replicas_gc, new_gc);
kfree_rcu(old_gc, rcu);
}
if (new_r) {
rcu_assign_pointer(c->replicas, new_r);
kfree_rcu(old_r, rcu);
}
mutex_unlock(&c->sb_lock);
return 0;
err:
mutex_unlock(&c->sb_lock);
if (new_gc)
kfree(new_gc);
if (new_r)
kfree(new_r);
return ret;
}
int bch2_mark_replicas(struct bch_fs *c,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
struct bch_replicas_cpu_entry search;
struct bch_replicas_cpu *r, *gc_r;
unsigned max_dev;
bool marked;
if (!devs.nr)
return 0;
BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
devlist_to_replicas(devs, data_type, &search, &max_dev);
rcu_read_lock();
r = rcu_dereference(c->replicas);
gc_r = rcu_dereference(c->replicas_gc);
marked = replicas_has_entry(r, search, max_dev) &&
(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
rcu_read_unlock();
return likely(marked) ? 0
: bch2_mark_replicas_slowpath(c, search, max_dev);
}
int bch2_mark_bkey_replicas(struct bch_fs *c,
enum bch_data_type data_type,
struct bkey_s_c k)
{
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
unsigned i;
int ret;
for (i = 0; i < cached.nr; i++)
if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
bch2_dev_list_single(cached.devs[i]))))
return ret;
return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
{
struct bch_replicas_cpu *new_r, *old_r;
int ret = 0;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
new_r = rcu_dereference_protected(c->replicas_gc,
lockdep_is_held(&c->sb_lock));
if (err) {
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(new_r, rcu);
goto err;
}
if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
ret = -ENOSPC;
goto err;
}
old_r = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, new_r);
rcu_assign_pointer(c->replicas_gc, NULL);
kfree_rcu(old_r, rcu);
bch2_write_super(c);
err:
mutex_unlock(&c->sb_lock);
return ret;
}
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
struct bch_replicas_cpu *dst, *src;
struct bch_replicas_cpu_entry *e;
lockdep_assert_held(&c->replicas_gc_lock);
mutex_lock(&c->sb_lock);
BUG_ON(c->replicas_gc);
src = rcu_dereference_protected(c->replicas,
lockdep_is_held(&c->sb_lock));
dst = kzalloc(sizeof(struct bch_replicas_cpu) +
src->nr * src->entry_size, GFP_NOIO);
if (!dst) {
mutex_unlock(&c->sb_lock);
return -ENOMEM;
}
dst->nr = 0;
dst->entry_size = src->entry_size;
for_each_cpu_replicas_entry(src, e)
if (!((1 << e->data_type) & typemask))
memcpy(cpu_replicas_entry(dst, dst->nr++),
e, dst->entry_size);
bch2_cpu_replicas_sort(dst);
rcu_assign_pointer(c->replicas_gc, dst);
mutex_unlock(&c->sb_lock);
return 0;
}
/* Replicas tracking - superblock: */
static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
unsigned *nr,
unsigned *bytes,
unsigned *max_dev)
{
struct bch_replicas_entry *i;
unsigned j;
*nr = 0;
*bytes = sizeof(*r);
*max_dev = 0;
if (!r)
return;
for_each_replicas_entry(r, i) {
for (j = 0; j < i->nr; j++)
*max_dev = max_t(unsigned, *max_dev, i->devs[j]);
(*nr)++;
}
*bytes = (void *) i - (void *) r;
}
static struct bch_replicas_cpu *
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
{
struct bch_replicas_cpu *cpu_r;
unsigned i, nr, bytes, max_dev, entry_size;
bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
DIV_ROUND_UP(max_dev + 1, 8);
cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
nr * entry_size, GFP_NOIO);
if (!cpu_r)
return NULL;
cpu_r->nr = nr;
cpu_r->entry_size = entry_size;
if (nr) {
struct bch_replicas_cpu_entry *dst =
cpu_replicas_entry(cpu_r, 0);
struct bch_replicas_entry *src = sb_r->entries;
while (dst < cpu_replicas_entry(cpu_r, nr)) {
dst->data_type = src->data_type;
for (i = 0; i < src->nr; i++)
replicas_set_dev(dst, src->devs[i]);
src = replicas_entry_next(src);
dst = (void *) dst + entry_size;
}
}
bch2_cpu_replicas_sort(cpu_r);
return cpu_r;
}
int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_cpu *cpu_r, *old_r;
sb_r = bch2_sb_get_replicas(c->disk_sb.sb);
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
return -ENOMEM;
old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
rcu_assign_pointer(c->replicas, cpu_r);
if (old_r)
kfree_rcu(old_r, rcu);
return 0;
}
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_sb_field_replicas *sb_r;
struct bch_replicas_entry *sb_e;
struct bch_replicas_cpu_entry *e;
size_t i, bytes;
bytes = sizeof(struct bch_sb_field_replicas);
for_each_cpu_replicas_entry(r, e) {
bytes += sizeof(struct bch_replicas_entry);
for (i = 0; i < r->entry_size - 1; i++)
bytes += hweight8(e->devs[i]);
}
sb_r = bch2_sb_resize_replicas(&c->disk_sb,
DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
if (!sb_r)
return -ENOSPC;
memset(&sb_r->entries, 0,
vstruct_end(&sb_r->field) -
(void *) &sb_r->entries);
sb_e = sb_r->entries;
for_each_cpu_replicas_entry(r, e) {
sb_e->data_type = e->data_type;
for (i = 0; i < replicas_dev_slots(r); i++)
if (replicas_test_dev(e, i))
sb_e->devs[sb_e->nr++] = i;
sb_e = replicas_entry_next(sb_e);
BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
}
return 0;
}
static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
{
struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
struct bch_replicas_cpu *cpu_r = NULL;
struct bch_replicas_entry *e;
const char *err;
unsigned i;
for_each_replicas_entry(sb_r, e) {
err = "invalid replicas entry: invalid data type";
if (e->data_type >= BCH_DATA_NR)
goto err;
err = "invalid replicas entry: no devices";
if (!e->nr)
goto err;
err = "invalid replicas entry: too many devices";
if (e->nr >= BCH_REPLICAS_MAX)
goto err;
err = "invalid replicas entry: invalid device";
for (i = 0; i < e->nr; i++)
if (!bch2_dev_exists(sb, mi, e->devs[i]))
goto err;
}
err = "cannot allocate memory";
cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
if (!cpu_r)
goto err;
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
memcmp, NULL);
for (i = 0; i + 1 < cpu_r->nr; i++) {
struct bch_replicas_cpu_entry *l =
cpu_replicas_entry(cpu_r, i);
struct bch_replicas_cpu_entry *r =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
err = "duplicate replicas entry";
if (!memcmp(l, r, cpu_r->entry_size))
goto err;
}
err = NULL;
err:
kfree(cpu_r);
return err;
}
const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
.validate = bch2_sb_validate_replicas,
};
int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
{
char *out = buf, *end = out + size;
struct bch_replicas_entry *e;
bool first = true;
unsigned i;
if (!r) {
out += scnprintf(out, end - out, "(no replicas section found)");
return out - buf;
}
for_each_replicas_entry(r, e) {
if (!first)
out += scnprintf(out, end - out, " ");
first = false;
out += scnprintf(out, end - out, "%u: [", e->data_type);
for (i = 0; i < e->nr; i++)
out += scnprintf(out, end - out,
i ? " %u" : "%u", e->devs[i]);
out += scnprintf(out, end - out, "]");
}
return out - buf;
}
/* Query replicas: */
bool bch2_replicas_marked(struct bch_fs *c,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
struct bch_replicas_cpu_entry search;
unsigned max_dev;
bool ret;
if (!devs.nr)
return true;
devlist_to_replicas(devs, data_type, &search, &max_dev);
rcu_read_lock();
ret = replicas_has_entry(rcu_dereference(c->replicas),
search, max_dev);
rcu_read_unlock();
return ret;
}
bool bch2_bkey_replicas_marked(struct bch_fs *c,
enum bch_data_type data_type,
struct bkey_s_c k)
{
struct bch_devs_list cached = bch2_bkey_cached_devs(k);
unsigned i;
for (i = 0; i < cached.nr; i++)
if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
bch2_dev_list_single(cached.devs[i])))
return false;
return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
}
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_devs_mask online_devs)
{
struct bch_sb_field_members *mi;
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned i, dev, dev_slots, nr_online, nr_offline;
struct replicas_status ret;
memset(&ret, 0, sizeof(ret));
for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
ret.replicas[i].nr_online = UINT_MAX;
mi = bch2_sb_get_members(c->disk_sb.sb);
rcu_read_lock();
r = rcu_dereference(c->replicas);
dev_slots = replicas_dev_slots(r);
for_each_cpu_replicas_entry(r, e) {
if (e->data_type >= ARRAY_SIZE(ret.replicas))
panic("e %p data_type %u\n", e, e->data_type);
nr_online = nr_offline = 0;
for (dev = 0; dev < dev_slots; dev++) {
if (!replicas_test_dev(e, dev))
continue;
BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev));
if (test_bit(dev, online_devs.d))
nr_online++;
else
nr_offline++;
}
ret.replicas[e->data_type].nr_online =
min(ret.replicas[e->data_type].nr_online,
nr_online);
ret.replicas[e->data_type].nr_offline =
max(ret.replicas[e->data_type].nr_offline,
nr_offline);
}
rcu_read_unlock();
return ret;
}
struct replicas_status bch2_replicas_status(struct bch_fs *c)
{
return __bch2_replicas_status(c, bch2_online_devs(c));
}
static bool have_enough_devs(struct replicas_status s,
enum bch_data_type type,
bool force_if_degraded,
bool force_if_lost)
{
return (!s.replicas[type].nr_offline || force_if_degraded) &&
(s.replicas[type].nr_online || force_if_lost);
}
bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
{
return (have_enough_devs(s, BCH_DATA_JOURNAL,
flags & BCH_FORCE_IF_METADATA_DEGRADED,
flags & BCH_FORCE_IF_METADATA_LOST) &&
have_enough_devs(s, BCH_DATA_BTREE,
flags & BCH_FORCE_IF_METADATA_DEGRADED,
flags & BCH_FORCE_IF_METADATA_LOST) &&
have_enough_devs(s, BCH_DATA_USER,
flags & BCH_FORCE_IF_DATA_DEGRADED,
flags & BCH_FORCE_IF_DATA_LOST));
}
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
{
struct replicas_status s = bch2_replicas_status(c);
return meta
? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
s.replicas[BCH_DATA_BTREE].nr_online)
: s.replicas[BCH_DATA_USER].nr_online;
}
unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
{
struct bch_replicas_cpu_entry *e;
struct bch_replicas_cpu *r;
unsigned ret = 0;
rcu_read_lock();
r = rcu_dereference(c->replicas);
if (ca->dev_idx >= replicas_dev_slots(r))
goto out;
for_each_cpu_replicas_entry(r, e)
if (replicas_test_dev(e, ca->dev_idx))
ret |= 1 << e->data_type;
out:
rcu_read_unlock();
return ret;
}
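/*
 * Editor's note: replicas_set_dev()/replicas_test_dev() above pack one bit
 * per device index, eight devices per byte.  A standalone sketch of the same
 * packing, not part of the commit:
 */
#include <assert.h>
#include <stdint.h>

static void set_dev(uint8_t *devs, unsigned dev)
{
	devs[dev >> 3] |= 1 << (dev & 7);
}

static int test_dev(const uint8_t *devs, unsigned dev)
{
	return (devs[dev >> 3] & (1 << (dev & 7))) != 0;
}

int main(void)
{
	uint8_t devs[4] = { 0 };	/* slots for 32 device indexes */

	set_dev(devs, 0);
	set_dev(devs, 9);		/* byte 1, bit 1 */

	assert(test_dev(devs, 0));
	assert(test_dev(devs, 9));
	assert(!test_dev(devs, 10));
	return 0;
}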

libbcachefs/replicas.h (new file, 51 lines)
View File

@ -0,0 +1,51 @@
#ifndef _BCACHEFS_REPLICAS_H
#define _BCACHEFS_REPLICAS_H
bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
struct bch_devs_list);
bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
struct bkey_s_c);
int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
struct bch_devs_list);
int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
struct bkey_s_c);
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
struct replicas_status {
struct {
unsigned nr_online;
unsigned nr_offline;
} replicas[BCH_DATA_NR];
};
struct replicas_status __bch2_replicas_status(struct bch_fs *,
struct bch_devs_mask);
struct replicas_status bch2_replicas_status(struct bch_fs *);
bool bch2_have_enough_devs(struct replicas_status, unsigned);
unsigned bch2_replicas_online(struct bch_fs *, bool);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
/* iterate over superblock replicas - used by userspace tools: */
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
#endif /* _BCACHEFS_REPLICAS_H */

File diff suppressed because it is too large

View File

@ -11,8 +11,6 @@
struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *,
enum bch_sb_field_type, unsigned);
struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *,
enum bch_sb_field_type, unsigned);
#define field_to_type(_f, _name) \
container_of_or_null(_f, struct bch_sb_field_##_name, field)
@ -30,13 +28,6 @@ bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \
{ \
return field_to_type(bch2_sb_field_resize(sb, \
BCH_SB_FIELD_##_name, u64s), _name); \
} \
\
static inline struct bch_sb_field_##_name * \
bch2_fs_sb_resize_##_name(struct bch_fs *c, unsigned u64s) \
{ \
return field_to_type(bch2_fs_sb_field_resize(c, \
BCH_SB_FIELD_##_name, u64s), _name); \
}
BCH_SB_FIELDS()
@ -44,6 +35,12 @@ BCH_SB_FIELDS()
extern const char * const bch2_sb_fields[];
struct bch_sb_field_ops {
const char * (*validate)(struct bch_sb *, struct bch_sb_field *);
size_t (*to_text)(char *, size_t, struct bch_sb *,
struct bch_sb_field *);
};
static inline bool bch2_sb_test_feature(struct bch_sb *sb,
enum bch_sb_features f)
{
@ -90,7 +87,7 @@ int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
void bch2_free_super(struct bch_sb_handle *);
int bch2_super_realloc(struct bch_sb_handle *, unsigned);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
const char *bch2_sb_validate(struct bch_sb_handle *);
@ -139,135 +136,4 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
};
}
/* BCH_SB_FIELD_replicas: */
bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
struct bch_devs_list);
bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
struct bkey_s_c);
int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
struct bch_devs_list);
int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
struct bkey_s_c);
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
struct replicas_status {
struct {
unsigned nr_online;
unsigned nr_offline;
} replicas[BCH_DATA_NR];
};
struct replicas_status __bch2_replicas_status(struct bch_fs *,
struct bch_devs_mask);
struct replicas_status bch2_replicas_status(struct bch_fs *);
bool bch2_have_enough_devs(struct replicas_status, unsigned);
unsigned bch2_replicas_online(struct bch_fs *, bool);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
int bch2_replicas_gc_end(struct bch_fs *, int);
int bch2_replicas_gc_start(struct bch_fs *, unsigned);
/* iterate over superblock replicas - used by userspace tools: */
static inline struct bch_replicas_entry *
replicas_entry_next(struct bch_replicas_entry *i)
{
return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
}
#define for_each_replicas_entry(_r, _i) \
for (_i = (_r)->entries; \
(void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
(_i) = replicas_entry_next(_i))
/* disk groups: */
static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
{
return groups
? (vstruct_end(&groups->field) -
(void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
: 0;
}
struct target {
enum {
TARGET_NULL,
TARGET_DEV,
TARGET_GROUP,
} type;
union {
unsigned dev;
unsigned group;
};
};
#define TARGET_DEV_START 1
#define TARGET_GROUP_START (256 + TARGET_DEV_START)
static inline u16 dev_to_target(unsigned dev)
{
return TARGET_DEV_START + dev;
}
static inline u16 group_to_target(unsigned group)
{
return TARGET_GROUP_START + group;
}
static inline struct target target_decode(unsigned target)
{
if (target >= TARGET_GROUP_START)
return (struct target) {
.type = TARGET_GROUP,
.group = target - TARGET_GROUP_START
};
if (target >= TARGET_DEV_START)
return (struct target) {
.type = TARGET_DEV,
.group = target - TARGET_DEV_START
};
return (struct target) { .type = TARGET_NULL };
}
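A worked example of the target encoding above, with device targets occupying [TARGET_DEV_START, TARGET_GROUP_START) and group targets following (values chosen only to illustrate the arithmetic):

	dev_to_target(0)   == 1      /* TARGET_DEV_START + 0   */
	group_to_target(0) == 257    /* TARGET_GROUP_START + 0 */
	target_decode(5)   == (struct target) { .type = TARGET_DEV,   .dev   = 4 }
	target_decode(260) == (struct target) { .type = TARGET_GROUP, .group = 3 }
	target_decode(0)   == (struct target) { .type = TARGET_NULL }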
static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
{
struct target t = target_decode(target);
switch (t.type) {
case TARGET_NULL:
return false;
case TARGET_DEV:
return ca->dev_idx == t.dev;
case TARGET_GROUP:
return ca->mi.group && ca->mi.group - 1 == t.group;
default:
BUG();
}
}
static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
{
bool ret;
rcu_read_lock();
ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
rcu_read_unlock();
return ret;
}
const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
int __bch2_disk_group_find(struct bch_sb_field_disk_groups *, const char *);
int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);
#endif /* _BCACHEFS_SUPER_IO_H */

View File

@ -18,6 +18,7 @@
#include "clock.h"
#include "compress.h"
#include "debug.h"
#include "disk_groups.h"
#include "error.h"
#include "fs.h"
#include "fs-io.h"
@ -30,6 +31,7 @@
#include "migrate.h"
#include "movinggc.h"
#include "quota.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
#include "sysfs.h"
@ -122,7 +124,7 @@ static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid)
lockdep_assert_held(&bch_fs_list_lock);
list_for_each_entry(c, &bch_fs_list, list)
if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le)))
return c;
return NULL;
@ -203,23 +205,12 @@ static void bch_fs_mark_clean(struct bch_fs *c)
!test_bit(BCH_FS_ERROR, &c->flags) &&
!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
mutex_lock(&c->sb_lock);
SET_BCH_SB_CLEAN(c->disk_sb, true);
SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
}
static bool btree_interior_updates_done(struct bch_fs *c)
{
bool ret;
mutex_lock(&c->btree_interior_update_lock);
ret = list_empty(&c->btree_interior_update_list);
mutex_unlock(&c->btree_interior_update_lock);
return ret;
}
static void __bch2_fs_read_only(struct bch_fs *c)
{
struct bch_dev *ca;
@ -251,7 +242,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
* fully complete:
*/
closure_wait_event(&c->btree_interior_update_wait,
btree_interior_updates_done(c));
!bch2_btree_interior_updates_nr_pending(c));
if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
bch2_btree_verify_flushed(c);
@ -433,7 +424,8 @@ static void bch2_fs_free(struct bch_fs *c)
if (c->wq)
destroy_workqueue(c->wq);
free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
free_pages((unsigned long) c->disk_sb.sb,
c->disk_sb.page_order);
kvpfree(c, sizeof(*c));
module_put(THIS_MODULE);
}
@ -501,11 +493,54 @@ void bch2_fs_stop(struct bch_fs *c)
kobject_put(&c->kobj);
}
static const char *bch2_fs_online(struct bch_fs *c)
{
struct bch_dev *ca;
const char *err = NULL;
unsigned i;
int ret;
lockdep_assert_held(&bch_fs_list_lock);
if (!list_empty(&c->list))
return NULL;
if (__bch2_uuid_to_fs(c->sb.uuid))
return "filesystem UUID already open";
ret = bch2_fs_chardev_init(c);
if (ret)
return "error creating character device";
bch2_fs_debug_init(c);
if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
kobject_add(&c->internal, &c->kobj, "internal") ||
kobject_add(&c->opts_dir, &c->kobj, "options") ||
kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
bch2_opts_create_sysfs_files(&c->opts_dir))
return "error creating sysfs objects";
mutex_lock(&c->state_lock);
err = "error creating sysfs objects";
__for_each_member_device(ca, c, i, NULL)
if (bch2_dev_sysfs_online(c, ca))
goto err;
list_add(&c->list, &bch_fs_list);
err = NULL;
err:
mutex_unlock(&c->state_lock);
return err;
}
static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
{
struct bch_sb_field_members *mi;
struct bch_fs *c;
unsigned i, iter_size;
const char *err;
pr_verbose_init(opts, "");
@ -516,6 +551,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
__module_get(THIS_MODULE);
c->minor = -1;
c->disk_sb.fs_sb = true;
mutex_init(&c->state_lock);
mutex_init(&c->sb_lock);
@ -627,9 +663,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
bch2_fs_fsio_init(c))
goto err;
mi = bch2_sb_get_members(c->disk_sb);
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->sb.nr_devices; i++)
if (bch2_dev_exists(c->disk_sb, mi, i) &&
if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
bch2_dev_alloc(c, i))
goto err;
@ -644,6 +680,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
kobject_init(&c->internal, &bch2_fs_internal_ktype);
kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
mutex_lock(&bch_fs_list_lock);
err = bch2_fs_online(c);
mutex_unlock(&bch_fs_list_lock);
if (err) {
bch_err(c, "bch2_fs_online() error: %s", err);
goto err;
}
out:
pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
return c;
@ -653,60 +697,7 @@ err:
goto out;
}
static const char *__bch2_fs_online(struct bch_fs *c)
{
struct bch_dev *ca;
const char *err = NULL;
unsigned i;
int ret;
lockdep_assert_held(&bch_fs_list_lock);
if (!list_empty(&c->list))
return NULL;
if (__bch2_uuid_to_fs(c->sb.uuid))
return "filesystem UUID already open";
ret = bch2_fs_chardev_init(c);
if (ret)
return "error creating character device";
bch2_fs_debug_init(c);
if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
kobject_add(&c->internal, &c->kobj, "internal") ||
kobject_add(&c->opts_dir, &c->kobj, "options") ||
kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
bch2_opts_create_sysfs_files(&c->opts_dir))
return "error creating sysfs objects";
mutex_lock(&c->state_lock);
err = "error creating sysfs objects";
__for_each_member_device(ca, c, i, NULL)
if (bch2_dev_sysfs_online(c, ca))
goto err;
list_add(&c->list, &bch_fs_list);
err = NULL;
err:
mutex_unlock(&c->state_lock);
return err;
}
static const char *bch2_fs_online(struct bch_fs *c)
{
const char *err;
mutex_lock(&bch_fs_list_lock);
err = __bch2_fs_online(c);
mutex_unlock(&bch_fs_list_lock);
return err;
}
static const char *__bch2_fs_start(struct bch_fs *c)
const char *bch2_fs_start(struct bch_fs *c)
{
const char *err = "cannot allocate memory";
struct bch_sb_field_members *mi;
@ -730,15 +721,15 @@ static const char *__bch2_fs_start(struct bch_fs *c)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
if (BCH_SB_INITIALIZED(c->disk_sb)) {
if (BCH_SB_INITIALIZED(c->disk_sb.sb)) {
ret = bch2_journal_read(c, &journal);
if (ret)
goto err;
j = &list_entry(journal.prev, struct journal_replay, list)->j;
c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
for (i = 0; i < BTREE_ID_NR; i++) {
unsigned level;
@ -824,21 +815,18 @@ static const char *__bch2_fs_start(struct bch_fs *c)
bch_notice(c, "initializing new filesystem");
set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
set_bit(BCH_FS_BRAND_NEW_FS, &c->flags);
ret = bch2_initial_gc(c, &journal);
if (ret)
goto err;
err = "unable to allocate journal buckets";
for_each_rw_member(ca, c, i)
if (bch2_dev_journal_alloc(c, ca)) {
for_each_online_member(ca, c, i)
if (bch2_dev_journal_alloc(ca)) {
percpu_ref_put(&ca->io_ref);
goto err;
}
clear_bit(BCH_FS_BRAND_NEW_FS, &c->flags);
for (i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
@ -889,18 +877,20 @@ recovery_done:
}
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
mi = bch2_sb_get_members(c->disk_sb.sb);
now = ktime_get_seconds();
for_each_member_device(ca, c, i)
mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
SET_BCH_SB_INITIALIZED(c->disk_sb, true);
SET_BCH_SB_CLEAN(c->disk_sb, false);
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
set_bit(BCH_FS_STARTED, &c->flags);
err = NULL;
out:
mutex_unlock(&c->state_lock);
@ -939,11 +929,6 @@ fsck_err:
goto out;
}
const char *bch2_fs_start(struct bch_fs *c)
{
return __bch2_fs_start(c) ?: bch2_fs_online(c);
}
static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
{
struct bch_sb_field_members *sb_mi;
@ -956,7 +941,7 @@ static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
return "mismatched block size";
if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
return "new cache bucket size is too small";
return NULL;
@ -1082,28 +1067,19 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
return 0;
}
static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
struct bch_member *member)
{
struct bch_member *member;
struct bch_dev *ca = NULL;
int ret = 0;
pr_verbose_init(c->opts, "");
if (bch2_fs_init_fault("dev_alloc"))
goto err;
struct bch_dev *ca;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
goto err;
return NULL;
kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->ref_completion);
init_completion(&ca->io_ref_completion);
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
init_rwsem(&ca->bucket_lock);
writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
@ -1113,14 +1089,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
INIT_WORK(&ca->io_error_work, bch2_io_error_work);
if (bch2_fs_init_fault("dev_alloc"))
goto err;
member = bch2_sb_get_members(c->disk_sb)->members + dev_idx;
ca->mi = bch2_mi_to_cpu(member);
ca->uuid = member->uuid;
scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
0, GFP_KERNEL) ||
@ -1132,11 +1102,43 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
!(ca->io_done = alloc_percpu(*ca->io_done)))
goto err;
return ca;
err:
bch2_dev_free(ca);
return NULL;
}
static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
unsigned dev_idx)
{
ca->dev_idx = dev_idx;
__set_bit(ca->dev_idx, ca->self.d);
scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
ca->fs = c;
rcu_assign_pointer(c->devs[ca->dev_idx], ca);
if (bch2_dev_sysfs_online(c, ca))
pr_warn("error creating sysfs objects");
}
static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
struct bch_member *member =
bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx;
struct bch_dev *ca = NULL;
int ret = 0;
pr_verbose_init(c->opts, "");
if (bch2_fs_init_fault("dev_alloc"))
goto err;
ca = __bch2_dev_alloc(c, member);
if (!ca)
goto err;
bch2_dev_attach(c, ca, dev_idx);
out:
pr_verbose_init(c->opts, "ret %i", ret);
return ret;
@ -1147,21 +1149,9 @@ err:
goto out;
}
static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
{
struct bch_dev *ca;
int ret;
lockdep_assert_held(&c->state_lock);
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb->seq))
bch2_sb_to_fs(c, sb->sb);
BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
!c->devs[sb->sb->dev_idx]);
ca = bch_dev_locked(c, sb->sb->dev_idx);
unsigned ret;
if (bch2_dev_is_online(ca)) {
bch_err(ca, "already have device online in slot %u",
@ -1179,7 +1169,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (get_capacity(sb->bdev->bd_disk) <
ca->mi.bucket_size * ca->mi.nbuckets) {
bch_err(c, "device too small");
bch_err(ca, "device too small");
return -EINVAL;
}
@ -1187,35 +1177,50 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
if (ret)
return ret;
/*
* Increase journal write timeout if flushes to this device are
* expensive:
*/
if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) &&
journal_flushes_device(ca))
c->journal.write_delay_ms =
max(c->journal.write_delay_ms, 1000U);
/* Commit: */
ca->disk_sb = *sb;
if (sb->mode & FMODE_EXCL)
ca->disk_sb.bdev->bd_holder = ca;
memset(sb, 0, sizeof(*sb));
if (ca->fs)
mutex_lock(&ca->fs->sb_lock);
bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
if (ca->fs)
mutex_unlock(&ca->fs->sb_lock);
percpu_ref_reinit(&ca->io_ref);
return 0;
}
static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
{
struct bch_dev *ca;
int ret;
lockdep_assert_held(&c->state_lock);
if (le64_to_cpu(sb->sb->seq) >
le64_to_cpu(c->disk_sb.sb->seq))
bch2_sb_to_fs(c, sb->sb);
BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
!c->devs[sb->sb->dev_idx]);
ca = bch_dev_locked(c, sb->sb->dev_idx);
ret = __bch2_dev_attach_bdev(ca, sb);
if (ret)
return ret;
if (c->sb.nr_devices == 1)
bdevname(ca->disk_sb.bdev, c->name);
bdevname(ca->disk_sb.bdev, ca->name);
mutex_lock(&c->sb_lock);
bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
mutex_unlock(&c->sb_lock);
if (ca->mi.state == BCH_MEMBER_STATE_RW)
bch2_dev_allocator_add(c, ca);
rebalance_wakeup(c);
percpu_ref_reinit(&ca->io_ref);
return 0;
}
@ -1289,10 +1294,10 @@ static bool bch2_fs_may_start(struct bch_fs *c)
if (!c->opts.degraded) {
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
mi = bch2_sb_get_members(c->disk_sb.sb);
for (i = 0; i < c->disk_sb->nr_devices; i++) {
if (!bch2_dev_exists(c->disk_sb, mi, i))
for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
if (!bch2_dev_exists(c->disk_sb.sb, mi, i))
continue;
ca = bch_dev_locked(c, i);
@ -1360,7 +1365,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
bch_notice(ca, "%s", bch2_dev_state[new_state]);
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
mi = bch2_sb_get_members(c->disk_sb.sb);
SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
@ -1470,7 +1475,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
* this device must be gone:
*/
mutex_lock(&c->sb_lock);
mi = bch2_sb_get_members(c->disk_sb);
mi = bch2_sb_get_members(c->disk_sb.sb);
memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
bch2_write_super(c);
@ -1492,8 +1497,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
struct bch_sb_handle sb;
const char *err;
struct bch_dev *ca = NULL;
struct bch_sb_field_members *mi, *dev_mi;
struct bch_member saved_mi;
struct bch_sb_field_members *mi;
struct bch_member dev_mi;
unsigned dev_idx, nr_devices, u64s;
int ret;
@ -1505,24 +1510,52 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
if (err)
return -EINVAL;
dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
err = bch2_dev_may_add(sb.sb, c);
if (err)
return -EINVAL;
ca = __bch2_dev_alloc(c, &dev_mi);
if (!ca) {
bch2_free_super(&sb);
return -ENOMEM;
}
ret = __bch2_dev_attach_bdev(ca, &sb);
if (ret) {
bch2_dev_free(ca);
return ret;
}
err = "journal alloc failed";
ret = bch2_dev_journal_alloc(ca);
if (ret)
goto err;
mutex_lock(&c->state_lock);
mutex_lock(&c->sb_lock);
/* Grab member info for new disk: */
dev_mi = bch2_sb_get_members(sb.sb);
saved_mi = dev_mi->members[sb.sb->dev_idx];
saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
err = "insufficient space in new superblock";
ret = bch2_sb_from_fs(c, ca);
if (ret)
goto err_unlock;
mi = bch2_sb_get_members(ca->disk_sb.sb);
if (!bch2_sb_resize_members(&ca->disk_sb,
le32_to_cpu(mi->field.u64s) +
sizeof(dev_mi) / sizeof(u64))) {
ret = -ENOSPC;
goto err_unlock;
}
if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;
mi = bch2_sb_get_members(c->disk_sb);
mi = bch2_sb_get_members(c->disk_sb.sb);
for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
if (!bch2_dev_exists(c->disk_sb, mi, dev_idx))
if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
goto have_slot;
no_slot:
err = "no slots available in superblock";
@ -1533,64 +1566,47 @@ have_slot:
nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
u64s = (sizeof(struct bch_sb_field_members) +
sizeof(struct bch_member) * nr_devices) / sizeof(u64);
err = "no space in superblock for member info";
ret = -ENOSPC;
dev_mi = bch2_sb_resize_members(&sb, u64s);
if (!dev_mi)
goto err_unlock;
mi = bch2_fs_sb_resize_members(c, u64s);
mi = bch2_sb_resize_members(&c->disk_sb, u64s);
if (!mi)
goto err_unlock;
memcpy(dev_mi, mi, u64s * sizeof(u64));
dev_mi->members[dev_idx] = saved_mi;
/* success: */
sb.sb->uuid = c->disk_sb->uuid;
sb.sb->dev_idx = dev_idx;
sb.sb->nr_devices = nr_devices;
mi->members[dev_idx] = dev_mi;
mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_seconds());
c->disk_sb.sb->nr_devices = nr_devices;
/* commit new member info */
memcpy(mi, dev_mi, u64s * sizeof(u64));
c->disk_sb->nr_devices = nr_devices;
c->sb.nr_devices = nr_devices;
ca->disk_sb.sb->dev_idx = dev_idx;
bch2_dev_attach(c, ca, dev_idx);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
if (bch2_dev_alloc(c, dev_idx)) {
err = "cannot allocate memory";
ret = -ENOMEM;
goto err;
}
if (__bch2_dev_online(c, &sb)) {
err = "bch2_dev_online() error";
ret = -ENOMEM;
goto err;
}
ca = bch_dev_locked(c, dev_idx);
if (ca->mi.state == BCH_MEMBER_STATE_RW) {
err = __bch2_dev_read_write(c, ca);
if (err)
goto err;
err = "journal alloc failed";
if (bch2_dev_journal_alloc(c, ca))
goto err;
goto err_late;
}
mutex_unlock(&c->state_lock);
return 0;
err_unlock:
mutex_unlock(&c->sb_lock);
err:
mutex_unlock(&c->state_lock);
err:
if (ca)
bch2_dev_free(ca);
bch2_free_super(&sb);
bch_err(c, "Unable to add device: %s", err);
return ret ?: -EINVAL;
return ret;
err_late:
bch_err(c, "Error going rw after adding device: %s", err);
return -EINVAL;
}
/* Hot add existing device to running filesystem: */
@ -1613,12 +1629,12 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
err = bch2_dev_in_fs(c->disk_sb, sb.sb);
err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
if (err)
goto err;
if (__bch2_dev_online(c, &sb)) {
err = "__bch2_dev_online() error";
if (bch2_dev_attach_bdev(c, &sb)) {
err = "bch2_dev_attach_bdev() error";
goto err;
}
@ -1688,7 +1704,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
}
mutex_lock(&c->sb_lock);
mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
mi->nbuckets = cpu_to_le64(nbuckets);
bch2_write_super(c);
@ -1721,74 +1737,6 @@ found:
return ca;
}
int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *label)
{
struct bch_sb_field_disk_groups *groups;
struct bch_disk_group *g;
struct bch_member *mi;
unsigned i, v, nr_groups;
int ret;
if (strlen(label) > BCH_SB_LABEL_SIZE)
return -EINVAL;
mutex_lock(&c->sb_lock);
groups = bch2_sb_get_disk_groups(c->disk_sb);
nr_groups = disk_groups_nr(groups);
if (!strcmp(label, "none")) {
v = 0;
goto write_sb;
}
ret = __bch2_disk_group_find(groups, label);
if (ret >= 0) {
v = ret + 1;
goto write_sb;
}
/* not found - create a new disk group: */
for (i = 0;
i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
i++)
;
if (i == nr_groups) {
unsigned u64s =
(sizeof(struct bch_sb_field_disk_groups) +
sizeof(struct bch_disk_group) * (nr_groups + 1)) /
sizeof(u64);
groups = bch2_fs_sb_resize_disk_groups(c, u64s);
if (!groups) {
mutex_unlock(&c->sb_lock);
return -ENOSPC;
}
nr_groups = disk_groups_nr(groups);
}
BUG_ON(i >= nr_groups);
g = &groups->entries[i];
v = i + 1;
memcpy(g->label, label, strlen(label));
if (strlen(label) < sizeof(g->label))
g->label[strlen(label)] = '\0';
SET_BCH_GROUP_DELETED(g, 0);
SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
write_sb:
mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
SET_BCH_MEMBER_GROUP(mi, v);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
return 0;
}
/* Filesystem open: */
struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
@ -1845,7 +1793,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
err = "bch2_dev_online() error";
mutex_lock(&c->state_lock);
for (i = 0; i < nr_devices; i++)
if (__bch2_dev_online(c, &sb[i])) {
if (bch2_dev_attach_bdev(c, &sb[i])) {
mutex_unlock(&c->state_lock);
goto err_print;
}
@ -1856,15 +1804,10 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
goto err_print;
if (!c->opts.nostart) {
err = __bch2_fs_start(c);
err = bch2_fs_start(c);
if (err)
goto err_print;
}
err = bch2_fs_online(c);
if (err)
goto err_print;
out:
kfree(sb);
module_put(THIS_MODULE);
@ -1900,7 +1843,7 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
if (c) {
closure_get(&c->cl);
err = bch2_dev_in_fs(c->disk_sb, sb->sb);
err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
if (err)
goto err;
} else {
@ -1915,22 +1858,18 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
err = "bch2_dev_online() error";
mutex_lock(&c->sb_lock);
if (__bch2_dev_online(c, sb)) {
if (bch2_dev_attach_bdev(c, sb)) {
mutex_unlock(&c->sb_lock);
goto err;
}
mutex_unlock(&c->sb_lock);
if (!c->opts.nostart && bch2_fs_may_start(c)) {
err = __bch2_fs_start(c);
err = bch2_fs_start(c);
if (err)
goto err;
}
err = __bch2_fs_online(c);
if (err)
goto err;
closure_put(&c->cl);
mutex_unlock(&bch_fs_list_lock);

View File

@ -195,7 +195,6 @@ int bch2_dev_online(struct bch_fs *, const char *);
int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
bool bch2_fs_emergency_read_only(struct bch_fs *);
void bch2_fs_read_only(struct bch_fs *);

View File

@ -7,6 +7,9 @@ struct bch_sb_handle {
struct bio *bio;
unsigned page_order;
fmode_t mode;
unsigned have_layout:1;
unsigned have_bio:1;
unsigned fs_sb:1;
};
struct bch_devs_mask {
@ -44,8 +47,9 @@ struct bch_replicas_cpu {
};
struct bch_disk_group_cpu {
struct bch_devs_mask devs;
bool deleted;
u16 parent;
struct bch_devs_mask devs;
};
struct bch_disk_groups_cpu {

View File

@ -18,11 +18,13 @@
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "disk_groups.h"
#include "inode.h"
#include "journal.h"
#include "keylist.h"
#include "move.h"
#include "opts.h"
#include "replicas.h"
#include "super-io.h"
#include "tier.h"
@ -140,10 +142,10 @@ read_attribute(first_bucket);
read_attribute(nbuckets);
read_attribute(durability);
read_attribute(iostats);
read_attribute(read_priority_stats);
read_attribute(write_priority_stats);
read_attribute(fragmentation_stats);
read_attribute(oldest_gen_stats);
read_attribute(last_read_quantiles);
read_attribute(last_write_quantiles);
read_attribute(fragmentation_quantiles);
read_attribute(oldest_gen_quantiles);
read_attribute(reserve_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
@ -167,7 +169,7 @@ rw_attribute(journal_reclaim_delay_ms);
rw_attribute(discard);
rw_attribute(cache_replacement_policy);
rw_attribute(group);
rw_attribute(label);
rw_attribute(copy_gc_enabled);
sysfs_pd_controller_attribute(copy_gc);
@ -546,7 +548,7 @@ STORE(bch2_fs_opts_dir)
if (opt->set_sb != SET_NO_SB_OPT) {
mutex_lock(&c->sb_lock);
opt->set_sb(c->disk_sb, v);
opt->set_sb(c->disk_sb.sb, v);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
}
@ -621,36 +623,41 @@ struct attribute *bch2_fs_time_stats_files[] = {
NULL
};
typedef unsigned (bucket_map_fn)(struct bch_dev *, size_t, void *);
typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *,
size_t, void *);
static unsigned bucket_priority_fn(struct bch_dev *ca, size_t b,
void *private)
static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
struct bucket *g = bucket(ca, b);
int rw = (private ? 1 : 0);
return ca->fs->prio_clock[rw].hand - g->prio[rw];
return bucket_last_io(c, bucket(ca, b), rw);
}
static unsigned bucket_sectors_used_fn(struct bch_dev *ca, size_t b,
void *private)
static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
struct bucket *g = bucket(ca, b);
return bucket_sectors_used(g->mark);
}
static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, size_t b,
void *private)
static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
size_t b, void *private)
{
return bucket_gc_gen(ca, b);
}
static ssize_t show_quantiles(struct bch_dev *ca, char *buf,
bucket_map_fn *fn, void *private)
static int unsigned_cmp(const void *_l, const void *_r)
{
int cmp(const void *l, const void *r)
{ return *((unsigned *) r) - *((unsigned *) l); }
unsigned l = *((unsigned *) _l);
unsigned r = *((unsigned *) _r);
return (l > r) - (l < r);
}
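unsigned_cmp() uses the (l > r) - (l < r) idiom, which always returns -1, 0 or 1 and cannot misbehave the way subtracting two unsigneds can when their difference exceeds INT_MAX. A standalone userspace sketch of the same comparator (purely illustrative, using qsort() rather than the kernel's sort()):

	#include <stdio.h>
	#include <stdlib.h>

	static int unsigned_cmp(const void *_l, const void *_r)
	{
		unsigned l = *((const unsigned *) _l);
		unsigned r = *((const unsigned *) _r);

		return (l > r) - (l < r);
	}

	int main(void)
	{
		unsigned q[] = { 7, 3000000000u, 0, 42 };
		unsigned i;

		qsort(q, sizeof(q) / sizeof(q[0]), sizeof(q[0]), unsigned_cmp);

		for (i = 0; i < sizeof(q) / sizeof(q[0]); i++)
			printf("%u\n", q[i]);	/* 0, 7, 42, 3000000000 */
		return 0;
	}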
static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca,
char *buf, bucket_map_fn *fn, void *private)
{
size_t i, n;
/* Compute 31 quantiles */
unsigned q[31], *p;
@ -666,9 +673,9 @@ static ssize_t show_quantiles(struct bch_dev *ca, char *buf,
}
for (i = ca->mi.first_bucket; i < n; i++)
p[i] = fn(ca, i, private);
p[i] = fn(c, ca, i, private);
sort(p, n, sizeof(unsigned), cmp, NULL);
sort(p, n, sizeof(unsigned), unsigned_cmp, NULL);
up_read(&ca->bucket_lock);
while (n &&
@ -804,24 +811,18 @@ SHOW(bch2_dev)
sysfs_print(durability, ca->mi.durability);
sysfs_print(discard, ca->mi.discard);
if (attr == &sysfs_group) {
struct bch_sb_field_disk_groups *groups;
struct bch_disk_group *g;
unsigned len;
if (attr == &sysfs_label) {
if (ca->mi.group) {
mutex_lock(&c->sb_lock);
out += bch2_disk_path_print(&c->disk_sb, out, end - out,
ca->mi.group - 1);
mutex_unlock(&c->sb_lock);
} else {
out += scnprintf(out, end - out, "none");
}
if (!ca->mi.group)
return scnprintf(out, end - out, "none\n");
mutex_lock(&c->sb_lock);
groups = bch2_sb_get_disk_groups(c->disk_sb);
g = &groups->entries[ca->mi.group - 1];
len = strnlen(g->label, sizeof(g->label));
memcpy(buf, g->label, len);
mutex_unlock(&c->sb_lock);
buf[len++] = '\n';
return len;
out += scnprintf(out, end - out, "\n");
return out - buf;
}
if (attr == &sysfs_has_data) {
@ -852,14 +853,16 @@ SHOW(bch2_dev)
if (attr == &sysfs_iostats)
return show_dev_iostats(ca, buf);
if (attr == &sysfs_read_priority_stats)
return show_quantiles(ca, buf, bucket_priority_fn, (void *) 0);
if (attr == &sysfs_write_priority_stats)
return show_quantiles(ca, buf, bucket_priority_fn, (void *) 1);
if (attr == &sysfs_fragmentation_stats)
return show_quantiles(ca, buf, bucket_sectors_used_fn, NULL);
if (attr == &sysfs_oldest_gen_stats)
return show_quantiles(ca, buf, bucket_oldest_gen_fn, NULL);
if (attr == &sysfs_last_read_quantiles)
return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0);
if (attr == &sysfs_last_write_quantiles)
return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1);
if (attr == &sysfs_fragmentation_quantiles)
return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL);
if (attr == &sysfs_oldest_gen_quantiles)
return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL);
if (attr == &sysfs_reserve_stats)
return show_reserve_stats(ca, buf);
if (attr == &sysfs_alloc_debug)
@ -880,7 +883,7 @@ STORE(bch2_dev)
bool v = strtoul_or_return(buf);
mutex_lock(&c->sb_lock);
mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
if (v != BCH_MEMBER_DISCARD(mi)) {
SET_BCH_MEMBER_DISCARD(mi, v);
@ -896,7 +899,7 @@ STORE(bch2_dev)
return v;
mutex_lock(&c->sb_lock);
mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
SET_BCH_MEMBER_REPLACEMENT(mi, v);
@ -905,7 +908,7 @@ STORE(bch2_dev)
mutex_unlock(&c->sb_lock);
}
if (attr == &sysfs_group) {
if (attr == &sysfs_label) {
char *tmp;
int ret;
@ -938,16 +941,16 @@ struct attribute *bch2_dev_files[] = {
&sysfs_discard,
&sysfs_cache_replacement_policy,
&sysfs_state_rw,
&sysfs_group,
&sysfs_label,
&sysfs_has_data,
&sysfs_iostats,
/* alloc info - other stats: */
&sysfs_read_priority_stats,
&sysfs_write_priority_stats,
&sysfs_fragmentation_stats,
&sysfs_oldest_gen_stats,
&sysfs_last_read_quantiles,
&sysfs_last_write_quantiles,
&sysfs_fragmentation_quantiles,
&sysfs_oldest_gen_quantiles,
&sysfs_reserve_stats,
/* debug: */

View File

@ -4,6 +4,7 @@
#include "btree_iter.h"
#include "buckets.h"
#include "clock.h"
#include "disk_groups.h"
#include "extents.h"
#include "io.h"
#include "move.h"

View File

@ -86,8 +86,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
.cmp_bkey = xattr_cmp_bkey,
};
static const char *bch2_xattr_invalid(const struct bch_fs *c,
struct bkey_s_c k)
const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
const struct xattr_handler *handler;
struct bkey_s_c_xattr xattr;
@ -126,8 +125,8 @@ static const char *bch2_xattr_invalid(const struct bch_fs *c,
}
}
static void bch2_xattr_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
void bch2_xattr_to_text(struct bch_fs *c, char *buf,
size_t size, struct bkey_s_c k)
{
const struct xattr_handler *handler;
struct bkey_s_c_xattr xattr;
@ -159,11 +158,6 @@ static void bch2_xattr_to_text(struct bch_fs *c, char *buf,
}
}
const struct bkey_ops bch2_bkey_xattr_ops = {
.key_invalid = bch2_xattr_invalid,
.val_to_text = bch2_xattr_to_text,
};
int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
const char *name, void *buffer, size_t size, int type)
{

View File

@ -4,7 +4,14 @@
#include "str_hash.h"
extern const struct bch_hash_desc bch2_xattr_hash_desc;
extern const struct bkey_ops bch2_bkey_xattr_ops;
const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
#define bch2_bkey_xattr_ops (struct bkey_ops) { \
.key_invalid = bch2_xattr_invalid, \
.val_to_text = bch2_xattr_to_text, \
}
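Since bch2_bkey_xattr_ops is now a compound-literal macro rather than an extern object, each use expands to a struct bkey_ops value. A minimal, assumed usage (c and k stand for a struct bch_fs * and a struct bkey_s_c the caller already has; not taken from this patch):

	struct bkey_ops ops = bch2_bkey_xattr_ops;
	const char *invalid = ops.key_invalid(c, k);

	if (invalid)
		pr_err("invalid xattr key: %s", invalid);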
struct dentry;
struct xattr_handler;